From 3063153b5643e5ed04e8a9d7b50feecf3eba325e Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Fri, 14 Feb 2025 21:10:08 -0800
Subject: [PATCH 001/109] [clang-format] Fix a bug in annotating
 ObjCMethodSpecifier (#127159)

Fixes #58202.
---
 clang/lib/Format/TokenAnnotator.cpp           | 2 +-
 clang/unittests/Format/FormatTestObjC.cpp     | 7 +++++++
 clang/unittests/Format/TokenAnnotatorTest.cpp | 7 +++++++
 3 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index b3540f39e6f69..069fd40e2834c 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -1313,7 +1313,7 @@ class AnnotatingParser {
     switch (bool IsIf = false; Tok->Tok.getKind()) {
     case tok::plus:
     case tok::minus:
-      if (!Tok->Previous && Line.MustBeDeclaration)
+      if (!Tok->getPreviousNonComment() && Line.MustBeDeclaration)
         Tok->setType(TT_ObjCMethodSpecifier);
       break;
     case tok::colon:
diff --git a/clang/unittests/Format/FormatTestObjC.cpp b/clang/unittests/Format/FormatTestObjC.cpp
index 9b6f0c396d4db..f7f73db62045c 100644
--- a/clang/unittests/Format/FormatTestObjC.cpp
+++ b/clang/unittests/Format/FormatTestObjC.cpp
@@ -567,6 +567,13 @@ TEST_F(FormatTestObjC, FormatObjCMethodDeclarations) {
                "                error:(NSError **)theError {\n"
                "}");
   verifyFormat("+ (instancetype)new;");
+
+  verifyFormat("/*\n"
+               " */\n"
+               "- (void)foo;",
+               "/*\n"
+               " */- (void)foo;");
+
   Style.ColumnLimit = 60;
   verifyFormat("- (instancetype)initXxxxxx:(id<x>)x\n"
                "                         y:(id<yyyyyyyyyyyyyyyyyyyy>)y\n"
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 5ab0867490122..7b489b1764cb2 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -1849,6 +1849,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsObjCMethodExpr) {
   EXPECT_TOKEN(Tokens[15], tok::greater, TT_BinaryOperator);
 }
 
+TEST_F(TokenAnnotatorTest, UnderstandsObjCMethodDecl) {
+  auto Tokens = annotate("/**/ - (void)foo;");
+  ASSERT_EQ(Tokens.size(), 8u) << Tokens;
+  EXPECT_TOKEN(Tokens[1], tok::minus, TT_ObjCMethodSpecifier);
+  EXPECT_TOKEN(Tokens[5], tok::identifier, TT_SelectorName);
+}
+
 TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
   auto Tokens = annotate("[]() constexpr {}");
   ASSERT_EQ(Tokens.size(), 8u) << Tokens;

From 66465c3b0ab1b32403ad5a1c3114174d87830f54 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 14 Feb 2025 21:23:33 -0800
Subject: [PATCH 002/109] Revert "Make llvm::telemetry::Manager::preDispatch
 protected.  (#127114)"

This reverts commit f7a2d70bd91094e7a85f7e189602c826a3eeb6cd.

Multiple buildbot failures have been reported.  See:
https://github.com/llvm/llvm-project/pull/127114
---
 llvm/include/llvm/Telemetry/Telemetry.h |  9 ++++-----
 llvm/lib/Telemetry/Telemetry.cpp        | 13 -------------
 2 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/Telemetry/Telemetry.h b/llvm/include/llvm/Telemetry/Telemetry.h
index 8efea645ab51c..344a49df5cbf0 100644
--- a/llvm/include/llvm/Telemetry/Telemetry.h
+++ b/llvm/include/llvm/Telemetry/Telemetry.h
@@ -138,6 +138,10 @@ class Manager {
 public:
   virtual ~Manager() = default;
 
+  // Optional callback for subclasses to perform additional tasks before
+  // dispatching to Destinations.
+  virtual Error preDispatch(TelemetryInfo *Entry) = 0;
+
   // Dispatch Telemetry data to the Destination(s).
   // The argument is non-const because the Manager may add or remove
   // data from the entry.
@@ -146,11 +150,6 @@ class Manager {
   // Register a Destination.
   void addDestination(std::unique_ptr<Destination> Destination);
 
-protected:
-  // Optional callback for subclasses to perform additional tasks before
-  // dispatching to Destinations.
-  virtual Error preDispatch(TelemetryInfo *Entry) {}
-
 private:
   std::vector<std::unique_ptr<Destination>> Destinations;
 };
diff --git a/llvm/lib/Telemetry/Telemetry.cpp b/llvm/lib/Telemetry/Telemetry.cpp
index badb07bff5477..9e13d08334e3b 100644
--- a/llvm/lib/Telemetry/Telemetry.cpp
+++ b/llvm/lib/Telemetry/Telemetry.cpp
@@ -1,16 +1,3 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file provides the basic framework for Telemetry.
-/// Refer to its documentation at llvm/docs/Telemetry.rst for more details.
-//===---------------------------------------------------------------------===//
-
 #include "llvm/Telemetry/Telemetry.h"
 
 namespace llvm {

From 8fff0c181f26a5e8b2344c061ebf2559118b1160 Mon Sep 17 00:00:00 2001
From: Jordan R AW <ajordanr@google.com>
Date: Fri, 14 Feb 2025 21:37:39 -0800
Subject: [PATCH 003/109] [lldb] Add terminfo dependency for ncurses support
 (#126810)

For some operating systems (e.g. chromiumos), terminfo is a separate
package and library from ncurses. Both are still requirements for curses
support in lldb, individually.

This is a rework of this original spack commit:

https://github.com/spack/spack/commit/9ea261265010eacd250691a8361f661d0576f25c

Instead though, this PR uses CMake to detect whether a terminfo symbol
(acs_map) is present and defined in the curses library, and only falls back
to a separate tinfo library if it is not found.

Without this fix, LLDB cannot be built on these systems.

Fixes #101368
---
 lldb/cmake/modules/FindCursesAndPanel.cmake | 42 ++++++++++++++++++---
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/lldb/cmake/modules/FindCursesAndPanel.cmake b/lldb/cmake/modules/FindCursesAndPanel.cmake
index aaadf214bf54b..75ebaa35d7ea1 100644
--- a/lldb/cmake/modules/FindCursesAndPanel.cmake
+++ b/lldb/cmake/modules/FindCursesAndPanel.cmake
@@ -2,23 +2,55 @@
 # FindCursesAndPanel
 # -----------
 #
-# Find the curses and panel library as a whole.
+# Find the curses, terminfo, and panel library as a whole.
 
-if(CURSES_INCLUDE_DIRS AND CURSES_LIBRARIES AND PANEL_LIBRARIES)
+include(CMakePushCheckState)
+
+function(lldb_check_curses_tinfo CURSES_LIBRARIES CURSES_HAS_TINFO)
+  cmake_reset_check_state()
+  set(CMAKE_REQUIRED_LIBRARIES "${CURSES_LIBRARIES}")
+  # acs_map is one of many symbols that are part of tinfo but could
+  # be bundled in curses.
+  check_symbol_exists(acs_map "curses.h" CURSES_HAS_TINFO)
+endfunction()
+
+if(CURSES_INCLUDE_DIRS AND CURSES_LIBRARIES AND TINFO_LIBRARIES AND PANEL_LIBRARIES)
   set(CURSESANDPANEL_FOUND TRUE)
 else()
   find_package(Curses QUIET)
   find_library(PANEL_LIBRARIES NAMES panel DOC "The curses panel library" QUIET)
   include(FindPackageHandleStandardArgs)
+
+  if(CURSES_FOUND AND PANEL_LIBRARIES)
+    # Terminfo symbols come from curses itself or from a separate tinfo library.
+    # Record the result as a plain value; if() never evaluates "$<...>" genexes.
+    lldb_check_curses_tinfo("${CURSES_LIBRARIES}" CURSES_HAS_TINFO)
+    set(HAS_TERMINFO_SYMBOLS "${CURSES_HAS_TINFO}")
+    if (NOT CURSES_HAS_TINFO)
+      message(STATUS "curses library missing terminfo symbols, looking for tinfo separately")
+      find_library(TINFO_LIBRARIES NAMES tinfo DOC "The curses tinfo library" QUIET)
+      list(APPEND CURSES_LIBRARIES "${TINFO_LIBRARIES}")
+      set(HAS_TERMINFO_SYMBOLS "${TINFO_LIBRARIES}")
+    endif()
+  endif()
+
   find_package_handle_standard_args(CursesAndPanel
                                     FOUND_VAR
                                       CURSESANDPANEL_FOUND
                                     REQUIRED_VARS
                                       CURSES_INCLUDE_DIRS
                                       CURSES_LIBRARIES
-                                      PANEL_LIBRARIES)
-  if(CURSES_FOUND AND PANEL_LIBRARIES)
-    mark_as_advanced(CURSES_INCLUDE_DIRS CURSES_LIBRARIES PANEL_LIBRARIES)
+                                      PANEL_LIBRARIES
+                                      HAS_TERMINFO_SYMBOLS)
+
+  if(CURSES_FOUND AND PANEL_LIBRARIES AND HAS_TERMINFO_SYMBOLS)
+    mark_as_advanced(CURSES_INCLUDE_DIRS
+                      PANEL_LIBRARIES
+                      HAS_TERMINFO_SYMBOLS
+                      CURSES_HAS_TINFO)
+  endif()
+  if(TINFO_LIBRARIES)
+    mark_as_advanced(TINFO_LIBRARIES)
   endif()
 endif()
 

From ed32d85d31999756602a7d5c4647cb6771d8f857 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 14 Feb 2025 22:01:29 -0800
Subject: [PATCH 004/109] [lldb] Use async output & error stream for
 EvaluateExpression

Similar to #126821, in support of #126630.
---
 lldb/source/Commands/CommandObjectExpression.cpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/lldb/source/Commands/CommandObjectExpression.cpp b/lldb/source/Commands/CommandObjectExpression.cpp
index 13491b5c79442..7e26381c92405 100644
--- a/lldb/source/Commands/CommandObjectExpression.cpp
+++ b/lldb/source/Commands/CommandObjectExpression.cpp
@@ -500,19 +500,17 @@ bool CommandObjectExpression::EvaluateExpression(llvm::StringRef expr,
 void CommandObjectExpression::IOHandlerInputComplete(IOHandler &io_handler,
                                                      std::string &line) {
   io_handler.SetIsDone(true);
-  StreamFileSP output_sp = io_handler.GetOutputStreamFileSP();
-  StreamFileSP error_sp = io_handler.GetErrorStreamFileSP();
+  StreamSP output_stream =
+      GetCommandInterpreter().GetDebugger().GetAsyncOutputStream();
+  StreamSP error_stream =
+      GetCommandInterpreter().GetDebugger().GetAsyncErrorStream();
 
   CommandReturnObject return_obj(
       GetCommandInterpreter().GetDebugger().GetUseColor());
-  EvaluateExpression(line.c_str(), *output_sp, *error_sp, return_obj);
+  EvaluateExpression(line.c_str(), *output_stream, *error_stream, return_obj);
 
-  if (output_sp)
-    output_sp->Flush();
-  if (error_sp) {
-    *error_sp << return_obj.GetErrorString();
-    error_sp->Flush();
-  }
+  output_stream->Flush();
+  *error_stream << return_obj.GetErrorString();
 }
 
 bool CommandObjectExpression::IOHandlerIsInputComplete(IOHandler &io_handler,

From b24e14093dae04440f22a2da128ba29576f5b3c3 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Sat, 15 Feb 2025 06:14:21 +0000
Subject: [PATCH 005/109] [CI] Keep Track of Workflow Name Instead of Job Name

The metrics script includes some logic to only look at workflows up
to the most recent workflow it has seen previously. This was broken in a
previous patch when workflow metrics began to be emitted per job. The
logic ending the metrics gathering would never trigger, so we would
continually fetch more and more workflows until OOM.
---
 .ci/metrics/metrics.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index d219c9e55169e..354b5058100e7 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -24,6 +24,7 @@ class JobMetrics:
     status: int
     created_at_ns: int
     workflow_id: int
+    workflow_name: str
 
 
 @dataclass
@@ -199,6 +200,7 @@ def get_per_workflow_metrics(
                     job_result,
                     created_at_ns,
                     workflow_run.id,
+                    workflow_run.name,
                 )
             )
 
@@ -278,7 +280,7 @@ def main():
         for workflow_metric in reversed(current_metrics):
             if isinstance(workflow_metric, JobMetrics):
                 workflows_to_track[
-                    workflow_metric.job_name
+                    workflow_metric.workflow_name
                 ] = workflow_metric.workflow_id
 
         time.sleep(SCRAPE_INTERVAL_SECONDS)

From 776fa2d731c17d6ba0afad2554ebc89cf5e3e5ef Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 14 Feb 2025 22:23:46 -0800
Subject: [PATCH 006/109] [lldb] Gardening in IOHandlerCurses (NFC)

 - Replace _ap (auto_ptr) suffix with _up (unique_ptr) suffix
 - Move forward declaration from IOHandler.h to IOHandlerCursesGUI.h
 - Move curses namespace under lldb_private

Motivated by Alex' comment in #126630.
---
 lldb/include/lldb/Core/IOHandler.h          |  5 -----
 lldb/include/lldb/Core/IOHandlerCursesGUI.h |  5 ++++-
 lldb/source/Core/IOHandlerCursesGUI.cpp     | 20 +++++++++++---------
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/lldb/include/lldb/Core/IOHandler.h b/lldb/include/lldb/Core/IOHandler.h
index cb14d72413209..d6ac1cc8b5a14 100644
--- a/lldb/include/lldb/Core/IOHandler.h
+++ b/lldb/include/lldb/Core/IOHandler.h
@@ -32,11 +32,6 @@ namespace lldb_private {
 class Debugger;
 } // namespace lldb_private
 
-namespace curses {
-class Application;
-typedef std::unique_ptr<Application> ApplicationAP;
-} // namespace curses
-
 namespace lldb_private {
 
 class IOHandler {
diff --git a/lldb/include/lldb/Core/IOHandlerCursesGUI.h b/lldb/include/lldb/Core/IOHandlerCursesGUI.h
index 22ca735063ba1..e9871e0532194 100644
--- a/lldb/include/lldb/Core/IOHandlerCursesGUI.h
+++ b/lldb/include/lldb/Core/IOHandlerCursesGUI.h
@@ -12,6 +12,9 @@
 #include "lldb/Core/IOHandler.h"
 
 namespace lldb_private {
+namespace curses {
+class Application;
+} // namespace curses
 
 class IOHandlerCursesGUI : public IOHandler {
 public:
@@ -34,7 +37,7 @@ class IOHandlerCursesGUI : public IOHandler {
   void TerminalSizeChanged() override;
 
 protected:
-  curses::ApplicationAP m_app_ap;
+  std::unique_ptr<curses::Application> m_app_up;
 };
 
 } // namespace lldb_private
diff --git a/lldb/source/Core/IOHandlerCursesGUI.cpp b/lldb/source/Core/IOHandlerCursesGUI.cpp
index 456ce7d16e102..c5eed0c0b4089 100644
--- a/lldb/source/Core/IOHandlerCursesGUI.cpp
+++ b/lldb/source/Core/IOHandlerCursesGUI.cpp
@@ -94,6 +94,7 @@ using llvm::StringRef;
 #define KEY_SHIFT_TAB (KEY_MAX + 1)
 #define KEY_ALT_ENTER (KEY_MAX + 2)
 
+namespace lldb_private {
 namespace curses {
 class Menu;
 class MenuDelegate;
@@ -4479,8 +4480,9 @@ class Application {
 };
 
 } // namespace curses
+} // namespace lldb_private
 
-using namespace curses;
+using namespace lldb_private::curses;
 
 struct Row {
   ValueObjectUpdater value;
@@ -7573,12 +7575,12 @@ IOHandlerCursesGUI::IOHandlerCursesGUI(Debugger &debugger)
 
 void IOHandlerCursesGUI::Activate() {
   IOHandler::Activate();
-  if (!m_app_ap) {
-    m_app_ap = std::make_unique<Application>(GetInputFILE(), GetOutputFILE());
+  if (!m_app_up) {
+    m_app_up = std::make_unique<Application>(GetInputFILE(), GetOutputFILE());
 
     // This is both a window and a menu delegate
     std::shared_ptr<ApplicationDelegate> app_delegate_sp(
-        new ApplicationDelegate(*m_app_ap, m_debugger));
+        new ApplicationDelegate(*m_app_up, m_debugger));
 
     MenuDelegateSP app_menu_delegate_sp =
         std::static_pointer_cast<MenuDelegate>(app_delegate_sp);
@@ -7652,8 +7654,8 @@ void IOHandlerCursesGUI::Activate() {
     help_menu_sp->AddSubmenu(MenuSP(new Menu(
         "GUI Help", nullptr, 'g', ApplicationDelegate::eMenuID_HelpGUIHelp)));
 
-    m_app_ap->Initialize();
-    WindowSP &main_window_sp = m_app_ap->GetMainWindow();
+    m_app_up->Initialize();
+    WindowSP &main_window_sp = m_app_up->GetMainWindow();
 
     MenuSP menubar_sp(new Menu(Menu::Type::Bar));
     menubar_sp->AddSubmenu(lldb_menu_sp);
@@ -7734,10 +7736,10 @@ void IOHandlerCursesGUI::Activate() {
   }
 }
 
-void IOHandlerCursesGUI::Deactivate() { m_app_ap->Terminate(); }
+void IOHandlerCursesGUI::Deactivate() { m_app_up->Terminate(); }
 
 void IOHandlerCursesGUI::Run() {
-  m_app_ap->Run(m_debugger);
+  m_app_up->Run(m_debugger);
   SetIsDone(true);
 }
 
@@ -7752,7 +7754,7 @@ bool IOHandlerCursesGUI::Interrupt() {
 void IOHandlerCursesGUI::GotEOF() {}
 
 void IOHandlerCursesGUI::TerminalSizeChanged() {
-  m_app_ap->TerminalSizeChanged();
+  m_app_up->TerminalSizeChanged();
 }
 
 #endif // LLDB_ENABLE_CURSES

From c30a7f459452d5766da244564bc1d5888346c364 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= <mgorny@gentoo.org>
Date: Sat, 15 Feb 2025 08:15:00 +0100
Subject: [PATCH 007/109] [flang] Fix standalone builds against installed MLIR
 (#126387)

1. Add a new `MLIR_DEPS` argument group to `add_flang_library()`, and
move MLIR-specific dependencies to that group. These dependencies are
added as usual in regular builds, and are skipped in standalone builds,
since MLIR targets are not visible there (and were already built and
installed).
2. Fix the value of `MLIR_MAIN_SRC_DIR` to refer to the current source
directory rather than the directory written into MLIR CMake files. The
latter refers to the directory used to build the MLIR package, and is no
longer valid.
3. Fix non-dylib friendly linking of `LLVMTargetParser` in `Optimizer`
unittests.

With these changes, I can successfully run Flang's regression tests.
---
 flang/CMakeLists.txt                          |  5 +++++
 flang/cmake/modules/AddFlang.cmake            |  5 ++++-
 flang/lib/Frontend/CMakeLists.txt             |  8 ++++---
 flang/lib/Lower/CMakeLists.txt                |  6 +++--
 flang/lib/Optimizer/Analysis/CMakeLists.txt   |  6 +++--
 flang/lib/Optimizer/Builder/CMakeLists.txt    |  6 +++--
 flang/lib/Optimizer/Dialect/CMakeLists.txt    |  4 +++-
 .../Dialect/CUF/Attributes/CMakeLists.txt     |  4 +++-
 .../lib/Optimizer/Dialect/CUF/CMakeLists.txt  |  4 +++-
 .../Optimizer/Dialect/Support/CMakeLists.txt  |  4 +++-
 flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt   |  4 +++-
 .../Optimizer/HLFIR/Transforms/CMakeLists.txt |  4 +++-
 flang/lib/Optimizer/OpenACC/CMakeLists.txt    |  4 +++-
 flang/lib/Optimizer/OpenMP/CMakeLists.txt     |  4 +++-
 flang/lib/Optimizer/Support/CMakeLists.txt    |  8 ++++---
 flang/test/CMakeLists.txt                     | 22 +++++++++++--------
 .../lib/Analysis/AliasAnalysis/CMakeLists.txt |  4 +++-
 flang/test/lib/OpenACC/CMakeLists.txt         | 10 +++++----
 flang/unittests/Optimizer/CMakeLists.txt      | 13 ++++++++---
 19 files changed, 87 insertions(+), 38 deletions(-)

diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index c012b884ae3be..cca56bfdc88e6 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -230,6 +230,11 @@ if (FLANG_STANDALONE_BUILD)
     add_custom_target(doxygen ALL)
   endif()
 
+  # Override the value from installed CMake files, as they refer
+  # to the directory used during the original MLIR package build,
+  # which may no longer be available.  Instead, use the current checkout.
+  set(MLIR_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../mlir )
+
 else()
   option(FLANG_INCLUDE_TESTS
          "Generate build targets for the Flang unit tests."
diff --git a/flang/cmake/modules/AddFlang.cmake b/flang/cmake/modules/AddFlang.cmake
index c9f65eb73fef0..badbd4e7b964b 100644
--- a/flang/cmake/modules/AddFlang.cmake
+++ b/flang/cmake/modules/AddFlang.cmake
@@ -18,7 +18,7 @@ endmacro()
 
 function(add_flang_library name)
   set(options SHARED STATIC INSTALL_WITH_TOOLCHAIN)
-  set(multiValueArgs ADDITIONAL_HEADERS CLANG_LIBS MLIR_LIBS)
+  set(multiValueArgs ADDITIONAL_HEADERS CLANG_LIBS MLIR_LIBS MLIR_DEPS)
   cmake_parse_arguments(ARG
     "${options}"
     ""
@@ -69,6 +69,9 @@ function(add_flang_library name)
   if (ARG_MLIR_LIBS)
     mlir_target_link_libraries(${name} PRIVATE ${ARG_MLIR_LIBS})
   endif()
+  if (ARG_MLIR_DEPS AND NOT FLANG_STANDALONE_BUILD)
+    add_dependencies(${name} ${ARG_MLIR_DEPS})
+  endif()
 
   if (TARGET ${name})
 
diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt
index 81eef2d468d8c..80d63fca6fb76 100644
--- a/flang/lib/Frontend/CMakeLists.txt
+++ b/flang/lib/Frontend/CMakeLists.txt
@@ -18,9 +18,6 @@ add_flang_library(flangFrontend
   FIROptCodeGenPassIncGen
   FIROptTransformsPassIncGen
   HLFIRDialect
-  MLIRIR
-  ${dialect_libs}
-  ${extension_libs}
 
   LINK_LIBS
   CUFDialect
@@ -56,6 +53,11 @@ add_flang_library(flangFrontend
   FrontendOpenACC
   FrontendOpenMP
 
+  MLIR_DEPS
+  MLIRIR
+  ${dialect_libs}
+  ${extension_libs}
+
   MLIR_LIBS
   MLIRTransforms
   MLIRBuiltinToLLVMIRTranslation
diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt
index c9b249781552e..87dc2a052796a 100644
--- a/flang/lib/Lower/CMakeLists.txt
+++ b/flang/lib/Lower/CMakeLists.txt
@@ -44,8 +44,6 @@ add_flang_library(FortranLower
   FIRDialect
   FIRTransforms
   HLFIRDialect
-  ${dialect_libs}
-  ${extension_libs}
 
   LINK_LIBS
   CUFAttrs
@@ -64,6 +62,10 @@ add_flang_library(FortranLower
   LINK_COMPONENTS
   Support
 
+  MLIR_DEPS
+  ${dialect_libs}
+  ${extension_libs}
+
   MLIR_LIBS
   ${dialect_libs}
   ${extension_libs}
diff --git a/flang/lib/Optimizer/Analysis/CMakeLists.txt b/flang/lib/Optimizer/Analysis/CMakeLists.txt
index c4dae898f8e57..4d4ad882c27d3 100644
--- a/flang/lib/Optimizer/Analysis/CMakeLists.txt
+++ b/flang/lib/Optimizer/Analysis/CMakeLists.txt
@@ -6,8 +6,6 @@ add_flang_library(FIRAnalysis
   FIRDialect
   FIRSupport
   HLFIRDialect
-  MLIRIR
-  MLIROpenMPDialect
 
   LINK_LIBS
   FIRBuilder
@@ -15,6 +13,10 @@ add_flang_library(FIRAnalysis
   FIRSupport
   HLFIRDialect
 
+  MLIR_DEPS
+  MLIRIR
+  MLIROpenMPDialect
+
   MLIR_LIBS
   MLIRFuncDialect
   MLIRLLVMDialect
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index f8faeaa81c90c..f0563d092e3dc 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -40,8 +40,6 @@ add_flang_library(FIRBuilder
   CUFDialect
   FIRDialect
   HLFIRDialect
-  ${dialect_libs}
-  ${extension_libs}
 
   LINK_LIBS
   CUFAttrs
@@ -52,6 +50,10 @@ add_flang_library(FIRBuilder
   FortranEvaluate
   HLFIRDialect
 
+  MLIR_DEPS
+  ${dialect_libs}
+  ${extension_libs}
+
   MLIR_LIBS
   ${dialect_libs}
   ${extension_libs}
diff --git a/flang/lib/Optimizer/Dialect/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CMakeLists.txt
index d39dca8ed0000..61f9c6110491e 100644
--- a/flang/lib/Optimizer/Dialect/CMakeLists.txt
+++ b/flang/lib/Optimizer/Dialect/CMakeLists.txt
@@ -12,7 +12,6 @@ add_flang_library(FIRDialect
 
   DEPENDS
   CanonicalizationPatternsIncGen
-  MLIRIR
   FIROpsIncGen
   CUFAttrsIncGen
   intrinsics_gen
@@ -26,6 +25,9 @@ add_flang_library(FIRDialect
   AsmPrinter
   Remarks
 
+  MLIR_DEPS
+  MLIRIR
+
   MLIR_LIBS
   MLIRArithDialect
   MLIRBuiltinToLLVMIRTranslation
diff --git a/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt
index a0f58504eff05..713bd0e97bac3 100644
--- a/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt
+++ b/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt
@@ -3,7 +3,6 @@ add_flang_library(CUFAttrs
   CUFAttr.cpp
 
   DEPENDS
-  MLIRIR
   CUFAttrsIncGen
   CUFOpsIncGen
 
@@ -12,6 +11,9 @@ add_flang_library(CUFAttrs
   AsmPrinter
   Remarks
 
+  MLIR_DEPS
+  MLIRIR
+
   MLIR_LIBS
   MLIRTargetLLVMIRExport
 )
diff --git a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt
index e483b4a164113..5b398f2ad506a 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt
+++ b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt
@@ -6,7 +6,6 @@ add_flang_library(CUFDialect
   CUFToLLVMIRTranslation.cpp
 
   DEPENDS
-  MLIRIR
   CUFAttrsIncGen
   CUFOpsIncGen
 
@@ -20,6 +19,9 @@ add_flang_library(CUFDialect
   AsmPrinter
   Remarks
 
+  MLIR_DEPS
+  MLIRIR
+
   MLIR_LIBS
   MLIRIR
   MLIRGPUDialect
diff --git a/flang/lib/Optimizer/Dialect/Support/CMakeLists.txt b/flang/lib/Optimizer/Dialect/Support/CMakeLists.txt
index bfdd5279b6f29..a85d9521af1c4 100644
--- a/flang/lib/Optimizer/Dialect/Support/CMakeLists.txt
+++ b/flang/lib/Optimizer/Dialect/Support/CMakeLists.txt
@@ -5,9 +5,11 @@ add_flang_library(FIRDialectSupport
   FIRContext.cpp
 
   DEPENDS
-  MLIRIR
   intrinsics_gen
 
+  MLIR_DEPS
+  MLIRIR
+
   MLIR_LIBS
   ${dialect_libs}
 )
diff --git a/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt
index 8a646bedf94b8..99e31a43e01e5 100644
--- a/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt
+++ b/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt
@@ -8,7 +8,6 @@ add_flang_library(HLFIRDialect
   CUFAttrsIncGen
   FIRDialect
   HLFIROpsIncGen
-  ${dialect_libs}
 
   LINK_LIBS
   CUFAttrs
@@ -19,6 +18,9 @@ add_flang_library(HLFIRDialect
   AsmPrinter
   Remarks
 
+  MLIR_DEPS
+  ${dialect_libs}
+
   MLIR_LIBS
   MLIRIR
   ${dialect_libs}
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
index 09286aced6089..7eb3cb4001d5f 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
@@ -15,7 +15,6 @@ add_flang_library(HLFIRTransforms
   CUFAttrsIncGen
   FIRDialect
   HLFIROpsIncGen
-  ${dialect_libs}
 
   LINK_LIBS
   CUFAttrs
@@ -33,6 +32,9 @@ add_flang_library(HLFIRTransforms
   AsmPrinter
   Remarks
 
+  MLIR_DEPS
+  ${dialect_libs}
+
   MLIR_LIBS
   MLIRIR
   ${dialect_libs}
diff --git a/flang/lib/Optimizer/OpenACC/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/CMakeLists.txt
index 1bfae603fd80d..4a09133fc110d 100644
--- a/flang/lib/Optimizer/OpenACC/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenACC/CMakeLists.txt
@@ -11,7 +11,6 @@ add_flang_library(FIROpenACCSupport
   FIRDialectSupport
   FIRSupport
   HLFIRDialect
-  MLIROpenACCDialect
 
   LINK_LIBS
   FIRBuilder
@@ -21,6 +20,9 @@ add_flang_library(FIROpenACCSupport
   FIRSupport
   HLFIRDialect
 
+  MLIR_DEPS
+  MLIROpenACCDialect
+
   MLIR_LIBS
   MLIROpenACCDialect
 )
diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index 86ae93f3207cc..4a48d6e0936db 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -12,7 +12,6 @@ add_flang_library(FlangOpenMPTransforms
   FIRDialect
   HLFIROpsIncGen
   FlangOpenMPPassesIncGen
-  ${dialect_libs}
 
   LINK_LIBS
   FIRAnalysis
@@ -24,6 +23,9 @@ add_flang_library(FlangOpenMPTransforms
   FortranSupport
   HLFIRDialect
 
+  MLIR_DEPS
+  ${dialect_libs}
+
   MLIR_LIBS
   MLIRFuncDialect
   MLIROpenMPDialect
diff --git a/flang/lib/Optimizer/Support/CMakeLists.txt b/flang/lib/Optimizer/Support/CMakeLists.txt
index f8e4fc5bcefea..7ccdd4fd9c25c 100644
--- a/flang/lib/Optimizer/Support/CMakeLists.txt
+++ b/flang/lib/Optimizer/Support/CMakeLists.txt
@@ -10,9 +10,6 @@ add_flang_library(FIRSupport
   DEPENDS
   FIROpsIncGen
   HLFIROpsIncGen
-  MLIRIR
-  ${dialect_libs}
-  ${extension_libs}
 
   LINK_LIBS
   FIRDialect
@@ -20,6 +17,11 @@ add_flang_library(FIRSupport
   LINK_COMPONENTS
   TargetParser
 
+  MLIR_DEPS
+  MLIRIR
+  ${dialect_libs}
+  ${extension_libs}
+
   MLIR_LIBS
   ${dialect_libs}
   ${extension_libs}
diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt
index 3fac8717e9bd9..777cf5fc5433b 100644
--- a/flang/test/CMakeLists.txt
+++ b/flang/test/CMakeLists.txt
@@ -59,26 +59,30 @@ set(FLANG_TEST_PARAMS
 
 set(FLANG_TEST_DEPENDS
   flang
-  llvm-config
-  FileCheck
-  count
-  not
   module_files
   fir-opt
   tco
   bbc
-  llvm-dis
-  llvm-objdump
-  llvm-readobj
-  split-file
   FortranDecimal
 )
+if (NOT FLANG_STANDALONE_BUILD)
+  list(APPEND FLANG_TEST_DEPENDS
+    llvm-config
+    FileCheck
+    count
+    not
+    llvm-dis
+    llvm-objdump
+    llvm-readobj
+    split-file
+  )
+endif ()
 
 if (FLANG_INCLUDE_RUNTIME)
   list(APPEND FLANG_TEST_DEPENDS flang_rt.runtime)
 endif ()
 
-if (LLVM_ENABLE_PLUGINS AND NOT WIN32)
+if (LLVM_ENABLE_PLUGINS AND NOT WIN32 AND NOT FLANG_STANDALONE_BUILD)
   list(APPEND FLANG_TEST_DEPENDS Bye)
 endif()
 
diff --git a/flang/test/lib/Analysis/AliasAnalysis/CMakeLists.txt b/flang/test/lib/Analysis/AliasAnalysis/CMakeLists.txt
index cba47a4114517..16df2b607ca93 100644
--- a/flang/test/lib/Analysis/AliasAnalysis/CMakeLists.txt
+++ b/flang/test/lib/Analysis/AliasAnalysis/CMakeLists.txt
@@ -8,7 +8,6 @@ add_flang_library(FIRTestAnalysis
   FIRSupport
   FIRTransforms
   FIRAnalysis
-  ${dialect_libs}
 
   LINK_LIBS
   FIRDialect
@@ -18,6 +17,9 @@ add_flang_library(FIRTestAnalysis
   FIRAnalysis
   MLIRTestAnalysis
 
+  MLIR_DEPS
+  ${dialect_libs}
+
   MLIR_LIBS
   ${dialect_libs}
   MLIRFuncDialect
diff --git a/flang/test/lib/OpenACC/CMakeLists.txt b/flang/test/lib/OpenACC/CMakeLists.txt
index e296827ef53be..1c0ac748f85e8 100644
--- a/flang/test/lib/OpenACC/CMakeLists.txt
+++ b/flang/test/lib/OpenACC/CMakeLists.txt
@@ -5,16 +5,18 @@ add_flang_library(FIRTestOpenACCInterfaces
   FIRDialect
   FIROpenACCSupport
   FIRSupport
-  MLIRIR
-  MLIROpenACCDialect
-  MLIRPass
-  MLIRSupport
 
   LINK_LIBS
   FIRDialect
   FIROpenACCSupport
   FIRSupport
 
+  MLIR_DEPS
+  MLIRIR
+  MLIROpenACCDialect
+  MLIRPass
+  MLIRSupport
+
   MLIR_LIBS
   MLIRIR
   MLIROpenACCDialect
diff --git a/flang/unittests/Optimizer/CMakeLists.txt b/flang/unittests/Optimizer/CMakeLists.txt
index f535677c19fd2..1289341619118 100644
--- a/flang/unittests/Optimizer/CMakeLists.txt
+++ b/flang/unittests/Optimizer/CMakeLists.txt
@@ -1,6 +1,10 @@
 get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS)
 
+set(LLVM_LINK_COMPONENTS
+  TargetParser
+)
+
 set(LIBS
   CUFDialect
   FIRBuilder
@@ -9,7 +13,6 @@ set(LIBS
   FIRDialectSupport
   FIRSupport
   HLFIRDialect
-  LLVMTargetParser
 )
 
 add_flang_unittest(FlangOptimizerTests
@@ -39,8 +42,12 @@ DEPENDS
   CUFDialect
   FIRDialect
   FIRSupport
-  HLFIRDialect
-  ${dialect_libs})
+  HLFIRDialect)
+
+if(NOT FLANG_STANDALONE_BUILD)
+  add_dependencies(FlangOptimizerTests
+    ${dialect_libs})
+endif()
 
 target_link_libraries(FlangOptimizerTests
   PRIVATE

From b6be53d4cb92592940618555ba5fbf412c0cfca8 Mon Sep 17 00:00:00 2001
From: Andreas Jonson <andjo403@hotmail.com>
Date: Sat, 15 Feb 2025 09:19:20 +0100
Subject: [PATCH 008/109] [ValueTracking] Test for not cond to assume (NFC)

---
 llvm/test/Transforms/InstCombine/assume.ll | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index c21f8457e82d1..0007cc1518730 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -977,6 +977,24 @@ define i32 @range_15_31_top27(i32 %x) {
   ret i32 %res
 }
 
+define i1 @not_cond_use(i8 %x) {
+; CHECK-LABEL: @not_cond_use(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[X:%.*]], 0
+; CHECK-NEXT:    tail call void @use(i1 [[CMP]])
+; CHECK-NEXT:    [[NOT:%.*]] = xor i1 [[CMP]], true
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[NOT]])
+; CHECK-NEXT:    [[RVAL:%.*]] = icmp eq i8 [[X]], 0
+; CHECK-NEXT:    ret i1 [[RVAL]]
+;
+  %cmp = icmp eq i8 %x, 0
+  tail call void @use(i1 %cmp)
+  %not = xor i1 %cmp, true
+  tail call void @llvm.assume(i1 %not)
+  %rval = icmp eq i8 %x, 0
+  ret i1 %rval
+}
+
+declare void @use(i1)
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}

From 77b309d0721b70f7e2e646f50317478fa76b1ba5 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 15 Feb 2025 01:35:01 -0800
Subject: [PATCH 009/109] [AST] Avoid repeated hash lookups (NFC) (#127299)

---
 clang/lib/AST/VTableBuilder.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/VTableBuilder.cpp b/clang/lib/AST/VTableBuilder.cpp
index 19d76df99dbe3..18893b996b5d6 100644
--- a/clang/lib/AST/VTableBuilder.cpp
+++ b/clang/lib/AST/VTableBuilder.cpp
@@ -2115,8 +2115,8 @@ void ItaniumVTableBuilder::dumpLayout(raw_ostream &Out) {
 
     // Dump the next address point.
     uint64_t NextIndex = Index + 1;
-    if (AddressPointsByIndex.count(NextIndex)) {
-      if (AddressPointsByIndex.count(NextIndex) == 1) {
+    if (unsigned Count = AddressPointsByIndex.count(NextIndex)) {
+      if (Count == 1) {
         const BaseSubobject &Base =
           AddressPointsByIndex.find(NextIndex)->second;
 

From 8bdc312272543e8fb21868e57a6c1592668b49a4 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 15 Feb 2025 01:35:33 -0800
Subject: [PATCH 010/109] [Index] Avoid repeated hash lookups (NFC) (#127300)

---
 clang/lib/Index/USRGeneration.cpp | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Index/USRGeneration.cpp b/clang/lib/Index/USRGeneration.cpp
index 1e54b413dc59c..0a5a1bcc74865 100644
--- a/clang/lib/Index/USRGeneration.cpp
+++ b/clang/lib/Index/USRGeneration.cpp
@@ -859,16 +859,12 @@ void USRGenerator::VisitType(QualType T) {
     }
 
     // If we have already seen this (non-built-in) type, use a substitution
-    // encoding.
-    llvm::DenseMap<const Type *, unsigned>::iterator Substitution
-      = TypeSubstitutions.find(T.getTypePtr());
-    if (Substitution != TypeSubstitutions.end()) {
+    // encoding.  Otherwise, record this as a substitution.
+    auto [Substitution, Inserted] =
+        TypeSubstitutions.try_emplace(T.getTypePtr(), TypeSubstitutions.size());
+    if (!Inserted) {
       Out << 'S' << Substitution->second << '_';
       return;
-    } else {
-      // Record this as a substitution.
-      unsigned Number = TypeSubstitutions.size();
-      TypeSubstitutions[T.getTypePtr()] = Number;
     }
 
     if (const PointerType *PT = T->getAs<PointerType>()) {

From 42e0ee4d7eaafd86a27418cd8c752229ce90c8e2 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 15 Feb 2025 01:36:16 -0800
Subject: [PATCH 011/109] [Sema] Avoid repeated hash lookups (NFC) (#127301)

---
 clang/lib/Sema/SemaDecl.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 98c245cdea78f..362df485a025c 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -16014,7 +16014,8 @@ static void diagnoseImplicitlyRetainedSelf(Sema &S) {
   llvm::DenseMap<const BlockDecl *, bool> EscapeInfo;
 
   auto IsOrNestedInEscapingBlock = [&](const BlockDecl *BD) {
-    if (auto It = EscapeInfo.find(BD); It != EscapeInfo.end())
+    auto [It, Inserted] = EscapeInfo.try_emplace(BD);
+    if (!Inserted)
       return It->second;
 
     bool R = false;
@@ -16027,7 +16028,7 @@ static void diagnoseImplicitlyRetainedSelf(Sema &S) {
       CurBD = CurBD->getParent()->getInnermostBlockDecl();
     } while (CurBD);
 
-    return EscapeInfo[BD] = R;
+    return It->second = R;
   };
 
   // If the location where 'self' is implicitly retained is inside a escaping

From 9453b38ac74f0d6797f12213996eac40d56537d9 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 15 Feb 2025 01:36:39 -0800
Subject: [PATCH 012/109] [clang-offload-packager] Avoid repeated hash lookups
 (NFC) (#127302)

---
 .../tools/clang-offload-packager/ClangOffloadPackager.cpp  | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp b/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
index c6d5b31ab512c..49cb0d70f492b 100644
--- a/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
+++ b/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
@@ -70,10 +70,9 @@ static DenseMap<StringRef, StringRef> getImageArguments(StringRef Image,
   DenseMap<StringRef, StringRef> Args;
   for (StringRef Arg : llvm::split(Image, ",")) {
     auto [Key, Value] = Arg.split("=");
-    if (Args.count(Key))
-      Args[Key] = Saver.save(Args[Key] + "," + Value);
-    else
-      Args[Key] = Value;
+    auto [It, Inserted] = Args.try_emplace(Key, Value);
+    if (!Inserted)
+      It->second = Saver.save(It->second + "," + Value);
   }
 
   return Args;

From 05209f1e598f73913bf0284bfbbb88131149bbcf Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 15 Feb 2025 01:37:02 -0800
Subject: [PATCH 013/109] [ExecutionEngine] Avoid repeated hash lookups (NFC)
 (#127303)

---
 llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index d4e341a96f5b1..380a173c1d7ed 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -635,11 +635,12 @@ void MachOPlatform::pushInitializersLoop(
       Worklist.pop_back();
 
       // If we've already visited this JITDylib on this iteration then continue.
-      if (JDDepMap.count(DepJD))
+      auto [It, Inserted] = JDDepMap.try_emplace(DepJD);
+      if (!Inserted)
         continue;
 
       // Add dep info.
-      auto &DM = JDDepMap[DepJD];
+      auto &DM = It->second;
       DepJD->withLinkOrderDo([&](const JITDylibSearchOrder &O) {
         for (auto &KV : O) {
           if (KV.first == DepJD)

From 7e7a3623b44da5019878b91d8334d4c16d7b86a9 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 15 Feb 2025 01:38:00 -0800
Subject: [PATCH 014/109] [Hexagon] Avoid repeated map lookups (NFC) (#127304)

---
 llvm/lib/Target/Hexagon/RDFCopy.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp
index fdd7e4cf99e35..fafdad08909dd 100644
--- a/llvm/lib/Target/Hexagon/RDFCopy.cpp
+++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp
@@ -133,8 +133,8 @@ bool CopyPropagation::run() {
     for (NodeId I : Copies) {
       dbgs() << "Instr: " << *DFG.addr<StmtNode*>(I).Addr->getCode();
       dbgs() << "   eq: {";
-      if (CopyMap.count(I)) {
-        for (auto J : CopyMap.at(I))
+      if (auto It = CopyMap.find(I); It != CopyMap.end()) {
+        for (auto J : It->second)
           dbgs() << ' ' << Print<RegisterRef>(J.first, DFG) << '='
                  << Print<RegisterRef>(J.second, DFG);
       }

From 4887e41055686eede9c155e6b3296b92fe86c2d5 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Sat, 15 Feb 2025 10:38:59 +0100
Subject: [PATCH 015/109] [libc++][NFC] Make enable_ifs in <optional>
 consistent (#127184)

We've documented the preferred `enable_if` style in the coding
guidelines. This updates `<optional>` to conform to them.
---
 libcxx/include/optional | 235 +++++++++++++++++++---------------------
 1 file changed, 114 insertions(+), 121 deletions(-)

diff --git a/libcxx/include/optional b/libcxx/include/optional
index c325140ee66f2..db236f86e74dd 100644
--- a/libcxx/include/optional
+++ b/libcxx/include/optional
@@ -672,44 +672,41 @@ public:
   _LIBCPP_HIDE_FROM_ABI constexpr optional(optional&&)      = default;
   _LIBCPP_HIDE_FROM_ABI constexpr optional(nullopt_t) noexcept {}
 
-  template <
-      class _InPlaceT,
-      class... _Args,
-      class = enable_if_t< _And< _IsSame<_InPlaceT, in_place_t>, is_constructible<value_type, _Args...> >::value > >
+  template <class _InPlaceT,
+            class... _Args,
+            enable_if_t<_And<_IsSame<_InPlaceT, in_place_t>, is_constructible<value_type, _Args...>>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(_InPlaceT, _Args&&... __args)
       : __base(in_place, std::forward<_Args>(__args)...) {}
 
   template <class _Up,
             class... _Args,
-            class = enable_if_t< is_constructible_v<value_type, initializer_list<_Up>&, _Args...>> >
+            enable_if_t<is_constructible_v<value_type, initializer_list<_Up>&, _Args...>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(in_place_t, initializer_list<_Up> __il, _Args&&... __args)
       : __base(in_place, __il, std::forward<_Args>(__args)...) {}
 
-  template <class _Up                                                                         = value_type,
-            enable_if_t< _CheckOptionalArgsCtor<_Up>::template __enable_implicit<_Up>(), int> = 0>
+  template <class _Up                                                                        = value_type,
+            enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_implicit<_Up>(), int> = 0>
   _LIBCPP_HIDE_FROM_ABI constexpr optional(_Up&& __v) : __base(in_place, std::forward<_Up>(__v)) {}
 
-  template <class _Up, enable_if_t< _CheckOptionalArgsCtor<_Up>::template __enable_explicit<_Up>(), int> = 0>
+  template <class _Up, enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_explicit<_Up>(), int> = 0>
   _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(_Up&& __v) : __base(in_place, std::forward<_Up>(__v)) {}
 
   // LWG2756: conditionally explicit conversion from const optional<_Up>&
-  template <class _Up,
-            enable_if_t< _CheckOptionalLikeCtor<_Up, _Up const&>::template __enable_implicit<_Up>(), int> = 0>
+  template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up const&>::template __enable_implicit<_Up>(), int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 optional(const optional<_Up>& __v) {
     this->__construct_from(__v);
   }
-  template <class _Up,
-            enable_if_t< _CheckOptionalLikeCtor<_Up, _Up const&>::template __enable_explicit<_Up>(), int> = 0>
+  template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up const&>::template __enable_explicit<_Up>(), int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit optional(const optional<_Up>& __v) {
     this->__construct_from(__v);
   }
 
   // LWG2756: conditionally explicit conversion from optional<_Up>&&
-  template <class _Up, enable_if_t< _CheckOptionalLikeCtor<_Up, _Up&&>::template __enable_implicit<_Up>(), int> = 0>
+  template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>::template __enable_implicit<_Up>(), int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 optional(optional<_Up>&& __v) {
     this->__construct_from(std::move(__v));
   }
-  template <class _Up, enable_if_t< _CheckOptionalLikeCtor<_Up, _Up&&>::template __enable_explicit<_Up>(), int> = 0>
+  template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>::template __enable_explicit<_Up>(), int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit optional(optional<_Up>&& __v) {
     this->__construct_from(std::move(__v));
   }
@@ -718,7 +715,7 @@ public:
   template <class _Tag,
             class _Fp,
             class... _Args,
-            __enable_if_t<_IsSame<_Tag, __optional_construct_from_invoke_tag>::value, int> = 0>
+            enable_if_t<_IsSame<_Tag, __optional_construct_from_invoke_tag>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(_Tag, _Fp&& __f, _Args&&... __args)
       : __base(__optional_construct_from_invoke_tag{}, std::forward<_Fp>(__f), std::forward<_Args>(__args)...) {}
 #    endif
@@ -732,12 +729,12 @@ public:
   _LIBCPP_HIDE_FROM_ABI constexpr optional& operator=(optional&&)      = default;
 
   // LWG2756
-  template <
-      class _Up = value_type,
-      class     = enable_if_t< _And< _IsNotSame<__remove_cvref_t<_Up>, optional>,
-                                     _Or< _IsNotSame<__remove_cvref_t<_Up>, value_type>, _Not<is_scalar<value_type>> >,
-                                     is_constructible<value_type, _Up>,
-                                     is_assignable<value_type&, _Up> >::value> >
+  template <class _Up        = value_type,
+            enable_if_t<_And<_IsNotSame<__remove_cvref_t<_Up>, optional>,
+                             _Or<_IsNotSame<__remove_cvref_t<_Up>, value_type>, _Not<is_scalar<value_type>>>,
+                             is_constructible<value_type, _Up>,
+                             is_assignable<value_type&, _Up>>::value,
+                        int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 optional& operator=(_Up&& __v) {
     if (this->has_value())
       this->__get() = std::forward<_Up>(__v);
@@ -747,21 +744,20 @@ public:
   }
 
   // LWG2756
-  template <class _Up,
-            enable_if_t< _CheckOptionalLikeAssign<_Up, _Up const&>::template __enable_assign<_Up>(), int> = 0>
+  template <class _Up, enable_if_t<_CheckOptionalLikeAssign<_Up, _Up const&>::template __enable_assign<_Up>(), int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 optional& operator=(const optional<_Up>& __v) {
     this->__assign_from(__v);
     return *this;
   }
 
   // LWG2756
-  template <class _Up, enable_if_t< _CheckOptionalLikeCtor<_Up, _Up&&>::template __enable_assign<_Up>(), int> = 0>
+  template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>::template __enable_assign<_Up>(), int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 optional& operator=(optional<_Up>&& __v) {
     this->__assign_from(std::move(__v));
     return *this;
   }
 
-  template <class... _Args, class = enable_if_t< is_constructible_v<value_type, _Args...> > >
+  template <class... _Args, enable_if_t<is_constructible_v<value_type, _Args...>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& emplace(_Args&&... __args) {
     reset();
     this->__construct(std::forward<_Args>(__args)...);
@@ -770,7 +766,7 @@ public:
 
   template <class _Up,
             class... _Args,
-            class = enable_if_t< is_constructible_v<value_type, initializer_list<_Up>&, _Args...> > >
+            enable_if_t<is_constructible_v<value_type, initializer_list<_Up>&, _Args...>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& emplace(initializer_list<_Up> __il, _Args&&... __args) {
     reset();
     this->__construct(__il, std::forward<_Args>(__args)...);
@@ -982,17 +978,15 @@ public:
   using __base::reset;
 };
 
-#    if _LIBCPP_STD_VER >= 17
 template <class _Tp>
 optional(_Tp) -> optional<_Tp>;
-#    endif
 
 // Comparisons between optionals
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() == std::declval<const _Up&>()), bool>,
-    bool >
-operator==(const optional<_Tp>& __x, const optional<_Up>& __y) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() == std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const optional<_Tp>& __x, const optional<_Up>& __y) {
   if (static_cast<bool>(__x) != static_cast<bool>(__y))
     return false;
   if (!static_cast<bool>(__x))
@@ -1000,11 +994,11 @@ operator==(const optional<_Tp>& __x, const optional<_Up>& __y) {
   return *__x == *__y;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() != std::declval<const _Up&>()), bool>,
-    bool >
-operator!=(const optional<_Tp>& __x, const optional<_Up>& __y) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() != std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const optional<_Tp>& __x, const optional<_Up>& __y) {
   if (static_cast<bool>(__x) != static_cast<bool>(__y))
     return true;
   if (!static_cast<bool>(__x))
@@ -1012,11 +1006,11 @@ operator!=(const optional<_Tp>& __x, const optional<_Up>& __y) {
   return *__x != *__y;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() < std::declval<const _Up&>()), bool>,
-    bool >
-operator<(const optional<_Tp>& __x, const optional<_Up>& __y) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() < std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const optional<_Tp>& __x, const optional<_Up>& __y) {
   if (!static_cast<bool>(__y))
     return false;
   if (!static_cast<bool>(__x))
@@ -1024,11 +1018,11 @@ operator<(const optional<_Tp>& __x, const optional<_Up>& __y) {
   return *__x < *__y;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() > std::declval<const _Up&>()), bool>,
-    bool >
-operator>(const optional<_Tp>& __x, const optional<_Up>& __y) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() > std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const optional<_Tp>& __x, const optional<_Up>& __y) {
   if (!static_cast<bool>(__x))
     return false;
   if (!static_cast<bool>(__y))
@@ -1036,11 +1030,11 @@ operator>(const optional<_Tp>& __x, const optional<_Up>& __y) {
   return *__x > *__y;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() <= std::declval<const _Up&>()), bool>,
-    bool >
-operator<=(const optional<_Tp>& __x, const optional<_Up>& __y) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() <= std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const optional<_Tp>& __x, const optional<_Up>& __y) {
   if (!static_cast<bool>(__x))
     return true;
   if (!static_cast<bool>(__y))
@@ -1048,11 +1042,11 @@ operator<=(const optional<_Tp>& __x, const optional<_Up>& __y) {
   return *__x <= *__y;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() >= std::declval<const _Up&>()), bool>,
-    bool >
-operator>=(const optional<_Tp>& __x, const optional<_Up>& __y) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() >= std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const optional<_Tp>& __x, const optional<_Up>& __y) {
   if (!static_cast<bool>(__y))
     return true;
   if (!static_cast<bool>(__x))
@@ -1145,99 +1139,99 @@ _LIBCPP_HIDE_FROM_ABI constexpr strong_ordering operator<=>(const optional<_Tp>&
 #    endif // _LIBCPP_STD_VER <= 17
 
 // Comparisons with T
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() == std::declval<const _Up&>()), bool>,
-    bool >
-operator==(const optional<_Tp>& __x, const _Up& __v) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() == std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const optional<_Tp>& __x, const _Up& __v) {
   return static_cast<bool>(__x) ? *__x == __v : false;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() == std::declval<const _Up&>()), bool>,
-    bool >
-operator==(const _Tp& __v, const optional<_Up>& __x) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() == std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const _Tp& __v, const optional<_Up>& __x) {
   return static_cast<bool>(__x) ? __v == *__x : false;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() != std::declval<const _Up&>()), bool>,
-    bool >
-operator!=(const optional<_Tp>& __x, const _Up& __v) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() != std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const optional<_Tp>& __x, const _Up& __v) {
   return static_cast<bool>(__x) ? *__x != __v : true;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() != std::declval<const _Up&>()), bool>,
-    bool >
-operator!=(const _Tp& __v, const optional<_Up>& __x) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() != std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const _Tp& __v, const optional<_Up>& __x) {
   return static_cast<bool>(__x) ? __v != *__x : true;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() < std::declval<const _Up&>()), bool>,
-    bool >
-operator<(const optional<_Tp>& __x, const _Up& __v) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() < std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const optional<_Tp>& __x, const _Up& __v) {
   return static_cast<bool>(__x) ? *__x < __v : true;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() < std::declval<const _Up&>()), bool>,
-    bool >
-operator<(const _Tp& __v, const optional<_Up>& __x) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() < std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const _Tp& __v, const optional<_Up>& __x) {
   return static_cast<bool>(__x) ? __v < *__x : false;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() <= std::declval<const _Up&>()), bool>,
-    bool >
-operator<=(const optional<_Tp>& __x, const _Up& __v) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() <= std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const optional<_Tp>& __x, const _Up& __v) {
   return static_cast<bool>(__x) ? *__x <= __v : true;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() <= std::declval<const _Up&>()), bool>,
-    bool >
-operator<=(const _Tp& __v, const optional<_Up>& __x) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() <= std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const _Tp& __v, const optional<_Up>& __x) {
   return static_cast<bool>(__x) ? __v <= *__x : false;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() > std::declval<const _Up&>()), bool>,
-    bool >
-operator>(const optional<_Tp>& __x, const _Up& __v) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() > std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const optional<_Tp>& __x, const _Up& __v) {
   return static_cast<bool>(__x) ? *__x > __v : false;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() > std::declval<const _Up&>()), bool>,
-    bool >
-operator>(const _Tp& __v, const optional<_Up>& __x) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() > std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const _Tp& __v, const optional<_Up>& __x) {
   return static_cast<bool>(__x) ? __v > *__x : true;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() >= std::declval<const _Up&>()), bool>,
-    bool >
-operator>=(const optional<_Tp>& __x, const _Up& __v) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() >= std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const optional<_Tp>& __x, const _Up& __v) {
   return static_cast<bool>(__x) ? *__x >= __v : false;
 }
 
-template <class _Tp, class _Up>
-_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t<
-    is_convertible_v<decltype(std::declval<const _Tp&>() >= std::declval<const _Up&>()), bool>,
-    bool >
-operator>=(const _Tp& __v, const optional<_Up>& __x) {
+template <
+    class _Tp,
+    class _Up,
+    enable_if_t<is_convertible_v<decltype(std::declval<const _Tp&>() >= std::declval<const _Up&>()), bool>, int> = 0>
+_LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const _Tp& __v, const optional<_Up>& __x) {
   return static_cast<bool>(__x) ? __v >= *__x : true;
 }
 
@@ -1252,9 +1246,8 @@ operator<=>(const optional<_Tp>& __x, const _Up& __v) {
 
 #    endif // _LIBCPP_STD_VER >= 20
 
-template <class _Tp>
-inline _LIBCPP_HIDE_FROM_ABI
-_LIBCPP_CONSTEXPR_SINCE_CXX20 enable_if_t< is_move_constructible_v<_Tp> && is_swappable_v<_Tp>, void >
+template <class _Tp, enable_if_t< is_move_constructible_v<_Tp> && is_swappable_v<_Tp>, int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
 swap(optional<_Tp>& __x, optional<_Tp>& __y) noexcept(noexcept(__x.swap(__y))) {
   __x.swap(__y);
 }

From cffc1ac3491c891ef4f80bcbfa685710e477eeac Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Sat, 15 Feb 2025 10:54:00 +0100
Subject: [PATCH 016/109] [libc++] Avoid including <features.h> on arbitrary
 platforms (#125587)

This partially reverts commit 5f2389d4. That commit started checking
whether <features.h> was a valid include unconditionally, however codebases
are free to have such a header on their search path, which breaks compilation.
LLVM libc now provides a more standard way of getting configuration macros
like __LLVM_LIBC__.

After this patch, we only include <features.h> when we're on Linux or
when we're compiling for GPUs.
---
 libcxx/include/__configuration/platform.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/libcxx/include/__configuration/platform.h b/libcxx/include/__configuration/platform.h
index 2a92ce209b91f..cff99376ee24b 100644
--- a/libcxx/include/__configuration/platform.h
+++ b/libcxx/include/__configuration/platform.h
@@ -30,12 +30,9 @@
 // ... add new file formats here ...
 #endif
 
-// To detect which libc we're using
-#if __has_include(<features.h>)
+// Need to detect which libc we're using if we're on Linux or compiling for GPUs.
+#if defined(__linux__) || defined(__AMDGPU__) || defined(__NVPTX__)
 #  include <features.h>
-#endif
-
-#if defined(__linux__)
 #  if defined(__GLIBC_PREREQ)
 #    define _LIBCPP_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b)
 #  else

From 8f3a070db9bffe78d86d24b583effe4032baa4db Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot@gmail.com>
Date: Sat, 15 Feb 2025 10:45:22 +0100
Subject: [PATCH 017/109] [Clang] Add new WG21 (Hagenberg) papers to the
 C++ status page

---
 clang/www/cxx_status.html | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index 0fc3b1d314698..2d5b96b47fe2d 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -251,6 +251,42 @@ <h2 id="cxx26">C++2c implementation status</h2>
   <td><a href="https://wg21.link/P3176R1">P3176R1</a></td>
   <td class="unreleased" align="center">Clang 20</td>
  </tr>
+ <!-- Austria, Winter 2025 -->
+ <tr>
+  <td>Trivial unions</td>
+  <td><a href="https://wg21.link/P3074">P3074R7</a></td>
+  <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+  <td>Partial program correctness</td>
+  <td><a href="https://wg21.link/P1494">P1494R5</a></td>
+  <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+  <td>Contracts</td>
+  <td><a href="https://wg21.link/P2900">P2900R14</a></td>
+  <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+  <td>Defang and deprecate <code>memory_order::consume</code></td>
+  <td><a href="https://wg21.link/P3475">P3475R2</a></td>
+  <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+  <td>Concept and variable-template template-parameters</td>
+  <td><a href="https://wg21.link/P2841">P2841R7</a></td>
+  <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+  <td>Trivial Relocatability</td>
+  <td><a href="https://wg21.link/P2786">P2786R13</a></td>
+  <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+  <td><pre>#embed</pre></td>
+  <td><a href="https://wg21.link/P1967">P1967R14</a></td>
+  <td class="none" align="center">No</td>
+ </tr>
 </table>
 </details>
 

From 70b95ca6dbee7036dcfa5995ff804471fd7e8c2a Mon Sep 17 00:00:00 2001
From: lntue <lntue@google.com>
Date: Sat, 15 Feb 2025 05:11:54 -0500
Subject: [PATCH 018/109] [libc][math] Fix sqrtf128 implicit conversions.
 (#127154)

This fixes rv32 buildbot failure from
https://github.com/llvm/llvm-project/pull/122578
---
 libc/src/math/generic/sqrtf128.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/libc/src/math/generic/sqrtf128.cpp b/libc/src/math/generic/sqrtf128.cpp
index c844d3afa11c8..3aa7db8362734 100644
--- a/libc/src/math/generic/sqrtf128.cpp
+++ b/libc/src/math/generic/sqrtf128.cpp
@@ -383,25 +383,26 @@ LLVM_LIBC_FUNCTION(float128, sqrtf128, (float128 x)) {
       // 1 so just need to add shifted m and 1.
       Int128 t1 = t0;
       Int128 sgn = t0 >> 127; // sign of the difference
-      t1 -= (m << 1) ^ sgn;
-      t1 += 1 + sgn;
+      Int128 m_xor_sgn = static_cast<Int128>(m << 1) ^ sgn;
+      t1 -= m_xor_sgn;
+      t1 += Int128(1) + sgn;
 
       Int128 sgn1 = t1 >> 127;
       if (LIBC_UNLIKELY(sgn == sgn1)) {
         t0 = t1;
         v -= sgn << 15;
-        t1 -= (m << 1) ^ sgn;
-        t1 += 1 + sgn;
+        t1 -= m_xor_sgn;
+        t1 += Int128(1) + sgn;
       }
 
       if (t1 == 0) {
         // 1 ulp offset brings again an exact root
-        v = (m - (2 * sgn + 1)) << 15;
+        v = (m - static_cast<UInt128>((sgn << 1) + 1)) << 15;
       } else {
         t1 += t0;
         Int128 side = t1 >> 127; // select what is closer m or m+-1
         v &= ~UInt128(0) << 15;  // wipe the fractional bits
-        v -= ((sgn & side) | (~sgn & 1)) << (15 + side);
+        v -= ((sgn & side) | (~sgn & 1)) << (15 + static_cast<int>(side));
         v |= 1; // add sticky bit since we cannot have an exact mid-point
                 // situation
       }

From 2db262886f0c06c079e1b2808c4c14c16f8861b5 Mon Sep 17 00:00:00 2001
From: Edgar <git@edgarluque.com>
Date: Sat, 15 Feb 2025 12:21:20 +0100
Subject: [PATCH 019/109] [MLIR] Fix mlirExecutionEngineLookup throwing assert
 on lookup fail (#123924)

Apparently, trying to look up a function pointer using the C API
`mlirExecutionEngineLookup` triggers an assertion instead of just
returning a nullptr on builds with asserts enabled.

The docs themselves say it returns a nullptr when no function is found, so
it is sensible not to trigger an assertion in this case.
---
 mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp b/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp
index 507be9171d328..306cebd236be9 100644
--- a/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp
+++ b/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp
@@ -85,18 +85,20 @@ mlirExecutionEngineInvokePacked(MlirExecutionEngine jit, MlirStringRef name,
 
 extern "C" void *mlirExecutionEngineLookupPacked(MlirExecutionEngine jit,
                                                  MlirStringRef name) {
-  auto expectedFPtr = unwrap(jit)->lookupPacked(unwrap(name));
-  if (!expectedFPtr)
+  auto optionalFPtr =
+      llvm::expectedToOptional(unwrap(jit)->lookupPacked(unwrap(name)));
+  if (!optionalFPtr)
     return nullptr;
-  return reinterpret_cast<void *>(*expectedFPtr);
+  return reinterpret_cast<void *>(*optionalFPtr);
 }
 
 extern "C" void *mlirExecutionEngineLookup(MlirExecutionEngine jit,
                                            MlirStringRef name) {
-  auto expectedFPtr = unwrap(jit)->lookup(unwrap(name));
-  if (!expectedFPtr)
+  auto optionalFPtr =
+      llvm::expectedToOptional(unwrap(jit)->lookup(unwrap(name)));
+  if (!optionalFPtr)
     return nullptr;
-  return reinterpret_cast<void *>(*expectedFPtr);
+  return *optionalFPtr;
 }
 
 extern "C" void mlirExecutionEngineRegisterSymbol(MlirExecutionEngine jit,

From 42ff31aea5828a491269b4db1ba5cff6fef7ca60 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 15 Feb 2025 11:59:52 +0000
Subject: [PATCH 020/109] [X86] combineTargetShuffle - fold VPERMV3(HI,MASK,LO)
 -> VPERMV(COMMUTE(MASK),CONCAT(LO,HI)) (#127199)

We already handle the simpler VPERMV3(LO,MASK,HI) fold, which can reuse
the (widened) mask. This patch attempts to match the flipped
concatenation and commutes the mask to handle the flip.

I've limited this to cases where we can extract the constant mask for
commutation. A more general solution would XOR the MSB of the shuffle
mask indices to commute, but this almost never constant-folds away
after lowering, so the benefit was minimal.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  25 +-
 .../any_extend_vector_inreg_of_broadcast.ll   |  46 +-
 ...d_vector_inreg_of_broadcast_from_memory.ll |  14 +-
 .../X86/avx512-shuffles/partial_permute.ll    | 498 +++++++++---------
 .../vector-interleaved-load-i16-stride-6.ll   | 184 ++++---
 .../vector-interleaved-load-i32-stride-3.ll   |  76 ++-
 .../vector-interleaved-load-i32-stride-7.ll   |  36 +-
 .../zero_extend_vector_inreg_of_broadcast.ll  |  14 +-
 ...d_vector_inreg_of_broadcast_from_memory.ll |  14 +-
 9 files changed, 439 insertions(+), 468 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1d2d90d543c05..9592137b34842 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42513,10 +42513,12 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
   case X86ISD::VPERMV3: {
     // Combine VPERMV3 to widened VPERMV if the two source operands can be
     // freely concatenated.
-    if (VT.is128BitVector() ||
-        (VT.is256BitVector() && Subtarget.useAVX512Regs())) {
+    MVT WideVT = VT.getDoubleNumVectorElementsVT();
+    MVT MaskVT = N.getOperand(1).getSimpleValueType();
+    bool CanConcat = VT.is128BitVector() ||
+                     (VT.is256BitVector() && Subtarget.useAVX512Regs());
+    if (CanConcat) {
       SDValue Ops[] = {N.getOperand(0), N.getOperand(2)};
-      MVT WideVT = VT.getDoubleNumVectorElementsVT();
       if (SDValue ConcatSrc =
               combineConcatVectorOps(DL, WideVT, Ops, DAG, DCI, Subtarget)) {
         SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
@@ -42530,9 +42532,24 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
     SmallVector<int, 32> Mask;
     if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
       assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
+      // See if we can concatenate the commuted operands.
+      if (CanConcat) {
+        if (SDValue ConcatSrc = combineConcatVectorOps(
+                DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG, DCI,
+                Subtarget)) {
+          ShuffleVectorSDNode::commuteMask(Mask);
+          SDValue NewMask =
+              getConstVector(Mask, MaskVT, DAG, DL, /*IsMask=*/true);
+          NewMask = widenSubVector(NewMask, false, Subtarget, DAG, DL,
+                                   WideVT.getSizeInBits());
+          SDValue Perm =
+              DAG.getNode(X86ISD::VPERMV, DL, WideVT, NewMask, ConcatSrc);
+          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
+                             DAG.getVectorIdxConstant(0, DL));
+        }
+      }
       SDValue V1 = peekThroughBitcasts(N.getOperand(0));
       SDValue V2 = peekThroughBitcasts(N.getOperand(2));
-      MVT MaskVT = N.getOperand(1).getSimpleValueType();
       // Canonicalize to VPERMV if both sources are the same.
       if (V1 == V2) {
         for (int &M : Mask)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 4c4d5cb3166a8..951a2b4cafa26 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -1754,10 +1754,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
-; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1870,10 +1869,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -3776,12 +3774,11 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,16,13,14,15,16,9,10,11,16,13,14,15]
-; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -3911,11 +3908,10 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,16,15,0,0,0,0,16,0,0,0]
-; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -4037,11 +4033,10 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,16,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,25,26,27,28,29,30,31]
+; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -4151,10 +4146,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,0,0,0,0,16,0,0,0]
-; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 16f0614743463..c0afc0cfe2c0a 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1415,10 +1415,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
 ;
 ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
-; AVX512BW-NEXT:    vpermi2w (%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm1, %zmm0
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31]
+; AVX512BW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1502,10 +1501,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i
 ;
 ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpermi2w (%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm1, %zmm0
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31]
+; AVX512BW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index aac5847061cbe..fd9b46e82e0b1 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -227,11 +227,12 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1]
-; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [1,8,11,8,13,8,15,9]
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, ptr %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
@@ -243,11 +244,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16>
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1]
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [1,8,11,8,13,8,15,9]
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i16>, ptr %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
@@ -304,10 +305,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16
 define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm1
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18]
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
   ret <16 x i16> %res
@@ -315,11 +315,11 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18]
 ; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
-; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -330,11 +330,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18]
 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -344,11 +343,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <1
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [14,21,31,29,25,2,19,15,20,27,7,23,3,7,25,10]
 ; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
-; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -359,11 +358,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [14,21,31,29,25,2,19,15,20,27,7,23,3,7,25,10]
 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -373,11 +371,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <1
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [26,3,4,22,1,18,29,17,21,0,20,19,18,12,11,31]
 ; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
-; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -388,11 +386,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [26,3,4,22,1,18,29,17,21,0,20,19,18,12,11,31]
 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -440,10 +437,9 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <1
 define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14]
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT:    vpermt2w %ymm0, %ymm2, %ymm1
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [6,11,23,26,29,5,21,30]
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
@@ -452,11 +448,11 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; CHECK-NEXT:    vpermt2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [6,11,23,26,29,5,21,30]
 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
@@ -468,11 +464,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [6,11,23,26,29,5,21,30]
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
@@ -657,11 +652,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
-; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16]
 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, ptr %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
@@ -673,11 +668,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i1
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16]
 ; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, ptr %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
@@ -731,10 +725,9 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x
 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17]
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm0
-; CHECK-NEXT:    vpermt2w (%rdi), %ymm1, %ymm0
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,1,21,17,30,30,29,1]
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, ptr %vp
@@ -744,11 +737,11 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) {
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm3
-; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,1,21,17,30,30,29,1]
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, ptr %vp
@@ -761,11 +754,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16>
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,1,21,17,30,30,29,1]
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, ptr %vp
@@ -778,11 +770,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm3
-; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [23,22,20,22,28,20,11,17]
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, ptr %vp
@@ -795,11 +787,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16>
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [23,22,20,22,28,20,11,17]
 ; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <32 x i16>, ptr %vp
@@ -1114,11 +1105,12 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,7,7,0]
-; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,3,3,4]
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, ptr %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
@@ -1130,11 +1122,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,7,7,0]
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,3,3,4]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, ptr %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
@@ -1503,11 +1495,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15]
-; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [15,11,14,3,8,9,13,7]
 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
@@ -1519,11 +1511,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32>
 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [15,11,14,3,8,9,13,7]
 ; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
@@ -1535,11 +1526,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10]
-; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [12,6,9,13,12,10,0,2]
 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
@@ -1551,11 +1542,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32>
 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [12,6,9,13,12,10,0,2]
 ; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
@@ -1654,11 +1644,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [15,5,3,2,0,0,0,0]
-; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [7,13,11,10,7,13,15,14]
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
@@ -1671,11 +1661,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [15,5,3,2,0,0,0,0]
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [7,13,11,10,7,13,15,14]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
@@ -1721,9 +1710,10 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32
 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm1
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [2,4,3,6]
-; CHECK-NEXT:    vpermi2d (%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [6,0,7,2]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
@@ -1732,11 +1722,12 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) {
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,4,3,6]
-; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm3
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [6,0,7,2]
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
@@ -1748,11 +1739,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32>
 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,4,3,6]
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,0,7,2]
 ; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x i32>, ptr %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
@@ -2374,11 +2365,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,3,2,4]
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,7,6,0]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
@@ -2398,11 +2389,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,3,2,4]
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,7,6,0]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
@@ -2422,11 +2412,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,5,5,1]
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [7,1,1,5]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
@@ -2446,11 +2436,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [3,5,5,1]
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,1,1,5]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
@@ -2535,11 +2524,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,0,2,5]
-; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,4,6,1]
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
@@ -2551,11 +2540,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,0,2,5]
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,4,6,1]
 ; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
@@ -2656,11 +2644,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,3,1,5]
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [7,7,5,1]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
@@ -2680,11 +2668,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [3,3,1,5]
+; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [7,7,5,1]
 ; CHECK-FAST-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
@@ -2946,9 +2933,10 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [2,6,0,1]
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [6,2,4,5]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2957,12 +2945,13 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,6,0,1]
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [6,2,4,5]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2974,12 +2963,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,6,0,1]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,2,4,5]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
@@ -2991,12 +2980,13 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [2,7,7,2]
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [6,3,3,6]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
@@ -3008,12 +2998,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,7,7,2]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [6,3,3,6]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, ptr %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
@@ -3497,12 +3487,12 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovaps 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1]
-; CHECK-FAST-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
-; CHECK-FAST-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-FAST-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-FAST-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,13,10,11,10,0,0,9]
+; CHECK-FAST-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-FAST-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
@@ -3524,12 +3514,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vmovaps 32(%rdi), %ymm2
-; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
-; CHECK-FAST-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
-; CHECK-FAST-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-FAST-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9]
+; CHECK-FAST-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-FAST-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
@@ -3551,9 +3540,9 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8
 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 32(%rdi), %ymm1
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9]
-; CHECK-NEXT:    vpermi2ps (%rdi), %ymm1, %ymm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [15,13,11,11,3,12,4,1]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
@@ -3562,12 +3551,12 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) {
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9]
-; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [15,13,11,11,3,12,4,1]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
@@ -3579,12 +3568,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
-; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [15,13,11,11,3,12,4,1]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
@@ -3644,12 +3632,12 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,10,6,15,0,0,0,0]
-; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,2,14,7,12,6,14,7]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -3662,12 +3650,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,10,6,15,0,0,0,0]
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [8,2,14,7,12,6,14,7]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -3680,13 +3667,12 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovddup {{.*#+}} xmm2 = [4,14,4,14]
-; CHECK-NEXT:    # xmm2 = mem[0,0]
-; CHECK-NEXT:    vmovaps 32(%rdi), %ymm3
-; CHECK-NEXT:    vpermt2ps (%rdi), %ymm2, %ymm3
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [12,6,12,6,12,6,12,6]
+; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -3699,13 +3685,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovddup {{.*#+}} xmm2 = [4,14,4,14]
-; CHECK-NEXT:    # xmm2 = mem[0,0]
-; CHECK-NEXT:    vmovaps 32(%rdi), %ymm1
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT:    vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [12,6,12,6,12,6,12,6]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, ptr %vp
@@ -4527,12 +4511,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp,
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [2,4,1,5]
-; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [6,0,5,1]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
@@ -4544,12 +4528,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [2,4,1,5]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [6,0,5,1]
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
@@ -4593,9 +4576,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp,
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 32(%rdi), %ymm1
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,2,4,1]
-; CHECK-NEXT:    vpermi2pd (%rdi), %ymm1, %ymm0
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [4,6,0,5]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
@@ -4604,12 +4587,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) {
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,2,4,1]
-; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [4,6,0,5]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
@@ -4621,12 +4604,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,1]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,6,0,5]
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, ptr %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index c3b53211978ae..9d0183c816b12 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -582,20 +582,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
 ; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512-NEXT:    vpermd (%rdi), %zmm1, %zmm1
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512-NEXT:    vpermd %zmm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512-NEXT:    vpermt2d (%rdi), %ymm4, %ymm5
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
+; AVX512-NEXT:    vpermd %zmm2, %zmm5, %zmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vmovq %xmm3, (%rsi)
 ; AVX512-NEXT:    vmovq %xmm0, (%rdx)
-; AVX512-NEXT:    vmovq %xmm2, (%rcx)
+; AVX512-NEXT:    vmovq %xmm4, (%rcx)
 ; AVX512-NEXT:    vmovq %xmm1, (%r8)
-; AVX512-NEXT:    vmovq %xmm4, (%r9)
-; AVX512-NEXT:    vmovq %xmm5, (%rax)
+; AVX512-NEXT:    vmovq %xmm5, (%r9)
+; AVX512-NEXT:    vmovq %xmm2, (%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
@@ -613,20 +613,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512-FCP-NEXT:    vpermd (%rdi), %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm1, %zmm1
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpermt2d (%rdi), %ymm4, %ymm5
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
+; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm5, %zmm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vmovq %xmm3, (%rsi)
 ; AVX512-FCP-NEXT:    vmovq %xmm0, (%rdx)
-; AVX512-FCP-NEXT:    vmovq %xmm2, (%rcx)
+; AVX512-FCP-NEXT:    vmovq %xmm4, (%rcx)
 ; AVX512-FCP-NEXT:    vmovq %xmm1, (%r8)
-; AVX512-FCP-NEXT:    vmovq %xmm4, (%r9)
-; AVX512-FCP-NEXT:    vmovq %xmm5, (%rax)
+; AVX512-FCP-NEXT:    vmovq %xmm5, (%r9)
+; AVX512-FCP-NEXT:    vmovq %xmm2, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -645,20 +645,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
 ; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512DQ-NEXT:    vpermd (%rdi), %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-NEXT:    vpermt2d (%rdi), %ymm4, %ymm5
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
+; AVX512DQ-NEXT:    vpermd %zmm2, %zmm5, %zmm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vmovq %xmm3, (%rsi)
 ; AVX512DQ-NEXT:    vmovq %xmm0, (%rdx)
-; AVX512DQ-NEXT:    vmovq %xmm2, (%rcx)
+; AVX512DQ-NEXT:    vmovq %xmm4, (%rcx)
 ; AVX512DQ-NEXT:    vmovq %xmm1, (%r8)
-; AVX512DQ-NEXT:    vmovq %xmm4, (%r9)
-; AVX512DQ-NEXT:    vmovq %xmm5, (%rax)
+; AVX512DQ-NEXT:    vmovq %xmm5, (%r9)
+; AVX512DQ-NEXT:    vmovq %xmm2, (%rax)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -676,20 +676,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512DQ-FCP-NEXT:    vpermd (%rdi), %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vpermt2d (%rdi), %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
+; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm2, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovq %xmm4, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm1, (%r8)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm4, (%r9)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm5, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovq %xmm5, (%r9)
+; AVX512DQ-FCP-NEXT:    vmovq %xmm2, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -2876,22 +2876,20 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
 ; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm7
-; AVX512BW-NEXT:    vpermi2w %ymm6, %ymm7, %ymm2
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
-; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm8
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
-; AVX512BW-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpermi2w %ymm6, %ymm7, %ymm8
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512BW-NEXT:    vpermw %zmm5, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
 ; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
+; AVX512BW-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512BW-NEXT:    vpermw %zmm5, %zmm6, %zmm6
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm7
+; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
 ; AVX512BW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512BW-NEXT:    vpermw %zmm5, %zmm7, %zmm7
@@ -2933,22 +2931,20 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm2
 ; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
+; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
 ; AVX512BW-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm7
-; AVX512BW-FCP-NEXT:    vpermi2w %ymm6, %ymm7, %ymm2
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm8
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
-; AVX512BW-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpermi2w %ymm6, %ymm7, %ymm8
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
 ; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
+; AVX512BW-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm7
+; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
 ; AVX512BW-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm7, %zmm7
@@ -2990,22 +2986,20 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT:    vpermi2w %zmm4, %zmm3, %zmm2
 ; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
+; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
 ; AVX512DQ-BW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm7
-; AVX512DQ-BW-NEXT:    vpermi2w %ymm6, %ymm7, %ymm2
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
-; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm8
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
-; AVX512DQ-BW-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpermi2w %ymm6, %ymm7, %ymm8
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
 ; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
-; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
+; AVX512DQ-BW-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm6, %zmm6
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm4, %zmm7
+; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
 ; AVX512DQ-BW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512DQ-BW-NEXT:    vpermw %zmm5, %zmm7, %zmm7
@@ -3047,22 +3041,20 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm4, %zmm3, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
 ; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
 ; AVX512DQ-BW-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm6, %ymm7, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13]
-; AVX512DQ-BW-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm6, %ymm7, %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
+; AVX512DQ-BW-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm4, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
 ; AVX512DQ-BW-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm7, %zmm7
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index d9383f524f1d1..34f23213500c1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -103,16 +103,15 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512-FCP-LABEL: load_i32_stride3_vf2:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [5,0]
-; AVX512-FCP-NEXT:    vpermi2d %xmm0, %xmm1, %xmm3
-; AVX512-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm0
-; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-FCP-NEXT:    vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT:    vmovq %xmm3, (%rdx)
-; AVX512-FCP-NEXT:    vmovq %xmm0, (%rcx)
+; AVX512-FCP-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,3,2,3]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7]
+; AVX512-FCP-NEXT:    vpermps (%rdi), %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vbroadcastss 8(%rdi), %xmm2
+; AVX512-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX512-FCP-NEXT:    vmovlps %xmm0, (%rsi)
+; AVX512-FCP-NEXT:    vmovlps %xmm1, (%rdx)
+; AVX512-FCP-NEXT:    vmovlps %xmm2, (%rcx)
+; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: load_i32_stride3_vf2:
@@ -131,16 +130,15 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf2:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [5,0]
-; AVX512DQ-FCP-NEXT:    vpermi2d %xmm0, %xmm1, %xmm3
-; AVX512DQ-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT:    vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rdx)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rcx)
+; AVX512DQ-FCP-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,3,2,3]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7]
+; AVX512DQ-FCP-NEXT:    vpermps (%rdi), %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vbroadcastss 8(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT:    vmovlps %xmm0, (%rsi)
+; AVX512DQ-FCP-NEXT:    vmovlps %xmm1, (%rdx)
+; AVX512DQ-FCP-NEXT:    vmovlps %xmm2, (%rcx)
+; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_i32_stride3_vf2:
@@ -159,16 +157,15 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512BW-FCP-LABEL: load_i32_stride3_vf2:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [5,0]
-; AVX512BW-FCP-NEXT:    vpermi2d %xmm0, %xmm1, %xmm3
-; AVX512BW-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm0
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BW-FCP-NEXT:    vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,3,2,3]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7]
+; AVX512BW-FCP-NEXT:    vpermps (%rdi), %ymm1, %ymm1
+; AVX512BW-FCP-NEXT:    vbroadcastss 8(%rdi), %xmm2
+; AVX512BW-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX512BW-FCP-NEXT:    vmovlps %xmm0, (%rsi)
+; AVX512BW-FCP-NEXT:    vmovlps %xmm1, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovlps %xmm2, (%rcx)
+; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: load_i32_stride3_vf2:
@@ -187,16 +184,15 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf2:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [5,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2d %xmm0, %xmm1, %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastd 8(%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,3,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermps (%rdi), %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vbroadcastss 8(%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm1, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm2, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <6 x i32>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 0, i32 3>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index 955a7ffcec795..7948141f6becd 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -239,17 +239,16 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT:    vpermi2d %xmm1, %xmm2, %xmm6
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
 ; AVX512-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm1
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX512-FCP-NEXT:    vpermt2d (%rdi), %ymm2, %ymm7
 ; AVX512-FCP-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512-FCP-NEXT:    vpermps %zmm0, %zmm2, %zmm0
+; AVX512-FCP-NEXT:    vpermps %zmm0, %zmm2, %zmm2
+; AVX512-FCP-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
+; AVX512-FCP-NEXT:    vpermps %zmm0, %zmm7, %zmm0
 ; AVX512-FCP-NEXT:    vmovq %xmm3, (%rsi)
 ; AVX512-FCP-NEXT:    vmovq %xmm4, (%rdx)
 ; AVX512-FCP-NEXT:    vmovq %xmm5, (%rcx)
 ; AVX512-FCP-NEXT:    vmovq %xmm6, (%r8)
 ; AVX512-FCP-NEXT:    vmovlps %xmm1, (%r9)
-; AVX512-FCP-NEXT:    vmovq %xmm7, (%r10)
+; AVX512-FCP-NEXT:    vmovlps %xmm2, (%r10)
 ; AVX512-FCP-NEXT:    vmovlps %xmm0, (%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
@@ -304,17 +303,16 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT:    vpermi2d %xmm1, %xmm2, %xmm6
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
 ; AVX512DQ-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT:    vpermt2d (%rdi), %ymm2, %ymm7
 ; AVX512DQ-FCP-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512DQ-FCP-NEXT:    vpermps %zmm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT:    vpermps %zmm0, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
+; AVX512DQ-FCP-NEXT:    vpermps %zmm0, %zmm7, %zmm0
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rsi)
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm4, (%rdx)
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm5, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm6, (%r8)
 ; AVX512DQ-FCP-NEXT:    vmovlps %xmm1, (%r9)
-; AVX512DQ-FCP-NEXT:    vmovq %xmm7, (%r10)
+; AVX512DQ-FCP-NEXT:    vmovlps %xmm2, (%r10)
 ; AVX512DQ-FCP-NEXT:    vmovlps %xmm0, (%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
@@ -369,17 +367,16 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpermi2d %xmm1, %xmm2, %xmm6
 ; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
 ; AVX512BW-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX512BW-FCP-NEXT:    vpermt2d (%rdi), %ymm2, %ymm7
 ; AVX512BW-FCP-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512BW-FCP-NEXT:    vpermps %zmm0, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT:    vpermps %zmm0, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
+; AVX512BW-FCP-NEXT:    vpermps %zmm0, %zmm7, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rsi)
 ; AVX512BW-FCP-NEXT:    vmovq %xmm4, (%rdx)
 ; AVX512BW-FCP-NEXT:    vmovq %xmm5, (%rcx)
 ; AVX512BW-FCP-NEXT:    vmovq %xmm6, (%r8)
 ; AVX512BW-FCP-NEXT:    vmovlps %xmm1, (%r9)
-; AVX512BW-FCP-NEXT:    vmovq %xmm7, (%r10)
+; AVX512BW-FCP-NEXT:    vmovlps %xmm2, (%r10)
 ; AVX512BW-FCP-NEXT:    vmovlps %xmm0, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
@@ -434,17 +431,16 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2d %xmm1, %xmm2, %xmm6
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpermt2d (%rdi), %ymm2, %ymm7
 ; AVX512DQ-BW-FCP-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpermps %zmm0, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpermps %zmm0, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermps %zmm0, %zmm7, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm4, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm5, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm6, (%r8)
 ; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm1, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm7, (%r10)
+; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm2, (%r10)
 ; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index c9b10d9cc8668..ec7a708fc0b02 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -1754,10 +1754,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
-; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1870,10 +1869,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31]
+; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 5ba2257e2b49e..14c2a60a5b998 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1415,10 +1415,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
 ;
 ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
-; AVX512BW-NEXT:    vpermi2w (%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm1, %zmm0
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31]
+; AVX512BW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1502,10 +1501,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i
 ;
 ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpermi2w (%rdi), %ymm0, %ymm1
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm1, %zmm0
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31]
+; AVX512BW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq

From a6093d30348d7116b1112f7532743fda50258d67 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Sat, 15 Feb 2025 14:56:19 +0100
Subject: [PATCH 021/109] [libc++] Explicitly mention vector_bool in the name
 of benchmarks (#127313)

We have some benchmarks that were benchmarking very specific
functionality, namely the optimizations in vector<bool>::iterator. Call
this out in the benchmarks by renaming them appropriately. In the future
we will also increase the coverage of these benchmarks to test other
containers.
---
 libcxx/test/benchmarks/algorithms/fill.bench.cpp | 16 ++++++++--------
 .../algorithms/ranges_contains.bench.cpp         | 12 ++++++------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/libcxx/test/benchmarks/algorithms/fill.bench.cpp b/libcxx/test/benchmarks/algorithms/fill.bench.cpp
index c157b5e5c9862..6a48b25b7eb63 100644
--- a/libcxx/test/benchmarks/algorithms/fill.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/fill.bench.cpp
@@ -12,40 +12,40 @@
 #include <benchmark/benchmark.h>
 #include <vector>
 
-static void bm_fill_n(benchmark::State& state) {
+static void bm_fill_n_vector_bool(benchmark::State& state) {
   std::vector<bool> vec1(state.range());
   for (auto _ : state) {
     benchmark::DoNotOptimize(vec1);
     benchmark::DoNotOptimize(std::fill_n(vec1.begin(), vec1.size(), false));
   }
 }
-BENCHMARK(bm_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_fill_n_vector_bool)->DenseRange(1, 8)->Range(16, 1 << 20);
 
-static void bm_ranges_fill_n(benchmark::State& state) {
+static void bm_ranges_fill_n_vector_bool(benchmark::State& state) {
   std::vector<bool> vec1(state.range());
   for (auto _ : state) {
     benchmark::DoNotOptimize(vec1);
     benchmark::DoNotOptimize(std::ranges::fill_n(vec1.begin(), vec1.size(), false));
   }
 }
-BENCHMARK(bm_ranges_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_ranges_fill_n_vector_bool)->DenseRange(1, 8)->Range(16, 1 << 20);
 
-static void bm_fill(benchmark::State& state) {
+static void bm_fill_vector_bool(benchmark::State& state) {
   std::vector<bool> vec1(state.range());
   for (auto _ : state) {
     benchmark::DoNotOptimize(vec1);
     std::fill(vec1.begin(), vec1.end(), false);
   }
 }
-BENCHMARK(bm_fill)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_fill_vector_bool)->DenseRange(1, 8)->Range(16, 1 << 20);
 
-static void bm_ranges_fill(benchmark::State& state) {
+static void bm_ranges_fill_vector_bool(benchmark::State& state) {
   std::vector<bool> vec1(state.range());
   for (auto _ : state) {
     benchmark::DoNotOptimize(vec1);
     benchmark::DoNotOptimize(std::ranges::fill(vec1, false));
   }
 }
-BENCHMARK(bm_ranges_fill)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_ranges_fill_vector_bool)->DenseRange(1, 8)->Range(16, 1 << 20);
 
 BENCHMARK_MAIN();
diff --git a/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp
index b98e17a00ef83..c9a10202c8cfc 100644
--- a/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp
@@ -15,7 +15,7 @@
 
 #include "test_iterators.h"
 
-static void bm_contains_char(benchmark::State& state) {
+static void bm_contains_vector_char(benchmark::State& state) {
   std::vector<char> a(state.range(), 'a');
 
   for (auto _ : state) {
@@ -24,9 +24,9 @@ static void bm_contains_char(benchmark::State& state) {
     benchmark::DoNotOptimize(std::ranges::contains(a.begin(), a.end(), 'B'));
   }
 }
-BENCHMARK(bm_contains_char)->RangeMultiplier(16)->Range(16, 16 << 20);
+BENCHMARK(bm_contains_vector_char)->RangeMultiplier(16)->Range(16, 16 << 20);
 
-static void bm_contains_int(benchmark::State& state) {
+static void bm_contains_vector_int(benchmark::State& state) {
   std::vector<int> a(state.range(), 1);
 
   for (auto _ : state) {
@@ -35,9 +35,9 @@ static void bm_contains_int(benchmark::State& state) {
     benchmark::DoNotOptimize(std::ranges::contains(a.begin(), a.end(), 2));
   }
 }
-BENCHMARK(bm_contains_int)->RangeMultiplier(16)->Range(16, 16 << 20);
+BENCHMARK(bm_contains_vector_int)->RangeMultiplier(16)->Range(16, 16 << 20);
 
-static void bm_contains_bool(benchmark::State& state) {
+static void bm_contains_vector_bool(benchmark::State& state) {
   std::vector<bool> a(state.range(), true);
 
   for (auto _ : state) {
@@ -46,6 +46,6 @@ static void bm_contains_bool(benchmark::State& state) {
     benchmark::DoNotOptimize(std::ranges::contains(a.begin(), a.end(), false));
   }
 }
-BENCHMARK(bm_contains_bool)->RangeMultiplier(16)->Range(16, 16 << 20);
+BENCHMARK(bm_contains_vector_bool)->RangeMultiplier(16)->Range(16, 16 << 20);
 
 BENCHMARK_MAIN();

From 88284e4efce09b0c9f46c3893554481815badf01 Mon Sep 17 00:00:00 2001
From: realqhc <caiqihan021@hotmail.com>
Date: Sat, 15 Feb 2025 22:26:02 +0800
Subject: [PATCH 022/109] [RISCV] Support Zb*/P Shared Instructions (#127160)

This enables shared instructions between Zb* and Base-P extension.

Documentation:
https://jhauser.us/RISCV/ext-P/RVP-baseInstrs-014.pdf
https://jhauser.us/RISCV/ext-P/RVP-instrEncodings-014.pdf
---
 .../Driver/print-supported-extensions-riscv.c |  1 +
 llvm/lib/Target/RISCV/RISCVFeatures.td        | 33 ++++++++++++++++
 llvm/lib/Target/RISCV/RISCVInstrInfoZb.td     | 26 +++++++------
 llvm/test/MC/RISCV/attribute-arch.s           |  6 +++
 llvm/test/MC/RISCV/rv32i-invalid.s            |  4 +-
 llvm/test/MC/RISCV/rv32p-valid.s              | 36 +++++++++++++++++
 llvm/test/MC/RISCV/rv64p-valid.s              | 39 +++++++++++++++++++
 .../TargetParser/RISCVISAInfoTest.cpp         |  1 +
 8 files changed, 133 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/MC/RISCV/rv32p-valid.s
 create mode 100644 llvm/test/MC/RISCV/rv64p-valid.s

diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
index 3443ff0b69de9..49c5bfca2716f 100644
--- a/clang/test/Driver/print-supported-extensions-riscv.c
+++ b/clang/test/Driver/print-supported-extensions-riscv.c
@@ -182,6 +182,7 @@
 // CHECK-NEXT:     xwchc                2.2       'Xwchc' (WCH/QingKe additional compressed opcodes)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
+// CHECK-NEXT:     p                    0.14      'P' ('Base P' (Packed SIMD))
 // CHECK-NEXT:     zicfilp              1.0       'Zicfilp' (Landing pad)
 // CHECK-NEXT:     zicfiss              1.0       'Zicfiss' (Shadow stack)
 // CHECK-NEXT:     zalasr               0.1       'Zalasr' (Load-Acquire and Store-Release Instructions)
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 51aa8d7d307e4..30595119e37bf 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1016,6 +1016,39 @@ def HasStdExtSmctrOrSsctr : Predicate<"Subtarget->hasStdExtSmctrOrSsctr()">,
                                "'Smctr' (Control Transfer Records Machine Level) or "
                                "'Ssctr' (Control Transfer Records Supervisor Level)">;
 
+// Packed SIMD Extensions
+def FeatureStdExtP
+    : RISCVExperimentalExtension<0, 14,
+                                 "'Base P' (Packed SIMD)">;
+def HasStdExtP : Predicate<"Subtarget->hasStdExtP()">,
+                 AssemblerPredicate<(all_of FeatureStdExtP),
+                                    "'Base P' (Packed SIMD)">;
+
+def HasStdExtZbaOrP
+    : Predicate<"Subtarget->hasStdExtZba() || Subtarget->hasStdExtP()">,
+      AssemblerPredicate<(any_of FeatureStdExtZba, FeatureStdExtP),
+                         "'Zba' (Address Generation Instructions) or "
+                         "'Base P' (Packed-SIMD)">;
+
+def HasStdExtZbbOrP
+    : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtP()">,
+      AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtP),
+                         "'Zbb' (Basic Bit-Manipulation) or "
+                         "'Base P' (Packed-SIMD)">;
+
+def HasStdExtZbkbOrP
+    : Predicate<"Subtarget->hasStdExtZbkb() || Subtarget->hasStdExtP()">,
+      AssemblerPredicate<(any_of FeatureStdExtZbkb, FeatureStdExtP),
+                         "'Zbkb' (Bitmanip instructions for Cryptography) or "
+                         "'Base P' (Packed-SIMD)">;
+
+def HasStdExtZbbOrZbkbOrP
+    : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb() || Subtarget->hasStdExtP()">,
+      AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbkb, FeatureStdExtP),
+                         "'Zbb' (Basic Bit-Manipulation) or "
+                         "'Zbkb' (Bitmanip instructions for Cryptography) or "
+                         "'Base P' (Packed-SIMD)">;
+
 //===----------------------------------------------------------------------===//
 // Vendor extensions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 124caa3b69d31..2ce909c5d0e21 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -263,9 +263,10 @@ def XNOR  : ALU_rr<0b0100000, 0b100, "xnor">,
             Sched<[WriteIALU, ReadIALU, ReadIALU]>;
 } // Predicates = [HasStdExtZbbOrZbkb]
 
-let Predicates = [HasStdExtZba] in {
+let Predicates = [HasStdExtZbaOrP] in
 def SH1ADD : ALU_rr<0b0010000, 0b010, "sh1add">,
              Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>;
+let Predicates = [HasStdExtZba] in {
 def SH2ADD : ALU_rr<0b0010000, 0b100, "sh2add">,
              Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>;
 def SH3ADD : ALU_rr<0b0010000, 0b110, "sh3add">,
@@ -337,30 +338,32 @@ def XPERM8 : ALU_rr<0b0010100, 0b100, "xperm8">,
              Sched<[WriteXPERM, ReadXPERM, ReadXPERM]>;
 } // Predicates = [HasStdExtZbkx]
 
-let Predicates = [HasStdExtZbb], IsSignExtendingOpW = 1 in {
+let Predicates = [HasStdExtZbbOrP], IsSignExtendingOpW = 1 in
 def CLZ  : Unary_r<0b011000000000, 0b001, "clz">,
            Sched<[WriteCLZ, ReadCLZ]>;
+let Predicates = [HasStdExtZbb], IsSignExtendingOpW = 1 in {
 def CTZ  : Unary_r<0b011000000001, 0b001, "ctz">,
            Sched<[WriteCTZ, ReadCTZ]>;
 def CPOP : Unary_r<0b011000000010, 0b001, "cpop">,
            Sched<[WriteCPOP, ReadCPOP]>;
 } // Predicates = [HasStdExtZbb]
 
-let Predicates = [HasStdExtZbb, IsRV64], IsSignExtendingOpW = 1 in {
+let Predicates = [HasStdExtZbbOrP, IsRV64], IsSignExtendingOpW = 1 in
 def CLZW  : UnaryW_r<0b011000000000, 0b001, "clzw">,
             Sched<[WriteCLZ32, ReadCLZ32]>;
+let Predicates = [HasStdExtZbb, IsRV64], IsSignExtendingOpW = 1 in {
 def CTZW  : UnaryW_r<0b011000000001, 0b001, "ctzw">,
             Sched<[WriteCTZ32, ReadCTZ32]>;
 def CPOPW : UnaryW_r<0b011000000010, 0b001, "cpopw">,
             Sched<[WriteCPOP32, ReadCPOP32]>;
 } // Predicates = [HasStdExtZbb, IsRV64]
 
-let Predicates = [HasStdExtZbb], IsSignExtendingOpW = 1 in {
+let Predicates = [HasStdExtZbbOrP], IsSignExtendingOpW = 1 in {
 def SEXT_B : Unary_r<0b011000000100, 0b001, "sext.b">,
              Sched<[WriteIALU, ReadIALU]>;
 def SEXT_H : Unary_r<0b011000000101, 0b001, "sext.h">,
              Sched<[WriteIALU, ReadIALU]>;
-} // Predicates = [HasStdExtZbb]
+} // Predicates = [HasStdExtZbbOrP]
 
 let Predicates = [HasStdExtZbc] in {
 def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr", Commutable=1>,
@@ -374,7 +377,7 @@ def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh", Commutable=1>,
              Sched<[WriteCLMUL, ReadCLMUL, ReadCLMUL]>;
 } // Predicates = [HasStdExtZbcOrZbkc]
 
-let Predicates = [HasStdExtZbb] in {
+let Predicates = [HasStdExtZbbOrP] in {
 def MIN  : ALU_rr<0b0000101, 0b100, "min", Commutable=1>,
            Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>;
 def MINU : ALU_rr<0b0000101, 0b101, "minu", Commutable=1>,
@@ -385,9 +388,10 @@ def MAXU : ALU_rr<0b0000101, 0b111, "maxu", Commutable=1>,
            Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>;
 } // Predicates = [HasStdExtZbb]
 
-let Predicates = [HasStdExtZbkb] in {
+let Predicates = [HasStdExtZbkbOrP] in
 def PACK  : ALU_rr<0b0000100, 0b100, "pack">,
             Sched<[WritePACK, ReadPACK, ReadPACK]>;
+let Predicates = [HasStdExtZbkb] in {
 let IsSignExtendingOpW = 1 in
 def PACKH : ALU_rr<0b0000100, 0b111, "packh">,
             Sched<[WritePACK, ReadPACK, ReadPACK]>;
@@ -407,15 +411,15 @@ def ZEXT_H_RV64 : RVBUnaryR<0b0000100, 0b100, OPC_OP_32, "zext.h">,
                   Sched<[WriteIALU, ReadIALU]>;
 } // Predicates = [HasStdExtZbb, IsRV64]
 
-let Predicates = [HasStdExtZbbOrZbkb, IsRV32] in {
+let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV32] in {
 def REV8_RV32 : Unary_r<0b011010011000, 0b101, "rev8">,
                 Sched<[WriteREV8, ReadREV8]>;
-} // Predicates = [HasStdExtZbbOrZbkb, IsRV32]
+} // Predicates = [HasStdExtZbbOrZbkbOrP, IsRV32]
 
-let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in {
+let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV64] in {
 def REV8_RV64 : Unary_r<0b011010111000, 0b101, "rev8">,
                 Sched<[WriteREV8, ReadREV8]>;
-} // Predicates = [HasStdExtZbbOrZbkb, IsRV64]
+} // Predicates = [HasStdExtZbbOrZbkbOrP, IsRV64]
 
 let Predicates = [HasStdExtZbb] in {
 def ORC_B : Unary_r<0b001010000111, 0b101, "orc.b">,
diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s
index 4e77a53bd706c..a8bb9b7e6cef1 100644
--- a/llvm/test/MC/RISCV/attribute-arch.s
+++ b/llvm/test/MC/RISCV/attribute-arch.s
@@ -473,3 +473,9 @@
 
 .attribute arch, "rv32i_sdtrig1p0"
 # CHECK: attribute      5, "rv32i2p1_sdtrig1p0"
+
+.attribute arch, "rv32i_p0p14"
+# CHECK: attribute      5, "rv32i2p1_p0p14"
+
+.attribute arch, "rv64i_p0p14"
+# CHECK: attribute      5, "rv64i2p1_p0p14"
\ No newline at end of file
diff --git a/llvm/test/MC/RISCV/rv32i-invalid.s b/llvm/test/MC/RISCV/rv32i-invalid.s
index ac0e3c6c1bdbf..1ffb10789bbbd 100644
--- a/llvm/test/MC/RISCV/rv32i-invalid.s
+++ b/llvm/test/MC/RISCV/rv32i-invalid.s
@@ -191,8 +191,8 @@ fadd.s a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the followi
 fadd.d a0, a2, a4 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zdinx' (Double in Integer){{$}}
 fadd.h a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zhinx' (Half Float in Integer){{$}}
 flh ft0, (a0) # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) or 'Zfhmin' (Half-Precision Floating-Point Minimal){{$}}
-sh1add a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zba' (Address Generation Instructions){{$}}
-clz a0, a1 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zbb' (Basic Bit-Manipulation){{$}}
+sh1add a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zba' (Address Generation Instructions) or 'Base P' (Packed-SIMD){{$}}
+clz a0, a1 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zbb' (Basic Bit-Manipulation) or 'Base P' (Packed-SIMD){{$}}
 clmul a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zbc' (Carry-Less Multiplication) or 'Zbkc' (Carry-less multiply instructions for Cryptography){{$}}
 bset a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zbs' (Single-Bit Instructions){{$}}
 pause # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zihintpause' (Pause Hint){{$}}
diff --git a/llvm/test/MC/RISCV/rv32p-valid.s b/llvm/test/MC/RISCV/rv32p-valid.s
new file mode 100644
index 0000000000000..011de0c0d1579
--- /dev/null
+++ b/llvm/test/MC/RISCV/rv32p-valid.s
@@ -0,0 +1,36 @@
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-p -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-p < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-p -M no-aliases -d -r - \
+# RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
+
+# CHECK-ASM-AND-OBJ: sh1add a0, a1, a2
+# CHECK-ASM: encoding: [0x33,0xa5,0xc5,0x20]
+sh1add a0, a1, a2
+# CHECK-ASM-AND-OBJ: clz a0, a1
+# CHECK-ASM: encoding: [0x13,0x95,0x05,0x60]
+clz a0, a1
+# CHECK-ASM-AND-OBJ: sext.b a2, a3
+# CHECK-ASM: encoding: [0x13,0x96,0x46,0x60]
+sext.b a2, a3
+# CHECK-ASM-AND-OBJ: sext.h t0, t1
+# CHECK-ASM: encoding: [0x93,0x12,0x53,0x60]
+sext.h t0, t1
+# CHECK-ASM-AND-OBJ: min t0, t1, t2
+# CHECK-ASM: encoding: [0xb3,0x42,0x73,0x0a]
+min t0, t1, t2
+# CHECK-ASM-AND-OBJ: minu t0, t1, t2
+# CHECK-ASM: encoding: [0xb3,0x52,0x73,0x0a]
+minu t0, t1, t2
+# CHECK-ASM-AND-OBJ: max t3, t4, t5
+# CHECK-ASM: encoding: [0x33,0xee,0xee,0x0b]
+max t3, t4, t5
+# CHECK-ASM-AND-OBJ: maxu a4, a5, a6
+# CHECK-ASM: encoding: [0x33,0xf7,0x07,0x0b]
+maxu a4, a5, a6
+# CHECK-ASM-AND-OBJ: pack s0, s1, s2
+# CHECK-ASM: encoding: [0x33,0xc4,0x24,0x09]
+pack s0, s1, s2
+# CHECK-ASM-AND-OBJ: rev8 s0, s1
+# CHECK-ASM: encoding: [0x13,0xd4,0x84,0x69]
+rev8 s0, s1
diff --git a/llvm/test/MC/RISCV/rv64p-valid.s b/llvm/test/MC/RISCV/rv64p-valid.s
new file mode 100644
index 0000000000000..48fa26aaaffe4
--- /dev/null
+++ b/llvm/test/MC/RISCV/rv64p-valid.s
@@ -0,0 +1,39 @@
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-p -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-p < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-p -M no-aliases -d -r - \
+# RUN:     | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
+
+# CHECK-ASM-AND-OBJ: sh1add a0, a1, a2
+# CHECK-ASM: encoding: [0x33,0xa5,0xc5,0x20]
+sh1add a0, a1, a2
+# CHECK-ASM-AND-OBJ: clz a0, a1
+# CHECK-ASM: encoding: [0x13,0x95,0x05,0x60]
+clz a0, a1
+# CHECK-ASM-AND-OBJ: clzw s0, s1
+# CHECK-ASM: encoding: [0x1b,0x94,0x04,0x60]
+clzw s0, s1
+# CHECK-ASM-AND-OBJ: sext.b a2, a3
+# CHECK-ASM: encoding: [0x13,0x96,0x46,0x60]
+sext.b a2, a3
+# CHECK-ASM-AND-OBJ: sext.h t0, t1
+# CHECK-ASM: encoding: [0x93,0x12,0x53,0x60]
+sext.h t0, t1
+# CHECK-ASM-AND-OBJ: min t0, t1, t2
+# CHECK-ASM: encoding: [0xb3,0x42,0x73,0x0a]
+min t0, t1, t2
+# CHECK-ASM-AND-OBJ: minu t0, t1, t2
+# CHECK-ASM: encoding: [0xb3,0x52,0x73,0x0a]
+minu t0, t1, t2
+# CHECK-ASM-AND-OBJ: max t3, t4, t5
+# CHECK-ASM: encoding: [0x33,0xee,0xee,0x0b]
+max t3, t4, t5
+# CHECK-ASM-AND-OBJ: maxu a4, a5, a6
+# CHECK-ASM: encoding: [0x33,0xf7,0x07,0x0b]
+maxu a4, a5, a6
+# CHECK-ASM-AND-OBJ: pack s0, s1, s2
+# CHECK-ASM: encoding: [0x33,0xc4,0x24,0x09]
+pack s0, s1, s2
+# CHECK-ASM-AND-OBJ: rev8 s0, s1
+# CHECK-ASM: encoding: [0x13,0xd4,0x84,0x6b]
+rev8 s0, s1
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 7ebfcf915a7c5..563f587d9d1c0 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -1108,6 +1108,7 @@ R"(All available -march extensions for RISC-V
     xwchc                2.2
 
 Experimental extensions
+    p                    0.14
     zicfilp              1.0       This is a long dummy description
     zicfiss              1.0
     zalasr               0.1

From 21e956df9b2b283c2f2ed710c542ebeebf8473ff Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao@gmail.com>
Date: Mon, 3 Feb 2025 09:36:44 -0500
Subject: [PATCH 023/109] [CodeGen] Remove two dead pass initializer decls. NFC

- After #97727 and #101652, `LowerConstantIntrinsics` and
  `ExpandVectorPredicationPass` are no longer dedicated passes.
---
 llvm/include/llvm/InitializePasses.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index b8df4d1ecab1d..da4ffcd83213a 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -111,7 +111,6 @@ void initializeExpandMemCmpLegacyPassPass(PassRegistry &);
 void initializeExpandPostRAPass(PassRegistry &);
 void initializeExpandReductionsPass(PassRegistry &);
 void initializeExpandVariadicsPass(PassRegistry &);
-void initializeExpandVectorPredicationPass(PassRegistry &);
 void initializeExternalAAWrapperPassPass(PassRegistry &);
 void initializeFEntryInserterPass(PassRegistry &);
 void initializeFinalizeISelPass(PassRegistry &);
@@ -174,7 +173,6 @@ void initializeLoopStrengthReducePass(PassRegistry &);
 void initializeLoopTermFoldPass(PassRegistry &);
 void initializeLoopUnrollPass(PassRegistry &);
 void initializeLowerAtomicLegacyPassPass(PassRegistry &);
-void initializeLowerConstantIntrinsicsPass(PassRegistry &);
 void initializeLowerEmuTLSPass(PassRegistry &);
 void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &);
 void initializeLowerIntrinsicsPass(PassRegistry &);

From 4664a4c66b816af53f596935c3aaa2eca143ae9c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 15 Feb 2025 16:17:42 +0100
Subject: [PATCH 024/109] [LAA] Use getPointer/setPointer in
 createCheckForAccess (NFC).

Use getPointer/setPointer to clarify we are accessing/modifying the
current value.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 4bdcccdae0b7e..e5b87d2d16230 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1143,9 +1143,8 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
   SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
       findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
 
-  for (const auto &P : TranslatedPtrs) {
-    const SCEV *PtrExpr = get<0>(P);
-    if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
+  for (auto &P : TranslatedPtrs) {
+    if (!hasComputableBounds(PSE, Ptr, P.getPointer(), TheLoop, Assume))
       return false;
 
     // When we run after a failing dependency check we have to make sure
@@ -1161,8 +1160,7 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
     // If there's only one option for Ptr, look it up after bounds and wrap
     // checking, because assumptions might have been added to PSE.
     if (TranslatedPtrs.size() == 1)
-      TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
-                           false};
+      P.setPointer(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr));
   }
 
   for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {

From bfdf30e9b3d0b49344a651a5c7cd87be31d255c4 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sat, 15 Feb 2025 17:04:32 +0000
Subject: [PATCH 025/109] [AArch64] Add patterns for addv(sext) and addv(zext)

This adds patterns for v8i8->i16 vaddlv and v4i16->i32 vaddlv, for both signed
and unsigned extends.
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td |  15 ++-
 llvm/test/CodeGen/AArch64/arm64-vabs.ll     |   4 +-
 llvm/test/CodeGen/AArch64/neon-dotreduce.ll |   3 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll  | 129 ++++++--------------
 4 files changed, 56 insertions(+), 95 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c45b311b6ebb2..c9549f12769d1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7357,6 +7357,19 @@ defm FMAXV   : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", AArch64fmaxv>;
 defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", AArch64fminnmv>;
 defm FMINV   : SIMDFPAcrossLanes<0b01111, 1, "fminv", AArch64fminv>;
 
+def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (sext (v8i8 V64:$op))))), (i64 0))),
+          (EXTRACT_SUBREG (v8i16 (SUBREG_TO_REG (i64 0), (SADDLVv8i8v V64:$op), hsub)), ssub)>;
+def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (zext (v8i8 V64:$op))))), (i64 0))),
+          (EXTRACT_SUBREG (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$op), hsub)), ssub)>;
+def : Pat<(v8i16 (AArch64uaddv (v8i16 (sext (v8i8 V64:$op))))),
+          (v8i16 (SUBREG_TO_REG (i64 0), (SADDLVv8i8v V64:$op), hsub))>;
+def : Pat<(v8i16 (AArch64uaddv (v8i16 (zext (v8i8 V64:$op))))),
+          (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$op), hsub))>;
+def : Pat<(v4i32 (AArch64uaddv (v4i32 (sext (v4i16 V64:$op))))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (SADDLVv4i16v V64:$op), ssub))>;
+def : Pat<(v4i32 (AArch64uaddv (v4i32 (zext (v4i16 V64:$op))))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv4i16v V64:$op), ssub))>;
+
 multiclass SIMDAcrossLaneLongPairIntrinsic<string Opc, SDPatternOperator addlp> {
   // Patterns for addv(addlp(x)) ==> addlv
   def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef,
@@ -7370,7 +7383,7 @@ multiclass SIMDAcrossLaneLongPairIntrinsic<string Opc, SDPatternOperator addlp>
   def : Pat<(v4i32 (AArch64uaddv (v4i32 (addlp (v8i16 V128:$op))))),
             (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (!cast<Instruction>(Opc#"v8i16v") V128:$op), ssub)>;
 
-  // Patterns for addp(addlp(x))) ==> addlv
+  // Patterns for addp(addlp(x)) ==> addlv
   def : Pat<(v2i32 (AArch64uaddv (v2i32 (addlp (v4i16 V64:$op))))),
             (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (!cast<Instruction>(Opc#"v4i16v") V64:$op), ssub)>;
   def : Pat<(v2i64 (AArch64uaddv (v2i64 (addlp (v4i32 V128:$op))))),
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index cc8568709ea21..fe4657186cd2a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -443,8 +443,8 @@ define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) {
 define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-SD-LABEL: uabdl4s_rdx_i32:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    uabdl.4s v0, v0, v1
-; CHECK-SD-NEXT:    addv.4s s0, v0
+; CHECK-SD-NEXT:    uabd.4h v0, v0, v1
+; CHECK-SD-NEXT:    uaddlv.4h s0, v0
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 8e12446164e89..6fb4e219d39f4 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -87,8 +87,7 @@ define i32 @test_udot_v4i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    ldr s0, [x0]
 ; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    uaddlv s0, v0.4h
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index c72d00e65fcab..fd24282366282 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -141,18 +141,11 @@ entry:
 }
 
 define i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
-; CHECK-SD-LABEL: add_v4i16_v4i32_zext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    addv s0, v0.4s
-; CHECK-SD-NEXT:    fmov w0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i16_v4i32_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    uaddlv s0, v0.4h
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v4i16_v4i32_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.4h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -160,18 +153,11 @@ entry:
 }
 
 define i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
-; CHECK-SD-LABEL: add_v4i16_v4i32_sext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    addv s0, v0.4s
-; CHECK-SD-NEXT:    fmov w0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i16_v4i32_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    saddlv s0, v0.4h
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v4i16_v4i32_sext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    saddlv s0, v0.4h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -483,8 +469,7 @@ define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
 ; CHECK-SD-LABEL: add_v4i8_v4i32_zext:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    uaddlv s0, v0.4h
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -589,8 +574,7 @@ entry:
 define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
 ; CHECK-SD-LABEL: add_v8i8_v8i16_sext:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    saddlv h0, v0.8b
 ; CHECK-SD-NEXT:    smov w0, v0.h[0]
 ; CHECK-SD-NEXT:    ret
 ;
@@ -939,20 +923,12 @@ entry:
 }
 
 define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
-; CHECK-SD-LABEL: add_v4i16_v4i32_acc_zext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    addv s0, v0.4s
-; CHECK-SD-NEXT:    fmov w8, s0
-; CHECK-SD-NEXT:    add w0, w8, w0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i16_v4i32_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    uaddlv s0, v0.4h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w0, w8, w0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    add w0, w8, w0
+; CHECK-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -961,20 +937,12 @@ entry:
 }
 
 define i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
-; CHECK-SD-LABEL: add_v4i16_v4i32_acc_sext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    addv s0, v0.4s
-; CHECK-SD-NEXT:    fmov w8, s0
-; CHECK-SD-NEXT:    add w0, w8, w0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i16_v4i32_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    saddlv s0, v0.4h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w0, w8, w0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    saddlv s0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    add w0, w8, w0
+; CHECK-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1324,8 +1292,7 @@ define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
 ; CHECK-SD-LABEL: add_v4i8_v4i32_acc_zext:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    uaddlv s0, v0.4h
 ; CHECK-SD-NEXT:    fmov w8, s0
 ; CHECK-SD-NEXT:    add w0, w8, w0
 ; CHECK-SD-NEXT:    ret
@@ -1402,22 +1369,13 @@ entry:
 }
 
 define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
-; CHECK-SD-LABEL: add_v8i8_v8i16_acc_zext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT:    addv h0, v0.8h
-; CHECK-SD-NEXT:    fmov w8, s0
-; CHECK-SD-NEXT:    add w8, w8, w0
-; CHECK-SD-NEXT:    and w0, w8, #0xffff
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i16_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    uaddlv h0, v0.8b
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w8, w0
-; CHECK-GI-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    add w8, w8, w0
+; CHECK-NEXT:    and w0, w8, #0xffff
+; CHECK-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -1426,22 +1384,13 @@ entry:
 }
 
 define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
-; CHECK-SD-LABEL: add_v8i8_v8i16_acc_sext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT:    addv h0, v0.8h
-; CHECK-SD-NEXT:    fmov w8, s0
-; CHECK-SD-NEXT:    add w8, w8, w0
-; CHECK-SD-NEXT:    sxth w0, w8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i16_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    saddlv h0, v0.8b
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w8, w0
-; CHECK-GI-NEXT:    sxth w0, w8
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    saddlv h0, v0.8b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    add w8, w8, w0
+; CHECK-NEXT:    sxth w0, w8
+; CHECK-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)

From b4030040359656ed20cb29de7b3912b6b249e98e Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Sat, 15 Feb 2025 17:25:03 +0000
Subject: [PATCH 026/109] ConstRange: factor and introduce splitPosNeg (NFC)
 (#126528)

Factor out some code that splits a ConstantRange into positive and
negative components, introducing ConstantRange::splitPosNeg.
---
 llvm/include/llvm/IR/ConstantRange.h    |  4 ++++
 llvm/lib/IR/ConstantRange.cpp           | 27 +++++++++++++++----------
 llvm/unittests/IR/ConstantRangeTest.cpp | 10 +++++++++
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/llvm/include/llvm/IR/ConstantRange.h b/llvm/include/llvm/IR/ConstantRange.h
index d086c25390fd2..3561513212ce2 100644
--- a/llvm/include/llvm/IR/ConstantRange.h
+++ b/llvm/include/llvm/IR/ConstantRange.h
@@ -92,6 +92,10 @@ class [[nodiscard]] ConstantRange {
   /// unsigned domain.
   static ConstantRange fromKnownBits(const KnownBits &Known, bool IsSigned);
 
+  /// Split the ConstantRange into positive and negative components, ignoring
+  /// zero values.
+  std::pair<ConstantRange, ConstantRange> splitPosNeg() const;
+
   /// Produce the smallest range such that all values that may satisfy the given
   /// predicate with any value contained within Other is contained in the
   /// returned range.  Formally, this returns a superset of
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index 3566435398992..41e40cdf365d2 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -95,6 +95,17 @@ KnownBits ConstantRange::toKnownBits() const {
   return Known;
 }
 
+std::pair<ConstantRange, ConstantRange> ConstantRange::splitPosNeg() const {
+  uint32_t BW = getBitWidth();
+  APInt Zero = APInt::getZero(BW), One = APInt(BW, 1);
+  APInt SignedMin = APInt::getSignedMinValue(BW);
+  // There are no positive 1-bit values. The 1 would get interpreted as -1.
+  ConstantRange PosFilter =
+      BW == 1 ? getEmpty() : ConstantRange(One, SignedMin);
+  ConstantRange NegFilter(SignedMin, Zero);
+  return {intersectWith(PosFilter), intersectWith(NegFilter)};
+}
+
 ConstantRange ConstantRange::makeAllowedICmpRegion(CmpInst::Predicate Pred,
                                                    const ConstantRange &CR) {
   if (CR.isEmptySet())
@@ -1356,20 +1367,14 @@ ConstantRange::udiv(const ConstantRange &RHS) const {
 }
 
 ConstantRange ConstantRange::sdiv(const ConstantRange &RHS) const {
+  APInt Zero = APInt::getZero(getBitWidth());
+  APInt SignedMin = APInt::getSignedMinValue(getBitWidth());
+
   // We split up the LHS and RHS into positive and negative components
   // and then also compute the positive and negative components of the result
   // separately by combining division results with the appropriate signs.
-  APInt Zero = APInt::getZero(getBitWidth());
-  APInt SignedMin = APInt::getSignedMinValue(getBitWidth());
-  // There are no positive 1-bit values. The 1 would get interpreted as -1.
-  ConstantRange PosFilter =
-      getBitWidth() == 1 ? getEmpty()
-                         : ConstantRange(APInt(getBitWidth(), 1), SignedMin);
-  ConstantRange NegFilter(SignedMin, Zero);
-  ConstantRange PosL = intersectWith(PosFilter);
-  ConstantRange NegL = intersectWith(NegFilter);
-  ConstantRange PosR = RHS.intersectWith(PosFilter);
-  ConstantRange NegR = RHS.intersectWith(NegFilter);
+  auto [PosL, NegL] = splitPosNeg();
+  auto [PosR, NegR] = RHS.splitPosNeg();
 
   ConstantRange PosRes = getEmpty();
   if (!PosL.isEmptySet() && !PosR.isEmptySet())
diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp
index c390ffea1c352..daa07bf7d840d 100644
--- a/llvm/unittests/IR/ConstantRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantRangeTest.cpp
@@ -2126,6 +2126,16 @@ TEST(ConstantRange, GetEquivalentICmp) {
   });
 }
 
+TEST(ConstantRange, SplitPosNeg) {
+  EnumerateInterestingConstantRanges([](const ConstantRange &CR) {
+    auto [Pos, Neg] = CR.splitPosNeg();
+    EXPECT_TRUE(Pos.isAllPositive());
+    EXPECT_TRUE(Neg.isAllNegative());
+    if (CR.getBitWidth() == 1)
+      EXPECT_TRUE(Pos.isEmptySet());
+  });
+}
+
 #define EXPECT_MAY_OVERFLOW(op) \
   EXPECT_EQ(ConstantRange::OverflowResult::MayOverflow, (op))
 #define EXPECT_ALWAYS_OVERFLOWS_LOW(op) \

From 948e97a40eba6c176183e8e7aefb994681b593ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= <mgorny@gentoo.org>
Date: Sat, 15 Feb 2025 19:36:20 +0100
Subject: [PATCH 027/109] [flang] Revert MLIR_MAIN_SRC_DIR override (#127337)

This change is no longer necessary after #125842. Thanks to @nikic for
letting me know.
---
 flang/CMakeLists.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index cca56bfdc88e6..c012b884ae3be 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -230,11 +230,6 @@ if (FLANG_STANDALONE_BUILD)
     add_custom_target(doxygen ALL)
   endif()
 
-  # Override the value from installed CMake files, as they refer
-  # to the directory used during the original MLIR package build,
-  # which may be no longer available.  Instead, use the current checkout.
-  set(MLIR_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../mlir )
-
 else()
   option(FLANG_INCLUDE_TESTS
          "Generate build targets for the Flang unit tests."

From e60de25c4e9a6d59b7fd868e803cfe3cd77d4078 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 15 Feb 2025 19:44:39 +0100
Subject: [PATCH 028/109] [LAA] Replace symbolic strides for translated
 pointers earlier (NFC).

Move up replaceSymbolicStrideSCEV before isNoWrap. It needs to be called
after hasComputableBounds, as this may create an AddRec via PSE, which
replaceSymbolicStrideSCEV will look up.

This is in preparation for simplifying isNoWrap.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index e5b87d2d16230..43380b59ac49f 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1147,6 +1147,11 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
     if (!hasComputableBounds(PSE, Ptr, P.getPointer(), TheLoop, Assume))
       return false;
 
+    // If there's only one option for Ptr, look it up after bounds and wrap
+    // checking, because assumptions might have been added to PSE.
+    if (TranslatedPtrs.size() == 1)
+      P.setPointer(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr));
+
     // When we run after a failing dependency check we have to make sure
     // we don't have wrapping pointers.
     if (ShouldCheckWrap) {
@@ -1157,10 +1162,6 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
       if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop, Assume))
         return false;
     }
-    // If there's only one option for Ptr, look it up after bounds and wrap
-    // checking, because assumptions might have been added to PSE.
-    if (TranslatedPtrs.size() == 1)
-      P.setPointer(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr));
   }
 
   for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {

From c17df0af23c941cd4fc97851ea51c91eee7c49e4 Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa <rniwa@webkit.org>
Date: Sat, 15 Feb 2025 11:04:06 -0800
Subject: [PATCH 029/109] [webkit.UncountedLambdaCapturesChecker] Fix a crash
 in declProtectsThis (#127309)

Add a missing nullptr check to declProtectsThis.
---
 .../WebKit/UncountedLambdaCapturesChecker.cpp |  7 +++-
 ...mbda-captures-decl-protects-this-crash.cpp | 38 +++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-decl-protects-this-crash.cpp

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp
index 4ffdac5ca4873..9527993d0edeb 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp
@@ -267,6 +267,8 @@ class UncountedLambdaCapturesChecker
             auto OpCode = OpCE->getOperator();
             if (OpCode == OO_Star || OpCode == OO_Amp) {
               auto *Callee = OpCE->getDirectCallee();
+              if (!Callee)
+                return false;
               auto clsName = safeGetName(Callee->getParent());
               if (!isRefType(clsName) || !OpCE->getNumArgs())
                 return false;
@@ -276,9 +278,10 @@ class UncountedLambdaCapturesChecker
           }
           if (auto *UO = dyn_cast<UnaryOperator>(Arg)) {
             auto OpCode = UO->getOpcode();
-            if (OpCode == UO_Deref || OpCode == UO_AddrOf)
+            if (OpCode == UO_Deref || OpCode == UO_AddrOf) {
               Arg = UO->getSubExpr()->IgnoreParenCasts();
-            continue;
+              continue;
+            }
           }
           break;
         } while (Arg);
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-decl-protects-this-crash.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-decl-protects-this-crash.cpp
new file mode 100644
index 0000000000000..840433db5133a
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-decl-protects-this-crash.cpp
@@ -0,0 +1,38 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=webkit.UncountedLambdaCapturesChecker -verify %s
+
+struct Foo {
+  int x;
+  int y;
+  Foo(int x, int y) : x(x) , y(y) { }
+};
+
+template <typename T>
+struct Baz {
+  void ref() const;
+  void deref() const;
+  Foo operator*();
+  bool operator!();
+};
+
+inline Foo operator*(const Foo& a, const Foo& b);
+
+Baz<Foo> someFunction();
+template <typename CallbackType> void bar(CallbackType callback) {
+  auto baz = someFunction();
+  callback(baz);
+}
+
+struct Obj {
+  void ref() const;
+  void deref() const;
+
+  void foo(Foo foo) {
+    bar([this](auto baz) {
+      // expected-warning@-1{{Captured raw-pointer 'this' to ref-counted type or CheckedPtr-capable type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
+      bar([this, foo = *baz, foo2 = !baz](auto&&) {
+        // expected-warning@-1{{Captured raw-pointer 'this' to ref-counted type or CheckedPtr-capable type is unsafe [webkit.UncountedLambdaCapturesChecker]}}
+        someFunction();
+      });
+    });
+  }
+};

From 2472d38338aed9a9cca41a0ca0921b39765256c1 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Sat, 15 Feb 2025 20:11:48 +0100
Subject: [PATCH 030/109] [libc++] Move unused basic_string function definition
 to the dylib sources (#126219)

`__init(const value_type*, size_type, size_type)` is part of our ABI,
but we don't actually use the function anymore in the dylib. This moves
the definition to the `src/` directory to make it clear that the code is
unused. This also allows us to remove it entirely in the unstable ABI.
---
 .../include/__string/extern_template_lists.h  |  2 -
 libcxx/include/string                         | 31 +++------------
 libcxx/src/string.cpp                         | 38 +++++++++++++++++++
 3 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/libcxx/include/__string/extern_template_lists.h b/libcxx/include/__string/extern_template_lists.h
index cc536e514d4ff..dc66fa512b8bd 100644
--- a/libcxx/include/__string/extern_template_lists.h
+++ b/libcxx/include/__string/extern_template_lists.h
@@ -32,7 +32,6 @@
 #define _LIBCPP_STRING_V1_EXTERN_TEMPLATE_LIST(_Func, _CharType) \
   _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, value_type const*, size_type)) \
   _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>::size_type basic_string<_CharType>::rfind(value_type const*, size_type, size_type) const) \
-  _Func(_LIBCPP_EXPORTED_FROM_ABI void basic_string<_CharType>::__init(value_type const*, size_type, size_type)) \
   _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>::basic_string(basic_string const&)) \
   _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, value_type const*)) \
   _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>::basic_string(basic_string const&, allocator<_CharType> const&)) \
@@ -82,7 +81,6 @@
 #define _LIBCPP_STRING_UNSTABLE_EXTERN_TEMPLATE_LIST(_Func, _CharType) \
   _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, value_type const*, size_type)) \
   _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>::size_type basic_string<_CharType>::rfind(value_type const*, size_type, size_type) const) \
-  _Func(_LIBCPP_EXPORTED_FROM_ABI void basic_string<_CharType>::__init(value_type const*, size_type, size_type)) \
   _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, value_type const*)) \
   _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>::size_type basic_string<_CharType>::find_last_not_of(value_type const*, size_type, size_type) const) \
   _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>::~basic_string()) \
diff --git a/libcxx/include/string b/libcxx/include/string
index b280f5f458459..396e73522d3e7 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -2254,7 +2254,6 @@ private:
     return __guess;
   }
 
-  inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(const value_type* __s, size_type __sz, size_type __reserve);
   inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(const value_type* __s, size_type __sz);
   inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(size_type __n, value_type __c);
 
@@ -2439,6 +2438,12 @@ private:
   template <class _CharT2, class _Traits2, class _Allocator2>
   friend inline _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool
   operator==(const basic_string<_CharT2, _Traits2, _Allocator2>&, const _CharT2*) _NOEXCEPT;
+
+  // These functions aren't used anymore but are part of out ABI, so we need to provide them in the dylib for backwards
+  // compatibility
+#  ifdef _LIBCPP_BUILDING_LIBRARY
+  void __init(const value_type* __s, size_type __sz, size_type __reserve);
+#  endif
 };
 
 // These declarations must appear before any functions are implicitly used
@@ -2490,30 +2495,6 @@ basic_string(from_range_t, _Range&&, _Allocator = _Allocator())
     -> basic_string<ranges::range_value_t<_Range>, char_traits<ranges::range_value_t<_Range>>, _Allocator>;
 #  endif
 
-template <class _CharT, class _Traits, class _Allocator>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 void
-basic_string<_CharT, _Traits, _Allocator>::__init(const value_type* __s, size_type __sz, size_type __reserve) {
-  if (__libcpp_is_constant_evaluated())
-    __rep_ = __rep();
-  if (__reserve > max_size())
-    __throw_length_error();
-  pointer __p;
-  if (__fits_in_sso(__reserve)) {
-    __set_short_size(__sz);
-    __p = __get_short_pointer();
-  } else {
-    auto __allocation = std::__allocate_at_least(__alloc_, __recommend(__reserve) + 1);
-    __p               = __allocation.ptr;
-    __begin_lifetime(__p, __allocation.count);
-    __set_long_pointer(__p);
-    __set_long_cap(__allocation.count);
-    __set_long_size(__sz);
-  }
-  traits_type::copy(std::__to_address(__p), __s, __sz);
-  traits_type::assign(__p[__sz], value_type());
-  __annotate_new(__sz);
-}
-
 template <class _CharT, class _Traits, class _Allocator>
 _LIBCPP_CONSTEXPR_SINCE_CXX20 void
 basic_string<_CharT, _Traits, _Allocator>::__init(const value_type* __s, size_type __sz) {
diff --git a/libcxx/src/string.cpp b/libcxx/src/string.cpp
index dc16ce781f76b..e335639883dba 100644
--- a/libcxx/src/string.cpp
+++ b/libcxx/src/string.cpp
@@ -37,6 +37,44 @@ void __basic_string_common<true>::__throw_out_of_range() const { std::__throw_ou
 
 #endif // _LIBCPP_ABI_DO_NOT_EXPORT_BASIC_STRING_COMMON
 
+// Define legacy ABI functions
+// ---------------------------
+
+#ifndef _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION
+
+template <class _CharT, class _Traits, class _Allocator>
+void basic_string<_CharT, _Traits, _Allocator>::__init(const value_type* __s, size_type __sz, size_type __reserve) {
+  if (__libcpp_is_constant_evaluated())
+    __rep_ = __rep();
+  if (__reserve > max_size())
+    __throw_length_error();
+  pointer __p;
+  if (__fits_in_sso(__reserve)) {
+    __set_short_size(__sz);
+    __p = __get_short_pointer();
+  } else {
+    auto __allocation = std::__allocate_at_least(__alloc_, __recommend(__reserve) + 1);
+    __p               = __allocation.ptr;
+    __begin_lifetime(__p, __allocation.count);
+    __set_long_pointer(__p);
+    __set_long_cap(__allocation.count);
+    __set_long_size(__sz);
+  }
+  traits_type::copy(std::__to_address(__p), __s, __sz);
+  traits_type::assign(__p[__sz], value_type());
+  __annotate_new(__sz);
+}
+
+#  define STRING_LEGACY_API(CharT)                                                                                     \
+    template _LIBCPP_EXPORTED_FROM_ABI void basic_string<CharT>::__init(const value_type*, size_type, size_type)
+
+STRING_LEGACY_API(char);
+#  if _LIBCPP_HAS_WIDE_CHARACTERS
+STRING_LEGACY_API(wchar_t);
+#  endif
+
+#endif // _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION
+
 #define _LIBCPP_EXTERN_TEMPLATE_DEFINE(...) template __VA_ARGS__;
 #ifdef _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION
 _LIBCPP_STRING_UNSTABLE_EXTERN_TEMPLATE_LIST(_LIBCPP_EXTERN_TEMPLATE_DEFINE, char)

From 248716f814d1d1fef88911d01a0b551d53c87c7a Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq@xs4all.nl>
Date: Sat, 15 Feb 2025 20:15:32 +0100
Subject: [PATCH 031/109] [libc++] Fixes (|multi)_set spaceship operator.
 (#127326)

The operators did not have a _Compare template argument. The fix
updates the generic container test to use allocators for all types used.
No other issues were found.

Fixes: #127095
---
 libcxx/include/set                            |   8 +-
 .../test/support/test_container_comparisons.h | 266 +++++++++++-------
 2 files changed, 168 insertions(+), 106 deletions(-)

diff --git a/libcxx/include/set b/libcxx/include/set
index 2784e82760d7e..3c6ea360bd06c 100644
--- a/libcxx/include/set
+++ b/libcxx/include/set
@@ -1003,9 +1003,9 @@ operator<=(const set<_Key, _Compare, _Allocator>& __x, const set<_Key, _Compare,
 
 #  else // _LIBCPP_STD_VER <= 17
 
-template <class _Key, class _Allocator>
+template <class _Key, class _Compare, class _Allocator>
 _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Key>
-operator<=>(const set<_Key, _Allocator>& __x, const set<_Key, _Allocator>& __y) {
+operator<=>(const set<_Key, _Compare, _Allocator>& __x, const set<_Key, _Compare, _Allocator>& __y) {
   return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way);
 }
 
@@ -1470,9 +1470,9 @@ operator<=(const multiset<_Key, _Compare, _Allocator>& __x, const multiset<_Key,
 
 #  else // _LIBCPP_STD_VER <= 17
 
-template <class _Key, class _Allocator>
+template <class _Key, class _Compare, class _Allocator>
 _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Key>
-operator<=>(const multiset<_Key, _Allocator>& __x, const multiset<_Key, _Allocator>& __y) {
+operator<=>(const multiset<_Key, _Compare, _Allocator>& __x, const multiset<_Key, _Compare, _Allocator>& __y) {
   return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), __synth_three_way);
 }
 
diff --git a/libcxx/test/support/test_container_comparisons.h b/libcxx/test/support/test_container_comparisons.h
index 543c5899922d0..f7bf78e48a1f8 100644
--- a/libcxx/test/support/test_container_comparisons.h
+++ b/libcxx/test/support/test_container_comparisons.h
@@ -13,51 +13,52 @@
 #include <functional>
 #include <set>
 
+#include "test_allocator.h"
 #include "test_comparisons.h"
 
 // Implementation detail of `test_sequence_container_spaceship`
-template <template <typename...> typename Container, typename Elem, typename Order>
+template <template <typename...> typename Container, typename Elem, typename Allocator, typename Order>
 constexpr void test_sequence_container_spaceship_with_type() {
   // Empty containers
   {
-    Container<Elem> l1;
-    Container<Elem> l2;
+    Container<Elem, Allocator> l1;
+    Container<Elem, Allocator> l2;
     assert(testOrder(l1, l2, Order::equivalent));
   }
   // Identical contents
   {
-    Container<Elem> l1{1, 1};
-    Container<Elem> l2{1, 1};
+    Container<Elem, Allocator> l1{1, 1};
+    Container<Elem, Allocator> l2{1, 1};
     assert(testOrder(l1, l2, Order::equivalent));
   }
   // Less, due to contained values
   {
-    Container<Elem> l1{1, 1};
-    Container<Elem> l2{1, 2};
+    Container<Elem, Allocator> l1{1, 1};
+    Container<Elem, Allocator> l2{1, 2};
     assert(testOrder(l1, l2, Order::less));
   }
   // Greater, due to contained values
   {
-    Container<Elem> l1{1, 3};
-    Container<Elem> l2{1, 2};
+    Container<Elem, Allocator> l1{1, 3};
+    Container<Elem, Allocator> l2{1, 2};
     assert(testOrder(l1, l2, Order::greater));
   }
   // Shorter list
   {
-    Container<Elem> l1{1};
-    Container<Elem> l2{1, 2};
+    Container<Elem, Allocator> l1{1};
+    Container<Elem, Allocator> l2{1, 2};
     assert(testOrder(l1, l2, Order::less));
   }
   // Longer list
   {
-    Container<Elem> l1{1, 2};
-    Container<Elem> l2{1};
+    Container<Elem, Allocator> l1{1, 2};
+    Container<Elem, Allocator> l2{1};
     assert(testOrder(l1, l2, Order::greater));
   }
   // Unordered
   if constexpr (std::is_same_v<Elem, PartialOrder>) {
-    Container<Elem> l1{1, std::numeric_limits<int>::min()};
-    Container<Elem> l2{1, 2};
+    Container<Elem, Allocator> l1{1, std::numeric_limits<int>::min()};
+    Container<Elem, Allocator> l2{1, 2};
     assert(testOrder(l1, l2, Order::unordered));
   }
 }
@@ -69,13 +70,22 @@ constexpr bool test_sequence_container_spaceship() {
   static_assert(std::three_way_comparable<Container<int>>);
 
   // Test different comparison categories
-  test_sequence_container_spaceship_with_type<Container, int, std::strong_ordering>();
-  test_sequence_container_spaceship_with_type<Container, StrongOrder, std::strong_ordering>();
-  test_sequence_container_spaceship_with_type<Container, WeakOrder, std::weak_ordering>();
-  test_sequence_container_spaceship_with_type<Container, PartialOrder, std::partial_ordering>();
+  test_sequence_container_spaceship_with_type<Container, int, std::allocator<int>, std::strong_ordering>();
+  test_sequence_container_spaceship_with_type<Container,
+                                              StrongOrder,
+                                              test_allocator<StrongOrder>,
+                                              std::strong_ordering>();
+  test_sequence_container_spaceship_with_type<Container, WeakOrder, std::allocator<WeakOrder>, std::weak_ordering>();
+  test_sequence_container_spaceship_with_type<Container,
+                                              PartialOrder,
+                                              test_allocator<PartialOrder>,
+                                              std::partial_ordering>();
 
   // `LessAndEqComp` does not have `operator<=>`. Ordering is synthesized based on `operator<`
-  test_sequence_container_spaceship_with_type<Container, LessAndEqComp, std::weak_ordering>();
+  test_sequence_container_spaceship_with_type<Container,
+                                              LessAndEqComp,
+                                              std::allocator<LessAndEqComp>,
+                                              std::weak_ordering>();
 
   // Thanks to SFINAE, the following is not a compiler error but returns `false`
   struct NonComparable {};
@@ -175,109 +185,114 @@ constexpr bool test_sequence_container_adaptor_spaceship() {
 }
 
 // Implementation detail of `test_ordered_map_container_spaceship`
-template <template <typename...> typename Container, typename Key, typename Val, typename Order, typename Compare>
+template <template <typename...> typename Container,
+          typename Key,
+          typename Val,
+          typename Allocator,
+          typename Order,
+          typename Compare>
 constexpr void test_ordered_map_container_spaceship_with_type(Compare comp) {
   // Empty containers
   {
-    Container<Key, Val, Compare> l1{{}, comp};
-    Container<Key, Val, Compare> l2{{}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{}, comp};
     assert(testOrder(l1, l2, Order::equivalent));
   }
   // Identical contents
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 1}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 1}}, comp};
     assert(testOrder(l1, l2, Order::equivalent));
   }
   // Less, due to contained values
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 1}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::less));
   }
   // Greater, due to contained values
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 3}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 3}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::greater));
   }
   // Shorter list
   {
-    Container<Key, Val, Compare> l1{{{1, 1}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::less));
   }
   // Longer list
   {
-    Container<Key, Val, Compare> l1{{{1, 2}, {2, 2}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 2}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}}, comp};
     assert(testOrder(l1, l2, Order::greater));
   }
   // Unordered
   if constexpr (std::is_same_v<Val, PartialOrder>) {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, std::numeric_limits<int>::min()}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, std::numeric_limits<int>::min()}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::unordered));
   }
 
   // Identical contents
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 1}, {2, 2}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 1}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::equivalent));
 
-    Container<Key, Val, Compare> l3{{{1, 1}, {2, 1}, {2, 2}}, comp};
-    Container<Key, Val, Compare> l4{{{2, 1}, {2, 2}, {1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l3{{{1, 1}, {2, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l4{{{2, 1}, {2, 2}, {1, 1}}, comp};
     assert(testOrder(l3, l4, Order::equivalent));
   }
   // Less, due to contained values
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 1}, {2, 1}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 1}, {2, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::less));
 
-    Container<Key, Val, Compare> l3{{{1, 1}, {2, 1}, {2, 1}}, comp};
-    Container<Key, Val, Compare> l4{{{2, 2}, {2, 2}, {1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l3{{{1, 1}, {2, 1}, {2, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l4{{{2, 2}, {2, 2}, {1, 1}}, comp};
     assert(testOrder(l3, l4, Order::less));
   }
   // Greater, due to contained values
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 3}, {2, 3}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 3}, {2, 3}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::greater));
 
-    Container<Key, Val, Compare> l3{{{1, 1}, {2, 3}, {2, 3}}, comp};
-    Container<Key, Val, Compare> l4{{{2, 2}, {2, 2}, {1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l3{{{1, 1}, {2, 3}, {2, 3}}, comp};
+    Container<Key, Val, Compare, Allocator> l4{{{2, 2}, {2, 2}, {1, 1}}, comp};
     assert(testOrder(l3, l4, Order::greater));
   }
   // Shorter list
   {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, 2}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}, {2, 2}, {3, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}, {2, 2}, {3, 1}}, comp};
     assert(testOrder(l1, l2, Order::less));
 
-    Container<Key, Val, Compare> l3{{{1, 1}, {2, 2}}, comp};
-    Container<Key, Val, Compare> l4{{{3, 1}, {2, 2}, {2, 2}, {1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l3{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l4{{{3, 1}, {2, 2}, {2, 2}, {1, 1}}, comp};
     assert(testOrder(l3, l4, Order::less));
   }
   // Longer list
   {
-    Container<Key, Val, Compare> l1{{{1, 2}, {2, 2}, {2, 2}, {3, 1}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 2}, {2, 2}, {2, 2}, {3, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}}, comp};
     assert(testOrder(l1, l2, Order::greater));
 
-    Container<Key, Val, Compare> l3{{{1, 2}, {2, 2}, {2, 2}, {3, 1}}, comp};
-    Container<Key, Val, Compare> l4{{{2, 2}, {1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l3{{{1, 2}, {2, 2}, {2, 2}, {3, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l4{{{2, 2}, {1, 1}}, comp};
     assert(testOrder(l3, l4, Order::greater));
   }
   // Unordered
   if constexpr (std::is_same_v<Val, PartialOrder>) {
-    Container<Key, Val, Compare> l1{{{1, 1}, {2, std::numeric_limits<int>::min()}, {2, 3}}, comp};
-    Container<Key, Val, Compare> l2{{{1, 1}, {2, 2}, {2, 3}}, comp};
+    Container<Key, Val, Compare, Allocator> l1{{{1, 1}, {2, std::numeric_limits<int>::min()}, {2, 3}}, comp};
+    Container<Key, Val, Compare, Allocator> l2{{{1, 1}, {2, 2}, {2, 3}}, comp};
     assert(testOrder(l1, l2, Order::unordered));
 
-    Container<Key, Val, Compare> l3{{{1, 1}, {2, std::numeric_limits<int>::min()}, {2, 3}}, comp};
-    Container<Key, Val, Compare> l4{{{2, 3}, {2, 2}, {1, 1}}, comp};
+    Container<Key, Val, Compare, Allocator> l3{{{1, 1}, {2, std::numeric_limits<int>::min()}, {2, 3}}, comp};
+    Container<Key, Val, Compare, Allocator> l4{{{2, 3}, {2, 2}, {1, 1}}, comp};
     assert(testOrder(l3, l4, Order::unordered));
   }
 }
@@ -293,94 +308,134 @@ constexpr bool test_ordered_map_container_spaceship() {
   static_assert(std::three_way_comparable<Container<int, int>>);
 
   // Test different comparison categories
-  test_ordered_map_container_spaceship_with_type<Container, int, int, std::strong_ordering>(std::less{});
-  test_ordered_map_container_spaceship_with_type<Container, int, int, std::strong_ordering>(std::greater{});
-  test_ordered_map_container_spaceship_with_type<Container, int, StrongOrder, std::strong_ordering>(std::less{});
-  test_ordered_map_container_spaceship_with_type<Container, int, StrongOrder, std::strong_ordering>(std::greater{});
-  test_ordered_map_container_spaceship_with_type<Container, int, WeakOrder, std::weak_ordering>(std::less{});
-  test_ordered_map_container_spaceship_with_type<Container, int, WeakOrder, std::weak_ordering>(std::greater{});
-  test_ordered_map_container_spaceship_with_type<Container, int, PartialOrder, std::partial_ordering>(std ::less{});
-  test_ordered_map_container_spaceship_with_type<Container, int, PartialOrder, std::partial_ordering>(std ::greater{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 int,
+                                                 std::allocator<std::pair<const int, int>>,
+                                                 std::strong_ordering>(std::less{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 int,
+                                                 test_allocator<std::pair<const int, int>>,
+                                                 std::strong_ordering>(std::greater{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 StrongOrder,
+                                                 std::allocator<std::pair<const int, StrongOrder>>,
+                                                 std::strong_ordering>(std::less{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 StrongOrder,
+                                                 test_allocator<std::pair<const int, StrongOrder>>,
+                                                 std::strong_ordering>(std::greater{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 WeakOrder,
+                                                 std::allocator<std::pair<const int, WeakOrder>>,
+                                                 std::weak_ordering>(std::less{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 WeakOrder,
+                                                 test_allocator<std::pair<const int, WeakOrder>>,
+                                                 std::weak_ordering>(std::greater{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 PartialOrder,
+                                                 std::allocator<std::pair<const int, PartialOrder>>,
+                                                 std::partial_ordering>(std ::less{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 PartialOrder,
+                                                 test_allocator<std::pair<const int, PartialOrder>>,
+                                                 std::partial_ordering>(std ::greater{});
 
   // `LessAndEqComp` does not have `operator<=>`. Ordering is synthesized based on `operator<`
-  test_ordered_map_container_spaceship_with_type<Container, int, LessAndEqComp, std::weak_ordering>(std::less{});
+  test_ordered_map_container_spaceship_with_type<Container,
+                                                 int,
+                                                 LessAndEqComp,
+                                                 std::allocator<std::pair<const int, LessAndEqComp>>,
+                                                 std::weak_ordering>(std::less{});
 
   return true;
 }
 
 // Implementation detail of `test_ordered_set_container_spaceship`
-template <template <typename...> typename Container, typename Elem, typename Order, typename Compare>
+template <template <typename...> typename Container,
+          typename Elem,
+          typename Allocator,
+          typename Order,
+          typename Compare>
 constexpr void test_ordered_set_spaceship_with_type(Compare comp) {
   // Empty containers
   {
-    Container<Elem, Compare> l1{{}, comp};
-    Container<Elem, Compare> l2{{}, comp};
+    Container<Elem, Compare, Allocator> l1{{}, comp};
+    Container<Elem, Compare, Allocator> l2{{}, comp};
     assert(testOrder(l1, l2, Order::equivalent));
   }
   // Identical contents
   {
-    Container<Elem, Compare> l1{{1, 1, 2}, comp};
-    Container<Elem, Compare> l2{{1, 1, 2}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, 1, 2}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 1, 2}, comp};
     assert(testOrder(l1, l2, Order::equivalent));
   }
   // Less, due to contained values
   {
-    Container<Elem, Compare> l1{{1, 1, 2, 3}, comp};
-    Container<Elem, Compare> l2{{1, 2, 2, 4}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, 1, 2, 3}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 2, 2, 4}, comp};
     assert(testOrder(l1, l2, Order::less));
   }
   // Greater, due to contained values
   {
-    Container<Elem, Compare> l1{{1, 2, 2, 4}, comp};
-    Container<Elem, Compare> l2{{1, 1, 2, 3}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, 2, 2, 4}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 1, 2, 3}, comp};
     assert(testOrder(l1, l2, Order::greater));
   }
   // Shorter list
   {
-    Container<Elem, Compare> l1{{1, 1, 2, 2}, comp};
-    Container<Elem, Compare> l2{{1, 1, 2, 2, 3}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, 1, 2, 2}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 1, 2, 2, 3}, comp};
     assert(testOrder(l1, l2, Order::less));
   }
   // Longer list
   {
-    Container<Elem, Compare> l1{{1, 1, 2, 2, 3}, comp};
-    Container<Elem, Compare> l2{{1, 1, 2, 2}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, 1, 2, 2, 3}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 1, 2, 2}, comp};
     assert(testOrder(l1, l2, Order::greater));
   }
   // Unordered
   if constexpr (std::is_same_v< Container<Elem>, std::multiset<PartialOrder>>) {
     if constexpr (std::is_same_v<Elem, PartialOrder> && std::is_same_v<Compare, decltype(std::less{})>) {
-      Container<Elem, Compare> l1{{1, std::numeric_limits<int>::min()}, comp};
-      Container<Elem, Compare> l2{{1, 2}, comp};
+      Container<Elem, Compare, Allocator> l1{{1, std::numeric_limits<int>::min()}, comp};
+      Container<Elem, Compare, Allocator> l2{{1, 2}, comp};
       assert(testOrder(l1, l2, Order::unordered));
     }
     if constexpr (std::is_same_v<Elem, PartialOrder> && std::is_same_v<Compare, decltype(std::less{})>) {
-      Container<Elem, Compare> l1{{1, std::numeric_limits<int>::max()}, comp};
-      Container<Elem, Compare> l2{{1, 2}, comp};
+      Container<Elem, Compare, Allocator> l1{{1, std::numeric_limits<int>::max()}, comp};
+      Container<Elem, Compare, Allocator> l2{{1, 2}, comp};
       assert(testOrder(l1, l2, Order::unordered));
     }
   }
   if constexpr (std::is_same_v< Container<Elem>, std::set<PartialOrder>>) {
     // Unordered values are not supported for `set`
     if constexpr (std::is_same_v<Elem, PartialOrder> && std::is_same_v<Compare, decltype(std::less{})>) {
-      Container<Elem, Compare> l1{{1, std::numeric_limits<int>::min()}, comp};
-      Container<Elem, Compare> l2{{1, 2}, comp};
+      Container<Elem, Compare, Allocator> l1{{1, std::numeric_limits<int>::min()}, comp};
+      Container<Elem, Compare, Allocator> l2{{1, 2}, comp};
       assert(testOrder(l1, l2, Order::less));
     }
     if constexpr (std::is_same_v<Elem, PartialOrder> && std::is_same_v<Compare, decltype(std::less{})>) {
-      Container<Elem, Compare> l1{{1, std::numeric_limits<int>::max()}, comp};
-      Container<Elem, Compare> l2{{1, 2}, comp};
+      Container<Elem, Compare, Allocator> l1{{1, std::numeric_limits<int>::max()}, comp};
+      Container<Elem, Compare, Allocator> l2{{1, 2}, comp};
       assert(testOrder(l1, l2, Order::less));
     }
   }
   if constexpr (std::is_same_v<Elem, PartialOrder> && std::is_same_v<Compare, decltype(std::greater{})>) {
-    Container<Elem, Compare> l1{{1, std::numeric_limits<int>::min()}, comp};
-    Container<Elem, Compare> l2{{1, 2}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, std::numeric_limits<int>::min()}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 2}, comp};
     assert(testOrder(l1, l2, Order::less));
   }
   if constexpr (std::is_same_v<Elem, PartialOrder> && std::is_same_v<Compare, decltype(std::greater{})>) {
-    Container<Elem, Compare> l1{{1, std::numeric_limits<int>::max()}, comp};
-    Container<Elem, Compare> l2{{1, 2}, comp};
+    Container<Elem, Compare, Allocator> l1{{1, std::numeric_limits<int>::max()}, comp};
+    Container<Elem, Compare, Allocator> l2{{1, 2}, comp};
     assert(testOrder(l1, l2, Order::less));
   }
 }
@@ -396,17 +451,24 @@ constexpr bool test_ordered_set_container_spaceship() {
   static_assert(std::three_way_comparable<Container<int>>);
 
   // Test different comparison categories
-  test_ordered_set_spaceship_with_type<Container, int, std::strong_ordering>(std::less{});
-  test_ordered_set_spaceship_with_type<Container, int, std::strong_ordering>(std::greater{});
-  test_ordered_set_spaceship_with_type<Container, StrongOrder, std::strong_ordering>(std::less{});
-  test_ordered_set_spaceship_with_type<Container, StrongOrder, std::strong_ordering>(std::greater{});
-  test_ordered_set_spaceship_with_type<Container, WeakOrder, std::weak_ordering>(std::less{});
-  test_ordered_set_spaceship_with_type<Container, WeakOrder, std::weak_ordering>(std::greater{});
-  test_ordered_set_spaceship_with_type<Container, PartialOrder, std::partial_ordering>(std::less{});
-  test_ordered_set_spaceship_with_type<Container, PartialOrder, std::partial_ordering>(std::greater{});
+  test_ordered_set_spaceship_with_type<Container, int, std::allocator<int>, std::strong_ordering>(std::less{});
+  test_ordered_set_spaceship_with_type<Container, int, test_allocator<int>, std::strong_ordering>(std::greater{});
+  test_ordered_set_spaceship_with_type<Container, StrongOrder, std::allocator<StrongOrder>, std::strong_ordering>(
+      std::less{});
+  test_ordered_set_spaceship_with_type<Container, StrongOrder, test_allocator<StrongOrder>, std::strong_ordering>(
+      std::greater{});
+  test_ordered_set_spaceship_with_type<Container, WeakOrder, std::allocator<WeakOrder>, std::weak_ordering>(
+      std::less{});
+  test_ordered_set_spaceship_with_type<Container, WeakOrder, test_allocator<WeakOrder>, std::weak_ordering>(
+      std::greater{});
+  test_ordered_set_spaceship_with_type<Container, PartialOrder, std::allocator<PartialOrder>, std::partial_ordering>(
+      std::less{});
+  test_ordered_set_spaceship_with_type<Container, PartialOrder, test_allocator<PartialOrder>, std::partial_ordering>(
+      std::greater{});
 
   // `LessAndEqComp` does not have `operator<=>`. Ordering is synthesized based on `operator<`
-  test_ordered_set_spaceship_with_type<Container, LessAndEqComp, std::weak_ordering>(std::less{});
+  test_ordered_set_spaceship_with_type<Container, LessAndEqComp, std::allocator<LessAndEqComp>, std::weak_ordering>(
+      std::less{});
 
   return true;
 }

From 761d422441c0e6822abb233d339fab4f3cf7f60a Mon Sep 17 00:00:00 2001
From: Chris B <chris.bieneman@me.com>
Date: Sat, 15 Feb 2025 13:21:36 -0600
Subject: [PATCH 032/109] [HLSL] Implement HLSL initialization list support
 (#123141)

This PR implements HLSL's initialization list behavior as specified in
the draft language specification under

[*Decl.Init.Agg*](https://microsoft.github.io/hlsl-specs/specs/hlsl.html#Decl.Init.Agg).

This behavior is a bit unusual for C/C++ because intermediate braces in
initializer lists are ignored and a whole array of additional
conversions occur unintuitively to how initialization works in C.

The implementation in this PR generates a valid C/C++ initialization
list AST for the HLSL initializer so that there are no changes required
to Clang's CodeGen to support this. This design will also allow us to
use Clang's rewrite to convert HLSL initializers to valid C/C++
initializers that are equivalent. It does have the downside that it will
generate often redundant accesses during codegen. The IR optimizer is
extremely good at eliminating those so this will have no impact on the
final executable performance.

There is some opportunity for optimizing the initializer list generation
that we could consider in subsequent commits. One notable opportunity
would be to identify aggregate objects that occur in the same place in
both initializers and do not require conversion; those aggregates could
be initialized as aggregates rather than fully scalarized.

Closes #56067

---------

Co-authored-by: Finn Plummer <50529406+inbelic@users.noreply.github.com>
Co-authored-by: Helena Kotas <hekotas@microsoft.com>
Co-authored-by: Justin Bogner <mail@justinbogner.com>
---
 .../clang/Basic/DiagnosticSemaKinds.td        |   3 +
 clang/include/clang/Sema/SemaHLSL.h           |   5 +
 clang/lib/AST/DeclCXX.cpp                     |  12 +
 clang/lib/CodeGen/CGExpr.cpp                  |   6 +
 clang/lib/CodeGen/CGExprAgg.cpp               |  13 +
 clang/lib/CodeGen/CGHLSLRuntime.cpp           |  32 +
 clang/lib/CodeGen/CGHLSLRuntime.h             |   4 +
 clang/lib/CodeGen/CodeGenFunction.h           |   4 +
 clang/lib/Sema/SemaChecking.cpp               |   8 +-
 clang/lib/Sema/SemaHLSL.cpp                   | 209 +++-
 clang/lib/Sema/SemaInit.cpp                   |   5 +
 clang/test/CodeGenHLSL/ArrayTemporary.hlsl    |   3 +-
 .../CodeGenHLSL/BasicFeatures/InitLists.hlsl  | 963 +++++++++++++++++
 clang/test/SemaHLSL/ArrayTemporary.hlsl       |   2 +-
 .../Language/ElementwiseCast-errors.hlsl      |   4 +-
 clang/test/SemaHLSL/Language/InitListAST.hlsl | 983 ++++++++++++++++++
 clang/test/SemaHLSL/Language/InitLists.hlsl   | 126 +++
 17 files changed, 2369 insertions(+), 13 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl
 create mode 100644 clang/test/SemaHLSL/Language/InitListAST.hlsl
 create mode 100644 clang/test/SemaHLSL/Language/InitLists.hlsl

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index c4f0fc55b4a38..f10af8f5bd6b2 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12626,6 +12626,9 @@ def err_hlsl_pointers_unsupported : Error<
   "%select{pointers|references}0 are unsupported in HLSL">;
 def err_hlsl_missing_resource_class : Error<"HLSL resource needs to have [[hlsl::resource_class()]] attribute">;
 def err_hlsl_attribute_needs_intangible_type: Error<"attribute %0 can be used only on HLSL intangible type %1">;
+def err_hlsl_incorrect_num_initializers: Error<
+  "too %select{few|many}0 initializers in list for type %1 "
+  "(expected %2 but found %3)">;
 
 def err_hlsl_operator_unsupported : Error<
   "the '%select{&|*|->}0' operator is unsupported in HLSL">;
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index c9266ea50e4bf..4f4bbe95476ee 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -26,6 +26,8 @@
 namespace clang {
 class AttributeCommonInfo;
 class IdentifierInfo;
+class InitializedEntity;
+class InitializationKind;
 class ParsedAttr;
 class Scope;
 class VarDecl;
@@ -149,6 +151,9 @@ class SemaHLSL : public SemaBase {
 
   QualType getInoutParameterType(QualType Ty);
 
+  bool TransformInitList(const InitializedEntity &Entity,
+                         const InitializationKind &Kind, InitListExpr *Init);
+
 private:
   // HLSL resource type attributes need to be processed all at once.
   // This is a list to collect them.
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index e394e0515e599..1aa48f0026335 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -1462,6 +1462,18 @@ void CXXRecordDecl::addedMember(Decl *D) {
     if (Using->getDeclName().getCXXOverloadedOperator() == OO_Equal)
       data().HasInheritedAssignment = true;
   }
+
+  // HLSL: All user-defined data types are aggregates and use aggregate
+  // initialization, meanwhile most, but not all built-in types behave like
+  // aggregates. Resource types, and some other HLSL types that wrap handles
+  // don't behave like aggregates. We can identify these as different because we
+  // implicitly define "special" member functions, which aren't spellable in
+  // HLSL. This all _needs_ to change in the future. There are two
+  // relevant HLSL feature proposals that will depend on this changing:
+  // * 0005-strict-initializer-lists.md
+  // * https://github.com/microsoft/hlsl-specs/pull/325
+  if (getLangOpts().HLSL)
+    data().Aggregate = data().UserDeclaredSpecialMembers == 0;
 }
 
 bool CXXRecordDecl::isLiteral() const {
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 0b0ffd2db853f..191912ca7d800 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -5551,6 +5551,12 @@ CodeGenFunction::getOrCreateOpaqueRValueMapping(const OpaqueValueExpr *e) {
   return EmitAnyExpr(e->getSourceExpr());
 }
 
+bool CodeGenFunction::isOpaqueValueEmitted(const OpaqueValueExpr *E) {
+  if (OpaqueValueMapping::shouldBindAsLValue(E))
+    return OpaqueLValues.contains(E);
+  return OpaqueRValues.contains(E);
+}
+
 RValue CodeGenFunction::EmitRValueForField(LValue LV,
                                            const FieldDecl *FD,
                                            SourceLocation Loc) {
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index d25d0f2c2133c..625ca363d9019 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGCXXABI.h"
+#include "CGHLSLRuntime.h"
 #include "CGObjCRuntime.h"
 #include "CGRecordLayout.h"
 #include "CodeGenFunction.h"
@@ -1776,6 +1777,18 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
   }
 #endif
 
+  // HLSL initialization lists in the AST are an expansion which can contain
+  // side-effecting expressions wrapped in opaque value expressions. To properly
+  // emit these we need to emit the opaque values before we emit the argument
+  // expressions themselves. This is a little hacky, but it prevents us needing
+  // to do a bigger AST-level change for a language feature that we need to
+  // deprecate in the near future. See related HLSL language proposals:
+  // * 0005-strict-initializer-lists.md
+  // * https://github.com/microsoft/hlsl-specs/pull/325
+  if (CGF.getLangOpts().HLSL && isa<InitListExpr>(ExprToVisit))
+    CGF.CGM.getHLSLRuntime().emitInitListOpaqueValues(
+        CGF, cast<InitListExpr>(ExprToVisit));
+
   AggValueSlot Dest = EnsureSlot(ExprToVisit->getType());
 
   LValue DestLV = CGF.MakeAddrLValue(Dest.getAddress(), ExprToVisit->getType());
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 03ddc87d8d3df..856d8b1b2948d 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -14,9 +14,11 @@
 
 #include "CGHLSLRuntime.h"
 #include "CGDebugInfo.h"
+#include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "TargetInfo.h"
 #include "clang/AST/Decl.h"
+#include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/TargetOptions.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/LLVMContext.h"
@@ -617,3 +619,33 @@ llvm::Instruction *CGHLSLRuntime::getConvergenceToken(BasicBlock &BB) {
   llvm_unreachable("Convergence token should have been emitted.");
   return nullptr;
 }
+
+class OpaqueValueVisitor : public RecursiveASTVisitor<OpaqueValueVisitor> {
+public:
+  llvm::SmallPtrSet<OpaqueValueExpr *, 8> OVEs;
+  OpaqueValueVisitor() {}
+
+  bool VisitOpaqueValueExpr(OpaqueValueExpr *E) {
+    OVEs.insert(E);
+    return true;
+  }
+};
+
+void CGHLSLRuntime::emitInitListOpaqueValues(CodeGenFunction &CGF,
+                                             InitListExpr *E) {
+
+  typedef CodeGenFunction::OpaqueValueMappingData OpaqueValueMappingData;
+  OpaqueValueVisitor Visitor;
+  Visitor.TraverseStmt(E);
+  for (auto *OVE : Visitor.OVEs) {
+    if (CGF.isOpaqueValueEmitted(OVE))
+      continue;
+    if (OpaqueValueMappingData::shouldBindAsLValue(OVE)) {
+      LValue LV = CGF.EmitLValue(OVE->getSourceExpr());
+      OpaqueValueMappingData::bind(CGF, OVE, LV);
+    } else {
+      RValue RV = CGF.EmitAnyExpr(OVE->getSourceExpr());
+      OpaqueValueMappingData::bind(CGF, OVE, RV);
+    }
+  }
+}
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index 032b2dee82f21..8767a2ddceb96 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -55,6 +55,7 @@ class StructType;
 namespace clang {
 class VarDecl;
 class ParmVarDecl;
+class InitListExpr;
 class HLSLBufferDecl;
 class HLSLResourceBindingAttr;
 class Type;
@@ -65,6 +66,7 @@ class FunctionDecl;
 namespace CodeGen {
 
 class CodeGenModule;
+class CodeGenFunction;
 
 class CGHLSLRuntime {
 public:
@@ -161,6 +163,8 @@ class CGHLSLRuntime {
 
   llvm::Instruction *getConvergenceToken(llvm::BasicBlock &BB);
 
+  void emitInitListOpaqueValues(CodeGenFunction &CGF, InitListExpr *E);
+
 private:
   void addBufferResourceAnnotation(llvm::GlobalVariable *GV,
                                    llvm::hlsl::ResourceClass RC,
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 64cd8a3ac55e2..8c5362bcc33c4 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3011,6 +3011,10 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// otherwise create one.
   RValue getOrCreateOpaqueRValueMapping(const OpaqueValueExpr *e);
 
+  /// isOpaqueValueEmitted - Return true if the opaque value expression has
+  /// already been emitted.
+  bool isOpaqueValueEmitted(const OpaqueValueExpr *E);
+
   /// Get the index of the current ArrayInitLoopExpr, if any.
   llvm::Value *getArrayInitIndex() { return ArrayInitIndex; }
 
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 66c233de4ef30..aae61f612a4bc 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -11679,8 +11679,12 @@ static void AnalyzeImplicitConversions(
   // Propagate whether we are in a C++ list initialization expression.
   // If so, we do not issue warnings for implicit int-float conversion
   // precision loss, because C++11 narrowing already handles it.
-  bool IsListInit = Item.IsListInit ||
-                    (isa<InitListExpr>(OrigE) && S.getLangOpts().CPlusPlus);
+  //
+  // HLSL's initialization lists are special, so they shouldn't observe the C++
+  // behavior here.
+  bool IsListInit =
+      Item.IsListInit || (isa<InitListExpr>(OrigE) &&
+                          S.getLangOpts().CPlusPlus && !S.getLangOpts().HLSL);
 
   if (E->isTypeDependent() || E->isValueDependent())
     return;
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 9a60054a6169e..be45761552290 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -2589,17 +2589,20 @@ static void BuildFlattenedTypeList(QualType BaseTy,
       continue;
     }
     if (const auto *RT = dyn_cast<RecordType>(T)) {
-      const RecordDecl *RD = RT->getDecl();
-      if (RD->isUnion()) {
+      const CXXRecordDecl *RD = RT->getAsCXXRecordDecl();
+      assert(RD && "HLSL record types should all be CXXRecordDecls!");
+
+      if (RD->isStandardLayout())
+        RD = RD->getStandardLayoutBaseWithFields();
+
+      // For types that we shouldn't decompose (unions and non-aggregates), just
+      // add the type itself to the list.
+      if (RD->isUnion() || !RD->isAggregate()) {
         List.push_back(T);
         continue;
       }
-      const CXXRecordDecl *CXXD = dyn_cast<CXXRecordDecl>(RD);
 
       llvm::SmallVector<QualType, 16> FieldTypes;
-      if (CXXD && CXXD->isStandardLayout())
-        RD = CXXD->getStandardLayoutBaseWithFields();
-
       for (const auto *FD : RD->fields())
         FieldTypes.push_back(FD->getType());
       // Reverse the newly added sub-range.
@@ -2608,9 +2611,9 @@ static void BuildFlattenedTypeList(QualType BaseTy,
 
       // If this wasn't a standard layout type we may also have some base
       // classes to deal with.
-      if (CXXD && !CXXD->isStandardLayout()) {
+      if (!RD->isStandardLayout()) {
         FieldTypes.clear();
-        for (const auto &Base : CXXD->bases())
+        for (const auto &Base : RD->bases())
           FieldTypes.push_back(Base.getType());
         std::reverse(FieldTypes.begin(), FieldTypes.end());
         WorkList.insert(WorkList.end(), FieldTypes.begin(), FieldTypes.end());
@@ -3056,3 +3059,193 @@ void SemaHLSL::processExplicitBindingsOnDecl(VarDecl *VD) {
     }
   }
 }
+
+static bool CastInitializer(Sema &S, ASTContext &Ctx, Expr *E,
+                            llvm::SmallVectorImpl<Expr *> &List,
+                            llvm::SmallVectorImpl<QualType> &DestTypes) {
+  if (List.size() >= DestTypes.size()) {
+    List.push_back(E);
+    // This is odd, but it isn't technically a failure due to conversion, we
+    // handle mismatched counts of arguments differently.
+    return true;
+  }
+  InitializedEntity Entity = InitializedEntity::InitializeParameter(
+      Ctx, DestTypes[List.size()], false);
+  ExprResult Res = S.PerformCopyInitialization(Entity, E->getBeginLoc(), E);
+  if (Res.isInvalid())
+    return false;
+  Expr *Init = Res.get();
+  List.push_back(Init);
+  return true;
+}
+
+static bool BuildInitializerList(Sema &S, ASTContext &Ctx, Expr *E,
+                                 llvm::SmallVectorImpl<Expr *> &List,
+                                 llvm::SmallVectorImpl<QualType> &DestTypes) {
+  // If this is an initialization list, traverse the sub initializers.
+  if (auto *Init = dyn_cast<InitListExpr>(E)) {
+    for (auto *SubInit : Init->inits())
+      if (!BuildInitializerList(S, Ctx, SubInit, List, DestTypes))
+        return false;
+    return true;
+  }
+
+  // If this is a scalar type, just enqueue the expression.
+  QualType Ty = E->getType();
+
+  if (Ty->isScalarType() || (Ty->isRecordType() && !Ty->isAggregateType()))
+    return CastInitializer(S, Ctx, E, List, DestTypes);
+
+  if (auto *VecTy = Ty->getAs<VectorType>()) {
+    uint64_t Size = VecTy->getNumElements();
+
+    QualType SizeTy = Ctx.getSizeType();
+    uint64_t SizeTySize = Ctx.getTypeSize(SizeTy);
+    for (uint64_t I = 0; I < Size; ++I) {
+      auto *Idx = IntegerLiteral::Create(Ctx, llvm::APInt(SizeTySize, I),
+                                         SizeTy, SourceLocation());
+
+      ExprResult ElExpr = S.CreateBuiltinArraySubscriptExpr(
+          E, E->getBeginLoc(), Idx, E->getEndLoc());
+      if (ElExpr.isInvalid())
+        return false;
+      if (!CastInitializer(S, Ctx, ElExpr.get(), List, DestTypes))
+        return false;
+    }
+    return true;
+  }
+
+  if (auto *ArrTy = dyn_cast<ConstantArrayType>(Ty.getTypePtr())) {
+    uint64_t Size = ArrTy->getZExtSize();
+    QualType SizeTy = Ctx.getSizeType();
+    uint64_t SizeTySize = Ctx.getTypeSize(SizeTy);
+    for (uint64_t I = 0; I < Size; ++I) {
+      auto *Idx = IntegerLiteral::Create(Ctx, llvm::APInt(SizeTySize, I),
+                                         SizeTy, SourceLocation());
+      ExprResult ElExpr = S.CreateBuiltinArraySubscriptExpr(
+          E, E->getBeginLoc(), Idx, E->getEndLoc());
+      if (ElExpr.isInvalid())
+        return false;
+      if (!BuildInitializerList(S, Ctx, ElExpr.get(), List, DestTypes))
+        return false;
+    }
+    return true;
+  }
+
+  if (auto *RTy = Ty->getAs<RecordType>()) {
+    llvm::SmallVector<const RecordType *> RecordTypes;
+    RecordTypes.push_back(RTy);
+    while (RecordTypes.back()->getAsCXXRecordDecl()->getNumBases()) {
+      CXXRecordDecl *D = RecordTypes.back()->getAsCXXRecordDecl();
+      assert(D->getNumBases() == 1 &&
+             "HLSL doesn't support multiple inheritance");
+      RecordTypes.push_back(D->bases_begin()->getType()->getAs<RecordType>());
+    }
+    while (!RecordTypes.empty()) {
+      const RecordType *RT = RecordTypes.back();
+      RecordTypes.pop_back();
+      for (auto *FD : RT->getDecl()->fields()) {
+        DeclAccessPair Found = DeclAccessPair::make(FD, FD->getAccess());
+        DeclarationNameInfo NameInfo(FD->getDeclName(), E->getBeginLoc());
+        ExprResult Res = S.BuildFieldReferenceExpr(
+            E, false, E->getBeginLoc(), CXXScopeSpec(), FD, Found, NameInfo);
+        if (Res.isInvalid())
+          return false;
+        if (!BuildInitializerList(S, Ctx, Res.get(), List, DestTypes))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+static Expr *GenerateInitLists(ASTContext &Ctx, QualType Ty,
+                               llvm::SmallVectorImpl<Expr *>::iterator &It) {
+  if (Ty->isScalarType() || (Ty->isRecordType() && !Ty->isAggregateType())) {
+    return *(It++);
+  }
+  llvm::SmallVector<Expr *> Inits;
+  assert(!isa<MatrixType>(Ty) && "Matrix types not yet supported in HLSL");
+  Ty = Ty.getDesugaredType(Ctx);
+  if (Ty->isVectorType() || Ty->isConstantArrayType()) {
+    QualType ElTy;
+    uint64_t Size = 0;
+    if (auto *ATy = Ty->getAs<VectorType>()) {
+      ElTy = ATy->getElementType();
+      Size = ATy->getNumElements();
+    } else {
+      auto *VTy = cast<ConstantArrayType>(Ty.getTypePtr());
+      ElTy = VTy->getElementType();
+      Size = VTy->getZExtSize();
+    }
+    for (uint64_t I = 0; I < Size; ++I)
+      Inits.push_back(GenerateInitLists(Ctx, ElTy, It));
+  }
+  if (auto *RTy = Ty->getAs<RecordType>()) {
+    llvm::SmallVector<const RecordType *> RecordTypes;
+    RecordTypes.push_back(RTy);
+    while (RecordTypes.back()->getAsCXXRecordDecl()->getNumBases()) {
+      CXXRecordDecl *D = RecordTypes.back()->getAsCXXRecordDecl();
+      assert(D->getNumBases() == 1 &&
+             "HLSL doesn't support multiple inheritance");
+      RecordTypes.push_back(D->bases_begin()->getType()->getAs<RecordType>());
+    }
+    while (!RecordTypes.empty()) {
+      const RecordType *RT = RecordTypes.back();
+      RecordTypes.pop_back();
+      for (auto *FD : RT->getDecl()->fields()) {
+        Inits.push_back(GenerateInitLists(Ctx, FD->getType(), It));
+      }
+    }
+  }
+  auto *NewInit = new (Ctx) InitListExpr(Ctx, Inits.front()->getBeginLoc(),
+                                         Inits, Inits.back()->getEndLoc());
+  NewInit->setType(Ty);
+  return NewInit;
+}
+
+bool SemaHLSL::TransformInitList(const InitializedEntity &Entity,
+                                 const InitializationKind &Kind,
+                                 InitListExpr *Init) {
+  // If the initializer is a scalar, just return it.
+  if (Init->getType()->isScalarType())
+    return true;
+  ASTContext &Ctx = SemaRef.getASTContext();
+  llvm::SmallVector<QualType, 16> DestTypes;
+  // An initializer list might be attempting to initialize a reference or
+  // rvalue-reference. When checking the initializer we should look through the
+  // reference.
+  QualType InitTy = Entity.getType().getNonReferenceType();
+  BuildFlattenedTypeList(InitTy, DestTypes);
+
+  llvm::SmallVector<Expr *, 16> ArgExprs;
+  for (unsigned I = 0; I < Init->getNumInits(); ++I) {
+    Expr *E = Init->getInit(I);
+    if (E->HasSideEffects(Ctx)) {
+      QualType Ty = E->getType();
+      if (auto *RTy = Ty->getAs<RecordType>())
+        E = new (Ctx) MaterializeTemporaryExpr(Ty, E, E->isLValue());
+      E = new (Ctx) OpaqueValueExpr(E->getBeginLoc(), Ty, E->getValueKind(),
+                                    E->getObjectKind(), E);
+      Init->setInit(I, E);
+    }
+    if (!BuildInitializerList(SemaRef, Ctx, E, ArgExprs, DestTypes))
+      return false;
+  }
+
+  if (DestTypes.size() != ArgExprs.size()) {
+    int TooManyOrFew = ArgExprs.size() > DestTypes.size() ? 1 : 0;
+    SemaRef.Diag(Init->getBeginLoc(), diag::err_hlsl_incorrect_num_initializers)
+        << TooManyOrFew << InitTy << DestTypes.size() << ArgExprs.size();
+    return false;
+  }
+
+  auto It = ArgExprs.begin();
+  // GenerateInitLists will always return an InitListExpr here, because the
+  // scalar case is handled above.
+  auto *NewInit = cast<InitListExpr>(GenerateInitLists(Ctx, InitTy, It));
+  Init->resizeInits(Ctx, NewInit->getNumInits());
+  for (unsigned I = 0; I < NewInit->getNumInits(); ++I)
+    Init->updateInit(Ctx, I, NewInit->getInit(I));
+  return true;
+}
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 18090eb1c9e9a..6a76e6d74a4b0 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -26,6 +26,7 @@
 #include "clang/Sema/Initialization.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Ownership.h"
+#include "clang/Sema/SemaHLSL.h"
 #include "clang/Sema/SemaObjC.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/FoldingSet.h"
@@ -4787,6 +4788,10 @@ static void TryListInitialization(Sema &S,
                                   bool TreatUnavailableAsInvalid) {
   QualType DestType = Entity.getType();
 
+  if (S.getLangOpts().HLSL &&
+      !S.HLSL().TransformInitList(Entity, Kind, InitList))
+    return;
+
   // C++ doesn't allow scalar initialization with more than one argument.
   // But C99 complex numbers are scalars and it makes sense there.
   if (S.getLangOpts().CPlusPlus && DestType->isScalarType() &&
diff --git a/clang/test/CodeGenHLSL/ArrayTemporary.hlsl b/clang/test/CodeGenHLSL/ArrayTemporary.hlsl
index e5db7eac37a42..91a283554459d 100644
--- a/clang/test/CodeGenHLSL/ArrayTemporary.hlsl
+++ b/clang/test/CodeGenHLSL/ArrayTemporary.hlsl
@@ -1,3 +1,4 @@
+
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 
 void fn(float x[2]) { }
@@ -27,7 +28,7 @@ void fn2(Obj O[4]) { }
 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[Arr]], i32 32, i1 false)
 // CHECK: call void {{.*}}fn2{{.*}}(ptr noundef byval([4 x %struct.Obj]) align 4 [[Tmp]])
 void call2() {
-  Obj Arr[4] = {};
+  Obj Arr[4] = {{0, 0}, {0, 0}, {0, 0}, {0, 0}};
   fn2(Arr);
 }
 
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl
new file mode 100644
index 0000000000000..a0590162c7087
--- /dev/null
+++ b/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl
@@ -0,0 +1,963 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - %s | FileCheck %s
+
+struct TwoFloats { // Two float members: target in case1-4, source in case6-7.
+  float X, Y;
+};
+
+struct TwoInts { // Two int members: target in case5-6, source in case7.
+  int Z, W;
+};
+
+struct Doggo { // Vector, scalars and a vector array: 4 + 1 + 1 + 8 = 14 scalars.
+  int4 LegState;
+  int TailState;
+  float HairCount;
+  float4 EarDirection[2];
+};
+
+struct AnimalBits { // Also 14 scalars, but grouped and typed unlike Doggo.
+  int Legs[4];
+  uint State;
+  int64_t Counter;
+  float4 LeftDir;
+  float4 RightDir;
+};
+
+struct Kitteh { // Same member types and order as Doggo, with different names.
+  int4 Legs;
+  int TailState;
+  float HairCount;
+  float4 Claws[2];
+};
+
+struct Zoo { // Aggregate of arrays of aggregates: two Doggos, four Kittehs.
+  Doggo Dogs[2];
+  Kitteh Cats[4];
+};
+
+struct FourFloats : TwoFloats { // Adds Z and W to the inherited X and Y.
+  float Z, W;
+};
+
+struct SlicyBits { // Two int bit-fields, each 8 bits wide.
+  int Z : 8;
+  int W : 8;
+};
+
+// Case 1: Extraneous braces are ignored when initializing from literals.
+// CHECK-LABEL: define void @_Z5case1v(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 4 [[AGG_RESULT:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_RESULT]], ptr align 4 @__const._Z5case1v.TF1, i32 8, i1 false)
+// CHECK-NEXT:    ret void
+//
+TwoFloats case1() {
+  TwoFloats TF1 = {{{1.0, 2}}}; // Two extra brace levels around the two values.
+  return TF1;
+}
+
+// Case 2: Valid C/C++ initializer is handled appropriately.
+// CHECK-LABEL: define void @_Z5case2v(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 4 [[AGG_RESULT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_RESULT]], ptr align 4 @__const._Z5case2v.TF2, i32 8, i1 false)
+// CHECK-NEXT:    ret void
+//
+TwoFloats case2() {
+  TwoFloats TF2 = {1, 2}; // One literal per member, no extra braces.
+  return TF2;
+}
+
+// Case 3: Simple initialization with conversion of an argument.
+// CHECK-LABEL: define void @_Z5case3i(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 4 [[AGG_RESULT:%.*]], i32 noundef [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i32 [[VAL]], ptr [[VAL_ADDR]], align 4
+// CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[VAL_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+// CHECK-NEXT:    store float [[CONV]], ptr [[X]], align 4
+// CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    store float 2.000000e+00, ptr [[Y]], align 4
+// CHECK-NEXT:    ret void
+//
+TwoFloats case3(int Val) {
+  TwoFloats TF3 = {Val, 2}; // Val is an int, so it is converted to float for X.
+  return TF3;
+}
+
+// Case 4: Initialization from a scalarized vector into a structure with element
+// conversions.
+// CHECK-LABEL: define void @_Z5case4Dv2_i(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 4 [[AGG_RESULT:%.*]], <2 x i32> noundef [[TWOVALS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TWOVALS_ADDR:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    store <2 x i32> [[TWOVALS]], ptr [[TWOVALS_ADDR]], align 8
+// CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[TWOVALS_ADDR]], align 8
+// CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x i32> [[TMP0]], i64 0
+// CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[VECEXT]] to float
+// CHECK-NEXT:    store float [[CONV]], ptr [[X]], align 4
+// CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[TWOVALS_ADDR]], align 8
+// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
+// CHECK-NEXT:    [[CONV2:%.*]] = sitofp i32 [[VECEXT1]] to float
+// CHECK-NEXT:    store float [[CONV2]], ptr [[Y]], align 4
+// CHECK-NEXT:    ret void
+//
+TwoFloats case4(int2 TwoVals) {
+  TwoFloats TF4 = {TwoVals}; // The int2 supplies X and Y after conversion.
+  return TF4;
+}
+
+// Case 5: Initialization from a scalarized vector of matching type.
+// CHECK-LABEL: define void @_Z5case5Dv2_i(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOINTS:%.*]]) align 4 [[AGG_RESULT:%.*]], <2 x i32> noundef [[TWOVALS:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TWOVALS_ADDR:%.*]] = alloca <2 x i32>, align 8
+// CHECK-NEXT:    store <2 x i32> [[TWOVALS]], ptr [[TWOVALS_ADDR]], align 8
+// CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[TWOVALS_ADDR]], align 8
+// CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x i32> [[TMP0]], i64 0
+// CHECK-NEXT:    store i32 [[VECEXT]], ptr [[Z]], align 4
+// CHECK-NEXT:    [[W:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[TWOVALS_ADDR]], align 8
+// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
+// CHECK-NEXT:    store i32 [[VECEXT1]], ptr [[W]], align 4
+// CHECK-NEXT:    ret void
+//
+TwoInts case5(int2 TwoVals) {
+  TwoInts TI1 = {TwoVals}; // The int2 supplies Z and W without conversion.
+  return TI1;
+}
+
+// Case 6: Initialization from a scalarized structure of different type with
+// different element types.
+// CHECK-LABEL: define void @_Z5case69TwoFloats(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOINTS:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS:%.*]]) align 4 [[TF4:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF4]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[X]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[TMP0]] to i32
+// CHECK-NEXT:    store i32 [[CONV]], ptr [[Z]], align 4
+// CHECK-NEXT:    [[W:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF4]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[Y]], align 4
+// CHECK-NEXT:    [[CONV1:%.*]] = fptosi float [[TMP1]] to i32
+// CHECK-NEXT:    store i32 [[CONV1]], ptr [[W]], align 4
+// CHECK-NEXT:    ret void
+//
+TwoInts case6(TwoFloats TF4) {
+  TwoInts TI2 = {TF4}; // TF4.X and TF4.Y are converted to int for Z and W.
+  return TI2;
+}
+
+// Case 7: Initialization of a complex structure from a collection of scalar
+// values and structures, with bogus braces and element conversions.
+// CHECK-LABEL: define void @_Z5case77TwoIntsS_i9TwoFloatsS0_S0_S0_(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_DOGGO:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOINTS:%.*]]) align 4 [[TI1:%.*]], ptr noundef byval([[STRUCT_TWOINTS]]) align 4 [[TI2:%.*]], i32 noundef [[VAL:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS:%.*]]) align 4 [[TF1:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 4 [[TF2:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 4 [[TF3:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 4 [[TF4:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i32 [[VAL]], ptr [[VAL_ADDR]], align 4
+// CHECK-NEXT:    [[LEGSTATE:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[TI1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Z]], align 4
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+// CHECK-NEXT:    [[W:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[TI1]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[W]], align 4
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[TMP1]], i32 1
+// CHECK-NEXT:    [[Z2:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[TI2]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Z2]], align 4
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT1]], i32 [[TMP2]], i32 2
+// CHECK-NEXT:    [[W4:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[TI2]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[W4]], align 4
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[TMP3]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT5]], ptr [[LEGSTATE]], align 16
+// CHECK-NEXT:    [[TAILSTATE:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[VAL_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP4]], ptr [[TAILSTATE]], align 16
+// CHECK-NEXT:    [[HAIRCOUNT:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[AGG_RESULT]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[VAL_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP5]] to float
+// CHECK-NEXT:    store float [[CONV]], ptr [[HAIRCOUNT]], align 4
+// CHECK-NEXT:    [[EARDIRECTION:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[AGG_RESULT]], i32 0, i32 3
+// CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[X]], align 4
+// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 0
+// CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF1]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[Y]], align 4
+// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <4 x float> [[VECINIT6]], float [[TMP7]], i32 1
+// CHECK-NEXT:    [[X8:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF2]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[X8]], align 4
+// CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <4 x float> [[VECINIT7]], float [[TMP8]], i32 2
+// CHECK-NEXT:    [[Y10:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF2]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[Y10]], align 4
+// CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[VECINIT9]], float [[TMP9]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT11]], ptr [[EARDIRECTION]], align 16
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds <4 x float>, ptr [[EARDIRECTION]], i32 1
+// CHECK-NEXT:    [[X12:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF3]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[X12]], align 4
+// CHECK-NEXT:    [[VECINIT13:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i32 0
+// CHECK-NEXT:    [[Y14:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF3]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[Y14]], align 4
+// CHECK-NEXT:    [[VECINIT15:%.*]] = insertelement <4 x float> [[VECINIT13]], float [[TMP11]], i32 1
+// CHECK-NEXT:    [[X16:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF4]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[X16]], align 4
+// CHECK-NEXT:    [[VECINIT17:%.*]] = insertelement <4 x float> [[VECINIT15]], float [[TMP12]], i32 2
+// CHECK-NEXT:    [[Y18:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF4]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[Y18]], align 4
+// CHECK-NEXT:    [[VECINIT19:%.*]] = insertelement <4 x float> [[VECINIT17]], float [[TMP13]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT19]], ptr [[ARRAYINIT_ELEMENT]], align 16
+// CHECK-NEXT:    ret void
+//
+Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2,
+            TwoFloats TF3, TwoFloats TF4) { // Initializer supplies 14 scalars.
+  Doggo D1 = {TI1, TI2, {Val, Val}, {{TF1, TF2}, {TF3, TF4}}};
+  return D1;
+}
+
+// Case 8: Initialization of a structure from a different structure with
+// significantly different element types and grouping.
+// CHECK-LABEL: define void @_Z5case85Doggo(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_ANIMALBITS:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_DOGGO:%.*]]) align 16 [[D1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[LEGS:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    [[LEGSTATE:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[LEGSTATE]], align 16
+// CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[TMP0]], i64 0
+// CHECK-NEXT:    store i32 [[VECEXT]], ptr [[LEGS]], align 4
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds i32, ptr [[LEGS]], i32 1
+// CHECK-NEXT:    [[LEGSTATE1:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[LEGSTATE1]], align 16
+// CHECK-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 1
+// CHECK-NEXT:    store i32 [[VECEXT2]], ptr [[ARRAYINIT_ELEMENT]], align 4
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT3:%.*]] = getelementptr inbounds i32, ptr [[LEGS]], i32 2
+// CHECK-NEXT:    [[LEGSTATE4:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[LEGSTATE4]], align 16
+// CHECK-NEXT:    [[VECEXT5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 2
+// CHECK-NEXT:    store i32 [[VECEXT5]], ptr [[ARRAYINIT_ELEMENT3]], align 4
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT6:%.*]] = getelementptr inbounds i32, ptr [[LEGS]], i32 3
+// CHECK-NEXT:    [[LEGSTATE7:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[LEGSTATE7]], align 16
+// CHECK-NEXT:    [[VECEXT8:%.*]] = extractelement <4 x i32> [[TMP3]], i64 3
+// CHECK-NEXT:    store i32 [[VECEXT8]], ptr [[ARRAYINIT_ELEMENT6]], align 4
+// CHECK-NEXT:    [[STATE:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[TAILSTATE:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TAILSTATE]], align 16
+// CHECK-NEXT:    store i32 [[TMP4]], ptr [[STATE]], align 16
+// CHECK-NEXT:    [[COUNTER:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[AGG_RESULT]], i32 0, i32 2
+// CHECK-NEXT:    [[HAIRCOUNT:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[HAIRCOUNT]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[TMP5]] to i64
+// CHECK-NEXT:    store i64 [[CONV]], ptr [[COUNTER]], align 8
+// CHECK-NEXT:    [[LEFTDIR:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[AGG_RESULT]], i32 0, i32 3
+// CHECK-NEXT:    [[EARDIRECTION:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VECEXT9:%.*]] = extractelement <4 x float> [[TMP6]], i64 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x float> poison, float [[VECEXT9]], i32 0
+// CHECK-NEXT:    [[EARDIRECTION10:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION10]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX11]], align 16
+// CHECK-NEXT:    [[VECEXT12:%.*]] = extractelement <4 x float> [[TMP7]], i64 1
+// CHECK-NEXT:    [[VECINIT13:%.*]] = insertelement <4 x float> [[VECINIT]], float [[VECEXT12]], i32 1
+// CHECK-NEXT:    [[EARDIRECTION14:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION14]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX15]], align 16
+// CHECK-NEXT:    [[VECEXT16:%.*]] = extractelement <4 x float> [[TMP8]], i64 2
+// CHECK-NEXT:    [[VECINIT17:%.*]] = insertelement <4 x float> [[VECINIT13]], float [[VECEXT16]], i32 2
+// CHECK-NEXT:    [[EARDIRECTION18:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION18]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX19]], align 16
+// CHECK-NEXT:    [[VECEXT20:%.*]] = extractelement <4 x float> [[TMP9]], i64 3
+// CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x float> [[VECINIT17]], float [[VECEXT20]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT21]], ptr [[LEFTDIR]], align 16
+// CHECK-NEXT:    [[RIGHTDIR:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[AGG_RESULT]], i32 0, i32 4
+// CHECK-NEXT:    [[EARDIRECTION22:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION22]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX23]], align 16
+// CHECK-NEXT:    [[VECEXT24:%.*]] = extractelement <4 x float> [[TMP10]], i64 0
+// CHECK-NEXT:    [[VECINIT25:%.*]] = insertelement <4 x float> poison, float [[VECEXT24]], i32 0
+// CHECK-NEXT:    [[EARDIRECTION26:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION26]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX27]], align 16
+// CHECK-NEXT:    [[VECEXT28:%.*]] = extractelement <4 x float> [[TMP11]], i64 1
+// CHECK-NEXT:    [[VECINIT29:%.*]] = insertelement <4 x float> [[VECINIT25]], float [[VECEXT28]], i32 1
+// CHECK-NEXT:    [[EARDIRECTION30:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION30]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX31]], align 16
+// CHECK-NEXT:    [[VECEXT32:%.*]] = extractelement <4 x float> [[TMP12]], i64 2
+// CHECK-NEXT:    [[VECINIT33:%.*]] = insertelement <4 x float> [[VECINIT29]], float [[VECEXT32]], i32 2
+// CHECK-NEXT:    [[EARDIRECTION34:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION34]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX35]], align 16
+// CHECK-NEXT:    [[VECEXT36:%.*]] = extractelement <4 x float> [[TMP13]], i64 3
+// CHECK-NEXT:    [[VECINIT37:%.*]] = insertelement <4 x float> [[VECINIT33]], float [[VECEXT36]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT37]], ptr [[RIGHTDIR]], align 16
+// CHECK-NEXT:    ret void
+//
+AnimalBits case8(Doggo D1) {
+  AnimalBits A1 = {D1}; // D1's 14 scalars fill A1's members in order.
+  return A1;
+}
+
+// Case 9: Everything everywhere all at once... Initializing mismatched
+// structures from different layouts, different component groupings, with no
+// top-level bracing separation.
+// CHECK-LABEL: define void @_Z5case95Doggo10AnimalBits(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_ZOO:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_DOGGO:%.*]]) align 16 [[D1:%.*]], ptr noundef byval([[STRUCT_ANIMALBITS:%.*]]) align 16 [[A1:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[DOGS:%.*]] = getelementptr inbounds nuw [[STRUCT_ZOO]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    [[LEGSTATE:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[DOGS]], i32 0, i32 0
+// CHECK-NEXT:    [[LEGSTATE1:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[LEGSTATE1]], align 16
+// CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[TMP0]], i64 0
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VECEXT]], i32 0
+// CHECK-NEXT:    [[LEGSTATE2:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[LEGSTATE2]], align 16
+// CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <4 x i32> [[TMP1]], i64 1
+// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VECEXT3]], i32 1
+// CHECK-NEXT:    [[LEGSTATE5:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[LEGSTATE5]], align 16
+// CHECK-NEXT:    [[VECEXT6:%.*]] = extractelement <4 x i32> [[TMP2]], i64 2
+// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <4 x i32> [[VECINIT4]], i32 [[VECEXT6]], i32 2
+// CHECK-NEXT:    [[LEGSTATE8:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[LEGSTATE8]], align 16
+// CHECK-NEXT:    [[VECEXT9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 3
+// CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT7]], i32 [[VECEXT9]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT10]], ptr [[LEGSTATE]], align 16
+// CHECK-NEXT:    [[TAILSTATE:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[DOGS]], i32 0, i32 1
+// CHECK-NEXT:    [[TAILSTATE11:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TAILSTATE11]], align 16
+// CHECK-NEXT:    store i32 [[TMP4]], ptr [[TAILSTATE]], align 16
+// CHECK-NEXT:    [[HAIRCOUNT:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[DOGS]], i32 0, i32 2
+// CHECK-NEXT:    [[HAIRCOUNT12:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[HAIRCOUNT12]], align 4
+// CHECK-NEXT:    store float [[TMP5]], ptr [[HAIRCOUNT]], align 4
+// CHECK-NEXT:    [[EARDIRECTION:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[DOGS]], i32 0, i32 3
+// CHECK-NEXT:    [[EARDIRECTION13:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION13]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VECEXT14:%.*]] = extractelement <4 x float> [[TMP6]], i64 0
+// CHECK-NEXT:    [[VECINIT15:%.*]] = insertelement <4 x float> poison, float [[VECEXT14]], i32 0
+// CHECK-NEXT:    [[EARDIRECTION16:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION16]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX17]], align 16
+// CHECK-NEXT:    [[VECEXT18:%.*]] = extractelement <4 x float> [[TMP7]], i64 1
+// CHECK-NEXT:    [[VECINIT19:%.*]] = insertelement <4 x float> [[VECINIT15]], float [[VECEXT18]], i32 1
+// CHECK-NEXT:    [[EARDIRECTION20:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION20]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX21]], align 16
+// CHECK-NEXT:    [[VECEXT22:%.*]] = extractelement <4 x float> [[TMP8]], i64 2
+// CHECK-NEXT:    [[VECINIT23:%.*]] = insertelement <4 x float> [[VECINIT19]], float [[VECEXT22]], i32 2
+// CHECK-NEXT:    [[EARDIRECTION24:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION24]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX25]], align 16
+// CHECK-NEXT:    [[VECEXT26:%.*]] = extractelement <4 x float> [[TMP9]], i64 3
+// CHECK-NEXT:    [[VECINIT27:%.*]] = insertelement <4 x float> [[VECINIT23]], float [[VECEXT26]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT27]], ptr [[EARDIRECTION]], align 16
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds <4 x float>, ptr [[EARDIRECTION]], i32 1
+// CHECK-NEXT:    [[EARDIRECTION28:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX29:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION28]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX29]], align 16
+// CHECK-NEXT:    [[VECEXT30:%.*]] = extractelement <4 x float> [[TMP10]], i64 0
+// CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x float> poison, float [[VECEXT30]], i32 0
+// CHECK-NEXT:    [[EARDIRECTION32:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION32]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX33]], align 16
+// CHECK-NEXT:    [[VECEXT34:%.*]] = extractelement <4 x float> [[TMP11]], i64 1
+// CHECK-NEXT:    [[VECINIT35:%.*]] = insertelement <4 x float> [[VECINIT31]], float [[VECEXT34]], i32 1
+// CHECK-NEXT:    [[EARDIRECTION36:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION36]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP12:%.*]] = load <4 x float>, ptr [[ARRAYIDX37]], align 16
+// CHECK-NEXT:    [[VECEXT38:%.*]] = extractelement <4 x float> [[TMP12]], i64 2
+// CHECK-NEXT:    [[VECINIT39:%.*]] = insertelement <4 x float> [[VECINIT35]], float [[VECEXT38]], i32 2
+// CHECK-NEXT:    [[EARDIRECTION40:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION40]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX41]], align 16
+// CHECK-NEXT:    [[VECEXT42:%.*]] = extractelement <4 x float> [[TMP13]], i64 3
+// CHECK-NEXT:    [[VECINIT43:%.*]] = insertelement <4 x float> [[VECINIT39]], float [[VECEXT42]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT43]], ptr [[ARRAYINIT_ELEMENT]], align 16
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT44:%.*]] = getelementptr inbounds [[STRUCT_DOGGO]], ptr [[DOGS]], i32 1
+// CHECK-NEXT:    [[LEGSTATE45:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[ARRAYINIT_ELEMENT44]], i32 0, i32 0
+// CHECK-NEXT:    [[LEGS:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[LEGS]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX46]], align 16
+// CHECK-NEXT:    [[VECINIT47:%.*]] = insertelement <4 x i32> poison, i32 [[TMP14]], i32 0
+// CHECK-NEXT:    [[LEGS48:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[LEGS48]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX49]], align 4
+// CHECK-NEXT:    [[VECINIT50:%.*]] = insertelement <4 x i32> [[VECINIT47]], i32 [[TMP15]], i32 1
+// CHECK-NEXT:    [[LEGS51:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[LEGS51]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX52]], align 8
+// CHECK-NEXT:    [[VECINIT53:%.*]] = insertelement <4 x i32> [[VECINIT50]], i32 [[TMP16]], i32 2
+// CHECK-NEXT:    [[LEGS54:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[LEGS54]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX55]], align 4
+// CHECK-NEXT:    [[VECINIT56:%.*]] = insertelement <4 x i32> [[VECINIT53]], i32 [[TMP17]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT56]], ptr [[LEGSTATE45]], align 16
+// CHECK-NEXT:    [[TAILSTATE57:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[ARRAYINIT_ELEMENT44]], i32 0, i32 1
+// CHECK-NEXT:    [[STATE:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[STATE]], align 16
+// CHECK-NEXT:    store i32 [[TMP18]], ptr [[TAILSTATE57]], align 16
+// CHECK-NEXT:    [[HAIRCOUNT58:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[ARRAYINIT_ELEMENT44]], i32 0, i32 2
+// CHECK-NEXT:    [[COUNTER:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[COUNTER]], align 8
+// CHECK-NEXT:    [[CONV:%.*]] = sitofp i64 [[TMP19]] to float
+// CHECK-NEXT:    store float [[CONV]], ptr [[HAIRCOUNT58]], align 4
+// CHECK-NEXT:    [[EARDIRECTION59:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[ARRAYINIT_ELEMENT44]], i32 0, i32 3
+// CHECK-NEXT:    [[LEFTDIR:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP20:%.*]] = load <4 x float>, ptr [[LEFTDIR]], align 16
+// CHECK-NEXT:    [[VECEXT60:%.*]] = extractelement <4 x float> [[TMP20]], i64 0
+// CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <4 x float> poison, float [[VECEXT60]], i32 0
+// CHECK-NEXT:    [[LEFTDIR62:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP21:%.*]] = load <4 x float>, ptr [[LEFTDIR62]], align 16
+// CHECK-NEXT:    [[VECEXT63:%.*]] = extractelement <4 x float> [[TMP21]], i64 1
+// CHECK-NEXT:    [[VECINIT64:%.*]] = insertelement <4 x float> [[VECINIT61]], float [[VECEXT63]], i32 1
+// CHECK-NEXT:    [[LEFTDIR65:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP22:%.*]] = load <4 x float>, ptr [[LEFTDIR65]], align 16
+// CHECK-NEXT:    [[VECEXT66:%.*]] = extractelement <4 x float> [[TMP22]], i64 2
+// CHECK-NEXT:    [[VECINIT67:%.*]] = insertelement <4 x float> [[VECINIT64]], float [[VECEXT66]], i32 2
+// CHECK-NEXT:    [[LEFTDIR68:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP23:%.*]] = load <4 x float>, ptr [[LEFTDIR68]], align 16
+// CHECK-NEXT:    [[VECEXT69:%.*]] = extractelement <4 x float> [[TMP23]], i64 3
+// CHECK-NEXT:    [[VECINIT70:%.*]] = insertelement <4 x float> [[VECINIT67]], float [[VECEXT69]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT70]], ptr [[EARDIRECTION59]], align 16
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT71:%.*]] = getelementptr inbounds <4 x float>, ptr [[EARDIRECTION59]], i32 1
+// CHECK-NEXT:    [[RIGHTDIR:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 4
+// CHECK-NEXT:    [[TMP24:%.*]] = load <4 x float>, ptr [[RIGHTDIR]], align 16
+// CHECK-NEXT:    [[VECEXT72:%.*]] = extractelement <4 x float> [[TMP24]], i64 0
+// CHECK-NEXT:    [[VECINIT73:%.*]] = insertelement <4 x float> poison, float [[VECEXT72]], i32 0
+// CHECK-NEXT:    [[RIGHTDIR74:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 4
+// CHECK-NEXT:    [[TMP25:%.*]] = load <4 x float>, ptr [[RIGHTDIR74]], align 16
+// CHECK-NEXT:    [[VECEXT75:%.*]] = extractelement <4 x float> [[TMP25]], i64 1
+// CHECK-NEXT:    [[VECINIT76:%.*]] = insertelement <4 x float> [[VECINIT73]], float [[VECEXT75]], i32 1
+// CHECK-NEXT:    [[RIGHTDIR77:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 4
+// CHECK-NEXT:    [[TMP26:%.*]] = load <4 x float>, ptr [[RIGHTDIR77]], align 16
+// CHECK-NEXT:    [[VECEXT78:%.*]] = extractelement <4 x float> [[TMP26]], i64 2
+// CHECK-NEXT:    [[VECINIT79:%.*]] = insertelement <4 x float> [[VECINIT76]], float [[VECEXT78]], i32 2
+// CHECK-NEXT:    [[RIGHTDIR80:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 4
+// CHECK-NEXT:    [[TMP27:%.*]] = load <4 x float>, ptr [[RIGHTDIR80]], align 16
+// CHECK-NEXT:    [[VECEXT81:%.*]] = extractelement <4 x float> [[TMP27]], i64 3
+// CHECK-NEXT:    [[VECINIT82:%.*]] = insertelement <4 x float> [[VECINIT79]], float [[VECEXT81]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT82]], ptr [[ARRAYINIT_ELEMENT71]], align 16
+// CHECK-NEXT:    [[CATS:%.*]] = getelementptr inbounds nuw [[STRUCT_ZOO]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[LEGS83:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH:%.*]], ptr [[CATS]], i32 0, i32 0
+// CHECK-NEXT:    [[LEGSTATE84:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP28:%.*]] = load <4 x i32>, ptr [[LEGSTATE84]], align 16
+// CHECK-NEXT:    [[VECEXT85:%.*]] = extractelement <4 x i32> [[TMP28]], i64 0
+// CHECK-NEXT:    [[VECINIT86:%.*]] = insertelement <4 x i32> poison, i32 [[VECEXT85]], i32 0
+// CHECK-NEXT:    [[LEGSTATE87:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP29:%.*]] = load <4 x i32>, ptr [[LEGSTATE87]], align 16
+// CHECK-NEXT:    [[VECEXT88:%.*]] = extractelement <4 x i32> [[TMP29]], i64 1
+// CHECK-NEXT:    [[VECINIT89:%.*]] = insertelement <4 x i32> [[VECINIT86]], i32 [[VECEXT88]], i32 1
+// CHECK-NEXT:    [[LEGSTATE90:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP30:%.*]] = load <4 x i32>, ptr [[LEGSTATE90]], align 16
+// CHECK-NEXT:    [[VECEXT91:%.*]] = extractelement <4 x i32> [[TMP30]], i64 2
+// CHECK-NEXT:    [[VECINIT92:%.*]] = insertelement <4 x i32> [[VECINIT89]], i32 [[VECEXT91]], i32 2
+// CHECK-NEXT:    [[LEGSTATE93:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP31:%.*]] = load <4 x i32>, ptr [[LEGSTATE93]], align 16
+// CHECK-NEXT:    [[VECEXT94:%.*]] = extractelement <4 x i32> [[TMP31]], i64 3
+// CHECK-NEXT:    [[VECINIT95:%.*]] = insertelement <4 x i32> [[VECINIT92]], i32 [[VECEXT94]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT95]], ptr [[LEGS83]], align 16
+// CHECK-NEXT:    [[TAILSTATE96:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[CATS]], i32 0, i32 1
+// CHECK-NEXT:    [[TAILSTATE97:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TAILSTATE97]], align 16
+// CHECK-NEXT:    store i32 [[TMP32]], ptr [[TAILSTATE96]], align 16
+// CHECK-NEXT:    [[HAIRCOUNT98:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[CATS]], i32 0, i32 2
+// CHECK-NEXT:    [[HAIRCOUNT99:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP33:%.*]] = load float, ptr [[HAIRCOUNT99]], align 4
+// CHECK-NEXT:    store float [[TMP33]], ptr [[HAIRCOUNT98]], align 4
+// CHECK-NEXT:    [[CLAWS:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[CATS]], i32 0, i32 3
+// CHECK-NEXT:    [[EARDIRECTION100:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX101:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION100]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP34:%.*]] = load <4 x float>, ptr [[ARRAYIDX101]], align 16
+// CHECK-NEXT:    [[VECEXT102:%.*]] = extractelement <4 x float> [[TMP34]], i64 0
+// CHECK-NEXT:    [[VECINIT103:%.*]] = insertelement <4 x float> poison, float [[VECEXT102]], i32 0
+// CHECK-NEXT:    [[EARDIRECTION104:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX105:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION104]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP35:%.*]] = load <4 x float>, ptr [[ARRAYIDX105]], align 16
+// CHECK-NEXT:    [[VECEXT106:%.*]] = extractelement <4 x float> [[TMP35]], i64 1
+// CHECK-NEXT:    [[VECINIT107:%.*]] = insertelement <4 x float> [[VECINIT103]], float [[VECEXT106]], i32 1
+// CHECK-NEXT:    [[EARDIRECTION108:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX109:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION108]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP36:%.*]] = load <4 x float>, ptr [[ARRAYIDX109]], align 16
+// CHECK-NEXT:    [[VECEXT110:%.*]] = extractelement <4 x float> [[TMP36]], i64 2
+// CHECK-NEXT:    [[VECINIT111:%.*]] = insertelement <4 x float> [[VECINIT107]], float [[VECEXT110]], i32 2
+// CHECK-NEXT:    [[EARDIRECTION112:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX113:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION112]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP37:%.*]] = load <4 x float>, ptr [[ARRAYIDX113]], align 16
+// CHECK-NEXT:    [[VECEXT114:%.*]] = extractelement <4 x float> [[TMP37]], i64 3
+// CHECK-NEXT:    [[VECINIT115:%.*]] = insertelement <4 x float> [[VECINIT111]], float [[VECEXT114]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT115]], ptr [[CLAWS]], align 16
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT116:%.*]] = getelementptr inbounds <4 x float>, ptr [[CLAWS]], i32 1
+// CHECK-NEXT:    [[EARDIRECTION117:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX118:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION117]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP38:%.*]] = load <4 x float>, ptr [[ARRAYIDX118]], align 16
+// CHECK-NEXT:    [[VECEXT119:%.*]] = extractelement <4 x float> [[TMP38]], i64 0
+// CHECK-NEXT:    [[VECINIT120:%.*]] = insertelement <4 x float> poison, float [[VECEXT119]], i32 0
+// CHECK-NEXT:    [[EARDIRECTION121:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX122:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION121]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP39:%.*]] = load <4 x float>, ptr [[ARRAYIDX122]], align 16
+// CHECK-NEXT:    [[VECEXT123:%.*]] = extractelement <4 x float> [[TMP39]], i64 1
+// CHECK-NEXT:    [[VECINIT124:%.*]] = insertelement <4 x float> [[VECINIT120]], float [[VECEXT123]], i32 1
+// CHECK-NEXT:    [[EARDIRECTION125:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX126:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION125]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP40:%.*]] = load <4 x float>, ptr [[ARRAYIDX126]], align 16
+// CHECK-NEXT:    [[VECEXT127:%.*]] = extractelement <4 x float> [[TMP40]], i64 2
+// CHECK-NEXT:    [[VECINIT128:%.*]] = insertelement <4 x float> [[VECINIT124]], float [[VECEXT127]], i32 2
+// CHECK-NEXT:    [[EARDIRECTION129:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX130:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION129]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP41:%.*]] = load <4 x float>, ptr [[ARRAYIDX130]], align 16
+// CHECK-NEXT:    [[VECEXT131:%.*]] = extractelement <4 x float> [[TMP41]], i64 3
+// CHECK-NEXT:    [[VECINIT132:%.*]] = insertelement <4 x float> [[VECINIT128]], float [[VECEXT131]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT132]], ptr [[ARRAYINIT_ELEMENT116]], align 16
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT133:%.*]] = getelementptr inbounds [[STRUCT_KITTEH]], ptr [[CATS]], i32 1
+// CHECK-NEXT:    [[LEGS134:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[ARRAYINIT_ELEMENT133]], i32 0, i32 0
+// CHECK-NEXT:    [[LEGS135:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX136:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[LEGS135]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP42:%.*]] = load i32, ptr [[ARRAYIDX136]], align 16
+// CHECK-NEXT:    [[VECINIT137:%.*]] = insertelement <4 x i32> poison, i32 [[TMP42]], i32 0
+// CHECK-NEXT:    [[LEGS138:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX139:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[LEGS138]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX139]], align 4
+// CHECK-NEXT:    [[VECINIT140:%.*]] = insertelement <4 x i32> [[VECINIT137]], i32 [[TMP43]], i32 1
+// CHECK-NEXT:    [[LEGS141:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX142:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[LEGS141]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP44:%.*]] = load i32, ptr [[ARRAYIDX142]], align 8
+// CHECK-NEXT:    [[VECINIT143:%.*]] = insertelement <4 x i32> [[VECINIT140]], i32 [[TMP44]], i32 2
+// CHECK-NEXT:    [[LEGS144:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX145:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[LEGS144]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP45:%.*]] = load i32, ptr [[ARRAYIDX145]], align 4
+// CHECK-NEXT:    [[VECINIT146:%.*]] = insertelement <4 x i32> [[VECINIT143]], i32 [[TMP45]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT146]], ptr [[LEGS134]], align 16
+// CHECK-NEXT:    [[TAILSTATE147:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[ARRAYINIT_ELEMENT133]], i32 0, i32 1
+// CHECK-NEXT:    [[STATE148:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP46:%.*]] = load i32, ptr [[STATE148]], align 16
+// CHECK-NEXT:    store i32 [[TMP46]], ptr [[TAILSTATE147]], align 16
+// CHECK-NEXT:    [[HAIRCOUNT149:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[ARRAYINIT_ELEMENT133]], i32 0, i32 2
+// CHECK-NEXT:    [[COUNTER150:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP47:%.*]] = load i64, ptr [[COUNTER150]], align 8
+// CHECK-NEXT:    [[CONV151:%.*]] = sitofp i64 [[TMP47]] to float
+// CHECK-NEXT:    store float [[CONV151]], ptr [[HAIRCOUNT149]], align 4
+// CHECK-NEXT:    [[CLAWS152:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[ARRAYINIT_ELEMENT133]], i32 0, i32 3
+// CHECK-NEXT:    [[LEFTDIR153:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP48:%.*]] = load <4 x float>, ptr [[LEFTDIR153]], align 16
+// CHECK-NEXT:    [[VECEXT154:%.*]] = extractelement <4 x float> [[TMP48]], i64 0
+// CHECK-NEXT:    [[VECINIT155:%.*]] = insertelement <4 x float> poison, float [[VECEXT154]], i32 0
+// CHECK-NEXT:    [[LEFTDIR156:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP49:%.*]] = load <4 x float>, ptr [[LEFTDIR156]], align 16
+// CHECK-NEXT:    [[VECEXT157:%.*]] = extractelement <4 x float> [[TMP49]], i64 1
+// CHECK-NEXT:    [[VECINIT158:%.*]] = insertelement <4 x float> [[VECINIT155]], float [[VECEXT157]], i32 1
+// CHECK-NEXT:    [[LEFTDIR159:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP50:%.*]] = load <4 x float>, ptr [[LEFTDIR159]], align 16
+// CHECK-NEXT:    [[VECEXT160:%.*]] = extractelement <4 x float> [[TMP50]], i64 2
+// CHECK-NEXT:    [[VECINIT161:%.*]] = insertelement <4 x float> [[VECINIT158]], float [[VECEXT160]], i32 2
+// CHECK-NEXT:    [[LEFTDIR162:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP51:%.*]] = load <4 x float>, ptr [[LEFTDIR162]], align 16
+// CHECK-NEXT:    [[VECEXT163:%.*]] = extractelement <4 x float> [[TMP51]], i64 3
+// CHECK-NEXT:    [[VECINIT164:%.*]] = insertelement <4 x float> [[VECINIT161]], float [[VECEXT163]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT164]], ptr [[CLAWS152]], align 16
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT165:%.*]] = getelementptr inbounds <4 x float>, ptr [[CLAWS152]], i32 1
+// CHECK-NEXT:    [[RIGHTDIR166:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 4
+// CHECK-NEXT:    [[TMP52:%.*]] = load <4 x float>, ptr [[RIGHTDIR166]], align 16
+// CHECK-NEXT:    [[VECEXT167:%.*]] = extractelement <4 x float> [[TMP52]], i64 0
+// CHECK-NEXT:    [[VECINIT168:%.*]] = insertelement <4 x float> poison, float [[VECEXT167]], i32 0
+// CHECK-NEXT:    [[RIGHTDIR169:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 4
+// CHECK-NEXT:    [[TMP53:%.*]] = load <4 x float>, ptr [[RIGHTDIR169]], align 16
+// CHECK-NEXT:    [[VECEXT170:%.*]] = extractelement <4 x float> [[TMP53]], i64 1
+// CHECK-NEXT:    [[VECINIT171:%.*]] = insertelement <4 x float> [[VECINIT168]], float [[VECEXT170]], i32 1
+// CHECK-NEXT:    [[RIGHTDIR172:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 4
+// CHECK-NEXT:    [[TMP54:%.*]] = load <4 x float>, ptr [[RIGHTDIR172]], align 16
+// CHECK-NEXT:    [[VECEXT173:%.*]] = extractelement <4 x float> [[TMP54]], i64 2
+// CHECK-NEXT:    [[VECINIT174:%.*]] = insertelement <4 x float> [[VECINIT171]], float [[VECEXT173]], i32 2
+// CHECK-NEXT:    [[RIGHTDIR175:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 4
+// CHECK-NEXT:    [[TMP55:%.*]] = load <4 x float>, ptr [[RIGHTDIR175]], align 16
+// CHECK-NEXT:    [[VECEXT176:%.*]] = extractelement <4 x float> [[TMP55]], i64 3
+// CHECK-NEXT:    [[VECINIT177:%.*]] = insertelement <4 x float> [[VECINIT174]], float [[VECEXT176]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT177]], ptr [[ARRAYINIT_ELEMENT165]], align 16
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT178:%.*]] = getelementptr inbounds [[STRUCT_KITTEH]], ptr [[CATS]], i32 2
+// CHECK-NEXT:    [[LEGS179:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[ARRAYINIT_ELEMENT178]], i32 0, i32 0
+// CHECK-NEXT:    [[LEGSTATE180:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP56:%.*]] = load <4 x i32>, ptr [[LEGSTATE180]], align 16
+// CHECK-NEXT:    [[VECEXT181:%.*]] = extractelement <4 x i32> [[TMP56]], i64 0
+// CHECK-NEXT:    [[VECINIT182:%.*]] = insertelement <4 x i32> poison, i32 [[VECEXT181]], i32 0
+// CHECK-NEXT:    [[LEGSTATE183:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP57:%.*]] = load <4 x i32>, ptr [[LEGSTATE183]], align 16
+// CHECK-NEXT:    [[VECEXT184:%.*]] = extractelement <4 x i32> [[TMP57]], i64 1
+// CHECK-NEXT:    [[VECINIT185:%.*]] = insertelement <4 x i32> [[VECINIT182]], i32 [[VECEXT184]], i32 1
+// CHECK-NEXT:    [[LEGSTATE186:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP58:%.*]] = load <4 x i32>, ptr [[LEGSTATE186]], align 16
+// CHECK-NEXT:    [[VECEXT187:%.*]] = extractelement <4 x i32> [[TMP58]], i64 2
+// CHECK-NEXT:    [[VECINIT188:%.*]] = insertelement <4 x i32> [[VECINIT185]], i32 [[VECEXT187]], i32 2
+// CHECK-NEXT:    [[LEGSTATE189:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP59:%.*]] = load <4 x i32>, ptr [[LEGSTATE189]], align 16
+// CHECK-NEXT:    [[VECEXT190:%.*]] = extractelement <4 x i32> [[TMP59]], i64 3
+// CHECK-NEXT:    [[VECINIT191:%.*]] = insertelement <4 x i32> [[VECINIT188]], i32 [[VECEXT190]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT191]], ptr [[LEGS179]], align 16
+// CHECK-NEXT:    [[TAILSTATE192:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[ARRAYINIT_ELEMENT178]], i32 0, i32 1
+// CHECK-NEXT:    [[TAILSTATE193:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP60:%.*]] = load i32, ptr [[TAILSTATE193]], align 16
+// CHECK-NEXT:    store i32 [[TMP60]], ptr [[TAILSTATE192]], align 16
+// CHECK-NEXT:    [[HAIRCOUNT194:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[ARRAYINIT_ELEMENT178]], i32 0, i32 2
+// CHECK-NEXT:    [[HAIRCOUNT195:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP61:%.*]] = load float, ptr [[HAIRCOUNT195]], align 4
+// CHECK-NEXT:    store float [[TMP61]], ptr [[HAIRCOUNT194]], align 4
+// CHECK-NEXT:    [[CLAWS196:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[ARRAYINIT_ELEMENT178]], i32 0, i32 3
+// CHECK-NEXT:    [[EARDIRECTION197:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX198:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION197]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP62:%.*]] = load <4 x float>, ptr [[ARRAYIDX198]], align 16
+// CHECK-NEXT:    [[VECEXT199:%.*]] = extractelement <4 x float> [[TMP62]], i64 0
+// CHECK-NEXT:    [[VECINIT200:%.*]] = insertelement <4 x float> poison, float [[VECEXT199]], i32 0
+// CHECK-NEXT:    [[EARDIRECTION201:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX202:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION201]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP63:%.*]] = load <4 x float>, ptr [[ARRAYIDX202]], align 16
+// CHECK-NEXT:    [[VECEXT203:%.*]] = extractelement <4 x float> [[TMP63]], i64 1
+// CHECK-NEXT:    [[VECINIT204:%.*]] = insertelement <4 x float> [[VECINIT200]], float [[VECEXT203]], i32 1
+// CHECK-NEXT:    [[EARDIRECTION205:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX206:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION205]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP64:%.*]] = load <4 x float>, ptr [[ARRAYIDX206]], align 16
+// CHECK-NEXT:    [[VECEXT207:%.*]] = extractelement <4 x float> [[TMP64]], i64 2
+// CHECK-NEXT:    [[VECINIT208:%.*]] = insertelement <4 x float> [[VECINIT204]], float [[VECEXT207]], i32 2
+// CHECK-NEXT:    [[EARDIRECTION209:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX210:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION209]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP65:%.*]] = load <4 x float>, ptr [[ARRAYIDX210]], align 16
+// CHECK-NEXT:    [[VECEXT211:%.*]] = extractelement <4 x float> [[TMP65]], i64 3
+// CHECK-NEXT:    [[VECINIT212:%.*]] = insertelement <4 x float> [[VECINIT208]], float [[VECEXT211]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT212]], ptr [[CLAWS196]], align 16
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT213:%.*]] = getelementptr inbounds <4 x float>, ptr [[CLAWS196]], i32 1
+// CHECK-NEXT:    [[EARDIRECTION214:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX215:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION214]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP66:%.*]] = load <4 x float>, ptr [[ARRAYIDX215]], align 16
+// CHECK-NEXT:    [[VECEXT216:%.*]] = extractelement <4 x float> [[TMP66]], i64 0
+// CHECK-NEXT:    [[VECINIT217:%.*]] = insertelement <4 x float> poison, float [[VECEXT216]], i32 0
+// CHECK-NEXT:    [[EARDIRECTION218:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX219:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION218]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP67:%.*]] = load <4 x float>, ptr [[ARRAYIDX219]], align 16
+// CHECK-NEXT:    [[VECEXT220:%.*]] = extractelement <4 x float> [[TMP67]], i64 1
+// CHECK-NEXT:    [[VECINIT221:%.*]] = insertelement <4 x float> [[VECINIT217]], float [[VECEXT220]], i32 1
+// CHECK-NEXT:    [[EARDIRECTION222:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX223:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION222]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP68:%.*]] = load <4 x float>, ptr [[ARRAYIDX223]], align 16
+// CHECK-NEXT:    [[VECEXT224:%.*]] = extractelement <4 x float> [[TMP68]], i64 2
+// CHECK-NEXT:    [[VECINIT225:%.*]] = insertelement <4 x float> [[VECINIT221]], float [[VECEXT224]], i32 2
+// CHECK-NEXT:    [[EARDIRECTION226:%.*]] = getelementptr inbounds nuw [[STRUCT_DOGGO]], ptr [[D1]], i32 0, i32 3
+// CHECK-NEXT:    [[ARRAYIDX227:%.*]] = getelementptr inbounds nuw [2 x <4 x float>], ptr [[EARDIRECTION226]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP69:%.*]] = load <4 x float>, ptr [[ARRAYIDX227]], align 16
+// CHECK-NEXT:    [[VECEXT228:%.*]] = extractelement <4 x float> [[TMP69]], i64 3
+// CHECK-NEXT:    [[VECINIT229:%.*]] = insertelement <4 x float> [[VECINIT225]], float [[VECEXT228]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT229]], ptr [[ARRAYINIT_ELEMENT213]], align 16
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT230:%.*]] = getelementptr inbounds [[STRUCT_KITTEH]], ptr [[CATS]], i32 3
+// CHECK-NEXT:    [[LEGS231:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[ARRAYINIT_ELEMENT230]], i32 0, i32 0
+// CHECK-NEXT:    [[LEGS232:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX233:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[LEGS232]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP70:%.*]] = load i32, ptr [[ARRAYIDX233]], align 16
+// CHECK-NEXT:    [[VECINIT234:%.*]] = insertelement <4 x i32> poison, i32 [[TMP70]], i32 0
+// CHECK-NEXT:    [[LEGS235:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX236:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[LEGS235]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP71:%.*]] = load i32, ptr [[ARRAYIDX236]], align 4
+// CHECK-NEXT:    [[VECINIT237:%.*]] = insertelement <4 x i32> [[VECINIT234]], i32 [[TMP71]], i32 1
+// CHECK-NEXT:    [[LEGS238:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX239:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[LEGS238]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP72:%.*]] = load i32, ptr [[ARRAYIDX239]], align 8
+// CHECK-NEXT:    [[VECINIT240:%.*]] = insertelement <4 x i32> [[VECINIT237]], i32 [[TMP72]], i32 2
+// CHECK-NEXT:    [[LEGS241:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX242:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[LEGS241]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP73:%.*]] = load i32, ptr [[ARRAYIDX242]], align 4
+// CHECK-NEXT:    [[VECINIT243:%.*]] = insertelement <4 x i32> [[VECINIT240]], i32 [[TMP73]], i32 3
+// CHECK-NEXT:    store <4 x i32> [[VECINIT243]], ptr [[LEGS231]], align 16
+// CHECK-NEXT:    [[TAILSTATE244:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[ARRAYINIT_ELEMENT230]], i32 0, i32 1
+// CHECK-NEXT:    [[STATE245:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP74:%.*]] = load i32, ptr [[STATE245]], align 16
+// CHECK-NEXT:    store i32 [[TMP74]], ptr [[TAILSTATE244]], align 16
+// CHECK-NEXT:    [[HAIRCOUNT246:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[ARRAYINIT_ELEMENT230]], i32 0, i32 2
+// CHECK-NEXT:    [[COUNTER247:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP75:%.*]] = load i64, ptr [[COUNTER247]], align 8
+// CHECK-NEXT:    [[CONV248:%.*]] = sitofp i64 [[TMP75]] to float
+// CHECK-NEXT:    store float [[CONV248]], ptr [[HAIRCOUNT246]], align 4
+// CHECK-NEXT:    [[CLAWS249:%.*]] = getelementptr inbounds nuw [[STRUCT_KITTEH]], ptr [[ARRAYINIT_ELEMENT230]], i32 0, i32 3
+// CHECK-NEXT:    [[LEFTDIR250:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP76:%.*]] = load <4 x float>, ptr [[LEFTDIR250]], align 16
+// CHECK-NEXT:    [[VECEXT251:%.*]] = extractelement <4 x float> [[TMP76]], i64 0
+// CHECK-NEXT:    [[VECINIT252:%.*]] = insertelement <4 x float> poison, float [[VECEXT251]], i32 0
+// CHECK-NEXT:    [[LEFTDIR253:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP77:%.*]] = load <4 x float>, ptr [[LEFTDIR253]], align 16
+// CHECK-NEXT:    [[VECEXT254:%.*]] = extractelement <4 x float> [[TMP77]], i64 1
+// CHECK-NEXT:    [[VECINIT255:%.*]] = insertelement <4 x float> [[VECINIT252]], float [[VECEXT254]], i32 1
+// CHECK-NEXT:    [[LEFTDIR256:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP78:%.*]] = load <4 x float>, ptr [[LEFTDIR256]], align 16
+// CHECK-NEXT:    [[VECEXT257:%.*]] = extractelement <4 x float> [[TMP78]], i64 2
+// CHECK-NEXT:    [[VECINIT258:%.*]] = insertelement <4 x float> [[VECINIT255]], float [[VECEXT257]], i32 2
+// CHECK-NEXT:    [[LEFTDIR259:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 3
+// CHECK-NEXT:    [[TMP79:%.*]] = load <4 x float>, ptr [[LEFTDIR259]], align 16
+// CHECK-NEXT:    [[VECEXT260:%.*]] = extractelement <4 x float> [[TMP79]], i64 3
+// CHECK-NEXT:    [[VECINIT261:%.*]] = insertelement <4 x float> [[VECINIT258]], float [[VECEXT260]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT261]], ptr [[CLAWS249]], align 16
+// CHECK-NEXT:    [[ARRAYINIT_ELEMENT262:%.*]] = getelementptr inbounds <4 x float>, ptr [[CLAWS249]], i32 1
+// CHECK-NEXT:    [[RIGHTDIR263:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 4
+// CHECK-NEXT:    [[TMP80:%.*]] = load <4 x float>, ptr [[RIGHTDIR263]], align 16
+// CHECK-NEXT:    [[VECEXT264:%.*]] = extractelement <4 x float> [[TMP80]], i64 0
+// CHECK-NEXT:    [[VECINIT265:%.*]] = insertelement <4 x float> poison, float [[VECEXT264]], i32 0
+// CHECK-NEXT:    [[RIGHTDIR266:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 4
+// CHECK-NEXT:    [[TMP81:%.*]] = load <4 x float>, ptr [[RIGHTDIR266]], align 16
+// CHECK-NEXT:    [[VECEXT267:%.*]] = extractelement <4 x float> [[TMP81]], i64 1
+// CHECK-NEXT:    [[VECINIT268:%.*]] = insertelement <4 x float> [[VECINIT265]], float [[VECEXT267]], i32 1
+// CHECK-NEXT:    [[RIGHTDIR269:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 4
+// CHECK-NEXT:    [[TMP82:%.*]] = load <4 x float>, ptr [[RIGHTDIR269]], align 16
+// CHECK-NEXT:    [[VECEXT270:%.*]] = extractelement <4 x float> [[TMP82]], i64 2
+// CHECK-NEXT:    [[VECINIT271:%.*]] = insertelement <4 x float> [[VECINIT268]], float [[VECEXT270]], i32 2
+// CHECK-NEXT:    [[RIGHTDIR272:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[A1]], i32 0, i32 4
+// CHECK-NEXT:    [[TMP83:%.*]] = load <4 x float>, ptr [[RIGHTDIR272]], align 16
+// CHECK-NEXT:    [[VECEXT273:%.*]] = extractelement <4 x float> [[TMP83]], i64 3
+// CHECK-NEXT:    [[VECINIT274:%.*]] = insertelement <4 x float> [[VECINIT271]], float [[VECEXT273]], i32 3
+// CHECK-NEXT:    store <4 x float> [[VECINIT274]], ptr [[ARRAYINIT_ELEMENT262]], align 16
+// CHECK-NEXT:    ret void
+//
+Zoo case9(Doggo D1, AnimalBits A1) {
+  Zoo Z1 = {D1, A1, D1, A1, D1, A1}; // initializers alternate D1/A1; the expected IR expands each one member-by-member into a Doggo or Kitteh array element of the Zoo
+  return Z1; // the expected IR stores straight into the aggregate result slot and ends in `ret void`
+}
+
+// Case 10: Initialize an object with a base class from two objects.
+// CHECK-LABEL: define void @_Z6case109TwoFloatsS_(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FOURFLOATS:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS:%.*]]) align 4 [[TF1:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 4 [[TF2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    [[X1:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[X1]], align 4
+// CHECK-NEXT:    store float [[TMP0]], ptr [[X]], align 4
+// CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[Y2:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF1]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[Y2]], align 4
+// CHECK-NEXT:    store float [[TMP1]], ptr [[Y]], align 4
+// CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_FOURFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[X3:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF2]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[X3]], align 4
+// CHECK-NEXT:    store float [[TMP2]], ptr [[Z]], align 4
+// CHECK-NEXT:    [[W:%.*]] = getelementptr inbounds nuw [[STRUCT_FOURFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 2
+// CHECK-NEXT:    [[Y4:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[TF2]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[Y4]], align 4
+// CHECK-NEXT:    store float [[TMP3]], ptr [[W]], align 4
+// CHECK-NEXT:    ret void
+//
+FourFloats case10(TwoFloats TF1, TwoFloats TF2) {
+  FourFloats FF1 = {TF1, TF2}; // TF1's X/Y fill the TwoFloats base members; TF2's X/Y fill FourFloats' own Z and W
+  return FF1; // returned through the sret pointer; the expected IR ends in `ret void`
+}
+
+// Case 11: Initialize an object with a base class from a vector splat.
+// CHECK-LABEL: define void @_Z6case11f(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FOURFLOATS:%.*]]) align 4 [[AGG_RESULT:%.*]], float noundef nofpclass(nan inf) [[F:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[F_ADDR:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[REF_TMP1:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[REF_TMP4:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[REF_TMP7:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store float [[F]], ptr [[F_ADDR]], align 4
+// CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS:%.*]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[F_ADDR]], align 4
+// CHECK-NEXT:    [[CAST_SPLAT:%.*]] = insertelement <1 x float> poison, float [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <1 x float> [[CAST_SPLAT]], <1 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[REF_TMP]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[REF_TMP]], align 16
+// CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP2]], i64 0
+// CHECK-NEXT:    store float [[VECEXT]], ptr [[X]], align 4
+// CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[F_ADDR]], align 4
+// CHECK-NEXT:    [[CAST_SPLAT2:%.*]] = insertelement <1 x float> poison, float [[TMP3]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <1 x float> [[CAST_SPLAT2]], <1 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    store <4 x float> [[TMP4]], ptr [[REF_TMP1]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[REF_TMP1]], align 16
+// CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <4 x float> [[TMP5]], i64 1
+// CHECK-NEXT:    store float [[VECEXT3]], ptr [[Y]], align 4
+// CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_FOURFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[F_ADDR]], align 4
+// CHECK-NEXT:    [[CAST_SPLAT5:%.*]] = insertelement <1 x float> poison, float [[TMP6]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <1 x float> [[CAST_SPLAT5]], <1 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    store <4 x float> [[TMP7]], ptr [[REF_TMP4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[REF_TMP4]], align 16
+// CHECK-NEXT:    [[VECEXT6:%.*]] = extractelement <4 x float> [[TMP8]], i64 2
+// CHECK-NEXT:    store float [[VECEXT6]], ptr [[Z]], align 4
+// CHECK-NEXT:    [[W:%.*]] = getelementptr inbounds nuw [[STRUCT_FOURFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[F_ADDR]], align 4
+// CHECK-NEXT:    [[CAST_SPLAT8:%.*]] = insertelement <1 x float> poison, float [[TMP9]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <1 x float> [[CAST_SPLAT8]], <1 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    store <4 x float> [[TMP10]], ptr [[REF_TMP7]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[REF_TMP7]], align 16
+// CHECK-NEXT:    [[VECEXT9:%.*]] = extractelement <4 x float> [[TMP11]], i64 3
+// CHECK-NEXT:    store float [[VECEXT9]], ptr [[W]], align 4
+// CHECK-NEXT:    ret void
+//
+FourFloats case11(float F) {
+  FourFloats FF1 = {F.xxxx};
+  return FF1;
+}
+
+// Case 12: Initialize bitfield from two integers.
+// CHECK-LABEL: define void @_Z6case12ii(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_SLICYBITS:%.*]]) align 4 [[AGG_RESULT:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[I_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[J_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i32 [[I]], ptr [[I_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[J]], ptr [[J_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT:    [[BF_LOAD:%.*]] = load i16, ptr [[AGG_RESULT]], align 4
+// CHECK-NEXT:    [[BF_VALUE:%.*]] = and i16 [[TMP1]], 255
+// CHECK-NEXT:    [[BF_CLEAR:%.*]] = and i16 [[BF_LOAD]], -256
+// CHECK-NEXT:    [[BF_SET:%.*]] = or i16 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK-NEXT:    store i16 [[BF_SET]], ptr [[AGG_RESULT]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[J_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
+// CHECK-NEXT:    [[BF_LOAD1:%.*]] = load i16, ptr [[AGG_RESULT]], align 4
+// CHECK-NEXT:    [[BF_VALUE2:%.*]] = and i16 [[TMP3]], 255
+// CHECK-NEXT:    [[BF_SHL:%.*]] = shl i16 [[BF_VALUE2]], 8
+// CHECK-NEXT:    [[BF_CLEAR3:%.*]] = and i16 [[BF_LOAD1]], 255
+// CHECK-NEXT:    [[BF_SET4:%.*]] = or i16 [[BF_CLEAR3]], [[BF_SHL]]
+// CHECK-NEXT:    store i16 [[BF_SET4]], ptr [[AGG_RESULT]], align 4
+// CHECK-NEXT:    ret void
+//
+SlicyBits case12(int I, int J) {
+  SlicyBits SB = {I, J};
+  return SB;
+}
+
+// Case 13: Initialize bitfield from a struct of two ints.
+// CHECK-LABEL: define void @_Z6case137TwoInts(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_SLICYBITS:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOINTS:%.*]]) align 4 [[TI:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[TI]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Z]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT:    [[BF_LOAD:%.*]] = load i16, ptr [[AGG_RESULT]], align 4
+// CHECK-NEXT:    [[BF_VALUE:%.*]] = and i16 [[TMP1]], 255
+// CHECK-NEXT:    [[BF_CLEAR:%.*]] = and i16 [[BF_LOAD]], -256
+// CHECK-NEXT:    [[BF_SET:%.*]] = or i16 [[BF_CLEAR]], [[BF_VALUE]]
+// CHECK-NEXT:    store i16 [[BF_SET]], ptr [[AGG_RESULT]], align 4
+// CHECK-NEXT:    [[W:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[TI]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[W]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
+// CHECK-NEXT:    [[BF_LOAD1:%.*]] = load i16, ptr [[AGG_RESULT]], align 4
+// CHECK-NEXT:    [[BF_VALUE2:%.*]] = and i16 [[TMP3]], 255
+// CHECK-NEXT:    [[BF_SHL:%.*]] = shl i16 [[BF_VALUE2]], 8
+// CHECK-NEXT:    [[BF_CLEAR3:%.*]] = and i16 [[BF_LOAD1]], 255
+// CHECK-NEXT:    [[BF_SET4:%.*]] = or i16 [[BF_CLEAR3]], [[BF_SHL]]
+// CHECK-NEXT:    store i16 [[BF_SET4]], ptr [[AGG_RESULT]], align 4
+// CHECK-NEXT:    ret void
+//
+SlicyBits case13(TwoInts TI) {
+  SlicyBits SB = {TI};
+  return SB;
+}
+
+// Case 14: Initialize struct of ints from struct with bitfields.
+// CHECK-LABEL: define void @_Z6case149SlicyBits(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOINTS:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_SLICYBITS:%.*]]) align 4 [[SB:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    [[BF_LOAD:%.*]] = load i16, ptr [[SB]], align 4
+// CHECK-NEXT:    [[BF_SHL:%.*]] = shl i16 [[BF_LOAD]], 8
+// CHECK-NEXT:    [[BF_ASHR:%.*]] = ashr i16 [[BF_SHL]], 8
+// CHECK-NEXT:    [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32
+// CHECK-NEXT:    store i32 [[BF_CAST]], ptr [[Z]], align 4
+// CHECK-NEXT:    [[W:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[BF_LOAD1:%.*]] = load i16, ptr [[SB]], align 4
+// CHECK-NEXT:    [[BF_ASHR2:%.*]] = ashr i16 [[BF_LOAD1]], 8
+// CHECK-NEXT:    [[BF_CAST3:%.*]] = sext i16 [[BF_ASHR2]] to i32
+// CHECK-NEXT:    store i32 [[BF_CAST3]], ptr [[W]], align 4
+// CHECK-NEXT:    ret void
+//
+TwoInts case14(SlicyBits SB) {
+  TwoInts TI = {SB};
+  return TI;
+}
+
+// Case 15: Initialize struct of floats from struct with bitfields.
+// CHECK-LABEL: define void @_Z6case159SlicyBits(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_SLICYBITS:%.*]]) align 4 [[SB:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    [[BF_LOAD:%.*]] = load i16, ptr [[SB]], align 4
+// CHECK-NEXT:    [[BF_SHL:%.*]] = shl i16 [[BF_LOAD]], 8
+// CHECK-NEXT:    [[BF_ASHR:%.*]] = ashr i16 [[BF_SHL]], 8
+// CHECK-NEXT:    [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32
+// CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[BF_CAST]] to float
+// CHECK-NEXT:    store float [[CONV]], ptr [[X]], align 4
+// CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[BF_LOAD1:%.*]] = load i16, ptr [[SB]], align 4
+// CHECK-NEXT:    [[BF_ASHR2:%.*]] = ashr i16 [[BF_LOAD1]], 8
+// CHECK-NEXT:    [[BF_CAST3:%.*]] = sext i16 [[BF_ASHR2]] to i32
+// CHECK-NEXT:    [[CONV4:%.*]] = sitofp i32 [[BF_CAST3]] to float
+// CHECK-NEXT:    store float [[CONV4]], ptr [[Y]], align 4
+// CHECK-NEXT:    ret void
+//
+TwoFloats case15(SlicyBits SB) {
+  TwoFloats TI = {SB};
+  return TI;
+}
+
+// Case 16: Side-effecting initialization list arguments. The important thing
+// here is that case16 only has _one_ call to makeTwo.
+// CHECK-LABEL: define void @_Z7makeTwoRf(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noalias noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[X_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    store ptr [[X]], ptr [[X_ADDR]], align 4
+// CHECK-NEXT:    [[X1:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4
+// CHECK-NEXT:    store float [[TMP1]], ptr [[X1]], align 4
+// CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[TMP3]], 1.500000e+00
+// CHECK-NEXT:    store float [[MUL]], ptr [[Y]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[X_ADDR]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4
+// CHECK-NEXT:    [[MUL2:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[TMP5]], 2.000000e+00
+// CHECK-NEXT:    store float [[MUL2]], ptr [[TMP4]], align 4
+// CHECK-NEXT:    ret void
+//
+TwoFloats makeTwo(inout float X) {
+    TwoFloats TF = {X, X*1.5};
+    X *= 2;
+    return TF;
+}
+
+// CHECK-LABEL: define void @_Z6case16v(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FOURFLOATS:%.*]]) align 4 [[AGG_RESULT:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[X:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_TWOFLOATS:%.*]], align 4
+// CHECK-NEXT:    [[TMP:%.*]] = alloca float, align 4
+// CHECK-NEXT:    store float 0.000000e+00, ptr [[X]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[X]], align 4
+// CHECK-NEXT:    store float [[TMP0]], ptr [[TMP]], align 4
+// CHECK-NEXT:    call void @_Z7makeTwoRf(ptr dead_on_unwind writable sret([[STRUCT_TWOFLOATS]]) align 4 [[REF_TMP]], ptr noalias noundef nonnull align 4 dereferenceable(4) [[TMP]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMP]], align 4
+// CHECK-NEXT:    store float [[TMP1]], ptr [[X]], align 4
+// CHECK-NEXT:    [[X1:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 0
+// CHECK-NEXT:    store float 0.000000e+00, ptr [[X1]], align 4
+// CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[X2:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[REF_TMP]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[X2]], align 4
+// CHECK-NEXT:    store float [[TMP2]], ptr [[Y]], align 4
+// CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_FOURFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 1
+// CHECK-NEXT:    [[Y3:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[REF_TMP]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[Y3]], align 4
+// CHECK-NEXT:    store float [[TMP3]], ptr [[Z]], align 4
+// CHECK-NEXT:    [[W:%.*]] = getelementptr inbounds nuw [[STRUCT_FOURFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 2
+// CHECK-NEXT:    store float 3.000000e+00, ptr [[W]], align 4
+// CHECK-NEXT:    ret void
+//
+FourFloats case16() {
+    float X = 0;
+    FourFloats FF = {0, makeTwo(X), 3};
+    return FF;
+}
diff --git a/clang/test/SemaHLSL/ArrayTemporary.hlsl b/clang/test/SemaHLSL/ArrayTemporary.hlsl
index 0266a198e7ec9..3d713a89adf3b 100644
--- a/clang/test/SemaHLSL/ArrayTemporary.hlsl
+++ b/clang/test/SemaHLSL/ArrayTemporary.hlsl
@@ -25,7 +25,7 @@ void fn2(Obj O[4]) { }
 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'Obj[4]' <HLSLArrayRValue>
 
 void call2() {
-  Obj Arr[4] = {};
+  Obj Arr[4] = {0, 0, 0, 0, 0, 0, 0, 0};
   fn2(Arr);
 }
 
diff --git a/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl b/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl
index b7085bc69547b..9417249383469 100644
--- a/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl
+++ b/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl
@@ -43,7 +43,9 @@ export void cantCast4() {
   int2 A = {1,2};
   R r = R(A);
   // expected-error@-1 {{no matching conversion for functional-style cast from 'int2' (aka 'vector<int, 2>') to 'R'}}
-  R r2 = {1, 2};
+  R r2;
+  r2.A = 1;
+  r2.F = 2.0;
   int2 B = (int2)r2;
   // expected-error@-1 {{cannot convert 'R' to 'int2' (aka 'vector<int, 2>') without a conversion operator}}
 }
diff --git a/clang/test/SemaHLSL/Language/InitListAST.hlsl b/clang/test/SemaHLSL/Language/InitListAST.hlsl
new file mode 100644
index 0000000000000..d58582f9029fe
--- /dev/null
+++ b/clang/test/SemaHLSL/Language/InitListAST.hlsl
@@ -0,0 +1,983 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -finclude-default-header -ast-dump -ast-dump-filter=case %s | FileCheck %s
+
+struct TwoFloats {
+  float X, Y;
+};
+
+struct TwoInts {
+  int Z, W;
+};
+
+struct Doggo {
+  int4 LegState;
+  int TailState;
+  float HairCount;
+  float4 EarDirection[2];
+};
+
+struct AnimalBits {
+  int Legs[4];
+  uint State;
+  int64_t Counter;
+  float4 LeftDir;
+  float4 RightDir;
+};
+
+struct Kitteh {
+  int4 Legs;
+  int TailState;
+  float HairCount;
+  float4 Claws[2];
+};
+
+struct Zoo {
+  Doggo Dogs[2];
+  Kitteh Cats[4];
+};
+
+struct FourFloats : TwoFloats {
+  float Z, W;
+};
+
+struct SlicyBits {
+  int Z : 8;
+  int W : 8;
+};
+
+// Case 1: Extraneous braces get ignored in literal instantiation.
+// CHECK-LABEL: Dumping case1
+// CHECK: VarDecl {{.*}} used TF1 'TwoFloats' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'TwoFloats'
+// CHECK-NEXT: FloatingLiteral {{.*}} 'float' 1.000000e+00
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2
+TwoFloats case1() {
+  TwoFloats TF1 = {{{1.0, 2}}};
+  return TF1;
+}
+
+// Case 2: Valid C/C++ initializer is handled appropriately.
+//CHECK-LABEL: Dumping case2
+//CHECK: VarDecl {{.*}} used TF2 'TwoFloats' nrvo cinit
+//CHECK-NEXT: InitListExpr {{.*}} 'TwoFloats'
+//CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+//CHECK-NEXT: IntegerLiteral {{.*}} 'int' 1
+//CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+//CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2
+TwoFloats case2() {
+  TwoFloats TF2 = {1, 2};
+  return TF2;
+}
+
+// Case 3: Simple initialization with conversion of an argument.
+// CHECK-LABEL: Dumping case3
+// CHECK: VarDecl {{.*}} used TF3 'TwoFloats' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'Val' 'int'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2
+TwoFloats case3(int Val) {
+  TwoFloats TF3 = {Val, 2};
+  return TF3;
+}
+
+// Case 4: Initialization from a scalarized vector into a structure with element
+// conversions.
+// CHECK-LABEL: Dumping case4
+// CHECK: VarDecl {{.*}} used TF4 'TwoFloats' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: ImplicitCastExpr {{.*}}'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector<int, 2>' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector<int, 2>'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector<int, 2>' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector<int, 2>'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+TwoFloats case4(int2 TwoVals) {
+  TwoFloats TF4 = {TwoVals};
+  return TF4;
+}
+
+// Case 5: Initialization from a scalarized vector of matching type.
+// CHECK-LABEL: Dumping case5
+// CHECK: VarDecl {{.*}} used TI1 'TwoInts' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'TwoInts'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector<int, 2>' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector<int, 2>'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector<int, 2>' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector<int, 2>'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+TwoInts case5(int2 TwoVals) {
+  TwoInts TI1 = {TwoVals};
+  return TI1;
+}
+
+// Case 6: Initialization from a scalarized structure of different type with
+// different element types.
+// CHECK-LABEL: Dumping case6
+// CHECK: VarDecl {{.*}} used TI2 'TwoInts' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'TwoInts'
+// CHECK-NEXT: ImplicitCastExpr {{.*}}'int' <FloatingToIntegral>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .X {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF4' 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <FloatingToIntegral>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .Y {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF4' 'TwoFloats'
+TwoInts case6(TwoFloats TF4) {
+  TwoInts TI2 = {TF4};
+  return TI2;
+}
+
+// Case 7: Initialization of a complex structure, with bogus braces and element
+// conversions from a collection of scalar values, and structures.
+// CHECK-LABEL: Dumping case7
+// CHECK: VarDecl {{.*}} used D1 'Doggo' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'Doggo'
+// CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .Z {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoInts' lvalue ParmVar {{.*}} 'TI1' 'TwoInts'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .W {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoInts' lvalue ParmVar {{.*}} 'TI1' 'TwoInts'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .Z {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoInts' lvalue ParmVar {{.*}} 'TI2' 'TwoInts'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .W {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoInts' lvalue ParmVar {{.*}} 'TI2' 'TwoInts'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'Val' 'int'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'Val' 'int'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4[2]'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .X {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF1' 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .Y {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF1' 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .X {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF2' 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .Y {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF2' 'TwoFloats'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .X {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF3' 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .Y {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF3' 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .X {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF4' 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .Y {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF4' 'TwoFloats'
+Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2,
+            TwoFloats TF3, TwoFloats TF4) {
+  Doggo D1 = {TI1, TI2, {Val, Val}, {{TF1, TF2}, {TF3, TF4}}};
+  return D1;
+}
+
+// Case 8: Initialization of a structure from a different structure with
+// significantly different element types and grouping.
+// CHECK-LABEL: Dumping case8
+// CHECK: VarDecl {{.*}} used A1 'AnimalBits' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'AnimalBits'
+// CHECK-NEXT: InitListExpr {{.*}} 'int[4]'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' <IntegralCast>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'long' <FloatingToIntegral>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .HairCount {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+AnimalBits case8(Doggo D1) {
+  AnimalBits A1 = {D1};
+  return A1;
+}
+
+// Case 9: Everything everywhere all at once... Initializing mismatched
+// structures from different layouts, different component groupings, with no
+// top-level bracing separation.
+// CHECK-LABEL: Dumping case9
+// CHECK: VarDecl {{.*}} used Z1 'Zoo' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'Zoo'
+// CHECK-NEXT: InitListExpr {{.*}} 'Doggo[2]'
+// CHECK-NEXT: InitListExpr {{.*}} 'Doggo'
+// CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .HairCount {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4[2]'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: InitListExpr {{.*}} 'Doggo'
+// CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <IntegralCast>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t':'long' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int64_t':'long' lvalue .Counter {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4[2]'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .LeftDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .LeftDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .LeftDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .LeftDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .RightDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .RightDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .RightDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .RightDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: InitListExpr {{.*}} 'Kitteh[4]'
+// CHECK-NEXT: InitListExpr {{.*}} 'Kitteh'
+// CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .HairCount {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4[2]'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: InitListExpr {{.*}} 'Kitteh'
+// CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <IntegralCast>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t':'long' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int64_t':'long' lvalue .Counter {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4[2]'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .LeftDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .LeftDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .LeftDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .LeftDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .RightDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .RightDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .RightDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .RightDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: InitListExpr {{.*}} 'Kitteh'
+// CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector<int, 4>' lvalue .LegState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .HairCount {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4[2]'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector<float, 4>' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: InitListExpr {{.*}} 'Kitteh'
+// CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector<int, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' <ArrayToPointerDecay>
+// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <IntegralCast>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t':'long' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int64_t':'long' lvalue .Counter {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4[2]'
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .LeftDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .LeftDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .LeftDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .LeftDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+// CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector<float, 4>'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .RightDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .RightDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .RightDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent
+// CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector<float, 4>' lvalue .RightDir {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+Zoo case9(Doggo D1, AnimalBits A1) {
+  Zoo Z1 = {D1, A1, D1, A1, D1, A1};
+  return Z1;
+}
+
+// Case 10: Initialize an object with a base class from two objects.
+// CHECK-LABEL: Dumping case10
+// CHECK: | `-VarDecl {{.*}} used FF1 'FourFloats' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'FourFloats'
+// CHECK-NEXT: InitListExpr {{.*}} 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .X {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF1' 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .Y {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF1' 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .X {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF2' 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .Y {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoFloats' lvalue ParmVar {{.*}} 'TF2' 'TwoFloats'
+FourFloats case10(TwoFloats TF1, TwoFloats TF2) {
+  FourFloats FF1 = {TF1, TF2};
+  return FF1;
+}
+
+// Case 11: Initialize an object with a base class from a vector splat.
+// CHECK-LABEL: Dumping case11
+// CHECK: VarDecl {{.*}} used FF1 'FourFloats' nrvo cinit
+// CHECK-NEXT: ExprWithCleanups {{.*}} 'FourFloats'
+// CHECK-NEXT: InitListExpr {{.*}} 'FourFloats'
+// CHECK-NEXT: InitListExpr {{.*}} 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent
+// CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector<float, 4>' xvalue
+// CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector<float, 4>' xxxx
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<float, 1>' lvalue <VectorSplat>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent
+// CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector<float, 4>' xvalue
+// CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector<float, 4>' xxxx
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<float, 1>' lvalue <VectorSplat>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent
+// CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector<float, 4>' xvalue
+// CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector<float, 4>' xxxx
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<float, 1>' lvalue <VectorSplat>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent
+// CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector<float, 4>' xvalue
+// CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector<float, 4>' xxxx
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<float, 1>' lvalue <VectorSplat>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3
+FourFloats case11(float F) {
+  FourFloats FF1 = {F.xxxx};
+  return FF1;
+}
+
+// Case 12: Initialize bitfield from two integers.
+// CHECK-LABEL: Dumping case12
+// CHECK: VarDecl {{.*}} used SB 'SlicyBits' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'SlicyBits'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'I' 'int'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'J' 'int'
+SlicyBits case12(int I, int J) {
+  SlicyBits SB = {I, J};
+  return SB;
+}
+
+// Case 13: Initialize bitfield from a struct of two ints.
+// CHECK-LABEL: Dumping case13
+// CHECK: VarDecl {{.*}} used SB 'SlicyBits' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'SlicyBits'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .Z {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoInts' lvalue ParmVar {{.*}} 'TI' 'TwoInts'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .W {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'TwoInts' lvalue ParmVar {{.*}} 'TI' 'TwoInts'
+SlicyBits case13(TwoInts TI) {
+  SlicyBits SB = {TI};
+  return SB;
+}
+
+// Case 14: Initialize struct of ints from struct with bitfields.
+// CHECK-LABEL: Dumping case14
+// CHECK: VarDecl {{.*}} used TI 'TwoInts' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'TwoInts'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue bitfield .Z {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'SlicyBits' lvalue ParmVar {{.*}} 'SB' 'SlicyBits'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue bitfield .W {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'SlicyBits' lvalue ParmVar {{.*}} 'SB' 'SlicyBits'
+TwoInts case14(SlicyBits SB) {
+  TwoInts TI = {SB};
+  return TI;
+}
+
+// Case 15: Initialize struct of floats from struct with bitfields.
+// CHECK-LABEL: Dumping case15
+// CHECK: VarDecl {{.*}} used TI 'TwoFloats' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue bitfield .Z {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'SlicyBits' lvalue ParmVar {{.*}} 'SB' 'SlicyBits'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue bitfield .W {{.*}}
+// CHECK-NEXT: DeclRefExpr {{.*}} 'SlicyBits' lvalue ParmVar {{.*}} 'SB' 'SlicyBits'
+TwoFloats case15(SlicyBits SB) {
+  TwoFloats TI = {SB};
+  return TI;
+}
+
+// Case 16: Side-effecting initialization list arguments. The important thing
+// here is that case16 only has _one_ call to makeTwo.
+TwoFloats makeTwo(inout float X) {
+    TwoFloats TF = {X, X*1.5};
+    X *= 2;
+    return TF;
+}
+
+// CHECK-LABEL: Dumping case16
+// CHECK: VarDecl {{.*}} used FF 'FourFloats' nrvo cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'FourFloats'
+// CHECK-NEXT: InitListExpr {{.*}} 'TwoFloats'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <IntegralToFloating>
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 0
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' <LValueToRValue>
+// CHECK-NEXT: MemberExpr {{.*}} 'float' xvalue .X {{.*}}
+// CHECK-NEXT: OpaqueValueExpr [[OVEArg:0x[0-9A-Fa-f]+]] {{.*}} 'TwoFloats' xvalue
+// CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'TwoFloats' xvalue
+// CHECK-NEXT: CallExpr {{.*}} 'TwoFloats'
+
+// I don't care about the call here, just skip ahead to the next argument, and
+// verify that we match the same OpaqueValueExpr.
+
+// CHECK: MemberExpr {{.*}} 'float' xvalue .Y {{.*}}
+// CHECK-NEXT: OpaqueValueExpr [[OVEArg]] {{.*}} 'TwoFloats' xvalue
+// CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'TwoFloats' xvalue
+// CHECK-NEXT: CallExpr {{.*}} 'TwoFloats'
+FourFloats case16() {
+    float X = 0;
+    FourFloats FF = {0, makeTwo(X), 3};
+    return FF;
+}
diff --git a/clang/test/SemaHLSL/Language/InitLists.hlsl b/clang/test/SemaHLSL/Language/InitLists.hlsl
new file mode 100644
index 0000000000000..3607dfd8aedbc
--- /dev/null
+++ b/clang/test/SemaHLSL/Language/InitLists.hlsl
@@ -0,0 +1,126 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -finclude-default-header -verify -Wdouble-promotion -Wconversion %s
+
+struct TwoFloats {
+  float X, Y;
+};
+
+struct TwoInts {
+  int Z, W;
+};
+
+struct Doggo {
+  int4 LegState;
+  int TailState;
+  float HairCount;
+  float4 EarDirection[2];
+};
+
+struct AnimalBits {
+  int Legs[4];
+  uint State;
+  int64_t Counter;
+  float4 LeftDir;
+  float4 RightDir;
+};
+
+struct Kitteh {
+  int4 Legs;
+  int TailState;
+  float HairCount;
+  float4 Claws[2];
+};
+
+struct Zoo {
+  Doggo Dogs[2];
+  Kitteh Cats[4];
+};
+
+struct FourFloats : TwoFloats {
+  float Z, W;
+};
+
+struct SlicyBits {
+  int Z : 8;
+  int W : 8;
+};
+
+struct ContainsResource { // #ContainsResource
+  int X;
+  RWBuffer<float4> B;
+};
+
+struct ContainsResourceInverted {
+  RWBuffer<float4> B;
+  int X;
+};
+
+void fn() {
+  TwoFloats TF1 = {{{1.0, 2}}};
+  TwoFloats TF2 = {1,2};
+  int Val = 1;
+  TwoFloats TF3 = {Val, 2}; // expected-warning{{implicit conversion from 'int' to 'float' may lose precision}}
+  int2 TwoVals = 1.xx;
+  int2 Something = 1.xxx; // expected-warning{{implicit conversion truncates vector: 'vector<int, 3>' (vector of 3 'int' values) to 'vector<int, 2>' (vector of 2 'int' values)}}
+  TwoFloats TF4 = {TwoVals}; // expected-warning{{implicit conversion from 'int' to 'float' may lose precision}} expected-warning{{implicit conversion from 'int' to 'float' may lose precision}}
+
+  TwoInts TI1 = {TwoVals};
+  TwoInts TI2 = {TF4}; // expected-warning{{implicit conversion turns floating-point number into integer: 'float' to 'int'}} expected-warning{{implicit conversion turns floating-point number into integer: 'float' to 'int'}}
+
+  Doggo D1 = {TI1, TI2, {Val, Val}, {{TF1, TF2}, {TF3, TF4}}}; // expected-warning{{implicit conversion from 'int' to 'float' may lose precision}}
+  AnimalBits A1 = {D1}; // expected-warning{{implicit conversion turns floating-point number into integer: 'float' to 'long'}} expected-warning{{implicit conversion changes signedness: 'int' to 'unsigned int'}}
+
+  Zoo Z1 = {D1, A1, D1, A1, D1, A1}; // #insanity
+
+  // expected-warning@#insanity{{implicit conversion from 'int64_t' (aka 'long') to 'float' may lose precision}}
+  // expected-warning@#insanity{{implicit conversion changes signedness: 'uint' (aka 'unsigned int') to 'int'}}
+  // expected-warning@#insanity{{implicit conversion from 'int64_t' (aka 'long') to 'float' may lose precision}}
+  // expected-warning@#insanity{{implicit conversion changes signedness: 'uint' (aka 'unsigned int') to 'int'}}
+  // expected-warning@#insanity{{implicit conversion from 'int64_t' (aka 'long') to 'float' may lose precision}}
+  // expected-warning@#insanity{{implicit conversion changes signedness: 'uint' (aka 'unsigned int') to 'int'}}
+}
+
+void fn2() {
+  TwoFloats TF2 = {1,2};
+  FourFloats FF1 = {TF2, TF2};
+  FourFloats FF2 = {1,2,3,4};
+  FourFloats FF3 = {1.xxx, 2};
+
+  SlicyBits SB1 = {1,2};
+  TwoInts TI1 = {SB1};
+  SlicyBits SB2 = {TI1};
+}
+
+void Errs() {
+  TwoFloats F1 = {}; // expected-error{{too few initializers in list for type 'TwoFloats' (expected 2 but found 0)}}
+  TwoFloats F2 = {1}; // expected-error{{too few initializers in list for type 'TwoFloats' (expected 2 but found 1)}}
+  TwoFloats F3 = {1,2,3}; // expected-error{{too many initializers in list for type 'TwoFloats' (expected 2 but found 3)}}
+
+  int2 Something = {1.xxx}; // expected-error{{too many initializers in list for type 'int2' (aka 'vector<int, 2>') (expected 2 but found 3)}}
+}
+
+struct R {
+  int A;
+  union { // #anon
+    float F;
+    int4 G;
+  };
+};
+
+// expected-note@#anon{{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to}}
+// expected-note@#anon{{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to}}
+
+void Err2(RWBuffer<float4> B) {
+  ContainsResource RS1 = {1, B};
+  ContainsResource RS2 = (1.xx); // expected-error{{no viable conversion from 'vector<int, 2>' (vector of 2 'int' values) to 'ContainsResource'}}
+  ContainsResource RS3 = {B, 1}; // expected-error{{no viable conversion from 'RWBuffer<float4>' (aka 'RWBuffer<vector<float, 4>>') to 'int'}}
+  ContainsResourceInverted IR = {RS1}; // expected-error{{no viable conversion from 'int' to 'hlsl::RWBuffer<vector<float, 4>>'}}
+
+  R r = {1,2}; // expected-error{{no viable conversion from 'int' to 'R::(anonymous union at}}
+}
+
+// expected-note@#ContainsResource{{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'vector<int, 2>' (vector of 2 'int' values) to 'const ContainsResource &' for 1st argument}}
+// expected-note@#ContainsResource{{candidate constructor (the implicit move constructor) not viable: no known conversion from 'vector<int, 2>' (vector of 2 'int' values) to 'ContainsResource &&' for 1st argument}}
+
+// These notes refer to the RWBuffer constructors that do not have source locations
+// expected-note@*{{candidate constructor (the implicit copy constructor) not viable}}
+// expected-note@*{{candidate constructor (the implicit move constructor) not viable}}

From 07b0665d284f3d953dc74fae594102f9d3e81cb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= <mgorny@gentoo.org>
Date: Sat, 15 Feb 2025 20:50:14 +0100
Subject: [PATCH 033/109] [Flang] Support overriding `LLVM_LIT_ARGS` in
 standalone builds (#127340)

Declare `LLVM_LIT_ARGS` as a cache variable in standalone builds to
permit overriding it. This mirrors the logic used in Clang.
---
 flang/CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index c012b884ae3be..0f98d12343c43 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -199,6 +199,11 @@ if (FLANG_STANDALONE_BUILD)
     ${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX})
 
   set(LLVM_EXTERNAL_LIT "${LLVM_TOOLS_BINARY_DIR}/llvm-lit" CACHE STRING "Command used to spawn lit")
+  set(LIT_ARGS_DEFAULT "-sv")
+  if (MSVC OR XCODE)
+    set(LIT_ARGS_DEFAULT "${LIT_ARGS_DEFAULT} --no-progress-bar")
+  endif()
+  set(LLVM_LIT_ARGS "${LIT_ARGS_DEFAULT}" CACHE STRING "Default options for lit")
 
   option(FLANG_INCLUDE_TESTS
          "Generate build targets for the Flang unit tests."

From 02fb9769417f972ffedefe32d7c0ae9cabd29917 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Sat, 15 Feb 2025 20:06:32 +0000
Subject: [PATCH 034/109] [mlir] Improve GreedyPatternRewriteDriver logging
 (#127314)

Currently, when `GreedyPatternRewriteDriver` fails, the log output
contains nested failure messages:

```bash
   } -> failure : pattern failed to match
} -> failure : pattern failed to match
```

This may seem redundant, but these messages refer to different aspects
of the pattern application logic. This patch clarifies the distinction
by separately logging:

* Success/failure for a specific pattern (e.g., "_this pattern_ failed
  to match on the Op currently being processed").
* Success/failure for an operation as a whole (e.g., "_all patterns_
  failed to match the Op currently being processed").

Before (example with success):
```bash
Processing operation : (...) {

  * Pattern (...) -> ()' {
Trying to match "..."
    ** Match Failure : (...)
  } -> failure : pattern failed to match

  * Pattern (...) -> ()' {
Trying to match "..."
  } -> success : pattern applied successfully
} -> success : pattern matched
```

After (example with success):
```bash
Processing operation : (...) {

  * Pattern (...) -> ()' {
Trying to match "..."
    ** Match Failure : (...)
  } -> failure : pattern failed to match

  * Pattern (...) -> ()' {
Trying to match "..."
  } -> success : pattern applied successfully
} -> success : at least one pattern matched
```

This improves log clarity, making it easier to distinguish pattern-level
failures from operation-level outcomes.
---
 mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
index 969c560c99ab7..fe84c61300646 100644
--- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
+++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
@@ -615,14 +615,14 @@ bool GreedyPatternRewriteDriver::processWorklist() {
         matcher.matchAndRewrite(op, rewriter, canApply, onFailure, onSuccess);
 
     if (succeeded(matchResult)) {
-      LLVM_DEBUG(logResultWithLine("success", "pattern matched"));
+      LLVM_DEBUG(logResultWithLine("success", "at least one pattern matched"));
 #if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
       expensiveChecks.notifyRewriteSuccess();
 #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
       changed = true;
       ++numRewrites;
     } else {
-      LLVM_DEBUG(logResultWithLine("failure", "pattern failed to match"));
+      LLVM_DEBUG(logResultWithLine("failure", "all patterns failed to match"));
 #if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
       expensiveChecks.notifyRewriteFailure();
 #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS

From ad948fa028bdfe1f15785aec4477f92ec681637a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Sat, 15 Feb 2025 20:16:25 +0000
Subject: [PATCH 035/109] [mlir][vector] Document `ConvertVectorStore` + unify
 var names (nfc) (#126422)

1. Documents `ConvertVectorStore`. As the generated output is rather complex, I
  have refined the comments + variable names in:
    * "vector-emulate-narrow-type-unaligned-non-atomic.mlir",
  to serve as reference for this pattern.

2. As a follow-on for #123527, renames `isAlignedEmulation` to `isFullyAligned`
  and `numSrcElemsPerDest` to `emulatedPerContainerElem`.
---
 .../Transforms/VectorEmulateNarrowType.cpp    |  89 ++++++--
 ...late-narrow-type-unaligned-non-atomic.mlir | 215 ++++++++++--------
 2 files changed, 185 insertions(+), 119 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index bf1ecd7d4559c..5d8a525ac87f1 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -432,7 +432,45 @@ namespace {
 // ConvertVectorStore
 //===----------------------------------------------------------------------===//
 
-// TODO: Document-me
+// Emulate `vector.store` using a multi-byte container type.
+//
+// The container type is obtained through Op adaptor and would normally be
+// generated via `NarrowTypeEmulationConverter`.
+//
+// EXAMPLE 1
+// (aligned store of i4, emulated using i8 as the container type)
+//
+//      vector.store %src, %dest[%idx_1, %idx_2] : memref<4x8xi4>, vector<8xi4>
+//
+// is rewritten as:
+//
+//      %src_bitcast = vector.bitcast %src : vector<8xi4> to vector<4xi8>
+//      vector.store %src_bitcast, %dest_bitcast[%idx]
+//        : memref<16xi8>, vector<4xi8>
+//
+// EXAMPLE 2
+// (unaligned store of i2, emulated using i8 as the container type)
+//
+//    vector.store %src, %dest[%c2, %c0] :memref<3x3xi2>, vector<3xi2>
+//
+// The i2 store is emulated through 2 x RMW sequences. The destination i2 memref
+// is modelled using 3 bytes:
+//
+//    Byte 0     Byte 1     Byte 2
+// +----------+----------+----------+
+// | oooooooo | ooooNNNN | NNoooooo |
+// +----------+----------+----------+
+//
+// N - (N)ew entries (i.e. to be overwritten by vector.store)
+// o - (o)ld entries (to be preserved)
+//
+// For the generated output in the non-atomic case, see:
+//  * `@vector_store_i2_const_index_two_partial_stores`
+// in:
+//  * "vector-emulate-narrow-type-unaligned-non-atomic.mlir".
+//
+// NOTE: By default, all RMW sequences are atomic. Set `disableAtomicRMW` to
+// `true` to generate non-atomic RMW sequences.
 struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
   using OpConversionPattern::OpConversionPattern;
 
@@ -464,7 +502,7 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
           op, "impossible to pack emulated elements into container elements "
               "(bit-wise misalignment)");
     }
-    int numSrcElemsPerDest = containerBits / emulatedBits;
+    int emulatedPerContainerElem = containerBits / emulatedBits;
 
     // Adjust the number of elements to store when emulating narrow types.
     // Here only the 1-D vector store is considered, and the N-D memref types
@@ -480,7 +518,8 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
     // vector<4xi8>
 
     auto origElements = valueToStore.getType().getNumElements();
-    bool isAlignedEmulation = origElements % numSrcElemsPerDest == 0;
+    // Note, per-element-alignment was already verified above.
+    bool isFullyAligned = origElements % emulatedPerContainerElem == 0;
 
     auto stridedMetadata =
         rewriter.create<memref::ExtractStridedMetadataOp>(loc, op.getBase());
@@ -496,9 +535,8 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
             getAsOpFoldResult(adaptor.getIndices()));
 
     std::optional<int64_t> foldedNumFrontPadElems =
-        isAlignedEmulation
-            ? 0
-            : getConstantIntValue(linearizedInfo.intraDataOffset);
+        isFullyAligned ? 0
+                       : getConstantIntValue(linearizedInfo.intraDataOffset);
 
     if (!foldedNumFrontPadElems) {
       return rewriter.notifyMatchFailure(
@@ -516,10 +554,10 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
     // need unaligned emulation because the store address is aligned and the
     // source is a whole byte.
     bool emulationRequiresPartialStores =
-        !isAlignedEmulation || *foldedNumFrontPadElems != 0;
+        !isFullyAligned || *foldedNumFrontPadElems != 0;
     if (!emulationRequiresPartialStores) {
       // Basic case: storing full bytes.
-      auto numElements = origElements / numSrcElemsPerDest;
+      auto numElements = origElements / emulatedPerContainerElem;
       auto bitCast = rewriter.create<vector::BitCastOp>(
           loc, VectorType::get(numElements, containerElemTy),
           op.getValueToStore());
@@ -567,7 +605,7 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
 
     // Build a mask used for rmw.
     auto subWidthStoreMaskType =
-        VectorType::get({numSrcElemsPerDest}, rewriter.getI1Type());
+        VectorType::get({emulatedPerContainerElem}, rewriter.getI1Type());
 
     auto storeFunc = disableAtomicRMW ? nonAtomicRMW : atomicRMW;
 
@@ -576,10 +614,11 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
     // with the unaligned part so that the rest elements are aligned to width
     // boundary.
     auto frontSubWidthStoreElem =
-        (numSrcElemsPerDest - *foldedNumFrontPadElems) % numSrcElemsPerDest;
+        (emulatedPerContainerElem - *foldedNumFrontPadElems) %
+        emulatedPerContainerElem;
     if (frontSubWidthStoreElem > 0) {
-      SmallVector<bool> frontMaskValues(numSrcElemsPerDest, false);
-      if (*foldedNumFrontPadElems + origElements < numSrcElemsPerDest) {
+      SmallVector<bool> frontMaskValues(emulatedPerContainerElem, false);
+      if (*foldedNumFrontPadElems + origElements < emulatedPerContainerElem) {
         std::fill_n(frontMaskValues.begin() + *foldedNumFrontPadElems,
                     origElements, true);
         frontSubWidthStoreElem = origElements;
@@ -590,7 +629,7 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
       auto frontMask = rewriter.create<arith::ConstantOp>(
           loc, DenseElementsAttr::get(subWidthStoreMaskType, frontMaskValues));
 
-      currentSourceIndex = numSrcElemsPerDest - (*foldedNumFrontPadElems);
+      currentSourceIndex = emulatedPerContainerElem - (*foldedNumFrontPadElems);
       auto value =
           extractSliceIntoByte(rewriter, loc, valueToStore, 0,
                                frontSubWidthStoreElem, *foldedNumFrontPadElems);
@@ -614,8 +653,9 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
     // After the previous step, the store address is aligned to the emulated
     // width boundary.
     int64_t fullWidthStoreSize =
-        (origElements - currentSourceIndex) / numSrcElemsPerDest;
-    int64_t numNonFullWidthElements = fullWidthStoreSize * numSrcElemsPerDest;
+        (origElements - currentSourceIndex) / emulatedPerContainerElem;
+    int64_t numNonFullWidthElements =
+        fullWidthStoreSize * emulatedPerContainerElem;
     if (fullWidthStoreSize > 0) {
       auto fullWidthStorePart = staticallyExtractSubvector(
           rewriter, loc, valueToStore, currentSourceIndex,
@@ -624,7 +664,8 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
       auto originType = cast<VectorType>(fullWidthStorePart.getType());
       auto memrefElemType = getElementTypeOrSelf(memrefBase.getType());
       auto storeType = VectorType::get(
-          {originType.getNumElements() / numSrcElemsPerDest}, memrefElemType);
+          {originType.getNumElements() / emulatedPerContainerElem},
+          memrefElemType);
       auto bitCast = rewriter.create<vector::BitCastOp>(loc, storeType,
                                                         fullWidthStorePart);
       rewriter.create<vector::StoreOp>(loc, bitCast.getResult(), memrefBase,
@@ -646,7 +687,7 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
                                currentSourceIndex, remainingElements, 0);
 
       // Generate back mask.
-      auto maskValues = SmallVector<bool>(numSrcElemsPerDest, 0);
+      auto maskValues = SmallVector<bool>(emulatedPerContainerElem, 0);
       std::fill_n(maskValues.begin(), remainingElements, 1);
       auto backMask = rewriter.create<arith::ConstantOp>(
           loc, DenseElementsAttr::get(subWidthStoreMaskType, maskValues));
@@ -960,7 +1001,8 @@ struct ConvertVectorMaskedLoad final
     // subvector at the proper offset after bit-casting.
     auto origType = op.getVectorType();
     auto origElements = origType.getNumElements();
-    bool isAlignedEmulation = origElements % emulatedPerContainerElem == 0;
+    // Note, per-element-alignment was already verified above.
+    bool isFullyAligned = origElements % emulatedPerContainerElem == 0;
 
     auto stridedMetadata =
         rewriter.create<memref::ExtractStridedMetadataOp>(loc, op.getBase());
@@ -975,9 +1017,8 @@ struct ConvertVectorMaskedLoad final
             getAsOpFoldResult(adaptor.getIndices()));
 
     std::optional<int64_t> foldedIntraVectorOffset =
-        isAlignedEmulation
-            ? 0
-            : getConstantIntValue(linearizedInfo.intraDataOffset);
+        isFullyAligned ? 0
+                       : getConstantIntValue(linearizedInfo.intraDataOffset);
 
     int64_t maxIntraDataOffset =
         foldedIntraVectorOffset.value_or(emulatedPerContainerElem - 1);
@@ -1001,7 +1042,7 @@ struct ConvertVectorMaskedLoad final
       passthru = dynamicallyInsertSubVector(
           rewriter, loc, passthru, emptyVector, linearizedInfo.intraDataOffset,
           origElements);
-    } else if (!isAlignedEmulation) {
+    } else if (!isFullyAligned) {
       passthru = staticallyInsertSubvector(rewriter, loc, passthru, emptyVector,
                                            *foldedIntraVectorOffset);
     }
@@ -1029,7 +1070,7 @@ struct ConvertVectorMaskedLoad final
       mask = dynamicallyInsertSubVector(rewriter, loc, mask, emptyMask,
                                         linearizedInfo.intraDataOffset,
                                         origElements);
-    } else if (!isAlignedEmulation) {
+    } else if (!isFullyAligned) {
       mask = staticallyInsertSubvector(rewriter, loc, op.getMask(), emptyMask,
                                        *foldedIntraVectorOffset);
     }
@@ -1040,7 +1081,7 @@ struct ConvertVectorMaskedLoad final
       result = dynamicallyExtractSubVector(
           rewriter, loc, result, op.getPassThru(),
           linearizedInfo.intraDataOffset, origElements);
-    } else if (!isAlignedEmulation) {
+    } else if (!isFullyAligned) {
       result = staticallyExtractSubvector(
           rewriter, loc, result, *foldedIntraVectorOffset, origElements);
     }
diff --git a/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned-non-atomic.mlir b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned-non-atomic.mlir
index 1d6263535ae80..d27e99a54529c 100644
--- a/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned-non-atomic.mlir
+++ b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned-non-atomic.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt --test-emulate-narrow-int="arith-compute-bitwidth=1 memref-load-bitwidth=8 disable-atomic-rmw=true" --cse --split-input-file %s | FileCheck %s
 
+// NOTE: In this file all RMW stores are non-atomic.
+
 // TODO: remove memref.alloc() in the tests to eliminate noises.
 // memref.alloc exists here because sub-byte vector data types such as i2
 // are currently not supported as input arguments.
@@ -8,121 +10,144 @@
 /// vector.store
 ///----------------------------------------------------------------------------------------
 
-func.func @vector_store_i2_const_index_two_partial_stores(%arg0: vector<3xi2>) {
-    %0 = memref.alloc() : memref<3x3xi2>
+func.func @vector_store_i2_const_index_two_partial_stores(%src: vector<3xi2>) {
+    %dest = memref.alloc() : memref<3x3xi2>
     %c0 = arith.constant 0 : index
     %c2 = arith.constant 2 : index
-    vector.store %arg0, %0[%c2, %c0] :memref<3x3xi2>, vector<3xi2>
+    vector.store %src, %dest[%c2, %c0] :memref<3x3xi2>, vector<3xi2>
     return
 }
 
-// Emit two non-atomic RMW partial stores. Store 6 bits from the input vector (bits [12:18)),
-// into bytes [1:2] from a 3-byte output memref. Due to partial storing,
-// both bytes are accessed partially through masking.
-
-// CHECK: func @vector_store_i2_const_index_two_partial_stores(
-// CHECK-SAME: %[[ARG0:.+]]: vector<3xi2>)
-// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<3xi8>
-// CHECK: %[[C1:.+]] = arith.constant 1 : index
-
-// Part 1 RMW sequence
-// CHECK: %[[CST:.+]] = arith.constant dense<[false, false, true, true]>
-// CHECK: %[[CST0:.+]] = arith.constant dense<0> : vector<4xi2>
-// CHECK: %[[EXTRACT:.+]] = vector.extract_strided_slice %[[ARG0]]
-// CHECK-SAME: {offsets = [0], sizes = [2], strides = [1]} : vector<3xi2> to vector<2xi2>
-// CHECK: %[[INSERT:.+]] = vector.insert_strided_slice %[[EXTRACT]], %[[CST0]]
-// CHECK-SAME: {offsets = [2], strides = [1]} : vector<2xi2> into vector<4xi2>
-// CHECK: %[[LOAD:.+]] = vector.load
-// CHECK: %[[DOWNCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi8> to vector<4xi2>
-// CHECK: %[[SELECT:.+]] = arith.select %[[CST]], %[[INSERT]], %[[DOWNCAST]]
-// CHECK: %[[UPCAST:.+]] = vector.bitcast %[[SELECT]]
-// CHECK: vector.store %[[UPCAST]], %[[ALLOC]][%[[C1]]]
-
-// Part 2 RMW sequence
-// CHECK: %[[OFFSET:.+]] = arith.addi %[[C1]], %[[C1]] : index
-// CHECK: %[[EXTRACT2:.+]] = vector.extract_strided_slice %[[ARG0]]
-// CHECK-SAME: {offsets = [2], sizes = [1], strides = [1]} : vector<3xi2> to vector<1xi2>
-// CHECK: %[[INSERT2:.+]] = vector.insert_strided_slice %[[EXTRACT2]], %[[CST0]]
-// CHECK-SAME: {offsets = [0], strides = [1]} : vector<1xi2> into vector<4xi2>
-// CHECK: %[[CST1:.+]] = arith.constant dense<[true, false, false, false]> : vector<4xi1>
-// CHECK: %[[LOAD2:.+]] = vector.load
-// CHECK: %[[UPCAST2:.+]] = vector.bitcast %[[LOAD2]] : vector<1xi8> to vector<4xi2>
-// CHECK: %[[SELECT2:.+]] = arith.select %[[CST1]], %[[INSERT2]], %[[UPCAST2]]
-// CHECK: %[[DOWNCAST2:.+]] = vector.bitcast %[[SELECT2]]
-// CHECK: vector.store %[[DOWNCAST2]], %[[ALLOC]][%[[OFFSET]]]
-
+//  Store 6 bits from the input vector into bytes [1:2] of a 3-byte destination
+//  memref, i.e. into bits [12:18) of a 24-bit destination container
+//  (`memref<3x3xi2>` is emulated via `memref<3xi8>`). This requires two
+//  non-atomic RMW partial stores. Due to partial storing, both bytes are
+//  accessed partially through masking.
+
+//      CHECK:  func @vector_store_i2_const_index_two_partial_stores(
+// CHECK-SAME:    %[[SRC:.+]]: vector<3xi2>)
+
+//      CHECK:  %[[DEST:.+]] = memref.alloc() : memref<3xi8>
+//      CHECK:  %[[C1:.+]] = arith.constant 1 : index
+
+// RMW sequence for Byte 1
+//      CHECK:  %[[MASK_1:.+]] = arith.constant dense<[false, false, true, true]>
+//      CHECK:  %[[INIT:.+]] = arith.constant dense<0> : vector<4xi2>
+//      CHECK:  %[[SRC_SLICE_1:.+]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-SAME:    {offsets = [0], sizes = [2], strides = [1]} : vector<3xi2> to vector<2xi2>
+//      CHECK:  %[[INIT_WITH_SLICE_1:.+]] = vector.insert_strided_slice %[[SRC_SLICE_1]], %[[INIT]]
+// CHECK-SAME:    {offsets = [2], strides = [1]} : vector<2xi2> into vector<4xi2>
+//      CHECK:  %[[DEST_BYTE_1:.+]] = vector.load %[[DEST]][%[[C1]]] : memref<3xi8>, vector<1xi8>
+//      CHECK:  %[[DEST_BYTE_1_AS_I2:.+]] = vector.bitcast %[[DEST_BYTE_1]]
+// CHECK-SAME:    vector<1xi8> to vector<4xi2>
+//      CHECK:  %[[RES_BYTE_1:.+]] = arith.select %[[MASK_1]], %[[INIT_WITH_SLICE_1]], %[[DEST_BYTE_1_AS_I2]]
+//      CHECK:  %[[RES_BYTE_1_AS_I8:.+]] = vector.bitcast %[[RES_BYTE_1]]
+// CHECK-SAME:    vector<4xi2> to vector<1xi8>
+//      CHECK:  vector.store %[[RES_BYTE_1_AS_I8]], %[[DEST]][%[[C1]]]
+
+// RMW sequence for Byte 2
+//      CHECK:  %[[OFFSET:.+]] = arith.addi %[[C1]], %[[C1]] : index
+//      CHECK:  %[[SRC_SLICE_2:.+]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-SAME:    {offsets = [2], sizes = [1], strides = [1]} : vector<3xi2> to vector<1xi2>
+//      CHECK:  %[[INIT_WITH_SLICE_2:.+]] = vector.insert_strided_slice %[[SRC_SLICE_2]], %[[INIT]]
+// CHECK-SAME:    {offsets = [0], strides = [1]} : vector<1xi2> into vector<4xi2>
+//      CHECK:  %[[MASK_2:.+]] = arith.constant dense<[true, false, false, false]> : vector<4xi1>
+//      CHECK:  %[[DEST_BYTE_2:.+]] = vector.load %[[DEST]][%[[OFFSET]]] : memref<3xi8>, vector<1xi8>
+//      CHECK:  %[[DEST_BYTE_2_AS_I2:.+]] = vector.bitcast %[[DEST_BYTE_2]]
+// CHECK-SAME:    vector<1xi8> to vector<4xi2>
+//      CHECK:  %[[RES_BYTE_2:.+]] = arith.select %[[MASK_2]], %[[INIT_WITH_SLICE_2]], %[[DEST_BYTE_2_AS_I2]]
+//      CHECK:  %[[RES_BYTE_2_AS_I8:.+]] = vector.bitcast %[[RES_BYTE_2]]
+// CHECK-SAME:    vector<4xi2> to vector<1xi8>
+//      CHECK:  vector.store %[[RES_BYTE_2_AS_I8]], %[[DEST]][%[[OFFSET]]]
 
 // -----
 
-func.func @vector_store_i2_two_partial_one_full_stores(%arg0: vector<7xi2>) {
-    %0 = memref.alloc() : memref<3x7xi2>
+func.func @vector_store_i2_two_partial_one_full_stores(%src: vector<7xi2>) {
+    %dest = memref.alloc() : memref<3x7xi2>
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
-    vector.store %arg0, %0[%c1, %c0] :memref<3x7xi2>, vector<7xi2>
+    vector.store %src, %dest[%c1, %c0] :memref<3x7xi2>, vector<7xi2>
     return
 }
 
-// In this example, emit two RMW stores and one full-width store.
-
-// CHECK: func @vector_store_i2_two_partial_one_full_stores(
-// CHECK-SAME: %[[ARG0:.+]]:
-// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<6xi8>
-// CHECK: %[[C1:.+]] = arith.constant 1 : index
-// CHECK: %[[CST:.+]] = arith.constant dense<[false, false, false, true]>
-// CHECK: %[[CST0:.+]] = arith.constant dense<0> : vector<4xi2>
-// CHECK: %[[EXTRACT:.+]] = vector.extract_strided_slice %[[ARG0]]
-// CHECK-SAME: {offsets = [0], sizes = [1], strides = [1]}
-// CHECK: %[[INSERT:.+]] = vector.insert_strided_slice %[[EXTRACT]], %[[CST0]]
-// CHECK-SAME: {offsets = [3], strides = [1]}
-// First sub-width RMW:
-// CHECK: %[[LOAD:.+]] = vector.load %[[ALLOC]][%[[C1]]]
-// CHECK: %[[UPCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi8> to vector<4xi2>
-// CHECK: %[[SELECT:.+]] = arith.select %[[CST]], %[[INSERT]], %[[UPCAST]]
-// CHECK: %[[DOWNCAST:.+]] = vector.bitcast %[[SELECT]]
-// CHECK: vector.store %[[DOWNCAST]], %[[ALLOC]][%[[C1]]]
+// Store 14 bits from the input vector into bytes [1:3] of a 6-byte destination
+// memref, i.e. bits [14:28) of a 48-bit destination container memref
+// (`memref<3x7xi2>` is emulated via `memref<6xi8>`). This requires two
+// non-atomic RMW stores (for the "boundary" bytes) and one full byte store
+// (for the "middle" byte). Note that partial stores require masking.
+
+//      CHECK: func @vector_store_i2_two_partial_one_full_stores(
+// CHECK-SAME:    %[[SRC:.+]]:
+
+//      CHECK:  %[[DEST:.+]] = memref.alloc() : memref<6xi8>
+//      CHECK:  %[[C1:.+]] = arith.constant 1 : index
+
+// First partial/RMW store:
+//      CHECK:  %[[MASK_1:.+]] = arith.constant dense<[false, false, false, true]>
+//      CHECK:  %[[INIT:.+]] = arith.constant dense<0> : vector<4xi2>
+//      CHECK:  %[[SRC_SLICE_0:.+]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-SAME:    {offsets = [0], sizes = [1], strides = [1]}
+//      CHECK:  %[[INIT_WITH_SLICE_1:.+]] = vector.insert_strided_slice %[[SRC_SLICE_0]], %[[INIT]]
+// CHECK-SAME:    {offsets = [3], strides = [1]}
+//      CHECK:  %[[DEST_BYTE_1:.+]] = vector.load %[[DEST]][%[[C1]]]
+//      CHECK:  %[[DEST_BYTE_AS_I2:.+]] = vector.bitcast %[[DEST_BYTE_1]]
+// CHECK-SAME:    : vector<1xi8> to vector<4xi2>
+//      CHECK:  %[[RES_BYTE_1:.+]] = arith.select %[[MASK_1]], %[[INIT_WITH_SLICE_1]], %[[DEST_BYTE_AS_I2]]
+//      CHECK:  %[[RES_BYTE_1_AS_I8:.+]] = vector.bitcast %[[RES_BYTE_1]]
+// CHECK-SAME:    : vector<4xi2> to vector<1xi8>
+//      CHECK:  vector.store %[[RES_BYTE_1_AS_I8]], %[[DEST]][%[[C1]]]
 
 // Full-width store:
-// CHECK: %[[INDEX:.+]] = arith.addi %[[C1]], %[[C1]]
-// CHECK: %[[EXTRACT1:.+]] = vector.extract_strided_slice %[[ARG0]]
-// CHECK-SAME: {offsets = [1], sizes = [4], strides = [1]}
-// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[EXTRACT1]]
-// CHECK: vector.store %[[BITCAST]], %[[ALLOC]][%[[INDEX]]]
-
-// Second sub-width RMW:
-// CHECK: %[[INDEX2:.+]] = arith.addi %[[INDEX]], %[[C1]]
-// CHECK: %[[EXTRACT2:.+]] = vector.extract_strided_slice %[[ARG0]]
-// CHECK-SAME: {offsets = [5], sizes = [2], strides = [1]}
-// CHECK: %[[INSERT2:.+]] = vector.insert_strided_slice %[[EXTRACT2]]
-// CHECK-SAME: {offsets = [0], strides = [1]}
-// CHECK: %[[CST1:.+]] = arith.constant dense<[true, true, false, false]>
-// CHECK: %[[LOAD2:.+]] = vector.load %[[ALLOC]][%[[INDEX2]]]
-// CHECK: %[[UPCAST2:.+]] = vector.bitcast %[[LOAD2]]
-// CHECK: %[[SELECT2:.+]] = arith.select %[[CST1]], %[[INSERT2]], %[[UPCAST2]]
-// CHECK: %[[DOWNCAST2:.+]] = vector.bitcast %[[SELECT2]]
-// CHECK: vector.store %[[DOWNCAST2]], %[[ALLOC]][%[[INDEX2]]]
+//      CHECK:  %[[C2:.+]] = arith.addi %[[C1]], %[[C1]]
+//      CHECK:  %[[SRC_SLICE_1:.+]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-SAME:    {offsets = [1], sizes = [4], strides = [1]}
+//      CHECK:  %[[SRC_SLICE_1_AS_I8:.+]] = vector.bitcast %[[SRC_SLICE_1]]
+// CHECK-SAME:    : vector<4xi2> to vector<1xi8>
+//      CHECK:  vector.store %[[SRC_SLICE_1_AS_I8]], %[[DEST]][%[[C2]]]
+
+// Second partial/RMW store:
+//      CHECK:  %[[C3:.+]] = arith.addi %[[C2]], %[[C1]]
+//      CHECK:  %[[SRC_SLICE_2:.+]] = vector.extract_strided_slice %[[SRC]]
+// CHECK-SAME:    {offsets = [5], sizes = [2], strides = [1]}
+//      CHECK:  %[[INIT_WITH_SLICE2:.+]] = vector.insert_strided_slice %[[SRC_SLICE_2]]
+// CHECK-SAME:    {offsets = [0], strides = [1]}
+//      CHECK:  %[[MASK_2:.+]] = arith.constant dense<[true, true, false, false]>
+//      CHECK:  %[[DEST_BYTE_2:.+]] = vector.load %[[DEST]][%[[C3]]]
+//      CHECK:  %[[DEST_BYTE_2_AS_I2:.+]] = vector.bitcast %[[DEST_BYTE_2]]
+//      CHECK:  %[[RES_BYTE_2:.+]] = arith.select %[[MASK_2]], %[[INIT_WITH_SLICE2]], %[[DEST_BYTE_2_AS_I2]]
+//      CHECK:  %[[RES_BYTE_2_AS_I8:.+]] = vector.bitcast %[[RES_BYTE_2]]
+// CHECK-SAME:    : vector<4xi2> to vector<1xi8>
+//      CHECK:  vector.store %[[RES_BYTE_2_AS_I8]], %[[DEST]][%[[C3]]]
 
 // -----
 
-func.func @vector_store_i2_const_index_one_partial_store(%arg0: vector<1xi2>) {
-    %0 = memref.alloc() : memref<4x1xi2>
+func.func @vector_store_i2_const_index_one_partial_store(%src: vector<1xi2>) {
+    %dest = memref.alloc() : memref<4x1xi2>
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
-    vector.store %arg0, %0[%c1, %c0] :memref<4x1xi2>, vector<1xi2>
+    vector.store %src, %dest[%c1, %c0] :memref<4x1xi2>, vector<1xi2>
     return
 }
 
-// in this test, only emit partial RMW store as the store is within one byte.
-
-// CHECK: func @vector_store_i2_const_index_one_partial_store(
-// CHECK-SAME: %[[ARG0:.+]]: vector<1xi2>)
-// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1xi8>
-// CHECK: %[[C0:.+]] = arith.constant 0 : index
-// CHECK: %[[CST:.+]] = arith.constant dense<[false, true, false, false]>
-// CHECK: %[[CST0:.+]] = arith.constant dense<0> : vector<4xi2>
-// CHECK: %[[INSERT:.+]] = vector.insert_strided_slice %[[ARG0]], %[[CST0]]
-// CHECK-SAME: {offsets = [1], strides = [1]} : vector<1xi2> into vector<4xi2>
-// CHECK: %[[LOAD:.+]] = vector.load %[[ALLOC]][%[[C0]]] : memref<1xi8>, vector<1xi8>
-// CHECK: %[[UPCAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi8> to vector<4xi2>
-// CHECK: %[[SELECT:.+]] = arith.select %[[CST]], %[[INSERT]], %[[UPCAST]]
-// CHECK: %[[DOWNCAST:.+]] = vector.bitcast %[[SELECT]]
-// CHECK: vector.store %[[DOWNCAST]], %[[ALLOC]][%[[C0]]]
+// Store 2 bits from the input vector into byte 0 of a 1-byte destination
+// memref, i.e. bits [2:4) of an 8-bit destination container memref
+// (`memref<4x1xi2>` is emulated via `memref<1xi8>`). This requires one
+// non-atomic RMW.
+
+//      CHECK:  func @vector_store_i2_const_index_one_partial_store(
+// CHECK-SAME:    %[[SRC:.+]]: vector<1xi2>)
+
+//      CHECK:  %[[DEST:.+]] = memref.alloc() : memref<1xi8>
+//      CHECK:  %[[C0:.+]] = arith.constant 0 : index
+
+//      CHECK:  %[[MASK:.+]] = arith.constant dense<[false, true, false, false]>
+//      CHECK:  %[[INIT:.+]] = arith.constant dense<0> : vector<4xi2>
+//      CHECK:  %[[INIT_WITH_SLICE:.+]] = vector.insert_strided_slice %[[SRC]], %[[INIT]]
+// CHECK-SAME:    {offsets = [1], strides = [1]} : vector<1xi2> into vector<4xi2>
+//      CHECK:  %[[DEST_BYTE:.+]] = vector.load %[[DEST]][%[[C0]]] : memref<1xi8>, vector<1xi8>
+//      CHECK:  %[[DEST_BYTE_AS_I2:.+]] = vector.bitcast %[[DEST_BYTE]]
+// CHECK-SAME:    : vector<1xi8> to vector<4xi2>
+//      CHECK:  %[[RES_BYTE:.+]] = arith.select %[[MASK]], %[[INIT_WITH_SLICE]], %[[DEST_BYTE_AS_I2]]
+//      CHECK:  %[[RES_BYTE_AS_I8:.+]] = vector.bitcast %[[RES_BYTE]]
+// CHECK-SAME:    : vector<4xi2> to vector<1xi8>
+//      CHECK:  vector.store %[[RES_BYTE_AS_I8]], %[[DEST]][%[[C0]]]

From 50581ef1ee45815b9230043319de5ae93680d4ad Mon Sep 17 00:00:00 2001
From: Chris Bieneman <chris.bieneman@me.com>
Date: Sat, 15 Feb 2025 14:18:45 -0600
Subject: [PATCH 036/109] [NFC] Fix warning in recent commit

---
 clang/lib/Sema/SemaHLSL.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index be45761552290..957c3a0888438 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -3223,7 +3223,7 @@ bool SemaHLSL::TransformInitList(const InitializedEntity &Entity,
     Expr *E = Init->getInit(I);
     if (E->HasSideEffects(Ctx)) {
       QualType Ty = E->getType();
-      if (auto *RTy = Ty->getAs<RecordType>())
+      if (Ty->isRecordType())
         E = new (Ctx) MaterializeTemporaryExpr(Ty, E, E->isLValue());
       E = new (Ctx) OpaqueValueExpr(E->getBeginLoc(), Ty, E->getValueKind(),
                                     E->getObjectKind(), E);

From e5f4019f69948f55b77fcb5f63ae8c296418432c Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sat, 15 Feb 2025 20:33:03 +0000
Subject: [PATCH 037/109] [AArch64] Add extending reduction costs for addlv and
 dot

This adds basic getExtendedReductionCost and getMulAccReductionCost hooks to
account for add reductions (uaddlv/saddlv) and mla reductions with dotprod.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    |   48 +
 .../AArch64/AArch64TargetTransformInfo.h      |    9 +
 .../SLPVectorizer/AArch64/horizontal.ll       |    2 +-
 .../SLPVectorizer/AArch64/vecreduceadd.ll     | 1151 +++++++++++++++++
 4 files changed, 1209 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a8cf4aba6186d..9e286a91cae3b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4635,6 +4635,54 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
   return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
 }
 
+InstructionCost AArch64TTIImpl::getExtendedReductionCost(
+    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
+    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
+  EVT VecVT = TLI->getValueType(DL, VecTy);
+  EVT ResVT = TLI->getValueType(DL, ResTy);
+
+  if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
+      VecVT.getSizeInBits() >= 64) {
+    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
+
+    // The legal cases are:
+    //   UADDLV 8/16/32->32
+    //   UADDLP 32->64
+    unsigned RevVTSize = ResVT.getSizeInBits();
+    if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
+         RevVTSize <= 32) ||
+        ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
+         RevVTSize <= 32) ||
+        ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
+         RevVTSize <= 64))
+      return (LT.first - 1) * 2 + 2;
+  }
+
+  return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
+                                         CostKind);
+}
+
+InstructionCost
+AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
+                                       VectorType *VecTy,
+                                       TTI::TargetCostKind CostKind) {
+  EVT VecVT = TLI->getValueType(DL, VecTy);
+  EVT ResVT = TLI->getValueType(DL, ResTy);
+
+  if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) {
+    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
+
+    // The legal cases with dotprod are
+    //   UDOT 8->32
+    // Which requires an additional uaddv to sum the i32 values.
+    if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
+         ResVT == MVT::i32)
+      return LT.first + 2;
+  }
+
+  return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, VecTy, CostKind);
+}
+
 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
   static const CostTblEntry ShuffleTbl[] = {
       { TTI::SK_Splice, MVT::nxv16i8,  1 },
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 481cb5511a331..c7f8450213ae5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -425,6 +425,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
                                              std::optional<FastMathFlags> FMF,
                                              TTI::TargetCostKind CostKind);
 
+  InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
+                                           Type *ResTy, VectorType *ValTy,
+                                           FastMathFlags FMF,
+                                           TTI::TargetCostKind CostKind);
+
+  InstructionCost getMulAccReductionCost(
+      bool IsUnsigned, Type *ResTy, VectorType *Ty,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
+
   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                  ArrayRef<int> Mask,
                                  TTI::TargetCostKind CostKind, int Index,
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index 7c5f9847db1f4..8dc0181425625 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -228,7 +228,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; YAML-NEXT: Function:        test_unrolled_select
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-41'
+; YAML-NEXT:   - Cost:            '-44'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '10'
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll
new file mode 100644
index 0000000000000..36826eb6681c8
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll
@@ -0,0 +1,1151 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -slp-threshold=-2 -S -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=COST %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+; COST-LABEL: Function:  sext_v4i8_i16
+; COST: Cost:            '-2'
+define i16 @sext_v4i8_i16(ptr %x) {
+; CHECK-LABEL: @sext_v4i8_i16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i8> [[TMP0]] to <4 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+entry:
+  %0 = load i8, ptr %x
+  %conv = sext i8 %0 to i16
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 1
+  %1 = load i8, ptr %arrayidx.1
+  %conv.1 = sext i8 %1 to i16
+  %add.1 = add nsw i16 %conv, %conv.1
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 2
+  %2 = load i8, ptr %arrayidx.2
+  %conv.2 = sext i8 %2 to i16
+  %add.2 = add nsw i16 %add.1, %conv.2
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 3
+  %3 = load i8, ptr %arrayidx.3
+  %conv.3 = sext i8 %3 to i16
+  %add.3 = add nsw i16 %add.2, %conv.3
+  ret i16 %add.3
+}
+
+; COST-LABEL: Function:  sext_v8i8_i16
+; COST: Cost:            '-12'
+define i16 @sext_v8i8_i16(ptr %x) {
+; CHECK-LABEL: @sext_v8i8_i16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[X:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP1]])
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+entry:
+  %0 = load i8, ptr %x
+  %conv = sext i8 %0 to i16
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 1
+  %1 = load i8, ptr %arrayidx.1
+  %conv.1 = sext i8 %1 to i16
+  %add.1 = add nsw i16 %conv, %conv.1
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 2
+  %2 = load i8, ptr %arrayidx.2
+  %conv.2 = sext i8 %2 to i16
+  %add.2 = add nsw i16 %add.1, %conv.2
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 3
+  %3 = load i8, ptr %arrayidx.3
+  %conv.3 = sext i8 %3 to i16
+  %add.3 = add nsw i16 %add.2, %conv.3
+  %arrayidx.4 = getelementptr inbounds nuw i8, ptr %x, i64 4
+  %4 = load i8, ptr %arrayidx.4
+  %conv.4 = sext i8 %4 to i16
+  %add.4 = add nsw i16 %add.3, %conv.4
+  %arrayidx.5 = getelementptr inbounds nuw i8, ptr %x, i64 5
+  %5 = load i8, ptr %arrayidx.5
+  %conv.5 = sext i8 %5 to i16
+  %add.5 = add nsw i16 %add.4, %conv.5
+  %arrayidx.6 = getelementptr inbounds nuw i8, ptr %x, i64 6
+  %6 = load i8, ptr %arrayidx.6
+  %conv.6 = sext i8 %6 to i16
+  %add.6 = add nsw i16 %add.5, %conv.6
+  %arrayidx.7 = getelementptr inbounds nuw i8, ptr %x, i64 7
+  %7 = load i8, ptr %arrayidx.7
+  %conv.7 = sext i8 %7 to i16
+  %add.7 = add nsw i16 %add.6, %conv.7
+  ret i16 %add.7
+}
+
+; COST-LABEL: Function:  sext_v16i8_i16
+; COST: Cost:            '-28'
+define i16 @sext_v16i8_i16(ptr %x) {
+; CHECK-LABEL: @sext_v16i8_i16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <16 x i8> [[TMP0]] to <16 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP1]])
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+entry:
+  %0 = load i8, ptr %x
+  %conv = sext i8 %0 to i16
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 1
+  %1 = load i8, ptr %arrayidx.1
+  %conv.1 = sext i8 %1 to i16
+  %add.1 = add nsw i16 %conv, %conv.1
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 2
+  %2 = load i8, ptr %arrayidx.2
+  %conv.2 = sext i8 %2 to i16
+  %add.2 = add nsw i16 %add.1, %conv.2
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 3
+  %3 = load i8, ptr %arrayidx.3
+  %conv.3 = sext i8 %3 to i16
+  %add.3 = add nsw i16 %add.2, %conv.3
+  %arrayidx.4 = getelementptr inbounds nuw i8, ptr %x, i64 4
+  %4 = load i8, ptr %arrayidx.4
+  %conv.4 = sext i8 %4 to i16
+  %add.4 = add nsw i16 %add.3, %conv.4
+  %arrayidx.5 = getelementptr inbounds nuw i8, ptr %x, i64 5
+  %5 = load i8, ptr %arrayidx.5
+  %conv.5 = sext i8 %5 to i16
+  %add.5 = add nsw i16 %add.4, %conv.5
+  %arrayidx.6 = getelementptr inbounds nuw i8, ptr %x, i64 6
+  %6 = load i8, ptr %arrayidx.6
+  %conv.6 = sext i8 %6 to i16
+  %add.6 = add nsw i16 %add.5, %conv.6
+  %arrayidx.7 = getelementptr inbounds nuw i8, ptr %x, i64 7
+  %7 = load i8, ptr %arrayidx.7
+  %conv.7 = sext i8 %7 to i16
+  %add.7 = add nsw i16 %add.6, %conv.7
+  %arrayidx.8 = getelementptr inbounds nuw i8, ptr %x, i64 8
+  %8 = load i8, ptr %arrayidx.8
+  %conv.8 = sext i8 %8 to i16
+  %add.8 = add nsw i16 %add.7, %conv.8
+  %arrayidx.9 = getelementptr inbounds nuw i8, ptr %x, i64 9
+  %9 = load i8, ptr %arrayidx.9
+  %conv.9 = sext i8 %9 to i16
+  %add.9 = add nsw i16 %add.8, %conv.9
+  %arrayidx.10 = getelementptr inbounds nuw i8, ptr %x, i64 10
+  %10 = load i8, ptr %arrayidx.10
+  %conv.10 = sext i8 %10 to i16
+  %add.10 = add nsw i16 %add.9, %conv.10
+  %arrayidx.11 = getelementptr inbounds nuw i8, ptr %x, i64 11
+  %11 = load i8, ptr %arrayidx.11
+  %conv.11 = sext i8 %11 to i16
+  %add.11 = add nsw i16 %add.10, %conv.11
+  %arrayidx.12 = getelementptr inbounds nuw i8, ptr %x, i64 12
+  %12 = load i8, ptr %arrayidx.12
+  %conv.12 = sext i8 %12 to i16
+  %add.12 = add nsw i16 %add.11, %conv.12
+  %arrayidx.13 = getelementptr inbounds nuw i8, ptr %x, i64 13
+  %13 = load i8, ptr %arrayidx.13
+  %conv.13 = sext i8 %13 to i16
+  %add.13 = add nsw i16 %add.12, %conv.13
+  %arrayidx.14 = getelementptr inbounds nuw i8, ptr %x, i64 14
+  %14 = load i8, ptr %arrayidx.14
+  %conv.14 = sext i8 %14 to i16
+  %add.14 = add nsw i16 %add.13, %conv.14
+  %arrayidx.15 = getelementptr inbounds nuw i8, ptr %x, i64 15
+  %15 = load i8, ptr %arrayidx.15
+  %conv.15 = sext i8 %15 to i16
+  %add.15 = add nsw i16 %add.14, %conv.15
+  ret i16 %add.15
+}
+
+; COST-LABEL: Function:  sext_v32i8_i16
+; COST: Cost:            '-57'
+define i16 @sext_v32i8_i16(ptr %x) {
+; CHECK-LABEL: @sext_v32i8_i16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[X:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <32 x i8> [[TMP0]] to <32 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> [[TMP1]])
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+entry:
+  %0 = load i8, ptr %x
+  %conv = sext i8 %0 to i16
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 1
+  %1 = load i8, ptr %arrayidx.1
+  %conv.1 = sext i8 %1 to i16
+  %add.1 = add nsw i16 %conv, %conv.1
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 2
+  %2 = load i8, ptr %arrayidx.2
+  %conv.2 = sext i8 %2 to i16
+  %add.2 = add nsw i16 %add.1, %conv.2
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 3
+  %3 = load i8, ptr %arrayidx.3
+  %conv.3 = sext i8 %3 to i16
+  %add.3 = add nsw i16 %add.2, %conv.3
+  %arrayidx.4 = getelementptr inbounds nuw i8, ptr %x, i64 4
+  %4 = load i8, ptr %arrayidx.4
+  %conv.4 = sext i8 %4 to i16
+  %add.4 = add nsw i16 %add.3, %conv.4
+  %arrayidx.5 = getelementptr inbounds nuw i8, ptr %x, i64 5
+  %5 = load i8, ptr %arrayidx.5
+  %conv.5 = sext i8 %5 to i16
+  %add.5 = add nsw i16 %add.4, %conv.5
+  %arrayidx.6 = getelementptr inbounds nuw i8, ptr %x, i64 6
+  %6 = load i8, ptr %arrayidx.6
+  %conv.6 = sext i8 %6 to i16
+  %add.6 = add nsw i16 %add.5, %conv.6
+  %arrayidx.7 = getelementptr inbounds nuw i8, ptr %x, i64 7
+  %7 = load i8, ptr %arrayidx.7
+  %conv.7 = sext i8 %7 to i16
+  %add.7 = add nsw i16 %add.6, %conv.7
+  %arrayidx.8 = getelementptr inbounds nuw i8, ptr %x, i64 8
+  %8 = load i8, ptr %arrayidx.8
+  %conv.8 = sext i8 %8 to i16
+  %add.8 = add nsw i16 %add.7, %conv.8
+  %arrayidx.9 = getelementptr inbounds nuw i8, ptr %x, i64 9
+  %9 = load i8, ptr %arrayidx.9
+  %conv.9 = sext i8 %9 to i16
+  %add.9 = add nsw i16 %add.8, %conv.9
+  %arrayidx.10 = getelementptr inbounds nuw i8, ptr %x, i64 10
+  %10 = load i8, ptr %arrayidx.10
+  %conv.10 = sext i8 %10 to i16
+  %add.10 = add nsw i16 %add.9, %conv.10
+  %arrayidx.11 = getelementptr inbounds nuw i8, ptr %x, i64 11
+  %11 = load i8, ptr %arrayidx.11
+  %conv.11 = sext i8 %11 to i16
+  %add.11 = add nsw i16 %add.10, %conv.11
+  %arrayidx.12 = getelementptr inbounds nuw i8, ptr %x, i64 12
+  %12 = load i8, ptr %arrayidx.12
+  %conv.12 = sext i8 %12 to i16
+  %add.12 = add nsw i16 %add.11, %conv.12
+  %arrayidx.13 = getelementptr inbounds nuw i8, ptr %x, i64 13
+  %13 = load i8, ptr %arrayidx.13
+  %conv.13 = sext i8 %13 to i16
+  %add.13 = add nsw i16 %add.12, %conv.13
+  %arrayidx.14 = getelementptr inbounds nuw i8, ptr %x, i64 14
+  %14 = load i8, ptr %arrayidx.14
+  %conv.14 = sext i8 %14 to i16
+  %add.14 = add nsw i16 %add.13, %conv.14
+  %arrayidx.15 = getelementptr inbounds nuw i8, ptr %x, i64 15
+  %15 = load i8, ptr %arrayidx.15
+  %conv.15 = sext i8 %15 to i16
+  %add.15 = add nsw i16 %add.14, %conv.15
+  %arrayidx.16 = getelementptr inbounds nuw i8, ptr %x, i64 16
+  %16 = load i8, ptr %arrayidx.16
+  %conv.16 = sext i8 %16 to i16
+  %add.16 = add nsw i16 %add.15, %conv.16
+  %arrayidx.17 = getelementptr inbounds nuw i8, ptr %x, i64 17
+  %17 = load i8, ptr %arrayidx.17
+  %conv.17 = sext i8 %17 to i16
+  %add.17 = add nsw i16 %add.16, %conv.17
+  %arrayidx.18 = getelementptr inbounds nuw i8, ptr %x, i64 18
+  %18 = load i8, ptr %arrayidx.18
+  %conv.18 = sext i8 %18 to i16
+  %add.18 = add nsw i16 %add.17, %conv.18
+  %arrayidx.19 = getelementptr inbounds nuw i8, ptr %x, i64 19
+  %19 = load i8, ptr %arrayidx.19
+  %conv.19 = sext i8 %19 to i16
+  %add.19 = add nsw i16 %add.18, %conv.19
+  %arrayidx.20 = getelementptr inbounds nuw i8, ptr %x, i64 20
+  %20 = load i8, ptr %arrayidx.20
+  %conv.20 = sext i8 %20 to i16
+  %add.20 = add nsw i16 %add.19, %conv.20
+  %arrayidx.21 = getelementptr inbounds nuw i8, ptr %x, i64 21
+  %21 = load i8, ptr %arrayidx.21
+  %conv.21 = sext i8 %21 to i16
+  %add.21 = add nsw i16 %add.20, %conv.21
+  %arrayidx.22 = getelementptr inbounds nuw i8, ptr %x, i64 22
+  %22 = load i8, ptr %arrayidx.22
+  %conv.22 = sext i8 %22 to i16
+  %add.22 = add nsw i16 %add.21, %conv.22
+  %arrayidx.23 = getelementptr inbounds nuw i8, ptr %x, i64 23
+  %23 = load i8, ptr %arrayidx.23
+  %conv.23 = sext i8 %23 to i16
+  %add.23 = add nsw i16 %add.22, %conv.23
+  %arrayidx.24 = getelementptr inbounds nuw i8, ptr %x, i64 24
+  %24 = load i8, ptr %arrayidx.24
+  %conv.24 = sext i8 %24 to i16
+  %add.24 = add nsw i16 %add.23, %conv.24
+  %arrayidx.25 = getelementptr inbounds nuw i8, ptr %x, i64 25
+  %25 = load i8, ptr %arrayidx.25
+  %conv.25 = sext i8 %25 to i16
+  %add.25 = add nsw i16 %add.24, %conv.25
+  %arrayidx.26 = getelementptr inbounds nuw i8, ptr %x, i64 26
+  %26 = load i8, ptr %arrayidx.26
+  %conv.26 = sext i8 %26 to i16
+  %add.26 = add nsw i16 %add.25, %conv.26
+  %arrayidx.27 = getelementptr inbounds nuw i8, ptr %x, i64 27
+  %27 = load i8, ptr %arrayidx.27
+  %conv.27 = sext i8 %27 to i16
+  %add.27 = add nsw i16 %add.26, %conv.27
+  %arrayidx.28 = getelementptr inbounds nuw i8, ptr %x, i64 28
+  %28 = load i8, ptr %arrayidx.28
+  %conv.28 = sext i8 %28 to i16
+  %add.28 = add nsw i16 %add.27, %conv.28
+  %arrayidx.29 = getelementptr inbounds nuw i8, ptr %x, i64 29
+  %29 = load i8, ptr %arrayidx.29
+  %conv.29 = sext i8 %29 to i16
+  %add.29 = add nsw i16 %add.28, %conv.29
+  %arrayidx.30 = getelementptr inbounds nuw i8, ptr %x, i64 30
+  %30 = load i8, ptr %arrayidx.30
+  %conv.30 = sext i8 %30 to i16
+  %add.30 = add nsw i16 %add.29, %conv.30
+  %arrayidx.31 = getelementptr inbounds nuw i8, ptr %x, i64 31
+  %31 = load i8, ptr %arrayidx.31
+  %conv.31 = sext i8 %31 to i16
+  %add.31 = add nsw i16 %add.30, %conv.31
+  ret i16 %add.31
+}
+
+
+
+
+
+
+; COST-LABEL: Function:  sext_v4i16_i32
+; COST: Cost:            '-4'
+define i32 @sext_v4i16_i32(ptr %x) {
+; CHECK-LABEL: @sext_v4i16_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[TMP2]]
+; The scalar body below loads 4 consecutive i16 values from %x, sign-extends each to i32 and sums them in one serial add chain. The expected output above is a single <4 x i16> load, one vector sext and one reduce.add call, and the cost line before the define expects a reported value of '-4'.
+entry:
+  %0 = load i16, ptr %x ; element 0 is read straight through %x
+  %conv = sext i16 %0 to i32
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 2 ; i8-typed GEP, so the index is a byte offset (2 bytes per i16 element)
+  %1 = load i16, ptr %arrayidx.1
+  %conv.1 = sext i16 %1 to i32
+  %add.1 = add nsw i32 %conv, %conv.1 ; first link of the serial add chain (elements 0 and 1)
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 4
+  %2 = load i16, ptr %arrayidx.2
+  %conv.2 = sext i16 %2 to i32
+  %add.2 = add nsw i32 %add.1, %conv.2
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 6
+  %3 = load i16, ptr %arrayidx.3
+  %conv.3 = sext i16 %3 to i32
+  %add.3 = add nsw i32 %add.2, %conv.3
+  ret i32 %add.3 ; sum of all 4 sign-extended elements
+}
+
+; COST-LABEL: Function:  sext_v8i16_i32
+; COST: Cost:            '-12'
+define i32 @sext_v8i16_i32(ptr %x) {
+; CHECK-LABEL: @sext_v8i16_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[TMP2]]
+; The scalar body below loads 8 consecutive i16 values from %x, sign-extends each to i32 and sums them in one serial add chain. The expected output above is a single <8 x i16> load, one vector sext and one reduce.add call, and the cost line before the define expects a reported value of '-12'.
+entry:
+  %0 = load i16, ptr %x ; element 0 is read straight through %x
+  %conv = sext i16 %0 to i32
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 2 ; i8-typed GEP, so the index is a byte offset (2 bytes per i16 element)
+  %1 = load i16, ptr %arrayidx.1
+  %conv.1 = sext i16 %1 to i32
+  %add.1 = add nsw i32 %conv, %conv.1 ; first link of the serial add chain (elements 0 and 1)
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 4
+  %2 = load i16, ptr %arrayidx.2
+  %conv.2 = sext i16 %2 to i32
+  %add.2 = add nsw i32 %add.1, %conv.2
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 6
+  %3 = load i16, ptr %arrayidx.3
+  %conv.3 = sext i16 %3 to i32
+  %add.3 = add nsw i32 %add.2, %conv.3
+  %arrayidx.4 = getelementptr inbounds nuw i8, ptr %x, i64 8
+  %4 = load i16, ptr %arrayidx.4
+  %conv.4 = sext i16 %4 to i32
+  %add.4 = add nsw i32 %add.3, %conv.4
+  %arrayidx.5 = getelementptr inbounds nuw i8, ptr %x, i64 10
+  %5 = load i16, ptr %arrayidx.5
+  %conv.5 = sext i16 %5 to i32
+  %add.5 = add nsw i32 %add.4, %conv.5
+  %arrayidx.6 = getelementptr inbounds nuw i8, ptr %x, i64 12
+  %6 = load i16, ptr %arrayidx.6
+  %conv.6 = sext i16 %6 to i32
+  %add.6 = add nsw i32 %add.5, %conv.6
+  %arrayidx.7 = getelementptr inbounds nuw i8, ptr %x, i64 14
+  %7 = load i16, ptr %arrayidx.7
+  %conv.7 = sext i16 %7 to i32
+  %add.7 = add nsw i32 %add.6, %conv.7
+  ret i32 %add.7 ; sum of all 8 sign-extended elements
+}
+
+; COST-LABEL: Function:  sext_v16i16_i32
+; COST: Cost:            '-25'
+define i32 @sext_v16i16_i32(ptr %x) {
+; CHECK-LABEL: @sext_v16i16_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i16>, ptr [[X:%.*]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <16 x i16> [[TMP0]] to <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[TMP2]]
+; The scalar body below loads 16 consecutive i16 values from %x, sign-extends each to i32 and sums them in one serial add chain. The expected output above is a single <16 x i16> load, one vector sext and one reduce.add call, and the cost line before the define expects a reported value of '-25'.
+entry:
+  %0 = load i16, ptr %x ; element 0 is read straight through %x
+  %conv = sext i16 %0 to i32
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 2 ; i8-typed GEP, so the index is a byte offset (2 bytes per i16 element)
+  %1 = load i16, ptr %arrayidx.1
+  %conv.1 = sext i16 %1 to i32
+  %add.1 = add nsw i32 %conv, %conv.1 ; first link of the serial add chain (elements 0 and 1)
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 4
+  %2 = load i16, ptr %arrayidx.2
+  %conv.2 = sext i16 %2 to i32
+  %add.2 = add nsw i32 %add.1, %conv.2
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 6
+  %3 = load i16, ptr %arrayidx.3
+  %conv.3 = sext i16 %3 to i32
+  %add.3 = add nsw i32 %add.2, %conv.3
+  %arrayidx.4 = getelementptr inbounds nuw i8, ptr %x, i64 8
+  %4 = load i16, ptr %arrayidx.4
+  %conv.4 = sext i16 %4 to i32
+  %add.4 = add nsw i32 %add.3, %conv.4
+  %arrayidx.5 = getelementptr inbounds nuw i8, ptr %x, i64 10
+  %5 = load i16, ptr %arrayidx.5
+  %conv.5 = sext i16 %5 to i32
+  %add.5 = add nsw i32 %add.4, %conv.5
+  %arrayidx.6 = getelementptr inbounds nuw i8, ptr %x, i64 12
+  %6 = load i16, ptr %arrayidx.6
+  %conv.6 = sext i16 %6 to i32
+  %add.6 = add nsw i32 %add.5, %conv.6
+  %arrayidx.7 = getelementptr inbounds nuw i8, ptr %x, i64 14
+  %7 = load i16, ptr %arrayidx.7
+  %conv.7 = sext i16 %7 to i32
+  %add.7 = add nsw i32 %add.6, %conv.7
+  %arrayidx.8 = getelementptr inbounds nuw i8, ptr %x, i64 16
+  %8 = load i16, ptr %arrayidx.8
+  %conv.8 = sext i16 %8 to i32
+  %add.8 = add nsw i32 %add.7, %conv.8
+  %arrayidx.9 = getelementptr inbounds nuw i8, ptr %x, i64 18
+  %9 = load i16, ptr %arrayidx.9
+  %conv.9 = sext i16 %9 to i32
+  %add.9 = add nsw i32 %add.8, %conv.9
+  %arrayidx.10 = getelementptr inbounds nuw i8, ptr %x, i64 20
+  %10 = load i16, ptr %arrayidx.10
+  %conv.10 = sext i16 %10 to i32
+  %add.10 = add nsw i32 %add.9, %conv.10
+  %arrayidx.11 = getelementptr inbounds nuw i8, ptr %x, i64 22
+  %11 = load i16, ptr %arrayidx.11
+  %conv.11 = sext i16 %11 to i32
+  %add.11 = add nsw i32 %add.10, %conv.11
+  %arrayidx.12 = getelementptr inbounds nuw i8, ptr %x, i64 24
+  %12 = load i16, ptr %arrayidx.12
+  %conv.12 = sext i16 %12 to i32
+  %add.12 = add nsw i32 %add.11, %conv.12
+  %arrayidx.13 = getelementptr inbounds nuw i8, ptr %x, i64 26
+  %13 = load i16, ptr %arrayidx.13
+  %conv.13 = sext i16 %13 to i32
+  %add.13 = add nsw i32 %add.12, %conv.13
+  %arrayidx.14 = getelementptr inbounds nuw i8, ptr %x, i64 28
+  %14 = load i16, ptr %arrayidx.14
+  %conv.14 = sext i16 %14 to i32
+  %add.14 = add nsw i32 %add.13, %conv.14
+  %arrayidx.15 = getelementptr inbounds nuw i8, ptr %x, i64 30
+  %15 = load i16, ptr %arrayidx.15
+  %conv.15 = sext i16 %15 to i32
+  %add.15 = add nsw i32 %add.14, %conv.15
+  ret i32 %add.15 ; sum of all 16 sign-extended elements
+}
+
+; COST-LABEL: Function:  sext_v32i16_i32
+; COST: Cost:            '-51'
+define i32 @sext_v32i16_i32(ptr %x) {
+; CHECK-LABEL: @sext_v32i16_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i16>, ptr [[X:%.*]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <32 x i16> [[TMP0]] to <32 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[TMP2]]
+; The scalar body below loads 32 consecutive i16 values from %x, sign-extends each to i32 and sums them in one serial add chain. The expected output above is a single <32 x i16> load, one vector sext and one reduce.add call, and the cost line before the define expects a reported value of '-51'.
+entry:
+  %0 = load i16, ptr %x ; element 0 is read straight through %x
+  %conv = sext i16 %0 to i32
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 2 ; i8-typed GEP, so the index is a byte offset (2 bytes per i16 element)
+  %1 = load i16, ptr %arrayidx.1
+  %conv.1 = sext i16 %1 to i32
+  %add.1 = add nsw i32 %conv, %conv.1 ; first link of the serial add chain (elements 0 and 1)
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 4
+  %2 = load i16, ptr %arrayidx.2
+  %conv.2 = sext i16 %2 to i32
+  %add.2 = add nsw i32 %add.1, %conv.2
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 6
+  %3 = load i16, ptr %arrayidx.3
+  %conv.3 = sext i16 %3 to i32
+  %add.3 = add nsw i32 %add.2, %conv.3
+  %arrayidx.4 = getelementptr inbounds nuw i8, ptr %x, i64 8
+  %4 = load i16, ptr %arrayidx.4
+  %conv.4 = sext i16 %4 to i32
+  %add.4 = add nsw i32 %add.3, %conv.4
+  %arrayidx.5 = getelementptr inbounds nuw i8, ptr %x, i64 10
+  %5 = load i16, ptr %arrayidx.5
+  %conv.5 = sext i16 %5 to i32
+  %add.5 = add nsw i32 %add.4, %conv.5
+  %arrayidx.6 = getelementptr inbounds nuw i8, ptr %x, i64 12
+  %6 = load i16, ptr %arrayidx.6
+  %conv.6 = sext i16 %6 to i32
+  %add.6 = add nsw i32 %add.5, %conv.6
+  %arrayidx.7 = getelementptr inbounds nuw i8, ptr %x, i64 14
+  %7 = load i16, ptr %arrayidx.7
+  %conv.7 = sext i16 %7 to i32
+  %add.7 = add nsw i32 %add.6, %conv.7
+  %arrayidx.8 = getelementptr inbounds nuw i8, ptr %x, i64 16
+  %8 = load i16, ptr %arrayidx.8
+  %conv.8 = sext i16 %8 to i32
+  %add.8 = add nsw i32 %add.7, %conv.8
+  %arrayidx.9 = getelementptr inbounds nuw i8, ptr %x, i64 18
+  %9 = load i16, ptr %arrayidx.9
+  %conv.9 = sext i16 %9 to i32
+  %add.9 = add nsw i32 %add.8, %conv.9
+  %arrayidx.10 = getelementptr inbounds nuw i8, ptr %x, i64 20
+  %10 = load i16, ptr %arrayidx.10
+  %conv.10 = sext i16 %10 to i32
+  %add.10 = add nsw i32 %add.9, %conv.10
+  %arrayidx.11 = getelementptr inbounds nuw i8, ptr %x, i64 22
+  %11 = load i16, ptr %arrayidx.11
+  %conv.11 = sext i16 %11 to i32
+  %add.11 = add nsw i32 %add.10, %conv.11
+  %arrayidx.12 = getelementptr inbounds nuw i8, ptr %x, i64 24
+  %12 = load i16, ptr %arrayidx.12
+  %conv.12 = sext i16 %12 to i32
+  %add.12 = add nsw i32 %add.11, %conv.12
+  %arrayidx.13 = getelementptr inbounds nuw i8, ptr %x, i64 26
+  %13 = load i16, ptr %arrayidx.13
+  %conv.13 = sext i16 %13 to i32
+  %add.13 = add nsw i32 %add.12, %conv.13
+  %arrayidx.14 = getelementptr inbounds nuw i8, ptr %x, i64 28
+  %14 = load i16, ptr %arrayidx.14
+  %conv.14 = sext i16 %14 to i32
+  %add.14 = add nsw i32 %add.13, %conv.14
+  %arrayidx.15 = getelementptr inbounds nuw i8, ptr %x, i64 30
+  %15 = load i16, ptr %arrayidx.15
+  %conv.15 = sext i16 %15 to i32
+  %add.15 = add nsw i32 %add.14, %conv.15
+  %arrayidx.16 = getelementptr inbounds nuw i8, ptr %x, i64 32
+  %16 = load i16, ptr %arrayidx.16
+  %conv.16 = sext i16 %16 to i32
+  %add.16 = add nsw i32 %add.15, %conv.16
+  %arrayidx.17 = getelementptr inbounds nuw i8, ptr %x, i64 34
+  %17 = load i16, ptr %arrayidx.17
+  %conv.17 = sext i16 %17 to i32
+  %add.17 = add nsw i32 %add.16, %conv.17
+  %arrayidx.18 = getelementptr inbounds nuw i8, ptr %x, i64 36
+  %18 = load i16, ptr %arrayidx.18
+  %conv.18 = sext i16 %18 to i32
+  %add.18 = add nsw i32 %add.17, %conv.18
+  %arrayidx.19 = getelementptr inbounds nuw i8, ptr %x, i64 38
+  %19 = load i16, ptr %arrayidx.19
+  %conv.19 = sext i16 %19 to i32
+  %add.19 = add nsw i32 %add.18, %conv.19
+  %arrayidx.20 = getelementptr inbounds nuw i8, ptr %x, i64 40
+  %20 = load i16, ptr %arrayidx.20
+  %conv.20 = sext i16 %20 to i32
+  %add.20 = add nsw i32 %add.19, %conv.20
+  %arrayidx.21 = getelementptr inbounds nuw i8, ptr %x, i64 42
+  %21 = load i16, ptr %arrayidx.21
+  %conv.21 = sext i16 %21 to i32
+  %add.21 = add nsw i32 %add.20, %conv.21
+  %arrayidx.22 = getelementptr inbounds nuw i8, ptr %x, i64 44
+  %22 = load i16, ptr %arrayidx.22
+  %conv.22 = sext i16 %22 to i32
+  %add.22 = add nsw i32 %add.21, %conv.22
+  %arrayidx.23 = getelementptr inbounds nuw i8, ptr %x, i64 46
+  %23 = load i16, ptr %arrayidx.23
+  %conv.23 = sext i16 %23 to i32
+  %add.23 = add nsw i32 %add.22, %conv.23
+  %arrayidx.24 = getelementptr inbounds nuw i8, ptr %x, i64 48
+  %24 = load i16, ptr %arrayidx.24
+  %conv.24 = sext i16 %24 to i32
+  %add.24 = add nsw i32 %add.23, %conv.24
+  %arrayidx.25 = getelementptr inbounds nuw i8, ptr %x, i64 50
+  %25 = load i16, ptr %arrayidx.25
+  %conv.25 = sext i16 %25 to i32
+  %add.25 = add nsw i32 %add.24, %conv.25
+  %arrayidx.26 = getelementptr inbounds nuw i8, ptr %x, i64 52
+  %26 = load i16, ptr %arrayidx.26
+  %conv.26 = sext i16 %26 to i32
+  %add.26 = add nsw i32 %add.25, %conv.26
+  %arrayidx.27 = getelementptr inbounds nuw i8, ptr %x, i64 54
+  %27 = load i16, ptr %arrayidx.27
+  %conv.27 = sext i16 %27 to i32
+  %add.27 = add nsw i32 %add.26, %conv.27
+  %arrayidx.28 = getelementptr inbounds nuw i8, ptr %x, i64 56
+  %28 = load i16, ptr %arrayidx.28
+  %conv.28 = sext i16 %28 to i32
+  %add.28 = add nsw i32 %add.27, %conv.28
+  %arrayidx.29 = getelementptr inbounds nuw i8, ptr %x, i64 58
+  %29 = load i16, ptr %arrayidx.29
+  %conv.29 = sext i16 %29 to i32
+  %add.29 = add nsw i32 %add.28, %conv.29
+  %arrayidx.30 = getelementptr inbounds nuw i8, ptr %x, i64 60
+  %30 = load i16, ptr %arrayidx.30
+  %conv.30 = sext i16 %30 to i32
+  %add.30 = add nsw i32 %add.29, %conv.30
+  %arrayidx.31 = getelementptr inbounds nuw i8, ptr %x, i64 62
+  %31 = load i16, ptr %arrayidx.31
+  %conv.31 = sext i16 %31 to i32
+  %add.31 = add nsw i32 %add.30, %conv.31
+  ret i32 %add.31 ; sum of all 32 sign-extended elements
+}
+
+
+
+
+
+
+
+; COST-LABEL: Function:  sext_v4i32_i64
+; COST: Cost:            '-4'
+define i64 @sext_v4i32_i64(ptr %x) {
+; CHECK-LABEL: @sext_v4i32_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i32> [[TMP0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+; The scalar body below loads 4 consecutive i32 values from %x, sign-extends each to i64 and sums them in one serial add chain. The expected output above is a single <4 x i32> load, one vector sext and one reduce.add call, and the cost line before the define expects a reported value of '-4'.
+entry:
+  %0 = load i32, ptr %x ; element 0 is read straight through %x
+  %conv = sext i32 %0 to i64
+  %arrayidx.1 = getelementptr inbounds nuw i32, ptr %x, i64 1 ; i32-typed GEP, so the index is an element index
+  %1 = load i32, ptr %arrayidx.1
+  %conv.1 = sext i32 %1 to i64
+  %add.1 = add nsw i64 %conv, %conv.1 ; first link of the serial add chain (elements 0 and 1)
+  %arrayidx.2 = getelementptr inbounds nuw i32, ptr %x, i64 2
+  %2 = load i32, ptr %arrayidx.2
+  %conv.2 = sext i32 %2 to i64
+  %add.2 = add nsw i64 %add.1, %conv.2
+  %arrayidx.3 = getelementptr inbounds nuw i32, ptr %x, i64 3
+  %3 = load i32, ptr %arrayidx.3
+  %conv.3 = sext i32 %3 to i64
+  %add.3 = add nsw i64 %add.2, %conv.3
+  ret i64 %add.3 ; sum of all 4 sign-extended elements
+}
+
+; COST-LABEL: Function:  sext_v8i32_i64
+; COST: Cost:            '-9'
+define i64 @sext_v8i32_i64(ptr %x) {
+; CHECK-LABEL: @sext_v8i32_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i32> [[TMP0]] to <8 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+; The scalar body below loads 8 consecutive i32 values from %x, sign-extends each to i64 and sums them in one serial add chain. The expected output above is a single <8 x i32> load, one vector sext and one reduce.add call, and the cost line before the define expects a reported value of '-9'.
+entry:
+  %0 = load i32, ptr %x ; element 0 is read straight through %x
+  %conv = sext i32 %0 to i64
+  %arrayidx.1 = getelementptr inbounds nuw i32, ptr %x, i64 1 ; i32-typed GEP, so the index is an element index
+  %1 = load i32, ptr %arrayidx.1
+  %conv.1 = sext i32 %1 to i64
+  %add.1 = add nsw i64 %conv, %conv.1 ; first link of the serial add chain (elements 0 and 1)
+  %arrayidx.2 = getelementptr inbounds nuw i32, ptr %x, i64 2
+  %2 = load i32, ptr %arrayidx.2
+  %conv.2 = sext i32 %2 to i64
+  %add.2 = add nsw i64 %add.1, %conv.2
+  %arrayidx.3 = getelementptr inbounds nuw i32, ptr %x, i64 3
+  %3 = load i32, ptr %arrayidx.3
+  %conv.3 = sext i32 %3 to i64
+  %add.3 = add nsw i64 %add.2, %conv.3
+  %arrayidx.4 = getelementptr inbounds nuw i32, ptr %x, i64 4
+  %4 = load i32, ptr %arrayidx.4
+  %conv.4 = sext i32 %4 to i64
+  %add.4 = add nsw i64 %add.3, %conv.4
+  %arrayidx.5 = getelementptr inbounds nuw i32, ptr %x, i64 5
+  %5 = load i32, ptr %arrayidx.5
+  %conv.5 = sext i32 %5 to i64
+  %add.5 = add nsw i64 %add.4, %conv.5
+  %arrayidx.6 = getelementptr inbounds nuw i32, ptr %x, i64 6
+  %6 = load i32, ptr %arrayidx.6
+  %conv.6 = sext i32 %6 to i64
+  %add.6 = add nsw i64 %add.5, %conv.6
+  %arrayidx.7 = getelementptr inbounds nuw i32, ptr %x, i64 7
+  %7 = load i32, ptr %arrayidx.7
+  %conv.7 = sext i32 %7 to i64
+  %add.7 = add nsw i64 %add.6, %conv.7
+  ret i64 %add.7 ; sum of all 8 sign-extended elements
+}
+
+; COST-LABEL: Function:  sext_v16i32_i64
+; COST: Cost:            '-19'
+define i64 @sext_v16i32_i64(ptr %x) {
+; CHECK-LABEL: @sext_v16i32_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <16 x i32> [[TMP0]] to <16 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP1]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+; The scalar body below loads 16 consecutive i32 values from %x, sign-extends each to i64 and sums them in one serial add chain. The expected output above is a single <16 x i32> load, one vector sext and one reduce.add call, and the cost line before the define expects a reported value of '-19'.
+entry:
+  %0 = load i32, ptr %x ; element 0 is read straight through %x
+  %conv = sext i32 %0 to i64
+  %arrayidx.1 = getelementptr inbounds nuw i32, ptr %x, i64 1 ; i32-typed GEP, so the index is an element index
+  %1 = load i32, ptr %arrayidx.1
+  %conv.1 = sext i32 %1 to i64
+  %add.1 = add nsw i64 %conv, %conv.1 ; first link of the serial add chain (elements 0 and 1)
+  %arrayidx.2 = getelementptr inbounds nuw i32, ptr %x, i64 2
+  %2 = load i32, ptr %arrayidx.2
+  %conv.2 = sext i32 %2 to i64
+  %add.2 = add nsw i64 %add.1, %conv.2
+  %arrayidx.3 = getelementptr inbounds nuw i32, ptr %x, i64 3
+  %3 = load i32, ptr %arrayidx.3
+  %conv.3 = sext i32 %3 to i64
+  %add.3 = add nsw i64 %add.2, %conv.3
+  %arrayidx.4 = getelementptr inbounds nuw i32, ptr %x, i64 4
+  %4 = load i32, ptr %arrayidx.4
+  %conv.4 = sext i32 %4 to i64
+  %add.4 = add nsw i64 %add.3, %conv.4
+  %arrayidx.5 = getelementptr inbounds nuw i32, ptr %x, i64 5
+  %5 = load i32, ptr %arrayidx.5
+  %conv.5 = sext i32 %5 to i64
+  %add.5 = add nsw i64 %add.4, %conv.5
+  %arrayidx.6 = getelementptr inbounds nuw i32, ptr %x, i64 6
+  %6 = load i32, ptr %arrayidx.6
+  %conv.6 = sext i32 %6 to i64
+  %add.6 = add nsw i64 %add.5, %conv.6
+  %arrayidx.7 = getelementptr inbounds nuw i32, ptr %x, i64 7
+  %7 = load i32, ptr %arrayidx.7
+  %conv.7 = sext i32 %7 to i64
+  %add.7 = add nsw i64 %add.6, %conv.7
+  %arrayidx.8 = getelementptr inbounds nuw i32, ptr %x, i64 8
+  %8 = load i32, ptr %arrayidx.8
+  %conv.8 = sext i32 %8 to i64
+  %add.8 = add nsw i64 %add.7, %conv.8
+  %arrayidx.9 = getelementptr inbounds nuw i32, ptr %x, i64 9
+  %9 = load i32, ptr %arrayidx.9
+  %conv.9 = sext i32 %9 to i64
+  %add.9 = add nsw i64 %add.8, %conv.9
+  %arrayidx.10 = getelementptr inbounds nuw i32, ptr %x, i64 10
+  %10 = load i32, ptr %arrayidx.10
+  %conv.10 = sext i32 %10 to i64
+  %add.10 = add nsw i64 %add.9, %conv.10
+  %arrayidx.11 = getelementptr inbounds nuw i32, ptr %x, i64 11
+  %11 = load i32, ptr %arrayidx.11
+  %conv.11 = sext i32 %11 to i64
+  %add.11 = add nsw i64 %add.10, %conv.11
+  %arrayidx.12 = getelementptr inbounds nuw i32, ptr %x, i64 12
+  %12 = load i32, ptr %arrayidx.12
+  %conv.12 = sext i32 %12 to i64
+  %add.12 = add nsw i64 %add.11, %conv.12
+  %arrayidx.13 = getelementptr inbounds nuw i32, ptr %x, i64 13
+  %13 = load i32, ptr %arrayidx.13
+  %conv.13 = sext i32 %13 to i64
+  %add.13 = add nsw i64 %add.12, %conv.13
+  %arrayidx.14 = getelementptr inbounds nuw i32, ptr %x, i64 14
+  %14 = load i32, ptr %arrayidx.14
+  %conv.14 = sext i32 %14 to i64
+  %add.14 = add nsw i64 %add.13, %conv.14
+  %arrayidx.15 = getelementptr inbounds nuw i32, ptr %x, i64 15
+  %15 = load i32, ptr %arrayidx.15
+  %conv.15 = sext i32 %15 to i64
+  %add.15 = add nsw i64 %add.14, %conv.15
+  ret i64 %add.15 ; sum of all 16 sign-extended elements
+}
+
+; COST-LABEL: Function:  sext_v32i32_i64
+; COST: Cost:            '-39'
+define i64 @sext_v32i32_i64(ptr %x) {
+; CHECK-LABEL: @sext_v32i32_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i32>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <32 x i32> [[TMP0]] to <32 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> [[TMP1]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+; The scalar body below loads 32 consecutive i32 values from %x, sign-extends each to i64 and sums them in one serial add chain. The expected output above is a single <32 x i32> load, one vector sext and one reduce.add call, and the cost line before the define expects a reported value of '-39'.
+entry:
+  %0 = load i32, ptr %x ; element 0 is read straight through %x
+  %conv = sext i32 %0 to i64
+  %arrayidx.1 = getelementptr inbounds nuw i32, ptr %x, i64 1 ; i32-typed GEP, so the index is an element index
+  %1 = load i32, ptr %arrayidx.1
+  %conv.1 = sext i32 %1 to i64
+  %add.1 = add nsw i64 %conv, %conv.1 ; first link of the serial add chain (elements 0 and 1)
+  %arrayidx.2 = getelementptr inbounds nuw i32, ptr %x, i64 2
+  %2 = load i32, ptr %arrayidx.2
+  %conv.2 = sext i32 %2 to i64
+  %add.2 = add nsw i64 %add.1, %conv.2
+  %arrayidx.3 = getelementptr inbounds nuw i32, ptr %x, i64 3
+  %3 = load i32, ptr %arrayidx.3
+  %conv.3 = sext i32 %3 to i64
+  %add.3 = add nsw i64 %add.2, %conv.3
+  %arrayidx.4 = getelementptr inbounds nuw i32, ptr %x, i64 4
+  %4 = load i32, ptr %arrayidx.4
+  %conv.4 = sext i32 %4 to i64
+  %add.4 = add nsw i64 %add.3, %conv.4
+  %arrayidx.5 = getelementptr inbounds nuw i32, ptr %x, i64 5
+  %5 = load i32, ptr %arrayidx.5
+  %conv.5 = sext i32 %5 to i64
+  %add.5 = add nsw i64 %add.4, %conv.5
+  %arrayidx.6 = getelementptr inbounds nuw i32, ptr %x, i64 6
+  %6 = load i32, ptr %arrayidx.6
+  %conv.6 = sext i32 %6 to i64
+  %add.6 = add nsw i64 %add.5, %conv.6
+  %arrayidx.7 = getelementptr inbounds nuw i32, ptr %x, i64 7
+  %7 = load i32, ptr %arrayidx.7
+  %conv.7 = sext i32 %7 to i64
+  %add.7 = add nsw i64 %add.6, %conv.7
+  %arrayidx.8 = getelementptr inbounds nuw i32, ptr %x, i64 8
+  %8 = load i32, ptr %arrayidx.8
+  %conv.8 = sext i32 %8 to i64
+  %add.8 = add nsw i64 %add.7, %conv.8
+  %arrayidx.9 = getelementptr inbounds nuw i32, ptr %x, i64 9
+  %9 = load i32, ptr %arrayidx.9
+  %conv.9 = sext i32 %9 to i64
+  %add.9 = add nsw i64 %add.8, %conv.9
+  %arrayidx.10 = getelementptr inbounds nuw i32, ptr %x, i64 10
+  %10 = load i32, ptr %arrayidx.10
+  %conv.10 = sext i32 %10 to i64
+  %add.10 = add nsw i64 %add.9, %conv.10
+  %arrayidx.11 = getelementptr inbounds nuw i32, ptr %x, i64 11
+  %11 = load i32, ptr %arrayidx.11
+  %conv.11 = sext i32 %11 to i64
+  %add.11 = add nsw i64 %add.10, %conv.11
+  %arrayidx.12 = getelementptr inbounds nuw i32, ptr %x, i64 12
+  %12 = load i32, ptr %arrayidx.12
+  %conv.12 = sext i32 %12 to i64
+  %add.12 = add nsw i64 %add.11, %conv.12
+  %arrayidx.13 = getelementptr inbounds nuw i32, ptr %x, i64 13
+  %13 = load i32, ptr %arrayidx.13
+  %conv.13 = sext i32 %13 to i64
+  %add.13 = add nsw i64 %add.12, %conv.13
+  %arrayidx.14 = getelementptr inbounds nuw i32, ptr %x, i64 14
+  %14 = load i32, ptr %arrayidx.14
+  %conv.14 = sext i32 %14 to i64
+  %add.14 = add nsw i64 %add.13, %conv.14
+  %arrayidx.15 = getelementptr inbounds nuw i32, ptr %x, i64 15
+  %15 = load i32, ptr %arrayidx.15
+  %conv.15 = sext i32 %15 to i64
+  %add.15 = add nsw i64 %add.14, %conv.15
+  %arrayidx.16 = getelementptr inbounds nuw i32, ptr %x, i64 16
+  %16 = load i32, ptr %arrayidx.16
+  %conv.16 = sext i32 %16 to i64
+  %add.16 = add nsw i64 %add.15, %conv.16
+  %arrayidx.17 = getelementptr inbounds nuw i32, ptr %x, i64 17
+  %17 = load i32, ptr %arrayidx.17
+  %conv.17 = sext i32 %17 to i64
+  %add.17 = add nsw i64 %add.16, %conv.17
+  %arrayidx.18 = getelementptr inbounds nuw i32, ptr %x, i64 18
+  %18 = load i32, ptr %arrayidx.18
+  %conv.18 = sext i32 %18 to i64
+  %add.18 = add nsw i64 %add.17, %conv.18
+  %arrayidx.19 = getelementptr inbounds nuw i32, ptr %x, i64 19
+  %19 = load i32, ptr %arrayidx.19
+  %conv.19 = sext i32 %19 to i64
+  %add.19 = add nsw i64 %add.18, %conv.19
+  %arrayidx.20 = getelementptr inbounds nuw i32, ptr %x, i64 20
+  %20 = load i32, ptr %arrayidx.20
+  %conv.20 = sext i32 %20 to i64
+  %add.20 = add nsw i64 %add.19, %conv.20
+  %arrayidx.21 = getelementptr inbounds nuw i32, ptr %x, i64 21
+  %21 = load i32, ptr %arrayidx.21
+  %conv.21 = sext i32 %21 to i64
+  %add.21 = add nsw i64 %add.20, %conv.21
+  %arrayidx.22 = getelementptr inbounds nuw i32, ptr %x, i64 22
+  %22 = load i32, ptr %arrayidx.22
+  %conv.22 = sext i32 %22 to i64
+  %add.22 = add nsw i64 %add.21, %conv.22
+  %arrayidx.23 = getelementptr inbounds nuw i32, ptr %x, i64 23
+  %23 = load i32, ptr %arrayidx.23
+  %conv.23 = sext i32 %23 to i64
+  %add.23 = add nsw i64 %add.22, %conv.23
+  %arrayidx.24 = getelementptr inbounds nuw i32, ptr %x, i64 24
+  %24 = load i32, ptr %arrayidx.24
+  %conv.24 = sext i32 %24 to i64
+  %add.24 = add nsw i64 %add.23, %conv.24
+  %arrayidx.25 = getelementptr inbounds nuw i32, ptr %x, i64 25
+  %25 = load i32, ptr %arrayidx.25
+  %conv.25 = sext i32 %25 to i64
+  %add.25 = add nsw i64 %add.24, %conv.25
+  %arrayidx.26 = getelementptr inbounds nuw i32, ptr %x, i64 26
+  %26 = load i32, ptr %arrayidx.26
+  %conv.26 = sext i32 %26 to i64
+  %add.26 = add nsw i64 %add.25, %conv.26
+  %arrayidx.27 = getelementptr inbounds nuw i32, ptr %x, i64 27
+  %27 = load i32, ptr %arrayidx.27
+  %conv.27 = sext i32 %27 to i64
+  %add.27 = add nsw i64 %add.26, %conv.27
+  %arrayidx.28 = getelementptr inbounds nuw i32, ptr %x, i64 28
+  %28 = load i32, ptr %arrayidx.28
+  %conv.28 = sext i32 %28 to i64
+  %add.28 = add nsw i64 %add.27, %conv.28
+  %arrayidx.29 = getelementptr inbounds nuw i32, ptr %x, i64 29
+  %29 = load i32, ptr %arrayidx.29
+  %conv.29 = sext i32 %29 to i64
+  %add.29 = add nsw i64 %add.28, %conv.29
+  %arrayidx.30 = getelementptr inbounds nuw i32, ptr %x, i64 30
+  %30 = load i32, ptr %arrayidx.30
+  %conv.30 = sext i32 %30 to i64
+  %add.30 = add nsw i64 %add.29, %conv.30
+  %arrayidx.31 = getelementptr inbounds nuw i32, ptr %x, i64 31
+  %31 = load i32, ptr %arrayidx.31
+  %conv.31 = sext i32 %31 to i64
+  %add.31 = add nsw i64 %add.30, %conv.31
+  ret i64 %add.31 ; sum of all 32 sign-extended elements
+}
+
+
+
+; COST-LABEL: Function:  mla_v4i8_i32
+; COST: Cost:            '-6'
+define i32 @mla_v4i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" {
+; CHECK-LABEL: @mla_v4i8_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i8> [[TMP0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    ret i32 [[TMP5]]
+; The scalar body below reads 4 consecutive i8 values from each of %x and %y, sign-extends them to i32, multiplies them pairwise and sums the products serially. The expected output above is two <4 x i8> loads, two vector sexts, one vector mul and one reduce.add call, and the cost line before the define expects a reported value of '-6'. NOTE(review) the +dotprod target feature is presumably set so a dot-product lowering can be assumed when costing - confirm against the RUN lines.
+entry:
+  %0 = load i8, ptr %x ; element 0 of each input is read straight through the pointer argument
+  %conv = sext i8 %0 to i32
+  %1 = load i8, ptr %y
+  %conv3 = sext i8 %1 to i32
+  %mul = mul nsw i32 %conv3, %conv ; product for element 0, computed on the sign-extended values
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 1
+  %2 = load i8, ptr %arrayidx.1
+  %conv.1 = sext i8 %2 to i32
+  %arrayidx2.1 = getelementptr inbounds nuw i8, ptr %y, i64 1
+  %3 = load i8, ptr %arrayidx2.1
+  %conv3.1 = sext i8 %3 to i32
+  %mul.1 = mul nsw i32 %conv3.1, %conv.1
+  %add.1 = add nsw i32 %mul.1, %mul ; first link of the serial accumulation of products
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 2
+  %4 = load i8, ptr %arrayidx.2
+  %conv.2 = sext i8 %4 to i32
+  %arrayidx2.2 = getelementptr inbounds nuw i8, ptr %y, i64 2
+  %5 = load i8, ptr %arrayidx2.2
+  %conv3.2 = sext i8 %5 to i32
+  %mul.2 = mul nsw i32 %conv3.2, %conv.2
+  %add.2 = add nsw i32 %mul.2, %add.1
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 3
+  %6 = load i8, ptr %arrayidx.3
+  %conv.3 = sext i8 %6 to i32
+  %arrayidx2.3 = getelementptr inbounds nuw i8, ptr %y, i64 3
+  %7 = load i8, ptr %arrayidx2.3
+  %conv3.3 = sext i8 %7 to i32
+  %mul.3 = mul nsw i32 %conv3.3, %conv.3
+  %add.3 = add nsw i32 %mul.3, %add.2
+  ret i32 %add.3 ; sum of all 4 products
+}
+
+
+; COST-LABEL: Function:  mla_v8i8_i32
+; COST: Cost:            '-18'
+define i32 @mla_v8i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" {
+; CHECK-LABEL: @mla_v8i8_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[X:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[Y:%.*]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT:    ret i32 [[TMP5]]
+; The scalar body below reads 8 consecutive i8 values from each of %x and %y, sign-extends them to i32, multiplies them pairwise and sums the products serially. The expected output above is two <8 x i8> loads, two vector sexts, one vector mul and one reduce.add call, and the cost line before the define expects a reported value of '-18'. NOTE(review) the +dotprod target feature is presumably set so a dot-product lowering can be assumed when costing - confirm against the RUN lines.
+entry:
+  %0 = load i8, ptr %x ; element 0 of each input is read straight through the pointer argument
+  %conv = sext i8 %0 to i32
+  %1 = load i8, ptr %y
+  %conv3 = sext i8 %1 to i32
+  %mul = mul nsw i32 %conv3, %conv ; product for element 0, computed on the sign-extended values
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 1
+  %2 = load i8, ptr %arrayidx.1
+  %conv.1 = sext i8 %2 to i32
+  %arrayidx2.1 = getelementptr inbounds nuw i8, ptr %y, i64 1
+  %3 = load i8, ptr %arrayidx2.1
+  %conv3.1 = sext i8 %3 to i32
+  %mul.1 = mul nsw i32 %conv3.1, %conv.1
+  %add.1 = add nsw i32 %mul.1, %mul ; first link of the serial accumulation of products
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 2
+  %4 = load i8, ptr %arrayidx.2
+  %conv.2 = sext i8 %4 to i32
+  %arrayidx2.2 = getelementptr inbounds nuw i8, ptr %y, i64 2
+  %5 = load i8, ptr %arrayidx2.2
+  %conv3.2 = sext i8 %5 to i32
+  %mul.2 = mul nsw i32 %conv3.2, %conv.2
+  %add.2 = add nsw i32 %mul.2, %add.1
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 3
+  %6 = load i8, ptr %arrayidx.3
+  %conv.3 = sext i8 %6 to i32
+  %arrayidx2.3 = getelementptr inbounds nuw i8, ptr %y, i64 3
+  %7 = load i8, ptr %arrayidx2.3
+  %conv3.3 = sext i8 %7 to i32
+  %mul.3 = mul nsw i32 %conv3.3, %conv.3
+  %add.3 = add nsw i32 %mul.3, %add.2
+  %arrayidx.4 = getelementptr inbounds nuw i8, ptr %x, i64 4
+  %8 = load i8, ptr %arrayidx.4
+  %conv.4 = sext i8 %8 to i32
+  %arrayidx2.4 = getelementptr inbounds nuw i8, ptr %y, i64 4
+  %9 = load i8, ptr %arrayidx2.4
+  %conv3.4 = sext i8 %9 to i32
+  %mul.4 = mul nsw i32 %conv3.4, %conv.4
+  %add.4 = add nsw i32 %mul.4, %add.3
+  %arrayidx.5 = getelementptr inbounds nuw i8, ptr %x, i64 5
+  %10 = load i8, ptr %arrayidx.5
+  %conv.5 = sext i8 %10 to i32
+  %arrayidx2.5 = getelementptr inbounds nuw i8, ptr %y, i64 5
+  %11 = load i8, ptr %arrayidx2.5
+  %conv3.5 = sext i8 %11 to i32
+  %mul.5 = mul nsw i32 %conv3.5, %conv.5
+  %add.5 = add nsw i32 %mul.5, %add.4
+  %arrayidx.6 = getelementptr inbounds nuw i8, ptr %x, i64 6
+  %12 = load i8, ptr %arrayidx.6
+  %conv.6 = sext i8 %12 to i32
+  %arrayidx2.6 = getelementptr inbounds nuw i8, ptr %y, i64 6
+  %13 = load i8, ptr %arrayidx2.6
+  %conv3.6 = sext i8 %13 to i32
+  %mul.6 = mul nsw i32 %conv3.6, %conv.6
+  %add.6 = add nsw i32 %mul.6, %add.5
+  %arrayidx.7 = getelementptr inbounds nuw i8, ptr %x, i64 7
+  %14 = load i8, ptr %arrayidx.7
+  %conv.7 = sext i8 %14 to i32
+  %arrayidx2.7 = getelementptr inbounds nuw i8, ptr %y, i64 7
+  %15 = load i8, ptr %arrayidx2.7
+  %conv3.7 = sext i8 %15 to i32
+  %mul.7 = mul nsw i32 %conv3.7, %conv.7
+  %add.7 = add nsw i32 %mul.7, %add.6
+  ret i32 %add.7 ; sum of all 8 products
+}
+
+
+; COST-LABEL: Function:  mla_v16i8_i32
+; COST: Cost:            '-40'
+define i32 @mla_v16i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" {
+; CHECK-LABEL: @mla_v16i8_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <16 x i8> [[TMP0]] to <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr [[Y:%.*]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
+; CHECK-NEXT:    ret i32 [[TMP5]]
+;
+entry:
+  %0 = load i8, ptr %x
+  %conv = sext i8 %0 to i32
+  %1 = load i8, ptr %y
+  %conv3 = sext i8 %1 to i32
+  %mul = mul nsw i32 %conv3, %conv
+  %arrayidx.1 = getelementptr inbounds nuw i8, ptr %x, i64 1
+  %2 = load i8, ptr %arrayidx.1
+  %conv.1 = sext i8 %2 to i32
+  %arrayidx2.1 = getelementptr inbounds nuw i8, ptr %y, i64 1
+  %3 = load i8, ptr %arrayidx2.1
+  %conv3.1 = sext i8 %3 to i32
+  %mul.1 = mul nsw i32 %conv3.1, %conv.1
+  %add.1 = add nsw i32 %mul.1, %mul
+  %arrayidx.2 = getelementptr inbounds nuw i8, ptr %x, i64 2
+  %4 = load i8, ptr %arrayidx.2
+  %conv.2 = sext i8 %4 to i32
+  %arrayidx2.2 = getelementptr inbounds nuw i8, ptr %y, i64 2
+  %5 = load i8, ptr %arrayidx2.2
+  %conv3.2 = sext i8 %5 to i32
+  %mul.2 = mul nsw i32 %conv3.2, %conv.2
+  %add.2 = add nsw i32 %mul.2, %add.1
+  %arrayidx.3 = getelementptr inbounds nuw i8, ptr %x, i64 3
+  %6 = load i8, ptr %arrayidx.3
+  %conv.3 = sext i8 %6 to i32
+  %arrayidx2.3 = getelementptr inbounds nuw i8, ptr %y, i64 3
+  %7 = load i8, ptr %arrayidx2.3
+  %conv3.3 = sext i8 %7 to i32
+  %mul.3 = mul nsw i32 %conv3.3, %conv.3
+  %add.3 = add nsw i32 %mul.3, %add.2
+  %arrayidx.4 = getelementptr inbounds nuw i8, ptr %x, i64 4
+  %8 = load i8, ptr %arrayidx.4
+  %conv.4 = sext i8 %8 to i32
+  %arrayidx2.4 = getelementptr inbounds nuw i8, ptr %y, i64 4
+  %9 = load i8, ptr %arrayidx2.4
+  %conv3.4 = sext i8 %9 to i32
+  %mul.4 = mul nsw i32 %conv3.4, %conv.4
+  %add.4 = add nsw i32 %mul.4, %add.3
+  %arrayidx.5 = getelementptr inbounds nuw i8, ptr %x, i64 5
+  %10 = load i8, ptr %arrayidx.5
+  %conv.5 = sext i8 %10 to i32
+  %arrayidx2.5 = getelementptr inbounds nuw i8, ptr %y, i64 5
+  %11 = load i8, ptr %arrayidx2.5
+  %conv3.5 = sext i8 %11 to i32
+  %mul.5 = mul nsw i32 %conv3.5, %conv.5
+  %add.5 = add nsw i32 %mul.5, %add.4
+  %arrayidx.6 = getelementptr inbounds nuw i8, ptr %x, i64 6
+  %12 = load i8, ptr %arrayidx.6
+  %conv.6 = sext i8 %12 to i32
+  %arrayidx2.6 = getelementptr inbounds nuw i8, ptr %y, i64 6
+  %13 = load i8, ptr %arrayidx2.6
+  %conv3.6 = sext i8 %13 to i32
+  %mul.6 = mul nsw i32 %conv3.6, %conv.6
+  %add.6 = add nsw i32 %mul.6, %add.5
+  %arrayidx.7 = getelementptr inbounds nuw i8, ptr %x, i64 7
+  %14 = load i8, ptr %arrayidx.7
+  %conv.7 = sext i8 %14 to i32
+  %arrayidx2.7 = getelementptr inbounds nuw i8, ptr %y, i64 7
+  %15 = load i8, ptr %arrayidx2.7
+  %conv3.7 = sext i8 %15 to i32
+  %mul.7 = mul nsw i32 %conv3.7, %conv.7
+  %add.7 = add nsw i32 %mul.7, %add.6
+  %arrayidx.8 = getelementptr inbounds nuw i8, ptr %x, i64 8
+  %16 = load i8, ptr %arrayidx.8
+  %conv.8 = sext i8 %16 to i32
+  %arrayidx2.8 = getelementptr inbounds nuw i8, ptr %y, i64 8
+  %17 = load i8, ptr %arrayidx2.8
+  %conv3.8 = sext i8 %17 to i32
+  %mul.8 = mul nsw i32 %conv3.8, %conv.8
+  %add.8 = add nsw i32 %mul.8, %add.7
+  %arrayidx.9 = getelementptr inbounds nuw i8, ptr %x, i64 9
+  %18 = load i8, ptr %arrayidx.9
+  %conv.9 = sext i8 %18 to i32
+  %arrayidx2.9 = getelementptr inbounds nuw i8, ptr %y, i64 9
+  %19 = load i8, ptr %arrayidx2.9
+  %conv3.9 = sext i8 %19 to i32
+  %mul.9 = mul nsw i32 %conv3.9, %conv.9
+  %add.9 = add nsw i32 %mul.9, %add.8
+  %arrayidx.10 = getelementptr inbounds nuw i8, ptr %x, i64 10
+  %20 = load i8, ptr %arrayidx.10
+  %conv.10 = sext i8 %20 to i32
+  %arrayidx2.10 = getelementptr inbounds nuw i8, ptr %y, i64 10
+  %21 = load i8, ptr %arrayidx2.10
+  %conv3.10 = sext i8 %21 to i32
+  %mul.10 = mul nsw i32 %conv3.10, %conv.10
+  %add.10 = add nsw i32 %mul.10, %add.9
+  %arrayidx.11 = getelementptr inbounds nuw i8, ptr %x, i64 11
+  %22 = load i8, ptr %arrayidx.11
+  %conv.11 = sext i8 %22 to i32
+  %arrayidx2.11 = getelementptr inbounds nuw i8, ptr %y, i64 11
+  %23 = load i8, ptr %arrayidx2.11
+  %conv3.11 = sext i8 %23 to i32
+  %mul.11 = mul nsw i32 %conv3.11, %conv.11
+  %add.11 = add nsw i32 %mul.11, %add.10
+  %arrayidx.12 = getelementptr inbounds nuw i8, ptr %x, i64 12
+  %24 = load i8, ptr %arrayidx.12
+  %conv.12 = sext i8 %24 to i32
+  %arrayidx2.12 = getelementptr inbounds nuw i8, ptr %y, i64 12
+  %25 = load i8, ptr %arrayidx2.12
+  %conv3.12 = sext i8 %25 to i32
+  %mul.12 = mul nsw i32 %conv3.12, %conv.12
+  %add.12 = add nsw i32 %mul.12, %add.11
+  %arrayidx.13 = getelementptr inbounds nuw i8, ptr %x, i64 13
+  %26 = load i8, ptr %arrayidx.13
+  %conv.13 = sext i8 %26 to i32
+  %arrayidx2.13 = getelementptr inbounds nuw i8, ptr %y, i64 13
+  %27 = load i8, ptr %arrayidx2.13
+  %conv3.13 = sext i8 %27 to i32
+  %mul.13 = mul nsw i32 %conv3.13, %conv.13
+  %add.13 = add nsw i32 %mul.13, %add.12
+  %arrayidx.14 = getelementptr inbounds nuw i8, ptr %x, i64 14
+  %28 = load i8, ptr %arrayidx.14
+  %conv.14 = sext i8 %28 to i32
+  %arrayidx2.14 = getelementptr inbounds nuw i8, ptr %y, i64 14
+  %29 = load i8, ptr %arrayidx2.14
+  %conv3.14 = sext i8 %29 to i32
+  %mul.14 = mul nsw i32 %conv3.14, %conv.14
+  %add.14 = add nsw i32 %mul.14, %add.13
+  %arrayidx.15 = getelementptr inbounds nuw i8, ptr %x, i64 15
+  %30 = load i8, ptr %arrayidx.15
+  %conv.15 = sext i8 %30 to i32
+  %arrayidx2.15 = getelementptr inbounds nuw i8, ptr %y, i64 15
+  %31 = load i8, ptr %arrayidx2.15
+  %conv3.15 = sext i8 %31 to i32
+  %mul.15 = mul nsw i32 %conv3.15, %conv.15
+  %add.15 = add nsw i32 %mul.15, %add.14
+  ret i32 %add.15
+}

From e5f5517f9121083dbb9d2841b607f504cbbb490b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 15 Feb 2025 21:54:16 +0100
Subject: [PATCH 038/109] [VPlan] Create IR basic block for middle.block in
 VPlan.

Create an IR BB directly for the middle.block, instead of creating the IR
BB during skeleton creation and then replacing the middle VPBB with a
VPIRBB.

This moves another part of skeleton creation to VPlan and simplifies
the code slightly by removing code to disconnect the middle block and
vector preheader + the corresponding DT update.

NFC modulo IR block naming and block creation order, which changes the
IR names for the blocks.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   9 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  10 +-
 .../AArch64/simple_early_exit.ll              |  38 +++---
 .../LoopVectorize/AArch64/vplan-printing.ll   |   6 +-
 .../RISCV/riscv-vector-reverse.ll             |   8 +-
 .../LoopVectorize/X86/scatter_crash.ll        |  62 +++++-----
 .../LoopVectorize/X86/small-size.ll           |  48 ++++----
 .../pr59319-loop-access-info-invalidation.ll  |  42 +++----
 llvm/test/Transforms/LoopVectorize/pr66616.ll |  26 ++---
 .../scev-exit-phi-invalidation.ll             |  30 ++---
 .../LoopVectorize/single_early_exit.ll        |  18 +--
 .../single_early_exit_live_outs.ll            | 108 +++++++++---------
 .../single_early_exit_with_outer_loop.ll      |   6 +-
 .../LoopVectorize/vplan-predicate-switch.ll   |   4 +-
 .../vplan-printing-before-execute.ll          |   4 +-
 15 files changed, 204 insertions(+), 215 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 584cda34f902e..2cdb87fdd3f8d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2681,13 +2681,9 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
           Cost->requiresScalarEpilogue(VF.isVector())) &&
          "loops not exiting via the latch without required epilogue?");
 
-  LoopMiddleBlock =
-      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
-                 LI, nullptr, Twine(Prefix) + "middle.block");
-  replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock);
   LoopScalarPreHeader =
-      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
-                 nullptr, Twine(Prefix) + "scalar.ph");
+      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+                 LI, nullptr, Twine(Prefix) + "scalar.ph");
   replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
 }
 
@@ -10761,6 +10757,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         // to be vectorized by executing the plan (potentially with a different
         // factor) again shortly afterwards.
         VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
+        BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
         preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
                                           BestEpiPlan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 5a88ebeffb18b..b0cac4f78ff3c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -987,18 +987,10 @@ void VPlan::execute(VPTransformState *State) {
   setName("Final VPlan");
   LLVM_DEBUG(dump());
 
-  // Disconnect the middle block from its single successor (the scalar loop
-  // header) in both the CFG and DT. The branch will be recreated during VPlan
-  // execution.
-  BasicBlock *MiddleBB = State->CFG.ExitBB;
-  BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
-  auto *BrInst = new UnreachableInst(MiddleBB->getContext());
-  BrInst->insertBefore(MiddleBB->getTerminator()->getIterator());
-  MiddleBB->getTerminator()->eraseFromParent();
-  State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
   // Disconnect scalar preheader and scalar header, as the dominator tree edge
   // will be updated as part of VPlan execution. This allows keeping the DTU
   // logic generic during VPlan execution.
+  BasicBlock *ScalarPh = State->CFG.ExitBB;
   State->CFG.DTU.applyUpdates(
       {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
index 3982ed6dd26ab..5f926db1131f6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -53,13 +53,13 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
 ; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP16]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <vscale x 16 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -136,12 +136,12 @@ define i64 @same_exit_block_pre_inc_use4() {
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <2 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -214,12 +214,12 @@ define i64 @loop_contains_safe_call() #1 {
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -309,13 +309,13 @@ define i64 @loop_contains_safe_div() #1 {
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[INDEX1]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP15]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <vscale x 4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 64, [[INDEX1]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[OFFSET_IDX]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -395,12 +395,12 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[WIDE_LOAD2]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -495,11 +495,11 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) {
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP15]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
-; CHECK:       vector.early.exit:
-; CHECK-NEXT:    br label [[FOUND:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[FOUND:%.*]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 32ecedc535b4d..a880bea2c52d1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -103,9 +103,9 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT:     EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024>
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): ir-bb<middle.block>
+; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<middle.block>:
+; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:   EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add>
 ; CHECK-NEXT:   EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
 ; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, ir<1024>
@@ -113,7 +113,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]> from ir-bb<middle.block>)
+; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[EXTRACT]]> from middle.block)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<scalar.ph>:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index f630f4f21e065..ebb5d46cd8438 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -209,9 +209,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
-; CHECK-NEXT:  Successor(s): ir-bb<middle.block>
+; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
-; CHECK-NEXT:  ir-bb<middle.block>:
+; CHECK-NEXT:  middle.block:
 ; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VEC_TC]]>
 ; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
@@ -458,9 +458,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
-; CHECK-NEXT:  Successor(s): ir-bb<middle.block>
+; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
-; CHECK-NEXT:  ir-bb<middle.block>:
+; CHECK-NEXT:  middle.block:
 ; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VEC_TC]]>
 ; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 58d3ead2d2919..6bed344d4d73b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -26,7 +26,7 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[TMP2]], 4063299859190
 ; CHECK-NEXT:    [[TOBOOL6:%.*]] = icmp eq i64 [[MUL]], 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP0]] to i64
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[ITER_CHECK27:%.*]], label [[ITER_CHECK:%.*]]
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[ITER_CHECK22:%.*]], label [[ITER_CHECK:%.*]]
 ; CHECK:       iter.check:
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[TMP3]], -9
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP4]], 1
@@ -112,16 +112,16 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ 8, [[ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       iter.check23:
+; CHECK:       iter.check22:
 ; CHECK-NEXT:    [[TMP26:%.*]] = add nsw i64 [[TMP3]], -9
 ; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP26]], 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = add nuw i64 [[TMP27]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK25:%.*]] = icmp ult i64 [[TMP28]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK25]], label [[VEC_EPILOG_SCALAR_PH41:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK25:%.*]]
-; CHECK:       vector.main.loop.iter.check25:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK25]], label [[VEC_EPILOG_SCALAR_PH40:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK24:%.*]]
+; CHECK:       vector.main.loop.iter.check24:
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK28:%.*]] = icmp ult i64 [[TMP28]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK28]], label [[VEC_EPILOG_PH42:%.*]], label [[VECTOR_PH30:%.*]]
-; CHECK:       vector.ph26:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK28]], label [[VEC_EPILOG_PH42:%.*]], label [[VECTOR_PH25:%.*]]
+; CHECK:       vector.ph25:
 ; CHECK-NEXT:    [[N_MOD_VF31:%.*]] = urem i64 [[TMP28]], 16
 ; CHECK-NEXT:    [[N_VEC32:%.*]] = sub i64 [[TMP28]], [[N_MOD_VF31]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[N_VEC32]], 2
@@ -131,10 +131,10 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label [[VECTOR_BODY29:%.*]]
-; CHECK:       vector.body29:
-; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH30]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY29]] ]
-; CHECK-NEXT:    [[VEC_IND35:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY29]] ]
-; CHECK-NEXT:    [[VEC_IND37:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH30]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY29]] ]
+; CHECK:       vector.body28:
+; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH25]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY29]] ]
+; CHECK-NEXT:    [[VEC_IND35:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH25]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY29]] ]
+; CHECK-NEXT:    [[VEC_IND37:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH25]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY29]] ]
 ; CHECK-NEXT:    [[TMP30:%.*]] = sub nsw <16 x i64> splat (i64 8), [[VEC_IND35]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <16 x i64> [[VEC_IND35]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = add nsw <16 x i64> [[TMP30]], [[VEC_IND37]]
@@ -153,21 +153,21 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT36]] = add <16 x i64> [[VEC_IND35]], splat (i64 32)
 ; CHECK-NEXT:    [[VEC_IND_NEXT38]] = add <16 x i64> [[VEC_IND37]], splat (i64 32)
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC32]]
-; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK20:%.*]], label [[VECTOR_BODY29]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block20:
+; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK35:%.*]], label [[VECTOR_BODY29]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block35:
 ; CHECK-NEXT:    [[CMP_N40:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC32]]
 ; CHECK-NEXT:    br i1 [[CMP_N40]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK43:%.*]]
-; CHECK:       vec.epilog.iter.check43:
+; CHECK:       vec.epilog.iter.check42:
 ; CHECK-NEXT:    [[TMP42:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[IND_END55:%.*]] = add i64 8, [[TMP42]]
 ; CHECK-NEXT:    [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[N_VEC_REMAINING49:%.*]] = sub i64 [[TMP28]], [[N_VEC32]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_VEC_REMAINING49]], 8
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH41]], label [[VEC_EPILOG_PH42]]
-; CHECK:       vec.epilog.ph42:
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK25]] ]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH40]], label [[VEC_EPILOG_PH42]]
+; CHECK:       vec.epilog.ph41:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK24]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK24]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK24]] ]
 ; CHECK-NEXT:    [[N_MOD_VF52:%.*]] = urem i64 [[TMP28]], 8
 ; CHECK-NEXT:    [[N_VEC53:%.*]] = sub i64 [[TMP28]], [[N_MOD_VF52]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[N_VEC53]], 2
@@ -182,11 +182,11 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[DOTSPLATINSERT67:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL44]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY50:%.*]]
-; CHECK:       vec.epilog.vector.body52:
-; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
-; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
-; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY51:%.*]]
+; CHECK:       vec.epilog.vector.body51:
+; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY51]] ]
+; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY51]] ]
+; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY51]] ]
 ; CHECK-NEXT:    [[TMP44:%.*]] = sub nsw <8 x i64> splat (i64 8), [[VEC_IND65]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <8 x i64> [[VEC_IND65]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = add nsw <8 x i64> [[TMP44]], [[VEC_IND70]]
@@ -205,17 +205,17 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT66]] = add <8 x i64> [[VEC_IND65]], splat (i64 16)
 ; CHECK-NEXT:    [[VEC_IND_NEXT71]] = add <8 x i64> [[VEC_IND70]], splat (i64 16)
 ; CHECK-NEXT:    [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT74]], [[N_VEC53]]
-; CHECK-NEXT:    br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY50]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       vec.epilog.middle.block40:
+; CHECK-NEXT:    br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY51]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       vec.epilog.middle.block64:
 ; CHECK-NEXT:    [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
-; CHECK-NEXT:    br i1 [[CMP_N65]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH41]]
-; CHECK:       vec.epilog.scalar.ph41:
-; CHECK-NEXT:    [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[ITER_CHECK27]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL67:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[ITER_CHECK27]] ]
+; CHECK-NEXT:    br i1 [[CMP_N65]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH40]]
+; CHECK:       vec.epilog.scalar.ph40:
+; CHECK-NEXT:    [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[ITER_CHECK22]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL67:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[ITER_CHECK22]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
 ; CHECK:       for.body.us:
-; CHECK-NEXT:    [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL56]], [[VEC_EPILOG_SCALAR_PH41]] ]
-; CHECK-NEXT:    [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL67]], [[VEC_EPILOG_SCALAR_PH41]] ]
+; CHECK-NEXT:    [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL56]], [[VEC_EPILOG_SCALAR_PH40]] ]
+; CHECK-NEXT:    [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL67]], [[VEC_EPILOG_SCALAR_PH40]] ]
 ; CHECK-NEXT:    [[TMP56:%.*]] = sub nsw i64 8, [[INDVARS_IV78]]
 ; CHECK-NEXT:    [[ADD_PTR_US:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, i64 [[INDVARS_IV78]]
 ; CHECK-NEXT:    [[TMP57:%.*]] = add nsw i64 [[TMP56]], [[INDVARS_IV70]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index c9132bab80f19..8914edf28372f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -133,25 +133,25 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[N]], 0
 ; CHECK-NEXT:    br i1 [[TMP16]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH_PREHEADER:%.*]]
 ; CHECK:       .lr.ph.preheader:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH8:%.*]], label [[VECTOR_PH9:%.*]]
-; CHECK:       vector.ph9:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH8:%.*]], label [[VECTOR_PH8:%.*]]
+; CHECK:       vector.ph8:
 ; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[N]] to i64
 ; CHECK-NEXT:    [[N_RND_UP10:%.*]] = add nuw nsw i64 [[TMP17]], 3
 ; CHECK-NEXT:    [[N_VEC12:%.*]] = and i64 [[N_RND_UP10]], 8589934588
 ; CHECK-NEXT:    [[TRIP_COUNT_MINUS_114:%.*]] = add nsw i64 [[TMP17]], -1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_114]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT19]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY15:%.*]]
-; CHECK:       vector.body14:
-; CHECK-NEXT:    [[INDEX16:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT29:%.*]], [[PRED_STORE_CONTINUE78:%.*]] ]
+; CHECK-NEXT:    br label [[VECTOR_BODY13:%.*]]
+; CHECK:       vector.body13:
+; CHECK-NEXT:    [[INDEX16:%.*]] = phi i64 [ 0, [[VECTOR_PH8]] ], [ [[INDEX_NEXT29:%.*]], [[PRED_STORE_CONTINUE26:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX16]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX16]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT18]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT20]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP18]], i64 0
-; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
-; CHECK:       pred.store.if20:
+; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK:       pred.store.if19:
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], ptr @c, i64 0, i64 [[OFFSET_IDX]]
@@ -159,11 +159,11 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = and i32 [[TMP23]], [[TMP21]]
 ; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP24]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
-; CHECK:       pred.store.continue21:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
+; CHECK:       pred.store.continue20:
 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[TMP18]], i64 1
-; CHECK-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
-; CHECK:       pred.store.if22:
+; CHECK-NEXT:    br i1 [[TMP26]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; CHECK:       pred.store.if21:
 ; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP27]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4
@@ -172,11 +172,11 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP27]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = and i32 [[TMP31]], [[TMP29]]
 ; CHECK-NEXT:    store i32 [[TMP33]], ptr [[TMP32]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
-; CHECK:       pred.store.continue23:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
+; CHECK:       pred.store.continue22:
 ; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP18]], i64 2
-; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
-; CHECK:       pred.store.if24:
+; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; CHECK:       pred.store.if23:
 ; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP35]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4
@@ -185,11 +185,11 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP35]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = and i32 [[TMP39]], [[TMP37]]
 ; CHECK-NEXT:    store i32 [[TMP41]], ptr [[TMP40]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
-; CHECK:       pred.store.continue25:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE24]]
+; CHECK:       pred.store.continue24:
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i1> [[TMP18]], i64 3
-; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE78]]
-; CHECK:       pred.store.if26:
+; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26]]
+; CHECK:       pred.store.if25:
 ; CHECK-NEXT:    [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 3
 ; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [2048 x i32], ptr @b, i64 0, i64 [[TMP43]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4
@@ -198,14 +198,14 @@ define void @example2(i32 %n, i32 %x) optsize {
 ; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2048 x i32], ptr @a, i64 0, i64 [[TMP43]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = and i32 [[TMP47]], [[TMP45]]
 ; CHECK-NEXT:    store i32 [[TMP49]], ptr [[TMP48]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE78]]
-; CHECK:       pred.store.continue27:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE26]]
+; CHECK:       pred.store.continue26:
 ; CHECK-NEXT:    [[INDEX_NEXT29]] = add nuw i64 [[INDEX16]], 4
 ; CHECK-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC12]]
-; CHECK-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY15]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       middle.block7:
+; CHECK-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK28:%.*]], label [[VECTOR_BODY13]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block28:
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH8]]
-; CHECK:       scalar.ph8:
+; CHECK:       scalar.ph7:
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph5:
 ; CHECK-NEXT:    br i1 poison, label [[DOT_PREHEADER_CRIT_EDGE]], label [[DOTLR_PH5]], !llvm.loop [[LOOP6:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
index 95ae2de117666..b6c72056b0c5c 100644
--- a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll
@@ -48,28 +48,28 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[TMP1]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH5]], label [[VECTOR_PH7:%.*]]
-; CHECK:       vector.ph7:
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH5]], label [[VECTOR_PH6:%.*]]
+; CHECK:       vector.ph6:
 ; CHECK-NEXT:    [[N_MOD_VF8:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF8]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY10:%.*]]
-; CHECK:       vector.body10:
-; CHECK-NEXT:    [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH7]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY10]] ]
+; CHECK:       vector.body9:
+; CHECK-NEXT:    [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH6]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY10]] ]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4, !alias.scope !4, !noalias !7
 ; CHECK-NEXT:    [[INDEX_NEXT13]] = add nuw i64 [[INDEX12]], 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK4:%.*]], label [[VECTOR_BODY10]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK:       middle.block4:
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK12:%.*]], label [[VECTOR_BODY10]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       middle.block12:
 ; CHECK-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N10]], label [[LOOP_3_LR_PH:%.*]], label [[SCALAR_PH5]]
-; CHECK:       scalar.ph5:
-; CHECK-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK4]] ], [ 0, [[LOOP_2_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK:       scalar.ph4:
+; CHECK-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK12]] ], [ 0, [[LOOP_2_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP_2:%.*]]
 ; CHECK:       loop.3.lr.ph:
 ; CHECK-NEXT:    [[IDXPROM_I_I61:%.*]] = and i64 [[IV761_LCSSA]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I62:%.*]] = getelementptr i32, ptr [[TMP0]], i64 [[IDXPROM_I_I61]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK22:%.*]] = icmp ult i64 [[TMP3]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK22]], label [[SCALAR_PH22:%.*]], label [[VECTOR_MEMCHECK15:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK22]], label [[SCALAR_PH21:%.*]], label [[VECTOR_MEMCHECK15:%.*]]
 ; CHECK:       vector.memcheck15:
 ; CHECK-NEXT:    [[SCEVGEP15:%.*]] = getelementptr i8, ptr [[TMP1]], i64 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[IDXPROM_I_I61]], 2
@@ -78,34 +78,34 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) {
 ; CHECK-NEXT:    [[BOUND017:%.*]] = icmp ult ptr [[TMP1]], [[SCEVGEP16]]
 ; CHECK-NEXT:    [[BOUND118:%.*]] = icmp ult ptr [[ARRAYIDX_I_I62]], [[SCEVGEP15]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]]
-; CHECK-NEXT:    br i1 [[FOUND_CONFLICT19]], label [[SCALAR_PH22]], label [[VECTOR_PH24:%.*]]
-; CHECK:       vector.ph24:
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT19]], label [[SCALAR_PH21]], label [[VECTOR_PH24:%.*]]
+; CHECK:       vector.ph23:
 ; CHECK-NEXT:    [[N_MOD_VF24:%.*]] = urem i64 [[TMP3]], 4
 ; CHECK-NEXT:    [[N_VEC25:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF24]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY27:%.*]]
-; CHECK:       vector.body27:
+; CHECK:       vector.body26:
 ; CHECK-NEXT:    [[INDEX29:%.*]] = phi i64 [ 0, [[VECTOR_PH24]] ], [ [[INDEX_NEXT29:%.*]], [[VECTOR_BODY27]] ]
-; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4, !alias.scope !10, !noalias !13
+; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4, !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]]
 ; CHECK-NEXT:    [[INDEX_NEXT29]] = add nuw i64 [[INDEX29]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK21:%.*]], label [[VECTOR_BODY27]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       middle.block21:
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC25]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK29:%.*]], label [[VECTOR_BODY27]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       middle.block29:
 ; CHECK-NEXT:    [[CMP_N27:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC25]]
-; CHECK-NEXT:    br i1 [[CMP_N27]], label [[LOOP_CLEANUP:%.*]], label [[SCALAR_PH22]]
-; CHECK:       scalar.ph22:
-; CHECK-NEXT:    [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], [[MIDDLE_BLOCK21]] ], [ 0, [[LOOP_3_LR_PH]] ], [ 0, [[VECTOR_MEMCHECK15]] ]
+; CHECK-NEXT:    br i1 [[CMP_N27]], label [[LOOP_CLEANUP:%.*]], label [[SCALAR_PH21]]
+; CHECK:       scalar.ph21:
+; CHECK-NEXT:    [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], [[MIDDLE_BLOCK29]] ], [ 0, [[LOOP_3_LR_PH]] ], [ 0, [[VECTOR_MEMCHECK15]] ]
 ; CHECK-NEXT:    br label [[LOOP_3:%.*]]
 ; CHECK:       loop.2:
 ; CHECK-NEXT:    [[IV846:%.*]] = phi i64 [ [[IV_NEXT85:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL13]], [[SCALAR_PH5]] ]
 ; CHECK-NEXT:    [[IV_NEXT87:%.*]] = add i64 0, 0
 ; CHECK-NEXT:    [[ARRAYIDX_I_I56:%.*]] = getelementptr i32, ptr [[TMP0]], i64 [[IV761_LCSSA]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX_I_I56]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX_I_I56]], align 4
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4
 ; CHECK-NEXT:    [[IV_NEXT85]] = add i64 [[IV846]], 1
 ; CHECK-NEXT:    [[EXITCOND92_NOT:%.*]] = icmp eq i64 [[IV846]], [[IV]]
 ; CHECK-NEXT:    br i1 [[EXITCOND92_NOT]], label [[LOOP_3_LR_PH]], label [[LOOP_2]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       loop.3:
-; CHECK-NEXT:    [[IV932:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], [[SCALAR_PH22]] ], [ [[IV_NEXT94:%.*]], [[LOOP_3]] ]
+; CHECK-NEXT:    [[IV932:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], [[SCALAR_PH21]] ], [ [[IV_NEXT94:%.*]], [[LOOP_3]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_I_I62]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX_I_I653:%.*]] = getelementptr i32, ptr [[TMP2:%.*]], i64 [[IV93:%.*]]
 ; CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/pr66616.ll b/llvm/test/Transforms/LoopVectorize/pr66616.ll
index 24b9441749ee4..b5f9e999b6ea5 100644
--- a/llvm/test/Transforms/LoopVectorize/pr66616.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr66616.ll
@@ -38,29 +38,29 @@ define void @pr66616(ptr %ptr) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH2:%.*]], label [[VECTOR_PH3:%.*]]
-; CHECK:       vector.ph3:
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH1:%.*]], label [[VECTOR_PH2:%.*]]
+; CHECK:       vector.ph2:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-NEXT:    [[IND_END:%.*]] = add i32 [[ADD3_LCSSA]], [[DOTCAST]]
 ; CHECK-NEXT:    [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[N_VEC]]
-; CHECK-NEXT:    br label [[VECTOR_BODY7:%.*]]
-; CHECK:       vector.body4:
-; CHECK-NEXT:    [[INDEX8:%.*]] = phi i64 [ 0, [[VECTOR_PH3]] ], [ [[INDEX_NEXT9:%.*]], [[VECTOR_BODY7]] ]
+; CHECK-NEXT:    br label [[VECTOR_BODY3:%.*]]
+; CHECK:       vector.body3:
+; CHECK-NEXT:    [[INDEX8:%.*]] = phi i64 [ 0, [[VECTOR_PH2]] ], [ [[INDEX_NEXT9:%.*]], [[VECTOR_BODY3]] ]
 ; CHECK-NEXT:    [[INDEX_NEXT9]] = add nuw i64 [[INDEX8]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY7]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block1:
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK6:%.*]], label [[VECTOR_BODY3]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block6:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH2]]
-; CHECK:       scalar.ph2:
-; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK1]] ], [ [[ADD3_LCSSA]], [[PREHEADER]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END5]], [[MIDDLE_BLOCK1]] ], [ [[PTR]], [[PREHEADER]] ]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH1]]
+; CHECK:       scalar.ph1:
+; CHECK-NEXT:    [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK6]] ], [ [[ADD3_LCSSA]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END5]], [[MIDDLE_BLOCK6]] ], [ [[PTR]], [[PREHEADER]] ]
 ; CHECK-NEXT:    br label [[LOOP_2:%.*]]
 ; CHECK:       loop.2:
-; CHECK-NEXT:    [[IV_2:%.*]] = phi i32 [ [[IV_2_I:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH2]] ]
-; CHECK-NEXT:    [[IV_3:%.*]] = phi ptr [ [[IV_3_I:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL6]], [[SCALAR_PH2]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i32 [ [[IV_2_I:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH1]] ]
+; CHECK-NEXT:    [[IV_3:%.*]] = phi ptr [ [[IV_3_I:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL6]], [[SCALAR_PH1]] ]
 ; CHECK-NEXT:    [[IV_2_I]] = add i32 [[IV_2]], 1
 ; CHECK-NEXT:    [[IV_3_I]] = getelementptr i8, ptr [[IV_3]], i64 1
 ; CHECK-NEXT:    [[COND2:%.*]] = icmp eq i32 [[IV_2]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll
index a848f98e0949e..69b8496878e8f 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll
@@ -37,7 +37,7 @@ define void @test_pr63368(i1 %c, ptr %A) {
 ; CHECK-NEXT:    [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[L_LCSSA_LCSSA]], i32 -1)
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[SMAX1]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH3:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH2:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[L_LCSSA_LCSSA]], i32 -1)
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[SMAX]], 1
@@ -46,31 +46,31 @@ define void @test_pr63368(i1 %c, ptr %A) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt i8 [[TMP5]], 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP3]], 255
 ; CHECK-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[SCALAR_PH3]], label [[VECTOR_PH4:%.*]]
-; CHECK:       vector.ph4:
+; CHECK-NEXT:    br i1 [[TMP8]], label [[SCALAR_PH2]], label [[VECTOR_PH3:%.*]]
+; CHECK:       vector.ph3:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[N_VEC]] to i8
-; CHECK-NEXT:    br label [[VECTOR_BODY5:%.*]]
-; CHECK:       vector.body5:
-; CHECK-NEXT:    [[INDEX6:%.*]] = phi i32 [ 0, [[VECTOR_PH4]] ], [ [[INDEX_NEXT7:%.*]], [[VECTOR_BODY5]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX6]] to i8
+; CHECK-NEXT:    br label [[VECTOR_BODY4:%.*]]
+; CHECK:       vector.body4:
+; CHECK-NEXT:    [[INDEX5:%.*]] = phi i32 [ 0, [[VECTOR_PH3]] ], [ [[INDEX_NEXT6:%.*]], [[VECTOR_BODY4]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX5]] to i8
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i8 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i8 [[TMP10]], 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[A]], i8 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0
 ; CHECK-NEXT:    store <4 x i8> zeroinitializer, ptr [[TMP13]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT7]] = add nuw i32 [[INDEX6]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT7]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK2:%.*]], label [[VECTOR_BODY5]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block2:
+; CHECK-NEXT:    [[INDEX_NEXT6]] = add nuw i32 [[INDEX5]], 4
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY4]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block7:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT_2:%.*]], label [[SCALAR_PH3]]
-; CHECK:       scalar.ph3:
-; CHECK-NEXT:    [[BC_RESUME_VAL8:%.*]] = phi i8 [ [[TMP9]], [[MIDDLE_BLOCK2]] ], [ 0, [[EXIT_1]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT_2:%.*]], label [[SCALAR_PH2]]
+; CHECK:       scalar.ph2:
+; CHECK-NEXT:    [[BC_RESUME_VAL8:%.*]] = phi i8 [ [[TMP9]], [[MIDDLE_BLOCK7]] ], [ 0, [[EXIT_1]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT:    br label [[LOOP_2:%.*]]
 ; CHECK:       loop.2:
-; CHECK-NEXT:    [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL8]], [[SCALAR_PH3]] ], [ [[IV_2_NEXT:%.*]], [[LOOP_2]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL8]], [[SCALAR_PH2]] ], [ [[IV_2_NEXT:%.*]], [[LOOP_2]] ]
 ; CHECK-NEXT:    [[IV_2_NEXT]] = add i8 [[IV_2]], 1
 ; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i8 [[IV_2_NEXT]]
 ; CHECK-NEXT:    store i8 0, ptr [[GEP_A]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index b414f53291df4..51eb2f650738c 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -33,10 +33,10 @@ define i64 @same_exit_block_phi_of_consts() {
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
-; CHECK:       vector.early.exit:
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -112,10 +112,10 @@ define i64 @diff_exit_block_phi_of_consts() {
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
-; CHECK:       vector.early.exit:
-; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -211,11 +211,11 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) {
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP15]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
-; CHECK:       vector.early.exit:
-; CHECK-NEXT:    br label [[FOUND:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[FOUND:%.*]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
@@ -298,10 +298,10 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) {
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
-; CHECK:       vector.early.exit:
-; CHECK-NEXT:    br label [[EARLY_EXIT:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[EARLY_EXIT:%.*]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 266, [[MIDDLE_BLOCK]] ], [ -10, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
index 1bfe054057089..e24c6090b704b 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
@@ -34,12 +34,12 @@ define i64 @same_exit_block_pre_inc_use1() {
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -119,12 +119,12 @@ define i64 @same_exit_block_pre_inc1_use_inv_cond(i1 %cond) {
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP7]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -204,12 +204,12 @@ define i64 @same_exit_block_pre_inc_use1_gep_two_indices() {
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -287,12 +287,12 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() {
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -371,10 +371,10 @@ define i64 @same_exit_block_pre_inc_use2() {
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
-; CHECK:       vector.early.exit:
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -453,12 +453,12 @@ define i64 @same_exit_block_pre_inc_use3() {
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -534,12 +534,12 @@ define i64 @same_exit_block_pre_inc_use4() {
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -620,12 +620,12 @@ define i64 @same_exit_block_post_inc_use() {
 ; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -715,12 +715,12 @@ define i64 @same_exit_block_post_inc_use2() {
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[TMP15]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -798,12 +798,12 @@ define i64 @diff_exit_block_pre_inc_use1() {
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -888,11 +888,11 @@ define i64 @diff_exit_block_pre_inc_use2() {
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
-; CHECK:       vector.early.exit:
-; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
 ; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -977,13 +977,13 @@ define i64 @diff_exit_block_pre_inc_use3() {
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
-; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -1073,12 +1073,12 @@ define i64 @diff_exit_block_post_inc_use1() {
 ; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -1174,13 +1174,13 @@ define i64 @diff_exit_block_post_inc_use2() {
 ; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[TMP11]], i64 [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT:    br label [[LOOP_EARLY_EXIT:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
-; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -1263,12 +1263,12 @@ define i64 @loop_contains_safe_call() {
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -1342,12 +1342,12 @@ define i64 @loop_contains_safe_div() {
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -1423,12 +1423,12 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[WIDE_LOAD2]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -1508,12 +1508,12 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP8]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 false, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
@@ -1588,12 +1588,12 @@ define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; CHECK-NEXT:    [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]]
-; CHECK-NEXT:    br label [[LOOP_END:%.*]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll
index fefed2d859767..51cfc72752014 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll
@@ -6,7 +6,7 @@ declare void @init_mem(ptr, i64);
 ; uncountable early exits is correctly adding to the outer loop at depth 1.
 define void @early_exit_in_outer_loop1() {
 ; CHECK-LABEL: Loop info for function 'early_exit_in_outer_loop1':
-; CHECK: Loop at depth 1 containing: {{.*}}%middle.block,%scalar.ph,%vector.ph,%vector.body,%middle.split
+; CHECK: Loop at depth 1 containing: {{.*}}%scalar.ph,%vector.ph,%vector.body,%middle.split,%middle.block
 entry:
   %p1 = alloca [1024 x i8]
   %p2 = alloca [1024 x i8]
@@ -45,8 +45,8 @@ loop.inner.end:
 ; loops at depths 1 and 2, respectively.
 define void @early_exit_in_outer_loop2() {
 ; CHECK-LABEL: Loop info for function 'early_exit_in_outer_loop2':
-; CHECK: Loop at depth 1 containing: {{.*}}%middle.block,%scalar.ph,%vector.ph,%vector.body,%middle.split
-; CHECK:    Loop at depth 2 containing: {{.*}}%middle.block,%scalar.ph,%vector.ph,%vector.body,%middle.split<exiting>
+; CHECK: Loop at depth 1 containing: {{.*}}%scalar.ph,%vector.ph,%vector.body,%middle.split,%middle.block
+; CHECK:    Loop at depth 2 containing: {{.*}}%scalar.ph,%vector.ph,%vector.body,%middle.split<exiting>,%middle.block
 entry:
   %p1 = alloca [1024 x i8]
   %p2 = alloca [1024 x i8]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
index dd3b50b3e060c..2cc8aea82ca52 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
@@ -83,9 +83,9 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
 ; CHECK-NEXT:     EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VTC]]>
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): ir-bb<middle.block>
+; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<middle.block>:
+; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VTC]]>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index beb305f23884e..db3e31c1a15ae 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -89,9 +89,9 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
 ; CHECK-NEXT:   vp<[[VPTR4:%.+]]> = vector-pointer vp<[[PADD1]]>, ir<1>
 ; CHECK-NEXT:   WIDEN store vp<[[VPTR3]]>, ir<%add>
 ; CHECK-NEXT:   WIDEN store vp<[[VPTR4]]>, ir<%add>.1
-; CHECK-NEXT: Successor(s): ir-bb<middle.block>
+; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<middle.block>:
+; CHECK-NEXT: middle.block:
 ; CHECK-NEXT:   EMIT vp<[[C:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VTC]]>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<[[C]]>
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>

From 256145b4b0058ae22a1040cd4b7ea44fc49a4ece Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Sat, 15 Feb 2025 14:13:32 -0800
Subject: [PATCH 039/109] [PowerPC] Use getSignedTargetConstant in
 SelectOptimalAddrMode. (#127305)

Fixes #127298.
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp |  4 ++--
 llvm/test/CodeGen/PowerPC/pr127298.ll       | 13 +++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/pr127298.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index f1195feea80e8..767d1ded8de3a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -19110,8 +19110,8 @@ PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
         int32_t Addr = (int32_t)CNImm;
         // Otherwise, break this down into LIS + Disp.
         Disp = DAG.getSignedTargetConstant((int16_t)Addr, DL, MVT::i32);
-        Base =
-            DAG.getTargetConstant((Addr - (int16_t)Addr) >> 16, DL, MVT::i32);
+        Base = DAG.getSignedTargetConstant((Addr - (int16_t)Addr) >> 16, DL,
+                                           MVT::i32);
         uint32_t LIS = CNType == MVT::i32 ? PPC::LIS : PPC::LIS8;
         Base = SDValue(DAG.getMachineNode(LIS, DL, CNType, Base), 0);
         break;
diff --git a/llvm/test/CodeGen/PowerPC/pr127298.ll b/llvm/test/CodeGen/PowerPC/pr127298.ll
new file mode 100644
index 0000000000000..f7560216ef7d8
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr127298.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=powerpc | FileCheck %s
+
+define void @foo() {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %Entry
+; CHECK-NEXT:    lis 3, -8530
+; CHECK-NEXT:    lbz 3, -16657(3)
+; CHECK-NEXT:    blr
+Entry:
+  %0 = load volatile i8, ptr inttoptr (i32 -559038737 to ptr), align 1
+  ret void
+}

From 963ff1c3051a8c413a04fff27ba60cf93e48f310 Mon Sep 17 00:00:00 2001
From: "Oleksandr \"Alex\" Zinenko" <git@ozinenko.com>
Date: Sat, 15 Feb 2025 23:53:36 +0100
Subject: [PATCH 040/109] [mlir] lower min/maxnum to libdevice calls (#127323)

Introduce lowering from arith.minnumf/maxnumf operations to the
corresponding Nvidia libdevice calls. This requires reordering the pattern
population methods so that the libdevice-targeting patterns are
prioritized over default patterns targeting LLVM IR intrinsics from the
Arith dialect. The tests are placed into a separate file as the existing
gpu-to-nvvm.mlir file has a mode that forces Arith dialect operations
to be preserved as is without using a separate FileCheck tag to
differentiate.

Co-authored-by: William Moses <gh@wsmoses.com>
---
 .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp        |  8 ++++++-
 .../GPUToNVVM/gpu-arith-ops-to-nvvm.mlir      | 21 +++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 mlir/test/Conversion/GPUToNVVM/gpu-arith-ops-to-nvvm.mlir

diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 35330f870e6ae..c1a4d31861d3b 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -378,6 +378,8 @@ struct LowerGpuOpsToNVVMOpsPass final
     RewritePatternSet llvmPatterns(m.getContext());
     LLVMConversionTarget target(getContext());
 
+    populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
+
     llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
                                                       allowedDialects.end());
     for (Dialect *dialect : getContext().getLoadedDialects()) {
@@ -407,7 +409,6 @@ struct LowerGpuOpsToNVVMOpsPass final
                                                      llvmPatterns);
     }
 
-    populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
     populateGpuWMMAToNVVMConversionPatterns(converter, llvmPatterns);
     if (this->hasRedux)
       populateGpuSubgroupReduceOpLoweringPattern(converter, llvmPatterns);
@@ -552,6 +553,11 @@ void mlir::populateGpuToNVVMConversionPatterns(
 
   populateOpPatterns<arith::RemFOp>(converter, patterns, "__nv_fmodf",
                                     "__nv_fmod");
+  populateOpPatterns<arith::MaxNumFOp>(converter, patterns, "__nv_fmaxf",
+                                       "__nv_fmax");
+  populateOpPatterns<arith::MinNumFOp>(converter, patterns, "__nv_fminf",
+                                       "__nv_fmin");
+
   populateIntOpPatterns<math::AbsIOp>(converter, patterns, "__nv_abs");
   populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
                                    "__nv_fabs");
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-arith-ops-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-arith-ops-to-nvvm.mlir
new file mode 100644
index 0000000000000..2b1c5a7fef8fe
--- /dev/null
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-arith-ops-to-nvvm.mlir
@@ -0,0 +1,21 @@
+// RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file | FileCheck %s
+
+gpu.module @test_module_54 {
+  // CHECK: llvm.func @__nv_fmaxf(f32, f32) -> f32
+  // CHECK: llvm.func @__nv_fminf(f32, f32) -> f32
+  // CHECK: llvm.func @__nv_fmax(f64, f64) -> f64
+  // CHECK: llvm.func @__nv_fmin(f64, f64) -> f64
+  // CHECK-LABEL: @gpu_fminmax
+  func.func @gpu_fminmax(%arg1_f32: f32, %arg2_f32: f32, %arg1_f64: f64, %arg2_f64: f64) 
+      -> (f32, f32, f64, f64) {
+    // CHECK: llvm.call @__nv_fmaxf
+    %max_f32 = arith.maxnumf %arg1_f32, %arg2_f32 : f32
+    // CHECK: llvm.call @__nv_fminf
+    %min_f32 = arith.minnumf %arg1_f32, %arg2_f32 : f32
+    // CHECK: llvm.call @__nv_fmax(
+    %max_f64 = arith.maxnumf %arg1_f64, %arg2_f64 : f64
+    // CHECK: llvm.call @__nv_fmin(
+    %min_f64 = arith.minnumf %arg1_f64, %arg2_f64 : f64
+    return %max_f32, %min_f32, %max_f64, %min_f64 : f32, f32, f64, f64
+  }
+}

From ed48398431afa477fc57fcabb6ae1f7d94df6012 Mon Sep 17 00:00:00 2001
From: Roland McGrath <mcgrathr@google.com>
Date: Sat, 15 Feb 2025 16:32:30 -0800
Subject: [PATCH 041/109] [libc] Fill out generated malloc.h and related
 stdlib.h extensions (#127293)

This updates the generated stdlib.h and malloc.h headers to
include the subsets of extension functions declared by glibc that
are also supported by Scudo and that use only simple types.
Scudo's extensions not declared by glibc are omitted.  glibc's
extensions not implemented by Scudo are omitted.  The mallinfo
and mallinfo2 functions are omitted (at least for now) since they
need struct definitions for their return types.
---
 libc/include/malloc.yaml        | 13 +++++++++++++
 libc/include/stdlib-malloc.yaml | 28 ++++++++++++++++++++--------
 libc/include/stdlib.yaml        |  8 ++++++++
 3 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/libc/include/malloc.yaml b/libc/include/malloc.yaml
index bf7678797c023..cd84723557bcb 100644
--- a/libc/include/malloc.yaml
+++ b/libc/include/malloc.yaml
@@ -9,6 +9,12 @@ macros:
   - macro_name: M_PURGE_ALL
     macro_header: malloc-macros.h
 functions:
+  - name: malloc_usable_size
+    standards:
+      - gnu
+    return_type: size_t
+    arguments:
+      - type: void *
   - name: mallopt
     standards:
       - gnu
@@ -16,3 +22,10 @@ functions:
     arguments:
       - type: int
       - type: int
+  - name: pvalloc
+    standards:
+      - bsd
+      - gnu
+    return_type: void *
+    arguments:
+      - type: size_t
diff --git a/libc/include/stdlib-malloc.yaml b/libc/include/stdlib-malloc.yaml
index 648a6e58a90a0..51c11f5602e2c 100644
--- a/libc/include/stdlib-malloc.yaml
+++ b/libc/include/stdlib-malloc.yaml
@@ -1,4 +1,10 @@
 # This file has declarations that appear both in <stdlib.h> and in <malloc.h>.
+# These include the subset of GNU extensions that Scudo supports.
+#
+# Note: glibc's <stdlib.h> and <malloc.h> both also have `reallocarray`,
+# which Scudo does not support and is omitted here.  (Each of those glibc
+# headers also has related functions the other lacks, but those should be
+# covered separately in stdlib.yaml and malloc.yaml instead.)
 
 functions:
   - name: aligned_alloc
@@ -27,6 +33,13 @@ functions:
     return_type: void *
     arguments:
       - type: size_t
+  - name: memalign
+    standards:
+      - gnu
+    return_type: void *
+    arguments:
+      - type: size_t
+      - type: size_t
   - name: realloc
     standards:
       - stdc
@@ -34,11 +47,10 @@ functions:
     arguments:
       - type: void *
       - type: size_t
-
-# Note: glibc's <stdlib.h> and <malloc.h> both have these, which are
-# currently missing here:
-#  - name: reallocarray
-#  - name: memalign
-#  - name: valloc
-# Each of those glibc headers also has related functions the other lacks.
-# Only the common subset is mentioned here for future consideration.
+  - name: valloc
+    standards:
+      - bsd
+      - gnu
+    return_type: void *
+    arguments:
+      - type: size_t
diff --git a/libc/include/stdlib.yaml b/libc/include/stdlib.yaml
index 6ada26f020179..8d2b3f357e1a9 100644
--- a/libc/include/stdlib.yaml
+++ b/libc/include/stdlib.yaml
@@ -127,6 +127,14 @@ functions:
     arguments:
       - type: long long
       - type: long long
+  - name: posix_memalign
+    standards:
+      - posix
+    return_type: int
+    arguments:
+      - type: void **
+      - type: size_t
+      - type: size_t
   - name: qsort
     standards:
       - stdc

From 3e3af861b4a48b90041fad7ab8e9bc17e8a0a602 Mon Sep 17 00:00:00 2001
From: vporpo <vporpodas@google.com>
Date: Sat, 15 Feb 2025 17:09:51 -0800
Subject: [PATCH 042/109] [SandboxVec][DAG] Add MemDGNode::MemSuccs (#127253)

This patch adds Memory successors to the memory nodes of the DAG. This
will help maintain the memory dependencies when nodes get removed.
---
 .../SandboxVectorizer/DependencyGraph.h       | 17 ++++++++
 .../SandboxVectorizer/DependencyGraphTest.cpp | 40 ++++++++++++++++++-
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h
index 6852d0b6714fb..b25f96571741e 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h
@@ -117,6 +117,7 @@ class DGNode {
   virtual ~DGNode();
   /// \Returns the number of unscheduled successors.
   unsigned getNumUnscheduledSuccs() const { return UnscheduledSuccs; }
+  // TODO: Make this private?
   void decrUnscheduledSuccs() {
     assert(UnscheduledSuccs > 0 && "Counting error!");
     --UnscheduledSuccs;
@@ -214,6 +215,8 @@ class MemDGNode final : public DGNode {
   MemDGNode *NextMemN = nullptr;
   /// Memory predecessors.
   DenseSet<MemDGNode *> MemPreds;
+  /// Memory successors.
+  DenseSet<MemDGNode *> MemSuccs;
   friend class PredIterator; // For MemPreds.
   /// Creates both edges: this<->N.
   void setNextNode(MemDGNode *N) {
@@ -265,10 +268,20 @@ class MemDGNode final : public DGNode {
     [[maybe_unused]] auto Inserted = MemPreds.insert(PredN).second;
     assert(Inserted && "PredN already exists!");
     assert(PredN != this && "Trying to add a dependency to self!");
+    PredN->MemSuccs.insert(this);
     if (!Scheduled) {
       ++PredN->UnscheduledSuccs;
     }
   }
+  /// Removes the memory dependency PredN->this. This also updates the
+  /// UnscheduledSuccs counter of PredN if this node has not been scheduled.
+  void removeMemPred(MemDGNode *PredN) {
+    MemPreds.erase(PredN);
+    PredN->MemSuccs.erase(this);
+    if (!Scheduled) {
+      PredN->decrUnscheduledSuccs();
+    }
+  }
   /// \Returns true if there is a memory dependency N->this.
   bool hasMemPred(DGNode *N) const {
     if (auto *MN = dyn_cast<MemDGNode>(N))
@@ -279,6 +292,10 @@ class MemDGNode final : public DGNode {
   iterator_range<DenseSet<MemDGNode *>::const_iterator> memPreds() const {
     return make_range(MemPreds.begin(), MemPreds.end());
   }
+  /// \Returns all memory dependency successors.
+  iterator_range<DenseSet<MemDGNode *>::const_iterator> memSuccs() const {
+    return make_range(MemSuccs.begin(), MemSuccs.end());
+  }
 #ifndef NDEBUG
   virtual void print(raw_ostream &OS, bool PrintDeps = true) const override;
 #endif // NDEBUG
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp
index d81932dca4989..bb809bf33420e 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp
@@ -250,6 +250,9 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) {
   EXPECT_THAT(N1->memPreds(), testing::ElementsAre(N0));
   EXPECT_TRUE(N2->preds(DAG).empty());
 
+  // Check memSuccs().
+  EXPECT_THAT(N0->memSuccs(), testing::ElementsAre(N1));
+
   // Check UnscheduledSuccs.
   EXPECT_EQ(N0->getNumUnscheduledSuccs(), 1u); // N1
   EXPECT_EQ(N1->getNumUnscheduledSuccs(), 0u);
@@ -268,6 +271,41 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) {
   EXPECT_TRUE(N0->scheduled());
 }
 
+TEST_F(DependencyGraphTest, AddRemoveMemPred) {
+  parseIR(C, R"IR(
+define void @foo(ptr %ptr, i8 %v0, i8 %v1) {
+  store i8 %v0, ptr %ptr
+  store i8 %v1, ptr %ptr
+  ret void
+}
+)IR");
+  llvm::Function *LLVMF = &*M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+  auto *F = Ctx.createFunction(LLVMF);
+  auto *BB = &*F->begin();
+  auto It = BB->begin();
+  auto *S0 = cast<sandboxir::StoreInst>(&*It++);
+  auto *S1 = cast<sandboxir::StoreInst>(&*It++);
+
+  sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx);
+  DAG.extend({&*BB->begin(), BB->getTerminator()});
+  auto *N0 = cast<sandboxir::MemDGNode>(DAG.getNode(S0));
+  auto *N1 = cast<sandboxir::MemDGNode>(DAG.getNode(S1));
+
+  // Check removeMemPred().
+  EXPECT_FALSE(N0->memSuccs().empty());
+  EXPECT_EQ(N0->getNumUnscheduledSuccs(), 1u);
+  N1->removeMemPred(N0);
+  EXPECT_TRUE(N1->memPreds().empty());
+  EXPECT_EQ(N0->getNumUnscheduledSuccs(), 0u);
+
+  // Check addMemPred().
+  N1->addMemPred(N0);
+  EXPECT_THAT(N1->memPreds(), testing::UnorderedElementsAre(N0));
+  EXPECT_THAT(N0->memSuccs(), testing::UnorderedElementsAre(N1));
+  EXPECT_THAT(N0->getNumUnscheduledSuccs(), 1u);
+}
+
 TEST_F(DependencyGraphTest, Preds) {
   parseIR(C, R"IR(
 declare ptr @bar(i8)
@@ -533,7 +571,7 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1) {
   EXPECT_TRUE(RetN->preds(DAG).empty());
 }
 
-TEST_F(DependencyGraphTest, VolatileSotres) {
+TEST_F(DependencyGraphTest, VolatileStores) {
   parseIR(C, R"IR(
 define void @foo(ptr noalias %ptr0, ptr noalias %ptr1, i8 %v) {
   store volatile i8 %v, ptr %ptr0

From 2157aecfe429f763c23d0ec8a59b896799e2d031 Mon Sep 17 00:00:00 2001
From: Roland McGrath <mcgrathr@google.com>
Date: Sat, 15 Feb 2025 17:36:44 -0800
Subject: [PATCH 043/109] [libc] Produce standard-compliant header guard macros
 in hdrgen (#127356)

Macros starting with alphabetic characters such as "LLVM" are in
the application name space and cannot be defined or used by a
conforming implementation's headers.  This fixes the headers that
are entirely generated, and the __llvm-libc-common.h header to
use a conforming macro name for the header guard.  That is, it
starts with "_LLVM_LIBC_" instead of "LLVM_LIBC_", as identifiers
starting with an underscore followed by a capital letter are in
the name space reserved for the implementation.

The remaining headers either will be fixed implicitly by removal
of their custom template files, or will need to be fixed by hand.
---
 libc/include/__llvm-libc-common.h                     | 6 +++---
 libc/utils/hdrgen/header.py                           | 2 +-
 libc/utils/hdrgen/tests/expected_output/subdir/test.h | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/libc/include/__llvm-libc-common.h b/libc/include/__llvm-libc-common.h
index a0fa506c01ab8..212e3c6a9446c 100644
--- a/libc/include/__llvm-libc-common.h
+++ b/libc/include/__llvm-libc-common.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_COMMON_H
-#define LLVM_LIBC_COMMON_H
+#ifndef _LLVM_LIBC_COMMON_H
+#define _LLVM_LIBC_COMMON_H
 
 #define __LLVM_LIBC__ 1
 
@@ -87,4 +87,4 @@
 
 #endif // __cplusplus
 
-#endif // LLVM_LIBC_COMMON_H
+#endif // _LLVM_LIBC_COMMON_H
diff --git a/libc/utils/hdrgen/header.py b/libc/utils/hdrgen/header.py
index 9ea9f98f8fc83..42a075c4b6c89 100644
--- a/libc/utils/hdrgen/header.py
+++ b/libc/utils/hdrgen/header.py
@@ -124,7 +124,7 @@ def includes(self):
         }
 
     def header_guard(self):
-        return "LLVM_LIBC_" + "_".join(
+        return "_LLVM_LIBC_" + "_".join(
             word.upper() for word in NONIDENTIFIER.split(self.name) if word
         )
 
diff --git a/libc/utils/hdrgen/tests/expected_output/subdir/test.h b/libc/utils/hdrgen/tests/expected_output/subdir/test.h
index 20bab502e6821..40936bcfcba6d 100644
--- a/libc/utils/hdrgen/tests/expected_output/subdir/test.h
+++ b/libc/utils/hdrgen/tests/expected_output/subdir/test.h
@@ -6,8 +6,8 @@
 //
 //===---------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SUBDIR_TEST_H
-#define LLVM_LIBC_SUBDIR_TEST_H
+#ifndef _LLVM_LIBC_SUBDIR_TEST_H
+#define _LLVM_LIBC_SUBDIR_TEST_H
 
 #include "../__llvm-libc-common.h"
 #include "../llvm-libc-types/type_a.h"
@@ -23,4 +23,4 @@ int *ptrfunc(void) __NOEXCEPT;
 
 __END_C_DECLS
 
-#endif // LLVM_LIBC_SUBDIR_TEST_H
+#endif // _LLVM_LIBC_SUBDIR_TEST_H

From c6d95c441a29a45782ff72d6cb82839b86fd0e4a Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Sat, 15 Feb 2025 19:54:23 -0800
Subject: [PATCH 044/109] [libc++] Fix initialization-order-fiasco with
 iostream.cpp constructors (#126995)

Asan reports it after #124103.

It's a known case of a false positive in Asan.

https://github.com/google/sanitizers/wiki/AddressSanitizerInitializationOrderFiasco#false-positives

It can be avoided with `constexpr` constructors.

In general, the order of global constructors in different
modules is undefined. If a global constructor uses an
external global, that global may not be constructed yet.

However, the implementation may contain a workaround for
that, or the state of the non-constructed global may
still be valid.

Asan will still falsely report such cases, as it
has no machinery to detect correctness of such
cases.

We need to fix or work around the issue in libc++, as
it will affect many users of libc++ with Asan.
---
 libcxx/src/iostream.cpp                       | 14 +++++++++----
 .../ios_Init/ios_Init.global.pass.cpp         | 20 +++++++++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 libcxx/test/std/input.output/iostreams.base/ios.base/ios.types/ios_Init/ios_Init.global.pass.cpp

diff --git a/libcxx/src/iostream.cpp b/libcxx/src/iostream.cpp
index d91f9f0c04826..416725235c340 100644
--- a/libcxx/src/iostream.cpp
+++ b/libcxx/src/iostream.cpp
@@ -18,8 +18,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class StreamT, class BufferT>
 union stream_data {
-  stream_data() {}
-  ~stream_data() {}
+  constexpr stream_data() {}
+  constexpr ~stream_data() {}
   struct {
     // The stream has to be the first element, since that's referenced by the stream declarations in <iostream>
     StreamT stream;
@@ -38,13 +38,19 @@ union stream_data {
 #define CHAR_MANGLING_wchar_t "_W"
 #define CHAR_MANGLING(CharT) CHAR_MANGLING_##CharT
 
+#ifdef _LIBCPP_COMPILER_CLANG_BASED
+#  define STRING_DATA_CONSTINIT constinit
+#else
+#  define STRING_DATA_CONSTINIT
+#endif
+
 #ifdef _LIBCPP_ABI_MICROSOFT
 #  define STREAM(StreamT, BufferT, CharT, var)                                                                         \
-    stream_data<StreamT<CharT>, BufferT<CharT>> var __asm__(                                                           \
+    STRING_DATA_CONSTINIT stream_data<StreamT<CharT>, BufferT<CharT>> var __asm__(                                     \
         "?" #var "@" ABI_NAMESPACE_STR "@std@@3V?$" #StreamT                                                           \
         "@" CHAR_MANGLING(CharT) "U?$char_traits@" CHAR_MANGLING(CharT) "@" ABI_NAMESPACE_STR "@std@@@12@A")
 #else
-#  define STREAM(StreamT, BufferT, CharT, var) stream_data<StreamT<CharT>, BufferT<CharT>> var
+#  define STREAM(StreamT, BufferT, CharT, var) STRING_DATA_CONSTINIT stream_data<StreamT<CharT>, BufferT<CharT>> var
 #endif
 
 // These definitions and the declarations in <iostream> technically cause ODR violations, since they have different
diff --git a/libcxx/test/std/input.output/iostreams.base/ios.base/ios.types/ios_Init/ios_Init.global.pass.cpp b/libcxx/test/std/input.output/iostreams.base/ios.base/ios.types/ios_Init/ios_Init.global.pass.cpp
new file mode 100644
index 0000000000000..ac6a7213bb722
--- /dev/null
+++ b/libcxx/test/std/input.output/iostreams.base/ios.base/ios.types/ios_Init/ios_Init.global.pass.cpp
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <iostream>
+
+// FIXME: Remove after issue https://github.com/llvm/llvm-project/issues/127348 resolved.
+extern "C" const char* __asan_default_options() { return "check_initialization_order=true:strict_init_order=true"; }
+
+// Test that ios used from globals constructors doesn't trigger Asan initialization-order-fiasco.
+
+struct Global {
+  Global() { std::cout << "Hello!"; }
+} global;
+
+int main(int, char**) { return 0; }

From dbc98cfa46d52ede06e8be7fc5e855d807ba0fac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= <mgorny@gentoo.org>
Date: Sun, 16 Feb 2025 08:48:52 +0100
Subject: [PATCH 045/109] [libclc] [cmake] Fix per-target *_convert.cl
 dependencies (#127315)

Fix `add_libclc_builtin_set` to add an appropriate dependency to either
`clspv-generate_convert.cl` or `generate_convert.cl` based on the `ARCH`
argument, rather than to both unconditionally. This fixes build failures
due to missing dependencies when `clspv*` targets are not enabled.

The added check mirrors the one from `libclc/CMakeLists.txt`.

Fixes: #127378
---
 libclc/cmake/modules/AddLibclc.cmake | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake
index b520626c6ffd1..a3b311f12a1e3 100644
--- a/libclc/cmake/modules/AddLibclc.cmake
+++ b/libclc/cmake/modules/AddLibclc.cmake
@@ -249,13 +249,19 @@ function(add_libclc_builtin_set)
 
     get_filename_component( file_dir ${file} DIRECTORY )
 
+    if( ARG_ARCH STREQUAL spirv OR ARG_ARCH STREQUAL spirv64 )
+      set(CONVERT_DEP clspv-generate_convert.cl)
+    else()
+      set(CONVERT_DEP generate_convert.cl)
+    endif()
+
     compile_to_bc(
       TRIPLE ${ARG_TRIPLE}
       INPUT ${input_file}
       OUTPUT ${output_file}
       EXTRA_OPTS -fno-builtin -nostdlib
         "${ARG_COMPILE_FLAGS}" -I${CMAKE_CURRENT_SOURCE_DIR}/${file_dir}
-      DEPENDENCIES generate_convert.cl clspv-generate_convert.cl
+      DEPENDENCIES ${CONVERT_DEP}
     )
     list( APPEND bytecode_files ${output_file} )
   endforeach()

From 538b8f8008b95782ea1967284e22500c72aec67d Mon Sep 17 00:00:00 2001
From: MagentaTreehouse <99200384+MagentaTreehouse@users.noreply.github.com>
Date: Sun, 16 Feb 2025 04:08:30 -0500
Subject: [PATCH 046/109] [Clang] [NFC] Remove outdated FIXME (#126978)

---
 clang/include/clang/AST/Redeclarable.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/clang/include/clang/AST/Redeclarable.h b/clang/include/clang/AST/Redeclarable.h
index ee21f11e5f707..68516c66aaf65 100644
--- a/clang/include/clang/AST/Redeclarable.h
+++ b/clang/include/clang/AST/Redeclarable.h
@@ -114,8 +114,6 @@ class Redeclarable {
 
     bool isFirst() const {
       return isa<KnownLatest>(Link) ||
-             // FIXME: 'template' is required on the next line due to an
-             // apparent clang bug.
              isa<UninitializedLatest>(cast<NotKnownLatest>(Link));
     }
 

From a422bc773fab84e4e68dbfbe4e4859760eb1f67e Mon Sep 17 00:00:00 2001
From: Andreas Jonson <andjo403@hotmail.com>
Date: Sun, 16 Feb 2025 12:01:55 +0100
Subject: [PATCH 047/109] [InstCombine] Test for trunc to i1 in
 foldSelectICmpAndBinOp. (NFC)

---
 .../InstCombine/select-with-bitwise-ops.ll    | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll
index 30e763ccea590..7c100f579399d 100644
--- a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll
+++ b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll
@@ -1709,3 +1709,82 @@ define i8 @select_icmp_eq_and_1_0_lshr_tv(i8 %x, i8 %y) {
   %select = select i1 %cmp, i8 %blshr, i8 %y
   ret i8 %select
 }
+
+define i8 @select_trunc_or_2(i8 %x, i8 %y) {
+; CHECK-LABEL: @select_trunc_or_2(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i8 [[X:%.*]] to i1
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[Y:%.*]], 2
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]]
+; CHECK-NEXT:    ret i8 [[SELECT]]
+;
+  %trunc = trunc i8 %x to i1
+  %or = or i8 %y, 2
+  %select = select i1 %trunc, i8 %or, i8 %y
+  ret i8 %select
+}
+
+define i8 @select_not_trunc_or_2(i8 %x, i8 %y) {
+; CHECK-LABEL: @select_not_trunc_or_2(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i8 [[X:%.*]] to i1
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[Y:%.*]], 2
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]]
+; CHECK-NEXT:    ret i8 [[SELECT]]
+;
+  %trunc = trunc i8 %x to i1
+  %not = xor i1 %trunc, true
+  %or = or i8 %y, 2
+  %select = select i1 %not, i8 %y, i8 %or
+  ret i8 %select
+}
+
+define i8 @select_trunc_nuw_or_2(i8 %x, i8 %y) {
+; CHECK-LABEL: @select_trunc_nuw_or_2(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc nuw i8 [[X:%.*]] to i1
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[Y:%.*]], 2
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]]
+; CHECK-NEXT:    ret i8 [[SELECT]]
+;
+  %trunc = trunc nuw i8 %x to i1
+  %or = or i8 %y, 2
+  %select = select i1 %trunc, i8 %or, i8 %y
+  ret i8 %select
+}
+
+define i8 @select_trunc_nsw_or_2(i8 %x, i8 %y) {
+; CHECK-LABEL: @select_trunc_nsw_or_2(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc nsw i8 [[X:%.*]] to i1
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[Y:%.*]], 2
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]]
+; CHECK-NEXT:    ret i8 [[SELECT]]
+;
+  %trunc = trunc nsw i8 %x to i1
+  %or = or i8 %y, 2
+  %select = select i1 %trunc, i8 %or, i8 %y
+  ret i8 %select
+}
+
+define <2 x i8> @select_trunc_or_2_vec(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @select_trunc_or_2_vec(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <2 x i8> [[X:%.*]] to <2 x i1>
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i8> [[Y:%.*]], splat (i8 2)
+; CHECK-NEXT:    [[SELECT:%.*]] = select <2 x i1> [[TRUNC]], <2 x i8> [[OR]], <2 x i8> [[Y]]
+; CHECK-NEXT:    ret <2 x i8> [[SELECT]]
+;
+  %trunc = trunc <2 x i8> %x to <2 x i1>
+  %or = or <2 x i8> %y, <i8 2, i8 2>
+  %select = select <2 x i1> %trunc, <2 x i8> %or, <2 x i8> %y
+  ret <2 x i8> %select
+}
+
+define i8 @neg_select_trunc_or_2(i8 %x, i8 %y) {
+; CHECK-LABEL: @neg_select_trunc_or_2(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i8 [[X:%.*]] to i1
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[Y:%.*]], 2
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[Y]], i8 [[OR]]
+; CHECK-NEXT:    ret i8 [[SELECT]]
+;
+  %trunc = trunc i8 %x to i1
+  %or = or i8 %y, 2
+  %select = select i1 %trunc, i8 %y, i8 %or
+  ret i8 %select
+}

From d64cf1998367cb7d0df398991808f3eed12f084f Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Sun, 16 Feb 2025 12:15:43 +0100
Subject: [PATCH 048/109] [clang][bytecode] Add Descriptor::dumpFull (#127386)

This is useful to print all (or most) of the valid offsets into a block
of the given descriptor.
---
 clang/lib/AST/ByteCode/Descriptor.h |  1 +
 clang/lib/AST/ByteCode/Disasm.cpp   | 32 +++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/clang/lib/AST/ByteCode/Descriptor.h b/clang/lib/AST/ByteCode/Descriptor.h
index 96c82a18913e0..01fa4b198de67 100644
--- a/clang/lib/AST/ByteCode/Descriptor.h
+++ b/clang/lib/AST/ByteCode/Descriptor.h
@@ -274,6 +274,7 @@ struct Descriptor final {
 
   void dump() const;
   void dump(llvm::raw_ostream &OS) const;
+  void dumpFull(unsigned Offset = 0, unsigned Indent = 0) const;
 };
 
 /// Bitfield tracking the initialisation status of elements of primitive arrays.
diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
index 92a169a37c365..85fc30482b003 100644
--- a/clang/lib/AST/ByteCode/Disasm.cpp
+++ b/clang/lib/AST/ByteCode/Disasm.cpp
@@ -251,6 +251,38 @@ LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const {
     OS << " dummy";
 }
 
+/// Dump descriptor, including all valid offsets.
+LLVM_DUMP_METHOD void Descriptor::dumpFull(unsigned Offset,
+                                           unsigned Indent) const {
+  unsigned Spaces = Indent * 2;
+  llvm::raw_ostream &OS = llvm::errs();
+  OS.indent(Spaces);
+  dump(OS);
+  OS << '\n';
+  OS.indent(Spaces) << "Metadata: " << getMetadataSize() << " bytes\n";
+  OS.indent(Spaces) << "Size: " << getSize() << " bytes\n";
+  OS.indent(Spaces) << "AllocSize: " << getAllocSize() << " bytes\n";
+  Offset += getMetadataSize();
+  if (isCompositeArray()) {
+    OS.indent(Spaces) << "Elements: " << getNumElems() << '\n';
+    unsigned FO = Offset;
+    for (unsigned I = 0; I != getNumElems(); ++I) {
+      FO += sizeof(InlineDescriptor);
+      assert(ElemDesc->getMetadataSize() == 0);
+      OS.indent(Spaces) << "Element " << I << " offset: " << FO << '\n';
+      ElemDesc->dumpFull(FO, Indent + 1);
+
+      FO += ElemDesc->getAllocSize();
+    }
+  } else if (isRecord()) {
+    ElemRecord->dump(OS, Indent + 1, Offset);
+  } else if (isPrimitive()) {
+  } else {
+  }
+
+  OS << '\n';
+}
+
 LLVM_DUMP_METHOD void InlineDescriptor::dump(llvm::raw_ostream &OS) const {
   {
     ColorScope SC(OS, true, {llvm::raw_ostream::BLUE, true});

From e8d848824bbd76892a7b371a8409b0fb378a55a4 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Sun, 16 Feb 2025 12:24:35 +0100
Subject: [PATCH 049/109] [clang][bytecode] Fix dynamic array allocation return
 values (#127387)

We need to return a pointer to the first element, not the array itself.
---
 clang/lib/AST/ByteCode/DynamicAllocator.cpp |  2 +
 clang/lib/AST/ByteCode/Interp.h             |  9 +---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp    | 41 +++++++--------
 clang/lib/AST/ByteCode/Program.cpp          |  4 +-
 clang/test/AST/ByteCode/allocate-arrays.cpp | 55 +++++++++++++++++++++
 5 files changed, 82 insertions(+), 29 deletions(-)
 create mode 100644 clang/test/AST/ByteCode/allocate-arrays.cpp

diff --git a/clang/lib/AST/ByteCode/DynamicAllocator.cpp b/clang/lib/AST/ByteCode/DynamicAllocator.cpp
index 819fbdb8b070b..3ef8c2e1f3e7c 100644
--- a/clang/lib/AST/ByteCode/DynamicAllocator.cpp
+++ b/clang/lib/AST/ByteCode/DynamicAllocator.cpp
@@ -54,6 +54,7 @@ Block *DynamicAllocator::allocate(const Expr *Source, PrimType T,
 Block *DynamicAllocator::allocate(const Descriptor *ElementDesc,
                                   size_t NumElements, unsigned EvalID,
                                   Form AllocForm) {
+  assert(ElementDesc->getMetadataSize() == 0);
   // Create a new descriptor for an array of the specified size and
   // element type.
   const Descriptor *D = allocateDescriptor(
@@ -72,6 +73,7 @@ Block *DynamicAllocator::allocate(const Descriptor *D, unsigned EvalID,
   auto *B = new (Memory.get()) Block(EvalID, D, /*isStatic=*/false);
   B->invokeCtor();
 
+  assert(D->getMetadataSize() == sizeof(InlineDescriptor));
   InlineDescriptor *ID = reinterpret_cast<InlineDescriptor *>(B->rawData());
   ID->Desc = D;
   ID->IsActive = true;
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 5cc371c7ee495..73cc107b7dbff 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -2896,9 +2896,7 @@ inline bool Alloc(InterpState &S, CodePtr OpPC, const Descriptor *Desc) {
   Block *B = Allocator.allocate(Desc, S.Ctx.getEvalID(),
                                 DynamicAllocator::Form::NonArray);
   assert(B);
-
   S.Stk.push<Pointer>(B);
-
   return true;
 }
 
@@ -2923,8 +2921,7 @@ inline bool AllocN(InterpState &S, CodePtr OpPC, PrimType T, const Expr *Source,
       Allocator.allocate(Source, T, static_cast<size_t>(NumElements),
                          S.Ctx.getEvalID(), DynamicAllocator::Form::Array);
   assert(B);
-  S.Stk.push<Pointer>(B, sizeof(InlineDescriptor));
-
+  S.Stk.push<Pointer>(B);
   return true;
 }
 
@@ -2950,9 +2947,7 @@ inline bool AllocCN(InterpState &S, CodePtr OpPC, const Descriptor *ElementDesc,
       Allocator.allocate(ElementDesc, static_cast<size_t>(NumElements),
                          S.Ctx.getEvalID(), DynamicAllocator::Form::Array);
   assert(B);
-
-  S.Stk.push<Pointer>(B, sizeof(InlineDescriptor));
-
+  S.Stk.push<Pointer>(B);
   return true;
 }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 55ac41736344d..b964906fb6594 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -1655,27 +1655,27 @@ static bool interp__builtin_operator_new(InterpState &S, CodePtr OpPC,
     return false;
   }
 
+  bool IsArray = NumElems.ugt(1);
   std::optional<PrimType> ElemT = S.getContext().classify(ElemType);
   DynamicAllocator &Allocator = S.getAllocator();
   if (ElemT) {
-    if (NumElems.ule(1)) {
-      const Descriptor *Desc =
-          S.P.createDescriptor(NewCall, *ElemT, Descriptor::InlineDescMD,
-                               /*IsConst=*/false, /*IsTemporary=*/false,
-                               /*IsMutable=*/false);
-      Block *B = Allocator.allocate(Desc, S.getContext().getEvalID(),
+    if (IsArray) {
+      Block *B = Allocator.allocate(NewCall, *ElemT, NumElems.getZExtValue(),
+                                    S.Ctx.getEvalID(),
                                     DynamicAllocator::Form::Operator);
       assert(B);
-
-      S.Stk.push<Pointer>(B);
+      S.Stk.push<Pointer>(Pointer(B).atIndex(0));
       return true;
     }
-    assert(NumElems.ugt(1));
 
-    Block *B =
-        Allocator.allocate(NewCall, *ElemT, NumElems.getZExtValue(),
-                           S.Ctx.getEvalID(), DynamicAllocator::Form::Operator);
+    const Descriptor *Desc =
+        S.P.createDescriptor(NewCall, *ElemT, Descriptor::InlineDescMD,
+                             /*IsConst=*/false, /*IsTemporary=*/false,
+                             /*IsMutable=*/false);
+    Block *B = Allocator.allocate(Desc, S.getContext().getEvalID(),
+                                  DynamicAllocator::Form::Operator);
     assert(B);
+
     S.Stk.push<Pointer>(B);
     return true;
   }
@@ -1683,21 +1683,22 @@ static bool interp__builtin_operator_new(InterpState &S, CodePtr OpPC,
   assert(!ElemT);
   // Structs etc.
   const Descriptor *Desc = S.P.createDescriptor(
-      NewCall, ElemType.getTypePtr(), Descriptor::InlineDescMD,
+      NewCall, ElemType.getTypePtr(),
+      IsArray ? std::nullopt : Descriptor::InlineDescMD,
       /*IsConst=*/false, /*IsTemporary=*/false, /*IsMutable=*/false,
       /*Init=*/nullptr);
 
-  if (NumElems.ule(1)) {
-    Block *B = Allocator.allocate(Desc, S.getContext().getEvalID(),
-                                  DynamicAllocator::Form::Operator);
+  if (IsArray) {
+    Block *B =
+        Allocator.allocate(Desc, NumElems.getZExtValue(), S.Ctx.getEvalID(),
+                           DynamicAllocator::Form::Operator);
     assert(B);
-    S.Stk.push<Pointer>(B);
+    S.Stk.push<Pointer>(Pointer(B).atIndex(0));
     return true;
   }
 
-  Block *B =
-      Allocator.allocate(Desc, NumElems.getZExtValue(), S.Ctx.getEvalID(),
-                         DynamicAllocator::Form::Operator);
+  Block *B = Allocator.allocate(Desc, S.getContext().getEvalID(),
+                                DynamicAllocator::Form::Operator);
   assert(B);
   S.Stk.push<Pointer>(B);
   return true;
diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp
index 833c9ef88d770..0754e259b7cb3 100644
--- a/clang/lib/AST/ByteCode/Program.cpp
+++ b/clang/lib/AST/ByteCode/Program.cpp
@@ -432,8 +432,8 @@ Descriptor *Program::createDescriptor(const DeclTy &D, const Type *Ty,
         return allocateDescriptor(D, *T, MDSize, IsTemporary,
                                   Descriptor::UnknownSize{});
       } else {
-        const Descriptor *Desc = createDescriptor(D, ElemTy.getTypePtr(),
-                                                  MDSize, IsConst, IsTemporary);
+        const Descriptor *Desc = createDescriptor(
+            D, ElemTy.getTypePtr(), std::nullopt, IsConst, IsTemporary);
         if (!Desc)
           return nullptr;
         return allocateDescriptor(D, Desc, MDSize, IsTemporary,
diff --git a/clang/test/AST/ByteCode/allocate-arrays.cpp b/clang/test/AST/ByteCode/allocate-arrays.cpp
new file mode 100644
index 0000000000000..f1e5af6cab2aa
--- /dev/null
+++ b/clang/test/AST/ByteCode/allocate-arrays.cpp
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 -std=c++2c -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -std=c++2c  -verify=ref,both %s
+
+
+/// This example used to cause an invalid read because allocating
+/// an array needs to return a pointer to the first element,
+/// not to the array.
+
+namespace std {
+  using size_t = decltype(sizeof(0));
+
+  template <class _Tp>
+  class allocator {
+  public:
+    typedef size_t size_type;
+    typedef _Tp value_type;
+    constexpr _Tp *allocate(size_t __n) {
+      return static_cast<_Tp *>(::operator new(__n * sizeof(_Tp)));
+    }
+  };
+}
+
+void *operator new(std::size_t, void *p) { return p; }
+void* operator new[] (std::size_t, void* p) {return p;}
+
+namespace std {
+  template <class _Ep>
+  class initializer_list {
+    const _Ep *__begin_;
+    __SIZE_TYPE__ __size_;
+
+  public:
+    typedef _Ep value_type;
+    typedef const _Ep &reference;
+    constexpr __SIZE_TYPE__ size() const noexcept { return __size_; }
+    constexpr const _Ep *begin() const noexcept { return __begin_; }
+    constexpr const _Ep *end() const noexcept { return __begin_ + __size_; }
+  };
+}
+
+template<typename T>
+class vector {
+public:
+  constexpr vector(std::initializer_list<T> Ts) {
+    A = B = std::allocator<T>{}.allocate(Ts.size()); // both-note {{heap allocation performed here}}
+
+    new (A) T(*Ts.begin());
+  }
+private:
+  T *A = nullptr;
+  T *B = nullptr;
+};
+
+constexpr vector<vector<int>> ints = {{3}, {4}}; // both-error {{must be initialized by a constant expression}} \
+                                                 // both-note {{pointer to}}

From dab9156923133b4ce3c40efcae4f80b0d720e72f Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Sun, 16 Feb 2025 19:35:09 +0800
Subject: [PATCH 050/109] [X86] Add missing explicit conversion for AMXAVX512
 and SHA (#127385)

Address the X86 part of #126491
---
 clang/lib/Headers/amxavx512intrin.h         | 2 +-
 clang/lib/Headers/shaintrin.h               | 5 +++--
 clang/test/CodeGen/X86/amxavx512-builtins.c | 2 +-
 clang/test/CodeGen/X86/sha-builtins.c       | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h
index a158983482d5b..bbde44fc265b3 100644
--- a/clang/lib/Headers/amxavx512intrin.h
+++ b/clang/lib/Headers/amxavx512intrin.h
@@ -228,7 +228,7 @@
 ///             dst.byte[i] := a.row[row_index].byte[row_chunk+i]
 /// ENDFOR
 /// \endcode
-#define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b)
+#define _tile_movrow(a, b) ((__m512i)__builtin_ia32_tilemovrow(a, b))
 
 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 
diff --git a/clang/lib/Headers/shaintrin.h b/clang/lib/Headers/shaintrin.h
index 232e1fa298230..e21d3bded7660 100644
--- a/clang/lib/Headers/shaintrin.h
+++ b/clang/lib/Headers/shaintrin.h
@@ -47,8 +47,9 @@
 ///    An immediate value where bits [1:0] select among four possible
 ///    combining functions and rounding constants (not specified here).
 /// \returns A 128-bit vector of [4 x i32] containing the updated SHA-1 state.
-#define _mm_sha1rnds4_epu32(V1, V2, M) \
-  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M))
+#define _mm_sha1rnds4_epu32(V1, V2, M)                                         \
+  ((__m128i)__builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1),                    \
+                                     (__v4si)(__m128i)(V2), (M)))
 
 /// Calculates the SHA-1 state variable E from the SHA-1 state variables in
 ///    the 128-bit vector of [4 x i32] in \a __X, adds that to the next set of
diff --git a/clang/test/CodeGen/X86/amxavx512-builtins.c b/clang/test/CodeGen/X86/amxavx512-builtins.c
index 0f203349b1d1e..d60929994901a 100644
--- a/clang/test/CodeGen/X86/amxavx512-builtins.c
+++ b/clang/test/CodeGen/X86/amxavx512-builtins.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-avx512 \
-// RUN: -target-feature +avx10.2-512 -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s
+// RUN: -target-feature +avx10.2-512 -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression -flax-vector-conversions=none | FileCheck %s
 
 #include <immintrin.h>
 #include <stddef.h>
diff --git a/clang/test/CodeGen/X86/sha-builtins.c b/clang/test/CodeGen/X86/sha-builtins.c
index ede1a6bf7b1f3..44cab43dad9f7 100644
--- a/clang/test/CodeGen/X86/sha-builtins.c
+++ b/clang/test/CodeGen/X86/sha-builtins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +sha -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +sha -emit-llvm -flax-vector-conversions=none -o - | FileCheck %s
 
 
 #include <immintrin.h>

From 29f3a352068ce562bcb65e18a676c82a9991583c Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sun, 16 Feb 2025 20:18:29 +0800
Subject: [PATCH 051/109] [InstCombine] Do not keep samesign when speculatively
 executing icmps (#127007)

Closes https://github.com/llvm/llvm-project/issues/126974.
---
 .../InstCombine/InstCombineCompares.cpp       |  5 ++++
 llvm/test/Transforms/InstCombine/umax-icmp.ll | 24 +++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 96d6db2ba5bfe..76020d2b1dbf4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5637,6 +5637,11 @@ Instruction *InstCombinerImpl::foldICmpWithMinMax(Instruction &I,
       return false;
     return std::nullopt;
   };
+  // Remove samesign here since it is illegal to keep it when we speculatively
+  // execute comparisons. For example, `icmp samesign ult umax(X, -46), -32`
+  // cannot be decomposed into `(icmp samesign ult X, -46) or (icmp samesign ult
+  // -46, -32)`. `X` is allowed to be non-negative here.
+  Pred = static_cast<CmpInst::Predicate>(Pred);
   auto CmpXZ = IsCondKnownTrue(simplifyICmpInst(Pred, X, Z, Q));
   auto CmpYZ = IsCondKnownTrue(simplifyICmpInst(Pred, Y, Z, Q));
   if (!CmpXZ.has_value() && !CmpYZ.has_value())
diff --git a/llvm/test/Transforms/InstCombine/umax-icmp.ll b/llvm/test/Transforms/InstCombine/umax-icmp.ll
index b4eea30bfc6af..0c42d26750e4b 100644
--- a/llvm/test/Transforms/InstCombine/umax-icmp.ll
+++ b/llvm/test/Transforms/InstCombine/umax-icmp.ll
@@ -804,4 +804,28 @@ end:
   ret void
 }
 
+define i1 @pr126974(i8 %x) {
+; CHECK-LABEL: @pr126974(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp sgt i8 [[X:%.*]], -2
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[X]], -1
+; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK:       if.else:
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %cond = icmp sgt i8 %x, -2
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %umax = call i8 @llvm.umax.i8(i8 %x, i8 -46)
+  %cmp = icmp samesign ult i8 %umax, -32
+  ret i1 %cmp
+
+if.else:
+  ret i1 false
+}
+
 declare i32 @llvm.umax.i32(i32, i32)

From 54f37133b7b05f82960f47bc980c4a1aaa7d04df Mon Sep 17 00:00:00 2001
From: Michael Kruse <llvm-project@meinersbur.de>
Date: Sun, 16 Feb 2025 13:25:31 +0100
Subject: [PATCH 052/109] [Flang][NFC] Move runtime library files to flang-rt
 (#110298)

Mostly mechanical changes in preparation of extracting the Flang-RT
"subproject" in #110217. This PR intends to only move pre-existing files
to the new folder structure, with no behavioral change. Common files
(headers, testing, cmake) shared by Flang-RT and Flang remain in
`flang/`.

Some cosmetic changes and file path updates were necessary:
* Relative paths to the new path for the source files and
`add_subdirectory`.
 * Add the new location's include directory to `include_directories`
* The unittest/Evaluate directory has unit tests for flang-rt and Flang. A
new `CMakeLists.txt` was introduced for the flang-rt tests.
 * Change the `#include` paths relative to the include directive
 * clang-format on the `#include` directives
* Since the paths are part of the copyright header and include guards, a
script was used to canonicalize those
* `test/Runtime` and runtime tests in `test/Driver` are moved, but the
lit.cfg.py mechanism to execute them will only be added in #110217.
---
 flang-rt/.clang-format                        | 25 ++++++++
 flang-rt/cmake/config.h.cmake.in              | 19 ++++++
 .../ExternalHelloWorld/external-hello.cpp     |  8 +++
 .../flang-rt/runtime}/allocator-registry.h    |  8 +--
 .../flang-rt/runtime}/array-constructor.h     | 10 ++--
 .../include/flang-rt}/runtime/assign-impl.h   |  8 +--
 .../include/flang-rt}/runtime/buffer.h        | 10 ++--
 .../include/flang-rt}/runtime/connection.h    |  8 +--
 .../include/flang-rt}/runtime/derived.h       |  8 +--
 .../include/flang-rt/runtime}/descriptor.h    | 12 ++--
 .../include/flang-rt}/runtime/emit-encoded.h  |  8 +--
 .../include/flang-rt}/runtime/environment.h   |  8 +--
 .../include/flang-rt}/runtime/file.h          | 10 ++--
 .../flang-rt}/runtime/format-implementation.h |  8 +--
 .../include/flang-rt}/runtime/format.h        |  8 +--
 .../include/flang-rt}/runtime/internal-unit.h | 10 ++--
 .../include/flang-rt}/runtime/io-error.h      | 10 ++--
 .../include/flang-rt}/runtime/io-stmt.h       | 10 ++--
 .../include/flang-rt}/runtime/lock.h          |  8 +--
 .../include/flang-rt/runtime}/memory.h        |  8 +--
 .../include/flang-rt}/runtime/namelist.h      |  8 +--
 .../include/flang-rt}/runtime/non-tbp-dio.h   |  8 +--
 .../flang-rt}/runtime/numeric-templates.h     |  8 +--
 .../flang-rt}/runtime/random-templates.h      | 10 ++--
 .../flang-rt}/runtime/reduction-templates.h   | 10 ++--
 .../include/flang-rt}/runtime/stat.h          |  8 +--
 .../include/flang-rt}/runtime/terminator.h    |  8 +--
 .../include/flang-rt}/runtime/tools.h         | 14 ++---
 .../include/flang-rt/runtime}/type-code.h     |  8 +--
 .../include/flang-rt}/runtime/type-info.h     | 10 ++--
 .../include/flang-rt}/runtime/utf.h           |  8 +--
 .../lib/cuda}/allocatable.cpp                 |  9 +--
 .../CUDA => flang-rt/lib/cuda}/allocator.cpp  | 12 ++--
 .../CUDA => flang-rt/lib/cuda}/descriptor.cpp |  6 +-
 .../CUDA => flang-rt/lib/cuda}/init.cpp       |  6 +-
 .../CUDA => flang-rt/lib/cuda}/kernel.cpp     |  4 +-
 .../lib/cuda}/memmove-function.cpp            |  4 +-
 .../CUDA => flang-rt/lib/cuda}/memory.cpp     |  6 +-
 .../CUDA => flang-rt/lib/cuda}/pointer.cpp    |  9 +--
 .../lib/cuda}/registration.cpp                |  4 +-
 .../lib/quadmath}/acos.cpp                    |  2 +-
 .../lib/quadmath}/acosh.cpp                   |  2 +-
 .../lib/quadmath}/asin.cpp                    |  2 +-
 .../lib/quadmath}/asinh.cpp                   |  2 +-
 .../lib/quadmath}/atan.cpp                    |  2 +-
 .../lib/quadmath}/atan2.cpp                   |  2 +-
 .../lib/quadmath}/atanh.cpp                   |  2 +-
 .../lib/quadmath}/ceil.cpp                    |  2 +-
 .../lib/quadmath}/complex-math.c              |  5 +-
 .../lib/quadmath}/complex-math.h              |  8 +--
 .../lib/quadmath}/cos.cpp                     |  2 +-
 .../lib/quadmath}/cosh.cpp                    |  2 +-
 .../lib/quadmath}/erf.cpp                     |  2 +-
 .../lib/quadmath}/erfc.cpp                    |  2 +-
 .../lib/quadmath}/exp.cpp                     |  2 +-
 .../lib/quadmath}/exponent.cpp                |  2 +-
 .../lib/quadmath}/floor.cpp                   |  2 +-
 .../lib/quadmath}/fma.cpp                     |  2 +-
 .../lib/quadmath}/fraction.cpp                |  2 +-
 .../lib/quadmath}/hypot.cpp                   |  2 +-
 .../lib/quadmath}/j0.cpp                      |  2 +-
 .../lib/quadmath}/j1.cpp                      |  2 +-
 .../lib/quadmath}/jn.cpp                      |  2 +-
 .../lib/quadmath}/lgamma.cpp                  |  2 +-
 .../lib/quadmath}/llround.cpp                 |  2 +-
 .../lib/quadmath}/log.cpp                     |  2 +-
 .../lib/quadmath}/log10.cpp                   |  2 +-
 .../lib/quadmath}/lround.cpp                  |  2 +-
 .../lib/quadmath}/math-entries.h              | 13 ++--
 .../lib/quadmath}/mod-real.cpp                |  2 +-
 .../lib/quadmath}/modulo-real.cpp             |  2 +-
 .../lib/quadmath}/nearbyint.cpp               |  2 +-
 .../lib/quadmath}/nearest.cpp                 |  2 +-
 .../lib/quadmath}/norm2.cpp                   |  4 +-
 .../lib/quadmath}/numeric-template-specs.h    | 10 ++--
 .../lib/quadmath}/pow.cpp                     |  2 +-
 .../lib/quadmath}/random.cpp                  |  4 +-
 .../lib/quadmath}/remainder.cpp               |  2 +-
 .../lib/quadmath}/round.cpp                   |  2 +-
 .../lib/quadmath}/rrspacing.cpp               |  2 +-
 .../lib/quadmath}/scale.cpp                   |  2 +-
 .../lib/quadmath}/set-exponent.cpp            |  2 +-
 .../lib/quadmath}/sin.cpp                     |  2 +-
 .../lib/quadmath}/sinh.cpp                    |  2 +-
 .../lib/quadmath}/spacing.cpp                 |  2 +-
 .../lib/quadmath}/sqrt.cpp                    |  2 +-
 .../lib/quadmath}/tan.cpp                     |  2 +-
 .../lib/quadmath}/tanh.cpp                    |  2 +-
 .../lib/quadmath}/tgamma.cpp                  |  2 +-
 .../lib/quadmath}/trunc.cpp                   |  2 +-
 .../lib/quadmath}/y0.cpp                      |  2 +-
 .../lib/quadmath}/y1.cpp                      |  2 +-
 .../lib/quadmath}/yn.cpp                      |  2 +-
 .../lib}/runtime/ISO_Fortran_binding.cpp      |  8 +--
 .../lib}/runtime/ISO_Fortran_util.h           | 14 ++---
 .../lib}/runtime/allocatable.cpp              | 14 ++---
 .../lib}/runtime/allocator-registry.cpp       |  6 +-
 .../lib}/runtime/array-constructor.cpp        | 14 ++---
 {flang => flang-rt/lib}/runtime/assign.cpp    | 16 ++---
 {flang => flang-rt/lib}/runtime/buffer.cpp    |  4 +-
 {flang => flang-rt/lib}/runtime/character.cpp |  8 +--
 {flang => flang-rt/lib}/runtime/command.cpp   | 12 ++--
 .../lib}/runtime/complex-powi.cpp             | 16 ++---
 .../lib}/runtime/complex-reduction.c          |  5 +-
 .../lib}/runtime/complex-reduction.h          | 11 ++--
 .../lib}/runtime/connection.cpp               |  8 +--
 {flang => flang-rt/lib}/runtime/copy.cpp      |  8 +--
 {flang => flang-rt/lib}/runtime/copy.h        | 10 ++--
 .../lib}/runtime/derived-api.cpp              | 13 ++--
 {flang => flang-rt/lib}/runtime/derived.cpp   | 14 ++---
 .../lib}/runtime/descriptor-io.cpp            |  2 +-
 .../lib}/runtime/descriptor-io.h              | 18 +++---
 .../lib}/runtime/descriptor.cpp               | 22 +++----
 .../lib}/runtime/dot-product.cpp              |  8 +--
 .../lib}/runtime/edit-input.cpp               |  6 +-
 {flang => flang-rt/lib}/runtime/edit-input.h  | 12 ++--
 .../lib}/runtime/edit-output.cpp              |  6 +-
 {flang => flang-rt/lib}/runtime/edit-output.h | 12 ++--
 .../lib}/runtime/environment-default-list.h   | 11 ++--
 .../lib}/runtime/environment.cpp              |  6 +-
 .../lib}/runtime/exceptions.cpp               |  4 +-
 {flang => flang-rt/lib}/runtime/execute.cpp   | 12 ++--
 .../lib}/runtime/extensions.cpp               |  8 +--
 .../lib}/runtime/external-unit.cpp            |  8 +--
 {flang => flang-rt/lib}/runtime/extrema.cpp   |  4 +-
 {flang => flang-rt/lib}/runtime/file.cpp      |  8 +--
 {flang => flang-rt/lib}/runtime/findloc.cpp   |  4 +-
 {flang => flang-rt/lib}/runtime/format.cpp    |  4 +-
 {flang => flang-rt/lib}/runtime/inquiry.cpp   |  8 +--
 .../lib}/runtime/internal-unit.cpp            |  8 +--
 .../lib}/runtime/io-api-common.h              | 12 ++--
 .../lib}/runtime/io-api-minimal.cpp           | 12 ++--
 {flang => flang-rt/lib}/runtime/io-api.cpp    | 16 ++---
 {flang => flang-rt/lib}/runtime/io-error.cpp  |  6 +-
 {flang => flang-rt/lib}/runtime/io-stmt.cpp   | 16 ++---
 {flang => flang-rt/lib}/runtime/iostat.cpp    |  2 +-
 {flang => flang-rt/lib}/runtime/main.cpp      |  6 +-
 .../lib}/runtime/matmul-transpose.cpp         |  8 +--
 {flang => flang-rt/lib}/runtime/matmul.cpp    |  8 +--
 {flang => flang-rt/lib}/runtime/memory.cpp    |  8 +--
 .../lib}/runtime/misc-intrinsic.cpp           |  8 +--
 {flang => flang-rt/lib}/runtime/namelist.cpp  |  8 +--
 .../lib}/runtime/non-tbp-dio.cpp              |  6 +-
 {flang => flang-rt/lib}/runtime/numeric.cpp   |  8 +--
 {flang => flang-rt/lib}/runtime/pointer.cpp   | 16 ++---
 {flang => flang-rt/lib}/runtime/product.cpp   |  4 +-
 .../lib}/runtime/pseudo-unit.cpp              |  6 +-
 {flang => flang-rt/lib}/runtime/ragged.cpp    |  4 +-
 {flang => flang-rt/lib}/runtime/random.cpp    | 10 ++--
 {flang => flang-rt/lib}/runtime/reduce.cpp    | 10 ++--
 {flang => flang-rt/lib}/runtime/reduction.cpp |  6 +-
 {flang => flang-rt/lib}/runtime/stack.h       | 12 ++--
 {flang => flang-rt/lib}/runtime/stat.cpp      | 10 ++--
 {flang => flang-rt/lib}/runtime/stop.cpp      | 10 ++--
 {flang => flang-rt/lib}/runtime/sum.cpp       |  4 +-
 {flang => flang-rt/lib}/runtime/support.cpp   |  6 +-
 .../lib}/runtime/temporary-stack.cpp          |  8 +--
 .../lib}/runtime/terminator.cpp               |  4 +-
 .../lib}/runtime/time-intrinsic.cpp           |  8 +--
 {flang => flang-rt/lib}/runtime/tools.cpp     |  6 +-
 .../lib}/runtime/transformational.cpp         |  8 +--
 {flang => flang-rt/lib}/runtime/type-code.cpp |  4 +-
 {flang => flang-rt/lib}/runtime/type-info.cpp | 10 ++--
 {flang => flang-rt/lib}/runtime/unit-map.cpp  |  2 +-
 {flang => flang-rt/lib}/runtime/unit-map.h    | 12 ++--
 {flang => flang-rt/lib}/runtime/unit.cpp      |  8 +--
 {flang => flang-rt/lib}/runtime/unit.h        | 30 +++++-----
 {flang => flang-rt/lib}/runtime/utf.cpp       |  4 +-
 .../test/Driver/ctofortran.f90                |  0
 {flang => flang-rt}/test/Driver/exec.f90      |  0
 {flang => flang-rt}/test/Runtime/no-cpp-dep.c |  0
 .../Evaluate/ISO-Fortran-binding.cpp          | 10 +++-
 .../unittests/Evaluate/reshape.cpp            | 10 +++-
 .../unittests/Runtime/AccessTest.cpp          |  2 +-
 .../unittests/Runtime/Allocatable.cpp         |  4 +-
 .../unittests/Runtime/ArrayConstructor.cpp    | 10 ++--
 .../unittests/Runtime/BufferTest.cpp          |  4 +-
 .../unittests/Runtime/CUDA/Allocatable.cpp    | 10 ++--
 .../unittests/Runtime/CUDA/AllocatorCUF.cpp   | 10 ++--
 .../unittests/Runtime/CUDA/Memory.cpp         | 11 ++--
 .../unittests/Runtime/CharacterTest.cpp       |  4 +-
 .../unittests/Runtime/CommandTest.cpp         |  4 +-
 .../unittests/Runtime/Complex.cpp             |  3 +-
 .../unittests/Runtime/CrashHandlerFixture.cpp |  5 +-
 .../unittests/Runtime/CrashHandlerFixture.h   |  9 +--
 .../unittests/Runtime/Derived.cpp             |  6 +-
 .../unittests/Runtime/ExternalIOTest.cpp      |  4 +-
 .../unittests/Runtime/Format.cpp              |  8 +--
 .../unittests/Runtime/Inquiry.cpp             |  6 +-
 .../unittests/Runtime/ListInputTest.cpp       |  6 +-
 .../unittests/Runtime/LogicalFormatTest.cpp   |  4 +-
 .../unittests/Runtime/Matmul.cpp              |  8 +--
 .../unittests/Runtime/MatmulTranspose.cpp     |  8 +--
 .../unittests/Runtime/MiscIntrinsic.cpp       |  8 +--
 .../unittests/Runtime/Namelist.cpp            |  6 +-
 .../unittests/Runtime/Numeric.cpp             |  2 +-
 .../unittests/Runtime/NumericalFormatTest.cpp |  4 +-
 .../unittests/Runtime/Pointer.cpp             |  6 +-
 .../unittests/Runtime/Ragged.cpp              |  2 +-
 .../unittests/Runtime/Random.cpp              |  8 +--
 .../unittests/Runtime/Reduction.cpp           |  8 +--
 .../unittests/Runtime/RuntimeCrashTest.cpp    |  4 +-
 .../unittests/Runtime/Stop.cpp                |  5 +-
 .../unittests/Runtime/Support.cpp             |  6 +-
 .../unittests/Runtime/TemporaryStack.cpp      |  8 +--
 .../unittests/Runtime/Time.cpp                |  2 +-
 .../unittests/Runtime/Transformational.cpp    |  6 +-
 {flang => flang-rt}/unittests/Runtime/tools.h | 12 ++--
 flang/CMakeLists.txt                          |  1 +
 .../ExternalHelloWorld/CMakeLists.txt         |  2 +-
 flang/include/flang/Runtime/allocatable.h     |  2 +-
 .../include/flang/Runtime/descriptor-consts.h |  2 +
 flang/include/flang/Runtime/pointer.h         |  2 +-
 flang/runtime/CMakeLists.txt                  | 29 ++++++++-
 flang/runtime/CUDA/CMakeLists.txt             |  7 ++-
 flang/runtime/Float128Math/CMakeLists.txt     |  1 +
 flang/runtime/config.h.cmake                  | 16 -----
 flang/unittests/CMakeLists.txt                | 10 +++-
 flang/unittests/Runtime/CMakeLists.txt        | 60 +++++++++----------
 flang/unittests/Runtime/CUDA/CMakeLists.txt   |  6 +-
 220 files changed, 825 insertions(+), 726 deletions(-)
 create mode 100644 flang-rt/.clang-format
 create mode 100644 flang-rt/cmake/config.h.cmake.in
 rename {flang => flang-rt}/examples/ExternalHelloWorld/external-hello.cpp (80%)
 rename {flang/include/flang/Runtime => flang-rt/include/flang-rt/runtime}/allocator-registry.h (87%)
 rename {flang/include/flang/Runtime => flang-rt/include/flang-rt/runtime}/array-constructor.h (89%)
 rename {flang => flang-rt/include/flang-rt}/runtime/assign-impl.h (85%)
 rename {flang => flang-rt/include/flang-rt}/runtime/buffer.h (97%)
 rename {flang => flang-rt/include/flang-rt}/runtime/connection.h (96%)
 rename {flang => flang-rt/include/flang-rt}/runtime/derived.h (90%)
 rename {flang/include/flang/Runtime => flang-rt/include/flang-rt/runtime}/descriptor.h (98%)
 rename {flang => flang-rt/include/flang-rt}/runtime/emit-encoded.h (94%)
 rename {flang => flang-rt/include/flang-rt}/runtime/environment.h (91%)
 rename {flang => flang-rt/include/flang-rt}/runtime/file.h (95%)
 rename {flang => flang-rt/include/flang-rt}/runtime/format-implementation.h (98%)
 rename {flang => flang-rt/include/flang-rt}/runtime/format.h (97%)
 rename {flang => flang-rt/include/flang-rt}/runtime/internal-unit.h (89%)
 rename {flang => flang-rt/include/flang-rt}/runtime/io-error.h (92%)
 rename {flang => flang-rt/include/flang-rt}/runtime/io-stmt.h (99%)
 rename {flang => flang-rt/include/flang-rt}/runtime/lock.h (94%)
 rename {flang/include/flang/Runtime => flang-rt/include/flang-rt/runtime}/memory.h (96%)
 rename {flang => flang-rt/include/flang-rt}/runtime/namelist.h (91%)
 rename {flang => flang-rt/include/flang-rt}/runtime/non-tbp-dio.h (91%)
 rename {flang => flang-rt/include/flang-rt}/runtime/numeric-templates.h (98%)
 rename {flang => flang-rt/include/flang-rt}/runtime/random-templates.h (93%)
 rename {flang => flang-rt/include/flang-rt}/runtime/reduction-templates.h (98%)
 rename {flang => flang-rt/include/flang-rt}/runtime/stat.h (93%)
 rename {flang => flang-rt/include/flang-rt}/runtime/terminator.h (95%)
 rename {flang => flang-rt/include/flang-rt}/runtime/tools.h (98%)
 rename {flang/include/flang/Runtime => flang-rt/include/flang-rt/runtime}/type-code.h (93%)
 rename {flang => flang-rt/include/flang-rt}/runtime/type-info.h (98%)
 rename {flang => flang-rt/include/flang-rt}/runtime/utf.h (94%)
 rename {flang/runtime/CUDA => flang-rt/lib/cuda}/allocatable.cpp (94%)
 rename {flang/runtime/CUDA => flang-rt/lib/cuda}/allocator.cpp (86%)
 rename {flang/runtime/CUDA => flang-rt/lib/cuda}/descriptor.cpp (91%)
 rename {flang/runtime/CUDA => flang-rt/lib/cuda}/init.cpp (81%)
 rename {flang/runtime/CUDA => flang-rt/lib/cuda}/kernel.cpp (98%)
 rename {flang/runtime/CUDA => flang-rt/lib/cuda}/memmove-function.cpp (90%)
 rename {flang/runtime/CUDA => flang-rt/lib/cuda}/memory.cpp (96%)
 rename {flang/runtime/CUDA => flang-rt/lib/cuda}/pointer.cpp (92%)
 rename {flang/runtime/CUDA => flang-rt/lib/cuda}/registration.cpp (92%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/acos.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/acosh.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/asin.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/asinh.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/atan.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/atan2.cpp (88%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/atanh.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/ceil.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/complex-math.c (92%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/complex-math.h (88%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/cos.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/cosh.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/erf.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/erfc.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/exp.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/exponent.cpp (90%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/floor.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/fma.cpp (89%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/fraction.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/hypot.cpp (88%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/j0.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/j1.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/jn.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/lgamma.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/llround.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/log.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/log10.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/lround.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/math-entries.h (96%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/mod-real.cpp (88%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/modulo-real.cpp (89%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/nearbyint.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/nearest.cpp (88%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/norm2.cpp (89%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/numeric-template-specs.h (82%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/pow.cpp (88%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/random.cpp (83%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/remainder.cpp (88%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/round.cpp (89%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/rrspacing.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/scale.cpp (90%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/set-exponent.cpp (88%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/sin.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/sinh.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/spacing.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/sqrt.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/tan.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/tanh.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/tgamma.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/trunc.cpp (89%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/y0.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/y1.cpp (87%)
 rename {flang/runtime/Float128Math => flang-rt/lib/quadmath}/yn.cpp (87%)
 rename {flang => flang-rt/lib}/runtime/ISO_Fortran_binding.cpp (97%)
 rename {flang => flang-rt/lib}/runtime/ISO_Fortran_util.h (90%)
 rename {flang => flang-rt/lib}/runtime/allocatable.cpp (96%)
 rename {flang => flang-rt/lib}/runtime/allocator-registry.cpp (87%)
 rename {flang => flang-rt/lib}/runtime/array-constructor.cpp (96%)
 rename {flang => flang-rt/lib}/runtime/assign.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/buffer.cpp (88%)
 rename {flang => flang-rt/lib}/runtime/character.cpp (99%)
 rename {flang => flang-rt/lib}/runtime/command.cpp (96%)
 rename {flang => flang-rt/lib}/runtime/complex-powi.cpp (92%)
 rename {flang => flang-rt/lib}/runtime/complex-reduction.c (97%)
 rename {flang => flang-rt/lib}/runtime/complex-reduction.h (96%)
 rename {flang => flang-rt/lib}/runtime/connection.cpp (90%)
 rename {flang => flang-rt/lib}/runtime/copy.cpp (97%)
 rename {flang => flang-rt/lib}/runtime/copy.h (78%)
 rename {flang => flang-rt/lib}/runtime/derived-api.cpp (95%)
 rename {flang => flang-rt/lib}/runtime/derived.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/descriptor-io.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/descriptor-io.h (98%)
 rename {flang => flang-rt/lib}/runtime/descriptor.cpp (94%)
 rename {flang => flang-rt/lib}/runtime/dot-product.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/edit-input.cpp (99%)
 rename {flang => flang-rt/lib}/runtime/edit-input.h (87%)
 rename {flang => flang-rt/lib}/runtime/edit-output.cpp (99%)
 rename {flang => flang-rt/lib}/runtime/edit-output.h (95%)
 rename {flang => flang-rt/lib}/runtime/environment-default-list.h (70%)
 rename {flang => flang-rt/lib}/runtime/environment.cpp (96%)
 rename {flang => flang-rt/lib}/runtime/exceptions.cpp (97%)
 rename {flang => flang-rt/lib}/runtime/execute.cpp (97%)
 rename {flang => flang-rt/lib}/runtime/extensions.cpp (97%)
 rename {flang => flang-rt/lib}/runtime/external-unit.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/extrema.cpp (99%)
 rename {flang => flang-rt/lib}/runtime/file.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/findloc.cpp (99%)
 rename {flang => flang-rt/lib}/runtime/format.cpp (87%)
 rename {flang => flang-rt/lib}/runtime/inquiry.cpp (94%)
 rename {flang => flang-rt/lib}/runtime/internal-unit.cpp (96%)
 rename {flang => flang-rt/lib}/runtime/io-api-common.h (92%)
 rename {flang => flang-rt/lib}/runtime/io-api-minimal.cpp (94%)
 rename {flang => flang-rt/lib}/runtime/io-api.cpp (99%)
 rename {flang => flang-rt/lib}/runtime/io-error.cpp (96%)
 rename {flang => flang-rt/lib}/runtime/io-stmt.cpp (99%)
 rename {flang => flang-rt/lib}/runtime/iostat.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/main.cpp (89%)
 rename {flang => flang-rt/lib}/runtime/matmul-transpose.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/matmul.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/memory.cpp (85%)
 rename {flang => flang-rt/lib}/runtime/misc-intrinsic.cpp (95%)
 rename {flang => flang-rt/lib}/runtime/namelist.cpp (99%)
 rename {flang => flang-rt/lib}/runtime/non-tbp-dio.cpp (86%)
 rename {flang => flang-rt/lib}/runtime/numeric.cpp (99%)
 rename {flang => flang-rt/lib}/runtime/pointer.cpp (96%)
 rename {flang => flang-rt/lib}/runtime/product.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/pseudo-unit.cpp (97%)
 rename {flang => flang-rt/lib}/runtime/ragged.cpp (96%)
 rename {flang => flang-rt/lib}/runtime/random.cpp (96%)
 rename {flang => flang-rt/lib}/runtime/reduce.cpp (99%)
 rename {flang => flang-rt/lib}/runtime/reduction.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/stack.h (93%)
 rename {flang => flang-rt/lib}/runtime/stat.cpp (92%)
 rename {flang => flang-rt/lib}/runtime/stop.cpp (95%)
 rename {flang => flang-rt/lib}/runtime/sum.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/support.cpp (90%)
 rename {flang => flang-rt/lib}/runtime/temporary-stack.cpp (97%)
 rename {flang => flang-rt/lib}/runtime/terminator.cpp (95%)
 rename {flang => flang-rt/lib}/runtime/time-intrinsic.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/tools.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/transformational.cpp (99%)
 rename {flang => flang-rt/lib}/runtime/type-code.cpp (98%)
 rename {flang => flang-rt/lib}/runtime/type-info.cpp (97%)
 rename {flang => flang-rt/lib}/runtime/unit-map.cpp (97%)
 rename {flang => flang-rt/lib}/runtime/unit-map.h (92%)
 rename {flang => flang-rt/lib}/runtime/unit.cpp (99%)
 rename {flang => flang-rt/lib}/runtime/unit.h (95%)
 rename {flang => flang-rt/lib}/runtime/utf.cpp (97%)
 rename {flang => flang-rt}/test/Driver/ctofortran.f90 (100%)
 rename {flang => flang-rt}/test/Driver/exec.f90 (100%)
 rename {flang => flang-rt}/test/Runtime/no-cpp-dep.c (100%)
 rename {flang => flang-rt}/unittests/Evaluate/ISO-Fortran-binding.cpp (98%)
 rename {flang => flang-rt}/unittests/Evaluate/reshape.cpp (88%)
 rename {flang => flang-rt}/unittests/Runtime/AccessTest.cpp (99%)
 rename {flang => flang-rt}/unittests/Runtime/Allocatable.cpp (97%)
 rename {flang => flang-rt}/unittests/Runtime/ArrayConstructor.cpp (96%)
 rename {flang => flang-rt}/unittests/Runtime/BufferTest.cpp (97%)
 rename {flang => flang-rt}/unittests/Runtime/CUDA/Allocatable.cpp (90%)
 rename {flang => flang-rt}/unittests/Runtime/CUDA/AllocatorCUF.cpp (92%)
 rename {flang => flang-rt}/unittests/Runtime/CUDA/Memory.cpp (93%)
 rename {flang => flang-rt}/unittests/Runtime/CharacterTest.cpp (99%)
 rename {flang => flang-rt}/unittests/Runtime/CommandTest.cpp (99%)
 rename {flang => flang-rt}/unittests/Runtime/Complex.cpp (98%)
 rename {flang => flang-rt}/unittests/Runtime/CrashHandlerFixture.cpp (92%)
 rename {flang => flang-rt}/unittests/Runtime/CrashHandlerFixture.h (70%)
 rename {flang => flang-rt}/unittests/Runtime/Derived.cpp (93%)
 rename {flang => flang-rt}/unittests/Runtime/ExternalIOTest.cpp (99%)
 rename {flang => flang-rt}/unittests/Runtime/Format.cpp (96%)
 rename {flang => flang-rt}/unittests/Runtime/Inquiry.cpp (97%)
 rename {flang => flang-rt}/unittests/Runtime/ListInputTest.cpp (98%)
 rename {flang => flang-rt}/unittests/Runtime/LogicalFormatTest.cpp (94%)
 rename {flang => flang-rt}/unittests/Runtime/Matmul.cpp (98%)
 rename {flang => flang-rt}/unittests/Runtime/MatmulTranspose.cpp (98%)
 rename {flang => flang-rt}/unittests/Runtime/MiscIntrinsic.cpp (96%)
 rename {flang => flang-rt}/unittests/Runtime/Namelist.cpp (99%)
 rename {flang => flang-rt}/unittests/Runtime/Numeric.cpp (99%)
 rename {flang => flang-rt}/unittests/Runtime/NumericalFormatTest.cpp (99%)
 rename {flang => flang-rt}/unittests/Runtime/Pointer.cpp (97%)
 rename {flang => flang-rt}/unittests/Runtime/Ragged.cpp (94%)
 rename {flang => flang-rt}/unittests/Runtime/Random.cpp (92%)
 rename {flang => flang-rt}/unittests/Runtime/Reduction.cpp (99%)
 rename {flang => flang-rt}/unittests/Runtime/RuntimeCrashTest.cpp (98%)
 rename {flang => flang-rt}/unittests/Runtime/Stop.cpp (96%)
 rename {flang => flang-rt}/unittests/Runtime/Support.cpp (96%)
 rename {flang => flang-rt}/unittests/Runtime/TemporaryStack.cpp (98%)
 rename {flang => flang-rt}/unittests/Runtime/Time.cpp (98%)
 rename {flang => flang-rt}/unittests/Runtime/Transformational.cpp (99%)
 rename {flang => flang-rt}/unittests/Runtime/tools.h (85%)
 delete mode 100644 flang/runtime/config.h.cmake

diff --git a/flang-rt/.clang-format b/flang-rt/.clang-format
new file mode 100644
index 0000000000000..23f4c5ae2dcf2
--- /dev/null
+++ b/flang-rt/.clang-format
@@ -0,0 +1,25 @@
+---
+# See: https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+BasedOnStyle: LLVM
+AlignAfterOpenBracket: DontAlign
+AlignEscapedNewlines: DontAlign
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignOperands: false
+AlignTrailingComments: false
+IncludeCategories:
+  - Regex:           '^<'
+    Priority:        6
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority:        5
+  - Regex:           '^"flang/'     # Headers shared with Flang
+    Priority:        4
+  - Regex:           '^"flang-rt/'  # Public Flang-RT headers
+    Priority:        3
+  - Regex:           '^[^/]*$'      # Private headers from same library
+    Priority:        1
+  - Regex:           '.*'           # Private headers from sibling libraries
+    Priority:        2
+...
+
+# vim:set filetype=yaml:
diff --git a/flang-rt/cmake/config.h.cmake.in b/flang-rt/cmake/config.h.cmake.in
new file mode 100644
index 0000000000000..8a4668b90addd
--- /dev/null
+++ b/flang-rt/cmake/config.h.cmake.in
@@ -0,0 +1,19 @@
+/*===-- cmake/config.h.cmake.in -------------------------------------*- C -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===----------------------------------------------------------------------===*/
+
+#ifndef FORTRAN_RUNTIME_CONFIG_H
+#define FORTRAN_RUNTIME_CONFIG_H
+
+/* Define to 1 if you have the `strerror_r' function. */
+#cmakedefine01 HAVE_STRERROR_R
+
+/* Define to 1 if you have the declaration of `strerror_s', and to 0 if you
+   don't. */
+#cmakedefine01 HAVE_DECL_STRERROR_S
+
+#endif
diff --git a/flang/examples/ExternalHelloWorld/external-hello.cpp b/flang-rt/examples/ExternalHelloWorld/external-hello.cpp
similarity index 80%
rename from flang/examples/ExternalHelloWorld/external-hello.cpp
rename to flang-rt/examples/ExternalHelloWorld/external-hello.cpp
index 4991bf9eba999..7c8a12476295c 100644
--- a/flang/examples/ExternalHelloWorld/external-hello.cpp
+++ b/flang-rt/examples/ExternalHelloWorld/external-hello.cpp
@@ -1,3 +1,11 @@
+//===-- examples/ExternalHelloWorld/external-hello.cpp ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include "flang/Runtime/io-api.h"
 #include "flang/Runtime/main.h"
 #include "flang/Runtime/stop.h"
diff --git a/flang/include/flang/Runtime/allocator-registry.h b/flang-rt/include/flang-rt/runtime/allocator-registry.h
similarity index 87%
rename from flang/include/flang/Runtime/allocator-registry.h
rename to flang-rt/include/flang-rt/runtime/allocator-registry.h
index 29302c5d825bc..1a59ec8b1ef5b 100644
--- a/flang/include/flang/Runtime/allocator-registry.h
+++ b/flang-rt/include/flang-rt/runtime/allocator-registry.h
@@ -1,4 +1,4 @@
-//===-- runtime/allocator-registry.h ----------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/allocator-registry.h -----------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_ALLOCATOR_REGISTRY_H_
-#define FORTRAN_RUNTIME_ALLOCATOR_REGISTRY_H_
+#ifndef FLANG_RT_RUNTIME_ALLOCATOR_REGISTRY_H_
+#define FLANG_RT_RUNTIME_ALLOCATOR_REGISTRY_H_
 
 #include "flang/Common/api-attrs.h"
 #include "flang/Runtime/allocator-registry-consts.h"
@@ -55,4 +55,4 @@ RT_OFFLOAD_VAR_GROUP_END
 
 } // namespace Fortran::runtime
 
-#endif // FORTRAN_RUNTIME_ALLOCATOR_REGISTRY_H_
+#endif // FLANG_RT_RUNTIME_ALLOCATOR_REGISTRY_H_
diff --git a/flang/include/flang/Runtime/array-constructor.h b/flang-rt/include/flang-rt/runtime/array-constructor.h
similarity index 89%
rename from flang/include/flang/Runtime/array-constructor.h
rename to flang-rt/include/flang-rt/runtime/array-constructor.h
index 2f6aaae17c650..9c037177161c0 100644
--- a/flang/include/flang/Runtime/array-constructor.h
+++ b/flang-rt/include/flang-rt/runtime/array-constructor.h
@@ -1,4 +1,4 @@
-//===-- include/flang/Runtime/array-constructor.h ---------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/array-constructor.h ------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,11 +9,11 @@
 // External APIs to create temporary storage for array constructors when their
 // final extents or length parameters cannot be pre-computed.
 
-#ifndef FORTRAN_RUNTIME_ARRAYCONSTRUCTOR_H_
-#define FORTRAN_RUNTIME_ARRAYCONSTRUCTOR_H_
+#ifndef FLANG_RT_RUNTIME_ARRAY_CONSTRUCTOR_H_
+#define FLANG_RT_RUNTIME_ARRAY_CONSTRUCTOR_H_
 
+#include "descriptor.h"
 #include "flang/Runtime/array-constructor-consts.h"
-#include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/entry-names.h"
 #include <cstdint>
 
@@ -54,4 +54,4 @@ static_assert(alignof(Fortran::runtime::ArrayConstructorVector) <=
     "MaxArrayConstructorVectorAlignInBytes");
 
 } // namespace Fortran::runtime
-#endif // FORTRAN_RUNTIME_ARRAYCONSTRUCTOR_H_
+#endif // FLANG_RT_RUNTIME_ARRAY_CONSTRUCTOR_H_
diff --git a/flang/runtime/assign-impl.h b/flang-rt/include/flang-rt/runtime/assign-impl.h
similarity index 85%
rename from flang/runtime/assign-impl.h
rename to flang-rt/include/flang-rt/runtime/assign-impl.h
index aaa320ef7f959..cc931f6713cd8 100644
--- a/flang/runtime/assign-impl.h
+++ b/flang-rt/include/flang-rt/runtime/assign-impl.h
@@ -1,4 +1,4 @@
-//===-- runtime/assign-impl.h -----------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/assign-impl.h ------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_ASSIGN_IMPL_H_
-#define FORTRAN_RUNTIME_ASSIGN_IMPL_H_
+#ifndef FLANG_RT_RUNTIME_ASSIGN_IMPL_H_
+#define FLANG_RT_RUNTIME_ASSIGN_IMPL_H_
 
 #include "flang/Runtime/freestanding-tools.h"
 
@@ -28,4 +28,4 @@ RT_API_ATTRS void DoFromSourceAssign(Descriptor &, const Descriptor &,
 #endif
 
 } // namespace Fortran::runtime
-#endif // FORTRAN_RUNTIME_ASSIGN_IMPL_H_
+#endif // FLANG_RT_RUNTIME_ASSIGN_IMPL_H_
diff --git a/flang/runtime/buffer.h b/flang-rt/include/flang-rt/runtime/buffer.h
similarity index 97%
rename from flang/runtime/buffer.h
rename to flang-rt/include/flang-rt/runtime/buffer.h
index 41a1abb1b2d90..b5a9ce9e35e91 100644
--- a/flang/runtime/buffer.h
+++ b/flang-rt/include/flang-rt/runtime/buffer.h
@@ -1,4 +1,4 @@
-//===-- runtime/buffer.h ----------------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/buffer.h -----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,12 +8,12 @@
 
 // External file buffering
 
-#ifndef FORTRAN_RUNTIME_BUFFER_H_
-#define FORTRAN_RUNTIME_BUFFER_H_
+#ifndef FLANG_RT_RUNTIME_BUFFER_H_
+#define FLANG_RT_RUNTIME_BUFFER_H_
 
 #include "io-error.h"
+#include "memory.h"
 #include "flang/Runtime/freestanding-tools.h"
-#include "flang/Runtime/memory.h"
 #include <algorithm>
 #include <cinttypes>
 #include <cstring>
@@ -221,4 +221,4 @@ template <typename STORE, std::size_t minBuffer = 65536> class FileFrame {
   bool dirty_{false};
 };
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_BUFFER_H_
+#endif // FLANG_RT_RUNTIME_BUFFER_H_
diff --git a/flang/runtime/connection.h b/flang-rt/include/flang-rt/runtime/connection.h
similarity index 96%
rename from flang/runtime/connection.h
rename to flang-rt/include/flang-rt/runtime/connection.h
index 6f1ea90a160e5..03d9658e7067b 100644
--- a/flang/runtime/connection.h
+++ b/flang-rt/include/flang-rt/runtime/connection.h
@@ -1,4 +1,4 @@
-//===-- runtime/connection.h ------------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/connection.h -------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,8 +8,8 @@
 
 // Fortran I/O connection state (abstracted over internal & external units)
 
-#ifndef FORTRAN_RUNTIME_IO_CONNECTION_H_
-#define FORTRAN_RUNTIME_IO_CONNECTION_H_
+#ifndef FLANG_RT_RUNTIME_CONNECTION_H_
+#define FLANG_RT_RUNTIME_CONNECTION_H_
 
 #include "format.h"
 #include "flang/Common/optional.h"
@@ -124,4 +124,4 @@ class SavedPosition {
 };
 
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_IO_CONNECTION_H_
+#endif // FLANG_RT_RUNTIME_CONNECTION_H_
diff --git a/flang/runtime/derived.h b/flang-rt/include/flang-rt/runtime/derived.h
similarity index 90%
rename from flang/runtime/derived.h
rename to flang-rt/include/flang-rt/runtime/derived.h
index f5a1e219b848c..ac6962c57168c 100644
--- a/flang/runtime/derived.h
+++ b/flang-rt/include/flang-rt/runtime/derived.h
@@ -1,4 +1,4 @@
-//===-- runtime/derived.h -------------------------------------------------===//
+//===-- include/flang-rt/runtime/derived.h ----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,8 +8,8 @@
 
 // Internal runtime utilities for derived type operations.
 
-#ifndef FORTRAN_RUNTIME_DERIVED_H_
-#define FORTRAN_RUNTIME_DERIVED_H_
+#ifndef FLANG_RT_RUNTIME_DERIVED_H_
+#define FLANG_RT_RUNTIME_DERIVED_H_
 
 #include "flang/Common/api-attrs.h"
 
@@ -48,4 +48,4 @@ RT_API_ATTRS void Destroy(const Descriptor &, bool finalize,
 RT_API_ATTRS bool HasDynamicComponent(const Descriptor &);
 
 } // namespace Fortran::runtime
-#endif // FORTRAN_RUNTIME_DERIVED_H_
+#endif // FLANG_RT_RUNTIME_DERIVED_H_
diff --git a/flang/include/flang/Runtime/descriptor.h b/flang-rt/include/flang-rt/runtime/descriptor.h
similarity index 98%
rename from flang/include/flang/Runtime/descriptor.h
rename to flang-rt/include/flang-rt/runtime/descriptor.h
index 628ac8c927a51..19e1a0bf6a1dd 100644
--- a/flang/include/flang/Runtime/descriptor.h
+++ b/flang-rt/include/flang-rt/runtime/descriptor.h
@@ -1,4 +1,4 @@
-//===-- include/flang/Runtime/descriptor.h ----------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/descriptor.h -------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_DESCRIPTOR_H_
-#define FORTRAN_RUNTIME_DESCRIPTOR_H_
+#ifndef FLANG_RT_RUNTIME_DESCRIPTOR_H_
+#define FLANG_RT_RUNTIME_DESCRIPTOR_H_
 
 // Defines data structures used during execution of a Fortran program
 // to implement nontrivial dummy arguments, pointers, allocatables,
@@ -18,10 +18,10 @@
 // User C code is welcome to depend on that ISO_Fortran_binding.h file,
 // but should never reference this internal header.
 
+#include "memory.h"
+#include "type-code.h"
 #include "flang/Common/ISO_Fortran_binding_wrapper.h"
 #include "flang/Runtime/descriptor-consts.h"
-#include "flang/Runtime/memory.h"
-#include "flang/Runtime/type-code.h"
 #include <algorithm>
 #include <cassert>
 #include <cinttypes>
@@ -482,4 +482,4 @@ class alignas(Descriptor) StaticDescriptor {
 };
 
 } // namespace Fortran::runtime
-#endif // FORTRAN_RUNTIME_DESCRIPTOR_H_
+#endif // FLANG_RT_RUNTIME_DESCRIPTOR_H_
diff --git a/flang/runtime/emit-encoded.h b/flang-rt/include/flang-rt/runtime/emit-encoded.h
similarity index 94%
rename from flang/runtime/emit-encoded.h
rename to flang-rt/include/flang-rt/runtime/emit-encoded.h
index 4b5e390078835..d99f56b29558e 100644
--- a/flang/runtime/emit-encoded.h
+++ b/flang-rt/include/flang-rt/runtime/emit-encoded.h
@@ -1,4 +1,4 @@
-//===-- runtime/emit-encoded.h ----------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/emit-encoded.h -----------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,8 +8,8 @@
 
 // Templates for emitting CHARACTER values with conversion
 
-#ifndef FORTRAN_RUNTIME_EMIT_ENCODED_H_
-#define FORTRAN_RUNTIME_EMIT_ENCODED_H_
+#ifndef FLANG_RT_RUNTIME_EMIT_ENCODED_H_
+#define FLANG_RT_RUNTIME_EMIT_ENCODED_H_
 
 #include "connection.h"
 #include "environment.h"
@@ -114,4 +114,4 @@ RT_API_ATTRS bool EmitRepeated(CONTEXT &to, char ch, std::size_t n) {
 }
 
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_EMIT_ENCODED_H_
+#endif // FLANG_RT_RUNTIME_EMIT_ENCODED_H_
diff --git a/flang/runtime/environment.h b/flang-rt/include/flang-rt/runtime/environment.h
similarity index 91%
rename from flang/runtime/environment.h
rename to flang-rt/include/flang-rt/runtime/environment.h
index 500aa925a625b..142add432b5f7 100644
--- a/flang/runtime/environment.h
+++ b/flang-rt/include/flang-rt/runtime/environment.h
@@ -1,4 +1,4 @@
-//===-- runtime/environment.h -----------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/environment.h ------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_ENVIRONMENT_H_
-#define FORTRAN_RUNTIME_ENVIRONMENT_H_
+#ifndef FLANG_RT_RUNTIME_ENVIRONMENT_H_
+#define FLANG_RT_RUNTIME_ENVIRONMENT_H_
 
 #include "flang/Common/optional.h"
 #include "flang/Decimal/decimal.h"
@@ -67,4 +67,4 @@ RT_OFFLOAD_VAR_GROUP_END
 
 } // namespace Fortran::runtime
 
-#endif // FORTRAN_RUNTIME_ENVIRONMENT_H_
+#endif // FLANG_RT_RUNTIME_ENVIRONMENT_H_
diff --git a/flang/runtime/file.h b/flang-rt/include/flang-rt/runtime/file.h
similarity index 95%
rename from flang/runtime/file.h
rename to flang-rt/include/flang-rt/runtime/file.h
index c06acbb9904cc..3bba29722b3b8 100644
--- a/flang/runtime/file.h
+++ b/flang-rt/include/flang-rt/runtime/file.h
@@ -1,4 +1,4 @@
-//===-- runtime/file.h ------------------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/file.h -------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,12 +8,12 @@
 
 // Raw system I/O wrappers
 
-#ifndef FORTRAN_RUNTIME_FILE_H_
-#define FORTRAN_RUNTIME_FILE_H_
+#ifndef FLANG_RT_RUNTIME_FILE_H_
+#define FLANG_RT_RUNTIME_FILE_H_
 
 #include "io-error.h"
+#include "memory.h"
 #include "flang/Common/optional.h"
-#include "flang/Runtime/memory.h"
 #include <cinttypes>
 
 namespace Fortran::runtime::io {
@@ -113,4 +113,4 @@ RT_API_ATTRS bool MayWrite(const char *path);
 RT_API_ATTRS bool MayReadAndWrite(const char *path);
 RT_API_ATTRS std::int64_t SizeInBytes(const char *path);
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_FILE_H_
+#endif // FLANG_RT_RUNTIME_FILE_H_
diff --git a/flang/runtime/format-implementation.h b/flang-rt/include/flang-rt/runtime/format-implementation.h
similarity index 98%
rename from flang/runtime/format-implementation.h
rename to flang-rt/include/flang-rt/runtime/format-implementation.h
index 46204ca927c13..8f4eb1161dd14 100644
--- a/flang/runtime/format-implementation.h
+++ b/flang-rt/include/flang-rt/runtime/format-implementation.h
@@ -1,4 +1,4 @@
-//===-- runtime/format-implementation.h -------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/format-implementation.h --------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,8 +8,8 @@
 
 // Implements out-of-line member functions of template class FormatControl
 
-#ifndef FORTRAN_RUNTIME_FORMAT_IMPLEMENTATION_H_
-#define FORTRAN_RUNTIME_FORMAT_IMPLEMENTATION_H_
+#ifndef FLANG_RT_RUNTIME_FORMAT_IMPLEMENTATION_H_
+#define FLANG_RT_RUNTIME_FORMAT_IMPLEMENTATION_H_
 
 #include "emit-encoded.h"
 #include "format.h"
@@ -601,4 +601,4 @@ RT_API_ATTRS void FormatControl<CONTEXT>::Finish(Context &context) {
   }
 }
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_FORMAT_IMPLEMENTATION_H_
+#endif // FLANG_RT_RUNTIME_FORMAT_IMPLEMENTATION_H_
diff --git a/flang/runtime/format.h b/flang-rt/include/flang-rt/runtime/format.h
similarity index 97%
rename from flang/runtime/format.h
rename to flang-rt/include/flang-rt/runtime/format.h
index 815bf70685e64..b169d63fa4999 100644
--- a/flang/runtime/format.h
+++ b/flang-rt/include/flang-rt/runtime/format.h
@@ -1,4 +1,4 @@
-//===-- runtime/format.h ----------------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/format.h -----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,8 +8,8 @@
 
 // FORMAT string processing
 
-#ifndef FORTRAN_RUNTIME_FORMAT_H_
-#define FORTRAN_RUNTIME_FORMAT_H_
+#ifndef FLANG_RT_RUNTIME_FORMAT_H_
+#define FLANG_RT_RUNTIME_FORMAT_H_
 
 #include "environment.h"
 #include "io-error.h"
@@ -201,4 +201,4 @@ template <typename CONTEXT> class FormatControl {
   Iteration stack_[maxMaxHeight];
 };
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_FORMAT_H_
+#endif // FLANG_RT_RUNTIME_FORMAT_H_
diff --git a/flang/runtime/internal-unit.h b/flang-rt/include/flang-rt/runtime/internal-unit.h
similarity index 89%
rename from flang/runtime/internal-unit.h
rename to flang-rt/include/flang-rt/runtime/internal-unit.h
index a0ee6353eeda3..429d3489e0112 100644
--- a/flang/runtime/internal-unit.h
+++ b/flang-rt/include/flang-rt/runtime/internal-unit.h
@@ -1,4 +1,4 @@
-//===-- runtime/internal-unit.h ---------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/internal-unit.h ----------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,11 +8,11 @@
 
 // Fortran internal I/O "units"
 
-#ifndef FORTRAN_RUNTIME_IO_INTERNAL_UNIT_H_
-#define FORTRAN_RUNTIME_IO_INTERNAL_UNIT_H_
+#ifndef FLANG_RT_RUNTIME_INTERNAL_UNIT_H_
+#define FLANG_RT_RUNTIME_INTERNAL_UNIT_H_
 
 #include "connection.h"
-#include "flang/Runtime/descriptor.h"
+#include "descriptor.h"
 #include <cinttypes>
 #include <type_traits>
 
@@ -56,4 +56,4 @@ template <Direction DIR> class InternalDescriptorUnit : public ConnectionState {
 extern template class InternalDescriptorUnit<Direction::Output>;
 extern template class InternalDescriptorUnit<Direction::Input>;
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_IO_INTERNAL_UNIT_H_
+#endif // FLANG_RT_RUNTIME_INTERNAL_UNIT_H_
diff --git a/flang/runtime/io-error.h b/flang-rt/include/flang-rt/runtime/io-error.h
similarity index 92%
rename from flang/runtime/io-error.h
rename to flang-rt/include/flang-rt/runtime/io-error.h
index 39a343c8e0a51..1cef6a208f374 100644
--- a/flang/runtime/io-error.h
+++ b/flang-rt/include/flang-rt/runtime/io-error.h
@@ -1,4 +1,4 @@
-//===-- runtime/io-error.h --------------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/io-error.h ---------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -12,12 +12,12 @@
 // IOSTAT error codes are raw errno values augmented with values for
 // Fortran-specific errors.
 
-#ifndef FORTRAN_RUNTIME_IO_ERROR_H_
-#define FORTRAN_RUNTIME_IO_ERROR_H_
+#ifndef FLANG_RT_RUNTIME_IO_ERROR_H_
+#define FLANG_RT_RUNTIME_IO_ERROR_H_
 
+#include "memory.h"
 #include "terminator.h"
 #include "flang/Runtime/iostat.h"
-#include "flang/Runtime/memory.h"
 #include <cinttypes>
 
 namespace Fortran::runtime::io {
@@ -80,4 +80,4 @@ class IoErrorHandler : public Terminator {
 };
 
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_IO_ERROR_H_
+#endif // FLANG_RT_RUNTIME_IO_ERROR_H_
diff --git a/flang/runtime/io-stmt.h b/flang-rt/include/flang-rt/runtime/io-stmt.h
similarity index 99%
rename from flang/runtime/io-stmt.h
rename to flang-rt/include/flang-rt/runtime/io-stmt.h
index 1f1419b249e5e..a364ddfd9b3c7 100644
--- a/flang/runtime/io-stmt.h
+++ b/flang-rt/include/flang-rt/runtime/io-stmt.h
@@ -1,4 +1,4 @@
-//===-- runtime/io-stmt.h ---------------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/io-stmt.h ----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,10 +8,11 @@
 
 // Representations of the state of an I/O statement in progress
 
-#ifndef FORTRAN_RUNTIME_IO_STMT_H_
-#define FORTRAN_RUNTIME_IO_STMT_H_
+#ifndef FLANG_RT_RUNTIME_IO_STMT_H_
+#define FLANG_RT_RUNTIME_IO_STMT_H_
 
 #include "connection.h"
+#include "descriptor.h"
 #include "file.h"
 #include "format.h"
 #include "internal-unit.h"
@@ -19,7 +20,6 @@
 #include "flang/Common/optional.h"
 #include "flang/Common/reference-wrapper.h"
 #include "flang/Common/visit.h"
-#include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/io-api.h"
 #include <flang/Common/variant.h>
 #include <functional>
@@ -791,4 +791,4 @@ class ErroneousIoStatementState : public IoStatementBase {
 };
 
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_IO_STMT_H_
+#endif // FLANG_RT_RUNTIME_IO_STMT_H_
diff --git a/flang/runtime/lock.h b/flang-rt/include/flang-rt/runtime/lock.h
similarity index 94%
rename from flang/runtime/lock.h
rename to flang-rt/include/flang-rt/runtime/lock.h
index 46ca28703a45b..7c88534245733 100644
--- a/flang/runtime/lock.h
+++ b/flang-rt/include/flang-rt/runtime/lock.h
@@ -1,4 +1,4 @@
-//===-- runtime/lock.h ------------------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/lock.h -------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,8 +8,8 @@
 
 // Wraps a mutex
 
-#ifndef FORTRAN_RUNTIME_LOCK_H_
-#define FORTRAN_RUNTIME_LOCK_H_
+#ifndef FLANG_RT_RUNTIME_LOCK_H_
+#define FLANG_RT_RUNTIME_LOCK_H_
 
 #include "terminator.h"
 #include "tools.h"
@@ -113,4 +113,4 @@ class CriticalSection {
 };
 } // namespace Fortran::runtime
 
-#endif // FORTRAN_RUNTIME_LOCK_H_
+#endif // FLANG_RT_RUNTIME_LOCK_H_
diff --git a/flang/include/flang/Runtime/memory.h b/flang-rt/include/flang-rt/runtime/memory.h
similarity index 96%
rename from flang/include/flang/Runtime/memory.h
rename to flang-rt/include/flang-rt/runtime/memory.h
index 98412a989f890..93b477afa9814 100644
--- a/flang/include/flang/Runtime/memory.h
+++ b/flang-rt/include/flang-rt/runtime/memory.h
@@ -1,4 +1,4 @@
-//===-- include/flang/Runtime/memory.h --------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/memory.h -----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,8 +9,8 @@
 // Thin wrapper around malloc()/free() to isolate the dependency,
 // ease porting, and provide an owning pointer.
 
-#ifndef FORTRAN_RUNTIME_MEMORY_H_
-#define FORTRAN_RUNTIME_MEMORY_H_
+#ifndef FLANG_RT_RUNTIME_MEMORY_H_
+#define FLANG_RT_RUNTIME_MEMORY_H_
 
 #include "flang/Common/api-attrs.h"
 #include <cassert>
@@ -170,4 +170,4 @@ template <typename A> struct Allocator {
 };
 } // namespace Fortran::runtime
 
-#endif // FORTRAN_RUNTIME_MEMORY_H_
+#endif // FLANG_RT_RUNTIME_MEMORY_H_
diff --git a/flang/runtime/namelist.h b/flang-rt/include/flang-rt/runtime/namelist.h
similarity index 91%
rename from flang/runtime/namelist.h
rename to flang-rt/include/flang-rt/runtime/namelist.h
index 25216a75e9367..17d7bf310cc96 100644
--- a/flang/runtime/namelist.h
+++ b/flang-rt/include/flang-rt/runtime/namelist.h
@@ -1,4 +1,4 @@
-//===-- runtime/namelist.h --------------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/namelist.h ---------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,8 +8,8 @@
 
 // Defines the data structure used for NAMELIST I/O
 
-#ifndef FORTRAN_RUNTIME_NAMELIST_H_
-#define FORTRAN_RUNTIME_NAMELIST_H_
+#ifndef FLANG_RT_RUNTIME_NAMELIST_H_
+#define FLANG_RT_RUNTIME_NAMELIST_H_
 
 #include "non-tbp-dio.h"
 #include "flang/Common/api-attrs.h"
@@ -51,4 +51,4 @@ class NamelistGroup {
 RT_API_ATTRS bool IsNamelistNameOrSlash(IoStatementState &);
 
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_NAMELIST_H_
+#endif // FLANG_RT_RUNTIME_NAMELIST_H_
diff --git a/flang/runtime/non-tbp-dio.h b/flang-rt/include/flang-rt/runtime/non-tbp-dio.h
similarity index 91%
rename from flang/runtime/non-tbp-dio.h
rename to flang-rt/include/flang-rt/runtime/non-tbp-dio.h
index 8429d790fea57..2bbbfa7f97f79 100644
--- a/flang/runtime/non-tbp-dio.h
+++ b/flang-rt/include/flang-rt/runtime/non-tbp-dio.h
@@ -1,4 +1,4 @@
-//===-- flang/runtime/non-tbp-dio.h -----------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/non-tbp-dio.h ------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -19,8 +19,8 @@
 // a containing scope has become inaccessible in a nested scope due
 // to the use of "IMPORT, NONE" or "IMPORT, ONLY:".
 
-#ifndef FORTRAN_RUNTIME_NON_TBP_DIO_H_
-#define FORTRAN_RUNTIME_NON_TBP_DIO_H_
+#ifndef FLANG_RT_RUNTIME_NON_TBP_DIO_H_
+#define FLANG_RT_RUNTIME_NON_TBP_DIO_H_
 
 #include "flang/Common/Fortran-consts.h"
 #include "flang/Common/api-attrs.h"
@@ -53,4 +53,4 @@ struct NonTbpDefinedIoTable {
 };
 
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_NON_TBP_DIO_H_
+#endif // FLANG_RT_RUNTIME_NON_TBP_DIO_H_
diff --git a/flang/runtime/numeric-templates.h b/flang-rt/include/flang-rt/runtime/numeric-templates.h
similarity index 98%
rename from flang/runtime/numeric-templates.h
rename to flang-rt/include/flang-rt/runtime/numeric-templates.h
index fbb371bffc27a..1253d56f3c0fd 100644
--- a/flang/runtime/numeric-templates.h
+++ b/flang-rt/include/flang-rt/runtime/numeric-templates.h
@@ -1,4 +1,4 @@
-//===-- runtime/numeric-templates.h -----------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/numeric-templates.h ------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -15,8 +15,8 @@
 // for the data type corresponding to CppTypeFor<TypeCategory::Real, 16>
 // on the target.
 
-#ifndef FORTRAN_RUNTIME_NUMERIC_TEMPLATES_H_
-#define FORTRAN_RUNTIME_NUMERIC_TEMPLATES_H_
+#ifndef FLANG_RT_RUNTIME_NUMERIC_TEMPLATES_H_
+#define FLANG_RT_RUNTIME_NUMERIC_TEMPLATES_H_
 
 #include "terminator.h"
 #include "tools.h"
@@ -368,4 +368,4 @@ template <typename T> inline RT_API_ATTRS T ErfcScaled(T arg) {
 
 } // namespace Fortran::runtime
 
-#endif // FORTRAN_RUNTIME_NUMERIC_TEMPLATES_H_
+#endif // FLANG_RT_RUNTIME_NUMERIC_TEMPLATES_H_
diff --git a/flang/runtime/random-templates.h b/flang-rt/include/flang-rt/runtime/random-templates.h
similarity index 93%
rename from flang/runtime/random-templates.h
rename to flang-rt/include/flang-rt/runtime/random-templates.h
index 3885941704d4a..895c5ad4fc8bb 100644
--- a/flang/runtime/random-templates.h
+++ b/flang-rt/include/flang-rt/runtime/random-templates.h
@@ -1,4 +1,4 @@
-//===-- runtime/random-templates.h ------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/random-templates.h -------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_RANDOM_TEMPLATES_H_
-#define FORTRAN_RUNTIME_RANDOM_TEMPLATES_H_
+#ifndef FLANG_RT_RUNTIME_RANDOM_TEMPLATES_H_
+#define FLANG_RT_RUNTIME_RANDOM_TEMPLATES_H_
 
+#include "descriptor.h"
 #include "lock.h"
 #include "numeric-templates.h"
 #include "flang/Common/optional.h"
-#include "flang/Runtime/descriptor.h"
 #include <algorithm>
 #include <random>
 
@@ -108,4 +108,4 @@ inline void GenerateUnsigned(const Descriptor &harvest) {
 
 } // namespace Fortran::runtime::random
 
-#endif // FORTRAN_RUNTIME_RANDOM_TEMPLATES_H_
+#endif // FLANG_RT_RUNTIME_RANDOM_TEMPLATES_H_
diff --git a/flang/runtime/reduction-templates.h b/flang-rt/include/flang-rt/runtime/reduction-templates.h
similarity index 98%
rename from flang/runtime/reduction-templates.h
rename to flang-rt/include/flang-rt/runtime/reduction-templates.h
index b20b03655c3d0..8c6f838b8dadf 100644
--- a/flang/runtime/reduction-templates.h
+++ b/flang-rt/include/flang-rt/runtime/reduction-templates.h
@@ -1,4 +1,4 @@
-//===-- runtime/reduction-templates.h ---------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/reduction-templates.h ----------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -18,14 +18,14 @@
 // * Character-valued reductions (MAXVAL & MINVAL) return arbitrary
 //   length results, dynamically allocated in a caller-supplied descriptor
 
-#ifndef FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_
-#define FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_
+#ifndef FLANG_RT_RUNTIME_REDUCTION_TEMPLATES_H_
+#define FLANG_RT_RUNTIME_REDUCTION_TEMPLATES_H_
 
+#include "descriptor.h"
 #include "numeric-templates.h"
 #include "terminator.h"
 #include "tools.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
 #include <algorithm>
 
 namespace Fortran::runtime {
@@ -416,4 +416,4 @@ template <int KIND> struct Norm2Helper {
 };
 
 } // namespace Fortran::runtime
-#endif // FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_
+#endif // FLANG_RT_RUNTIME_REDUCTION_TEMPLATES_H_
diff --git a/flang/runtime/stat.h b/flang-rt/include/flang-rt/runtime/stat.h
similarity index 93%
rename from flang/runtime/stat.h
rename to flang-rt/include/flang-rt/runtime/stat.h
index 572cb6d10b489..070d0bf8673fb 100644
--- a/flang/runtime/stat.h
+++ b/flang-rt/include/flang-rt/runtime/stat.h
@@ -1,4 +1,4 @@
-//===-- runtime/stat.h ------------------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/stat.h -------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,8 +9,8 @@
 // Defines the values returned by the runtime for STAT= specifiers
 // on executable statements.
 
-#ifndef FORTRAN_RUNTIME_STAT_H_
-#define FORTRAN_RUNTIME_STAT_H_
+#ifndef FLANG_RT_RUNTIME_STAT_H_
+#define FLANG_RT_RUNTIME_STAT_H_
 #include "flang/Common/ISO_Fortran_binding_wrapper.h"
 #include "flang/Common/api-attrs.h"
 #include "flang/Runtime/magic-numbers.h"
@@ -60,4 +60,4 @@ RT_API_ATTRS int ToErrmsg(const Descriptor *errmsg, int stat); // returns stat
 RT_API_ATTRS int ReturnError(Terminator &, int stat,
     const Descriptor *errmsg = nullptr, bool hasStat = false);
 } // namespace Fortran::runtime
-#endif // FORTRAN_RUNTIME_STAT_H
+#endif // FLANG_RT_RUNTIME_STAT_H_
diff --git a/flang/runtime/terminator.h b/flang-rt/include/flang-rt/runtime/terminator.h
similarity index 95%
rename from flang/runtime/terminator.h
rename to flang-rt/include/flang-rt/runtime/terminator.h
index 609f059d6e092..4815f0674c849 100644
--- a/flang/runtime/terminator.h
+++ b/flang-rt/include/flang-rt/runtime/terminator.h
@@ -1,4 +1,4 @@
-//===-- runtime/terminator.h ------------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/terminator.h -------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,8 +8,8 @@
 
 // Termination of the image
 
-#ifndef FORTRAN_RUNTIME_TERMINATOR_H_
-#define FORTRAN_RUNTIME_TERMINATOR_H_
+#ifndef FLANG_RT_RUNTIME_TERMINATOR_H_
+#define FLANG_RT_RUNTIME_TERMINATOR_H_
 
 #include "flang/Common/api-attrs.h"
 #include <cstdarg>
@@ -121,4 +121,4 @@ namespace Fortran::runtime::io {
 RT_API_ATTRS void FlushOutputOnCrash(const Terminator &);
 }
 
-#endif // FORTRAN_RUNTIME_TERMINATOR_H_
+#endif // FLANG_RT_RUNTIME_TERMINATOR_H_
diff --git a/flang/runtime/tools.h b/flang-rt/include/flang-rt/runtime/tools.h
similarity index 98%
rename from flang/runtime/tools.h
rename to flang-rt/include/flang-rt/runtime/tools.h
index 75544098d47ab..91a026bf2ac14 100644
--- a/flang/runtime/tools.h
+++ b/flang-rt/include/flang-rt/runtime/tools.h
@@ -1,4 +1,4 @@
-//===-- runtime/tools.h -----------------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/tools.h ------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,16 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_TOOLS_H_
-#define FORTRAN_RUNTIME_TOOLS_H_
+#ifndef FLANG_RT_RUNTIME_TOOLS_H_
+#define FLANG_RT_RUNTIME_TOOLS_H_
 
+#include "descriptor.h"
+#include "memory.h"
 #include "stat.h"
 #include "terminator.h"
 #include "flang/Common/optional.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/freestanding-tools.h"
-#include "flang/Runtime/memory.h"
 #include <cstring>
 #include <functional>
 #include <map>
@@ -348,7 +348,7 @@ inline RT_API_ATTRS RESULT ApplyFloatingPointKind(
     if constexpr (HasCppTypeFor<TypeCategory::Real, 16>) {
       // If FUNC implemenation relies on FP math functions,
       // then we should not be here. The compiler should have
-      // generated a call to an entry in flang_rt.quadmath
+      // generated a call to an entry in the libflang_rt.quadmath
       // library.
       if constexpr (!NEEDSMATH) {
         return FUNC<16>{}(std::forward<A>(x)...);
@@ -570,4 +570,4 @@ RT_API_ATTRS void CreatePartialReductionResult(Descriptor &result,
     const char *intrinsic, TypeCode);
 
 } // namespace Fortran::runtime
-#endif // FORTRAN_RUNTIME_TOOLS_H_
+#endif // FLANG_RT_RUNTIME_TOOLS_H_
diff --git a/flang/include/flang/Runtime/type-code.h b/flang-rt/include/flang-rt/runtime/type-code.h
similarity index 93%
rename from flang/include/flang/Runtime/type-code.h
rename to flang-rt/include/flang-rt/runtime/type-code.h
index ae854ed2145e4..9416a2816fd43 100644
--- a/flang/include/flang/Runtime/type-code.h
+++ b/flang-rt/include/flang-rt/runtime/type-code.h
@@ -1,4 +1,4 @@
-//===-- include/flang/Runtime/type-code.h -----------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/type-code.h --------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_TYPE_CODE_H_
-#define FORTRAN_RUNTIME_TYPE_CODE_H_
+#ifndef FLANG_RT_RUNTIME_TYPE_CODE_H_
+#define FLANG_RT_RUNTIME_TYPE_CODE_H_
 
 #include "flang/Common/Fortran-consts.h"
 #include "flang/Common/ISO_Fortran_binding_wrapper.h"
@@ -75,4 +75,4 @@ class TypeCode {
   ISO::CFI_type_t raw_{CFI_type_other};
 };
 } // namespace Fortran::runtime
-#endif // FORTRAN_RUNTIME_TYPE_CODE_H_
+#endif // FLANG_RT_RUNTIME_TYPE_CODE_H_
diff --git a/flang/runtime/type-info.h b/flang-rt/include/flang-rt/runtime/type-info.h
similarity index 98%
rename from flang/runtime/type-info.h
rename to flang-rt/include/flang-rt/runtime/type-info.h
index 32403b1db5169..9891fcecdcb25 100644
--- a/flang/runtime/type-info.h
+++ b/flang-rt/include/flang-rt/runtime/type-info.h
@@ -1,4 +1,4 @@
-//===-- runtime/type-info.h -------------------------------------*- C++ -*-===//
+//===-- include/flang-rt/runtime/type-info.h --------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,17 +6,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_TYPE_INFO_H_
-#define FORTRAN_RUNTIME_TYPE_INFO_H_
+#ifndef FLANG_RT_RUNTIME_TYPE_INFO_H_
+#define FLANG_RT_RUNTIME_TYPE_INFO_H_
 
 // A C++ perspective of the derived type description schemata in
 // flang/module/__fortran_type_info.f90.
 
+#include "descriptor.h"
 #include "terminator.h"
 #include "flang/Common/Fortran-consts.h"
 #include "flang/Common/bit-population-count.h"
 #include "flang/Common/optional.h"
-#include "flang/Runtime/descriptor.h"
 #include <cinttypes>
 #include <memory>
 
@@ -321,4 +321,4 @@ class DerivedType {
 };
 
 } // namespace Fortran::runtime::typeInfo
-#endif // FORTRAN_RUNTIME_TYPE_INFO_H_
+#endif // FLANG_RT_RUNTIME_TYPE_INFO_H_
diff --git a/flang/runtime/utf.h b/flang-rt/include/flang-rt/runtime/utf.h
similarity index 94%
rename from flang/runtime/utf.h
rename to flang-rt/include/flang-rt/runtime/utf.h
index 10c2d61484217..b5add823124fc 100644
--- a/flang/runtime/utf.h
+++ b/flang-rt/include/flang-rt/runtime/utf.h
@@ -1,4 +1,4 @@
-//===-- runtime/utf.h -----------------------------------------------------===//
+//===-- include/flang-rt/runtime/utf.h --------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -38,8 +38,8 @@
 // standard maximum.  However, we support extended forms up to 32 bits so that
 // CHARACTER(KIND=4) can be abused to hold arbitrary 32-bit data.
 
-#ifndef FORTRAN_RUNTIME_UTF_H_
-#define FORTRAN_RUNTIME_UTF_H_
+#ifndef FLANG_RT_RUNTIME_UTF_H_
+#define FLANG_RT_RUNTIME_UTF_H_
 
 #include "flang/Common/optional.h"
 #include <cstddef>
@@ -70,4 +70,4 @@ RT_API_ATTRS Fortran::common::optional<char32_t> DecodeUTF8(const char *);
 RT_API_ATTRS std::size_t EncodeUTF8(char *, char32_t);
 
 } // namespace Fortran::runtime
-#endif // FORTRAN_RUNTIME_UTF_H_
+#endif // FLANG_RT_RUNTIME_UTF_H_
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
similarity index 94%
rename from flang/runtime/CUDA/allocatable.cpp
rename to flang-rt/lib/cuda/allocatable.cpp
index 6df3b06793b3e..b773e802c90ff 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/CUDA/allocatable.cpp --------------------------------------===//
+//===-- lib/cuda/allocatable.cpp --------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,9 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/allocatable.h"
-#include "../assign-impl.h"
-#include "../stat.h"
-#include "../terminator.h"
+#include "flang-rt/runtime/assign-impl.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/CUDA/memmove-function.h"
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp
similarity index 86%
rename from flang/runtime/CUDA/allocator.cpp
rename to flang-rt/lib/cuda/allocator.cpp
index 368c1124ef70a..4199bf04b33f0 100644
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/CUDA/allocator.cpp ----------------------------------------===//
+//===-- lib/cuda/allocator.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,13 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/allocator.h"
-#include "../derived.h"
-#include "../stat.h"
-#include "../terminator.h"
-#include "../type-info.h"
+#include "flang-rt/runtime/allocator-registry.h"
+#include "flang-rt/runtime/derived.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/type-info.h"
 #include "flang/Common/ISO_Fortran_binding_wrapper.h"
 #include "flang/Runtime/CUDA/common.h"
-#include "flang/Runtime/allocator-registry.h"
 #include "flang/Support/Fortran.h"
 
 #include "cuda_runtime.h"
diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang-rt/lib/cuda/descriptor.cpp
similarity index 91%
rename from flang/runtime/CUDA/descriptor.cpp
rename to flang-rt/lib/cuda/descriptor.cpp
index 947eeb66aa3d6..d44ab2e45d2a8 100644
--- a/flang/runtime/CUDA/descriptor.cpp
+++ b/flang-rt/lib/cuda/descriptor.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/CUDA/descriptor.cpp ---------------------------------------===//
+//===-- lib/cuda/descriptor.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,10 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/descriptor.h"
-#include "../terminator.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/CUDA/allocator.h"
 #include "flang/Runtime/CUDA/common.h"
-#include "flang/Runtime/descriptor.h"
 
 #include "cuda_runtime.h"
 
diff --git a/flang/runtime/CUDA/init.cpp b/flang-rt/lib/cuda/init.cpp
similarity index 81%
rename from flang/runtime/CUDA/init.cpp
rename to flang-rt/lib/cuda/init.cpp
index 2bffce842b952..d79bffc32424d 100644
--- a/flang/runtime/CUDA/init.cpp
+++ b/flang-rt/lib/cuda/init.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/CUDA/init.cpp ---------------------------------------------===//
+//===-- lib/cuda/init.cpp ---------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/init.h"
-#include "../environment.h"
-#include "../terminator.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/CUDA/common.h"
 
 #include "cuda_runtime.h"
diff --git a/flang/runtime/CUDA/kernel.cpp b/flang-rt/lib/cuda/kernel.cpp
similarity index 98%
rename from flang/runtime/CUDA/kernel.cpp
rename to flang-rt/lib/cuda/kernel.cpp
index 02d89fb8423a5..75eb639817b9a 100644
--- a/flang/runtime/CUDA/kernel.cpp
+++ b/flang-rt/lib/cuda/kernel.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/CUDA/kernel.cpp -------------------------------------------===//
+//===-- lib/cuda/kernel.cpp -------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/kernel.h"
-#include "../terminator.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/CUDA/common.h"
 
 #include "cuda_runtime.h"
diff --git a/flang/runtime/CUDA/memmove-function.cpp b/flang-rt/lib/cuda/memmove-function.cpp
similarity index 90%
rename from flang/runtime/CUDA/memmove-function.cpp
rename to flang-rt/lib/cuda/memmove-function.cpp
index 3ba9fa7e0f7f7..a7eb0cf1a3e7a 100644
--- a/flang/runtime/CUDA/memmove-function.cpp
+++ b/flang-rt/lib/cuda/memmove-function.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/CUDA/memmove-function.cpp ---------------------------------===//
+//===-- lib/cuda/memmove-function.cpp ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/memmove-function.h"
-#include "../terminator.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/CUDA/common.h"
 
 #include "cuda_runtime.h"
diff --git a/flang/runtime/CUDA/memory.cpp b/flang-rt/lib/cuda/memory.cpp
similarity index 96%
rename from flang/runtime/CUDA/memory.cpp
rename to flang-rt/lib/cuda/memory.cpp
index 0bbb493d2db91..7ead0dd35e1d9 100644
--- a/flang/runtime/CUDA/memory.cpp
+++ b/flang-rt/lib/cuda/memory.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/CUDA/memory.cpp -------------------------------------------===//
+//===-- lib/cuda/memory.cpp -------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/memory.h"
-#include "../assign-impl.h"
-#include "../terminator.h"
+#include "flang-rt/runtime/assign-impl.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/CUDA/memmove-function.h"
diff --git a/flang/runtime/CUDA/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp
similarity index 92%
rename from flang/runtime/CUDA/pointer.cpp
rename to flang-rt/lib/cuda/pointer.cpp
index d3ebe97b4e4ac..c2559ecb9a6f2 100644
--- a/flang/runtime/CUDA/pointer.cpp
+++ b/flang-rt/lib/cuda/pointer.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/CUDA/pointer.cpp ------------------------------------------===//
+//===-- lib/cuda/pointer.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,9 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/pointer.h"
-#include "../assign-impl.h"
-#include "../stat.h"
-#include "../terminator.h"
+#include "flang-rt/runtime/assign-impl.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/CUDA/memmove-function.h"
 #include "flang/Runtime/pointer.h"
diff --git a/flang/runtime/CUDA/registration.cpp b/flang-rt/lib/cuda/registration.cpp
similarity index 92%
rename from flang/runtime/CUDA/registration.cpp
rename to flang-rt/lib/cuda/registration.cpp
index b7b6ef389bffb..60b0e491b6ffd 100644
--- a/flang/runtime/CUDA/registration.cpp
+++ b/flang-rt/lib/cuda/registration.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/CUDA/registration.cpp -------------------------------------===//
+//===-- lib/cuda/registration.cpp -------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/registration.h"
-#include "../terminator.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/CUDA/common.h"
 
 #include "cuda_runtime.h"
diff --git a/flang/runtime/Float128Math/acos.cpp b/flang-rt/lib/quadmath/acos.cpp
similarity index 87%
rename from flang/runtime/Float128Math/acos.cpp
rename to flang-rt/lib/quadmath/acos.cpp
index d9b4950aa1e35..d094121f0f678 100644
--- a/flang/runtime/Float128Math/acos.cpp
+++ b/flang-rt/lib/quadmath/acos.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/acos.cpp -------------------------------------===//
+//===-- lib/quadmath/acos.cpp -----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/acosh.cpp b/flang-rt/lib/quadmath/acosh.cpp
similarity index 87%
rename from flang/runtime/Float128Math/acosh.cpp
rename to flang-rt/lib/quadmath/acosh.cpp
index c572673ef55e6..968aa489d15a2 100644
--- a/flang/runtime/Float128Math/acosh.cpp
+++ b/flang-rt/lib/quadmath/acosh.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/acosh.cpp ------------------------------------===//
+//===-- lib/quadmath/acosh.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/asin.cpp b/flang-rt/lib/quadmath/asin.cpp
similarity index 87%
rename from flang/runtime/Float128Math/asin.cpp
rename to flang-rt/lib/quadmath/asin.cpp
index 57b6ffd967360..c3345cd3d748a 100644
--- a/flang/runtime/Float128Math/asin.cpp
+++ b/flang-rt/lib/quadmath/asin.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/asin.cpp -------------------------------------===//
+//===-- lib/quadmath/asin.cpp -----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/asinh.cpp b/flang-rt/lib/quadmath/asinh.cpp
similarity index 87%
rename from flang/runtime/Float128Math/asinh.cpp
rename to flang-rt/lib/quadmath/asinh.cpp
index 03dded722b254..1023b678b6131 100644
--- a/flang/runtime/Float128Math/asinh.cpp
+++ b/flang-rt/lib/quadmath/asinh.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/asinh.cpp ------------------------------------===//
+//===-- lib/quadmath/asinh.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/atan.cpp b/flang-rt/lib/quadmath/atan.cpp
similarity index 87%
rename from flang/runtime/Float128Math/atan.cpp
rename to flang-rt/lib/quadmath/atan.cpp
index 19c86cae8867b..6379df3275c03 100644
--- a/flang/runtime/Float128Math/atan.cpp
+++ b/flang-rt/lib/quadmath/atan.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/atan.cpp -------------------------------------===//
+//===-- lib/quadmath/atan.cpp -----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/atan2.cpp b/flang-rt/lib/quadmath/atan2.cpp
similarity index 88%
rename from flang/runtime/Float128Math/atan2.cpp
rename to flang-rt/lib/quadmath/atan2.cpp
index 09d666ae14304..7527b224cb3a5 100644
--- a/flang/runtime/Float128Math/atan2.cpp
+++ b/flang-rt/lib/quadmath/atan2.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/atan2.cpp ------------------------------------===//
+//===-- lib/quadmath/atan2.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/atanh.cpp b/flang-rt/lib/quadmath/atanh.cpp
similarity index 87%
rename from flang/runtime/Float128Math/atanh.cpp
rename to flang-rt/lib/quadmath/atanh.cpp
index 442d9beafae47..c7455fcb7ca67 100644
--- a/flang/runtime/Float128Math/atanh.cpp
+++ b/flang-rt/lib/quadmath/atanh.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/atanh.cpp ------------------------------------===//
+//===-- lib/quadmath/atanh.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/ceil.cpp b/flang-rt/lib/quadmath/ceil.cpp
similarity index 87%
rename from flang/runtime/Float128Math/ceil.cpp
rename to flang-rt/lib/quadmath/ceil.cpp
index 48e20b2b41577..03a98bedfdc03 100644
--- a/flang/runtime/Float128Math/ceil.cpp
+++ b/flang-rt/lib/quadmath/ceil.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/ceil.cpp -------------------------------------===//
+//===-- lib/quadmath/ceil.cpp -----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/complex-math.c b/flang-rt/lib/quadmath/complex-math.c
similarity index 92%
rename from flang/runtime/Float128Math/complex-math.c
rename to flang-rt/lib/quadmath/complex-math.c
index a7e32f3ef755b..e485fd7eb5dbb 100644
--- a/flang/runtime/Float128Math/complex-math.c
+++ b/flang-rt/lib/quadmath/complex-math.c
@@ -1,11 +1,10 @@
-/*===-- runtime/Float128Math/complex-math.c -------------------------*- C -*-===
+/*===-- lib/quadmath/complex-math.c ---------------------------------*- C -*-===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  *
- * ===-----------------------------------------------------------------------===
- */
+ *===----------------------------------------------------------------------===*/
 
 #include "complex-math.h"
 
diff --git a/flang/runtime/Float128Math/complex-math.h b/flang-rt/lib/quadmath/complex-math.h
similarity index 88%
rename from flang/runtime/Float128Math/complex-math.h
rename to flang-rt/lib/quadmath/complex-math.h
index cf5e980a39b6b..424ed84da4e01 100644
--- a/flang/runtime/Float128Math/complex-math.h
+++ b/flang-rt/lib/quadmath/complex-math.h
@@ -1,4 +1,4 @@
-/*===-- runtime/Float128Math/complex-math.h -------------------------*- C -*-===
+/*===-- lib/quadmath/complex-math.h ---------------------------------*- C -*-===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
  *
  *===----------------------------------------------------------------------===*/
 
-#ifndef FORTRAN_RUNTIME_FLOAT128MATH_COMPLEX_MATH_H_
-#define FORTRAN_RUNTIME_FLOAT128MATH_COMPLEX_MATH_H_
+#ifndef FLANG_RT_QUADMATH_COMPLEX_MATH_H_
+#define FLANG_RT_QUADMATH_COMPLEX_MATH_H_
 
 #include "flang/Common/float128.h"
 #include "flang/Runtime/entry-names.h"
@@ -59,4 +59,4 @@
 #error "Float128Math build with glibc>=2.26 is unsupported yet"
 #endif
 
-#endif /* FORTRAN_RUNTIME_FLOAT128MATH_COMPLEX_MATH_H_ */
+#endif /* FLANG_RT_QUADMATH_COMPLEX_MATH_H_ */
diff --git a/flang/runtime/Float128Math/cos.cpp b/flang-rt/lib/quadmath/cos.cpp
similarity index 87%
rename from flang/runtime/Float128Math/cos.cpp
rename to flang-rt/lib/quadmath/cos.cpp
index d1b3e0e736ca3..1116080c53d2a 100644
--- a/flang/runtime/Float128Math/cos.cpp
+++ b/flang-rt/lib/quadmath/cos.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/cos.cpp --------------------------------------===//
+//===-- lib/quadmath/cos.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/cosh.cpp b/flang-rt/lib/quadmath/cosh.cpp
similarity index 87%
rename from flang/runtime/Float128Math/cosh.cpp
rename to flang-rt/lib/quadmath/cosh.cpp
index 9fe5b61d8f95f..dd5978e5e5f08 100644
--- a/flang/runtime/Float128Math/cosh.cpp
+++ b/flang-rt/lib/quadmath/cosh.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/cosh.cpp -------------------------------------===//
+//===-- lib/quadmath/cosh.cpp -----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/erf.cpp b/flang-rt/lib/quadmath/erf.cpp
similarity index 87%
rename from flang/runtime/Float128Math/erf.cpp
rename to flang-rt/lib/quadmath/erf.cpp
index 2a553bd395e88..0021b7900f6a1 100644
--- a/flang/runtime/Float128Math/erf.cpp
+++ b/flang-rt/lib/quadmath/erf.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/erf.cpp --------------------------------------===//
+//===-- lib/quadmath/erf.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/erfc.cpp b/flang-rt/lib/quadmath/erfc.cpp
similarity index 87%
rename from flang/runtime/Float128Math/erfc.cpp
rename to flang-rt/lib/quadmath/erfc.cpp
index 2435ed2786cb3..5b80fb475b3fc 100644
--- a/flang/runtime/Float128Math/erfc.cpp
+++ b/flang-rt/lib/quadmath/erfc.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/erfc.cpp -------------------------------------===//
+//===-- lib/quadmath/erfc.cpp -----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/exp.cpp b/flang-rt/lib/quadmath/exp.cpp
similarity index 87%
rename from flang/runtime/Float128Math/exp.cpp
rename to flang-rt/lib/quadmath/exp.cpp
index 5ca87d9dd25d2..94e444c3b00c7 100644
--- a/flang/runtime/Float128Math/exp.cpp
+++ b/flang-rt/lib/quadmath/exp.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/exp.cpp --------------------------------------===//
+//===-- lib/quadmath/exp.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/exponent.cpp b/flang-rt/lib/quadmath/exponent.cpp
similarity index 90%
rename from flang/runtime/Float128Math/exponent.cpp
rename to flang-rt/lib/quadmath/exponent.cpp
index 237cde34e8691..0d2fa6478cca8 100644
--- a/flang/runtime/Float128Math/exponent.cpp
+++ b/flang-rt/lib/quadmath/exponent.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/exponent.cpp ---------------------------------===//
+//===-- lib/quadmath/exponent.cpp -------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/floor.cpp b/flang-rt/lib/quadmath/floor.cpp
similarity index 87%
rename from flang/runtime/Float128Math/floor.cpp
rename to flang-rt/lib/quadmath/floor.cpp
index 28f9c7b55dd51..e5dfb33db82ce 100644
--- a/flang/runtime/Float128Math/floor.cpp
+++ b/flang-rt/lib/quadmath/floor.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/floor.cpp ------------------------------------===//
+//===-- lib/quadmath/floor.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/fma.cpp b/flang-rt/lib/quadmath/fma.cpp
similarity index 89%
rename from flang/runtime/Float128Math/fma.cpp
rename to flang-rt/lib/quadmath/fma.cpp
index 87176c25dd604..910303af32339 100644
--- a/flang/runtime/Float128Math/fma.cpp
+++ b/flang-rt/lib/quadmath/fma.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/fma.cpp --------------------------------------===//
+//===-- lib/quadmath/fma.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/fraction.cpp b/flang-rt/lib/quadmath/fraction.cpp
similarity index 87%
rename from flang/runtime/Float128Math/fraction.cpp
rename to flang-rt/lib/quadmath/fraction.cpp
index 45ec12cd77518..a9927666a7b00 100644
--- a/flang/runtime/Float128Math/fraction.cpp
+++ b/flang-rt/lib/quadmath/fraction.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/fraction.cpp ---------------------------------===//
+//===-- lib/quadmath/fraction.cpp -------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/hypot.cpp b/flang-rt/lib/quadmath/hypot.cpp
similarity index 88%
rename from flang/runtime/Float128Math/hypot.cpp
rename to flang-rt/lib/quadmath/hypot.cpp
index 03049b06d3a24..3090d0b2aff74 100644
--- a/flang/runtime/Float128Math/hypot.cpp
+++ b/flang-rt/lib/quadmath/hypot.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/hypot.cpp ------------------------------------===//
+//===-- lib/quadmath/hypot.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/j0.cpp b/flang-rt/lib/quadmath/j0.cpp
similarity index 87%
rename from flang/runtime/Float128Math/j0.cpp
rename to flang-rt/lib/quadmath/j0.cpp
index 7207cbe1a92e7..06df1c2aca452 100644
--- a/flang/runtime/Float128Math/j0.cpp
+++ b/flang-rt/lib/quadmath/j0.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/j0.cpp ---------------------------------------===//
+//===-- lib/quadmath/j0.cpp -------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/j1.cpp b/flang-rt/lib/quadmath/j1.cpp
similarity index 87%
rename from flang/runtime/Float128Math/j1.cpp
rename to flang-rt/lib/quadmath/j1.cpp
index 9e49bcbc32ca4..d8a1f123b95e6 100644
--- a/flang/runtime/Float128Math/j1.cpp
+++ b/flang-rt/lib/quadmath/j1.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/j1.cpp ---------------------------------------===//
+//===-- lib/quadmath/j1.cpp -------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/jn.cpp b/flang-rt/lib/quadmath/jn.cpp
similarity index 87%
rename from flang/runtime/Float128Math/jn.cpp
rename to flang-rt/lib/quadmath/jn.cpp
index 37e5f428e5e26..a53e305bb8746 100644
--- a/flang/runtime/Float128Math/jn.cpp
+++ b/flang-rt/lib/quadmath/jn.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/jn.cpp ---------------------------------------===//
+//===-- lib/quadmath/jn.cpp -------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/lgamma.cpp b/flang-rt/lib/quadmath/lgamma.cpp
similarity index 87%
rename from flang/runtime/Float128Math/lgamma.cpp
rename to flang-rt/lib/quadmath/lgamma.cpp
index 54d0dd8083868..b96dff1d0d72a 100644
--- a/flang/runtime/Float128Math/lgamma.cpp
+++ b/flang-rt/lib/quadmath/lgamma.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/lgamma.cpp -----------------------------------===//
+//===-- lib/quadmath/lgamma.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/llround.cpp b/flang-rt/lib/quadmath/llround.cpp
similarity index 87%
rename from flang/runtime/Float128Math/llround.cpp
rename to flang-rt/lib/quadmath/llround.cpp
index f0c53ccdf66fd..8f2913d390431 100644
--- a/flang/runtime/Float128Math/llround.cpp
+++ b/flang-rt/lib/quadmath/llround.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/llround.cpp ----------------------------------===//
+//===-- lib/quadmath/llround.cpp --------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/log.cpp b/flang-rt/lib/quadmath/log.cpp
similarity index 87%
rename from flang/runtime/Float128Math/log.cpp
rename to flang-rt/lib/quadmath/log.cpp
index 28fec1958f10b..0c489c922a3fc 100644
--- a/flang/runtime/Float128Math/log.cpp
+++ b/flang-rt/lib/quadmath/log.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/log.cpp --------------------------------------===//
+//===-- lib/quadmath/log.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/log10.cpp b/flang-rt/lib/quadmath/log10.cpp
similarity index 87%
rename from flang/runtime/Float128Math/log10.cpp
rename to flang-rt/lib/quadmath/log10.cpp
index f844d508f8d3b..a2f222e15a147 100644
--- a/flang/runtime/Float128Math/log10.cpp
+++ b/flang-rt/lib/quadmath/log10.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/log10.cpp ------------------------------------===//
+//===-- lib/quadmath/log10.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/lround.cpp b/flang-rt/lib/quadmath/lround.cpp
similarity index 87%
rename from flang/runtime/Float128Math/lround.cpp
rename to flang-rt/lib/quadmath/lround.cpp
index 8c2d3315c62a7..539ee107a3881 100644
--- a/flang/runtime/Float128Math/lround.cpp
+++ b/flang-rt/lib/quadmath/lround.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/lround.cpp -----------------------------------===//
+//===-- lib/quadmath/lround.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/math-entries.h b/flang-rt/lib/quadmath/math-entries.h
similarity index 96%
rename from flang/runtime/Float128Math/math-entries.h
rename to flang-rt/lib/quadmath/math-entries.h
index a94503fe8e67a..6e47f32cc8a43 100644
--- a/flang/runtime/Float128Math/math-entries.h
+++ b/flang-rt/lib/quadmath/math-entries.h
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/math-entries.h ---------------------*- C++ -*-===//
+//===-- lib/quadmath/math-entries.h -----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_FLOAT128MATH_MATH_ENTRIES_H_
-#define FORTRAN_RUNTIME_FLOAT128MATH_MATH_ENTRIES_H_
-#include "terminator.h"
-#include "tools.h"
+#ifndef FLANG_RT_QUADMATH_MATH_ENTRIES_H_
+#define FLANG_RT_QUADMATH_MATH_ENTRIES_H_
+
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Common/float128.h"
 #include "flang/Runtime/entry-names.h"
 #include <cfloat>
@@ -231,4 +232,4 @@ DEFINE_SIMPLE_ALIAS(Yn, ynl)
 
 } // namespace Fortran::runtime
 
-#endif // FORTRAN_RUNTIME_FLOAT128MATH_MATH_ENTRIES_H_
+#endif // FLANG_RT_QUADMATH_MATH_ENTRIES_H_
diff --git a/flang/runtime/Float128Math/mod-real.cpp b/flang-rt/lib/quadmath/mod-real.cpp
similarity index 88%
rename from flang/runtime/Float128Math/mod-real.cpp
rename to flang-rt/lib/quadmath/mod-real.cpp
index e831c2df4abc1..0230964e3ddc2 100644
--- a/flang/runtime/Float128Math/mod-real.cpp
+++ b/flang-rt/lib/quadmath/mod-real.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/mod-real.cpp ---------------------------------===//
+//===-- lib/quadmath/mod-real.cpp -------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/modulo-real.cpp b/flang-rt/lib/quadmath/modulo-real.cpp
similarity index 89%
rename from flang/runtime/Float128Math/modulo-real.cpp
rename to flang-rt/lib/quadmath/modulo-real.cpp
index 88729da7e3987..0f28747b86985 100644
--- a/flang/runtime/Float128Math/modulo-real.cpp
+++ b/flang-rt/lib/quadmath/modulo-real.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/modulo-real.cpp ------------------------------===//
+//===-- lib/quadmath/modulo-real.cpp ----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/nearbyint.cpp b/flang-rt/lib/quadmath/nearbyint.cpp
similarity index 87%
rename from flang/runtime/Float128Math/nearbyint.cpp
rename to flang-rt/lib/quadmath/nearbyint.cpp
index 9eecb0c5f3e2f..3811fc53d1d82 100644
--- a/flang/runtime/Float128Math/nearbyint.cpp
+++ b/flang-rt/lib/quadmath/nearbyint.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/nearbyint.cpp --------------------------------===//
+//===-- lib/quadmath/nearbyint.cpp ------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/nearest.cpp b/flang-rt/lib/quadmath/nearest.cpp
similarity index 88%
rename from flang/runtime/Float128Math/nearest.cpp
rename to flang-rt/lib/quadmath/nearest.cpp
index 50f6e7ea75a60..8c1969a7b596c 100644
--- a/flang/runtime/Float128Math/nearest.cpp
+++ b/flang-rt/lib/quadmath/nearest.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/nearest.cpp ----------------------------------===//
+//===-- lib/quadmath/nearest.cpp --------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/norm2.cpp b/flang-rt/lib/quadmath/norm2.cpp
similarity index 89%
rename from flang/runtime/Float128Math/norm2.cpp
rename to flang-rt/lib/quadmath/norm2.cpp
index 18e9c8cc8a2b9..e98f4007737d1 100644
--- a/flang/runtime/Float128Math/norm2.cpp
+++ b/flang-rt/lib/quadmath/norm2.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/norm2.cpp ------------------------------------===//
+//===-- lib/quadmath/norm2.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,7 +8,7 @@
 
 #include "math-entries.h"
 #include "numeric-template-specs.h"
-#include "reduction-templates.h"
+#include "flang-rt/runtime/reduction-templates.h"
 
 namespace Fortran::runtime {
 extern "C" {
diff --git a/flang/runtime/Float128Math/numeric-template-specs.h b/flang-rt/lib/quadmath/numeric-template-specs.h
similarity index 82%
rename from flang/runtime/Float128Math/numeric-template-specs.h
rename to flang-rt/lib/quadmath/numeric-template-specs.h
index a0a77230c3e9e..e215ad70eca14 100644
--- a/flang/runtime/Float128Math/numeric-template-specs.h
+++ b/flang-rt/lib/quadmath/numeric-template-specs.h
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/numeric-template-specs.h -----------*- C++ -*-===//
+//===-- lib/quadmath/numeric-template-specs.h -------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_FLOAT128MATH_NUMERIC_TEMPLATE_SPECS_H_
-#define FORTRAN_RUNTIME_FLOAT128MATH_NUMERIC_TEMPLATE_SPECS_H_
+#ifndef FLANG_RT_QUADMATH_NUMERIC_TEMPLATE_SPECS_H_
+#define FLANG_RT_QUADMATH_NUMERIC_TEMPLATE_SPECS_H_
 
 #include "math-entries.h"
-#include "numeric-templates.h"
+#include "flang-rt/runtime/numeric-templates.h"
 
 namespace Fortran::runtime {
 using F128Type = CppTypeFor<TypeCategory::Real, 16>;
@@ -52,4 +52,4 @@ template <> struct SQRTTy<F128Type> {
 };
 
 } // namespace Fortran::runtime
-#endif // FORTRAN_RUNTIME_FLOAT128MATH_NUMERIC_TEMPLATE_SPECS_H_
+#endif // FLANG_RT_QUADMATH_NUMERIC_TEMPLATE_SPECS_H_
diff --git a/flang/runtime/Float128Math/pow.cpp b/flang-rt/lib/quadmath/pow.cpp
similarity index 88%
rename from flang/runtime/Float128Math/pow.cpp
rename to flang-rt/lib/quadmath/pow.cpp
index 99aae04c65ecb..29c0536254658 100644
--- a/flang/runtime/Float128Math/pow.cpp
+++ b/flang-rt/lib/quadmath/pow.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/pow.cpp --------------------------------------===//
+//===-- lib/quadmath/pow.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/random.cpp b/flang-rt/lib/quadmath/random.cpp
similarity index 83%
rename from flang/runtime/Float128Math/random.cpp
rename to flang-rt/lib/quadmath/random.cpp
index 93c5c14cee37d..a6d22733ebce4 100644
--- a/flang/runtime/Float128Math/random.cpp
+++ b/flang-rt/lib/quadmath/random.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/random.cpp -----------------------------------===//
+//===-- lib/quadmath/random.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,7 +8,7 @@
 
 #include "math-entries.h"
 #include "numeric-template-specs.h"
-#include "random-templates.h"
+#include "flang-rt/runtime/random-templates.h"
 
 using namespace Fortran::runtime::random;
 extern "C" {
diff --git a/flang/runtime/Float128Math/remainder.cpp b/flang-rt/lib/quadmath/remainder.cpp
similarity index 88%
rename from flang/runtime/Float128Math/remainder.cpp
rename to flang-rt/lib/quadmath/remainder.cpp
index e5c2793dab71a..4b68cdd6ac9de 100644
--- a/flang/runtime/Float128Math/remainder.cpp
+++ b/flang-rt/lib/quadmath/remainder.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/remainder.cpp --------------------------------===//
+//===-- lib/quadmath/remainder.cpp ------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/round.cpp b/flang-rt/lib/quadmath/round.cpp
similarity index 89%
rename from flang/runtime/Float128Math/round.cpp
rename to flang-rt/lib/quadmath/round.cpp
index e79ce30536b3b..844338f5e6413 100644
--- a/flang/runtime/Float128Math/round.cpp
+++ b/flang-rt/lib/quadmath/round.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/round.cpp ------------------------------------===//
+//===-- lib/quadmath/round.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/rrspacing.cpp b/flang-rt/lib/quadmath/rrspacing.cpp
similarity index 87%
rename from flang/runtime/Float128Math/rrspacing.cpp
rename to flang-rt/lib/quadmath/rrspacing.cpp
index 04cefc049bbee..e8613f4d7d7e2 100644
--- a/flang/runtime/Float128Math/rrspacing.cpp
+++ b/flang-rt/lib/quadmath/rrspacing.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/rrspacing.cpp --------------------------------===//
+//===-- lib/quadmath/rrspacing.cpp ------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/scale.cpp b/flang-rt/lib/quadmath/scale.cpp
similarity index 90%
rename from flang/runtime/Float128Math/scale.cpp
rename to flang-rt/lib/quadmath/scale.cpp
index 6b083afbdf4d1..3d919f85a4487 100644
--- a/flang/runtime/Float128Math/scale.cpp
+++ b/flang-rt/lib/quadmath/scale.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/scale.cpp ------------------------------------===//
+//===-- lib/quadmath/scale.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/set-exponent.cpp b/flang-rt/lib/quadmath/set-exponent.cpp
similarity index 88%
rename from flang/runtime/Float128Math/set-exponent.cpp
rename to flang-rt/lib/quadmath/set-exponent.cpp
index 63c5b325085fb..d6b582e7c4f38 100644
--- a/flang/runtime/Float128Math/set-exponent.cpp
+++ b/flang-rt/lib/quadmath/set-exponent.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/set-exponent.cpp -----------------------------===//
+//===-- lib/quadmath/set-exponent.cpp ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/sin.cpp b/flang-rt/lib/quadmath/sin.cpp
similarity index 87%
rename from flang/runtime/Float128Math/sin.cpp
rename to flang-rt/lib/quadmath/sin.cpp
index 99fa3e493e694..dcff2f9ce02ca 100644
--- a/flang/runtime/Float128Math/sin.cpp
+++ b/flang-rt/lib/quadmath/sin.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/sin.cpp --------------------------------------===//
+//===-- lib/quadmath/sin.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/sinh.cpp b/flang-rt/lib/quadmath/sinh.cpp
similarity index 87%
rename from flang/runtime/Float128Math/sinh.cpp
rename to flang-rt/lib/quadmath/sinh.cpp
index b6cd96963612e..3ab7280f705a6 100644
--- a/flang/runtime/Float128Math/sinh.cpp
+++ b/flang-rt/lib/quadmath/sinh.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/sinh.cpp -------------------------------------===//
+//===-- lib/quadmath/sinh.cpp -----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/spacing.cpp b/flang-rt/lib/quadmath/spacing.cpp
similarity index 87%
rename from flang/runtime/Float128Math/spacing.cpp
rename to flang-rt/lib/quadmath/spacing.cpp
index fc6aa2c4ec2d8..1d7ecdb4852d2 100644
--- a/flang/runtime/Float128Math/spacing.cpp
+++ b/flang-rt/lib/quadmath/spacing.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/spacing.cpp ----------------------------------===//
+//===-- lib/quadmath/spacing.cpp --------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/sqrt.cpp b/flang-rt/lib/quadmath/sqrt.cpp
similarity index 87%
rename from flang/runtime/Float128Math/sqrt.cpp
rename to flang-rt/lib/quadmath/sqrt.cpp
index 871c66e007984..6e0d11a6697f0 100644
--- a/flang/runtime/Float128Math/sqrt.cpp
+++ b/flang-rt/lib/quadmath/sqrt.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/sqrt.cpp -------------------------------------===//
+//===-- lib/quadmath/sqrt.cpp -----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/tan.cpp b/flang-rt/lib/quadmath/tan.cpp
similarity index 87%
rename from flang/runtime/Float128Math/tan.cpp
rename to flang-rt/lib/quadmath/tan.cpp
index 2d6f448ba8955..6f09b93060228 100644
--- a/flang/runtime/Float128Math/tan.cpp
+++ b/flang-rt/lib/quadmath/tan.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/tan.cpp --------------------------------------===//
+//===-- lib/quadmath/tan.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/tanh.cpp b/flang-rt/lib/quadmath/tanh.cpp
similarity index 87%
rename from flang/runtime/Float128Math/tanh.cpp
rename to flang-rt/lib/quadmath/tanh.cpp
index f6321f4819191..214a18d5c3778 100644
--- a/flang/runtime/Float128Math/tanh.cpp
+++ b/flang-rt/lib/quadmath/tanh.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/tanh.cpp -------------------------------------===//
+//===-- lib/quadmath/tanh.cpp -----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/tgamma.cpp b/flang-rt/lib/quadmath/tgamma.cpp
similarity index 87%
rename from flang/runtime/Float128Math/tgamma.cpp
rename to flang-rt/lib/quadmath/tgamma.cpp
index 98fd792a63330..2b05a60dcaabb 100644
--- a/flang/runtime/Float128Math/tgamma.cpp
+++ b/flang-rt/lib/quadmath/tgamma.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/tgamma.cpp -----------------------------------===//
+//===-- lib/quadmath/tgamma.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/trunc.cpp b/flang-rt/lib/quadmath/trunc.cpp
similarity index 89%
rename from flang/runtime/Float128Math/trunc.cpp
rename to flang-rt/lib/quadmath/trunc.cpp
index 54fa33176813c..cd7c27b569fc3 100644
--- a/flang/runtime/Float128Math/trunc.cpp
+++ b/flang-rt/lib/quadmath/trunc.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/trunc.cpp ------------------------------------===//
+//===-- lib/quadmath/trunc.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/y0.cpp b/flang-rt/lib/quadmath/y0.cpp
similarity index 87%
rename from flang/runtime/Float128Math/y0.cpp
rename to flang-rt/lib/quadmath/y0.cpp
index 0b3059b4cfe25..9db04277660ad 100644
--- a/flang/runtime/Float128Math/y0.cpp
+++ b/flang-rt/lib/quadmath/y0.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/y0.cpp ---------------------------------------===//
+//===-- lib/quadmath/y0.cpp -------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/y1.cpp b/flang-rt/lib/quadmath/y1.cpp
similarity index 87%
rename from flang/runtime/Float128Math/y1.cpp
rename to flang-rt/lib/quadmath/y1.cpp
index cb39d87034dc7..92e658195f3d9 100644
--- a/flang/runtime/Float128Math/y1.cpp
+++ b/flang-rt/lib/quadmath/y1.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/y1.cpp ---------------------------------------===//
+//===-- lib/quadmath/y1.cpp -------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/Float128Math/yn.cpp b/flang-rt/lib/quadmath/yn.cpp
similarity index 87%
rename from flang/runtime/Float128Math/yn.cpp
rename to flang-rt/lib/quadmath/yn.cpp
index bef8f9457df2f..20c0bc9d5218e 100644
--- a/flang/runtime/Float128Math/yn.cpp
+++ b/flang-rt/lib/quadmath/yn.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/Float128Math/yn.cpp ---------------------------------------===//
+//===-- lib/quadmath/yn.cpp -------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/ISO_Fortran_binding.cpp b/flang-rt/lib/runtime/ISO_Fortran_binding.cpp
similarity index 97%
rename from flang/runtime/ISO_Fortran_binding.cpp
rename to flang-rt/lib/runtime/ISO_Fortran_binding.cpp
index 64e239f498230..a5f8b357ae0b8 100644
--- a/flang/runtime/ISO_Fortran_binding.cpp
+++ b/flang-rt/lib/runtime/ISO_Fortran_binding.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/ISO_Fortran_binding.cpp -----------------------------------===//
+//===-- lib/runtime/ISO_Fortran_binding.cpp ---------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,11 +10,11 @@
 // as specified in section 18.5.5 of Fortran 2018.
 
 #include "ISO_Fortran_util.h"
-#include "terminator.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/type-code.h"
 #include "flang/Common/ISO_Fortran_binding_wrapper.h"
-#include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/pointer.h"
-#include "flang/Runtime/type-code.h"
 #include <cstdlib>
 
 namespace Fortran::ISO {
diff --git a/flang/runtime/ISO_Fortran_util.h b/flang-rt/lib/runtime/ISO_Fortran_util.h
similarity index 90%
rename from flang/runtime/ISO_Fortran_util.h
rename to flang-rt/lib/runtime/ISO_Fortran_util.h
index aca9aee8c5718..9bbc03eefc490 100644
--- a/flang/runtime/ISO_Fortran_util.h
+++ b/flang-rt/lib/runtime/ISO_Fortran_util.h
@@ -1,4 +1,4 @@
-//===-- runtime/ISO_Fortran_util.h ------------------------------*- C++ -*-===//
+//===-- lib/runtime/ISO_Fortran_util.h --------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,15 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_ISO_FORTRAN_UTIL_H_
-#define FORTRAN_RUNTIME_ISO_FORTRAN_UTIL_H_
+#ifndef FLANG_RT_RUNTIME_ISO_FORTRAN_UTIL_H_
+#define FLANG_RT_RUNTIME_ISO_FORTRAN_UTIL_H_
 
 // Internal utils for establishing CFI_cdesc_t descriptors.
 
-#include "terminator.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/type-code.h"
 #include "flang/Common/ISO_Fortran_binding_wrapper.h"
-#include "flang/Runtime/descriptor.h"
-#include "flang/Runtime/type-code.h"
 #include <cstdlib>
 
 namespace Fortran::ISO {
@@ -99,4 +99,4 @@ static inline RT_API_ATTRS void EstablishDescriptor(CFI_cdesc_t *descriptor,
   }
 }
 } // namespace Fortran::ISO
-#endif // FORTRAN_RUNTIME_ISO_FORTRAN_UTIL_H_
+#endif // FLANG_RT_RUNTIME_ISO_FORTRAN_UTIL_H_
diff --git a/flang/runtime/allocatable.cpp b/flang-rt/lib/runtime/allocatable.cpp
similarity index 96%
rename from flang/runtime/allocatable.cpp
rename to flang-rt/lib/runtime/allocatable.cpp
index 686114bf86eaf..a51816129199a 100644
--- a/flang/runtime/allocatable.cpp
+++ b/flang-rt/lib/runtime/allocatable.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/allocatable.cpp -------------------------------------------===//
+//===-- lib/runtime/allocatable.cpp -----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,14 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/allocatable.h"
-#include "assign-impl.h"
-#include "derived.h"
-#include "stat.h"
-#include "terminator.h"
-#include "type-info.h"
+#include "flang-rt/runtime/assign-impl.h"
+#include "flang-rt/runtime/derived.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/type-info.h"
 #include "flang/Common/ISO_Fortran_binding_wrapper.h"
 #include "flang/Runtime/assign.h"
-#include "flang/Runtime/descriptor.h"
 
 namespace Fortran::runtime {
 extern "C" {
diff --git a/flang/runtime/allocator-registry.cpp b/flang-rt/lib/runtime/allocator-registry.cpp
similarity index 87%
rename from flang/runtime/allocator-registry.cpp
rename to flang-rt/lib/runtime/allocator-registry.cpp
index f5670331d6dbe..f8a8daaf8e748 100644
--- a/flang/runtime/allocator-registry.cpp
+++ b/flang-rt/lib/runtime/allocator-registry.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/allocator-registry.cpp ------------------------------------===//
+//===-- lib/runtime/allocator-registry.cpp ----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "flang/Runtime/allocator-registry.h"
-#include "terminator.h"
+#include "flang-rt/runtime/allocator-registry.h"
+#include "flang-rt/runtime/terminator.h"
 
 namespace Fortran::runtime {
 
diff --git a/flang/runtime/array-constructor.cpp b/flang-rt/lib/runtime/array-constructor.cpp
similarity index 96%
rename from flang/runtime/array-constructor.cpp
rename to flang-rt/lib/runtime/array-constructor.cpp
index c6953167f5fb2..7e267e714927f 100644
--- a/flang/runtime/array-constructor.cpp
+++ b/flang-rt/lib/runtime/array-constructor.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/array-constructor.cpp -------------------------------------===//
+//===-- lib/runtime/array-constructor.cpp -----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,14 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "flang/Runtime/array-constructor.h"
-#include "derived.h"
-#include "terminator.h"
-#include "tools.h"
-#include "type-info.h"
+#include "flang-rt/runtime/array-constructor.h"
+#include "flang-rt/runtime/derived.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
+#include "flang-rt/runtime/type-info.h"
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/assign.h"
-#include "flang/Runtime/descriptor.h"
 
 namespace Fortran::runtime {
 
diff --git a/flang/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp
similarity index 98%
rename from flang/runtime/assign.cpp
rename to flang-rt/lib/runtime/assign.cpp
index 8f0efaa376c19..a1f3715f278c1 100644
--- a/flang/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/assign.cpp ------------------------------------------------===//
+//===-- lib/runtime/assign.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,13 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/assign.h"
-#include "assign-impl.h"
-#include "derived.h"
-#include "stat.h"
-#include "terminator.h"
-#include "tools.h"
-#include "type-info.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/assign-impl.h"
+#include "flang-rt/runtime/derived.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
+#include "flang-rt/runtime/type-info.h"
 
 namespace Fortran::runtime {
 
diff --git a/flang/runtime/buffer.cpp b/flang-rt/lib/runtime/buffer.cpp
similarity index 88%
rename from flang/runtime/buffer.cpp
rename to flang-rt/lib/runtime/buffer.cpp
index 7b4869d69c2e5..4cf85e13b6d36 100644
--- a/flang/runtime/buffer.cpp
+++ b/flang-rt/lib/runtime/buffer.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/buffer.cpp ------------------------------------------------===//
+//===-- lib/runtime/buffer.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "buffer.h"
+#include "flang-rt/runtime/buffer.h"
 #include <algorithm>
 
 namespace Fortran::runtime::io {
diff --git a/flang/runtime/character.cpp b/flang-rt/lib/runtime/character.cpp
similarity index 99%
rename from flang/runtime/character.cpp
rename to flang-rt/lib/runtime/character.cpp
index 5049247397eb3..10cf27c37c4d8 100644
--- a/flang/runtime/character.cpp
+++ b/flang-rt/lib/runtime/character.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/character.cpp ---------------------------------------------===//
+//===-- lib/runtime/character.cpp -------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,13 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/character.h"
-#include "terminator.h"
-#include "tools.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Common/bit-population-count.h"
 #include "flang/Common/uint128.h"
 #include "flang/Runtime/character.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
 #include <algorithm>
 #include <cstring>
 
diff --git a/flang/runtime/command.cpp b/flang-rt/lib/runtime/command.cpp
similarity index 96%
rename from flang/runtime/command.cpp
rename to flang-rt/lib/runtime/command.cpp
index a555e26f96a66..8a5a61ac1ad44 100644
--- a/flang/runtime/command.cpp
+++ b/flang-rt/lib/runtime/command.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/command.cpp -----------------------------------------------===//
+//===-- lib/runtime/command.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,11 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/command.h"
-#include "environment.h"
-#include "stat.h"
-#include "terminator.h"
-#include "tools.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include <cstdlib>
 #include <limits>
 
diff --git a/flang/runtime/complex-powi.cpp b/flang-rt/lib/runtime/complex-powi.cpp
similarity index 92%
rename from flang/runtime/complex-powi.cpp
rename to flang-rt/lib/runtime/complex-powi.cpp
index 62f73e037b76f..a561d114591cf 100644
--- a/flang/runtime/complex-powi.cpp
+++ b/flang-rt/lib/runtime/complex-powi.cpp
@@ -1,11 +1,11 @@
-/*===-- flang/runtime/complex-powi.cpp ----------------------------*- C++ -*-===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- * ===-----------------------------------------------------------------------===
- */
+//===-- lib/runtime/complex-powi.cpp ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include "flang/Common/float128.h"
 #include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/entry-names.h"
diff --git a/flang/runtime/complex-reduction.c b/flang-rt/lib/runtime/complex-reduction.c
similarity index 97%
rename from flang/runtime/complex-reduction.c
rename to flang-rt/lib/runtime/complex-reduction.c
index de1ff3d683084..967f26c05e702 100644
--- a/flang/runtime/complex-reduction.c
+++ b/flang-rt/lib/runtime/complex-reduction.c
@@ -1,11 +1,10 @@
-/*===-- flang/runtime/complex-reduction.c ---------------------------*- C -*-===
+/*===-- lib/runtime/complex-reduction.c -----------------------------*- C -*-===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  *
- * ===-----------------------------------------------------------------------===
- */
+ *===----------------------------------------------------------------------===*/
 
 #include "complex-reduction.h"
 #include <float.h>
diff --git a/flang/runtime/complex-reduction.h b/flang-rt/lib/runtime/complex-reduction.h
similarity index 96%
rename from flang/runtime/complex-reduction.h
rename to flang-rt/lib/runtime/complex-reduction.h
index 5ff2a828fb3ec..44c52fb02fa43 100644
--- a/flang/runtime/complex-reduction.h
+++ b/flang-rt/lib/runtime/complex-reduction.h
@@ -1,19 +1,18 @@
-/*===-- flang/runtime/complex-reduction.h ---------------------------*- C -*-===
+/*===-- lib/runtime/complex-reduction.h -----------------------------*- C -*-===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  *
- * ===-----------------------------------------------------------------------===
- */
+ *===----------------------------------------------------------------------===*/
 
 /* Wraps the C++-coded complex-valued SUM and PRODUCT reductions with
  * C-coded wrapper functions returning _Complex values, to avoid problems
  * with C++ build compilers that don't support C's _Complex.
  */
 
-#ifndef FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_
-#define FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_
+#ifndef FLANG_RT_RUNTIME_COMPLEX_REDUCTION_H_
+#define FLANG_RT_RUNTIME_COMPLEX_REDUCTION_H_
 
 #include "flang/Common/float128.h"
 #include "flang/Runtime/entry-names.h"
@@ -156,4 +155,4 @@ void RTNAME(ReduceComplex16DimValue)(
     REDUCE_DIM_ARGS(CFloat128ComplexType, CFloat128ComplexType_value_op));
 #endif
 
-#endif // FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_
+#endif // FLANG_RT_RUNTIME_COMPLEX_REDUCTION_H_
diff --git a/flang/runtime/connection.cpp b/flang-rt/lib/runtime/connection.cpp
similarity index 90%
rename from flang/runtime/connection.cpp
rename to flang-rt/lib/runtime/connection.cpp
index f24f0e832eb48..2f01dbbb95920 100644
--- a/flang/runtime/connection.cpp
+++ b/flang-rt/lib/runtime/connection.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/connection.cpp --------------------------------------------===//
+//===-- lib/runtime/connection.cpp ------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,9 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "connection.h"
-#include "environment.h"
-#include "io-stmt.h"
+#include "flang-rt/runtime/connection.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/io-stmt.h"
 #include <algorithm>
 
 namespace Fortran::runtime::io {
diff --git a/flang/runtime/copy.cpp b/flang-rt/lib/runtime/copy.cpp
similarity index 97%
rename from flang/runtime/copy.cpp
rename to flang-rt/lib/runtime/copy.cpp
index b20f68f019498..5956642dd7258 100644
--- a/flang/runtime/copy.cpp
+++ b/flang-rt/lib/runtime/copy.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/copy.cpp -------------------------------------------------===//
+//===-- lib/runtime/copy.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,10 +8,10 @@
 
 #include "copy.h"
 #include "stack.h"
-#include "terminator.h"
-#include "type-info.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/type-info.h"
 #include "flang/Runtime/allocatable.h"
-#include "flang/Runtime/descriptor.h"
 #include <cstring>
 
 namespace Fortran::runtime {
diff --git a/flang/runtime/copy.h b/flang-rt/lib/runtime/copy.h
similarity index 78%
rename from flang/runtime/copy.h
rename to flang-rt/lib/runtime/copy.h
index 542660530bfb6..836c9d4a1ef89 100644
--- a/flang/runtime/copy.h
+++ b/flang-rt/lib/runtime/copy.h
@@ -1,4 +1,4 @@
-//===-- runtime/copy.h ------------------------------------------*- C++ -*-===//
+//===-- lib/runtime/copy.h --------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,10 +9,10 @@
 // Utilities that copy data in a type-aware fashion, allocating & duplicating
 // allocatable/automatic components of derived types along the way.
 
-#ifndef FORTRAN_RUNTIME_COPY_H_
-#define FORTRAN_RUNTIME_COPY_H_
+#ifndef FLANG_RT_RUNTIME_COPY_H_
+#define FLANG_RT_RUNTIME_COPY_H_
 
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
 
 namespace Fortran::runtime {
 
@@ -22,4 +22,4 @@ RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
     const Descriptor &from, const SubscriptValue fromAt[], Terminator &);
 
 } // namespace Fortran::runtime
-#endif // FORTRAN_RUNTIME_COPY_H_
+#endif // FLANG_RT_RUNTIME_COPY_H_
diff --git a/flang/runtime/derived-api.cpp b/flang-rt/lib/runtime/derived-api.cpp
similarity index 95%
rename from flang/runtime/derived-api.cpp
rename to flang-rt/lib/runtime/derived-api.cpp
index c8ffd8e3bb67c..884fa8ee7d095 100644
--- a/flang/runtime/derived-api.cpp
+++ b/flang-rt/lib/runtime/derived-api.cpp
@@ -1,5 +1,4 @@
-//===-- runtime/derived-api.cpp
-//-----------------------------------------------===//
+//===-- lib/runtime/derived-api.cpp -----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,11 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/derived-api.h"
-#include "derived.h"
-#include "terminator.h"
-#include "tools.h"
-#include "type-info.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/derived.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
+#include "flang-rt/runtime/type-info.h"
 
 namespace Fortran::runtime {
 
diff --git a/flang/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
similarity index 98%
rename from flang/runtime/derived.cpp
rename to flang-rt/lib/runtime/derived.cpp
index 10813c62e5da1..87e4b29d08c28 100644
--- a/flang/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/derived.cpp -----------------------------------------------===//
+//===-- lib/runtime/derived.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "derived.h"
-#include "stat.h"
-#include "terminator.h"
-#include "tools.h"
-#include "type-info.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/derived.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
+#include "flang-rt/runtime/type-info.h"
 
 namespace Fortran::runtime {
 
diff --git a/flang/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp
similarity index 98%
rename from flang/runtime/descriptor-io.cpp
rename to flang-rt/lib/runtime/descriptor-io.cpp
index 380ad425d925f..3db1455af52fe 100644
--- a/flang/runtime/descriptor-io.cpp
+++ b/flang-rt/lib/runtime/descriptor-io.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/descriptor-io.cpp -----------------------------------------===//
+//===-- lib/runtime/descriptor-io.cpp ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/descriptor-io.h b/flang-rt/lib/runtime/descriptor-io.h
similarity index 98%
rename from flang/runtime/descriptor-io.h
rename to flang-rt/lib/runtime/descriptor-io.h
index 1034958bf654a..dd399164325cb 100644
--- a/flang/runtime/descriptor-io.h
+++ b/flang-rt/lib/runtime/descriptor-io.h
@@ -1,4 +1,4 @@
-//===-- runtime/descriptor-io.h ---------------------------------*- C++ -*-===//
+//===-- lib/runtime/descriptor-io.h -----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_DESCRIPTOR_IO_H_
-#define FORTRAN_RUNTIME_DESCRIPTOR_IO_H_
+#ifndef FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
+#define FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
 
 // Implementation of I/O data list item transfers based on descriptors.
 // (All I/O items come through here so that the code is exercised for test;
@@ -16,15 +16,15 @@
 
 #include "edit-input.h"
 #include "edit-output.h"
-#include "io-stmt.h"
-#include "namelist.h"
-#include "terminator.h"
-#include "type-info.h"
 #include "unit.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/namelist.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/type-info.h"
 #include "flang/Common/optional.h"
 #include "flang/Common/uint128.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
 
 namespace Fortran::runtime::io::descr {
 template <typename A>
@@ -626,4 +626,4 @@ static RT_API_ATTRS bool DescriptorIO(IoStatementState &io,
   return false;
 }
 } // namespace Fortran::runtime::io::descr
-#endif // FORTRAN_RUNTIME_DESCRIPTOR_IO_H_
+#endif // FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
diff --git a/flang/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp
similarity index 94%
rename from flang/runtime/descriptor.cpp
rename to flang-rt/lib/runtime/descriptor.cpp
index 32f43e89dc7a3..8241a34a4990c 100644
--- a/flang/runtime/descriptor.cpp
+++ b/flang-rt/lib/runtime/descriptor.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/descriptor.cpp --------------------------------------------===//
+//===-- lib/runtime/descriptor.cpp ------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,15 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
 #include "ISO_Fortran_util.h"
-#include "derived.h"
 #include "memory.h"
-#include "stat.h"
-#include "terminator.h"
-#include "tools.h"
-#include "type-info.h"
-#include "flang/Runtime/allocator-registry.h"
+#include "flang-rt/runtime/allocator-registry.h"
+#include "flang-rt/runtime/derived.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
+#include "flang-rt/runtime/type-info.h"
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
@@ -141,8 +141,14 @@ RT_API_ATTRS OwningPtr<Descriptor> Descriptor::Create(
 
 RT_API_ATTRS std::size_t Descriptor::SizeInBytes() const {
   const DescriptorAddendum *addendum{Addendum()};
-  return sizeof *this - sizeof(Dimension) + raw_.rank * sizeof(Dimension) +
-      (addendum ? addendum->SizeInBytes() : 0);
+  // Size of this object with its Dimension storage scaled to raw_.rank
+  // entries, plus the addendum's size when one is present.
+  std::size_t bytes{sizeof *this - sizeof(Dimension) +
+      raw_.rank * sizeof(Dimension) + (addendum ? addendum->SizeInBytes() : 0)};
+  // Checked only when assertions are enabled (NDEBUG not defined).
+  assert(bytes <= MaxDescriptorSizeInBytes(raw_.rank, addendum) &&
+      "Descriptor must fit compiler-allocated space");
+  return bytes;
 }
 
 RT_API_ATTRS std::size_t Descriptor::Elements() const {
diff --git a/flang/runtime/dot-product.cpp b/flang-rt/lib/runtime/dot-product.cpp
similarity index 98%
rename from flang/runtime/dot-product.cpp
rename to flang-rt/lib/runtime/dot-product.cpp
index 712497a3a50ac..20612f1876c15 100644
--- a/flang/runtime/dot-product.cpp
+++ b/flang-rt/lib/runtime/dot-product.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/dot-product.cpp -------------------------------------------===//
+//===-- lib/runtime/dot-product.cpp -----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,11 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "float.h"
-#include "terminator.h"
-#include "tools.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Common/float128.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/reduction.h"
 #include <cfloat>
 #include <cinttypes>
diff --git a/flang/runtime/edit-input.cpp b/flang-rt/lib/runtime/edit-input.cpp
similarity index 99%
rename from flang/runtime/edit-input.cpp
rename to flang-rt/lib/runtime/edit-input.cpp
index 317f0b676bd21..99a266648f95c 100644
--- a/flang/runtime/edit-input.cpp
+++ b/flang-rt/lib/runtime/edit-input.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/edit-input.cpp --------------------------------------------===//
+//===-- lib/runtime/edit-input.cpp ------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "edit-input.h"
-#include "namelist.h"
-#include "utf.h"
+#include "flang-rt/runtime/namelist.h"
+#include "flang-rt/runtime/utf.h"
 #include "flang/Common/optional.h"
 #include "flang/Common/real.h"
 #include "flang/Common/uint128.h"
diff --git a/flang/runtime/edit-input.h b/flang-rt/lib/runtime/edit-input.h
similarity index 87%
rename from flang/runtime/edit-input.h
rename to flang-rt/lib/runtime/edit-input.h
index 55a7a45578171..686cd461b3e34 100644
--- a/flang/runtime/edit-input.h
+++ b/flang-rt/lib/runtime/edit-input.h
@@ -1,4 +1,4 @@
-//===-- runtime/edit-input.h ------------------------------------*- C++ -*-===//
+//===-- lib/runtime/edit-input.h --------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_EDIT_INPUT_H_
-#define FORTRAN_RUNTIME_EDIT_INPUT_H_
+#ifndef FLANG_RT_RUNTIME_EDIT_INPUT_H_
+#define FLANG_RT_RUNTIME_EDIT_INPUT_H_
 
-#include "format.h"
-#include "io-stmt.h"
+#include "flang-rt/runtime/format.h"
+#include "flang-rt/runtime/io-stmt.h"
 #include "flang/Decimal/decimal.h"
 
 namespace Fortran::runtime::io {
@@ -50,4 +50,4 @@ extern template RT_API_ATTRS bool EditCharacterInput(
     IoStatementState &, const DataEdit &, char32_t *, std::size_t);
 
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_EDIT_INPUT_H_
+#endif // FLANG_RT_RUNTIME_EDIT_INPUT_H_
diff --git a/flang/runtime/edit-output.cpp b/flang-rt/lib/runtime/edit-output.cpp
similarity index 99%
rename from flang/runtime/edit-output.cpp
rename to flang-rt/lib/runtime/edit-output.cpp
index 9db9a3d4a511b..36bbc638ff5fc 100644
--- a/flang/runtime/edit-output.cpp
+++ b/flang-rt/lib/runtime/edit-output.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/edit-output.cpp -------------------------------------------===//
+//===-- lib/runtime/edit-output.cpp -----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "edit-output.h"
-#include "emit-encoded.h"
-#include "utf.h"
+#include "flang-rt/runtime/emit-encoded.h"
+#include "flang-rt/runtime/utf.h"
 #include "flang/Common/real.h"
 #include "flang/Common/uint128.h"
 #include <algorithm>
diff --git a/flang/runtime/edit-output.h b/flang-rt/lib/runtime/edit-output.h
similarity index 95%
rename from flang/runtime/edit-output.h
rename to flang-rt/lib/runtime/edit-output.h
index 42cc993f98cc1..51a47405e49e4 100644
--- a/flang/runtime/edit-output.h
+++ b/flang-rt/lib/runtime/edit-output.h
@@ -1,4 +1,4 @@
-//===-- runtime/edit-output.h -----------------------------------*- C++ -*-===//
+//===-- lib/runtime/edit-output.h -------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_RUNTIME_EDIT_OUTPUT_H_
-#define FORTRAN_RUNTIME_EDIT_OUTPUT_H_
+#ifndef FLANG_RT_RUNTIME_EDIT_OUTPUT_H_
+#define FLANG_RT_RUNTIME_EDIT_OUTPUT_H_
 
 // Output data editing templates implementing the FORMAT data editing
 // descriptors E, EN, ES, EX, D, F, and G for REAL data (and COMPLEX
@@ -18,8 +18,8 @@
 // Drives the same fast binary-to-decimal formatting templates used
 // in the f18 front-end.
 
-#include "format.h"
-#include "io-stmt.h"
+#include "flang-rt/runtime/format.h"
+#include "flang-rt/runtime/io-stmt.h"
 #include "flang/Common/uint128.h"
 #include "flang/Decimal/decimal.h"
 
@@ -138,4 +138,4 @@ extern template class RealOutputEditing<10>;
 extern template class RealOutputEditing<16>;
 
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_EDIT_OUTPUT_H_
+#endif // FLANG_RT_RUNTIME_EDIT_OUTPUT_H_
diff --git a/flang/runtime/environment-default-list.h b/flang-rt/lib/runtime/environment-default-list.h
similarity index 70%
rename from flang/runtime/environment-default-list.h
rename to flang-rt/lib/runtime/environment-default-list.h
index 4da261b10b9a8..76c0955bcce6d 100644
--- a/flang/runtime/environment-default-list.h
+++ b/flang-rt/lib/runtime/environment-default-list.h
@@ -1,14 +1,13 @@
-/*===-- runtime/environment-default-list.h --------------------------*- C -*-===
+/*===-- lib/runtime/environment-default-list.h ----------------------*- C -*-===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  *
- * ===-----------------------------------------------------------------------===
- */
+ *===----------------------------------------------------------------------===*/
 
-#ifndef FORTRAN_RUNTIME_ENVIRONMENT_DEFAULT_LIST_H_
-#define FORTRAN_RUNTIME_ENVIRONMENT_DEFAULT_LIST_H_
+#ifndef FLANG_RT_ENVIRONMENT_DEFAULT_LIST_H_
+#define FLANG_RT_ENVIRONMENT_DEFAULT_LIST_H_
 
 /* Try to maintain C compatibility to make it easier to both define environment
  * defaults in non-Fortran main programs as well as pass through the environment
@@ -28,4 +27,4 @@ struct EnvironmentDefaultList {
   const struct EnvironmentDefaultItem *item;
 };
 
-#endif /* FORTRAN_RUNTIME_ENVIRONMENT_DEFAULT_LIST_H_ */
+#endif /* FLANG_RT_ENVIRONMENT_DEFAULT_LIST_H_ */
diff --git a/flang/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp
similarity index 96%
rename from flang/runtime/environment.cpp
rename to flang-rt/lib/runtime/environment.cpp
index 678d8745c9fd7..15380ba148df5 100644
--- a/flang/runtime/environment.cpp
+++ b/flang-rt/lib/runtime/environment.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/environment.cpp -------------------------------------------===//
+//===-- lib/runtime/environment.cpp -----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "environment.h"
+#include "flang-rt/runtime/environment.h"
 #include "environment-default-list.h"
 #include "memory.h"
-#include "tools.h"
+#include "flang-rt/runtime/tools.h"
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
diff --git a/flang/runtime/exceptions.cpp b/flang-rt/lib/runtime/exceptions.cpp
similarity index 97%
rename from flang/runtime/exceptions.cpp
rename to flang-rt/lib/runtime/exceptions.cpp
index 344e7216cfaae..d25a67c8e9cb5 100644
--- a/flang/runtime/exceptions.cpp
+++ b/flang-rt/lib/runtime/exceptions.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/exceptions.cpp --------------------------------------===//
+//===-- lib/runtime/exceptions.cpp ------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,7 +9,7 @@
 // Runtime exception support.
 
 #include "flang/Runtime/exceptions.h"
-#include "terminator.h"
+#include "flang-rt/runtime/terminator.h"
 #include <cfenv>
 #if defined(__aarch64__) && defined(__GLIBC__)
 #include <fpu_control.h>
diff --git a/flang/runtime/execute.cpp b/flang-rt/lib/runtime/execute.cpp
similarity index 97%
rename from flang/runtime/execute.cpp
rename to flang-rt/lib/runtime/execute.cpp
index c7f8f386d81f4..f180da846a32c 100644
--- a/flang/runtime/execute.cpp
+++ b/flang-rt/lib/runtime/execute.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/execute.cpp -----------------------------------------------===//
+//===-- lib/runtime/execute.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,11 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/execute.h"
-#include "environment.h"
-#include "stat.h"
-#include "terminator.h"
-#include "tools.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include <cstdlib>
 #include <errno.h>
 #include <future>
diff --git a/flang/runtime/extensions.cpp b/flang-rt/lib/runtime/extensions.cpp
similarity index 97%
rename from flang/runtime/extensions.cpp
rename to flang-rt/lib/runtime/extensions.cpp
index ac19ba7b31d4c..75195c33a6c21 100644
--- a/flang/runtime/extensions.cpp
+++ b/flang-rt/lib/runtime/extensions.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/extensions.cpp --------------------------------------------===//
+//===-- lib/runtime/extensions.cpp ------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,10 +10,10 @@
 // extensions that will eventually be implemented in Fortran.
 
 #include "flang/Runtime/extensions.h"
-#include "terminator.h"
-#include "tools.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Runtime/command.h"
-#include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/entry-names.h"
 #include "flang/Runtime/io-api.h"
 #include <chrono>
diff --git a/flang/runtime/external-unit.cpp b/flang-rt/lib/runtime/external-unit.cpp
similarity index 98%
rename from flang/runtime/external-unit.cpp
rename to flang-rt/lib/runtime/external-unit.cpp
index d17a92622f844..b8004d6315994 100644
--- a/flang/runtime/external-unit.cpp
+++ b/flang-rt/lib/runtime/external-unit.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/external-unit.cpp -----------------------------------------===//
+//===-- lib/runtime/external-unit.cpp ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,11 +10,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "io-error.h"
-#include "lock.h"
-#include "tools.h"
 #include "unit-map.h"
 #include "unit.h"
+#include "flang-rt/runtime/io-error.h"
+#include "flang-rt/runtime/lock.h"
+#include "flang-rt/runtime/tools.h"
 
 // NOTE: the header files above may define OpenMP declare target
 // variables, so they have to be included unconditionally
diff --git a/flang/runtime/extrema.cpp b/flang-rt/lib/runtime/extrema.cpp
similarity index 99%
rename from flang/runtime/extrema.cpp
rename to flang-rt/lib/runtime/extrema.cpp
index 7ecdf4b91702e..3d84daa380441 100644
--- a/flang/runtime/extrema.cpp
+++ b/flang-rt/lib/runtime/extrema.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/extrema.cpp -----------------------------------------------===//
+//===-- lib/runtime/extrema.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,7 +10,7 @@
 // and shapes and (for MAXLOC & MINLOC) result integer kinds.  Also implements
 // NORM2 using common infrastructure.
 
-#include "reduction-templates.h"
+#include "flang-rt/runtime/reduction-templates.h"
 #include "flang/Common/float128.h"
 #include "flang/Runtime/character.h"
 #include "flang/Runtime/reduction.h"
diff --git a/flang/runtime/file.cpp b/flang-rt/lib/runtime/file.cpp
similarity index 98%
rename from flang/runtime/file.cpp
rename to flang-rt/lib/runtime/file.cpp
index 9e077b8cea44b..16e73db488727 100644
--- a/flang/runtime/file.cpp
+++ b/flang-rt/lib/runtime/file.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/file.cpp --------------------------------------------------===//
+//===-- lib/runtime/file.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "file.h"
-#include "tools.h"
+#include "flang-rt/runtime/file.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Runtime/magic-numbers.h"
-#include "flang/Runtime/memory.h"
 #include <algorithm>
 #include <cerrno>
 #include <cstring>
diff --git a/flang/runtime/findloc.cpp b/flang-rt/lib/runtime/findloc.cpp
similarity index 99%
rename from flang/runtime/findloc.cpp
rename to flang-rt/lib/runtime/findloc.cpp
index b9b1d7f7ab689..95986aefb86a4 100644
--- a/flang/runtime/findloc.cpp
+++ b/flang-rt/lib/runtime/findloc.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/findloc.cpp -----------------------------------------------===//
+//===-- lib/runtime/findloc.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,7 +9,7 @@
 // Implements FINDLOC for all required operand types and shapes and result
 // integer kinds.
 
-#include "reduction-templates.h"
+#include "flang-rt/runtime/reduction-templates.h"
 #include "flang/Runtime/character.h"
 #include "flang/Runtime/reduction.h"
 #include <cinttypes>
diff --git a/flang/runtime/format.cpp b/flang-rt/lib/runtime/format.cpp
similarity index 87%
rename from flang/runtime/format.cpp
rename to flang-rt/lib/runtime/format.cpp
index 433acce4b7373..ee0059f5f0729 100644
--- a/flang/runtime/format.cpp
+++ b/flang-rt/lib/runtime/format.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/format.cpp ------------------------------------------------===//
+//===-- lib/runtime/format.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "format-implementation.h"
+#include "flang-rt/runtime/format-implementation.h"
 
 namespace Fortran::runtime::io {
 RT_OFFLOAD_API_GROUP_BEGIN
diff --git a/flang/runtime/inquiry.cpp b/flang-rt/lib/runtime/inquiry.cpp
similarity index 94%
rename from flang/runtime/inquiry.cpp
rename to flang-rt/lib/runtime/inquiry.cpp
index 9fbcaa96fa3c4..b6a7fce7a1e78 100644
--- a/flang/runtime/inquiry.cpp
+++ b/flang-rt/lib/runtime/inquiry.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/inquiry.cpp --------------------------------------===//
+//===-- lib/runtime/inquiry.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -11,9 +11,9 @@
 
 #include "flang/Runtime/inquiry.h"
 #include "copy.h"
-#include "terminator.h"
-#include "tools.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include <algorithm>
 
 namespace Fortran::runtime {
diff --git a/flang/runtime/internal-unit.cpp b/flang-rt/lib/runtime/internal-unit.cpp
similarity index 96%
rename from flang/runtime/internal-unit.cpp
rename to flang-rt/lib/runtime/internal-unit.cpp
index f8f3877efb20e..e344b01e8b34e 100644
--- a/flang/runtime/internal-unit.cpp
+++ b/flang-rt/lib/runtime/internal-unit.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/internal-unit.cpp -----------------------------------------===//
+//===-- lib/runtime/internal-unit.cpp ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,9 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "internal-unit.h"
-#include "io-error.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/internal-unit.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/io-error.h"
 #include "flang/Runtime/freestanding-tools.h"
 #include <algorithm>
 #include <type_traits>
diff --git a/flang/runtime/io-api-common.h b/flang-rt/lib/runtime/io-api-common.h
similarity index 92%
rename from flang/runtime/io-api-common.h
rename to flang-rt/lib/runtime/io-api-common.h
index c7b86cab73a52..b91ff9ff16863 100644
--- a/flang/runtime/io-api-common.h
+++ b/flang-rt/lib/runtime/io-api-common.h
@@ -1,4 +1,4 @@
-//===-- runtime/io-api-common.h ---------------------------------*- C++ -*-===//
+//===-- lib/runtime/io-api-common.h -----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FLANG_RUNTIME_IO_API_COMMON_H_
-#define FLANG_RUNTIME_IO_API_COMMON_H_
+#ifndef FLANG_RT_RUNTIME_IO_API_COMMON_H_
+#define FLANG_RT_RUNTIME_IO_API_COMMON_H_
 
-#include "io-stmt.h"
-#include "terminator.h"
 #include "unit.h"
+#include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Common/api-attrs.h"
 #include "flang/Common/optional.h"
 #include "flang/Runtime/io-api.h"
@@ -94,4 +94,4 @@ RT_API_ATTRS Cookie BeginExternalListIO(
 }
 
 } // namespace Fortran::runtime::io
-#endif // FLANG_RUNTIME_IO_API_COMMON_H_
+#endif // FLANG_RT_RUNTIME_IO_API_COMMON_H_
diff --git a/flang/runtime/io-api-minimal.cpp b/flang-rt/lib/runtime/io-api-minimal.cpp
similarity index 94%
rename from flang/runtime/io-api-minimal.cpp
rename to flang-rt/lib/runtime/io-api-minimal.cpp
index 68768427be0c2..c706a3aa239a5 100644
--- a/flang/runtime/io-api-minimal.cpp
+++ b/flang-rt/lib/runtime/io-api-minimal.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/io-api-minimal.cpp ----------------------------------------===//
+//===-- lib/runtime/io-api-minimal.cpp --------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,12 +10,12 @@
 // list-directed output (PRINT *) of intrinsic types.
 
 #include "edit-output.h"
-#include "format.h"
 #include "io-api-common.h"
-#include "io-stmt.h"
-#include "terminator.h"
-#include "tools.h"
 #include "unit.h"
+#include "flang-rt/runtime/format.h"
+#include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Runtime/io-api.h"
 
 namespace Fortran::runtime::io {
@@ -150,7 +150,7 @@ bool IODEF(OutputLogical)(Cookie cookie, bool truth) {
 // Provide own definition for `std::__libcpp_verbose_abort` to avoid dependency
 // on the version provided by libc++.
 
-void std::__libcpp_verbose_abort(char const *format, ...) {
+void std::__libcpp_verbose_abort(char const *format, ...) noexcept {
   va_list list;
   va_start(list, format);
   std::vfprintf(stderr, format, list);
diff --git a/flang/runtime/io-api.cpp b/flang-rt/lib/runtime/io-api.cpp
similarity index 99%
rename from flang/runtime/io-api.cpp
rename to flang-rt/lib/runtime/io-api.cpp
index dc3f6f87fc21b..0355734c67fcd 100644
--- a/flang/runtime/io-api.cpp
+++ b/flang-rt/lib/runtime/io-api.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/io-api.cpp ------------------------------------------------===//
+//===-- lib/runtime/io-api.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -17,16 +17,16 @@
 #include "descriptor-io.h"
 #include "edit-input.h"
 #include "edit-output.h"
-#include "environment.h"
-#include "format.h"
 #include "io-api-common.h"
-#include "io-stmt.h"
-#include "terminator.h"
-#include "tools.h"
 #include "unit.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/format.h"
+#include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Common/optional.h"
-#include "flang/Runtime/descriptor.h"
-#include "flang/Runtime/memory.h"
 #include <cstdlib>
 #include <memory>
 
diff --git a/flang/runtime/io-error.cpp b/flang-rt/lib/runtime/io-error.cpp
similarity index 96%
rename from flang/runtime/io-error.cpp
rename to flang-rt/lib/runtime/io-error.cpp
index 37909e8e6dad2..b350fb66fc25b 100644
--- a/flang/runtime/io-error.cpp
+++ b/flang-rt/lib/runtime/io-error.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/io-error.cpp ----------------------------------------------===//
+//===-- lib/runtime/io-error.cpp --------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,9 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "io-error.h"
+#include "flang-rt/runtime/io-error.h"
 #include "config.h"
-#include "tools.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Runtime/magic-numbers.h"
 #include <cerrno>
 #include <cstdarg>
diff --git a/flang/runtime/io-stmt.cpp b/flang-rt/lib/runtime/io-stmt.cpp
similarity index 99%
rename from flang/runtime/io-stmt.cpp
rename to flang-rt/lib/runtime/io-stmt.cpp
index f24eb929ce748..b0823ffd9e703 100644
--- a/flang/runtime/io-stmt.cpp
+++ b/flang-rt/lib/runtime/io-stmt.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/io-stmt.cpp -----------------------------------------------===//
+//===-- lib/runtime/io-stmt.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,14 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "io-stmt.h"
-#include "connection.h"
-#include "emit-encoded.h"
-#include "format.h"
-#include "tools.h"
+#include "flang-rt/runtime/io-stmt.h"
 #include "unit.h"
-#include "utf.h"
-#include "flang/Runtime/memory.h"
+#include "flang-rt/runtime/connection.h"
+#include "flang-rt/runtime/emit-encoded.h"
+#include "flang-rt/runtime/format.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/tools.h"
+#include "flang-rt/runtime/utf.h"
 #include <algorithm>
 #include <cstdio>
 #include <cstring>
diff --git a/flang/runtime/iostat.cpp b/flang-rt/lib/runtime/iostat.cpp
similarity index 98%
rename from flang/runtime/iostat.cpp
rename to flang-rt/lib/runtime/iostat.cpp
index 39e224cb01286..0f8bfb884e544 100644
--- a/flang/runtime/iostat.cpp
+++ b/flang-rt/lib/runtime/iostat.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/iostat.cpp ------------------------------------------------===//
+//===-- lib/runtime/iostat.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/main.cpp b/flang-rt/lib/runtime/main.cpp
similarity index 89%
rename from flang/runtime/main.cpp
rename to flang-rt/lib/runtime/main.cpp
index 96454989581b7..b3f066cda3732 100644
--- a/flang/runtime/main.cpp
+++ b/flang-rt/lib/runtime/main.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/main.cpp --------------------------------------------------===//
+//===-- lib/runtime/main.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/main.h"
-#include "environment.h"
-#include "terminator.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/terminator.h"
 #include <cfenv>
 #include <cstdio>
 #include <cstdlib>
diff --git a/flang/runtime/matmul-transpose.cpp b/flang-rt/lib/runtime/matmul-transpose.cpp
similarity index 98%
rename from flang/runtime/matmul-transpose.cpp
rename to flang-rt/lib/runtime/matmul-transpose.cpp
index bafa05056bebc..e20abbdddcd30 100644
--- a/flang/runtime/matmul-transpose.cpp
+++ b/flang-rt/lib/runtime/matmul-transpose.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/matmul-transpose.cpp --------------------------------------===//
+//===-- lib/runtime/matmul-transpose.cpp ------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -21,12 +21,12 @@
 // to use the faster BLAS routines.
 
 #include "flang/Runtime/matmul-transpose.h"
-#include "terminator.h"
-#include "tools.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Common/optional.h"
 #include "flang/Runtime/c-or-cpp.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
 #include <cstring>
 
 namespace {
diff --git a/flang/runtime/matmul.cpp b/flang-rt/lib/runtime/matmul.cpp
similarity index 98%
rename from flang/runtime/matmul.cpp
rename to flang-rt/lib/runtime/matmul.cpp
index f72601073a600..f14cea922d21e 100644
--- a/flang/runtime/matmul.cpp
+++ b/flang-rt/lib/runtime/matmul.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/matmul.cpp ------------------------------------------------===//
+//===-- lib/runtime/matmul.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -20,12 +20,12 @@
 // Places where BLAS routines could be called are marked as TODO items.
 
 #include "flang/Runtime/matmul.h"
-#include "terminator.h"
-#include "tools.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Common/optional.h"
 #include "flang/Runtime/c-or-cpp.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
 #include <cstring>
 
 namespace {
diff --git a/flang/runtime/memory.cpp b/flang-rt/lib/runtime/memory.cpp
similarity index 85%
rename from flang/runtime/memory.cpp
rename to flang-rt/lib/runtime/memory.cpp
index c7068ad6479a1..79c7e33777569 100644
--- a/flang/runtime/memory.cpp
+++ b/flang-rt/lib/runtime/memory.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/memory.cpp ------------------------------------------------===//
+//===-- lib/runtime/memory.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,9 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "flang/Runtime/memory.h"
-#include "terminator.h"
-#include "tools.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Runtime/freestanding-tools.h"
 #include <cstdlib>
 
diff --git a/flang/runtime/misc-intrinsic.cpp b/flang-rt/lib/runtime/misc-intrinsic.cpp
similarity index 95%
rename from flang/runtime/misc-intrinsic.cpp
rename to flang-rt/lib/runtime/misc-intrinsic.cpp
index f7d893829fc0d..b7335e9f6799e 100644
--- a/flang/runtime/misc-intrinsic.cpp
+++ b/flang-rt/lib/runtime/misc-intrinsic.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/misc-intrinsic.cpp ----------------------------------------===//
+//===-- lib/runtime/misc-intrinsic.cpp --------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,10 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/misc-intrinsic.h"
-#include "terminator.h"
-#include "tools.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Common/optional.h"
-#include "flang/Runtime/descriptor.h"
 #include <algorithm>
 #include <cstdio>
 #include <cstring>
diff --git a/flang/runtime/namelist.cpp b/flang-rt/lib/runtime/namelist.cpp
similarity index 99%
rename from flang/runtime/namelist.cpp
rename to flang-rt/lib/runtime/namelist.cpp
index af092de70f781..b0cf2180fc6d4 100644
--- a/flang/runtime/namelist.cpp
+++ b/flang-rt/lib/runtime/namelist.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/namelist.cpp ----------------------------------------------===//
+//===-- lib/runtime/namelist.cpp --------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "namelist.h"
+#include "flang-rt/runtime/namelist.h"
 #include "descriptor-io.h"
-#include "emit-encoded.h"
-#include "io-stmt.h"
+#include "flang-rt/runtime/emit-encoded.h"
+#include "flang-rt/runtime/io-stmt.h"
 #include "flang/Runtime/io-api.h"
 #include <algorithm>
 #include <cstring>
diff --git a/flang/runtime/non-tbp-dio.cpp b/flang-rt/lib/runtime/non-tbp-dio.cpp
similarity index 86%
rename from flang/runtime/non-tbp-dio.cpp
rename to flang-rt/lib/runtime/non-tbp-dio.cpp
index 9419adb7631cc..72101b06e0c6e 100644
--- a/flang/runtime/non-tbp-dio.cpp
+++ b/flang-rt/lib/runtime/non-tbp-dio.cpp
@@ -1,4 +1,4 @@
-//===-- flang/runtime/non-tbp-dio.cpp ---------------------------*- C++ -*-===//
+//===-- lib/runtime/non-tbp-dio.cpp -----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "non-tbp-dio.h"
-#include "type-info.h"
+#include "flang-rt/runtime/non-tbp-dio.h"
+#include "flang-rt/runtime/type-info.h"
 
 namespace Fortran::runtime::io {
 
diff --git a/flang/runtime/numeric.cpp b/flang-rt/lib/runtime/numeric.cpp
similarity index 99%
rename from flang/runtime/numeric.cpp
rename to flang-rt/lib/runtime/numeric.cpp
index 45fb56348fd44..37638765dc650 100644
--- a/flang/runtime/numeric.cpp
+++ b/flang-rt/lib/runtime/numeric.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/numeric.cpp -----------------------------------------------===//
+//===-- lib/runtime/numeric.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/numeric.h"
-#include "numeric-templates.h"
-#include "terminator.h"
-#include "tools.h"
+#include "flang-rt/runtime/numeric-templates.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Common/float128.h"
 #include <cfloat>
 #include <climits>
diff --git a/flang/runtime/pointer.cpp b/flang-rt/lib/runtime/pointer.cpp
similarity index 96%
rename from flang/runtime/pointer.cpp
rename to flang-rt/lib/runtime/pointer.cpp
index 3b0babe3d852f..ecca86bfe73cd 100644
--- a/flang/runtime/pointer.cpp
+++ b/flang-rt/lib/runtime/pointer.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/pointer.cpp -----------------------------------------------===//
+//===-- lib/runtime/pointer.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,13 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/pointer.h"
-#include "assign-impl.h"
-#include "derived.h"
-#include "environment.h"
-#include "stat.h"
-#include "terminator.h"
-#include "tools.h"
-#include "type-info.h"
+#include "flang-rt/runtime/assign-impl.h"
+#include "flang-rt/runtime/derived.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
+#include "flang-rt/runtime/type-info.h"
 
 namespace Fortran::runtime {
 extern "C" {
diff --git a/flang/runtime/product.cpp b/flang-rt/lib/runtime/product.cpp
similarity index 98%
rename from flang/runtime/product.cpp
rename to flang-rt/lib/runtime/product.cpp
index 293ffd301ba2e..02fdc2bfcd576 100644
--- a/flang/runtime/product.cpp
+++ b/flang-rt/lib/runtime/product.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/product.cpp -----------------------------------------------===//
+//===-- lib/runtime/product.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,7 +8,7 @@
 
 // Implements PRODUCT for all required operand types and shapes.
 
-#include "reduction-templates.h"
+#include "flang-rt/runtime/reduction-templates.h"
 #include "flang/Common/float128.h"
 #include "flang/Runtime/reduction.h"
 #include <cfloat>
diff --git a/flang/runtime/pseudo-unit.cpp b/flang-rt/lib/runtime/pseudo-unit.cpp
similarity index 97%
rename from flang/runtime/pseudo-unit.cpp
rename to flang-rt/lib/runtime/pseudo-unit.cpp
index 526afd11d916e..7e1f3bc86b294 100644
--- a/flang/runtime/pseudo-unit.cpp
+++ b/flang-rt/lib/runtime/pseudo-unit.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/pseudo-unit.cpp -------------------------------------------===//
+//===-- lib/runtime/pseudo-unit.cpp -----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -11,9 +11,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "io-error.h"
-#include "tools.h"
 #include "unit.h"
+#include "flang-rt/runtime/io-error.h"
+#include "flang-rt/runtime/tools.h"
 
 // NOTE: the header files above may define OpenMP declare target
 // variables, so they have to be included unconditionally
diff --git a/flang/runtime/ragged.cpp b/flang-rt/lib/runtime/ragged.cpp
similarity index 96%
rename from flang/runtime/ragged.cpp
rename to flang-rt/lib/runtime/ragged.cpp
index a4d9e541ba531..dddc3ccdfd858 100644
--- a/flang/runtime/ragged.cpp
+++ b/flang-rt/lib/runtime/ragged.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/ragged.cpp ------------------------------------------------===//
+//===-- lib/runtime/ragged.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/ragged.h"
-#include "tools.h"
+#include "flang-rt/runtime/tools.h"
 #include <cstdlib>
 
 namespace Fortran::runtime {
diff --git a/flang/runtime/random.cpp b/flang-rt/lib/runtime/random.cpp
similarity index 96%
rename from flang/runtime/random.cpp
rename to flang-rt/lib/runtime/random.cpp
index 8f158007a5a65..dc74f2725ed51 100644
--- a/flang/runtime/random.cpp
+++ b/flang-rt/lib/runtime/random.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/random.cpp ------------------------------------------------===//
+//===-- lib/runtime/random.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,14 +10,14 @@
 // RANDOM_SEED.
 
 #include "flang/Runtime/random.h"
-#include "lock.h"
-#include "random-templates.h"
-#include "terminator.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/lock.h"
+#include "flang-rt/runtime/random-templates.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Common/float128.h"
 #include "flang/Common/leading-zero-bit-count.h"
 #include "flang/Common/uint128.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
 #include <cmath>
 #include <cstdint>
 #include <limits>
diff --git a/flang/runtime/reduce.cpp b/flang-rt/lib/runtime/reduce.cpp
similarity index 99%
rename from flang/runtime/reduce.cpp
rename to flang-rt/lib/runtime/reduce.cpp
index 6c42c5ef50e4f..3c5e815e32d2b 100644
--- a/flang/runtime/reduce.cpp
+++ b/flang-rt/lib/runtime/reduce.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/reduce.cpp ------------------------------------------------===//
+//===-- lib/runtime/reduce.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,10 +9,10 @@
 // REDUCE() implementation
 
 #include "flang/Runtime/reduce.h"
-#include "reduction-templates.h"
-#include "terminator.h"
-#include "tools.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/reduction-templates.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 
 namespace Fortran::runtime {
 
diff --git a/flang/runtime/reduction.cpp b/flang-rt/lib/runtime/reduction.cpp
similarity index 98%
rename from flang/runtime/reduction.cpp
rename to flang-rt/lib/runtime/reduction.cpp
index a8fcde7b3a166..5e45870489479 100644
--- a/flang/runtime/reduction.cpp
+++ b/flang-rt/lib/runtime/reduction.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/reduction.cpp ---------------------------------------------===//
+//===-- lib/runtime/reduction.cpp -------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,8 +14,8 @@
 // NORM2, MAXLOC, MINLOC, MAXVAL, and MINVAL are in extrema.cpp.
 
 #include "flang/Runtime/reduction.h"
-#include "reduction-templates.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/reduction-templates.h"
 #include <cinttypes>
 
 namespace Fortran::runtime {
diff --git a/flang/runtime/stack.h b/flang-rt/lib/runtime/stack.h
similarity index 93%
rename from flang/runtime/stack.h
rename to flang-rt/lib/runtime/stack.h
index b6e6edb595e9a..38364ff541f16 100644
--- a/flang/runtime/stack.h
+++ b/flang-rt/lib/runtime/stack.h
@@ -1,4 +1,4 @@
-//===-- runtime/stack.h -----------------------------------------*- C++ -*-===//
+//===-- lib/runtime/stack.h -------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,11 +10,11 @@
 // It is a list based stack with dynamic allocation/deallocation
 // of the list nodes.
 
-#ifndef FORTRAN_RUNTIME_STACK_H
-#define FORTRAN_RUNTIME_STACK_H
+#ifndef FLANG_RT_RUNTIME_STACK_H_
+#define FLANG_RT_RUNTIME_STACK_H_
 
-#include "terminator.h"
-#include "flang/Runtime/memory.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/terminator.h"
 
 namespace Fortran::runtime {
 // Storage for the Stack elements of type T.
@@ -133,4 +133,4 @@ template <typename T, unsigned N = 0> class Stack : public StackStorage<T, N> {
   Terminator &terminator_;
 };
 } // namespace Fortran::runtime
-#endif // FORTRAN_RUNTIME_STACK_H
+#endif // FLANG_RT_RUNTIME_STACK_H_
diff --git a/flang/runtime/stat.cpp b/flang-rt/lib/runtime/stat.cpp
similarity index 92%
rename from flang/runtime/stat.cpp
rename to flang-rt/lib/runtime/stat.cpp
index 525a4e36cdc77..322b7282b7024 100644
--- a/flang/runtime/stat.cpp
+++ b/flang-rt/lib/runtime/stat.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/stat.cpp --------------------------------------------------===//
+//===-- lib/runtime/stat.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "stat.h"
-#include "terminator.h"
-#include "tools.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 
 namespace Fortran::runtime {
 RT_OFFLOAD_API_GROUP_BEGIN
diff --git a/flang/runtime/stop.cpp b/flang-rt/lib/runtime/stop.cpp
similarity index 95%
rename from flang/runtime/stop.cpp
rename to flang-rt/lib/runtime/stop.cpp
index f8c180e0aaffa..1d70a137377aa 100644
--- a/flang/runtime/stop.cpp
+++ b/flang-rt/lib/runtime/stop.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/stop.cpp --------------------------------------------------===//
+//===-- lib/runtime/stop.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,11 +8,11 @@
 
 #include "flang/Runtime/stop.h"
 #include "config.h"
-#include "environment.h"
-#include "file.h"
-#include "io-error.h"
-#include "terminator.h"
 #include "unit.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/file.h"
+#include "flang-rt/runtime/io-error.h"
+#include "flang-rt/runtime/terminator.h"
 #include <cfenv>
 #include <cstdio>
 #include <cstdlib>
diff --git a/flang/runtime/sum.cpp b/flang-rt/lib/runtime/sum.cpp
similarity index 98%
rename from flang/runtime/sum.cpp
rename to flang-rt/lib/runtime/sum.cpp
index 3cb7a2b285779..a76e228f18a4e 100644
--- a/flang/runtime/sum.cpp
+++ b/flang-rt/lib/runtime/sum.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/sum.cpp ---------------------------------------------------===//
+//===-- lib/runtime/sum.cpp -------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -12,7 +12,7 @@
 // cancellation on intermediate results by using "Kahan summation"
 // (basically the same as manual "double-double").
 
-#include "reduction-templates.h"
+#include "flang-rt/runtime/reduction-templates.h"
 #include "flang/Common/float128.h"
 #include "flang/Runtime/reduction.h"
 #include <cfloat>
diff --git a/flang/runtime/support.cpp b/flang-rt/lib/runtime/support.cpp
similarity index 90%
rename from flang/runtime/support.cpp
rename to flang-rt/lib/runtime/support.cpp
index a607120256d9d..5a2b0c920aa80 100644
--- a/flang/runtime/support.cpp
+++ b/flang-rt/lib/runtime/support.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/support.cpp -----------------------------------------------===//
+//===-- lib/runtime/support.cpp ---------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,8 +8,8 @@
 
 #include "flang/Runtime/support.h"
 #include "ISO_Fortran_util.h"
-#include "type-info.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/type-info.h"
 
 namespace Fortran::runtime {
 extern "C" {
diff --git a/flang/runtime/temporary-stack.cpp b/flang-rt/lib/runtime/temporary-stack.cpp
similarity index 97%
rename from flang/runtime/temporary-stack.cpp
rename to flang-rt/lib/runtime/temporary-stack.cpp
index 93340266b1b44..ea89d0c17bb6b 100644
--- a/flang/runtime/temporary-stack.cpp
+++ b/flang-rt/lib/runtime/temporary-stack.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/temporary-stack.cpp ---------------------------------------===//
+//===-- lib/runtime/temporary-stack.cpp -------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,11 +10,11 @@
 // temporaries. For use in HLFIR lowering.
 
 #include "flang/Runtime/temporary-stack.h"
-#include "terminator.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Common/ISO_Fortran_binding_wrapper.h"
 #include "flang/Runtime/assign.h"
-#include "flang/Runtime/descriptor.h"
-#include "flang/Runtime/memory.h"
 
 namespace {
 
diff --git a/flang/runtime/terminator.cpp b/flang-rt/lib/runtime/terminator.cpp
similarity index 95%
rename from flang/runtime/terminator.cpp
rename to flang-rt/lib/runtime/terminator.cpp
index bab9edc64fa35..8a57ba06b1304 100644
--- a/flang/runtime/terminator.cpp
+++ b/flang-rt/lib/runtime/terminator.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/terminate.cpp ---------------------------------------------===//
+//===-- lib/runtime/terminator.cpp ------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "terminator.h"
+#include "flang-rt/runtime/terminator.h"
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/flang/runtime/time-intrinsic.cpp b/flang-rt/lib/runtime/time-intrinsic.cpp
similarity index 98%
rename from flang/runtime/time-intrinsic.cpp
rename to flang-rt/lib/runtime/time-intrinsic.cpp
index 942604a92aaad..69c344f5d24bc 100644
--- a/flang/runtime/time-intrinsic.cpp
+++ b/flang-rt/lib/runtime/time-intrinsic.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/time-intrinsic.cpp ----------------------------------------===//
+//===-- lib/runtime/time-intrinsic.cpp --------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,10 +9,10 @@
 // Implements time-related intrinsic subroutines.
 
 #include "flang/Runtime/time-intrinsic.h"
-#include "terminator.h"
-#include "tools.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
 #include <algorithm>
 #include <cstdint>
 #include <cstdio>
diff --git a/flang/runtime/tools.cpp b/flang-rt/lib/runtime/tools.cpp
similarity index 98%
rename from flang/runtime/tools.cpp
rename to flang-rt/lib/runtime/tools.cpp
index 73d6c2cf7e1d2..b9d9ca4fc378c 100644
--- a/flang/runtime/tools.cpp
+++ b/flang-rt/lib/runtime/tools.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/tools.cpp -------------------------------------------------===//
+//===-- lib/runtime/tools.cpp -----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "tools.h"
-#include "terminator.h"
+#include "flang-rt/runtime/tools.h"
+#include "flang-rt/runtime/terminator.h"
 #include <algorithm>
 #include <cstdint>
 #include <cstdlib>
diff --git a/flang/runtime/transformational.cpp b/flang-rt/lib/runtime/transformational.cpp
similarity index 99%
rename from flang/runtime/transformational.cpp
rename to flang-rt/lib/runtime/transformational.cpp
index ab303bdef9b1d..eb694a9f2c833 100644
--- a/flang/runtime/transformational.cpp
+++ b/flang-rt/lib/runtime/transformational.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/transformational.cpp --------------------------------------===//
+//===-- lib/runtime/transformational.cpp ------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -18,10 +18,10 @@
 
 #include "flang/Runtime/transformational.h"
 #include "copy.h"
-#include "terminator.h"
-#include "tools.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Common/float128.h"
-#include "flang/Runtime/descriptor.h"
 
 namespace Fortran::runtime {
 
diff --git a/flang/runtime/type-code.cpp b/flang-rt/lib/runtime/type-code.cpp
similarity index 98%
rename from flang/runtime/type-code.cpp
rename to flang-rt/lib/runtime/type-code.cpp
index d6948983bfe9f..8cfec9a4ec2fb 100644
--- a/flang/runtime/type-code.cpp
+++ b/flang-rt/lib/runtime/type-code.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/type-code.cpp ---------------------------------------------===//
+//===-- lib/runtime/type-code.cpp -------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "flang/Runtime/type-code.h"
+#include "flang-rt/runtime/type-code.h"
 
 namespace Fortran::runtime {
 
diff --git a/flang/runtime/type-info.cpp b/flang-rt/lib/runtime/type-info.cpp
similarity index 97%
rename from flang/runtime/type-info.cpp
rename to flang-rt/lib/runtime/type-info.cpp
index d4daa72aee6a1..82182696d70c6 100644
--- a/flang/runtime/type-info.cpp
+++ b/flang-rt/lib/runtime/type-info.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/type-info.cpp ---------------------------------------------===//
+//===-- lib/runtime/type-info.cpp -------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,9 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "type-info.h"
-#include "terminator.h"
-#include "tools.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/tools.h"
 #include <cstdio>
 
 namespace Fortran::runtime::typeInfo {
@@ -86,7 +86,7 @@ RT_API_ATTRS std::size_t Component::SizeInBytes(
   } else if (category() == TypeCategory::Derived) {
     const DerivedType *type{derivedType()};
     return Descriptor::SizeInBytes(
-        rank_, true, type ? type->LenParameters() : 0);
+         rank_, true, type ? type->LenParameters() : 0);
   } else {
     return Descriptor::SizeInBytes(rank_);
   }
diff --git a/flang/runtime/unit-map.cpp b/flang-rt/lib/runtime/unit-map.cpp
similarity index 97%
rename from flang/runtime/unit-map.cpp
rename to flang-rt/lib/runtime/unit-map.cpp
index 684a9b9e20b97..41a03f3319d64 100644
--- a/flang/runtime/unit-map.cpp
+++ b/flang-rt/lib/runtime/unit-map.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/unit-map.cpp ----------------------------------------------===//
+//===-- lib/runtime/unit-map.cpp --------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/runtime/unit-map.h b/flang-rt/lib/runtime/unit-map.h
similarity index 92%
rename from flang/runtime/unit-map.h
rename to flang-rt/lib/runtime/unit-map.h
index 6f1e01bb1e64a..fa61288a1e18d 100644
--- a/flang/runtime/unit-map.h
+++ b/flang-rt/lib/runtime/unit-map.h
@@ -1,4 +1,4 @@
-//===-- runtime/unit-map.h --------------------------------------*- C++ -*-===//
+//===-- lib/runtime/unit-map.h ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,13 +9,13 @@
 // Maps Fortran unit numbers to their ExternalFileUnit instances.
 // A simple hash table with forward-linked chains per bucket.
 
-#ifndef FORTRAN_RUNTIME_UNIT_MAP_H_
-#define FORTRAN_RUNTIME_UNIT_MAP_H_
+#ifndef FLANG_RT_RUNTIME_UNIT_MAP_H_
+#define FLANG_RT_RUNTIME_UNIT_MAP_H_
 
-#include "lock.h"
 #include "unit.h"
+#include "flang-rt/runtime/lock.h"
+#include "flang-rt/runtime/memory.h"
 #include "flang/Common/fast-int-set.h"
-#include "flang/Runtime/memory.h"
 #include <cstdint>
 #include <cstdlib>
 
@@ -100,4 +100,4 @@ class UnitMap {
   int emergencyNewUnit_{maxNewUnits_}; // not recycled
 };
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_UNIT_MAP_H_
+#endif // FLANG_RT_RUNTIME_UNIT_MAP_H_
diff --git a/flang/runtime/unit.cpp b/flang-rt/lib/runtime/unit.cpp
similarity index 99%
rename from flang/runtime/unit.cpp
rename to flang-rt/lib/runtime/unit.cpp
index 4aee8397d477e..1d4d54ae01956 100644
--- a/flang/runtime/unit.cpp
+++ b/flang-rt/lib/runtime/unit.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/unit.cpp --------------------------------------------------===//
+//===-- lib/runtime/unit.cpp ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -11,9 +11,9 @@
 //
 //===----------------------------------------------------------------------===//
 #include "unit.h"
-#include "io-error.h"
-#include "lock.h"
-#include "tools.h"
+#include "flang-rt/runtime/io-error.h"
+#include "flang-rt/runtime/lock.h"
+#include "flang-rt/runtime/tools.h"
 #include <limits>
 #include <utility>
 
diff --git a/flang/runtime/unit.h b/flang-rt/lib/runtime/unit.h
similarity index 95%
rename from flang/runtime/unit.h
rename to flang-rt/lib/runtime/unit.h
index a3ea268681680..eb762a2d3b235 100644
--- a/flang/runtime/unit.h
+++ b/flang-rt/lib/runtime/unit.h
@@ -1,4 +1,4 @@
-//===-- runtime/unit.h ------------------------------------------*- C++ -*-===//
+//===-- lib/runtime/unit.h --------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,21 +8,21 @@
 
 // Fortran external I/O units
 
-#ifndef FORTRAN_RUNTIME_IO_UNIT_H_
-#define FORTRAN_RUNTIME_IO_UNIT_H_
-
-#include "buffer.h"
-#include "connection.h"
-#include "environment.h"
-#include "file.h"
-#include "format.h"
-#include "io-error.h"
-#include "io-stmt.h"
-#include "lock.h"
-#include "terminator.h"
+#ifndef FLANG_RT_RUNTIME_UNIT_H_
+#define FLANG_RT_RUNTIME_UNIT_H_
+
+#include "flang-rt/runtime/buffer.h"
+#include "flang-rt/runtime/connection.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/file.h"
+#include "flang-rt/runtime/format.h"
+#include "flang-rt/runtime/io-error.h"
+#include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/lock.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Common/constexpr-bitset.h"
 #include "flang/Common/optional.h"
-#include "flang/Runtime/memory.h"
 #include <cstdlib>
 #include <cstring>
 #include <flang/Common/variant.h>
@@ -291,4 +291,4 @@ class ChildIo {
 };
 
 } // namespace Fortran::runtime::io
-#endif // FORTRAN_RUNTIME_IO_UNIT_H_
+#endif // FLANG_RT_RUNTIME_UNIT_H_
diff --git a/flang/runtime/utf.cpp b/flang-rt/lib/runtime/utf.cpp
similarity index 97%
rename from flang/runtime/utf.cpp
rename to flang-rt/lib/runtime/utf.cpp
index b09819cb2f736..ef9df49f24f66 100644
--- a/flang/runtime/utf.cpp
+++ b/flang-rt/lib/runtime/utf.cpp
@@ -1,4 +1,4 @@
-//===-- runtime/utf.cpp ---------------------------------------------------===//
+//===-- lib/runtime/utf.cpp -------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "utf.h"
+#include "flang-rt/runtime/utf.h"
 
 namespace Fortran::runtime {
 
diff --git a/flang/test/Driver/ctofortran.f90 b/flang-rt/test/Driver/ctofortran.f90
similarity index 100%
rename from flang/test/Driver/ctofortran.f90
rename to flang-rt/test/Driver/ctofortran.f90
diff --git a/flang/test/Driver/exec.f90 b/flang-rt/test/Driver/exec.f90
similarity index 100%
rename from flang/test/Driver/exec.f90
rename to flang-rt/test/Driver/exec.f90
diff --git a/flang/test/Runtime/no-cpp-dep.c b/flang-rt/test/Runtime/no-cpp-dep.c
similarity index 100%
rename from flang/test/Runtime/no-cpp-dep.c
rename to flang-rt/test/Runtime/no-cpp-dep.c
diff --git a/flang/unittests/Evaluate/ISO-Fortran-binding.cpp b/flang-rt/unittests/Evaluate/ISO-Fortran-binding.cpp
similarity index 98%
rename from flang/unittests/Evaluate/ISO-Fortran-binding.cpp
rename to flang-rt/unittests/Evaluate/ISO-Fortran-binding.cpp
index 2884686b3f26e..8c0a6f29b6967 100644
--- a/flang/unittests/Evaluate/ISO-Fortran-binding.cpp
+++ b/flang-rt/unittests/Evaluate/ISO-Fortran-binding.cpp
@@ -1,5 +1,13 @@
+//===-- unittests/Evaluate/ISO-Fortran-binding.cpp --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang-rt/runtime/descriptor.h"
 #include "flang/Common/ISO_Fortran_binding_wrapper.h"
-#include "flang/Runtime/descriptor.h"
 #include "flang/Testing/testing.h"
 #include "llvm/Support/raw_ostream.h"
 #include <type_traits>
diff --git a/flang/unittests/Evaluate/reshape.cpp b/flang-rt/unittests/Evaluate/reshape.cpp
similarity index 88%
rename from flang/unittests/Evaluate/reshape.cpp
rename to flang-rt/unittests/Evaluate/reshape.cpp
index 16cba15dcef60..2abe46c0969fb 100644
--- a/flang/unittests/Evaluate/reshape.cpp
+++ b/flang-rt/unittests/Evaluate/reshape.cpp
@@ -1,4 +1,12 @@
-#include "flang/Runtime/descriptor.h"
+//===-- unittests/Evaluate/reshape.cpp --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang-rt/runtime/descriptor.h"
 #include "flang/Runtime/transformational.h"
 #include "flang/Testing/testing.h"
 #include <cinttypes>
diff --git a/flang/unittests/Runtime/AccessTest.cpp b/flang-rt/unittests/Runtime/AccessTest.cpp
similarity index 99%
rename from flang/unittests/Runtime/AccessTest.cpp
rename to flang-rt/unittests/Runtime/AccessTest.cpp
index c2a2d7d398220..d431d0d19bd61 100644
--- a/flang/unittests/Runtime/AccessTest.cpp
+++ b/flang-rt/unittests/Runtime/AccessTest.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/AccessTest.cpp ----------------------------===//
+//===-- unittests/Runtime/AccessTest.cpp ------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/unittests/Runtime/Allocatable.cpp b/flang-rt/unittests/Runtime/Allocatable.cpp
similarity index 97%
rename from flang/unittests/Runtime/Allocatable.cpp
rename to flang-rt/unittests/Runtime/Allocatable.cpp
index f15f26bfd9c57..4702f48e0f440 100644
--- a/flang/unittests/Runtime/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/Allocatable.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Allocatable.cpp--------- ---------*- C++-*-===//
+//===-- unittests/Runtime/Allocatable.cpp -----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,7 +8,7 @@
 
 #include "flang/Runtime/allocatable.h"
 #include "gtest/gtest.h"
-#include "tools.h"
+#include "flang-rt/runtime/tools.h"
 
 using namespace Fortran::runtime;
 
diff --git a/flang/unittests/Runtime/ArrayConstructor.cpp b/flang-rt/unittests/Runtime/ArrayConstructor.cpp
similarity index 96%
rename from flang/unittests/Runtime/ArrayConstructor.cpp
rename to flang-rt/unittests/Runtime/ArrayConstructor.cpp
index 53774a0eea07d..5f791e7f4a7c3 100644
--- a/flang/unittests/Runtime/ArrayConstructor.cpp
+++ b/flang-rt/unittests/Runtime/ArrayConstructor.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/ArrayConstructor.cpp-------------*- C++ -*-===//
+//===-- unittests/Runtime/ArrayConstructor.cpp ------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "gtest/gtest.h"
 #include "tools.h"
+#include "gtest/gtest.h"
+#include "flang-rt/runtime/array-constructor.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/type-code.h"
 #include "flang/Runtime/allocatable.h"
-#include "flang/Runtime/array-constructor.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
-#include "flang/Runtime/type-code.h"
 
 #include <memory>
 
diff --git a/flang/unittests/Runtime/BufferTest.cpp b/flang-rt/unittests/Runtime/BufferTest.cpp
similarity index 97%
rename from flang/unittests/Runtime/BufferTest.cpp
rename to flang-rt/unittests/Runtime/BufferTest.cpp
index 0632324b25d22..f4b9e901d6fcb 100644
--- a/flang/unittests/Runtime/BufferTest.cpp
+++ b/flang-rt/unittests/Runtime/BufferTest.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/BufferTest.cpp ------------------*- C++ -*-===//
+//===-- unittests/Runtime/BufferTest.cpp ------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "../../runtime/buffer.h"
+#include "flang-rt/runtime/buffer.h"
 #include "CrashHandlerFixture.h"
 #include "gtest/gtest.h"
 #include <algorithm>
diff --git a/flang/unittests/Runtime/CUDA/Allocatable.cpp b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
similarity index 90%
rename from flang/unittests/Runtime/CUDA/Allocatable.cpp
rename to flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
index bdfa8f5cc3213..3f759a69c0388 100644
--- a/flang/unittests/Runtime/CUDA/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Allocatable.cpp ------------------*- C++-*-===//
+//===-- unittests/Runtime/CUDA/Allocatable.cpp ------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,16 +7,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/allocatable.h"
+#include "cuda_runtime.h"
 #include "gtest/gtest.h"
-#include "../../../runtime/terminator.h"
+#include "flang-rt/runtime/allocator-registry.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/CUDA/allocator.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
-#include "flang/Runtime/allocator-registry.h"
 #include "flang/Support/Fortran.h"
 
-#include "cuda_runtime.h"
-
 using namespace Fortran::runtime;
 using namespace Fortran::runtime::cuda;
 
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp
similarity index 92%
rename from flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
rename to flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp
index 5ec122e4c5777..9bda3270fe8a1 100644
--- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/AllocatableCUF.cpp ---------------*- C++-*-===//
+//===-- unittests/Runtime/CUDA/AllocatorCUF.cpp -----------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,16 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "cuda_runtime.h"
 #include "gtest/gtest.h"
-#include "../../../runtime/terminator.h"
+#include "flang-rt/runtime/allocator-registry.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/CUDA/allocator.h"
 #include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/allocatable.h"
-#include "flang/Runtime/allocator-registry.h"
 #include "flang/Support/Fortran.h"
 
-#include "cuda_runtime.h"
-
 using namespace Fortran::runtime;
 using namespace Fortran::runtime::cuda;
 
diff --git a/flang/unittests/Runtime/CUDA/Memory.cpp b/flang-rt/unittests/Runtime/CUDA/Memory.cpp
similarity index 93%
rename from flang/unittests/Runtime/CUDA/Memory.cpp
rename to flang-rt/unittests/Runtime/CUDA/Memory.cpp
index 61c9d1131ee56..37ae59ec238c8 100644
--- a/flang/unittests/Runtime/CUDA/Memory.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Memory.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Memory.cpp -----------------------*- C++-*-===//
+//===-- unittests/Runtime/CUDA/Memory.cpp -----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,18 +7,17 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/memory.h"
-#include "gtest/gtest.h"
-#include "../../../runtime/terminator.h"
+#include "cuda_runtime.h"
 #include "../tools.h"
+#include "gtest/gtest.h"
+#include "flang-rt/runtime/allocator-registry.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/CUDA/allocator.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/allocatable.h"
-#include "flang/Runtime/allocator-registry.h"
 #include "flang/Support/Fortran.h"
 
-#include "cuda_runtime.h"
-
 using namespace Fortran::runtime;
 using namespace Fortran::runtime::cuda;
 
diff --git a/flang/unittests/Runtime/CharacterTest.cpp b/flang-rt/unittests/Runtime/CharacterTest.cpp
similarity index 99%
rename from flang/unittests/Runtime/CharacterTest.cpp
rename to flang-rt/unittests/Runtime/CharacterTest.cpp
index d462c9120fd8c..83ec9b36d9b0c 100644
--- a/flang/unittests/Runtime/CharacterTest.cpp
+++ b/flang-rt/unittests/Runtime/CharacterTest.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/CharacterTest.cpp ---------------*- C++ -*-===//
+//===-- unittests/Runtime/CharacterTest.cpp ---------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -11,7 +11,7 @@
 
 #include "flang/Runtime/character.h"
 #include "gtest/gtest.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
 #include <cstring>
 #include <functional>
 #include <tuple>
diff --git a/flang/unittests/Runtime/CommandTest.cpp b/flang-rt/unittests/Runtime/CommandTest.cpp
similarity index 99%
rename from flang/unittests/Runtime/CommandTest.cpp
rename to flang-rt/unittests/Runtime/CommandTest.cpp
index ecb325330f1ad..72fe7629dbbb8 100644
--- a/flang/unittests/Runtime/CommandTest.cpp
+++ b/flang-rt/unittests/Runtime/CommandTest.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/CommandTest.cpp ---------------------------===//
+//===-- unittests/Runtime/CommandTest.cpp -----------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,7 +9,7 @@
 #include "flang/Runtime/command.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
 #include "flang/Runtime/execute.h"
 #include "flang/Runtime/extensions.h"
 #include "flang/Runtime/main.h"
diff --git a/flang/unittests/Runtime/Complex.cpp b/flang-rt/unittests/Runtime/Complex.cpp
similarity index 98%
rename from flang/unittests/Runtime/Complex.cpp
rename to flang-rt/unittests/Runtime/Complex.cpp
index d714da24dc4e5..d7e5f55414fe2 100644
--- a/flang/unittests/Runtime/Complex.cpp
+++ b/flang-rt/unittests/Runtime/Complex.cpp
@@ -1,10 +1,11 @@
-//===-- flang/unittests/Runtime/Complex.cpp ---------------------*- C++ -*-===//
+//===-- unittests/Runtime/Complex.cpp ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 #include "gmock/gmock.h"
 #include "gtest/gtest-matchers.h"
 #include <limits>
diff --git a/flang/unittests/Runtime/CrashHandlerFixture.cpp b/flang-rt/unittests/Runtime/CrashHandlerFixture.cpp
similarity index 92%
rename from flang/unittests/Runtime/CrashHandlerFixture.cpp
rename to flang-rt/unittests/Runtime/CrashHandlerFixture.cpp
index 811603337e660..8213edd1f9225 100644
--- a/flang/unittests/Runtime/CrashHandlerFixture.cpp
+++ b/flang-rt/unittests/Runtime/CrashHandlerFixture.cpp
@@ -1,12 +1,13 @@
-//===-- flang/unittests/Runtime/CrashHandlerFixture.cpp ---------*- C++ -*-===//
+//===-- unittests/Runtime/CrashHandlerFixture.cpp ---------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 #include "CrashHandlerFixture.h"
-#include "../../runtime/terminator.h"
+#include "flang-rt/runtime/terminator.h"
 #include <cstdarg>
 #include <cstdlib>
 
diff --git a/flang/unittests/Runtime/CrashHandlerFixture.h b/flang-rt/unittests/Runtime/CrashHandlerFixture.h
similarity index 70%
rename from flang/unittests/Runtime/CrashHandlerFixture.h
rename to flang-rt/unittests/Runtime/CrashHandlerFixture.h
index fe0ee0da5204e..74531d1e728f4 100644
--- a/flang/unittests/Runtime/CrashHandlerFixture.h
+++ b/flang-rt/unittests/Runtime/CrashHandlerFixture.h
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/CrashHandlerFixture.h -----------*- C++ -*-===//
+//===-- unittests/Runtime/CrashHandlerFixture.h -----------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,12 +10,13 @@
 /// with expected message.
 //
 //===----------------------------------------------------------------------===//
-#ifndef LLVM_FLANG_UNITTESTS_RUNTIMEGTEST_CRASHHANDLERFIXTURE_H
-#define LLVM_FLANG_UNITTESTS_RUNTIMEGTEST_CRASHHANDLERFIXTURE_H
+
+#ifndef FLANG_RT_UNITTESTS_RUNTIME_CRASHHANDLERFIXTURE_H_
+#define FLANG_RT_UNITTESTS_RUNTIME_CRASHHANDLERFIXTURE_H_
 #include <gtest/gtest.h>
 
 struct CrashHandlerFixture : testing::Test {
   void SetUp();
 };
 
-#endif
+#endif /* FLANG_RT_UNITTESTS_RUNTIME_CRASHHANDLERFIXTURE_H_ */
diff --git a/flang/unittests/Runtime/Derived.cpp b/flang-rt/unittests/Runtime/Derived.cpp
similarity index 93%
rename from flang/unittests/Runtime/Derived.cpp
rename to flang-rt/unittests/Runtime/Derived.cpp
index 019d5e8309e4a..3196ba796ad19 100644
--- a/flang/unittests/Runtime/Derived.cpp
+++ b/flang-rt/unittests/Runtime/Derived.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Pointer.cpp--------- -------------*- C++-*-===//
+//===-- unittests/Runtime/Derived.cpp ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "gtest/gtest.h"
-#include "tools.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/tools.h"
 #include "flang/Runtime/derived-api.h"
-#include "flang/Runtime/descriptor.h"
 
 using namespace Fortran::runtime;
 
diff --git a/flang/unittests/Runtime/ExternalIOTest.cpp b/flang-rt/unittests/Runtime/ExternalIOTest.cpp
similarity index 99%
rename from flang/unittests/Runtime/ExternalIOTest.cpp
rename to flang-rt/unittests/Runtime/ExternalIOTest.cpp
index b9407b5e7a591..c83535ca82bd3 100644
--- a/flang/unittests/Runtime/ExternalIOTest.cpp
+++ b/flang-rt/unittests/Runtime/ExternalIOTest.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/RuntimeGTest/ExternalIOTest.cpp ---------*- C++ -*-===//
+//===-- unittests/Runtime/ExternalIOTest.cpp --------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -12,7 +12,7 @@
 
 #include "CrashHandlerFixture.h"
 #include "gtest/gtest.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
 #include "flang/Runtime/io-api-consts.h"
 #include "flang/Runtime/main.h"
 #include "flang/Runtime/stop.h"
diff --git a/flang/unittests/Runtime/Format.cpp b/flang-rt/unittests/Runtime/Format.cpp
similarity index 96%
rename from flang/unittests/Runtime/Format.cpp
rename to flang-rt/unittests/Runtime/Format.cpp
index 01803c628de26..fe7403f26700b 100644
--- a/flang/unittests/Runtime/Format.cpp
+++ b/flang-rt/unittests/Runtime/Format.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Format.cpp ----------------------*- C++ -*-===//
+//===-- unittests/Runtime/Format.cpp ----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "CrashHandlerFixture.h"
-#include "../runtime/connection.h"
-#include "../runtime/format-implementation.h"
-#include "../runtime/io-error.h"
+#include "flang-rt/runtime/connection.h"
+#include "flang-rt/runtime/format-implementation.h"
+#include "flang-rt/runtime/io-error.h"
 #include <optional>
 #include <string>
 #include <tuple>
diff --git a/flang/unittests/Runtime/Inquiry.cpp b/flang-rt/unittests/Runtime/Inquiry.cpp
similarity index 97%
rename from flang/unittests/Runtime/Inquiry.cpp
rename to flang-rt/unittests/Runtime/Inquiry.cpp
index 3b523e992a317..e79c037864d9c 100644
--- a/flang/unittests/Runtime/Inquiry.cpp
+++ b/flang-rt/unittests/Runtime/Inquiry.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Inquiry.cpp -------------------------------===//
+//===-- unittests/Runtime/Inquiry.cpp ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/inquiry.h"
-#include "gtest/gtest.h"
 #include "tools.h"
-#include "flang/Runtime/type-code.h"
+#include "gtest/gtest.h"
+#include "flang-rt/runtime/type-code.h"
 
 using namespace Fortran::runtime;
 using Fortran::common::TypeCategory;
diff --git a/flang/unittests/Runtime/ListInputTest.cpp b/flang-rt/unittests/Runtime/ListInputTest.cpp
similarity index 98%
rename from flang/unittests/Runtime/ListInputTest.cpp
rename to flang-rt/unittests/Runtime/ListInputTest.cpp
index 38c758b7ef966..310c41a5c3fa5 100644
--- a/flang/unittests/Runtime/ListInputTest.cpp
+++ b/flang-rt/unittests/Runtime/ListInputTest.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/ListInputTest.cpp ---------------*- C++ -*-===//
+//===-- unittests/Runtime/ListInputTest.cpp ---------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "CrashHandlerFixture.h"
-#include "../../runtime/io-error.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/io-error.h"
 #include "flang/Runtime/io-api-consts.h"
 
 using namespace Fortran::runtime;
diff --git a/flang/unittests/Runtime/LogicalFormatTest.cpp b/flang-rt/unittests/Runtime/LogicalFormatTest.cpp
similarity index 94%
rename from flang/unittests/Runtime/LogicalFormatTest.cpp
rename to flang-rt/unittests/Runtime/LogicalFormatTest.cpp
index 26c9374be1338..bc933292181c1 100644
--- a/flang/unittests/Runtime/LogicalFormatTest.cpp
+++ b/flang-rt/unittests/Runtime/LogicalFormatTest.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/LogicalFormatTest.cpp -----------*- C++ -*-===//
+//===-- unittests/Runtime/LogicalFormatTest.cpp -----------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CrashHandlerFixture.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
 #include "flang/Runtime/io-api-consts.h"
 #include <algorithm>
 #include <array>
diff --git a/flang/unittests/Runtime/Matmul.cpp b/flang-rt/unittests/Runtime/Matmul.cpp
similarity index 98%
rename from flang/unittests/Runtime/Matmul.cpp
rename to flang-rt/unittests/Runtime/Matmul.cpp
index c3fed9b972df2..1b716fd01fd42 100644
--- a/flang/unittests/Runtime/Matmul.cpp
+++ b/flang-rt/unittests/Runtime/Matmul.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Matmul.cpp--------- -------------*- C++ -*-===//
+//===-- unittests/Runtime/Matmul.cpp ----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,12 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/matmul.h"
-#include "gtest/gtest.h"
 #include "tools.h"
+#include "gtest/gtest.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/type-code.h"
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
-#include "flang/Runtime/type-code.h"
 
 using namespace Fortran::runtime;
 using Fortran::common::TypeCategory;
diff --git a/flang/unittests/Runtime/MatmulTranspose.cpp b/flang-rt/unittests/Runtime/MatmulTranspose.cpp
similarity index 98%
rename from flang/unittests/Runtime/MatmulTranspose.cpp
rename to flang-rt/unittests/Runtime/MatmulTranspose.cpp
index c582e945dc7c9..cc14ab755505a 100644
--- a/flang/unittests/Runtime/MatmulTranspose.cpp
+++ b/flang-rt/unittests/Runtime/MatmulTranspose.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/MatmulTranspose.cpp -------------*- C++ -*-===//
+//===-- unittests/Runtime/MatmulTranspose.cpp -------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "gtest/gtest.h"
 #include "tools.h"
+#include "gtest/gtest.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/type-code.h"
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/matmul-transpose.h"
-#include "flang/Runtime/type-code.h"
 
 using namespace Fortran::runtime;
 using Fortran::common::TypeCategory;
diff --git a/flang/unittests/Runtime/MiscIntrinsic.cpp b/flang-rt/unittests/Runtime/MiscIntrinsic.cpp
similarity index 96%
rename from flang/unittests/Runtime/MiscIntrinsic.cpp
rename to flang-rt/unittests/Runtime/MiscIntrinsic.cpp
index 7e19ed250bdc0..c6783381bfc35 100644
--- a/flang/unittests/Runtime/MiscIntrinsic.cpp
+++ b/flang-rt/unittests/Runtime/MiscIntrinsic.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/MiscIntrinsic.cpp ---------------*- C++ -*-===//
+//===-- unittests/Runtime/MiscIntrinsic.cpp ---------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "gtest/gtest.h"
 #include "tools.h"
-#include "flang/Runtime//misc-intrinsic.h"
+#include "gtest/gtest.h"
+#include "flang-rt/runtime/descriptor.h"
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/misc-intrinsic.h"
 
 using namespace Fortran::runtime;
 
diff --git a/flang/unittests/Runtime/Namelist.cpp b/flang-rt/unittests/Runtime/Namelist.cpp
similarity index 99%
rename from flang/unittests/Runtime/Namelist.cpp
rename to flang-rt/unittests/Runtime/Namelist.cpp
index 0a28f3590b86e..040dedb8cd47c 100644
--- a/flang/unittests/Runtime/Namelist.cpp
+++ b/flang-rt/unittests/Runtime/Namelist.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Namelist.cpp --------------------*- C++ -*-===//
+//===-- unittests/Runtime/Namelist.cpp --------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "../../runtime/namelist.h"
+#include "flang-rt/runtime/namelist.h"
 #include "CrashHandlerFixture.h"
 #include "tools.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
 #include "flang/Runtime/io-api-consts.h"
 #include <algorithm>
 #include <cinttypes>
diff --git a/flang/unittests/Runtime/Numeric.cpp b/flang-rt/unittests/Runtime/Numeric.cpp
similarity index 99%
rename from flang/unittests/Runtime/Numeric.cpp
rename to flang-rt/unittests/Runtime/Numeric.cpp
index 29ebbc8ad7aa7..4baad3fc9ad0e 100644
--- a/flang/unittests/Runtime/Numeric.cpp
+++ b/flang-rt/unittests/Runtime/Numeric.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Numeric.cpp ---------------------*- C++ -*-===//
+//===-- unittests/Runtime/Numeric.cpp ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/unittests/Runtime/NumericalFormatTest.cpp b/flang-rt/unittests/Runtime/NumericalFormatTest.cpp
similarity index 99%
rename from flang/unittests/Runtime/NumericalFormatTest.cpp
rename to flang-rt/unittests/Runtime/NumericalFormatTest.cpp
index 274498b8e8695..5a8ead48dcef9 100644
--- a/flang/unittests/Runtime/NumericalFormatTest.cpp
+++ b/flang-rt/unittests/Runtime/NumericalFormatTest.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/NumericalFormatTest.cpp ---------*- C++ -*-===//
+//===-- unittests/Runtime/NumericalFormatTest.cpp ---------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CrashHandlerFixture.h"
-#include "flang/Runtime/descriptor.h"
+#include "flang-rt/runtime/descriptor.h"
 #include "flang/Runtime/io-api-consts.h"
 #include <algorithm>
 #include <array>
diff --git a/flang/unittests/Runtime/Pointer.cpp b/flang-rt/unittests/Runtime/Pointer.cpp
similarity index 97%
rename from flang/unittests/Runtime/Pointer.cpp
rename to flang-rt/unittests/Runtime/Pointer.cpp
index 54720afab8d8a..6e8861d374889 100644
--- a/flang/unittests/Runtime/Pointer.cpp
+++ b/flang-rt/unittests/Runtime/Pointer.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Pointer.cpp--------- -------------*- C++-*-===//
+//===-- unittests/Runtime/Pointer.cpp ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/pointer.h"
-#include "gtest/gtest.h"
 #include "tools.h"
-#include "flang/Runtime/descriptor.h"
+#include "gtest/gtest.h"
+#include "flang-rt/runtime/descriptor.h"
 
 using namespace Fortran::runtime;
 
diff --git a/flang/unittests/Runtime/Ragged.cpp b/flang-rt/unittests/Runtime/Ragged.cpp
similarity index 94%
rename from flang/unittests/Runtime/Ragged.cpp
rename to flang-rt/unittests/Runtime/Ragged.cpp
index 5049bc83405f1..feadd032f59bd 100644
--- a/flang/unittests/Runtime/Ragged.cpp
+++ b/flang-rt/unittests/Runtime/Ragged.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Ragged.cpp ----------------------*- C++ -*-===//
+//===-- unittests/Runtime/Ragged.cpp ----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/unittests/Runtime/Random.cpp b/flang-rt/unittests/Runtime/Random.cpp
similarity index 92%
rename from flang/unittests/Runtime/Random.cpp
rename to flang-rt/unittests/Runtime/Random.cpp
index cb739b9451429..bf4b540e768fd 100644
--- a/flang/unittests/Runtime/Random.cpp
+++ b/flang-rt/unittests/Runtime/Random.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Random.cpp ----------------------*- C++ -*-===//
+//===-- unittests/Runtime/Random.cpp ----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "flang/Runtime//random.h"
+#include "flang/Runtime/random.h"
 #include "gtest/gtest.h"
-#include "flang/Runtime/descriptor.h"
-#include "flang/Runtime/type-code.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/type-code.h"
 #include <cmath>
 
 using namespace Fortran::runtime;
diff --git a/flang/unittests/Runtime/Reduction.cpp b/flang-rt/unittests/Runtime/Reduction.cpp
similarity index 99%
rename from flang/unittests/Runtime/Reduction.cpp
rename to flang-rt/unittests/Runtime/Reduction.cpp
index 29675399abf5c..3701a32042c58 100644
--- a/flang/unittests/Runtime/Reduction.cpp
+++ b/flang-rt/unittests/Runtime/Reduction.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Reductions.cpp ----------------------------===//
+//===-- unittests/Runtime/Reduction.cpp -------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,14 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/reduction.h"
-#include "gtest/gtest.h"
 #include "tools.h"
+#include "gtest/gtest.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/type-code.h"
 #include "flang/Common/float128.h"
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/reduce.h"
-#include "flang/Runtime/type-code.h"
 #include <cstdint>
 #include <cstring>
 #include <string>
diff --git a/flang/unittests/Runtime/RuntimeCrashTest.cpp b/flang-rt/unittests/Runtime/RuntimeCrashTest.cpp
similarity index 98%
rename from flang/unittests/Runtime/RuntimeCrashTest.cpp
rename to flang-rt/unittests/Runtime/RuntimeCrashTest.cpp
index 72a0b290cf864..e716dac2d1203 100644
--- a/flang/unittests/Runtime/RuntimeCrashTest.cpp
+++ b/flang-rt/unittests/Runtime/RuntimeCrashTest.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/CrashHandlerFixture.cpp ---------*- C++ -*-===//
+//===-- unittests/Runtime/RuntimeCrashTest.cpp ------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 #include "CrashHandlerFixture.h"
 #include "tools.h"
-#include "../../runtime/terminator.h"
+#include "flang-rt/runtime/terminator.h"
 #include "flang/Runtime/io-api-consts.h"
 #include "flang/Runtime/transformational.h"
 #include <gtest/gtest.h>
diff --git a/flang/unittests/Runtime/Stop.cpp b/flang-rt/unittests/Runtime/Stop.cpp
similarity index 96%
rename from flang/unittests/Runtime/Stop.cpp
rename to flang-rt/unittests/Runtime/Stop.cpp
index b13602eaee5ea..5d1af20e77de7 100644
--- a/flang/unittests/Runtime/Stop.cpp
+++ b/flang-rt/unittests/Runtime/Stop.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Stop.cpp ----------------------------------===//
+//===-- unittests/Runtime/Stop.cpp ------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,9 +9,10 @@
 /// Test runtime API for STOP statement and runtime API to kill the program.
 //
 //===----------------------------------------------------------------------===//
+
 #include "flang/Runtime/stop.h"
 #include "CrashHandlerFixture.h"
-#include "../../runtime/environment.h"
+#include "flang-rt/runtime/environment.h"
 #include <cstdlib>
 #include <gtest/gtest.h>
 
diff --git a/flang/unittests/Runtime/Support.cpp b/flang-rt/unittests/Runtime/Support.cpp
similarity index 96%
rename from flang/unittests/Runtime/Support.cpp
rename to flang-rt/unittests/Runtime/Support.cpp
index 8c8de73b5b979..c97a6eae3a155 100644
--- a/flang/unittests/Runtime/Support.cpp
+++ b/flang-rt/unittests/Runtime/Support.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Support.cpp ----------------------*- C++-*-===//
+//===-- unittests/Runtime/Support.cpp ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/support.h"
-#include "gtest/gtest.h"
 #include "tools.h"
-#include "flang/Runtime/descriptor.h"
+#include "gtest/gtest.h"
+#include "flang-rt/runtime/descriptor.h"
 
 using namespace Fortran::runtime;
 using Fortran::common::TypeCategory;
diff --git a/flang/unittests/Runtime/TemporaryStack.cpp b/flang-rt/unittests/Runtime/TemporaryStack.cpp
similarity index 98%
rename from flang/unittests/Runtime/TemporaryStack.cpp
rename to flang-rt/unittests/Runtime/TemporaryStack.cpp
index 4f701e09b2945..ff8efe123d7cc 100644
--- a/flang/unittests/Runtime/TemporaryStack.cpp
+++ b/flang-rt/unittests/Runtime/TemporaryStack.cpp
@@ -1,4 +1,4 @@
-//===--- flang/unittests/Runtime/TemporaryStack.cpp -------------*- C++ -*-===//
+//===-- unittests/Runtime/TemporaryStack.cpp --------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,13 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "gtest/gtest.h"
-#include "tools.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/tools.h"
+#include "flang-rt/runtime/type-code.h"
 #include "flang/Common/ISO_Fortran_binding_wrapper.h"
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/temporary-stack.h"
-#include "flang/Runtime/type-code.h"
 #include <vector>
 
 using namespace Fortran::runtime;
diff --git a/flang/unittests/Runtime/Time.cpp b/flang-rt/unittests/Runtime/Time.cpp
similarity index 98%
rename from flang/unittests/Runtime/Time.cpp
rename to flang-rt/unittests/Runtime/Time.cpp
index 9309d7b1ceffa..548c0834e34a3 100644
--- a/flang/unittests/Runtime/Time.cpp
+++ b/flang-rt/unittests/Runtime/Time.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Time.cpp ----------------------------===//
+//===-- unittests/Runtime/Time.cpp ------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/unittests/Runtime/Transformational.cpp b/flang-rt/unittests/Runtime/Transformational.cpp
similarity index 99%
rename from flang/unittests/Runtime/Transformational.cpp
rename to flang-rt/unittests/Runtime/Transformational.cpp
index 1d84b7e23779a..06df96a3cc45a 100644
--- a/flang/unittests/Runtime/Transformational.cpp
+++ b/flang-rt/unittests/Runtime/Transformational.cpp
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/Transformational.cpp ----------------------===//
+//===-- unittests/Runtime/Transformational.cpp ------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,10 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/transformational.h"
-#include "gtest/gtest.h"
 #include "tools.h"
+#include "gtest/gtest.h"
+#include "flang-rt/runtime/type-code.h"
 #include "flang/Common/float128.h"
-#include "flang/Runtime/type-code.h"
 #include <vector>
 
 using namespace Fortran::runtime;
diff --git a/flang/unittests/Runtime/tools.h b/flang-rt/unittests/Runtime/tools.h
similarity index 85%
rename from flang/unittests/Runtime/tools.h
rename to flang-rt/unittests/Runtime/tools.h
index 0347edace5c05..36a4c2bd9c8b7 100644
--- a/flang/unittests/Runtime/tools.h
+++ b/flang-rt/unittests/Runtime/tools.h
@@ -1,4 +1,4 @@
-//===-- flang/unittests/Runtime/tools.h -------------------------*- C++ -*-===//
+//===-- unittests/Runtime/tools.h -------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,14 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef FORTRAN_UNITTESTS_RUNTIME_TOOLS_H_
-#define FORTRAN_UNITTESTS_RUNTIME_TOOLS_H_
+#ifndef FLANG_RT_UNITTESTS_RUNTIME_TOOLS_H_
+#define FLANG_RT_UNITTESTS_RUNTIME_TOOLS_H_
 
 #include "gtest/gtest.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/type-code.h"
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/cpp-type.h"
-#include "flang/Runtime/descriptor.h"
-#include "flang/Runtime/type-code.h"
 #include <cstdint>
 #include <cstring>
 #include <vector>
@@ -54,4 +54,4 @@ static OwningPtr<Descriptor> MakeArray(const std::vector<int> &shape,
 }
 
 } // namespace Fortran::runtime
-#endif // FORTRAN_UNITTESTS_RUNTIME_TOOLS_H_
+#endif // FLANG_RT_UNITTESTS_RUNTIME_TOOLS_H_
diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index 0f98d12343c43..944474acf294c 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -23,6 +23,7 @@ if (LLVM_ENABLE_EH)
 endif()
 
 set(FLANG_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(FLANG_RT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../flang-rt")
 
 if (CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR AND NOT MSVC_IDE)
   message(FATAL_ERROR "In-source builds are not allowed. \
diff --git a/flang/examples/ExternalHelloWorld/CMakeLists.txt b/flang/examples/ExternalHelloWorld/CMakeLists.txt
index b61948718a5e3..dbb69475976dd 100644
--- a/flang/examples/ExternalHelloWorld/CMakeLists.txt
+++ b/flang/examples/ExternalHelloWorld/CMakeLists.txt
@@ -1,6 +1,6 @@
 # This test is not run by default as it requires input.
 add_llvm_example(external-hello-world
-  external-hello.cpp
+  ${FLANG_RT_SOURCE_DIR}/examples/ExternalHelloWorld/external-hello.cpp
 )
 
 target_link_libraries(external-hello-world
diff --git a/flang/include/flang/Runtime/allocatable.h b/flang/include/flang/Runtime/allocatable.h
index 58061d9862095..714d85ec073c6 100644
--- a/flang/include/flang/Runtime/allocatable.h
+++ b/flang/include/flang/Runtime/allocatable.h
@@ -11,7 +11,7 @@
 #ifndef FORTRAN_RUNTIME_ALLOCATABLE_H_
 #define FORTRAN_RUNTIME_ALLOCATABLE_H_
 
-#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/descriptor-consts.h"
 #include "flang/Runtime/entry-names.h"
 
 namespace Fortran::runtime {
diff --git a/flang/include/flang/Runtime/descriptor-consts.h b/flang/include/flang/Runtime/descriptor-consts.h
index abcdbc4a12002..acd7bc5ddbdef 100644
--- a/flang/include/flang/Runtime/descriptor-consts.h
+++ b/flang/include/flang/Runtime/descriptor-consts.h
@@ -9,6 +9,7 @@
 #ifndef FORTRAN_RUNTIME_DESCRIPTOR_CONSTS_H_
 #define FORTRAN_RUNTIME_DESCRIPTOR_CONSTS_H_
 
+#include "flang/Common/Fortran-consts.h"
 #include "flang/Common/ISO_Fortran_binding_wrapper.h"
 #include "flang/Common/api-attrs.h"
 #include <cstddef>
@@ -29,6 +30,7 @@ class DerivedType;
 namespace Fortran::runtime {
 class Descriptor;
 using SubscriptValue = ISO::CFI_index_t;
+using common::TypeCategory;
 
 /// Returns size in bytes of the descriptor (not the data)
 /// This must be at least as large as the largest descriptor of any target
diff --git a/flang/include/flang/Runtime/pointer.h b/flang/include/flang/Runtime/pointer.h
index 704144f08114f..67c4fe266f55c 100644
--- a/flang/include/flang/Runtime/pointer.h
+++ b/flang/include/flang/Runtime/pointer.h
@@ -12,7 +12,7 @@
 #ifndef FORTRAN_RUNTIME_POINTER_H_
 #define FORTRAN_RUNTIME_POINTER_H_
 
-#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/descriptor-consts.h"
 #include "flang/Runtime/entry-names.h"
 
 namespace Fortran::runtime {
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index 1f4ee69598918..fbd1e1e2cc56e 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -16,6 +16,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
   set(CMAKE_CXX_EXTENSIONS OFF)
 
   set(FLANG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/..")
+  set(FLANG_RT_SOURCE_DIR "${FLANG_SOURCE_DIR}/../flang-rt")
 
   set(LLVM_COMMON_CMAKE_UTILS "${FLANG_SOURCE_DIR}/../cmake")
   set(LLVM_CMAKE_UTILS "${FLANG_SOURCE_DIR}/../llvm/cmake")
@@ -57,8 +58,31 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     REAL(16) is mapped to __float128, or libm for targets where REAL(16) \
     is mapped to long double, etc."
     )
+
+  option(FLANG_CUF_RUNTIME
+    "Compile CUDA Fortran runtime sources" OFF)
+  if (FLANG_CUF_RUNTIME)
+    find_package(CUDAToolkit REQUIRED)
+  endif()
 endif()
 
+# Runtime files are in Flang-RT's source dir.
+function (runtime_source_files outvar)
+  cmake_parse_arguments(ARG "" "SUBDIR" "" ${ARGN})
+
+  set(new_sources "")
+  foreach (source IN LISTS "${outvar}")
+    set(new_source "${FLANG_RT_SOURCE_DIR}/lib")
+    cmake_path(APPEND new_source "${ARG_SUBDIR}")
+    cmake_path(APPEND new_source "${source}")
+    list(APPEND new_sources "${new_source}")
+  endforeach ()
+  set("${outvar}" ${new_sources} PARENT_SCOPE)
+endfunction ()
+
+# Runtime includes are in Flang-RT's source dir.
+include_directories(BEFORE "${FLANG_RT_SOURCE_DIR}/include")
+
 set(linked_libraries "")
 
 # function checks
@@ -108,7 +132,7 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES "XL")
   set(NO_RTTI_FLAGS "-qnoeh -qnortti")
 endif ()
 
-configure_file(config.h.cmake config.h)
+configure_file("${FLANG_RT_SOURCE_DIR}/cmake/config.h.cmake.in" config.h)
 # include_directories is used here instead of target_include_directories
 # because add_flang_library creates multiple objects (STATIC/SHARED, OBJECT)
 # with different names
@@ -252,6 +276,7 @@ set(supported_files
   unit.cpp
   utf.cpp
   )
+runtime_source_files(supported_files SUBDIR "runtime")
 
 enable_cuda_compilation(flang_rt "${supported_files}")
 enable_omp_offload_compilation("${supported_files}")
@@ -285,6 +310,8 @@ if (NOT TARGET flang_rt.quadmath)
     list(APPEND sources ${f128_sources})
   endif()
 endif()
+runtime_source_files(sources SUBDIR "runtime")
+
 
 if (NOT DEFINED MSVC)
   add_flang_library(flang_rt.runtime
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index 1fd3bf22a83cf..aac1f62661810 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -13,7 +13,7 @@ include_directories(${CUDAToolkit_INCLUDE_DIRS})
 # added to the library name.
 set(CUFRT_LIBNAME flang_rt.cuda_${CUDAToolkit_VERSION_MAJOR})
 
-add_flang_library(${CUFRT_LIBNAME}
+set(sources
   allocator.cpp
   allocatable.cpp
   descriptor.cpp
@@ -24,6 +24,11 @@ add_flang_library(${CUFRT_LIBNAME}
   pointer.cpp
   registration.cpp
 )
+runtime_source_files(sources SUBDIR "cuda")
+
+add_flang_library(${CUFRT_LIBNAME}
+  ${sources}
+)
 
 if (BUILD_SHARED_LIBS)
   set(CUDA_RT_TARGET CUDA::cudart)
diff --git a/flang/runtime/Float128Math/CMakeLists.txt b/flang/runtime/Float128Math/CMakeLists.txt
index 3c382d16a21cd..cdac5f893cf91 100644
--- a/flang/runtime/Float128Math/CMakeLists.txt
+++ b/flang/runtime/Float128Math/CMakeLists.txt
@@ -68,6 +68,7 @@ set(sources
   y1.cpp
   yn.cpp
   )
+runtime_source_files(sources SUBDIR "quadmath")
 
 include_directories(AFTER "${CMAKE_CURRENT_SOURCE_DIR}/..")
 add_library(FortranFloat128MathILib INTERFACE)
diff --git a/flang/runtime/config.h.cmake b/flang/runtime/config.h.cmake
deleted file mode 100644
index a2271be77b8c6..0000000000000
--- a/flang/runtime/config.h.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef FORTRAN_RUNTIME_CONFIG_H
-#define FORTRAN_RUNTIME_CONFIG_H
-
-/* Define to 1 if you have the `strerror_r' function. */
-#cmakedefine01 HAVE_STRERROR_R
-
-/* Define to 1 if you have the declaration of `strerror_s', and to 0 if you
-   don't. */
-#cmakedefine01 HAVE_DECL_STRERROR_S
-
-/* Define to 1 if you have the `backtrace' function. */
-#cmakedefine HAVE_BACKTRACE ${HAVE_BACKTRACE}
-
-#define BACKTRACE_HEADER <${BACKTRACE_HEADER}>
-
-#endif
diff --git a/flang/unittests/CMakeLists.txt b/flang/unittests/CMakeLists.txt
index c54ceb3332abf..a02f791135f1f 100644
--- a/flang/unittests/CMakeLists.txt
+++ b/flang/unittests/CMakeLists.txt
@@ -12,6 +12,8 @@ endif()
 add_custom_target(FlangUnitTests)
 set_target_properties(FlangUnitTests PROPERTIES FOLDER "Flang/Tests")
 
+include_directories("${FLANG_RT_SOURCE_DIR}/include")
+
 function(add_flang_unittest_offload_properties target)
   # Do not apply runtime properties if not even compiling the runtime.
   if (NOT FLANG_INCLUDE_RUNTIME)
@@ -61,7 +63,13 @@ function(add_flang_nongtest_unittest test_name)
       set(suffix .test)
   endif()
 
-  add_executable(${test_name}${suffix} ${test_name}.cpp)
+  # Sources for runtime tests are in Flang-RT.
+  set(test_filepath "${FLANG_RT_SOURCE_DIR}/unittests/Evaluate/${test_name}.cpp")
+  if (NOT EXISTS "${test_filepath}")
+    set(test_filepath "${test_name}.cpp")
+  endif ()
+
+  add_executable(${test_name}${suffix} "${test_filepath}")
   set_target_properties(${test_name}${suffix} PROPERTIES FOLDER "Flang/Tests/Unit")
 
   if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB)
diff --git a/flang/unittests/Runtime/CMakeLists.txt b/flang/unittests/Runtime/CMakeLists.txt
index f3743be49b015..40afec3e113f3 100644
--- a/flang/unittests/Runtime/CMakeLists.txt
+++ b/flang/unittests/Runtime/CMakeLists.txt
@@ -1,34 +1,34 @@
 add_flang_unittest(FlangRuntimeTests
-  AccessTest.cpp
-  Allocatable.cpp
-  ArrayConstructor.cpp
-  BufferTest.cpp
-  CharacterTest.cpp
-  CommandTest.cpp
-  Complex.cpp
-  CrashHandlerFixture.cpp
-  Derived.cpp
-  ExternalIOTest.cpp
-  Format.cpp
-  Inquiry.cpp
-  ListInputTest.cpp
-  LogicalFormatTest.cpp
-  Matmul.cpp
-  MatmulTranspose.cpp
-  MiscIntrinsic.cpp
-  Namelist.cpp
-  Numeric.cpp
-  NumericalFormatTest.cpp
-  Pointer.cpp
-  Ragged.cpp
-  Random.cpp
-  Reduction.cpp
-  RuntimeCrashTest.cpp
-  Stop.cpp
-  Support.cpp
-  Time.cpp
-  TemporaryStack.cpp
-  Transformational.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/AccessTest.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Allocatable.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/ArrayConstructor.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/BufferTest.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/CharacterTest.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/CommandTest.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Complex.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/CrashHandlerFixture.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Derived.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/ExternalIOTest.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Format.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Inquiry.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/ListInputTest.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/LogicalFormatTest.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Matmul.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/MatmulTranspose.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/MiscIntrinsic.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Namelist.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Numeric.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/NumericalFormatTest.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Pointer.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Ragged.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Random.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Reduction.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/RuntimeCrashTest.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Stop.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Support.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Time.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/TemporaryStack.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/Transformational.cpp
 )
 
 target_link_libraries(FlangRuntimeTests
diff --git a/flang/unittests/Runtime/CUDA/CMakeLists.txt b/flang/unittests/Runtime/CUDA/CMakeLists.txt
index 860b2664d623b..6901da3920a46 100644
--- a/flang/unittests/Runtime/CUDA/CMakeLists.txt
+++ b/flang/unittests/Runtime/CUDA/CMakeLists.txt
@@ -1,9 +1,9 @@
 if (FLANG_CUF_RUNTIME)
 
 add_flang_unittest(FlangCufRuntimeTests
-  Allocatable.cpp
-  AllocatorCUF.cpp
-  Memory.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/CUDA/Allocatable.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/CUDA/AllocatorCUF.cpp
+  ${FLANG_RT_SOURCE_DIR}/unittests/Runtime/CUDA/Memory.cpp
 )
 
 if (BUILD_SHARED_LIBS)

From a1120c9b797a8af57022d6840ded8e9f6c5057eb Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byrnes@amd.com>
Date: Sun, 16 Feb 2025 04:34:20 -0800
Subject: [PATCH 053/109] [AMDGPU] NFC: Fix some details for lit test (#127141)

Addressed comments in https://github.com/llvm/llvm-project/pull/126976
---
 llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll | 53 ++++++++++++++++-----
 1 file changed, 41 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll
index 200d68b2dc1a9..f582f984a3924 100644
--- a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll
+++ b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll
@@ -1,17 +1,48 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 --stop-after=si-fix-sgpr-copies < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
 
 ; iglp.opt should not be flagged as clobbering the memory operand for the global_load, and we should be able to
 ; lower into the scalar version (i.e. should not need to lower into vector version with waterfall loop)
-; CHECK-NOT: WATERFALL
 
-define amdgpu_kernel void @_attn_forward_fp8e5_128x32x64_BW128(ptr addrspace(1) %in, ptr addrspace(3) %out) {
+define amdgpu_kernel void @func(ptr addrspace(1) %in, ptr addrspace(3) %out) {
+; CHECK-LABEL: func:
+; CHECK:       ; %bb.0: ; %.lr.ph
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[8:9], 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], 0
+; CHECK-NEXT:    s_mov_b32 s3, 32
+; CHECK-NEXT:    s_mov_b32 s2, 0
+; CHECK-NEXT:    s_mov_b64 s[12:13], 0
+; CHECK-NEXT:  .LBB0_1: ; %loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[10:11]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_add_u32 s10, s6, s12
+; CHECK-NEXT:    s_addc_u32 s11, s7, s13
+; CHECK-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
+; CHECK-NEXT:    s_add_i32 s3, s3, -1
+; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
+; CHECK-NEXT:    ; iglp_opt mask(0x00000000)
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_1
+; CHECK-NEXT:  ; %bb.2: ; %end
+; CHECK-NEXT:    s_and_b32 s1, s1, 0xffff
+; CHECK-NEXT:    s_mov_b32 s3, s2
+; CHECK-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x8
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; CHECK-NEXT:    ds_write_b64 v2, v[0:1]
+; CHECK-NEXT:    s_endpgm
 .lr.ph:
-  br label %1
+  br label %loop
 
-1:                                                ; preds = %1, %.lr.ph
-  %addr = phi ptr addrspace(1) [ null, %.lr.ph ], [ %gep, %1 ]
-  %offset = phi i64 [ 0, %.lr.ph ], [ %nextOff, %1 ]
-  %inc = phi i32 [0, %.lr.ph], [ %incCond, %1 ] 
+loop:                                                ; preds = %1, %.lr.ph
+  %addr = phi ptr addrspace(1) [ null, %.lr.ph ], [ %gep, %loop ]
+  %offset = phi i64 [ 0, %.lr.ph ], [ %nextOff, %loop ]
+  %inc = phi i32 [0, %.lr.ph], [ %incCond, %loop ]
   %rsrc = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %addr, i16 0, i32 0, i32 0)
   %load = tail call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
   %load.bc = bitcast <2 x i32> %load to <8 x i8>
@@ -25,15 +56,13 @@ define amdgpu_kernel void @_attn_forward_fp8e5_128x32x64_BW128(ptr addrspace(1)
   %nextOff = extractelement <1 x i64> %unmaskedload49, i64 0
   %incCond = add i32 %inc, 1
   %cond = icmp eq i32 %incCond, 32
-  br i1 %cond, label %2, label %1 
+  br i1 %cond, label %end, label %loop
 
-2:
+end:
   store <4 x half> %shuff, ptr addrspace(3) %out, align 8
   ret void
 }
 
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
 declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32) #0
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
 declare <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #1

From 8ac5d2d1805ecb70b683531b602ac3d288351e97 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Sun, 16 Feb 2025 12:40:03 +0000
Subject: [PATCH 054/109] ConstRange: test edge-cases of makeAllowedICmpRegion
 (#127080)

Exhaustively test signed-unsigned min-max edge-cases of
makeAllowedICmpRegion.
---
 llvm/unittests/IR/ConstantRangeTest.cpp | 30 ++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp
index daa07bf7d840d..1bafb52d357fa 100644
--- a/llvm/unittests/IR/ConstantRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantRangeTest.cpp
@@ -1631,11 +1631,35 @@ TEST_F(ConstantRangeTest, Ashr) {
             ConstantRange(APInt(16, 0xfffc), APInt(16, 0xfffe)));
 }
 
-TEST(ConstantRange, MakeAllowedICmpRegion) {
-  // PR8250
-  ConstantRange SMax = ConstantRange(APInt::getSignedMaxValue(32));
+TEST(ConstantRange, MakeAllowedICmpRegionEdgeCases) {
+  ConstantRange SMax = ConstantRange(APInt::getSignedMaxValue(8));
   EXPECT_TRUE(ConstantRange::makeAllowedICmpRegion(ICmpInst::ICMP_SGT, SMax)
                   .isEmptySet());
+  EXPECT_TRUE(ConstantRange::makeAllowedICmpRegion(ICmpInst::ICMP_SGE, SMax)
+                  .isSingleElement());
+  EXPECT_TRUE(ConstantRange::makeAllowedICmpRegion(ICmpInst::ICMP_SLE, SMax)
+                  .isFullSet());
+  ConstantRange SMin = ConstantRange(APInt::getSignedMinValue(8));
+  EXPECT_TRUE(ConstantRange::makeAllowedICmpRegion(ICmpInst::ICMP_SLT, SMin)
+                  .isEmptySet());
+  EXPECT_TRUE(ConstantRange::makeAllowedICmpRegion(ICmpInst::ICMP_SLE, SMin)
+                  .isSingleElement());
+  EXPECT_TRUE(ConstantRange::makeAllowedICmpRegion(ICmpInst::ICMP_SGE, SMin)
+                  .isFullSet());
+  ConstantRange UMax = ConstantRange(APInt::getMaxValue(8));
+  EXPECT_TRUE(ConstantRange::makeAllowedICmpRegion(ICmpInst::ICMP_UGT, UMax)
+                  .isEmptySet());
+  EXPECT_TRUE(ConstantRange::makeAllowedICmpRegion(ICmpInst::ICMP_UGE, UMax)
+                  .isSingleElement());
+  EXPECT_TRUE(ConstantRange::makeAllowedICmpRegion(ICmpInst::ICMP_ULE, UMax)
+                  .isFullSet());
+  ConstantRange UMin = ConstantRange(APInt::getMinValue(8));
+  EXPECT_TRUE(ConstantRange::makeAllowedICmpRegion(ICmpInst::ICMP_ULT, UMin)
+                  .isEmptySet());
+  EXPECT_TRUE(ConstantRange::makeAllowedICmpRegion(ICmpInst::ICMP_ULE, UMin)
+                  .isSingleElement());
+  EXPECT_TRUE(ConstantRange::makeAllowedICmpRegion(ICmpInst::ICMP_UGE, UMin)
+                  .isFullSet());
 }
 
 TEST(ConstantRange, MakeSatisfyingICmpRegion) {

From 81c85ea30f7bd962e5306a02525ba22afa42b7c7 Mon Sep 17 00:00:00 2001
From: Michael Kruse <llvm-project@meinersbur.de>
Date: Sun, 16 Feb 2025 13:43:02 +0100
Subject: [PATCH 055/109] [flang-rt] Fix aarch64-libcxx build failure

There seem to be multiple declarations of __libcpp_verbose_abort, some
with noexcept and some without. Reverting to the previous
forward-declaration (without noexcept), which seems to have worked
before.
---
 flang-rt/lib/runtime/io-api-minimal.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang-rt/lib/runtime/io-api-minimal.cpp b/flang-rt/lib/runtime/io-api-minimal.cpp
index c706a3aa239a5..8d8c9c6070b04 100644
--- a/flang-rt/lib/runtime/io-api-minimal.cpp
+++ b/flang-rt/lib/runtime/io-api-minimal.cpp
@@ -150,7 +150,7 @@ bool IODEF(OutputLogical)(Cookie cookie, bool truth) {
 // Provide own definition for `std::__libcpp_verbose_abort` to avoid dependency
 // on the version provided by libc++.
 
-void std::__libcpp_verbose_abort(char const *format, ...) noexcept {
+void std::__libcpp_verbose_abort(char const *format, ...) {
   va_list list;
   va_start(list, format);
   std::vfprintf(stderr, format, list);

From 9c4ef23126728562b8717cf404bdbbdeea1aa72e Mon Sep 17 00:00:00 2001
From: Andreas Jonson <andjo403@hotmail.com>
Date: Sun, 16 Feb 2025 14:01:05 +0100
Subject: [PATCH 056/109] [InstCombine] Test foldSelectICmpAnd with extra uses.
 (NFC)

---
 .../Transforms/InstCombine/select-icmp-and.ll | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/select-icmp-and.ll b/llvm/test/Transforms/InstCombine/select-icmp-and.ll
index 219a66c314a07..7c95fc125ce7b 100644
--- a/llvm/test/Transforms/InstCombine/select-icmp-and.ll
+++ b/llvm/test/Transforms/InstCombine/select-icmp-and.ll
@@ -372,6 +372,22 @@ define i32 @test15e(i32 %X) {
   ret i32 %t3
 }
 
+;; (a & 128) ? 256 : 0
+define i32 @test15e_extra_use(i32 %X) {
+; CHECK-LABEL: @test15e_extra_use(
+; CHECK-NEXT:    [[T1:%.*]] = and i32 [[X:%.*]], 128
+; CHECK-NEXT:    [[T2:%.*]] = icmp ne i32 [[T1]], 0
+; CHECK-NEXT:    [[T3:%.*]] = shl nuw nsw i32 [[T1]], 1
+; CHECK-NEXT:    call void @use1(i1 [[T2]])
+; CHECK-NEXT:    ret i32 [[T3]]
+;
+  %t1 = and i32 %X, 128
+  %t2 = icmp ne i32 %t1, 0
+  %t3 = select i1 %t2, i32 256, i32 0
+  call void @use1(i1 %t2)
+  ret i32 %t3
+}
+
 ;; (a & 128) ? 0 : 256
 define i32 @test15f(i32 %X) {
 ; CHECK-LABEL: @test15f(
@@ -386,6 +402,23 @@ define i32 @test15f(i32 %X) {
   ret i32 %t3
 }
 
+;; (a & 128) ? 0 : 256
+define i32 @test15f_extra_use(i32 %X) {
+; CHECK-LABEL: @test15f_extra_use(
+; CHECK-NEXT:    [[T1:%.*]] = and i32 [[X:%.*]], 128
+; CHECK-NEXT:    [[T2:%.*]] = icmp ne i32 [[T1]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[T1]], 1
+; CHECK-NEXT:    [[T3:%.*]] = xor i32 [[TMP1]], 256
+; CHECK-NEXT:    call void @use1(i1 [[T2]])
+; CHECK-NEXT:    ret i32 [[T3]]
+;
+  %t1 = and i32 %X, 128
+  %t2 = icmp ne i32 %t1, 0
+  %t3 = select i1 %t2, i32 0, i32 256
+  call void @use1(i1 %t2)
+  ret i32 %t3
+}
+
 ;; (a & 8) ? -1 : -9
 define i32 @test15g(i32 %X) {
 ; CHECK-LABEL: @test15g(

From 552e4659d3f935b8896a1350856781417bcbdcde Mon Sep 17 00:00:00 2001
From: GS-GOAT <86884129+GS-GOAT@users.noreply.github.com>
Date: Sun, 16 Feb 2025 19:04:56 +0530
Subject: [PATCH 057/109] [Clang][NFCI] Fix a logic issue in
 TransformDesignatedInitExpr (#127211)

The check for whether the array index expression changed during its transformation clearly contained a typo: it compared `Init.get()` instead of `Index.get()` against the original array index.

This doesn't appear to be a functional change in practice, so no test case or release note provided.

Fixes #126113
---
 clang/lib/Sema/TreeTransform.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index fc1e3f7d58f4d..73e979927b4f3 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -13672,7 +13672,7 @@ TreeTransform<Derived>::TransformDesignatedInitExpr(DesignatedInitExpr *E) {
       Desig.AddDesignator(
           Designator::CreateArrayDesignator(Index.get(), D.getLBracketLoc()));
 
-      ExprChanged = ExprChanged || Init.get() != E->getArrayIndex(D);
+      ExprChanged = ExprChanged || Index.get() != E->getArrayIndex(D);
       ArrayExprs.push_back(Index.get());
       continue;
     }

From 79d8a34bc5c0a261b9e9f77c1d4480ef135481ac Mon Sep 17 00:00:00 2001
From: "Oleksandr \"Alex\" Zinenko" <git@ozinenko.com>
Date: Sun, 16 Feb 2025 14:51:21 +0100
Subject: [PATCH 058/109] [mlir] add some FP classification ops and their
 lowering to libdevice (#127322)

Introduce a subset of floating point classification ops to the Math
dialect. These ops mirror functions provided by the C math library and,
similarly to the existing `math.copysign`, belong to the math dialect.
Add a lowering of those ops to Nvidia libdevice calls when possible as
the first mechanism to exercise them.
---
 mlir/include/mlir/Dialect/Math/IR/MathOps.td  | 90 +++++++++++++++++++
 .../GPUCommon/OpToFuncCallLowering.h          | 29 ++++--
 .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp        |  7 ++
 mlir/lib/Dialect/Math/IR/MathOps.cpp          | 14 +++
 .../Conversion/GPUToNVVM/gpu-to-nvvm.mlir     | 37 ++++++++
 mlir/test/Dialect/Math/ops.mlir               | 39 ++++++++
 6 files changed, 211 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Math/IR/MathOps.td b/mlir/include/mlir/Dialect/Math/IR/MathOps.td
index 5990a9f0d2e44..8a277320e2f91 100644
--- a/mlir/include/mlir/Dialect/Math/IR/MathOps.td
+++ b/mlir/include/mlir/Dialect/Math/IR/MathOps.td
@@ -34,6 +34,23 @@ class Math_IntegerUnaryOp<string mnemonic, list<Trait> traits = []> :
   let assemblyFormat = "$operand attr-dict `:` type($result)";
 }
 
+// Base class for floating point classification ops. Require an operand and
+// result of the same shape, which can be a floating point scalar, a vector or a
+// tensor thereof.
+class Math_FloatClassificationOp<string mnemonic, list<Trait> traits = []> :
+    Math_Op<mnemonic,
+      traits # [DeclareOpInterfaceMethods<ArithFastMathInterface>,
+                TypesMatchWith<
+          "result type has i1 element type and same shape as operands",
+          "operand", "result", "::getI1SameShape($_self)">]> {
+  let arguments = (ins FloatLike:$operand,
+      DefaultValuedAttr<Arith_FastMathAttr,
+                        "::mlir::arith::FastMathFlags::none">:$fastmath);
+  let results = (outs BoolLike:$result);
+
+  let assemblyFormat = "$operand attr-dict `:` type($operand)";
+}
+
 // Base class for unary math operations on floating point types. Require an
 // operand and result of the same type. This type can be a floating point type,
 // vector or tensor thereof.
@@ -678,6 +695,79 @@ def Math_IPowIOp : Math_IntegerBinaryOp<"ipowi"> {
   let hasFolder = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// IsFiniteOp
+//===----------------------------------------------------------------------===//
+
+def Math_IsFiniteOp : Math_FloatClassificationOp<"isfinite"> {
+  let summary = "returns true if the operand classifies as finite";
+  let description = [{
+    Determines if the given floating-point number has finite value i.e. it
+    is normal, subnormal or zero, but not infinite or NaN.
+
+    Example:
+
+    ```mlir
+    %f = math.isfinite %a : f32
+    ```
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// IsInfOp
+//===----------------------------------------------------------------------===//
+
+def Math_IsInfOp : Math_FloatClassificationOp<"isinf"> {
+  let summary = "returns true if the operand classifies as infinite";
+  let description = [{
+    Determines if the given floating-point number is positive or negative
+    infinity.
+
+    Example:
+
+    ```mlir
+    %f = math.isinf %a : f32
+    ```
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// IsNaNOp
+//===----------------------------------------------------------------------===//
+
+def Math_IsNaNOp : Math_FloatClassificationOp<"isnan"> {
+  let summary = "returns true if the operand classifies as NaN";
+  let description = [{
+    Determines if the given floating-point number is a not-a-number (NaN)
+    value.
+
+    Example:
+
+    ```mlir
+    %f = math.isnan %a : f32
+    ```
+  }];
+}
+
+
+//===----------------------------------------------------------------------===//
+// IsNormalOp
+//===----------------------------------------------------------------------===//
+
+def Math_IsNormalOp : Math_FloatClassificationOp<"isnormal"> {
+  let summary = "returns true if the operand classifies as normal";
+  let description = [{
+    Determines if the given floating-point number is normal, i.e. is neither
+    zero, subnormal, infinite, nor NaN.
+
+    Example:
+
+    ```mlir
+    %f = math.isnormal %a : f32
+    ```
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // LogOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
index 9f7ceb11752ba..0bc2f697a7662 100644
--- a/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h
@@ -71,11 +71,13 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
         std::is_base_of<OpTrait::OneResult<SourceOp>, SourceOp>::value,
         "expected single result op");
 
+    bool isResultBool = op->getResultTypes().front().isInteger(1);
     if constexpr (!std::is_base_of<OpTrait::SameOperandsAndResultType<SourceOp>,
                                    SourceOp>::value) {
       assert(op->getNumOperands() > 0 &&
              "expected op to take at least one operand");
-      assert(op->getResultTypes().front() == op->getOperand(0).getType() &&
+      assert((op->getResultTypes().front() == op->getOperand(0).getType() ||
+              isResultBool) &&
              "expected op with same operand and result types");
     }
 
@@ -88,10 +90,13 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
     for (Value operand : adaptor.getOperands())
       castedOperands.push_back(maybeCast(operand, rewriter));
 
-    Type resultType = castedOperands.front().getType();
+    Type castedOperandType = castedOperands.front().getType();
+
+    // At ABI level, booleans are treated as i32.
+    Type resultType =
+        isResultBool ? rewriter.getIntegerType(32) : castedOperandType;
     Type funcType = getFunctionType(resultType, castedOperands);
-    StringRef funcName = getFunctionName(
-        cast<LLVM::LLVMFunctionType>(funcType).getReturnType(), op);
+    StringRef funcName = getFunctionName(castedOperandType, op);
     if (funcName.empty())
       return failure();
 
@@ -104,6 +109,20 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
       return success();
     }
 
+    // Boolean results are mapped to i32 at the ABI level with zero values being
+    // interpreted as false and non-zero values being interpreted as true. Since
+    // there is no guarantee of a specific value being used to indicate true,
+    // compare for inequality with zero (rather than truncate or shift).
+    if (isResultBool) {
+      Value zero = rewriter.create<LLVM::ConstantOp>(
+          op->getLoc(), rewriter.getIntegerType(32),
+          rewriter.getI32IntegerAttr(0));
+      Value truncated = rewriter.create<LLVM::ICmpOp>(
+          op->getLoc(), LLVM::ICmpPredicate::ne, callOp.getResult(), zero);
+      rewriter.replaceOp(op, {truncated});
+      return success();
+    }
+
     assert(callOp.getResult().getType().isF32() &&
            "only f32 types are supposed to be truncated back");
     Value truncated = rewriter.create<LLVM::FPTruncOp>(
@@ -118,7 +137,7 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
     if (!isa<Float16Type, BFloat16Type>(type))
       return operand;
 
-    // if there's a f16 function, no need to cast f16 values
+    // If there's an f16 function, no need to cast f16 values.
     if (!f16Func.empty() && isa<Float16Type>(type))
       return operand;
 
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index c1a4d31861d3b..9290279112715 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -595,6 +595,13 @@ void mlir::populateGpuToNVVMConversionPatterns(
   populateOpPatterns<math::FloorOp>(converter, patterns, "__nv_floorf",
                                     "__nv_floor");
   populateOpPatterns<math::FmaOp>(converter, patterns, "__nv_fmaf", "__nv_fma");
+  // Note: libdevice does not provide `__nv_isfinitef` as of this writing.
+  populateOpPatterns<math::IsFiniteOp>(converter, patterns, "",
+                                       "__nv_isfinited");
+  populateOpPatterns<math::IsInfOp>(converter, patterns, "__nv_isinff",
+                                    "__nv_isinfd");
+  populateOpPatterns<math::IsNaNOp>(converter, patterns, "__nv_isnanf",
+                                    "__nv_isnand");
   populateOpPatterns<math::LogOp>(converter, patterns, "__nv_logf", "__nv_log",
                                   "__nv_fast_logf");
   populateOpPatterns<math::Log10Op>(converter, patterns, "__nv_log10f",
diff --git a/mlir/lib/Dialect/Math/IR/MathOps.cpp b/mlir/lib/Dialect/Math/IR/MathOps.cpp
index 1690585e78c5d..42e357c012739 100644
--- a/mlir/lib/Dialect/Math/IR/MathOps.cpp
+++ b/mlir/lib/Dialect/Math/IR/MathOps.cpp
@@ -16,6 +16,20 @@
 using namespace mlir;
 using namespace mlir::math;
 
+//===----------------------------------------------------------------------===//
+// Common helpers
+//===----------------------------------------------------------------------===//
+
+/// Return the type of the same shape (scalar, vector or tensor) containing i1.
+static Type getI1SameShape(Type type) {
+  auto i1Type = IntegerType::get(type.getContext(), 1);
+  if (auto shapedType = llvm::dyn_cast<ShapedType>(type))
+    return shapedType.cloneWith(std::nullopt, i1Type);
+  if (llvm::isa<UnrankedTensorType>(type))
+    return UnrankedTensorType::get(i1Type);
+  return i1Type;
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd op method definitions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 9f74e0c7947e6..664a0bb0c0d5b 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1058,3 +1058,40 @@ gpu.module @test_module_53 {
     func.return %result32, %result64 : f32, f64
   }
 }
+
+gpu.module @test_module_54 {
+  // CHECK: llvm.func @__nv_isinff(f32) -> i32
+  // CHECK: llvm.func @__nv_isinfd(f64) -> i32
+  // CHECK: llvm.func @__nv_isnanf(f32) -> i32
+  // CHECK: llvm.func @__nv_isnand(f64) -> i32
+  // CHECK: llvm.func @__nv_isfinited(f64) -> i32
+  // CHECK-LABEL: @fpclassify
+  func.func @fpclassify(%f32: f32, %f64: f64) -> (i1, i1, i1, i1, i1, i1) {
+    // CHECK: %[[INFF:.+]] = llvm.call @__nv_isinff(%{{.*}}) : (f32) -> i32
+    // CHECK: %[[ZERO:.+]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK: %[[R0:.+]] = llvm.icmp "ne" %[[INFF]], %[[ZERO]]
+    %0 = math.isinf %f32 : f32
+    // CHECK: llvm.call @__nv_isinfd(%{{.*}}) : (f64) -> i32
+    // CHECK: llvm.mlir.constant(0
+    // CHECK: llvm.icmp "ne"
+    %1 = math.isinf %f64 : f64
+    // CHECK: llvm.call @__nv_isnanf(%{{.*}}) : (f32) -> i32
+    // CHECK: llvm.mlir.constant(0
+    // CHECK: llvm.icmp "ne"
+    %2 = math.isnan %f32 : f32
+    // CHECK: llvm.call @__nv_isnand(%{{.*}}) : (f64) -> i32
+    // CHECK: llvm.mlir.constant(0
+    // CHECK: llvm.icmp "ne"
+    %3 = math.isnan %f64 : f64
+    // Note: for some reason, libdevice does not provide isfinite for f32, so
+    // this should fail to convert.
+    // CHECK: math.isfinite {{.*}} : f32
+    %4 = math.isfinite %f32 : f32
+    // CHECK: llvm.call @__nv_isfinited(%{{.*}}) : (f64) -> i32
+    // CHECK: llvm.mlir.constant(0
+    // CHECK: llvm.icmp "ne"
+    %5 = math.isfinite %f64 : f64
+    // CHECK: llvm.return %[[R0]]
+    return %0, %1, %2, %3, %4, %5 : i1, i1, i1, i1, i1, i1
+  }
+}
diff --git a/mlir/test/Dialect/Math/ops.mlir b/mlir/test/Dialect/Math/ops.mlir
index 7e45d9bc6f74a..8feadedd1860e 100644
--- a/mlir/test/Dialect/Math/ops.mlir
+++ b/mlir/test/Dialect/Math/ops.mlir
@@ -298,3 +298,42 @@ func.func @fastmath(%f: f32, %i: i32, %v: vector<4xf32>, %t: tensor<4x4x?xf32>)
   %4 = math.fpowi %f, %i fastmath<fast> : f32, i32
   return
 }
+
+// CHECK-LABEL: func @fpclassify(
+// CHECK-SAME:    %[[F:.+]]: f32, %[[D:.+]]: f64,
+// CHECK-SAME:    %[[V:.+]]: vector<4xf32>, %[[T:.+]]: tensor<4x?xf32>
+func.func @fpclassify(%f: f32, %d: f64, %v: vector<4xf32>, %t: tensor<4x?xf32>) {
+  // CHECK: math.isfinite %[[F]] : f32
+  // CHECK: math.isfinite %[[D]] : f64
+  // CHECK: math.isfinite %[[V]] : vector<4xf32>
+  // CHECK: math.isfinite %[[T]] : tensor<4x?xf32>
+  math.isfinite %f : f32
+  math.isfinite %d : f64
+  math.isfinite %v : vector<4xf32>
+  math.isfinite %t : tensor<4x?xf32>
+  // CHECK: math.isinf %[[F]] : f32
+  // CHECK: math.isinf %[[D]] : f64
+  // CHECK: math.isinf %[[V]] : vector<4xf32>
+  // CHECK: math.isinf %[[T]] : tensor<4x?xf32>
+  math.isinf %f : f32
+  math.isinf %d : f64
+  math.isinf %v : vector<4xf32>
+  math.isinf %t : tensor<4x?xf32>
+  // CHECK: math.isnan %[[F]] : f32
+  // CHECK: math.isnan %[[D]] : f64
+  // CHECK: math.isnan %[[V]] : vector<4xf32>
+  // CHECK: math.isnan %[[T]] : tensor<4x?xf32>
+  math.isnan %f : f32
+  math.isnan %d : f64
+  math.isnan %v : vector<4xf32>
+  math.isnan %t : tensor<4x?xf32>
+  // CHECK: math.isnormal %[[F]] : f32
+  // CHECK: math.isnormal %[[D]] : f64
+  // CHECK: math.isnormal %[[V]] : vector<4xf32>
+  // CHECK: math.isnormal %[[T]] : tensor<4x?xf32>
+  math.isnormal %f : f32
+  math.isnormal %d : f64
+  math.isnormal %v : vector<4xf32>
+  math.isnormal %t : tensor<4x?xf32>
+  return
+}

From c01c3cf7d75ce23c165224b5470b9132f53e95c2 Mon Sep 17 00:00:00 2001
From: hstk30-hw <hanwei62@huawei.com>
Date: Sun, 16 Feb 2025 22:09:05 +0800
Subject: [PATCH 059/109] [ARM] Fix MRC cp10 and cp11 warning (#126407)

The MRC format is
    MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1,
        c_imm:$CRn, c_imm:$CRm, 0, pred:$p
The $cop is the second operand.
Patch for
https://github.com/llvm/llvm-project/commit/b7d41a11cd31388e8b542b2d881f5c9d7130b95e.
---
 llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp |  4 ++--
 llvm/test/MC/ARM/diagnostics.s                       | 10 +++++++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 01a271327049f..e1af3150a8403 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -77,8 +77,8 @@ static bool getMCRDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
 static bool getMRCDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
                                   std::string &Info) {
   if (STI.hasFeature(llvm::ARM::HasV7Ops) &&
-      ((MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 10) ||
-       (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 11))) {
+      ((MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 10) ||
+       (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 11))) {
     Info = "since v7, cp10 and cp11 are reserved for advanced SIMD or floating "
            "point instructions";
     return true;
diff --git a/llvm/test/MC/ARM/diagnostics.s b/llvm/test/MC/ARM/diagnostics.s
index fa23a7da1e404..29feee99cf351 100644
--- a/llvm/test/MC/ARM/diagnostics.s
+++ b/llvm/test/MC/ARM/diagnostics.s
@@ -172,9 +172,13 @@
 
         @ p10 and p11 are reserved for NEON
         mcr p10, #2, r5, c1, c1, #4
-        mcrr p11, #8, r5, r4, c1
-@ CHECK-WARN: warning: since v7, cp10 and cp11 are reserved for advanced SIMD or floating point instructions
-@ CHECK-WARN: warning: since v7, cp10 and cp11 are reserved for advanced SIMD or floating point instructions
+        mcr p11, #2, r5, c1, c1, #4
+        mrc p10, #7, r5, c1, c1, #0
+        mrc p11, #7, r5, c1, c1, #0
+@ CHECK-ERROR-V7: warning: since v7, cp10 and cp11 are reserved for advanced SIMD or floating point instructions
+@ CHECK-ERROR-V7: warning: since v7, cp10 and cp11 are reserved for advanced SIMD or floating point instructions
+@ CHECK-ERROR-V7: warning: since v7, cp10 and cp11 are reserved for advanced SIMD or floating point instructions
+@ CHECK-ERROR-V7: warning: since v7, cp10 and cp11 are reserved for advanced SIMD or floating point instructions
 
         @ Out of range immediate for MOV
         movw r9, 0x10000

From 17d508f30d897e7392e48885850c9595546e342c Mon Sep 17 00:00:00 2001
From: Andreas Jonson <andjo403@hotmail.com>
Date: Sun, 16 Feb 2025 15:27:38 +0100
Subject: [PATCH 060/109] [InstCombine] Test foldSelectICmpAnd with cast. (NFC)

---
 .../Transforms/InstCombine/select-icmp-and.ll | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/select-icmp-and.ll b/llvm/test/Transforms/InstCombine/select-icmp-and.ll
index 7c95fc125ce7b..516a1e8496b43 100644
--- a/llvm/test/Transforms/InstCombine/select-icmp-and.ll
+++ b/llvm/test/Transforms/InstCombine/select-icmp-and.ll
@@ -388,6 +388,37 @@ define i32 @test15e_extra_use(i32 %X) {
   ret i32 %t3
 }
 
+;; (a & 128) ? 256 : 0
+define i32 @test15e_zext(i8 %X) {
+; CHECK-LABEL: @test15e_zext(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[X:%.*]], -128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[T3:%.*]] = shl nuw nsw i32 [[TMP2]], 1
+; CHECK-NEXT:    ret i32 [[T3]]
+;
+  %t1 = and i8 %X, 128
+  %t2 = icmp ne i8 %t1, 0
+  %t3 = select i1 %t2, i32 256, i32 0
+  ret i32 %t3
+}
+
+;; (a & 128) ? 256 : 0
+define i32 @test15e_zext_extra_use(i8 %X) {
+; CHECK-LABEL: @test15e_zext_extra_use(
+; CHECK-NEXT:    [[T2:%.*]] = icmp slt i8 [[X:%.*]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[X]], -128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[T3:%.*]] = shl nuw nsw i32 [[TMP2]], 1
+; CHECK-NEXT:    call void @use1(i1 [[T2]])
+; CHECK-NEXT:    ret i32 [[T3]]
+;
+  %t1 = and i8 %X, 128
+  %t2 = icmp ne i8 %t1, 0
+  %t3 = select i1 %t2, i32 256, i32 0
+  call void @use1(i1 %t2)
+  ret i32 %t3
+}
+
 ;; (a & 128) ? 0 : 256
 define i32 @test15f(i32 %X) {
 ; CHECK-LABEL: @test15f(
@@ -419,6 +450,21 @@ define i32 @test15f_extra_use(i32 %X) {
   ret i32 %t3
 }
 
+;; (a & 128) ? 0 : 256
+define i16 @test15f_trunc(i32 %X) {
+; CHECK-LABEL: @test15f_trunc(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i16 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = and i16 [[TMP2]], 256
+; CHECK-NEXT:    [[T3:%.*]] = xor i16 [[TMP3]], 256
+; CHECK-NEXT:    ret i16 [[T3]]
+;
+  %t1 = and i32 %X, 128
+  %t2 = icmp ne i32 %t1, 0
+  %t3 = select i1 %t2, i16 0, i16 256
+  ret i16 %t3
+}
+
 ;; (a & 8) ? -1 : -9
 define i32 @test15g(i32 %X) {
 ; CHECK-LABEL: @test15g(

From b55f7512a76f2358000139074c79d4c2521588de Mon Sep 17 00:00:00 2001
From: Michael Kruse <llvm-project@meinersbur.de>
Date: Sun, 16 Feb 2025 15:39:52 +0100
Subject: [PATCH 061/109] [Flang] LLVM_ENABLE_RUNTIMES=flang-rt (#110217)

Extract Flang's runtime library to use the LLVM_ENABLE_RUNTIMES
mechanism. It will only become active when
`LLVM_ENABLE_RUNTIMES=flang-rt` is used, which also changes the
`FLANG_INCLUDE_RUNTIME` to `OFF` so the old runtime build rules do not
conflict. This also means that unless `LLVM_ENABLE_RUNTIMES=flang-rt` is
passed, nothing changes with the current build process.

Motivation:
* Consistency with LLVM's other runtime libraries (compiler-rt, libc,
libcxx, openmp offload, ...)
* Allows compiling the runtime for multiple targets at once using the
LLVM_RUNTIME_TARGETS configuration options
* Installs the runtime into the compiler's per-target resource directory
so it can be automatically found even when cross-compiling

Also see RFC discussion at
https://discourse.llvm.org/t/rfc-use-llvm-enable-runtimes-for-flangs-runtime/80826
---
 flang-rt/.clang-tidy                          |   2 +
 flang-rt/CMakeLists.txt                       | 261 ++++++++++++++++++
 flang-rt/CODE_OWNERS.TXT                      |  14 +
 flang-rt/LICENSE.TXT                          | 234 ++++++++++++++++
 flang-rt/README.md                            | 188 +++++++++++++
 flang-rt/cmake/config.h.cmake.in              |   5 +
 flang-rt/cmake/modules/AddFlangRT.cmake       | 199 +++++++++++++
 .../cmake/modules/AddFlangRTOffload.cmake     | 100 +++++++
 flang-rt/cmake/modules/GetToolchainDirs.cmake | 125 +++++++++
 flang-rt/examples/CMakeLists.txt              |   9 +
 .../ExternalHelloWorld/CMakeLists.txt         |  17 ++
 flang-rt/lib/CMakeLists.txt                   |  17 ++
 flang-rt/lib/Testing/CMakeLists.txt           |  20 ++
 flang-rt/lib/cuda/CMakeLists.txt              |  34 +++
 flang-rt/lib/quadmath/CMakeLists.txt          | 136 +++++++++
 flang-rt/lib/runtime/CMakeLists.txt           | 215 +++++++++++++++
 flang-rt/test/CMakeLists.txt                  |  59 ++++
 flang-rt/test/Driver/ctofortran.f90           |  29 +-
 flang-rt/test/Driver/exec.f90                 |   8 +-
 flang-rt/test/NonGtestUnit/lit.cfg.py         |  22 ++
 flang-rt/test/NonGtestUnit/lit.site.cfg.py.in |  14 +
 flang-rt/test/Runtime/no-cpp-dep.c            |   5 +-
 flang-rt/test/Unit/lit.cfg.py                 |  21 ++
 flang-rt/test/Unit/lit.site.cfg.py.in         |  15 +
 flang-rt/test/lit.cfg.py                      | 100 +++++++
 flang-rt/test/lit.site.cfg.py.in              |  19 ++
 flang-rt/unittests/CMakeLists.txt             | 105 +++++++
 flang-rt/unittests/Evaluate/CMakeLists.txt    |  21 ++
 flang-rt/unittests/Runtime/CMakeLists.txt     |  48 ++++
 .../unittests/Runtime/CUDA/CMakeLists.txt     |  18 ++
 flang/CMakeLists.txt                          |  54 ++--
 flang/cmake/modules/FlangCommon.cmake         |  43 +++
 flang/docs/GettingStarted.md                  | 106 +++----
 flang/docs/ReleaseNotes.md                    |   6 +
 flang/test/lit.cfg.py                         |  20 --
 flang/test/lit.site.cfg.py.in                 |   3 -
 llvm/CMakeLists.txt                           |   8 +-
 .../modules/LLVMExternalProjectUtils.cmake    |  16 +-
 llvm/projects/CMakeLists.txt                  |   4 +-
 llvm/runtimes/CMakeLists.txt                  |  25 +-
 runtimes/CMakeLists.txt                       |   2 +-
 41 files changed, 2207 insertions(+), 140 deletions(-)
 create mode 100644 flang-rt/.clang-tidy
 create mode 100644 flang-rt/CMakeLists.txt
 create mode 100644 flang-rt/CODE_OWNERS.TXT
 create mode 100644 flang-rt/LICENSE.TXT
 create mode 100644 flang-rt/README.md
 create mode 100644 flang-rt/cmake/modules/AddFlangRT.cmake
 create mode 100644 flang-rt/cmake/modules/AddFlangRTOffload.cmake
 create mode 100644 flang-rt/cmake/modules/GetToolchainDirs.cmake
 create mode 100644 flang-rt/examples/CMakeLists.txt
 create mode 100644 flang-rt/examples/ExternalHelloWorld/CMakeLists.txt
 create mode 100644 flang-rt/lib/CMakeLists.txt
 create mode 100644 flang-rt/lib/Testing/CMakeLists.txt
 create mode 100644 flang-rt/lib/cuda/CMakeLists.txt
 create mode 100644 flang-rt/lib/quadmath/CMakeLists.txt
 create mode 100644 flang-rt/lib/runtime/CMakeLists.txt
 create mode 100644 flang-rt/test/CMakeLists.txt
 create mode 100644 flang-rt/test/NonGtestUnit/lit.cfg.py
 create mode 100644 flang-rt/test/NonGtestUnit/lit.site.cfg.py.in
 create mode 100644 flang-rt/test/Unit/lit.cfg.py
 create mode 100644 flang-rt/test/Unit/lit.site.cfg.py.in
 create mode 100644 flang-rt/test/lit.cfg.py
 create mode 100644 flang-rt/test/lit.site.cfg.py.in
 create mode 100644 flang-rt/unittests/CMakeLists.txt
 create mode 100644 flang-rt/unittests/Evaluate/CMakeLists.txt
 create mode 100644 flang-rt/unittests/Runtime/CMakeLists.txt
 create mode 100644 flang-rt/unittests/Runtime/CUDA/CMakeLists.txt
 create mode 100644 flang/cmake/modules/FlangCommon.cmake

diff --git a/flang-rt/.clang-tidy b/flang-rt/.clang-tidy
new file mode 100644
index 0000000000000..ee3a0ab2201bf
--- /dev/null
+++ b/flang-rt/.clang-tidy
@@ -0,0 +1,2 @@
+Checks: '-llvm-include-order,readability-braces-around-statements,-readability-identifier-naming,-clang-diagnostic-*'
+InheritParentConfig: true
diff --git a/flang-rt/CMakeLists.txt b/flang-rt/CMakeLists.txt
new file mode 100644
index 0000000000000..df35e24ec28a7
--- /dev/null
+++ b/flang-rt/CMakeLists.txt
@@ -0,0 +1,261 @@
+#===-- CMakeLists.txt ------------------------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+#
+# Build instructions for the flang-rt library. This file is intended to be
+# included using the LLVM_ENABLE_RUNTIMES mechanism.
+#
+#===------------------------------------------------------------------------===#
+
+if (NOT LLVM_RUNTIMES_BUILD)
+  message(FATAL_ERROR "Use this CMakeLists.txt from LLVM's runtimes build system.
+      Example:
+        cmake <llvm-project>/runtimes -DLLVM_ENABLE_RUNTIMES=flang-rt
+    ")
+endif ()
+
+set(LLVM_SUBPROJECT_TITLE "Flang-RT")
+set(FLANG_RT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
+set(FLANG_RT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+set(FLANG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../flang")
+
+# CMake 3.24 is the first version of CMake that directly recognizes Flang.
+# LLVM only requires CMake 3.20, so teach CMake 3.20-3.23 how to use Flang.
+if (CMAKE_VERSION VERSION_LESS "3.24")
+  cmake_path(GET CMAKE_Fortran_COMPILER STEM _Fortran_COMPILER_STEM)
+  if (_Fortran_COMPILER_STEM STREQUAL "flang-new" OR _Fortran_COMPILER_STEM STREQUAL "flang")
+    include(CMakeForceCompiler)
+    CMAKE_FORCE_Fortran_COMPILER("${CMAKE_Fortran_COMPILER}" "LLVMFlang")
+
+    set(CMAKE_Fortran_COMPILER_ID "LLVMFlang")
+    set(CMAKE_Fortran_COMPILER_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}")
+
+    set(CMAKE_Fortran_SUBMODULE_SEP "-")
+    set(CMAKE_Fortran_SUBMODULE_EXT ".mod")
+
+    set(CMAKE_Fortran_PREPROCESS_SOURCE
+      "<CMAKE_Fortran_COMPILER> -cpp <DEFINES> <INCLUDES> <FLAGS> -E <SOURCE> > <PREPROCESSED_SOURCE>")
+
+    set(CMAKE_Fortran_FORMAT_FIXED_FLAG "-ffixed-form")
+    set(CMAKE_Fortran_FORMAT_FREE_FLAG "-ffree-form")
+
+    set(CMAKE_Fortran_MODDIR_FLAG "-module-dir")
+
+    set(CMAKE_Fortran_COMPILE_OPTIONS_PREPROCESS_ON "-cpp")
+    set(CMAKE_Fortran_COMPILE_OPTIONS_PREPROCESS_OFF "-nocpp")
+    set(CMAKE_Fortran_POSTPROCESS_FLAG "-ffixed-line-length-72")
+
+    set(CMAKE_Fortran_COMPILE_OPTIONS_TARGET "--target=")
+
+    set(CMAKE_Fortran_LINKER_WRAPPER_FLAG "-Wl,")
+    set(CMAKE_Fortran_LINKER_WRAPPER_FLAG_SEP ",")
+  endif ()
+endif ()
+enable_language(Fortran)
+
+
+list(APPEND CMAKE_MODULE_PATH
+    "${FLANG_RT_SOURCE_DIR}/cmake/modules"
+    "${FLANG_SOURCE_DIR}/cmake/modules"
+  )
+include(AddFlangRT)
+include(GetToolchainDirs)
+include(FlangCommon)
+include(HandleCompilerRT)
+include(ExtendPath)
+
+
+############################
+# Build Mode Introspection #
+############################
+
+# Determine whether we are in the runtimes/runtimes-bins directory of a
+# bootstrap build.
+set(LLVM_TREE_AVAILABLE OFF)
+if (LLVM_LIBRARY_OUTPUT_INTDIR AND LLVM_RUNTIME_OUTPUT_INTDIR AND PACKAGE_VERSION)
+  set(LLVM_TREE_AVAILABLE ON)
+endif()
+
+# Path to LLVM development tools (FileCheck, llvm-lit, not, ...)
+set(LLVM_TOOLS_DIR "${LLVM_BINARY_DIR}/bin")
+
+# Determine build and install paths.
+# The build path is absolute, but the install dir is relative; CMake's install
+# command has to apply CMAKE_INSTALL_PREFIX itself.
+get_toolchain_library_subdir(toolchain_lib_subdir)
+if (LLVM_TREE_AVAILABLE)
+  # In a bootstrap build emit the libraries into a default search path in the
+  # build directory of the just-built compiler. This allows using the
+  # just-built compiler without specifying paths to runtime libraries.
+  #
+  # Despite Clang in the name, get_clang_resource_dir does not depend on Clang
+  # being added to the build. Flang uses the same resource dir as clang.
+  include(GetClangResourceDir)
+  get_clang_resource_dir(FLANG_RT_OUTPUT_RESOURCE_DIR PREFIX "${LLVM_LIBRARY_OUTPUT_INTDIR}/..")
+  get_clang_resource_dir(FLANG_RT_INSTALL_RESOURCE_PATH)
+
+  extend_path(FLANG_RT_OUTPUT_RESOURCE_LIB_DIR "${FLANG_RT_OUTPUT_RESOURCE_DIR}" "${toolchain_lib_subdir}")
+else ()
+  # In a standalone runtimes build, do not write into LLVM_BINARY_DIR. It may be
+  # read-only and/or shared by multiple runtimes with different build
+  # configurations (e.g. Debug/Release). Use the runtime's own lib dir like any
+  # non-toolchain library.
+  # For the install prefix, still use the resource dir assuming that Flang will
+  # be installed there using the same prefix. This is to not have a difference
+  # between bootstrap and standalone runtimes builds.
+  set(FLANG_RT_OUTPUT_RESOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+  set(FLANG_RT_INSTALL_RESOURCE_PATH "lib${LLVM_LIBDIR_SUFFIX}/clang/${LLVM_VERSION_MAJOR}")
+
+  extend_path(FLANG_RT_OUTPUT_RESOURCE_LIB_DIR "${FLANG_RT_OUTPUT_RESOURCE_DIR}" "lib${LLVM_LIBDIR_SUFFIX}")
+endif ()
+extend_path(FLANG_RT_INSTALL_RESOURCE_LIB_PATH "${FLANG_RT_INSTALL_RESOURCE_PATH}" "${toolchain_lib_subdir}")
+cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_DIR)
+cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_PATH)
+cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_LIB_DIR)
+cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_LIB_PATH)
+
+
+#################
+# Build Options #
+#################
+
+# Important: flang-rt user options must be prefixed with "FLANG_RT_". Variables
+# with this prefix will be forwarded in bootstrap builds.
+
+option(FLANG_RT_INCLUDE_TESTS "Generate build targets for the flang-rt unit and regression-tests." "${LLVM_INCLUDE_TESTS}")
+
+
+set(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT "" CACHE STRING "Compile Flang-RT with GPU support (CUDA or OpenMP)")
+set_property(CACHE FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT PROPERTY STRINGS
+    ""
+    CUDA
+    OpenMP
+  )
+if (NOT FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT)
+  # Support for GPUs disabled
+elseif (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
+  # Support for CUDA
+  set(FLANG_RT_LIBCUDACXX_PATH "" CACHE PATH "Path to libcu++ package installation")
+  option(FLANG_RT_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS "Do not compile global variables' definitions when producing PTX library" OFF)
+elseif (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "OpenMP")
+  # Support for OpenMP offloading
+  set(FLANG_RT_DEVICE_ARCHITECTURES "all" CACHE STRING
+      "List of OpenMP device architectures to be used to compile the Fortran runtime (e.g. 'gfx1103;sm_90')"
+    )
+
+  if (FLANG_RT_DEVICE_ARCHITECTURES STREQUAL "all")
+    # TODO: support auto detection on the build system.
+    set(all_amdgpu_architectures
+      "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
+      "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030"
+      "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036"
+      "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151"
+      "gfx1152;gfx1153")
+    set(all_nvptx_architectures
+      "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
+      "sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90")
+    set(all_gpu_architectures
+      "${all_amdgpu_architectures};${all_nvptx_architectures}")
+      set(FLANG_RT_DEVICE_ARCHITECTURES ${all_gpu_architectures})
+  endif()
+  list(REMOVE_DUPLICATES FLANG_RT_DEVICE_ARCHITECTURES)
+else ()
+  message(FATAL_ERROR "Invalid value '${FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT}' for FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT; must be empty, 'CUDA', or 'OpenMP'")
+endif ()
+
+
+option(FLANG_RT_INCLUDE_CUF "Build the CUDA Fortran runtime (libflang_rt.cuda.a)" OFF)
+if (FLANG_RT_INCLUDE_CUF)
+  find_package(CUDAToolkit REQUIRED)
+endif()
+
+
+########################
+# System Introspection #
+########################
+
+include(CheckCXXSymbolExists)
+include(CheckCXXSourceCompiles)
+check_cxx_symbol_exists(strerror_r string.h HAVE_STRERROR_R)
+# Can't use symbol exists here as the function is overloaded in C++
+check_cxx_source_compiles(
+  "#include <string.h>
+   int main() {
+     char buf[4096];
+     return strerror_s(buf, 4096, 0);
+   }
+  "
+  HAVE_DECL_STRERROR_S)
+
+
+# Search for clang_rt.builtins library. Needed in addition to msvcrt.
+if (WIN32)
+  find_compiler_rt_library(builtins FLANG_RT_BUILTINS_LIBRARY)
+endif ()
+
+
+# Check whether the compiler can undefine a macro using the "-U" flag.
+# Alternatively, we could use
+#   CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU"
+# but some older versions of CMake don't define it for GCC itself.
+check_cxx_compiler_flag("-UTESTFLAG" FLANG_RT_SUPPORTS_UNDEFINE_FLAG)
+
+# Check whether -fno-lto is supported.
+check_cxx_compiler_flag(-fno-lto FLANG_RT_HAS_FNO_LTO_FLAG)
+
+
+# function checks
+find_package(Backtrace)
+set(HAVE_BACKTRACE ${Backtrace_FOUND})
+set(BACKTRACE_HEADER ${Backtrace_HEADER})
+
+
+#####################
+# Build Preparation #
+#####################
+
+if (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT AND FLANG_RT_INCLUDE_TESTS)
+  # If Fortran runtime is built as CUDA library, the linking
+  # of targets that link flang-rt must be done
+  # with CUDA_RESOLVE_DEVICE_SYMBOLS.
+  # CUDA language must be enabled for CUDA_RESOLVE_DEVICE_SYMBOLS
+  # to take effect.
+  enable_language(CUDA)
+endif()
+
+
+# C++17 is required for flang-rt; user or other runtimes may override this.
+# GTest included later also requires C++17.
+set(CMAKE_CXX_STANDARD 17 CACHE STRING "C++ standard to conform to")
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+
+configure_file(cmake/config.h.cmake.in config.h)
+
+
+# The bootstrap build will create a phony target with the same name as the
+# top-level directory ("flang-rt") and delegate it to the runtimes build dir.
+# AddFlangRT will add all non-EXCLUDE_FROM_ALL targets to it.
+add_custom_target(flang-rt)
+
+
+###################
+# Build Artifacts #
+###################
+
+add_subdirectory(lib)
+
+if (LLVM_INCLUDE_EXAMPLES)
+  add_subdirectory(examples)
+endif ()
+
+if (FLANG_RT_INCLUDE_TESTS)
+  add_subdirectory(unittests)
+  add_subdirectory(test)
+else ()
+  add_custom_target(check-flang-rt)
+endif()
diff --git a/flang-rt/CODE_OWNERS.TXT b/flang-rt/CODE_OWNERS.TXT
new file mode 100644
index 0000000000000..649243aa1e8fe
--- /dev/null
+++ b/flang-rt/CODE_OWNERS.TXT
@@ -0,0 +1,14 @@
+This file is a list of the people responsible for ensuring that patches for a
+particular part of Flang are reviewed, either by themself or by someone else.
+They are also the gatekeepers for their part of Flang, with the final word on
+what goes in or not.
+
+The list is sorted by surname and formatted to allow easy grepping and
+beautification by scripts. The fields are: name (N), email (E), web-address
+(W), PGP key ID and fingerprint (P), description (D), snail-mail address
+(S) and (I) IRC handle. Each entry should contain at least the (N), (E) and
+(D) fields.
+
+N: Steve Scalpone
+E: sscalpone@nvidia.com
+D: Anything not covered by others
diff --git a/flang-rt/LICENSE.TXT b/flang-rt/LICENSE.TXT
new file mode 100644
index 0000000000000..53bb2e7fbc764
--- /dev/null
+++ b/flang-rt/LICENSE.TXT
@@ -0,0 +1,234 @@
+==============================================================================
+The LLVM Project is under the Apache License v2.0 with LLVM Exceptions:
+==============================================================================
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+
+---- LLVM Exceptions to the Apache 2.0 License ----
+
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
+
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
+
+==============================================================================
+Software from third parties included in the LLVM Project:
+==============================================================================
+The LLVM Project contains third party software which is under different license
+terms. All such code will be identified clearly using at least one of two
+mechanisms:
+1) It will be in a separate directory tree with its own `LICENSE.txt` or
+   `LICENSE` file at the top containing the specific license and restrictions
+   which apply to that software, or
+2) It will contain specific license and restriction terms at the top of every
+   file.
diff --git a/flang-rt/README.md b/flang-rt/README.md
new file mode 100644
index 0000000000000..aba64e2453872
--- /dev/null
+++ b/flang-rt/README.md
@@ -0,0 +1,188 @@
+<!--===- README.md
+
+   Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+   See https://llvm.org/LICENSE.txt for license information.
+   SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+-->
+
+# Fortran Runtime (Flang-RT)
+
+Flang-RT is the runtime library for code emitted by the Flang compiler
+(https://flang.llvm.org).
+
+
+## Getting Started
+
+There are two build modes for Flang-RT: the bootstrap build, also
+called the in-tree build, and the runtime-only build, also called the
+out-of-tree build.
+These terms are not to be confused with the terms
+[in-source and out-of-source](https://cmake.org/cmake/help/latest/manual/cmake.1.html#introduction-to-cmake-buildsystems)
+builds as defined by CMake. In an in-source build, the source directory and the
+build directory are identical, whereas with an out-of-source build the
+build artifacts are stored somewhere else, possibly in a subdirectory of the
+source directory. LLVM does not support in-source builds.
+
+
+### Requirements
+
+Requirements:
+  * [Same as LLVM](https://llvm.org/docs/GettingStarted.html#requirements).
+
+
+### Bootstrapping Runtimes Build
+
+The bootstrapping build will first build Clang and Flang, then use these
+compilers to compile Flang-RT. CMake will create a secondary build tree
+configured to use these just-built compilers. The secondary build will reuse
+the same build options (Flags, Debug/Release, ...) as the primary build.
+It will also ensure that once built, Flang-RT is found by Flang from either
+the build- or install-prefix. To enable, add `flang-rt` to
+`LLVM_ENABLE_RUNTIMES`:
+
+```bash
+cmake -S <path-to-llvm-project-source>/llvm \
+  -GNinja                                   \
+  -DLLVM_ENABLE_PROJECTS="clang;flang"      \
+  -DLLVM_ENABLE_RUNTIMES=flang-rt           \
+  ...
+```
+
+It is recommended to enable building OpenMP alongside Flang and Flang-RT
+as well. This will build `omp_lib.mod` required to use OpenMP from Fortran.
+Building Compiler-RT may also be required, particularly on platforms that do
+not provide all C-ABI functionality (such as Windows).
+
+```bash
+cmake -S <path-to-llvm-project-source>/llvm     \
+  -GNinja                                       \
+  -DCMAKE_BUILD_TYPE=Release                    \
+  -DLLVM_ENABLE_PROJECTS="clang;flang;openmp"   \
+  -DLLVM_ENABLE_RUNTIMES="compiler-rt;flang-rt" \
+  ...
+```
+
+By default, the enabled runtimes will only be built for the host platform
+(`-DLLVM_RUNTIME_TARGETS=default`). To add additional targets to support
+cross-compilation via `flang --target=<target-triple>`, add more triples to
+`LLVM_RUNTIME_TARGETS`, such as
+`-DLLVM_RUNTIME_TARGETS="default;aarch64-linux-gnu"`.
+
+After configuration, build, test, and install the runtime(s) via
+
+```shell
+$ ninja flang-rt
+$ ninja check-flang-rt
+$ ninja install
+```
+
+
+### Standalone Runtimes Build
+
+Instead of building Clang and Flang from scratch, the standalone Runtime build
+uses CMake's environment introspection to find a C, C++, and Fortran compiler.
+The compiler to be used can be controlled using CMake's standard mechanisms such
+as `CMAKE_C_COMPILER`, `CMAKE_CXX_COMPILER`, and `CMAKE_Fortran_COMPILER`.
+`CMAKE_Fortran_COMPILER` must be `flang` built from the same Git commit as
+Flang-RT to ensure they are using the same ABI. The C and C++ compiler
+can be any compiler supporting the same ABI.
+
+In addition to the compiler, the build must be able to find LLVM development
+tools such as `lit` and `FileCheck` that are not found in an LLVM install
+directory. Use `LLVM_BINARY_DIR` to point to the directory where LLVM has
+been built. A simple build configuration might look like the following:
+
+```bash
+cmake -S <path-to-llvm-project-source>/runtimes              \
+  -GNinja                                                    \
+  -DLLVM_BINARY_DIR=<path-to-llvm-builddir>                  \
+  -DCMAKE_Fortran_COMPILER=<path-to-llvm-builddir>/bin/flang \
+  -DCMAKE_Fortran_COMPILER_WORKS=yes                         \
+  -DLLVM_ENABLE_RUNTIMES=flang-rt                            \
+  ...
+```
+
+The `CMAKE_Fortran_COMPILER_WORKS` parameter must be set because otherwise CMake
+will test whether the Fortran compiler can compile and link programs which will
+obviously fail without a runtime library available yet.
+
+When building Flang-RT for a cross-compilation target, the target triple can
+be selected using `LLVM_DEFAULT_TARGET_TRIPLE` and `LLVM_RUNTIMES_TARGET`.
+Of course, Flang-RT can be built multiple times with different build
+configurations, but the resulting libraries have to be located manually when
+used with the Flang driver, using the `-L` option.
+
+After configuration, build, test, and install the runtime via
+
+```shell
+$ ninja
+$ ninja check-flang-rt
+$ ninja install
+```
+
+
+## Configuration Option Reference
+
+Flang-RT has the following configuration options. This is in
+addition to the build options the LLVM_ENABLE_RUNTIMES mechanism and
+CMake itself provide.
+
+ * `FLANG_RT_INCLUDE_TESTS` (boolean; default: `ON`)
+
+   When `OFF`, does not add any tests and unittests. The `check-flang-rt`
+   build target will do nothing.
+
+ * `FLANG_RUNTIME_F128_MATH_LIB` (default: `""`)
+
+   Determines the implementation of `REAL(16)` math functions. If set to
+   `libquadmath`, uses `quadmath.h` and `-lquadmath` typically distributed with
+   gcc. If empty, disables `REAL(16)` support. For any other value, introspects
+   the compiler for `__float128` or 128-bit `long double` support.
+   [More details](docs/Real16MathSupport.md).
+
+ * `FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT` (values: `"CUDA"`, `"OpenMP"`, `""`; default: `""`)
+
+   When set to `CUDA`, builds Flang-RT with experimental support for GPU
+   accelerators using CUDA. `CMAKE_CUDA_COMPILER` must be set if not
+   automatically detected by CMake. `nvcc` as well as `clang` are supported.
+
+   When set to `OpenMP`, builds Flang-RT with experimental support for
+   GPU accelerators using OpenMP offloading. Only Clang is supported for
+   `CMAKE_C_COMPILER` and `CMAKE_CXX_COMPILER`.
+
+ * `FLANG_RT_INCLUDE_CUF` (bool, default: `OFF`)
+
+   Compiles the `libflang_rt.cuda_<CUDA-version>.a/.so` library. This is
+   independent of `FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=CUDA` and only
+   requires a
+   [CUDA Toolkit installation](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html)
+   (no `CMAKE_CUDA_COMPILER`).
+
+
+### Experimental CUDA Support
+
+With `-DFLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=CUDA`, the following
+additional configuration options become available.
+
+ * `FLANG_RT_LIBCUDACXX_PATH` (path, default: `""`)
+
+   Path to libcu++ package installation.
+
+ * `FLANG_RT_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS` (boolean, default: `OFF`)
+
+   Do not compile global variables' definitions when producing PTX library.
+   Default is `OFF`, meaning global variable definitions are compiled by
+   default.
+
+
+### Experimental OpenMP Offload Support
+
+With `-DFLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=OpenMP`, the following
+additional configuration options become available.
+
+ * `FLANG_RT_DEVICE_ARCHITECTURES` (default: `"all"`)
+
+   A list of device architectures that Flang-RT is going to support.
+   If `"all"`, uses a pre-defined list of architectures. Same purpose as
+   `LIBOMPTARGET_DEVICE_ARCHITECTURES` from liboffload.
diff --git a/flang-rt/cmake/config.h.cmake.in b/flang-rt/cmake/config.h.cmake.in
index 8a4668b90addd..84c234d86bae6 100644
--- a/flang-rt/cmake/config.h.cmake.in
+++ b/flang-rt/cmake/config.h.cmake.in
@@ -16,4 +16,9 @@
    don't. */
 #cmakedefine01 HAVE_DECL_STRERROR_S
 
+/* Define to 1 if you have the `backtrace' function. */
+#cmakedefine HAVE_BACKTRACE ${HAVE_BACKTRACE}
+
+#define BACKTRACE_HEADER <${BACKTRACE_HEADER}>
+
 #endif
diff --git a/flang-rt/cmake/modules/AddFlangRT.cmake b/flang-rt/cmake/modules/AddFlangRT.cmake
new file mode 100644
index 0000000000000..630aeb3c65005
--- /dev/null
+++ b/flang-rt/cmake/modules/AddFlangRT.cmake
@@ -0,0 +1,199 @@
+#===-- cmake/modules/AddFlangRT.cmake --------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+# Builds a library with common options for Flang-RT.
+#
+# Usage:
+#
+# add_flangrt_library(name sources ...
+#   SHARED
+#     Build a dynamic (.so/.dll) library
+#   STATIC
+#     Build a static (.a/.lib) library
+#   OBJECT
+#     Create only object files without static/dynamic library
+#   INSTALL_WITH_TOOLCHAIN
+#     Install library into Clang's resource directory so it can be found by the
+#     Flang driver during compilation, including tests
+#   EXCLUDE_FROM_ALL
+#     Do not build library by default; typically used for libraries needed for
+#     testing only, no install
+#   LINK_TO_LLVM
+#     Library requires include path and linking to LLVM's Support component
+#   ADDITIONAL_HEADERS
+#     May specify header files for IDE generators.
+#   INCLUDE_DIRECTORIES
+#     Arguments forwarded as-is to target_include_directories; start with PUBLIC/PRIVATE/INTERFACE
+#   LINK_LIBRARIES
+#     Additional libraries passed to target_link_libraries with PUBLIC scope
+#   TARGET_PROPERTIES
+#     Set target properties of all added targets
+# )
+function (add_flangrt_library name)
+  set(options STATIC SHARED OBJECT INSTALL_WITH_TOOLCHAIN EXCLUDE_FROM_ALL LINK_TO_LLVM)
+  set(multiValueArgs ADDITIONAL_HEADERS INCLUDE_DIRECTORIES LINK_LIBRARIES TARGET_PROPERTIES)
+  cmake_parse_arguments(ARG
+    "${options}"
+    ""
+    "${multiValueArgs}"
+    ${ARGN})
+
+  if (ARG_INSTALL_WITH_TOOLCHAIN AND ARG_EXCLUDE_FROM_ALL)
+    message(SEND_ERROR "add_flangrt_library(${name} ...):
+        INSTALL_WITH_TOOLCHAIN and EXCLUDE_FROM_ALL are in conflict. When
+        installing an artifact it must have been built first in the 'all' target.
+      ")
+  endif ()
+
+  # Forward the library type and EXCLUDE_FROM_ALL keywords to add_library
+  set(extra_args "")
+  if (ARG_SHARED)
+    list(APPEND extra_args SHARED)
+  endif ()
+  if (ARG_STATIC)
+    list(APPEND extra_args STATIC)
+  endif ()
+  if (ARG_OBJECT)
+    list(APPEND extra_args OBJECT)
+  endif ()
+  if (ARG_EXCLUDE_FROM_ALL)
+    list(APPEND extra_args EXCLUDE_FROM_ALL)
+  endif ()
+
+  # Also add header files to IDEs to list as part of the library.
+  set_source_files_properties(${ARG_ADDITIONAL_HEADERS} PROPERTIES HEADER_FILE_ONLY ON)
+
+  add_library(${name} ${extra_args} ${ARG_ADDITIONAL_HEADERS} ${ARG_UNPARSED_ARGUMENTS})
+
+  if (ARG_INSTALL_WITH_TOOLCHAIN)
+    set_target_properties(${name} PROPERTIES FOLDER "Flang-RT/Toolchain Libraries")
+  elseif (ARG_OBJECT)
+    set_target_properties(${name} PROPERTIES FOLDER "Flang-RT/Object Libraries")
+  else ()
+    set_target_properties(${name} PROPERTIES FOLDER "Flang-RT/Libraries")
+  endif ()
+
+  # Minimum required C++ version for Flang-RT, even if CMAKE_CXX_STANDARD is defined to something else.
+  target_compile_features(${name} PRIVATE cxx_std_17)
+
+  # Use compiler-specific options to disable exceptions and RTTI.
+  if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
+    target_compile_options(${name} PRIVATE
+        $<$<COMPILE_LANGUAGE:CXX>:-fno-exceptions -fno-rtti -fno-unwind-tables -fno-asynchronous-unwind-tables>
+      )
+  elseif (MSVC)
+    target_compile_options(${name} PRIVATE
+        $<$<COMPILE_LANGUAGE:CXX>:/EHs-c- /GR->
+      )
+  elseif (CMAKE_CXX_COMPILER_ID MATCHES "XL")
+    target_compile_options(${name} PRIVATE
+        $<$<COMPILE_LANGUAGE:CXX>:-qnoeh -qnortti>
+      )
+  endif ()
+
+  # Also for CUDA source when compiling with FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=CUDA
+  if (CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA")
+    # Assuming gcc as host compiler.
+    target_compile_options(${name} PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--no-exceptions -Xcompiler -fno-rtti -Xcompiler -fno-unwind-tables -Xcompiler -fno-asynchronous-unwind-tables>
+      )
+  else ()
+    # Assuming a clang-compatible CUDA compiler.
+    target_compile_options(${name} PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:-fno-exceptions -fno-rtti -fno-unwind-tables -fno-asynchronous-unwind-tables>
+      )
+  endif ()
+
+  # Flang-RT's public headers
+  target_include_directories(${name} PUBLIC "${FLANG_RT_SOURCE_DIR}/include")
+
+  # For ISO_Fortran_binding.h to be found by the runtime itself (Accessed as #include "flang/ISO_Fortran_binding.h")
+  # User applications can use #include <ISO_Fortran_binding.h>
+  target_include_directories(${name} PUBLIC "${FLANG_SOURCE_DIR}/include")
+
+  # For Flang-RT's configured config.h to be found
+  target_include_directories(${name} PRIVATE "${FLANG_RT_BINARY_DIR}")
+
+  # Disable libstdc++/libc++ assertions, even in an LLVM_ENABLE_ASSERTIONS
+  # build, to avoid an unwanted dependency on libstdc++/libc++.so.
+  if (FLANG_RT_SUPPORTS_UNDEFINE_FLAG)
+    target_compile_options(${name} PUBLIC -U_GLIBCXX_ASSERTIONS)
+    target_compile_options(${name} PUBLIC -U_LIBCPP_ENABLE_ASSERTIONS)
+  endif ()
+
+  # When building the flang runtime if LTO is enabled the archive file
+  # contains LLVM IR rather than object code. Currently flang is not
+  # LTO aware so cannot link this file to compiled Fortran code.
+  if (FLANG_RT_HAS_FNO_LTO_FLAG)
+    target_compile_options(${name} PRIVATE -fno-lto)
+  endif ()
+
+  # Flang/Clang (including clang-cl) -compiled programs targeting the MSVC ABI
+  # should only depend on msvcrt/ucrt. LLVM still emits libgcc/compiler-rt
+  # functions in some cases like 128-bit integer math (__udivti3, __modti3,
+  # __fixsfti, __floattidf, ...) that msvc does not support. We are injecting a
+  # dependency to Compiler-RT's builtin library where these are implemented.
+  if (MSVC AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    if (FLANG_RT_BUILTINS_LIBRARY)
+      target_compile_options(${name} PRIVATE "$<$<COMPILE_LANGUAGE:CXX,C>:-Xclang>" "$<$<COMPILE_LANGUAGE:CXX,C>:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>")
+    endif ()
+  endif ()
+  if (MSVC AND CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+    if (FLANG_RT_BUILTINS_LIBRARY)
+      target_compile_options(${name} PRIVATE "$<$<COMPILE_LANGUAGE:Fortran>:-Xflang>" "$<$<COMPILE_LANGUAGE:Fortran>:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>")
+    else ()
+      message(WARNING "Did not find libclang_rt.builtins.lib.
+        LLVM may emit builtins that are not implemented in msvcrt/ucrt and
+        instead falls back to builtins from Compiler-RT. Linking with ${name}
+        may result in a linker error.")
+    endif ()
+  endif ()
+
+  # Non-GTest unittests depend on LLVMSupport
+  if (ARG_LINK_TO_LLVM)
+    if (LLVM_LINK_LLVM_DYLIB)
+      set(llvm_libs LLVM)
+    else()
+      llvm_map_components_to_libnames(llvm_libs Support)
+    endif()
+    target_link_libraries(${name} PUBLIC ${llvm_libs})
+    target_include_directories(${name} PUBLIC ${LLVM_INCLUDE_DIRS})
+  endif ()
+
+  if (ARG_INCLUDE_DIRECTORIES)
+    target_include_directories(${name} ${ARG_INCLUDE_DIRECTORIES})
+  endif ()
+
+  if (ARG_LINK_LIBRARIES)
+    target_link_libraries(${name} PUBLIC ${ARG_LINK_LIBRARIES})
+  endif ()
+
+  # If this is part of the toolchain, put it (static archive or shared library)
+  # into the compiler's resource directory; otherwise it is test-only, not installed.
+  # TODO: Consider multi-configuration builds (MSVC_IDE, "Ninja Multi-Config")
+  if (ARG_INSTALL_WITH_TOOLCHAIN)
+    set_target_properties(${name} PROPERTIES
+        ARCHIVE_OUTPUT_DIRECTORY "${FLANG_RT_OUTPUT_RESOURCE_LIB_DIR}"
+        LIBRARY_OUTPUT_DIRECTORY "${FLANG_RT_OUTPUT_RESOURCE_LIB_DIR}"
+      )
+
+    install(TARGETS ${name}
+        ARCHIVE DESTINATION "${FLANG_RT_INSTALL_RESOURCE_LIB_PATH}"
+        LIBRARY DESTINATION "${FLANG_RT_INSTALL_RESOURCE_LIB_PATH}")
+  endif ()
+
+  if (ARG_TARGET_PROPERTIES)
+    set_target_properties(${name} PROPERTIES ${ARG_TARGET_PROPERTIES})
+  endif ()
+
+  # flang-rt should build all the Flang-RT targets that are built in an
+  # 'all' build.
+  if (NOT ARG_EXCLUDE_FROM_ALL)
+    add_dependencies(flang-rt ${name})
+  endif ()
+endfunction (add_flangrt_library)
diff --git a/flang-rt/cmake/modules/AddFlangRTOffload.cmake b/flang-rt/cmake/modules/AddFlangRTOffload.cmake
new file mode 100644
index 0000000000000..4e4bd60c63545
--- /dev/null
+++ b/flang-rt/cmake/modules/AddFlangRTOffload.cmake
@@ -0,0 +1,100 @@
+#===-- cmake/modules/AddFlangRTOffload.cmake -------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+# Configure target <name> for CUDA offload compilation.
+#
+#   name  - Name of an existing library target.
+#   files - List of source files that are to be compiled as CUDA.
+#
+# Does nothing unless FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT is "CUDA". Otherwise
+# it enables the CUDA language, compiles <files> as CUDA with compiler-specific
+# options, and additionally creates the targets obj.<name>PTX and <name>PTX.
+#
+# This is a macro, not a function: variables set here (such as
+# CUDA_COMPILE_OPTIONS) are set in the scope of the caller.
 macro(enable_cuda_compilation name files)
   if (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
     enable_language(CUDA)
 
     set_target_properties(${name}
         PROPERTIES
           CUDA_SEPARABLE_COMPILATION ON
       )
 
     # Treat all supported sources as CUDA files.
     set_source_files_properties(${files} PROPERTIES LANGUAGE CUDA)
     # Start from an empty option list; at most one of the compiler-specific
     # branches below is expected to fill it.
     set(CUDA_COMPILE_OPTIONS)
     if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "Clang")
       # Allow varargs.
       set(CUDA_COMPILE_OPTIONS
         -Xclang -fcuda-allow-variadic-functions
         )
     endif()
     if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "NVIDIA")
       set(CUDA_COMPILE_OPTIONS
         --expt-relaxed-constexpr
         # Disable these warnings:
         #   'long double' is treated as 'double' in device code
         -Xcudafe --diag_suppress=20208
         -Xcudafe --display_error_number
         )
     endif()
     # The options are attached to the individual source files, not to the
     # target.
     set_source_files_properties(${files} PROPERTIES COMPILE_OPTIONS
       "${CUDA_COMPILE_OPTIONS}")
 
     # Create a .a library consisting of CUDA PTX.
     # This is different from a regular static library. The CUDA_PTX_COMPILATION
     # property can only be applied to object libraries and create *.ptx files
     # instead of *.o files. The .a will consist of those *.ptx files only.
     add_flangrt_library(obj.${name}PTX OBJECT ${files})
     set_property(TARGET obj.${name}PTX PROPERTY CUDA_PTX_COMPILATION ON)
     add_flangrt_library(${name}PTX STATIC "$<TARGET_OBJECTS:obj.${name}PTX>")
 
     # Apply configuration options
     if (FLANG_RT_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS)
       target_compile_definitions(obj.${name}PTX
         PRIVATE FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
         )
     endif()
 
     # When using libcudacxx header files, we have to use them
     # for all files of Flang-RT.
     if (EXISTS "${FLANG_RT_LIBCUDACXX_PATH}/include")
       # Apply the same include path and definition to both the regular target
       # and the PTX object library.
       foreach (tgt IN ITEMS "${name}" "obj.${name}PTX")
         target_include_directories(${tgt} AFTER PRIVATE "${FLANG_RT_LIBCUDACXX_PATH}/include")
         target_compile_definitions(${tgt} PRIVATE RT_USE_LIBCUDACXX=1)
       endforeach ()
     endif ()
   endif()
 endmacro()
+
+# Configure target <name> for OpenMP offload compilation.
+#
+#   name  - Name of an existing library target.
+#   files - List of source files that are to be compiled for offloading.
+#
+# Does nothing unless FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT is "OpenMP". In
+# that case both the C and the C++ compiler must be Clang, otherwise
+# configuration stops with a fatal error.
+#
+# This is a macro, not a function: variables set here (such as
+# OMP_COMPILE_OPTIONS) are set in the scope of the caller.
 macro(enable_omp_offload_compilation name files)
   if (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "OpenMP")
     # OpenMP offload build only works with Clang compiler currently.
 
     if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND
         "${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
 
       # --offload-arch expects a comma-separated list, while the CMake
       # variable is a semicolon-separated list.
       string(REPLACE ";" "," compile_for_architectures
         "${FLANG_RT_DEVICE_ARCHITECTURES}"
         )
 
       set(OMP_COMPILE_OPTIONS
         -fopenmp
         -fvisibility=hidden
         -fopenmp-cuda-mode
         --offload-arch=${compile_for_architectures}
         # Force LTO for the device part.
         -foffload-lto
         )
       set_source_files_properties(${files} PROPERTIES COMPILE_OPTIONS
         "${OMP_COMPILE_OPTIONS}"
         )
       # The same flags are also passed when linking; PUBLIC additionally
       # propagates them to targets that link against <name>.
       target_link_options(${name} PUBLIC ${OMP_COMPILE_OPTIONS})
 
       # Enable "declare target" in the source code.
       set_source_files_properties(${files}
         PROPERTIES COMPILE_DEFINITIONS OMP_OFFLOAD_BUILD
         )
     else()
       message(FATAL_ERROR
         "Flang-rt build with OpenMP offload is not supported for these compilers:\n"
         "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}\n"
         "CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}")
     endif()
   endif()
 endmacro()
diff --git a/flang-rt/cmake/modules/GetToolchainDirs.cmake b/flang-rt/cmake/modules/GetToolchainDirs.cmake
new file mode 100644
index 0000000000000..426a5e8e801f3
--- /dev/null
+++ b/flang-rt/cmake/modules/GetToolchainDirs.cmake
@@ -0,0 +1,125 @@
+#===-- cmake/modules/GetToolchainDirs.cmake --------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+
+# Determine the subdirectory relative to Clang's resource dir/sysroot where to
+# install target-specific libraries, to be found by Clang/Flang driver. This was
+# adapted from Compiler-RT's mechanism to find the path for
+# libclang_rt.builtins.a.
+#
+# Compiler-RT has two mechanisms for the path (simplified):
+#
+# * LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=0: lib/${oslibname}/libclang_rt.builtins-${arch}.a
+# * LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=1: lib/${triple}/libclang_rt.builtins.a
+#
+# LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON is the newer scheme, but the old one is
+# currently still used for some platforms such as Windows. Clang looks for which
+# of the files exist before passing the path to the linker. Hence, the
+# directories have to match what Clang is looking for, which is done in
+# ToolChain::getArchSpecificLibPaths(..), ToolChain::getRuntimePath(),
+# ToolChain::getCompilerRTPath(), and ToolChain::getCompilerRT(..), not entirely
+# consistent between these functions, Compiler-RT's CMake code, and overrides
+# in different toolchains.
+#
+# For Fortran, Flang always assumes the library name libflang_rt.a without
+# architecture suffix. Hence, we always use the second scheme as if
+# LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON, even if it is actually set to OFF. It
+# is added unconditionally to the library search path by
+# ToolChain::getArchSpecificLibPaths(...).
+# Sets <outvar> in the caller's scope to the subdirectory (starting with
+# "lib/") into which target-specific libraries are placed.
+#
+# On Apple platforms the subdirectory is named after the OS; everywhere else
+# it is named after the (normalized) target triple. Exactly one of the two is
+# appended, so the Apple-specific result is not overwritten afterwards.
 function (get_toolchain_library_subdir outvar)
   set(outval "lib")
 
   if (APPLE)
     # Required to be "darwin" for MachO toolchain.
     get_toolchain_os_dirname(os_dirname)
     set(outval "${outval}/${os_dirname}")
   else ()
     get_toolchain_arch_dirname(arch_dirname)
     set(outval "${outval}/${arch_dirname}")
   endif ()
 
   set(${outvar} "${outval}" PARENT_SCOPE)
 endfunction ()
+
+
+# Corresponds to Clang's ToolChain::getOSLibName(). Adapted from Compiler-RT.
+# Sets <outvar> in the caller's scope to the OS directory name: the
+# lower-cased CMAKE_SYSTEM_NAME, except on Android where it is "linux".
 function (get_toolchain_os_dirname outvar)
   string(TOLOWER "${CMAKE_SYSTEM_NAME}" os_name)
   if (ANDROID)
     # The CMAKE_SYSTEM_NAME for Android is "Android", but the OS is Linux and
     # the driver will search for libraries in the "linux" directory.
     set(os_name "linux")
   endif ()
   set(${outvar} "${os_name}" PARENT_SCOPE)
 endfunction ()
+
+
+# Corresponds to Clang's ToolChain::getRuntimePath(). Adapted from Compiler-RT.
+# Sets <outvar> in the caller's scope to the directory name derived from
+# LLVM_TARGET_TRIPLE, after normalizing the CPU component and, for some
+# architectures, the ABI suffix of the triple.
 function (get_toolchain_arch_dirname outvar)
   # Split the triple at its first dash: triple_cpu is everything before it,
   # triple_suffix everything from the dash onwards (including the dash).
   string(FIND "${LLVM_TARGET_TRIPLE}" "-" dash_index)
   if (dash_index LESS 0)
     # Without a dash the SUBSTRING calls below would fail with an
     # out-of-range index; report the actual problem instead.
     message(FATAL_ERROR
       "LLVM_TARGET_TRIPLE is not a valid target triple: '${LLVM_TARGET_TRIPLE}'")
   endif ()
   string(SUBSTRING "${LLVM_TARGET_TRIPLE}" ${dash_index} -1 triple_suffix)
   string(SUBSTRING "${LLVM_TARGET_TRIPLE}" 0 ${dash_index} triple_cpu)
   set(arch "${triple_cpu}")
   if("${arch}" MATCHES "^i.86$")
     # Android uses i686, but that's remapped at a later stage.
     set(arch "i386")
   endif()
 
   # Operands are quoted so that a value of ${arch} that happens to be the name
   # of a variable is compared literally.
   if(ANDROID AND "${arch}" STREQUAL "i386")
     set(target "i686${triple_suffix}")
   elseif("${arch}" STREQUAL "amd64")
     set(target "x86_64${triple_suffix}")
   elseif("${arch}" STREQUAL "sparc64")
     set(target "sparcv9${triple_suffix}")
   elseif("${arch}" MATCHES "mips64|mips64el")
     string(REGEX REPLACE "-gnu.*" "-gnuabi64" triple_suffix_gnu "${triple_suffix}")
     string(REGEX REPLACE "mipsisa32" "mipsisa64" triple_cpu_mips "${triple_cpu}")
     string(REGEX REPLACE "^mips$" "mips64" triple_cpu_mips "${triple_cpu_mips}")
     string(REGEX REPLACE "^mipsel$" "mips64el" triple_cpu_mips "${triple_cpu_mips}")
     set(target "${triple_cpu_mips}${triple_suffix_gnu}")
   elseif("${arch}" MATCHES "mips|mipsel")
     string(REGEX REPLACE "-gnuabi.*" "-gnu" triple_suffix_gnu "${triple_suffix}")
     string(REGEX REPLACE "mipsisa64" "mipsisa32" triple_cpu_mips "${triple_cpu}")
     string(REGEX REPLACE "mips64" "mips" triple_cpu_mips "${triple_cpu_mips}")
     set(target "${triple_cpu_mips}${triple_suffix_gnu}")
   elseif("${arch}" MATCHES "^arm")
     # FIXME: Handle arch other than arm, armhf, armv6m
     if ("${arch}" STREQUAL "armhf")
       # If we are building for hard float but our ABI is soft float.
       if ("${triple_suffix}" MATCHES ".*eabi$")
         # Change "eabi" -> "eabihf"
         set(triple_suffix "${triple_suffix}hf")
       endif()
       # ABI is already set in the triple, don't repeat it in the architecture.
       set(arch "arm")
     else ()
       # If we are building for soft float, but the triple's ABI is hard float.
       if ("${triple_suffix}" MATCHES ".*eabihf$")
         # Change "eabihf" -> "eabi"
         string(REGEX REPLACE "hf$" "" triple_suffix "${triple_suffix}")
       endif()
     endif()
     set(target "${arch}${triple_suffix}")
   elseif("${arch}" MATCHES "^amdgcn")
     set(target "amdgcn-amd-amdhsa")
   elseif("${arch}" MATCHES "^nvptx")
     set(target "nvptx64-nvidia-cuda")
   else()
     set(target "${arch}${triple_suffix}")
   endif()
   set(${outvar} "${target}" PARENT_SCOPE)
 endfunction()
diff --git a/flang-rt/examples/CMakeLists.txt b/flang-rt/examples/CMakeLists.txt
new file mode 100644
index 0000000000000..f45a95d18b641
--- /dev/null
+++ b/flang-rt/examples/CMakeLists.txt
@@ -0,0 +1,9 @@
+#===-- examples/CMakeLists.txt ---------------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+add_subdirectory(ExternalHelloWorld)
diff --git a/flang-rt/examples/ExternalHelloWorld/CMakeLists.txt b/flang-rt/examples/ExternalHelloWorld/CMakeLists.txt
new file mode 100644
index 0000000000000..4fd04f8f2769a
--- /dev/null
+++ b/flang-rt/examples/ExternalHelloWorld/CMakeLists.txt
@@ -0,0 +1,17 @@
+#===-- examples/ExternalHelloWorld/CMakeLists.txt --------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+# This test is not run by default as it requires input.
+add_llvm_example(external-hello-world
+  external-hello.cpp
+)
+
+target_link_libraries(external-hello-world
+  PRIVATE
+    flang_rt.runtime
+  )
diff --git a/flang-rt/lib/CMakeLists.txt b/flang-rt/lib/CMakeLists.txt
new file mode 100644
index 0000000000000..aee51dcc9fa24
--- /dev/null
+++ b/flang-rt/lib/CMakeLists.txt
@@ -0,0 +1,17 @@
+#===-- lib/CMakeLists.txt --------------------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+add_subdirectory(quadmath)
+add_subdirectory(runtime)
+if (FLANG_RT_INCLUDE_CUF)
+  add_subdirectory(cuda)
+endif()
+
+if (FLANG_RT_INCLUDE_TESTS)
+  add_subdirectory(Testing)
+endif ()
diff --git a/flang-rt/lib/Testing/CMakeLists.txt b/flang-rt/lib/Testing/CMakeLists.txt
new file mode 100644
index 0000000000000..19c20ad44c025
--- /dev/null
+++ b/flang-rt/lib/Testing/CMakeLists.txt
@@ -0,0 +1,20 @@
+#===-- lib/Testing/CMakeLists.txt ------------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+set(public_headers "")
+file(GLOB_RECURSE public_headers
+  "${FLANG_SOURCE_DIR}/lib/Testing/*.h"
+)
+
+add_flangrt_library(NonGTestTesting EXCLUDE_FROM_ALL LINK_TO_LLVM
+      ${FLANG_SOURCE_DIR}/lib/Testing/testing.cpp
+      ${FLANG_SOURCE_DIR}/lib/Testing/fp-testing.cpp
+
+    ADDITIONAL_HEADERS
+      ${public_headers}
+  )
diff --git a/flang-rt/lib/cuda/CMakeLists.txt b/flang-rt/lib/cuda/CMakeLists.txt
new file mode 100644
index 0000000000000..d5ca354c1029f
--- /dev/null
+++ b/flang-rt/lib/cuda/CMakeLists.txt
@@ -0,0 +1,34 @@
+#===-- lib/cuda/CMakeLists.txt ---------------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+
+add_flangrt_library(flang_rt.cuda STATIC
+  allocatable.cpp
+  allocator.cpp
+  descriptor.cpp
+  init.cpp
+  kernel.cpp
+  memmove-function.cpp
+  memory.cpp
+  registration.cpp
+
+  # libflang_rt.cuda depends on a certain version of CUDA. To be able to have
+  # multiple builds of this library with different CUDA versions, the version
+  # is added to the library name.
+  TARGET_PROPERTIES
+    OUTPUT_NAME "flang_rt.cuda_${CUDAToolkit_VERSION_MAJOR}"
+
+  INCLUDE_DIRECTORIES
+    PRIVATE ${CUDAToolkit_INCLUDE_DIRS}
+)
+
+target_link_libraries(flang_rt.cuda
+  PUBLIC
+  flang_rt.runtime
+  CUDA::cudart_static
+)
diff --git a/flang-rt/lib/quadmath/CMakeLists.txt b/flang-rt/lib/quadmath/CMakeLists.txt
new file mode 100644
index 0000000000000..4f113216b42c8
--- /dev/null
+++ b/flang-rt/lib/quadmath/CMakeLists.txt
@@ -0,0 +1,136 @@
+#===-- lib/quadmath/CMakeLists.txt -----------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+# FortranFloat128 implements IEEE-754 128-bit float math functions.
+# It is a thin wrapper and it currently relies on third-party
+# libraries available for the target.
+# It is distributed as a static library only.
+# Fortran programs/libraries that end up linking any of the provided
+# functions will have a dependency on the third-party library that is
+# being used for building this libflang_rt.quadmath library.
+
+include(CheckLibraryExists)
+include(CheckIncludeFile)
+
+set(sources
+  acos.cpp
+  acosh.cpp
+  asin.cpp
+  asinh.cpp
+  atan.cpp
+  atan2.cpp
+  atanh.cpp
+  ceil.cpp
+  complex-math.c
+  cos.cpp
+  cosh.cpp
+  erf.cpp
+  erfc.cpp
+  exp.cpp
+  exponent.cpp
+  floor.cpp
+  fma.cpp
+  fraction.cpp
+  hypot.cpp
+  j0.cpp
+  j1.cpp
+  jn.cpp
+  lgamma.cpp
+  llround.cpp
+  log.cpp
+  log10.cpp
+  lround.cpp
+  mod-real.cpp
+  modulo-real.cpp
+  nearest.cpp
+  nearbyint.cpp
+  norm2.cpp
+  pow.cpp
+  random.cpp
+  remainder.cpp
+  round.cpp
+  rrspacing.cpp
+  scale.cpp
+  set-exponent.cpp
+  sin.cpp
+  sinh.cpp
+  spacing.cpp
+  sqrt.cpp
+  tan.cpp
+  tanh.cpp
+  tgamma.cpp
+  trunc.cpp
+  y0.cpp
+  y1.cpp
+  yn.cpp
+  )
+
+include_directories(AFTER "${CMAKE_CURRENT_SOURCE_DIR}/..")
+add_library(FortranFloat128MathILib INTERFACE)
+target_include_directories(FortranFloat128MathILib INTERFACE
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/..>
+  )
+
+if (FLANG_RUNTIME_F128_MATH_LIB)
+  if (${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "libquadmath")
+    check_include_file(quadmath.h FOUND_QUADMATH_HEADER)
+    if(FOUND_QUADMATH_HEADER)
+      add_compile_definitions(HAS_QUADMATHLIB)
+    else()
+      message(FATAL_ERROR
+        "FLANG_RUNTIME_F128_MATH_LIB setting requires quadmath.h "
+        "to be available: ${FLANG_RUNTIME_F128_MATH_LIB}"
+        )
+    endif()
+  else()
+    message(FATAL_ERROR
+      "Unsupported third-party library for Fortran F128 math runtime: "
+      "${FLANG_RUNTIME_F128_MATH_LIB}"
+      )
+  endif()
+
+  if (WIN32)
+    # Do not create a flang_rt.quadmath library under Windows, the Flang
+    # driver never links it. Instead, add the sources to flang_rt.runtime.
+    target_sources(FortranFloat128MathILib INTERFACE ${sources})
+    target_compile_definitions(FortranFloat128MathILib INTERFACE HAS_QUADMATHLIB)
+  else ()
+    add_flangrt_library(flang_rt.quadmath STATIC INSTALL_WITH_TOOLCHAIN
+      ${sources})
+    target_include_directories(flang_rt.quadmath PRIVATE
+        "${FLANG_RT_SOURCE_DIR}/lib/flang_rt"
+      )
+  endif ()
+elseif (HAVE_LDBL_MANT_DIG_113)
+  # We can use 'long double' versions from libc.
+  check_library_exists(m sinl "" FOUND_LIBM)
+  if (FOUND_LIBM)
+    target_compile_definitions(FortranFloat128MathILib INTERFACE
+      HAS_LIBM
+      )
+    target_include_directories(FortranFloat128MathILib INTERFACE
+      "${FLANG_RT_SOURCE_DIR}/lib/flang_rt"
+      )
+    target_sources(FortranFloat128MathILib INTERFACE ${sources})
+  else()
+    message(FATAL_ERROR "Flang-RT cannot build without libm")
+  endif()
+else()
+  # We can use '__float128' version from libc, if it has them.
+  check_library_exists(m sinf128 "" FOUND_LIBMF128)
+  if (FOUND_LIBMF128)
+    target_compile_definitions(FortranFloat128MathILib INTERFACE
+      HAS_LIBMF128
+      )
+    target_include_directories(FortranFloat128MathILib INTERFACE
+      "${FLANG_RT_SOURCE_DIR}/lib/flang_rt"
+      )
+    # Enable this when math-entries.h and complex-math.h are ready.
+    # target_sources(FortranFloat128MathILib INTERFACE ${sources})
+  endif()
+endif()
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
new file mode 100644
index 0000000000000..0afcbf2783533
--- /dev/null
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -0,0 +1,215 @@
+#===-- lib/runtime/CMakeLists.txt ------------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+include(AddFlangRTOffload)
+# function checks
+find_package(Backtrace)
+set(HAVE_BACKTRACE ${Backtrace_FOUND})
+set(BACKTRACE_HEADER ${Backtrace_HEADER})
+
+
+# List of files that are buildable for all devices.
+set(supported_sources
+  ${FLANG_SOURCE_DIR}/lib/Decimal/binary-to-decimal.cpp
+  ${FLANG_SOURCE_DIR}/lib/Decimal/decimal-to-binary.cpp
+  ISO_Fortran_binding.cpp
+  allocator-registry.cpp
+  allocatable.cpp
+  array-constructor.cpp
+  assign.cpp
+  buffer.cpp
+  character.cpp
+  connection.cpp
+  copy.cpp
+  derived-api.cpp
+  derived.cpp
+  descriptor-io.cpp
+  descriptor.cpp
+  dot-product.cpp
+  edit-input.cpp
+  edit-output.cpp
+  environment.cpp
+  external-unit.cpp
+  extrema.cpp
+  file.cpp
+  findloc.cpp
+  format.cpp
+  inquiry.cpp
+  internal-unit.cpp
+  io-api.cpp
+  io-api-minimal.cpp
+  io-error.cpp
+  io-stmt.cpp
+  iostat.cpp
+  matmul-transpose.cpp
+  matmul.cpp
+  memory.cpp
+  misc-intrinsic.cpp
+  namelist.cpp
+  non-tbp-dio.cpp
+  numeric.cpp
+  pointer.cpp
+  product.cpp
+  pseudo-unit.cpp
+  ragged.cpp
+  stat.cpp
+  sum.cpp
+  support.cpp
+  terminator.cpp
+  tools.cpp
+  transformational.cpp
+  type-code.cpp
+  type-info.cpp
+  unit.cpp
+  unit-map.cpp
+  utf.cpp
+)
+
+# List of source not used for GPU offloading.
+set(host_sources
+  ${FLANG_SOURCE_DIR}/module/iso_fortran_env_impl.f90
+  command.cpp
+  complex-powi.cpp
+  complex-reduction.c
+  exceptions.cpp
+  execute.cpp
+  extensions.cpp
+  main.cpp
+  random.cpp
+  reduce.cpp
+  reduction.cpp
+  stop.cpp
+  temporary-stack.cpp
+  time-intrinsic.cpp
+)
+
+file(GLOB_RECURSE public_headers
+  "${FLANG_RT_SOURCE_DIR}/include/flang_rt/*.h"
+  "${FLANG_SOURCE_DIR}/include/flang/Common/*.h"
+  )
+
+file(GLOB_RECURSE private_headers
+  "${FLANG_RT_SOURCE_DIR}/lib/flang_rt/*.h"
+  "${FLANG_SOURCE_DIR}/lib/Common/*.h"
+  )
+
+
+# Import changes from flang_rt.quadmath
+get_target_property(f128_sources
+  FortranFloat128MathILib INTERFACE_SOURCES
+  )
+if (f128_sources)
+  # The interface may define special macros for Float128Math files,
+  # so we need to propagate them.
+  get_target_property(f128_defs
+    FortranFloat128MathILib INTERFACE_COMPILE_DEFINITIONS
+    )
+  set_property(SOURCE ${f128_sources}
+    APPEND PROPERTY COMPILE_DEFINITIONS
+    ${f128_defs}
+    )
+  get_target_property(f128_include_dirs
+    FortranFloat128MathILib INTERFACE_INCLUDE_DIRECTORIES
+    )
+  set_property(SOURCE ${f128_sources}
+    APPEND PROPERTY INCLUDE_DIRECTORIES
+    ${f128_include_dirs}
+    )
+else ()
+  set(f128_sources "")
+endif ()
+
+set(sources ${supported_sources} ${host_sources} ${f128_sources})
+
+
+if (NOT WIN32)
+  add_flangrt_library(flang_rt.runtime STATIC
+    ${sources}
+    LINK_LIBRARIES ${Backtrace_LIBRARY}
+    INSTALL_WITH_TOOLCHAIN
+    ADDITIONAL_HEADERS ${public_headers} ${private_headers}
+  )
+
+  enable_cuda_compilation(flang_rt.runtime "${supported_sources}")
+  enable_omp_offload_compilation(flang_rt.runtime "${supported_sources}")
+
+  # For unittests that depend on flang_rt. Should link to the static version
+  # of the library.
+  add_library(flang_rt.runtime.static ALIAS flang_rt.runtime)
+  add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime)
+else()
+  # Target for building all versions of the runtime
+  add_custom_target(flang_rt.runtime)
+  set_target_properties(flang_rt.runtime PROPERTIES FOLDER "Flang-RT/Meta")
+
+  # Create one variant of the runtime library, named flang_rt.runtime.<suffix>.
+  #
+  #   libtype  - Library type passed on to add_flangrt_library (e.g. STATIC).
+  #   suffix   - Variant name; used in the target name and in the name of the
+  #              Fortran module directory.
+  #   msvc_lib - Value for the MSVC_RUNTIME_LIBRARY target property. If empty,
+  #              the property is not set.
+  #   ARGN     - Additional arguments forwarded to add_flangrt_library.
+  #
+  # The new target is added as a dependency of the flang_rt.runtime meta
+  # target.
   function (add_win_flangrt_runtime libtype suffix msvc_lib)
     set(name "flang_rt.runtime.${suffix}")
     add_flangrt_library(${name} ${libtype}
         ${sources}
         ${ARGN}
         LINK_LIBRARIES ${Backtrace_LIBRARY}
         ADDITIONAL_HEADERS ${public_headers} ${private_headers}
       )
 
     if (msvc_lib)
       set_target_properties(${name}
           PROPERTIES
             MSVC_RUNTIME_LIBRARY "${msvc_lib}"
         )
     endif ()
 
     # Setting a unique Fortran_MODULE_DIRECTORY is required for each variant to
     # write a different .mod file.
     set_target_properties(${name}
         PROPERTIES
           Fortran_MODULE_DIRECTORY "module.${suffix}"
       )
 
     enable_cuda_compilation(${name} "${supported_sources}")
     enable_omp_offload_compilation(${name} "${supported_sources}")
     add_dependencies(flang_rt.runtime ${name})
   endfunction ()
+
+  # Variants of the static flang_rt for different versions of the msvc runtime.
+  #
+  # The dynamic/dynamic_dbg variants are not DLLs themselves, only require
+  # linking to msvcrt(d).dll.
+  # FIXME: Generating actual runtime DLLs is currently not possible. There are
+  # two roadblocks:
+  #
+  #  * Flang emits /DEFAULTLIB:flang_rt.dynamic.lib into
+  #    iso_fortran_env_impl.f90.obj. Because that file is itself part of
+  #    flang_rt.dynamic, this results in a recursive dependency when invoking
+  #    the linker.
+  #
+  #  * The externally-visible functions must either be annotated with
+  #    __declspec(dllexport), or listed in an exports file. A possible workaround
+  #    is CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS which would also export the internal
+  #    C++ symbols and still requires global data symbols to be annotated
+  #    manually.
+  add_win_flangrt_runtime(STATIC static      MultiThreaded         INSTALL_WITH_TOOLCHAIN)
+  add_win_flangrt_runtime(STATIC static_dbg  MultiThreadedDebug    INSTALL_WITH_TOOLCHAIN)
+  add_win_flangrt_runtime(STATIC dynamic     MultiThreadedDLL      INSTALL_WITH_TOOLCHAIN)
+  add_win_flangrt_runtime(STATIC dynamic_dbg MultiThreadedDebugDLL INSTALL_WITH_TOOLCHAIN)
+
+  # Unittests link against LLVMSupport which is using CMake's default runtime
+  # library selection, which is either MultiThreadedDLL or MultiThreadedDebugDLL
+  # depending on the configuration. They have to match or linking will fail.
+  if (GENERATOR_IS_MULTI_CONFIG)
+    # We cannot select an ALIAS library because it may be different
+    # per configuration. Fallback to CMake's default.
+    add_win_flangrt_runtime(STATIC unittest "" EXCLUDE_FROM_ALL)
+  else ()
+    string(TOLOWER ${CMAKE_BUILD_TYPE} build_type)
+    if (build_type STREQUAL "debug")
+      add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic_dbg)
+    else ()
+      add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic)
+    endif ()
+  endif ()
+endif()
diff --git a/flang-rt/test/CMakeLists.txt b/flang-rt/test/CMakeLists.txt
new file mode 100644
index 0000000000000..f5f7b8832d381
--- /dev/null
+++ b/flang-rt/test/CMakeLists.txt
@@ -0,0 +1,59 @@
+#===-- test/CMakeLists.txt -------------------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+# Test runner infrastructure for Flang. This configures the Flang test trees
+# for use by Lit, and delegates to LLVM's lit test handlers.
+
+llvm_canonicalize_cmake_booleans(
+  FLANG_STANDALONE_BUILD
+  LLVM_BUILD_EXAMPLES
+  LLVM_BYE_LINK_INTO_TOOLS
+  LLVM_ENABLE_PLUGINS
+)
+
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
+  MAIN_CONFIG
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
+)
+
+if (TARGET FlangRTUnitTests)
+  configure_lit_site_cfg(
+    ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
+    ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py
+    MAIN_CONFIG
+    ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.cfg.py
+  )
+
+  configure_lit_site_cfg(
+    ${CMAKE_CURRENT_SOURCE_DIR}/NonGtestUnit/lit.site.cfg.py.in
+    ${CMAKE_CURRENT_BINARY_DIR}/NonGtestUnit/lit.site.cfg.py
+    MAIN_CONFIG
+    ${CMAKE_CURRENT_SOURCE_DIR}/NonGtestUnit/lit.cfg.py
+  )
+endif ()
+
+
+add_custom_target(flang-rt-test-depends)
+set_target_properties(flang-rt-test-depends PROPERTIES FOLDER "Flang-RT/Meta")
+add_dependencies(flang-rt-test-depends
+    FlangRTUnitTests
+    flang_rt.runtime
+    flang_rt.runtime.unittest
+  )
+
+add_lit_testsuite(check-flang-rt "Running the Flang-RT regression tests"
+    ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS flang-rt-test-depends
+  )
+set_target_properties(check-flang-rt PROPERTIES FOLDER "Flang-RT/Meta")
+
+add_lit_testsuites(flang-rt ${CMAKE_CURRENT_SOURCE_DIR}
+    DEPENDS flang-rt-test-depends
+  )
diff --git a/flang-rt/test/Driver/ctofortran.f90 b/flang-rt/test/Driver/ctofortran.f90
index 10c7adaccc958..e385e7974cdc1 100644
--- a/flang-rt/test/Driver/ctofortran.f90
+++ b/flang-rt/test/Driver/ctofortran.f90
@@ -1,8 +1,10 @@
 ! UNSUPPORTED: system-windows
-! REQUIRES: flang-rt
+! UNSUPPORTED: offload-cuda
+
 ! RUN: split-file %s %t
-! RUN: chmod +x %t/runtest.sh
-! RUN: %t/runtest.sh %t %t/ffile.f90 %t/cfile.c %flang | FileCheck %s
+! RUN: %clang -I"%include/flang" -c %t/cfile.c -o %t/cfile.o
+! RUN: %flang -L"%libdir" %t/ffile.f90 %t/cfile.o -o %t/ctofortran
+! RUN: env LD_LIBRARY_PATH="$LD_LIBRARY_PATH:%libdir" %t/ctofortran | FileCheck %s
 
 !--- ffile.f90
 program fmain
@@ -66,24 +68,3 @@ end subroutine foo
   foo(desc);
   return;
 }
-!--- runtest.sh
-#!/bin/bash
-TMPDIR=$1
-FFILE=$2
-CFILE=$3
-FLANG=$4
-shift 4
-FLAGS="$*"
-BINDIR=`dirname $FLANG`
-LIBDIR=$BINDIR/../lib
-CCOMP=$BINDIR/clang
-if [ -x $CCOMP ]
-then
-  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR
-  $CCOMP $FLAGS -c $CFILE -o $TMPDIR/cfile.o
-  $FLANG $FLAGS $FFILE $TMPDIR/cfile.o -o $TMPDIR/ctofortran
-  $TMPDIR/ctofortran # should print "PASS"
-else
-  # No clang compiler, just pass by default
-  echo "PASS"
-fi
diff --git a/flang-rt/test/Driver/exec.f90 b/flang-rt/test/Driver/exec.f90
index 9ca91ee24011c..5a81a1e4c3e45 100644
--- a/flang-rt/test/Driver/exec.f90
+++ b/flang-rt/test/Driver/exec.f90
@@ -1,10 +1,10 @@
-! UNSUPPORTED: system-windows
 ! REQUIRES: flang-rt
+! UNSUPPORTED: offload-cuda
+
 ! Verify that flang can correctly build executables.
 
-! RUN: %flang %s -o %t
-! RUN: env LD_LIBRARY_PATH="$LD_LIBRARY_PATH:%llvmshlibdir" %t | FileCheck %s
-! RUN: rm -f %t
+! RUN: %flang -L"%libdir" %s -o %t
+! RUN: env LD_LIBRARY_PATH="$LD_LIBRARY_PATH:%libdir" %t | FileCheck %s
 
 ! CHECK: Hello, World!
 program hello
diff --git a/flang-rt/test/NonGtestUnit/lit.cfg.py b/flang-rt/test/NonGtestUnit/lit.cfg.py
new file mode 100644
index 0000000000000..4bee709b78f43
--- /dev/null
+++ b/flang-rt/test/NonGtestUnit/lit.cfg.py
@@ -0,0 +1,22 @@
+# -*- Python -*-
+
+import os
+
+import lit.formats
+
+# name: The name of this test suite.
+config.name = "flang-rt-OldUnit"
+
+# suffixes: A list of file extensions to treat as test files.
+# On Windows, ".exe" also matches the GTests, which will be executed redundantly.
+config.suffixes = [".test", ".exe"]
+
+# test_source_root: The root path where unit test binaries are located.
+config.test_source_root = os.path.join(config.flangrt_binary_dir, "unittests")
+
+# test_exec_root: The root path where tests should be run.
+# lit writes a '.lit_test_times.txt' file into this directory.
+config.test_exec_root = config.flang_rt_binary_test_dir
+
+# testFormat: The test format to use to interpret tests.
+config.test_format = lit.formats.ExecutableTest()
diff --git a/flang-rt/test/NonGtestUnit/lit.site.cfg.py.in b/flang-rt/test/NonGtestUnit/lit.site.cfg.py.in
new file mode 100644
index 0000000000000..8244b5b62d3aa
--- /dev/null
+++ b/flang-rt/test/NonGtestUnit/lit.site.cfg.py.in
@@ -0,0 +1,14 @@
+@LIT_SITE_CFG_IN_HEADER@
+
+import os
+
+config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
+config.flang_rt_source_dir = "@FLANG_RT_SOURCE_DIR@"
+config.flangrt_binary_dir = "@FLANG_RT_BINARY_DIR@"
+config.flang_rt_binary_test_dir = os.path.dirname(__file__)
+
+import lit.llvm
+lit.llvm.initialize(lit_config, config)
+
+# Let the main config do the real work.
+lit_config.load_config(config, os.path.join(config.flang_rt_source_dir, 'test', 'NonGtestUnit', 'lit.cfg.py'))
diff --git a/flang-rt/test/Runtime/no-cpp-dep.c b/flang-rt/test/Runtime/no-cpp-dep.c
index 4fcf8f9d478d8..c98678b69aa14 100644
--- a/flang-rt/test/Runtime/no-cpp-dep.c
+++ b/flang-rt/test/Runtime/no-cpp-dep.c
@@ -3,10 +3,11 @@ This test makes sure that flang's runtime does not depend on the C++ runtime
 library. It tries to link this simple file against libflang_rt.runtime.a with
 a C compiler.
 
-REQUIRES: c-compiler, flang-rt
+UNSUPPORTED: system-windows
+UNSUPPORTED: offload-cuda
 
 RUN: %if system-aix %{ export OBJECT_MODE=64 %}
-RUN: %cc -std=c99 %s -I%include %libruntime -lm  \
+RUN: %cc -std=c99 %s -I%include -L"%libdir" -lflang_rt.runtime -lm \
 RUN: %if system-aix %{-lpthread %}
 RUN: rm a.out
 */
diff --git a/flang-rt/test/Unit/lit.cfg.py b/flang-rt/test/Unit/lit.cfg.py
new file mode 100644
index 0000000000000..516bc653f413f
--- /dev/null
+++ b/flang-rt/test/Unit/lit.cfg.py
@@ -0,0 +1,21 @@
+# -*- Python -*-
+
+import os
+
+import lit.formats
+
+# name: The name of this test suite.
+config.name = "flang-rt-Unit"
+
+# suffixes: A list of file extensions to treat as test files.
+config.suffixes = []
+
+# test_source_root: The root path where unit test binaries are located.
+config.test_source_root = os.path.join(config.flangrt_binary_dir, "unittests")
+
+# test_exec_root: The root path where tests should be run.
+# lit writes a '.lit_test_times.txt' file into this directory.
+config.test_exec_root = config.flang_rt_binary_test_dir
+
+# testFormat: The test format to use to interpret tests.
+config.test_format = lit.formats.GoogleTest(config.llvm_build_mode, "Tests")
diff --git a/flang-rt/test/Unit/lit.site.cfg.py.in b/flang-rt/test/Unit/lit.site.cfg.py.in
new file mode 100644
index 0000000000000..ed6dea07bcdde
--- /dev/null
+++ b/flang-rt/test/Unit/lit.site.cfg.py.in
@@ -0,0 +1,15 @@
+@LIT_SITE_CFG_IN_HEADER@
+
+import os
+
+config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
+config.llvm_build_mode = "@LLVM_BUILD_MODE@"
+config.flang_rt_source_dir = "@FLANG_RT_SOURCE_DIR@"
+config.flangrt_binary_dir = "@FLANG_RT_BINARY_DIR@"
+config.flang_rt_binary_test_dir = os.path.dirname(__file__)
+
+import lit.llvm
+lit.llvm.initialize(lit_config, config)
+
+# Let the main config do the real work.
+lit_config.load_config(config, os.path.join(config.flang_rt_source_dir, 'test', 'Unit', 'lit.cfg.py'))
diff --git a/flang-rt/test/lit.cfg.py b/flang-rt/test/lit.cfg.py
new file mode 100644
index 0000000000000..652da31e6438f
--- /dev/null
+++ b/flang-rt/test/lit.cfg.py
@@ -0,0 +1,100 @@
+# -*- Python -*-
+
+import shlex
+import lit.util
+
+from lit.llvm import llvm_config
+from lit.llvm.subst import ToolSubst, FindTool
+
+
+def shjoin(args, sep=" "):
+    return sep.join([shlex.quote(arg) for arg in args])
+
+
+# Configuration file for the 'lit' test runner.
+
+# name: The name of this test suite.
+config.name = "flang-rt"
+
+# testFormat: The test format to use to interpret tests.
+#
+# For now we require '&&' between commands, until they get globally killed and
+# the test runner updated.
+config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell)
+
+# suffixes: A list of file extensions to treat as test files.
+config.suffixes = [
+    ".c",
+    ".cpp",
+    ".f",
+    ".F",
+    ".ff",
+    ".FOR",
+    ".for",
+    ".f77",
+    ".f90",
+    ".F90",
+    ".ff90",
+    ".f95",
+    ".F95",
+    ".ff95",
+    ".fpp",
+    ".FPP",
+    ".cuf",
+    ".CUF",
+    ".f18",
+    ".F18",
+    ".f03",
+    ".F03",
+    ".f08",
+    ".F08",
+    ".ll",
+    ".fir",
+    ".mlir",
+]
+
+llvm_config.use_default_substitutions()
+
+# test_source_root: The root path where tests are located.
+config.test_source_root = os.path.dirname(__file__)
+
+# test_exec_root: The root path where tests should be run.
+# lit writes a '.lit_test_times.txt' file into this directory.
+config.test_exec_root = config.flang_rt_binary_test_dir
+
+# On MacOS, -isysroot is needed to build binaries.
+isysroot_flag = []
+if config.osx_sysroot:
+    isysroot_flag = ["-isysroot", config.osx_sysroot]
+
+tools = [
+    ToolSubst(
+        "%flang",
+        command=config.flang,
+        extra_args=isysroot_flag,
+        unresolved="fatal",
+    ),
+    ToolSubst(
+        "%clang",
+        command=FindTool("clang"),
+        extra_args=isysroot_flag,
+        unresolved="fatal",
+    ),
+    ToolSubst("%cc", command=config.cc, extra_args=isysroot_flag, unresolved="fatal"),
+]
+llvm_config.add_tool_substitutions(tools)
+
+# Let tests find LLVM's standard tools (FileCheck, split-file, not, ...)
+llvm_config.with_environment("PATH", config.llvm_tools_dir, append_path=True)
+
+# Include path for C headers that define Flang's Fortran ABI.
+config.substitutions.append(
+    ("%include", os.path.join(config.flang_source_dir, "include"))
+)
+
+# Library path of libflang_rt.runtime.a (for lib search path when using non-Flang driver for linking)
+config.substitutions.append(("%libdir", config.flang_rt_output_resource_lib_dir))
+
+# For CUDA offloading, additional steps (device linking) and libraries (cudart) are needed.
+if config.flang_rt_experimental_offload_support == "CUDA":
+    config.available_features.add("offload-cuda")
diff --git a/flang-rt/test/lit.site.cfg.py.in b/flang-rt/test/lit.site.cfg.py.in
new file mode 100644
index 0000000000000..662d076b1fe24
--- /dev/null
+++ b/flang-rt/test/lit.site.cfg.py.in
@@ -0,0 +1,19 @@
+@LIT_SITE_CFG_IN_HEADER@
+
+import sys
+
+config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
+config.flang_source_dir = "@FLANG_SOURCE_DIR@"
+config.flang_rt_source_dir = "@FLANG_RT_SOURCE_DIR@"
+config.flang_rt_binary_test_dir = os.path.dirname(__file__)
+config.flang_rt_output_resource_lib_dir = "@FLANG_RT_OUTPUT_RESOURCE_LIB_DIR@"
+config.flang_rt_experimental_offload_support = "@FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT@"
+config.cc = "@CMAKE_C_COMPILER@"
+config.flang = "@CMAKE_Fortran_COMPILER@"
+config.osx_sysroot = path(r"@CMAKE_OSX_SYSROOT@")
+
+import lit.llvm
+lit.llvm.initialize(lit_config, config)
+
+# Let the main config do the real work.
+lit_config.load_config(config, os.path.join(config.flang_rt_source_dir, 'test', 'lit.cfg.py'))
diff --git a/flang-rt/unittests/CMakeLists.txt b/flang-rt/unittests/CMakeLists.txt
new file mode 100644
index 0000000000000..e2a50d8b40a9d
--- /dev/null
+++ b/flang-rt/unittests/CMakeLists.txt
@@ -0,0 +1,105 @@
+#===-- unittests/CMakeLists.txt --------------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+# LLVM uses a modified version of GTest that uses LLVMSupport for console
+# output. Therefore it also needs to include files from LLVM. Unfortunately,
+# LLVM/GTest doesn't add the include search path itself. Limiting the scope
+# using target_include_directories does not work with
+# LLVM_INSTALL_GTEST=ON because llvm_gtest is then an IMPORTED library.
+include_directories("${LLVM_INCLUDE_DIR}" "${LLVM_MAIN_INCLUDE_DIR}")
+
+# Add GTest if not already present.
+# Using a function so LLVM_SUBPROJECT_TITLE does not propagate.
+function (build_gtest)
+  set(LLVM_SUBPROJECT_TITLE "Third-Party/Google Test")
+  add_subdirectory("${LLVM_THIRD_PARTY_DIR}/unittest" "${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest")
+endfunction ()
+if (NOT TARGET llvm_gtest)
+  build_gtest()
+endif ()
+
+if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG)
+  add_compile_options("-Wno-suggest-override")
+endif()
+
+
+# Target that depends on all unittests
+add_custom_target(FlangRTUnitTests)
+set_target_properties(FlangRTUnitTests PROPERTIES FOLDER "Flang-RT/Meta")
+
+
+function(add_flangrt_unittest_offload_properties target)
+  # Set CUDA_RESOLVE_DEVICE_SYMBOLS.
+  if (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
+    set_target_properties(${target}
+      PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON
+      )
+  endif()
+  # Enable OpenMP offload during linking. We may need to replace
+  # LINK_OPTIONS with COMPILE_OPTIONS when there are OpenMP offload
+  # unittests.
+  #
+  # FIXME: replace 'native' in --offload-arch option with the list
+  #        of targets that Fortran Runtime was built for.
+  if (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "OpenMP")
+    set_target_properties(${target}
+      PROPERTIES LINK_OPTIONS
+      "-fopenmp;--offload-arch=native"
+      )
+  endif()
+endfunction()
+
+
+function(add_flangrt_unittest test_dirname)
+  cmake_parse_arguments(ARG
+    ""
+    ""
+    "LINK_LIBS"
+    ${ARGN})
+
+  add_unittest(FlangRTUnitTests ${test_dirname} ${ARG_UNPARSED_ARGUMENTS})
+
+  target_link_libraries(${test_dirname} PRIVATE ${ARG_LINK_LIBS})
+  add_flangrt_unittest_offload_properties(${test_dirname})
+
+  # Required because LLVMSupport is compiled with this option.
+  # FIXME: According to CMake documentation, this is the default. Why is it
+  #        needed? LLVM's add_unittest doesn't set it either.
+  set_target_properties(${test_dirname}
+      PROPERTIES
+        MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>DLL"
+    )
+endfunction()
+
+function(add_flangrt_nongtest_unittest test_name)
+  cmake_parse_arguments(ARG
+    "SLOW_TEST"
+    ""
+    "LINK_LIBS"
+    ${ARGN})
+
+  if(ARG_SLOW_TEST)
+      set(suffix .slow)
+  else()
+      set(suffix .test)
+  endif()
+
+  add_executable(${test_name}${suffix} EXCLUDE_FROM_ALL ${ARG_UNPARSED_ARGUMENTS})
+  set_target_properties(${test_name}${suffix} PROPERTIES FOLDER "Flang-RT/Tests/Unit")
+
+  target_link_libraries(${test_name}${suffix} PRIVATE NonGTestTesting ${ARG_LINK_LIBS})
+
+  if(NOT ARG_SLOW_TEST)
+    add_dependencies(FlangRTUnitTests ${test_name}${suffix})
+  endif()
+
+  add_flangrt_unittest_offload_properties(${test_name}${suffix})
+endfunction()
+
+add_subdirectory(Evaluate)
+add_subdirectory(Runtime)
diff --git a/flang-rt/unittests/Evaluate/CMakeLists.txt b/flang-rt/unittests/Evaluate/CMakeLists.txt
new file mode 100644
index 0000000000000..526ec234d57da
--- /dev/null
+++ b/flang-rt/unittests/Evaluate/CMakeLists.txt
@@ -0,0 +1,21 @@
+#===-- unittests/Evaluate/CMakeLists.txt -----------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+add_flangrt_nongtest_unittest(reshape
+  reshape.cpp
+
+  LINK_LIBS
+    flang_rt.runtime.unittest
+)
+
+add_flangrt_nongtest_unittest(ISO-Fortran-binding
+  ISO-Fortran-binding.cpp
+
+  LINK_LIBS
+    flang_rt.runtime.unittest
+)
diff --git a/flang-rt/unittests/Runtime/CMakeLists.txt b/flang-rt/unittests/Runtime/CMakeLists.txt
new file mode 100644
index 0000000000000..61d0aba93b14b
--- /dev/null
+++ b/flang-rt/unittests/Runtime/CMakeLists.txt
@@ -0,0 +1,48 @@
+#===-- unittests/Runtime/CMakeLists.txt ------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+add_flangrt_unittest(RuntimeTests
+  AccessTest.cpp
+  Allocatable.cpp
+  ArrayConstructor.cpp
+  BufferTest.cpp
+  CharacterTest.cpp
+  CommandTest.cpp
+  Complex.cpp
+  CrashHandlerFixture.cpp
+  Derived.cpp
+  ExternalIOTest.cpp
+  Format.cpp
+  Inquiry.cpp
+  ListInputTest.cpp
+  LogicalFormatTest.cpp
+  Matmul.cpp
+  MatmulTranspose.cpp
+  MiscIntrinsic.cpp
+  Namelist.cpp
+  Numeric.cpp
+  NumericalFormatTest.cpp
+  Pointer.cpp
+  Ragged.cpp
+  Random.cpp
+  Reduction.cpp
+  RuntimeCrashTest.cpp
+  Stop.cpp
+  Support.cpp
+  Time.cpp
+  TemporaryStack.cpp
+  Transformational.cpp
+
+  LINK_LIBS
+    flang_rt.runtime.unittest
+)
+target_compile_definitions(RuntimeTests PRIVATE NOT_EXE="${LLVM_TOOLS_DIR}/not${CMAKE_EXECUTABLE_SUFFIX}")
+
+if (FLANG_RT_INCLUDE_CUF)
+  add_subdirectory(CUDA)
+endif ()
diff --git a/flang-rt/unittests/Runtime/CUDA/CMakeLists.txt b/flang-rt/unittests/Runtime/CUDA/CMakeLists.txt
new file mode 100644
index 0000000000000..cd69a6f472873
--- /dev/null
+++ b/flang-rt/unittests/Runtime/CUDA/CMakeLists.txt
@@ -0,0 +1,18 @@
+#===-- unittests/Runtime/CUDA/CMakeLists.txt -------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+add_flangrt_unittest(FlangCufRuntimeTests
+  Allocatable.cpp
+  AllocatorCUF.cpp
+  Memory.cpp
+)
+
+target_link_libraries(FlangCufRuntimeTests
+  PRIVATE
+  flang_rt.cuda
+)
diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index 944474acf294c..ac8f784fd811e 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -35,17 +35,6 @@ endif()
 
 option(FLANG_ENABLE_WERROR "Fail and stop building flang if a warning is triggered." OFF)
 
-# The out of tree builds of the compiler and the Fortran runtime
-# must use the same setting of FLANG_RUNTIME_F128_MATH_LIB
-# to be composable. Failure to synchronize this setting may result
-# in linking errors or fatal failures in F128 runtime functions.
-set(FLANG_RUNTIME_F128_MATH_LIB "" CACHE STRING
-  "Specifies the target library used for implementing IEEE-754 128-bit float \
-  math in F18 runtime, e.g. it might be libquadmath for targets where \
-  REAL(16) is mapped to __float128, or libm for targets where REAL(16) \
-  is mapped to long double, etc."
-  )
-
 # Check for a standalone build and configure as appropriate from
 # there.
 if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
@@ -254,7 +243,25 @@ else()
   include_directories(SYSTEM ${MLIR_TABLEGEN_OUTPUT_DIR})
 endif()
 
-option(FLANG_INCLUDE_RUNTIME "Build the runtime in-tree (deprecated; to be replaced with LLVM_ENABLE_RUNTIMES=flang-rt)" ON)
+set(FLANG_INCLUDE_RUNTIME_default ON)
+if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
+  set(FLANG_INCLUDE_RUNTIME_default OFF)
+endif ()
+option(FLANG_INCLUDE_RUNTIME "Build the runtime in-tree (deprecated; to be replaced with LLVM_ENABLE_RUNTIMES=flang-rt)" ${FLANG_INCLUDE_RUNTIME_default})
+if (FLANG_INCLUDE_RUNTIME)
+  if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
+    message(WARNING "Building Flang-RT using LLVM_ENABLE_RUNTIMES. FLANG_INCLUDE_RUNTIME=${FLANG_INCLUDE_RUNTIME} ignored.")
+    set(FLANG_INCLUDE_RUNTIME OFF)
+  else ()
+     message(STATUS "Building Flang-RT in-tree")
+  endif ()
+else ()
+  if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
+    message(STATUS "Building Flang-RT using LLVM_ENABLE_RUNTIMES")
+  else ()
+    message(STATUS "Not building Flang-RT. For a usable Fortran toolchain, either add LLVM_ENABLE_RUNTIMES=flang-rt, or compile a standalone Flang-RT.")
+  endif ()
+endif ()
 
 set(FLANG_TOOLS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH
     "Path for binary subdirectory (defaults to '${CMAKE_INSTALL_BINDIR}')")
@@ -364,20 +371,6 @@ if (FLANG_REPOSITORY_STRING)
   add_definitions(-DFLANG_REPOSITORY_STRING="${FLANG_REPOSITORY_STRING}")
 endif()
 
-if (FLANG_RUNTIME_F128_MATH_LIB)
-  add_compile_definitions(
-    FLANG_RUNTIME_F128_MATH_LIB="${FLANG_RUNTIME_F128_MATH_LIB}"
-    )
-endif()
-
-include(TestBigEndian)
-test_big_endian(IS_BIGENDIAN)
-if (IS_BIGENDIAN)
-  add_compile_definitions(FLANG_BIG_ENDIAN=1)
-else ()
-  add_compile_definitions(FLANG_LITTLE_ENDIAN=1)
-endif ()
-
 # Configure Flang's Version.inc file.
 configure_file(
   ${CMAKE_CURRENT_SOURCE_DIR}/include/flang/Version.inc.in
@@ -475,6 +468,7 @@ if (APPLE)
 endif()
 
 include(AddFlang)
+include(FlangCommon)
 
 if (FLANG_INCLUDE_TESTS)
   add_compile_definitions(FLANG_INCLUDE_TESTS=1)
@@ -568,7 +562,13 @@ include(GetClangResourceDir)
 get_clang_resource_dir(HEADER_BINARY_DIR PREFIX ${LLVM_LIBRARY_OUTPUT_INTDIR}/.. SUBDIR include)
 configure_file(
   ${FLANG_SOURCE_DIR}/include/flang/ISO_Fortran_binding.h
-  ${HEADER_BINARY_DIR}/ISO_Fortran_binding.h)
+  ${HEADER_BINARY_DIR}/ISO_Fortran_binding.h COPYONLY)
+
+# llvm-test-suite explicitly searches for this header file
+# (`ISO_FORTRAN_C_HEADER`), cannot hide it in Clang's resource dir.
+configure_file(
+  ${FLANG_SOURCE_DIR}/include/flang/ISO_Fortran_binding.h
+  ${LLVM_RUNTIME_OUTPUT_INTDIR}/../include/flang/ISO_Fortran_binding.h COPYONLY)
 
 # And also install it into the install area
 get_clang_resource_dir(HEADER_INSTALL_DIR SUBDIR include)
diff --git a/flang/cmake/modules/FlangCommon.cmake b/flang/cmake/modules/FlangCommon.cmake
new file mode 100644
index 0000000000000..1b8606843b224
--- /dev/null
+++ b/flang/cmake/modules/FlangCommon.cmake
@@ -0,0 +1,43 @@
+#===-- cmake/modules/FlangCommon.cmake -------------------------------------===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+#
+# CMake definitions shared between Flang and Flang-RT
+#
+#===------------------------------------------------------------------------===#
+
+# The out of tree builds of the compiler and the Fortran runtime
+# must use the same setting of FLANG_RUNTIME_F128_MATH_LIB
+# to be composable. Failure to synchronize this setting may result
+# in linking errors or fatal failures in F128 runtime functions.
+set(FLANG_RUNTIME_F128_MATH_LIB "" CACHE STRING
+  "Specifies the target library used for implementing IEEE-754 128-bit float \
+  math in F18 runtime, e.g. it might be libquadmath for targets where \
+  REAL(16) is mapped to __float128, or libm for targets where REAL(16) \
+  is mapped to long double, etc."
+  )
+if (FLANG_RUNTIME_F128_MATH_LIB)
+  add_compile_definitions(FLANG_RUNTIME_F128_MATH_LIB="${FLANG_RUNTIME_F128_MATH_LIB}")
+endif()
+
+# Check if 128-bit float computations can be done via long double
+check_cxx_source_compiles(
+  "#include <cfloat>
+   #if LDBL_MANT_DIG != 113
+   #error LDBL_MANT_DIG != 113
+   #endif
+   int main() { return 0; }
+  "
+  HAVE_LDBL_MANT_DIG_113)
+
+include(TestBigEndian)
+test_big_endian(IS_BIGENDIAN)
+if (IS_BIGENDIAN)
+  add_compile_definitions(FLANG_BIG_ENDIAN=1)
+else ()
+  add_compile_definitions(FLANG_LITTLE_ENDIAN=1)
+endif ()
diff --git a/flang/docs/GettingStarted.md b/flang/docs/GettingStarted.md
index e422a31a0b402..0b3b551ffbfba 100644
--- a/flang/docs/GettingStarted.md
+++ b/flang/docs/GettingStarted.md
@@ -30,7 +30,7 @@ https://llvm.org/docs/GettingStarted.html.
 All of the examples below use GCC as the C/C++ compilers and ninja as the build
 tool.
 
-### Building flang in tree
+### Building flang in tree with bootstrapped Flang-RT
 Building flang in tree means building flang along with all of the projects on
 which it depends.  These projects include mlir, clang, flang, openmp, and
 compiler-rt.  Note that compiler-rt is only needed to access libraries that
@@ -82,7 +82,7 @@ cmake \
   -DLLVM_TARGETS_TO_BUILD=host \
   -DLLVM_LIT_ARGS=-v \
   -DLLVM_ENABLE_PROJECTS="clang;mlir;flang;openmp" \
-  -DLLVM_ENABLE_RUNTIMES="compiler-rt" \
+  -DLLVM_ENABLE_RUNTIMES="compiler-rt;flang-rt" \
   ../llvm-project/llvm
 
 ninja
@@ -101,7 +101,7 @@ the cmake command above:
 To run the flang tests on this build, execute the command in the "build"
 directory:
 ```bash
-ninja check-flang
+ninja check-flang check-flang-rt
 ```
 
 To create the installed files:
@@ -111,34 +111,6 @@ ninja install
 echo "latest" > $INSTALLDIR/bin/versionrc
 ```
 
-To build compiler-rt:
-```bash
-cd $ROOTDIR
-rm -rf compiler-rt
-mkdir compiler-rt
-cd compiler-rt
-CC=$INSTALLDIR/bin/clang \
-CXX=$INSTALLDIR/bin/clang++ \
-cmake \
-  -G Ninja \
-  ../llvm-project/compiler-rt \
-  -DCMAKE_BUILD_TYPE=Release \
-  -DCMAKE_INSTALL_PREFIX=$INSTALLDIR \
-  -DCMAKE_CXX_STANDARD=11 \
-  -DCMAKE_C_CFLAGS=-mlong-double-128 \
-  -DCMAKE_CXX_CFLAGS=-mlong-double-128 \
-  -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-  -DCOMPILER_RT_BUILD_ORC=OFF \
-  -DCOMPILER_RT_BUILD_XRAY=OFF \
-  -DCOMPILER_RT_BUILD_MEMPROF=OFF \
-  -DCOMPILER_RT_BUILD_LIBFUZZER=OFF \
-  -DCOMPILER_RT_BUILD_SANITIZERS=OFF \
-  -DLLVM_CONFIG_PATH=$INSTALLDIR/bin/llvm-config
-
-ninja
-ninja install
-```
-
 Note that these instructions specify flang as one of the projects to build in
 the in tree build.  This is not strictly necessary for subsequent standalone
 builds, but doing so lets you run the flang tests to verify that the source
@@ -192,7 +164,32 @@ directory:
 ninja check-flang
 ```
 
-### Building flang runtime for accelerators
+To build Flang-RT (required for linking executables):
+```bash
+cd $ROOTDIR
+rm -rf flang-rt
+mkdir flang-rt
+cd flang-rt
+CC=$INSTALLDIR/bin/clang \
+CXX=$INSTALLDIR/bin/clang++ \
+cmake \
+  -G Ninja \
+  ../llvm-project/runtimes \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_INSTALL_PREFIX=$INSTALLDIR \
+  -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+  -DLLVM_ENABLE_RUNTIMES=flang-rt \
+  -DLLVM_BINARY_DIR=$ROOTDIR/build \
+  -DCMAKE_Fortran_COMPILER=$INSTALLDIR/bin/flang \
+  -DCMAKE_Fortran_COMPILER_WORKS=ON
+
+ninja
+ninja check-flang-rt
+ninja install
+```
+
+
+### Building Flang-RT for accelerators
 Flang runtime can be built for accelerators in experimental mode, i.e.
 complete enabling is WIP.  CUDA and OpenMP target offload builds
 are currently supported.
@@ -203,20 +200,21 @@ are currently supported.
 Clang with NVPTX backend and NVCC compilers are supported.
 
 ```bash
-cd llvm-project/flang
+cd llvm-project
 rm -rf build_flang_runtime
 mkdir build_flang_runtime
 cd build_flang_runtime
 
 cmake \
-  -DFLANG_EXPERIMENTAL_CUDA_RUNTIME=ON \
+  -DLLVM_ENABLE_RUNTIMES=flang-rt \
+  -DFLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=CUDA \
   -DCMAKE_CUDA_ARCHITECTURES=80 \
   -DCMAKE_C_COMPILER=clang \
   -DCMAKE_CXX_COMPILER=clang++ \
   -DCMAKE_CUDA_COMPILER=clang \
   -DCMAKE_CUDA_HOST_COMPILER=clang++ \
-  ../runtime/
-make -j flang-rt
+  ../runtimes/
+make flang-rt
 ```
 
 Note that the used version of `clang` must [support](https://releases.llvm.org/16.0.0/tools/clang/docs/ReleaseNotes.html#cuda-support)
@@ -225,21 +223,22 @@ CUDA toolkit installations, please use `-DCUDAToolkit_ROOT=/some/path`
 to specify the compatible version.
 
 ```bash
-cd llvm-project/flang
+cd llvm-project
 rm -rf build_flang_runtime
 mkdir build_flang_runtime
 cd build_flang_runtime
 
 cmake \
-  -DFLANG_EXPERIMENTAL_CUDA_RUNTIME=ON \
+  -DLLVM_ENABLE_RUNTIMES=flang-rt \
+  -DFLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=CUDA \
   -DCMAKE_CUDA_ARCHITECTURES=80 \
   -DCMAKE_C_COMPILER=clang \
   -DCMAKE_CXX_COMPILER=clang++ \
   -DCMAKE_CUDA_COMPILER=nvcc \
   -DCMAKE_CUDA_HOST_COMPILER=clang++ \
-  ../runtime/
+  ../runtimes/
 
-make -j flang-rt
+make flang-rt
 ```
 
 Note that `nvcc` might limit support to certain
@@ -251,50 +250,59 @@ code.  Note that the packaging of the libraries is different
 between [Clang](https://clang.llvm.org/docs/OffloadingDesign.html#linking-target-device-code) and NVCC, so the library must be linked using
 compatible compiler drivers.
 
-#### Building in-tree
+#### Building in-tree (bootstrapping build)
 One may build Flang runtime library along with building Flang itself
 by providing these additional CMake variables on top of the Flang in-tree
 build config:
 
 For example:
 ```bash
-  -DFLANG_EXPERIMENTAL_CUDA_RUNTIME=ON \
+  -DLLVM_ENABLE_RUNTIMES=flang-rt \
+  -DFLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=CUDA \
   -DCMAKE_CUDA_ARCHITECTURES=80 \
   -DCMAKE_C_COMPILER=clang \
   -DCMAKE_CXX_COMPILER=clang++ \
   -DCMAKE_CUDA_COMPILER=clang \
   -DCMAKE_CUDA_HOST_COMPILER=clang++ \
+  ../llvm
 ```
 
 Or:
 ```bash
-  -DFLANG_EXPERIMENTAL_CUDA_RUNTIME=ON \
+  -DLLVM_ENABLE_RUNTIMES=flang-rt \
+  -DFLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=CUDA \
   -DCMAKE_CUDA_ARCHITECTURES=80 \
   -DCMAKE_C_COMPILER=gcc \
   -DCMAKE_CXX_COMPILER=g++ \
   -DCMAKE_CUDA_COMPILER=nvcc \
   -DCMAKE_CUDA_HOST_COMPILER=g++ \
+  ../llvm
 ```
 
-Normal `make -j check-flang` will work with such CMake configuration.
+Normal `make check-flang` will work with such CMake configuration.
+Consider building in parallel using the `-j<jobs>` flag, where `<jobs>` is a
+number sufficiently low for all build jobs to fit into the available RAM. Using
+the number of hardware threads (`nprocs`) is likely too much for most
+commodity machines.
 
 ##### OpenMP target offload build
 Only Clang compiler is currently supported.
 
 ```bash
-cd llvm-project/flang
+cd llvm-project
 rm -rf build_flang_runtime
 mkdir build_flang_runtime
 cd build_flang_runtime
 
 cmake \
-  -DFLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD="host_device" \
+  -DLLVM_ENABLE_RUNTIMES=flang-rt \
+  -DFLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT="OpenMP" \
   -DCMAKE_C_COMPILER=clang \
   -DCMAKE_CXX_COMPILER=clang++ \
-  -DFLANG_OMP_DEVICE_ARCHITECTURES="all" \
-  ../runtime/
+  -DFLANG_RT_DEVICE_ARCHITECTURES=all \
+  ../runtimes/
 
-make -j flang-rt
+make flang-rt
 ```
 
 The result of the build is a "device-only" library, i.e. the host
diff --git a/flang/docs/ReleaseNotes.md b/flang/docs/ReleaseNotes.md
index 387d4b2e62e0f..9396d956e2233 100644
--- a/flang/docs/ReleaseNotes.md
+++ b/flang/docs/ReleaseNotes.md
@@ -43,6 +43,12 @@ page](https://llvm.org/releases/).
  * The CufRuntime_cuda_${version} library has been renamed to
    `flang_rt.cuda_${version}`.
 
+ * The Fortran Runtime library has been moved to a new top-level directory
+   named "flang-rt". It now supports the LLVM_ENABLE_RUNTIMES mechanism to
+   build Flang-RT for multiple target triples. libflang_rt.runtime.{a|so} will
+   now be emitted into Clang's per-target resource directory
+   (next to libclang_rt.*.*) where it is also found by Flang's driver.
+
 ## New Issues Found
 
 
diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py
index c6266f3976f7c..0ba80f9a03f2e 100644
--- a/flang/test/lit.cfg.py
+++ b/flang/test/lit.cfg.py
@@ -166,26 +166,6 @@
 if config.flang_include_runtime:
     config.available_features.add("flang-rt")
 
-# Define some variables to help us test that the flang runtime doesn't depend on
-# the C++ runtime libraries. For this we need a C compiler. If for some reason
-# we don't have one, we can just disable the test.
-if config.flang_include_runtime and config.cc:
-    libruntime = os.path.join(config.flang_lib_dir, "libflang_rt.runtime.a")
-    include = os.path.join(config.flang_src_dir, "include")
-
-    if (
-        os.path.isfile(libruntime)
-        and os.path.isdir(include)
-    ):
-        config.available_features.add("c-compiler")
-        tools.append(
-            ToolSubst(
-                "%cc", command=config.cc, extra_args=isysroot_flag, unresolved="fatal"
-            )
-        )
-        tools.append(ToolSubst("%libruntime", command=libruntime, unresolved="fatal"))
-        tools.append(ToolSubst("%include", command=include, unresolved="fatal"))
-
 # Add all the tools and their substitutions (if applicable). Use the search paths provided for
 # finding the tools.
 if config.flang_standalone_build:
diff --git a/flang/test/lit.site.cfg.py.in b/flang/test/lit.site.cfg.py.in
index 697ba3fa79763..5b66e592bcfee 100644
--- a/flang/test/lit.site.cfg.py.in
+++ b/flang/test/lit.site.cfg.py.in
@@ -11,18 +11,15 @@ config.llvm_target_triple_env = "@LLVM_TARGET_TRIPLE_ENV@"
 config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
 config.errc_messages = "@LLVM_LIT_ERRC_MESSAGES@"
 config.flang_obj_root = "@FLANG_BINARY_DIR@"
-config.flang_src_dir = "@FLANG_SOURCE_DIR@"
 config.flang_tools_dir = lit_config.substitute("@FLANG_TOOLS_DIR@")
 config.flang_intrinsic_modules_dir = "@FLANG_INTRINSIC_MODULES_DIR@"
 config.flang_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin"
-config.flang_lib_dir = "@CMAKE_BINARY_DIR@/lib"
 config.flang_test_triple = "@FLANG_TEST_TARGET_TRIPLE@"
 config.flang_examples = @LLVM_BUILD_EXAMPLES@
 config.python_executable = "@PYTHON_EXECUTABLE@"
 config.flang_standalone_build = @FLANG_STANDALONE_BUILD@
 config.has_plugins = @LLVM_ENABLE_PLUGINS@
 config.linked_bye_extension = @LLVM_BYE_LINK_INTO_TOOLS@
-config.cc = "@CMAKE_C_COMPILER@"
 config.osx_sysroot = path(r"@CMAKE_OSX_SYSROOT@")
 config.targets_to_build = "@TARGETS_TO_BUILD@"
 config.default_sysroot = "@DEFAULT_SYSROOT@"
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index f5293e8663243..88512d0f1dd96 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -164,12 +164,18 @@ if ("compiler-rt" IN_LIST LLVM_ENABLE_PROJECTS)
     "https://compiler-rt.llvm.org/ for building the runtimes.")
 endif()
 
+if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
+  if (NOT "flang" IN_LIST LLVM_ENABLE_PROJECTS)
+    message(FATAL_ERROR "Flang is not enabled, but is required for the Flang-RT runtime")
+  endif ()
+endif ()
+
 # Select the runtimes to build
 #
 # As we migrate runtimes to using the bootstrapping build, the set of default runtimes
 # should grow as we remove those runtimes from LLVM_ENABLE_PROJECTS above.
 set(LLVM_DEFAULT_RUNTIMES "libcxx;libcxxabi;libunwind")
-set(LLVM_SUPPORTED_RUNTIMES "libc;libunwind;libcxxabi;pstl;libcxx;compiler-rt;openmp;llvm-libgcc;offload")
+set(LLVM_SUPPORTED_RUNTIMES "libc;libunwind;libcxxabi;pstl;libcxx;compiler-rt;openmp;llvm-libgcc;offload;flang-rt")
 set(LLVM_ENABLE_RUNTIMES "" CACHE STRING
   "Semicolon-separated list of runtimes to build, or \"all\" (${LLVM_DEFAULT_RUNTIMES}). Supported runtimes are ${LLVM_SUPPORTED_RUNTIMES}.")
 if(LLVM_ENABLE_RUNTIMES STREQUAL "all")
diff --git a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake
index 55422c2a4c023..caccb62d666ce 100644
--- a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake
+++ b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake
@@ -38,6 +38,8 @@ endfunction()
 
 
 # llvm_ExternalProject_Add(name source_dir ...
+#   ENABLE_FORTRAN
+#     External project requires the Flang compiler
 #   USE_TOOLCHAIN
 #     Use just-built tools (see TOOLCHAIN_TOOLS)
 #   EXCLUDE_FROM_ALL
@@ -65,7 +67,7 @@ endfunction()
 #   )
 function(llvm_ExternalProject_Add name source_dir)
   cmake_parse_arguments(ARG
-    "USE_TOOLCHAIN;EXCLUDE_FROM_ALL;NO_INSTALL;ALWAYS_CLEAN"
+    "ENABLE_FORTRAN;USE_TOOLCHAIN;EXCLUDE_FROM_ALL;NO_INSTALL;ALWAYS_CLEAN"
     "SOURCE_DIR;FOLDER"
     "CMAKE_ARGS;TOOLCHAIN_TOOLS;RUNTIME_LIBRARIES;DEPENDS;EXTRA_TARGETS;PASSTHROUGH_PREFIXES;STRIP_TOOL;TARGET_TRIPLE"
     ${ARGN})
@@ -93,6 +95,9 @@ function(llvm_ExternalProject_Add name source_dir)
 
   if(NOT ARG_TOOLCHAIN_TOOLS)
     set(ARG_TOOLCHAIN_TOOLS clang)
+    if (ARG_ENABLE_FORTRAN)
+      list(APPEND ARG_TOOLCHAIN_TOOLS flang)
+    endif ()
     # AIX 64-bit XCOFF and big AR format is not yet supported in some of these tools.
     if(NOT _cmake_system_name STREQUAL AIX)
       list(APPEND ARG_TOOLCHAIN_TOOLS lld llvm-ar llvm-ranlib llvm-nm llvm-objdump)
@@ -143,6 +148,10 @@ function(llvm_ExternalProject_Add name source_dir)
     set(CLANG_IN_TOOLCHAIN On)
   endif()
 
+  if(flang IN_LIST TOOLCHAIN_TOOLS)
+    set(FLANG_IN_TOOLCHAIN On)
+  endif()
+
   if(RUNTIME_LIBRARIES AND CLANG_IN_TOOLCHAIN)
     list(APPEND TOOLCHAIN_BINS ${RUNTIME_LIBRARIES})
   endif()
@@ -225,6 +234,9 @@ function(llvm_ExternalProject_Add name source_dir)
                           -DCMAKE_ASM_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/clang${CMAKE_EXECUTABLE_SUFFIX})
       endif()
     endif()
+    if(FLANG_IN_TOOLCHAIN)
+      list(APPEND compiler_args -DCMAKE_Fortran_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/flang${CMAKE_EXECUTABLE_SUFFIX})
+    endif()
     if(lld IN_LIST TOOLCHAIN_TOOLS)
       if(is_msvc_target)
         list(APPEND compiler_args -DCMAKE_LINKER=${LLVM_RUNTIME_OUTPUT_INTDIR}/lld-link${CMAKE_EXECUTABLE_SUFFIX})
@@ -308,6 +320,7 @@ function(llvm_ExternalProject_Add name source_dir)
     set(compiler_args -DCMAKE_ASM_COMPILER=${CMAKE_ASM_COMPILER}
                       -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                       -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                      -DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}
                       -DCMAKE_LINKER=${CMAKE_LINKER}
                       -DCMAKE_AR=${CMAKE_AR}
                       -DCMAKE_RANLIB=${CMAKE_RANLIB}
@@ -357,6 +370,7 @@ function(llvm_ExternalProject_Add name source_dir)
   if(ARG_TARGET_TRIPLE)
     list(APPEND compiler_args -DCMAKE_C_COMPILER_TARGET=${ARG_TARGET_TRIPLE})
     list(APPEND compiler_args -DCMAKE_CXX_COMPILER_TARGET=${ARG_TARGET_TRIPLE})
+    list(APPEND compiler_args -DCMAKE_Fortran_COMPILER_TARGET=${ARG_TARGET_TRIPLE})
     list(APPEND compiler_args -DCMAKE_ASM_COMPILER_TARGET=${ARG_TARGET_TRIPLE})
   endif()
 
diff --git a/llvm/projects/CMakeLists.txt b/llvm/projects/CMakeLists.txt
index 08f2fa522420b..f254cf10806d7 100644
--- a/llvm/projects/CMakeLists.txt
+++ b/llvm/projects/CMakeLists.txt
@@ -11,7 +11,8 @@ foreach(entry ${entries})
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/libunwind) AND
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/test-suite) AND
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/openmp) AND
-       (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/cross-project-tests))
+       (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/cross-project-tests) AND
+       (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/flang-rt))
       get_filename_component(entry_name "${entry}" NAME)
       add_llvm_external_project(${entry_name})
     endif()
@@ -37,6 +38,7 @@ if(${LLVM_BUILD_RUNTIME})
   if(NOT LLVM_BUILD_EXTERNAL_COMPILER_RT)
     add_llvm_external_project(compiler-rt)
   endif()
+  add_llvm_external_project(flang-rt)
 endif()
 
 add_llvm_external_project(dragonegg)
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index 70e85c123e412..2370b41fb7f0b 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -230,7 +230,7 @@ foreach(entry ${runtimes})
 endforeach()
 
 function(runtime_default_target)
-  cmake_parse_arguments(ARG "" "" "DEPENDS;CMAKE_ARGS;PREFIXES" ${ARGN})
+  cmake_parse_arguments(ARG "" "" "DEPENDS;CMAKE_ARGS;PREFIXES;EXTRA_ARGS" ${ARGN})
 
   include(${LLVM_BINARY_DIR}/runtimes/Components.cmake OPTIONAL)
   set(SUB_CHECK_TARGETS ${SUB_CHECK_TARGETS} PARENT_SCOPE)
@@ -270,14 +270,16 @@ function(runtime_default_target)
                                       -DLLVM_BUILD_TOOLS=${LLVM_BUILD_TOOLS}
                                       -DCMAKE_C_COMPILER_WORKS=ON
                                       -DCMAKE_CXX_COMPILER_WORKS=ON
+                                      -DCMAKE_Fortran_COMPILER_WORKS=ON
                                       -DCMAKE_ASM_COMPILER_WORKS=ON
                                       ${COMMON_CMAKE_ARGS}
                                       ${RUNTIMES_CMAKE_ARGS}
                                       ${ARG_CMAKE_ARGS}
                            PASSTHROUGH_PREFIXES LLVM_ENABLE_RUNTIMES
                                                 LLVM_USE_LINKER
-                                                CUDA # For runtimes that may look for the CUDA SDK (libc, offload)
+                                                CUDA CMAKE_CUDA # For runtimes that may look for the CUDA compiler and/or SDK (libc, offload, flang-rt)
                                                 FFI # offload uses libffi
+                                                FLANG_RUNTIME # Shared between Flang and Flang-RT
                                                 ${ARG_PREFIXES}
                            EXTRA_TARGETS ${extra_targets}
                                          ${test_targets}
@@ -287,7 +289,7 @@ function(runtime_default_target)
                            USE_TOOLCHAIN
                            TARGET_TRIPLE ${LLVM_TARGET_TRIPLE}
                            FOLDER "Runtimes"
-                           ${EXTRA_ARGS})
+                           ${EXTRA_ARGS} ${ARG_EXTRA_ARGS})
 endfunction()
 
 # runtime_register_target(name)
@@ -404,6 +406,7 @@ function(runtime_register_target name)
                                       -DLLVM_ENABLE_PER_TARGET_RUNTIME_DIR=${LLVM_ENABLE_PER_TARGET_RUNTIME_DIR}
                                       -DCMAKE_C_COMPILER_WORKS=ON
                                       -DCMAKE_CXX_COMPILER_WORKS=ON
+                                      -DCMAKE_Fortran_COMPILER_WORKS=ON
                                       -DCMAKE_ASM_COMPILER_WORKS=ON
                                       -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON
                                       -DLLVM_RUNTIMES_TARGET=${name}
@@ -463,10 +466,13 @@ if(build_runtimes)
   # together in a single CMake invocation.
   set(extra_deps "")
   set(extra_cmake_args "")
+  set(extra_args "")
 
   if(LLVM_INCLUDE_TESTS)
     foreach(dep FileCheck
                 clang
+                clang-offload-packager
+                flang
                 count
                 lld
                 lli
@@ -549,19 +555,24 @@ if(build_runtimes)
   if(LLVM_LIBC_FULL_BUILD)
     list(APPEND extra_cmake_args "-DLLVM_LIBC_FULL_BUILD=ON")
   endif()
+  if("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
+    list(APPEND extra_args ENABLE_FORTRAN)
+  endif ()
 
   if(NOT LLVM_RUNTIME_TARGETS)
     runtime_default_target(
       DEPENDS ${builtins_dep} ${extra_deps}
       CMAKE_ARGS ${extra_cmake_args}
-      PREFIXES ${prefixes})
+      PREFIXES ${prefixes}
+      EXTRA_ARGS ${extra_args})
     set(test_targets check-runtimes)
   else()
     if("default" IN_LIST LLVM_RUNTIME_TARGETS)
       runtime_default_target(
         DEPENDS ${builtins_dep} ${extra_deps}
         CMAKE_ARGS ${extra_cmake_args}
-        PREFIXES ${prefixes})
+        PREFIXES ${prefixes}
+        EXTRA_ARGS ${extra_args})
       list(REMOVE_ITEM LLVM_RUNTIME_TARGETS "default")
     else()
       add_custom_target(runtimes)
@@ -608,7 +619,7 @@ if(build_runtimes)
       runtime_register_target(${name}
         DEPENDS ${builtins_dep_name} ${extra_deps}
         CMAKE_ARGS -DLLVM_DEFAULT_TARGET_TRIPLE=${name} ${extra_cmake_args}
-        EXTRA_ARGS TARGET_TRIPLE ${name})
+        EXTRA_ARGS TARGET_TRIPLE ${name} ${extra_args})
     endforeach()
 
     foreach(multilib ${LLVM_RUNTIME_MULTILIBS})
@@ -620,7 +631,7 @@ if(build_runtimes)
                      -DLLVM_RUNTIMES_LIBDIR_SUBDIR=${multilib}
                      ${extra_cmake_args}
           BASE_NAME ${name}
-          EXTRA_ARGS TARGET_TRIPLE ${name})
+          EXTRA_ARGS TARGET_TRIPLE ${name} ${extra_args})
       endforeach()
     endforeach()
   endif()
diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index 4a6b317a03f66..7f1e2ae065d6c 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -36,7 +36,7 @@ list(INSERT CMAKE_MODULE_PATH 0
 # We order libraries to mirror roughly how they are layered, except that compiler-rt can depend
 # on libc++, so we put it after.
 set(LLVM_DEFAULT_RUNTIMES "libc;libunwind;libcxxabi;pstl;libcxx;compiler-rt;openmp;offload")
-set(LLVM_SUPPORTED_RUNTIMES "${LLVM_DEFAULT_RUNTIMES};llvm-libgcc")
+set(LLVM_SUPPORTED_RUNTIMES "${LLVM_DEFAULT_RUNTIMES};llvm-libgcc;flang-rt")
 set(LLVM_ENABLE_RUNTIMES "" CACHE STRING
   "Semicolon-separated list of runtimes to build, or \"all\" (${LLVM_DEFAULT_RUNTIMES}). Supported runtimes are ${LLVM_SUPPORTED_RUNTIMES}.")
 if(LLVM_ENABLE_RUNTIMES STREQUAL "all" )

From 9f6b7b4e5f792bc5167a3bcfab400160cc1803ed Mon Sep 17 00:00:00 2001
From: Michael Flanders <flanders.michaelk@gmail.com>
Date: Sun, 16 Feb 2025 09:58:43 -0600
Subject: [PATCH 062/109] [analyzer] StackAddrEscapeChecker: also check return
 for child stack frames (#126986)

Fixes #123459.

This changes checking of the returned expr to also look for memory
regions whose stack frame context was a child of the current stack frame
context, e.g., for cases like this given in #123459:

```
struct S { int *p; };
S f() {
  S s;
  {
    int a = 1;
    s.p = &a;
  }
  return s;
}
```
---
 .../Checkers/StackAddrEscapeChecker.cpp       |  8 +++-
 clang/test/Analysis/stack-addr-ps.cpp         | 45 +++++++++++++++++++
 clang/test/Analysis/stackaddrleak.c           | 22 +++++++++
 3 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp
index c9df15ceb3b40..2a22e8e10efb0 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp
@@ -274,7 +274,13 @@ class FindStackRegionsSymbolVisitor final : public SymbolVisitor {
   void SaveIfEscapes(const MemRegion *MR) {
     const StackSpaceRegion *SSR =
         MR->getMemorySpace()->getAs<StackSpaceRegion>();
-    if (SSR && SSR->getStackFrame() == PoppedStackFrame)
+
+    if (!SSR)
+      return;
+
+    const StackFrameContext *CapturedSFC = SSR->getStackFrame();
+    if (CapturedSFC == PoppedStackFrame ||
+        PoppedStackFrame->isParentOf(CapturedSFC))
       EscapingStackRegions.push_back(MR);
   }
 
diff --git a/clang/test/Analysis/stack-addr-ps.cpp b/clang/test/Analysis/stack-addr-ps.cpp
index bf988d0a16959..2e509f358b49f 100644
--- a/clang/test/Analysis/stack-addr-ps.cpp
+++ b/clang/test/Analysis/stack-addr-ps.cpp
@@ -982,6 +982,51 @@ int& ret_local_field_ref() {
 }
 } //namespace return_address_of_true_positives
 
+namespace return_from_child_block_scope {
+struct S {
+  int *p;
+};
+
+S return_child_stack_context() {
+  S s;
+  {
+    int a = 1;
+    s = (S){ &a };
+  }
+  return s; // expected-warning {{Address of stack memory associated with local variable 'a' returned to caller}}
+}
+
+S return_child_stack_context_field() {
+  S s;
+  {
+    int a = 1;
+    s.p = &a;
+  }
+  return s; // expected-warning {{Address of stack memory associated with local variable 'a' returned to caller}}
+}
+
+// The below are reproducers from Issue #123459
+template <typename V>
+struct T {
+    V* q{};
+    T() = default;
+    T(T&& rhs) { q = rhs.q; rhs.q = nullptr;}
+    T& operator=(T&& rhs) { q = rhs.q; rhs.q = nullptr;}
+    void push_back(const V& v) { if (q == nullptr) q = new V(v); }
+    ~T() { delete q; }
+};
+
+T<S> f() {
+    T<S> t;
+    {
+        int a = 1;
+        t.push_back({ &a });
+    }
+    return t; // expected-warning {{Address of stack memory associated with local variable 'a' returned to caller}}
+}
+
+} // namespace return_from_child_block_scope
+
 namespace true_negatives_return_expressions {
 struct Container { int *x; };
 
diff --git a/clang/test/Analysis/stackaddrleak.c b/clang/test/Analysis/stackaddrleak.c
index f8101525401b0..95175996e8274 100644
--- a/clang/test/Analysis/stackaddrleak.c
+++ b/clang/test/Analysis/stackaddrleak.c
@@ -68,3 +68,25 @@ int *g_no_lifetime_bound() {
   int i = 0;
   return f_no_lifetime_bound(&i); // no-warning
 }
+
+struct child_stack_context_s {
+  int *p;
+};
+
+struct child_stack_context_s return_child_stack_context() {
+  struct child_stack_context_s s;
+  {
+    int a = 1;
+    s = (struct child_stack_context_s){ &a };
+  }
+  return s; // expected-warning {{Address of stack memory associated with local variable 'a' returned to caller}}
+}
+
+struct child_stack_context_s return_child_stack_context_field() {
+  struct child_stack_context_s s;
+  {
+    int a = 1;
+    s.p = &a;
+  }
+  return s; // expected-warning {{Address of stack memory associated with local variable 'a' returned to caller}}
+}

From 1c87e4739f487aea1fbafa06b92ec1a1c011c6f2 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 16 Feb 2025 08:14:20 -0800
Subject: [PATCH 063/109] [AST] Avoid repeated map lookups (NFC) (#127369)

---
 clang/lib/AST/ExternalASTMerger.cpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/clang/lib/AST/ExternalASTMerger.cpp b/clang/lib/AST/ExternalASTMerger.cpp
index 257e8338dedef..1c903b5104bf4 100644
--- a/clang/lib/AST/ExternalASTMerger.cpp
+++ b/clang/lib/AST/ExternalASTMerger.cpp
@@ -206,16 +206,14 @@ class LazyASTImporter : public ASTImporter {
                << "\n";
       Source<DeclContext *> FromDC(
           cast<DeclContext>(From)->getPrimaryContext());
-      if (FromOrigins.count(FromDC) &&
-          Parent.HasImporterForOrigin(*FromOrigins.at(FromDC).AST)) {
+      if (auto It = FromOrigins.find(FromDC);
+          It != FromOrigins.end() &&
+          Parent.HasImporterForOrigin(*It->second.AST)) {
         if (LoggingEnabled)
-          logs() << "(ExternalASTMerger*)" << (void*)&Parent
-                 << " forced origin (DeclContext*)"
-                 << (void*)FromOrigins.at(FromDC).DC
-                 << ", (ASTContext*)"
-                 << (void*)FromOrigins.at(FromDC).AST
-                 << "\n";
-        Parent.ForceRecordOrigin(ToDC, FromOrigins.at(FromDC));
+          logs() << "(ExternalASTMerger*)" << (void *)&Parent
+                 << " forced origin (DeclContext*)" << (void *)It->second.DC
+                 << ", (ASTContext*)" << (void *)It->second.AST << "\n";
+        Parent.ForceRecordOrigin(ToDC, It->second);
       } else {
         if (LoggingEnabled)
           logs() << "(ExternalASTMerger*)" << (void*)&Parent

From 0bae0bf8ba73bd0201c58a6cfd6d9f54aaf39ca2 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 16 Feb 2025 08:14:42 -0800
Subject: [PATCH 064/109] [clang-tidy] Avoid repeated hash lookups (NFC)
 (#127370)

---
 .../clang-tidy/bugprone/VirtualNearMissCheck.cpp            | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp
index 76fa2d916f0e8..509fce3a38471 100644
--- a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp
@@ -179,15 +179,15 @@ static bool checkOverrideByDerivedMethod(const CXXMethodDecl *BaseMD,
 
 bool VirtualNearMissCheck::isPossibleToBeOverridden(
     const CXXMethodDecl *BaseMD) {
-  auto Iter = PossibleMap.find(BaseMD);
-  if (Iter != PossibleMap.end())
+  auto [Iter, Inserted] = PossibleMap.try_emplace(BaseMD);
+  if (!Inserted)
     return Iter->second;
 
   bool IsPossible = !BaseMD->isImplicit() && !isa<CXXConstructorDecl>(BaseMD) &&
                     !isa<CXXDestructorDecl>(BaseMD) && BaseMD->isVirtual() &&
                     !BaseMD->isOverloadedOperator() &&
                     !isa<CXXConversionDecl>(BaseMD);
-  PossibleMap[BaseMD] = IsPossible;
+  Iter->second = IsPossible;
   return IsPossible;
 }
 

From 8d752467e0e023f9b2dc83ca1829f75024f0440d Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 16 Feb 2025 08:15:08 -0800
Subject: [PATCH 065/109] [AMDGPU] Avoid repeated hash lookups (NFC) (#127371)

---
 llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 17207773b4858..c0581e491720d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -948,12 +948,13 @@ Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
 
   SmallVector<Constant *> Elements;
   for (auto *GV : Variables) {
-    if (!LDSParams.LDSToReplacementIndicesMap.contains(GV)) {
+    auto It = LDSParams.LDSToReplacementIndicesMap.find(GV);
+    if (It == LDSParams.LDSToReplacementIndicesMap.end()) {
       Elements.push_back(
           PoisonValue::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS)));
       continue;
     }
-    auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
+    auto &Indices = It->second;
     Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
                           ConstantInt::get(Int32Ty, Indices[1]),
                           ConstantInt::get(Int32Ty, Indices[2])};

From 03235540558c869841cf016ebd7e1aeb7eb341b0 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 16 Feb 2025 08:15:36 -0800
Subject: [PATCH 066/109] [GlobalISel] Avoid repeated hash lookups (NFC)
 (#127372)

---
 llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp
index 45b403bdd0765..f338f66997657 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp
@@ -305,10 +305,10 @@ LegacyLegalizerInfo::findScalarLegalAction(const InstrAspect &Aspect) const {
   if (Aspect.Opcode < FirstOp || Aspect.Opcode > LastOp)
     return {NotFound, LLT()};
   const unsigned OpcodeIdx = getOpcodeIdxForOpcode(Aspect.Opcode);
-  if (Aspect.Type.isPointer() &&
-      AddrSpace2PointerActions[OpcodeIdx].find(Aspect.Type.getAddressSpace()) ==
-          AddrSpace2PointerActions[OpcodeIdx].end()) {
-    return {NotFound, LLT()};
+  if (Aspect.Type.isPointer()) {
+    auto &PA = AddrSpace2PointerActions[OpcodeIdx];
+    if (PA.find(Aspect.Type.getAddressSpace()) == PA.end())
+      return {NotFound, LLT()};
   }
   const SmallVector<SizeAndActionsVec, 1> &Actions =
       Aspect.Type.isPointer()

From e81f7ca63738d04c9f3f2a4542832d6f510e65bc Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 16 Feb 2025 08:16:10 -0800
Subject: [PATCH 067/109] [TableGen] Avoid repeated hash lookups (NFC)
 (#127373)

---
 llvm/utils/TableGen/AsmMatcherEmitter.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index fcc82e598c437..24822c847046d 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -2333,9 +2333,9 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
     OS << "  // " << InstructionConversionKinds[Row] << "\n";
     OS << "  { ";
     for (unsigned i = 0, e = ConversionTable[Row].size(); i != e; i += 2) {
-      OS << OperandConversionKinds[ConversionTable[Row][i]] << ", ";
-      if (OperandConversionKinds[ConversionTable[Row][i]] !=
-          CachedHashString("CVT_Tied")) {
+      const auto &OCK = OperandConversionKinds[ConversionTable[Row][i]];
+      OS << OCK << ", ";
+      if (OCK != CachedHashString("CVT_Tied")) {
         OS << (unsigned)(ConversionTable[Row][i + 1]) << ", ";
         continue;
       }

From ec880b1450c5b9526d6310d1a66cf3a5297551de Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 16 Feb 2025 08:16:30 -0800
Subject: [PATCH 068/109] [X86] Avoid repeated map lookups (NFC) (#127374)

---
 llvm/lib/Target/X86/X86LowerAMXType.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index cb6127fb85749..2ea78d44d67cf 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -190,8 +190,8 @@ Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V,
 
 Value *ShapeCalculator::getColFromRow(Instruction *II, Value *V,
                                       unsigned Granularity) {
-  if (Row2Col.count(V))
-    return Row2Col[V];
+  if (auto It = Row2Col.find(V); It != Row2Col.end())
+    return It->second;
   IRBuilder<> Builder(II);
   Value *RealCol = nullptr;
   if (isa<ConstantInt>(V))

From d235b72178adc710bf704078fbe0cd687642f3e0 Mon Sep 17 00:00:00 2001
From: yronglin <yronglin777@gmail.com>
Date: Mon, 17 Feb 2025 00:36:08 +0800
Subject: [PATCH 069/109] Reapply "[Analyzer][CFG] Correctly handle rebuilt
 default arg and default init expression" (#127338)

This PR reapplies https://github.com/llvm/llvm-project/pull/117437.
The issue has been fixed by the 2nd commit: we need to ignore parens in
CXXDefaultArgExpr when building the CFG, because
CXXDefaultArgExpr::getExpr strips off the top-level FullExpr and
ConstantExpr, so a ParenExpr may occur at the top level.

---------

Signed-off-by: yronglin <yronglin777@gmail.com>
---
 clang/docs/ReleaseNotes.rst                   |  4 +
 clang/lib/AST/ParentMap.cpp                   | 17 +++++
 clang/lib/Analysis/CFG.cpp                    | 54 ++++++++++---
 clang/lib/Analysis/ReachableCode.cpp          | 37 ++++-----
 clang/lib/Sema/SemaExpr.cpp                   |  9 ++-
 clang/lib/StaticAnalyzer/Core/ExprEngine.cpp  | 56 ++++++++------
 clang/test/AST/ast-dump-recovery.cpp          |  2 +-
 .../Analysis/lifetime-extended-regions.cpp    |  7 +-
 clang/test/SemaCXX/cxx2c-placeholder-vars.cpp |  8 +-
 clang/test/SemaCXX/warn-unreachable.cpp       | 75 +++++++++++++++++++
 10 files changed, 208 insertions(+), 61 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index efaacdf18d50a..6272f32fa845a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -270,6 +270,10 @@ Code Completion
 Static Analyzer
 ---------------
 
+- Clang now supports extending the lifetime of objects bound to
+  reference members of aggregates in CFG and ExprEngine when they are
+  created from a default member initializer.
+
 New features
 ^^^^^^^^^^^^
 
diff --git a/clang/lib/AST/ParentMap.cpp b/clang/lib/AST/ParentMap.cpp
index e62e71bf5a514..580613b2618fb 100644
--- a/clang/lib/AST/ParentMap.cpp
+++ b/clang/lib/AST/ParentMap.cpp
@@ -13,6 +13,7 @@
 #include "clang/AST/ParentMap.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/Expr.h"
+#include "clang/AST/ExprCXX.h"
 #include "clang/AST/StmtObjC.h"
 #include "llvm/ADT/DenseMap.h"
 
@@ -103,6 +104,22 @@ static void BuildParentMap(MapTy& M, Stmt* S,
       BuildParentMap(M, SubStmt, OVMode);
     }
     break;
+  case Stmt::CXXDefaultArgExprClass:
+    if (auto *Arg = dyn_cast<CXXDefaultArgExpr>(S)) {
+      if (Arg->hasRewrittenInit()) {
+        M[Arg->getExpr()] = S;
+        BuildParentMap(M, Arg->getExpr(), OVMode);
+      }
+    }
+    break;
+  case Stmt::CXXDefaultInitExprClass:
+    if (auto *Init = dyn_cast<CXXDefaultInitExpr>(S)) {
+      if (Init->hasRewrittenInit()) {
+        M[Init->getExpr()] = S;
+        BuildParentMap(M, Init->getExpr(), OVMode);
+      }
+    }
+    break;
   default:
     for (Stmt *SubStmt : S->children()) {
       if (SubStmt) {
diff --git a/clang/lib/Analysis/CFG.cpp b/clang/lib/Analysis/CFG.cpp
index 3e144395cffc6..c82dbc42fb9d8 100644
--- a/clang/lib/Analysis/CFG.cpp
+++ b/clang/lib/Analysis/CFG.cpp
@@ -556,6 +556,10 @@ class CFGBuilder {
 
 private:
   // Visitors to walk an AST and construct the CFG.
+  CFGBlock *VisitCXXDefaultArgExpr(CXXDefaultArgExpr *Default,
+                                   AddStmtChoice asc);
+  CFGBlock *VisitCXXDefaultInitExpr(CXXDefaultInitExpr *Default,
+                                    AddStmtChoice asc);
   CFGBlock *VisitInitListExpr(InitListExpr *ILE, AddStmtChoice asc);
   CFGBlock *VisitAddrLabelExpr(AddrLabelExpr *A, AddStmtChoice asc);
   CFGBlock *VisitAttributedStmt(AttributedStmt *A, AddStmtChoice asc);
@@ -2263,16 +2267,10 @@ CFGBlock *CFGBuilder::Visit(Stmt * S, AddStmtChoice asc,
                                    asc, ExternallyDestructed);
 
     case Stmt::CXXDefaultArgExprClass:
+      return VisitCXXDefaultArgExpr(cast<CXXDefaultArgExpr>(S), asc);
+
     case Stmt::CXXDefaultInitExprClass:
-      // FIXME: The expression inside a CXXDefaultArgExpr is owned by the
-      // called function's declaration, not by the caller. If we simply add
-      // this expression to the CFG, we could end up with the same Expr
-      // appearing multiple times (PR13385).
-      //
-      // It's likewise possible for multiple CXXDefaultInitExprs for the same
-      // expression to be used in the same function (through aggregate
-      // initialization).
-      return VisitStmt(S, asc);
+      return VisitCXXDefaultInitExpr(cast<CXXDefaultInitExpr>(S), asc);
 
     case Stmt::CXXBindTemporaryExprClass:
       return VisitCXXBindTemporaryExpr(cast<CXXBindTemporaryExpr>(S), asc);
@@ -2442,6 +2440,44 @@ CFGBlock *CFGBuilder::VisitChildren(Stmt *S) {
   return B;
 }
 
+CFGBlock *CFGBuilder::VisitCXXDefaultArgExpr(CXXDefaultArgExpr *Arg,
+                                             AddStmtChoice asc) {
+  if (Arg->hasRewrittenInit()) {
+    if (asc.alwaysAdd(*this, Arg)) {
+      autoCreateBlock();
+      appendStmt(Block, Arg);
+    }
+    return VisitStmt(Arg->getExpr()->IgnoreParens(), asc);
+  }
+
+  // We can't add the default argument if it's not rewritten because the
+  // expression inside a CXXDefaultArgExpr is owned by the called function's
+  // declaration, not by the caller; otherwise we could end up with the same
+  // expression appearing multiple times.
+  return VisitStmt(Arg, asc);
+}
+
+CFGBlock *CFGBuilder::VisitCXXDefaultInitExpr(CXXDefaultInitExpr *Init,
+                                              AddStmtChoice asc) {
+  if (Init->hasRewrittenInit()) {
+    if (asc.alwaysAdd(*this, Init)) {
+      autoCreateBlock();
+      appendStmt(Block, Init);
+    }
+
+    // Unlike CXXDefaultArgExpr::getExpr, which strips off the top-level
+    // FullExpr and ConstantExpr, CXXDefaultInitExpr::getExpr does not, so we
+    // don't need to ignore ParenExprs: the top level will not be a ParenExpr.
+    return VisitStmt(Init->getExpr(), asc);
+  }
+
+  // We can't add the default initializer if it's not rewritten because multiple
+  // CXXDefaultInitExprs for the same sub-expression may be used in the same
+  // function (through aggregate initialization), so we could end up with the
+  // same expression appearing multiple times.
+  return VisitStmt(Init, asc);
+}
+
 CFGBlock *CFGBuilder::VisitInitListExpr(InitListExpr *ILE, AddStmtChoice asc) {
   if (asc.alwaysAdd(*this, ILE)) {
     autoCreateBlock();
diff --git a/clang/lib/Analysis/ReachableCode.cpp b/clang/lib/Analysis/ReachableCode.cpp
index dd81c8e0a3d54..3b1f716f8dea1 100644
--- a/clang/lib/Analysis/ReachableCode.cpp
+++ b/clang/lib/Analysis/ReachableCode.cpp
@@ -454,11 +454,12 @@ bool DeadCodeScan::isDeadCodeRoot(const clang::CFGBlock *Block) {
   return isDeadRoot;
 }
 
-// Check if the given `DeadStmt` is a coroutine statement and is a substmt of
-// the coroutine statement. `Block` is the CFGBlock containing the `DeadStmt`.
-static bool isInCoroutineStmt(const Stmt *DeadStmt, const CFGBlock *Block) {
+// Check if the given `DeadStmt` is one of the target statements or is a
+// sub-stmt of one of them. `Block` is the CFGBlock containing the `DeadStmt`.
+template <class... Ts>
+static bool isDeadStmtInOneOf(const Stmt *DeadStmt, const CFGBlock *Block) {
   // The coroutine statement, co_return, co_await, or co_yield.
-  const Stmt *CoroStmt = nullptr;
+  const Stmt *TargetStmt = nullptr;
   // Find the first coroutine statement after the DeadStmt in the block.
   bool AfterDeadStmt = false;
   for (CFGBlock::const_iterator I = Block->begin(), E = Block->end(); I != E;
@@ -467,32 +468,27 @@ static bool isInCoroutineStmt(const Stmt *DeadStmt, const CFGBlock *Block) {
       const Stmt *S = CS->getStmt();
       if (S == DeadStmt)
         AfterDeadStmt = true;
-      if (AfterDeadStmt &&
-          // For simplicity, we only check simple coroutine statements.
-          (llvm::isa<CoreturnStmt>(S) || llvm::isa<CoroutineSuspendExpr>(S))) {
-        CoroStmt = S;
+      if (AfterDeadStmt && llvm::isa<Ts...>(S)) {
+        TargetStmt = S;
         break;
       }
     }
-  if (!CoroStmt)
+  if (!TargetStmt)
     return false;
   struct Checker : DynamicRecursiveASTVisitor {
     const Stmt *DeadStmt;
-    bool CoroutineSubStmt = false;
-    Checker(const Stmt *S) : DeadStmt(S) {
-      // Statements captured in the CFG can be implicit.
-      ShouldVisitImplicitCode = true;
-    }
+    bool IsSubStmtOfTargetStmt = false;
+    Checker(const Stmt *S) : DeadStmt(S) { ShouldVisitImplicitCode = true; }
 
     bool VisitStmt(Stmt *S) override {
       if (S == DeadStmt)
-        CoroutineSubStmt = true;
+        IsSubStmtOfTargetStmt = true;
       return true;
     }
   };
   Checker checker(DeadStmt);
-  checker.TraverseStmt(const_cast<Stmt *>(CoroStmt));
-  return checker.CoroutineSubStmt;
+  checker.TraverseStmt(const_cast<Stmt *>(TargetStmt));
+  return checker.IsSubStmtOfTargetStmt;
 }
 
 static bool isValidDeadStmt(const Stmt *S, const clang::CFGBlock *Block) {
@@ -503,7 +499,12 @@ static bool isValidDeadStmt(const Stmt *S, const clang::CFGBlock *Block) {
   // Coroutine statements are never considered dead statements, because removing
   // them may change the function semantic if it is the only coroutine statement
   // of the coroutine.
-  return !isInCoroutineStmt(S, Block);
+  //
+  // If the dead stmt is a sub-stmt of CXXDefaultInitExpr or CXXDefaultArgExpr,
+  // we would rather expect to find the CXXDefaultInitExpr or CXXDefaultArgExpr
+  // itself as a valid dead stmt.
+  return !isDeadStmtInOneOf<CoreturnStmt, CoroutineSuspendExpr,
+                            CXXDefaultArgExpr, CXXDefaultInitExpr>(S, Block);
 }
 
 const Stmt *DeadCodeScan::findDeadCode(const clang::CFGBlock *Block) {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 3cd4010740d19..5817632b61dbd 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -5570,8 +5570,10 @@ ExprResult Sema::BuildCXXDefaultArgExpr(SourceLocation CallLoc,
           /*SkipImmediateInvocations=*/NestedDefaultChecking))
     return ExprError();
 
+  Expr *RewrittenExpr = (Init == Param->getDefaultArg() ? nullptr : Init);
   return CXXDefaultArgExpr::Create(Context, InitializationContext->Loc, Param,
-                                   Init, InitializationContext->Context);
+                                   RewrittenExpr,
+                                   InitializationContext->Context);
 }
 
 static FieldDecl *FindFieldDeclInstantiationPattern(const ASTContext &Ctx,
@@ -5689,10 +5691,11 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) {
       return ExprError();
     }
     Init = Res.get();
-
+    Expr *RewrittenInit =
+        (Init == Field->getInClassInitializer() ? nullptr : Init);
     return CXXDefaultInitExpr::Create(Context, InitializationContext->Loc,
                                       Field, InitializationContext->Context,
-                                      Init);
+                                      RewrittenInit);
   }
 
   // DR1351:
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 69cf2dd6fc14e..d93952264a606 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1991,33 +1991,45 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
       ExplodedNodeSet Tmp;
       StmtNodeBuilder Bldr2(PreVisit, Tmp, *currBldrCtx);
 
-      const Expr *ArgE;
-      if (const auto *DefE = dyn_cast<CXXDefaultArgExpr>(S))
+      bool HasRebuiltInit = false;
+      const Expr *ArgE = nullptr;
+      if (const auto *DefE = dyn_cast<CXXDefaultArgExpr>(S)) {
         ArgE = DefE->getExpr();
-      else if (const auto *DefE = dyn_cast<CXXDefaultInitExpr>(S))
+        HasRebuiltInit = DefE->hasRewrittenInit();
+      } else if (const auto *DefE = dyn_cast<CXXDefaultInitExpr>(S)) {
         ArgE = DefE->getExpr();
-      else
+        HasRebuiltInit = DefE->hasRewrittenInit();
+      } else
         llvm_unreachable("unknown constant wrapper kind");
 
-      bool IsTemporary = false;
-      if (const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(ArgE)) {
-        ArgE = MTE->getSubExpr();
-        IsTemporary = true;
-      }
+      if (HasRebuiltInit) {
+        for (auto *N : PreVisit) {
+          ProgramStateRef state = N->getState();
+          const LocationContext *LCtx = N->getLocationContext();
+          state = state->BindExpr(S, LCtx, state->getSVal(ArgE, LCtx));
+          Bldr2.generateNode(S, N, state);
+        }
+      } else {
+        // If it's not rewritten, the contents of these expressions are not
+        // actually part of the current function, so we fall back to constant
+        // evaluation.
+        bool IsTemporary = false;
+        if (const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(ArgE)) {
+          ArgE = MTE->getSubExpr();
+          IsTemporary = true;
+        }
+
+        std::optional<SVal> ConstantVal = svalBuilder.getConstantVal(ArgE);
+        const LocationContext *LCtx = Pred->getLocationContext();
+        for (auto *I : PreVisit) {
+          ProgramStateRef State = I->getState();
+          State = State->BindExpr(S, LCtx, ConstantVal.value_or(UnknownVal()));
+          if (IsTemporary)
+            State = createTemporaryRegionIfNeeded(State, LCtx, cast<Expr>(S),
+                                                  cast<Expr>(S));
 
-      std::optional<SVal> ConstantVal = svalBuilder.getConstantVal(ArgE);
-      if (!ConstantVal)
-        ConstantVal = UnknownVal();
-
-      const LocationContext *LCtx = Pred->getLocationContext();
-      for (const auto I : PreVisit) {
-        ProgramStateRef State = I->getState();
-        State = State->BindExpr(S, LCtx, *ConstantVal);
-        if (IsTemporary)
-          State = createTemporaryRegionIfNeeded(State, LCtx,
-                                                cast<Expr>(S),
-                                                cast<Expr>(S));
-        Bldr2.generateNode(S, I, State);
+          Bldr2.generateNode(S, I, State);
+        }
       }
 
       getCheckerManager().runCheckersForPostStmt(Dst, Tmp, S, *this);
diff --git a/clang/test/AST/ast-dump-recovery.cpp b/clang/test/AST/ast-dump-recovery.cpp
index b59fa3778192f..fa6d747556dd8 100644
--- a/clang/test/AST/ast-dump-recovery.cpp
+++ b/clang/test/AST/ast-dump-recovery.cpp
@@ -507,7 +507,7 @@ union U {
 // CHECK-NEXT:      `-DeclStmt {{.*}}
 // CHECK-NEXT:        `-VarDecl {{.*}} g 'U':'GH112560::U' listinit
 // CHECK-NEXT:          `-InitListExpr {{.*}} 'U':'GH112560::U' contains-errors field Field {{.*}} 'f' 'int'
-// CHECK-NEXT:            `-CXXDefaultInitExpr {{.*}} 'int' contains-errors has rewritten init
+// CHECK-NEXT:            `-CXXDefaultInitExpr {{.*}} 'int' contains-errors
 // CHECK-NEXT:              `-RecoveryExpr {{.*}} 'int' contains-errors
 // DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors
 void foo() {
diff --git a/clang/test/Analysis/lifetime-extended-regions.cpp b/clang/test/Analysis/lifetime-extended-regions.cpp
index 4458ad294af7c..02a1210d9af92 100644
--- a/clang/test/Analysis/lifetime-extended-regions.cpp
+++ b/clang/test/Analysis/lifetime-extended-regions.cpp
@@ -121,11 +121,10 @@ void aggregateWithReferences() {
   clang_analyzer_dump(viaReference.rx); // expected-warning-re {{&lifetime_extended_object{int, viaReference, S{{[0-9]+}}} }}
   clang_analyzer_dump(viaReference.ry); // expected-warning-re {{&lifetime_extended_object{Composite, viaReference, S{{[0-9]+}}} }}
   
-  // FIXME: clang currently support extending lifetime of object bound to reference members of aggregates,
-  // that are created from default member initializer. But CFG and ExprEngine need to be updated to address this change.
-  // The following expect warning: {{&lifetime_extended_object{Composite, defaultInitExtended, S{{[0-9]+}}} }}
+  // The lifetime of an object that is created by a default member initializer
+  // and bound to a reference member of an aggregate is extended.
   RefAggregate defaultInitExtended{i};
-  clang_analyzer_dump(defaultInitExtended.ry); // expected-warning {{Unknown }}
+  clang_analyzer_dump(defaultInitExtended.ry); // expected-warning-re {{&lifetime_extended_object{Composite, defaultInitExtended, S{{[0-9]+}}} }}
 }
 
 void lambda() {
diff --git a/clang/test/SemaCXX/cxx2c-placeholder-vars.cpp b/clang/test/SemaCXX/cxx2c-placeholder-vars.cpp
index 8e428c0ef0427..37824c16f4f05 100644
--- a/clang/test/SemaCXX/cxx2c-placeholder-vars.cpp
+++ b/clang/test/SemaCXX/cxx2c-placeholder-vars.cpp
@@ -274,16 +274,16 @@ void f() {
 // CHECK: ClassTemplateSpecializationDecl {{.*}} struct A definition
 // CHECK: CXXConstructorDecl {{.*}} implicit used constexpr A 'void () noexcept'
 // CHECK-NEXT: CXXCtorInitializer Field {{.*}} '_' 'int'
-// CHECK-NEXT: CXXDefaultInitExpr {{.*}} 'int' has rewritten init
+// CHECK-NEXT: CXXDefaultInitExpr {{.*}} 'int'
 // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 1
 // CHECK-NEXT: CXXCtorInitializer Field {{.*}} '_' 'int'
-// CHECK-NEXT: CXXDefaultInitExpr {{.*}} 'int' has rewritten init
+// CHECK-NEXT: CXXDefaultInitExpr {{.*}} 'int'
 // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2
 // CHECK-NEXT: CXXCtorInitializer Field {{.*}} 'a' 'int'
-// CHECK-NEXT: CXXDefaultInitExpr {{.*}} 'int' has rewritten init
+// CHECK-NEXT: CXXDefaultInitExpr {{.*}} 'int'
 // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3
 // CHECK-NEXT: CXXCtorInitializer Field {{.*}} '_' 'int'
-// CHECK-NEXT: CXXDefaultInitExpr {{.*}} 'int' has rewritten init
+// CHECK-NEXT: CXXDefaultInitExpr {{.*}} 'int'
 // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4
 // CHECK-NEXT: CompoundStmt {{.*}}
 
diff --git a/clang/test/SemaCXX/warn-unreachable.cpp b/clang/test/SemaCXX/warn-unreachable.cpp
index e6f5bc5ef8e12..c96f26e196451 100644
--- a/clang/test/SemaCXX/warn-unreachable.cpp
+++ b/clang/test/SemaCXX/warn-unreachable.cpp
@@ -414,3 +414,78 @@ void tautological_compare(bool x, int y) {
     calledFun();
 
 }
+
+namespace test_rebuilt_default_arg {
+struct A {
+  explicit A(int = __builtin_LINE());
+};
+
+int h(int a) {
+  return 3;
+  A();  // expected-warning {{will never be executed}}
+}
+
+struct Temp {
+  Temp();
+  ~Temp();
+};
+
+struct B {
+  explicit B(const Temp &t = Temp());
+};
+int f(int a) {
+  return 3;
+  B();  // expected-warning {{will never be executed}}
+}
+} // namespace test_rebuilt_default_arg
+namespace test_rebuilt_default_init {
+
+struct A {
+  A();
+  ~A();
+};
+
+struct B {
+  const A &t = A();
+};
+int f(int a) {
+  return 3;
+  A{};  // expected-warning {{will never be executed}}
+}
+} // namespace test_rebuilt_default_init
+
+// This issue was reported in the comments of https://github.com/llvm/llvm-project/pull/117437.
+// All block-level expressions should have already been IgnoreParens()ed.
+namespace gh117437_ignore_parens_in_default_arg {
+  class Location {
+    public:
+      static Location Current(int = __builtin_LINE());
+    };
+    class DOMMatrix;
+    class BasicMember {
+    public:
+      BasicMember(DOMMatrix *);
+    };
+    template <typename> using Member = BasicMember;
+    class ExceptionState {
+    public:
+      ExceptionState &ReturnThis();
+      ExceptionState(Location);
+    };
+    class NonThrowableExceptionState : public ExceptionState {
+    public:
+      NonThrowableExceptionState(Location location = Location::Current())
+          : ExceptionState(location) {}
+    };
+    class DOMMatrix {
+    public:
+      static DOMMatrix *
+      Create(int *, ExceptionState & = (NonThrowableExceptionState().ReturnThis()));
+    };
+    class CSSMatrixComponent {
+      int CSSMatrixComponent_matrix;
+      CSSMatrixComponent()
+          : matrix_(DOMMatrix::Create(&CSSMatrixComponent_matrix)) {}
+      Member<DOMMatrix> matrix_;
+    };
+} // namespace gh117437_ignore_parens_in_default_arg

From 6812fc02fbb81d679f95d5c3e15768ae11e1bad8 Mon Sep 17 00:00:00 2001
From: lonely eagle <2020382038@qq.com>
Date: Mon, 17 Feb 2025 00:50:04 +0800
Subject: [PATCH 070/109] [mlir][affine] make affine-loop-unroll-jam interface
 pass (#127402)

Made affine-loop-unroll-jam an interface pass, so it can run on the gpu
module.
---
 mlir/include/mlir/Dialect/Affine/Passes.h     |  2 +-
 mlir/include/mlir/Dialect/Affine/Passes.td    |  2 +-
 .../Affine/Transforms/LoopUnrollAndJam.cpp    |  2 +-
 mlir/test/Dialect/Affine/unroll-jam.mlir      | 45 ++++++++++++++++++-
 4 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h
index ea5034b60d8bd..96bd3c6a9a7bc 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.h
+++ b/mlir/include/mlir/Dialect/Affine/Passes.h
@@ -102,7 +102,7 @@ std::unique_ptr<InterfacePass<FunctionOpInterface>> createLoopUnrollPass(
 /// Creates a loop unroll jam pass to unroll jam by the specified factor. A
 /// factor of -1 lets the pass use the default factor or the one on the command
 /// line if provided.
-std::unique_ptr<OperationPass<func::FuncOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
 createLoopUnrollAndJamPass(int unrollJamFactor = -1);
 
 /// Creates a pass to pipeline explicit movement of data across levels of the
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
index 5325d3b0a1d69..728b8d25efcf2 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -219,7 +219,7 @@ def AffineLoopUnroll : InterfacePass<"affine-loop-unroll", "FunctionOpInterface"
   ];
 }
 
-def AffineLoopUnrollAndJam : Pass<"affine-loop-unroll-jam", "func::FuncOp"> {
+def AffineLoopUnrollAndJam : InterfacePass<"affine-loop-unroll-jam", "FunctionOpInterface"> {
   let summary = "Unroll and jam affine loops";
   let constructor = "mlir::affine::createLoopUnrollAndJamPass()";
   let options = [
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp
index a79160df7575a..13640f085951e 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp
@@ -75,7 +75,7 @@ struct LoopUnrollAndJam
 };
 } // namespace
 
-std::unique_ptr<OperationPass<func::FuncOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
 mlir::affine::createLoopUnrollAndJamPass(int unrollJamFactor) {
   return std::make_unique<LoopUnrollAndJam>(
       unrollJamFactor == -1 ? std::nullopt
diff --git a/mlir/test/Dialect/Affine/unroll-jam.mlir b/mlir/test/Dialect/Affine/unroll-jam.mlir
index 7874580b1c39c..8ed7fccf7d251 100644
--- a/mlir/test/Dialect/Affine/unroll-jam.mlir
+++ b/mlir/test/Dialect/Affine/unroll-jam.mlir
@@ -1,5 +1,6 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll-jam="unroll-jam-factor=2" | FileCheck %s
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll-jam="unroll-jam-factor=4" | FileCheck --check-prefix=UJAM-FOUR %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll-jam{unroll-jam-factor=2}))" | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll-jam{unroll-jam-factor=4}))" | FileCheck --check-prefix=UJAM-FOUR %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(gpu.module(gpu.func(affine-loop-unroll-jam{unroll-jam-factor=2})))" | FileCheck --check-prefix=GPU-HJAM %s
 
 // CHECK-DAG: [[$MAP_PLUS_1:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)>
 // CHECK-DAG: [[$MAP_DIV_OFFSET:#map[0-9]*]] = affine_map<()[s0] -> (((s0 - 1) floordiv 2) * 2 + 1)>
@@ -10,6 +11,8 @@
 // UJAM-FOUR-DAG: [[$MAP_PLUS_2:#map[0-9]*]] = affine_map<(d0) -> (d0 + 2)>
 // UJAM-FOUR-DAG: [[$MAP_PLUS_3:#map[0-9]*]] = affine_map<(d0) -> (d0 + 3)>
 
+// GPU-HJAM-DAG: [[$MAP_PLUS_1:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)>
+
 // CHECK-LABEL: func @unroll_jam_imperfect_nest() {
 func.func @unroll_jam_imperfect_nest() {
   affine.for %i = 0 to 101 {
@@ -46,6 +49,44 @@ func.func @unroll_jam_imperfect_nest() {
 // CHECK-NEXT: "foo"(%c100, %{{.*}})
 // CHECK-NEXT: return
 
+gpu.module @unroll_jam {
+  // GPU-HJAM-LABEL: func @unroll_jam_imperfect_nest() {
+  gpu.func @unroll_jam_imperfect_nest() {
+    affine.for %i = 0 to 101 {
+      %x = "addi32"(%i, %i) : (index, index) -> i32
+      affine.for %j = 0 to 17 {
+        %y = "addi32"(%i, %i) : (index, index) -> i32
+        %z = "addi32"(%y, %y) : (i32, i32) -> i32
+      }
+      %w = "foo"(%i, %x) : (index, i32) -> i32
+    }
+    gpu.return
+  }
+  // GPU-HJAM:      affine.for [[IV0:%arg[0-9]+]] = 0 to 100 step 2 {
+  // GPU-HJAM-NEXT:   [[RES1:%[0-9]+]] = "addi32"([[IV0]], [[IV0]])
+  // GPU-HJAM-NEXT:   [[INC:%[0-9]+]] = affine.apply [[$MAP_PLUS_1]]([[IV0]])
+  // GPU-HJAM-NEXT:   [[RES2:%[0-9]+]] = "addi32"([[INC]], [[INC]])
+  // GPU-HJAM-NEXT:   affine.for %{{.*}} = 0 to 17 {
+  // GPU-HJAM-NEXT:     [[RES3:%[0-9]+]] = "addi32"([[IV0]], [[IV0]])
+  // GPU-HJAM-NEXT:     "addi32"([[RES3]], [[RES3]]) : (i32, i32) -> i32
+  // GPU-HJAM-NEXT:     [[INC1:%[0-9]+]] = affine.apply [[$MAP_PLUS_1]]([[IV0]])
+  // GPU-HJAM-NEXT:     [[RES4:%[0-9]+]] = "addi32"([[INC1]], [[INC1]])
+  // GPU-HJAM-NEXT:     "addi32"([[RES4]], [[RES4]]) : (i32, i32) -> i32
+  // GPU-HJAM-NEXT:   }
+  // GPU-HJAM:        "foo"([[IV0]], [[RES1]])
+  // GPU-HJAM-NEXT:   affine.apply [[$MAP_PLUS_1]]([[IV0]])
+  // GPU-HJAM-NEXT:   "foo"({{.*}}, [[RES2]])
+  // GPU-HJAM:      }
+  // Cleanup loop (single iteration).
+  // GPU-HJAM:      "addi32"(%c100, %c100)
+  // GPU-HJAM-NEXT: affine.for [[IV0]] = 0 to 17 {
+  // GPU-HJAM-NEXT:   [[RESC:%[0-9]+]] = "addi32"(%c100, %c100)
+  // GPU-HJAM-NEXT:   "addi32"([[RESC]], [[RESC]]) : (i32, i32) -> i32
+  // GPU-HJAM-NEXT: }
+  // GPU-HJAM-NEXT: "foo"(%c100, %{{.*}})
+  // GPU-HJAM-NEXT: return
+}
+
 // CHECK-LABEL: func @loop_nest_unknown_count_1
 // CHECK-SAME: [[N:arg[0-9]+]]: index
 func.func @loop_nest_unknown_count_1(%N : index) {

From 6e94007623ca9d98d090fe04491f21ec72a5d0d4 Mon Sep 17 00:00:00 2001
From: Yeaseen <yeaseen.arafat96@gmail.com>
Date: Sun, 16 Feb 2025 11:44:46 -0700
Subject: [PATCH 071/109] [llvm] Remove `br i1 undef` in some
 `llvm/test/CodeGen` tests (#127368)

This PR replaces some instances of `br i1 undef` with a function argument
value (or with `poison`) in several tests under the `llvm/test/CodeGen/`
directory. This PR is a continuation of PR #125460.
---
 .../CodeGen/AArch64/arm64-storebytesmerge.ll  |  4 ++--
 llvm/test/CodeGen/AArch64/br-to-eh-lpad.ll    |  6 +++---
 llvm/test/CodeGen/AArch64/br-undef-cond.ll    |  2 +-
 llvm/test/CodeGen/AArch64/gep-nullptr.ll      |  4 ++--
 .../machine-sink-getmemoperandwithoffset.mir  |  4 ++--
 llvm/test/CodeGen/AArch64/madd-combiner.ll    |  2 +-
 .../CodeGen/AArch64/optimize-cond-branch.ll   |  2 +-
 llvm/test/CodeGen/AArch64/shrink-wrap.ll      |  4 ++--
 .../CodeGen/AArch64/tail-call-unused-zext.ll  |  8 ++++----
 .../AArch64/tailcall-ssp-split-debug.ll       |  4 ++--
 .../CodeGen/AMDGPU/cgp-bitfield-extract.ll    | 20 +++++++++----------
 .../AMDGPU/dagcomb-shuffle-vecextend-non2.ll  |  4 ++--
 12 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll b/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll
index 188a4f07a33dc..db65fdde0ae25 100644
--- a/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll
@@ -14,12 +14,12 @@
 @q = external dso_local unnamed_addr global ptr, align 8
 
 ; Function Attrs: nounwind
-define void @test() local_unnamed_addr #0 {
+define void @test(i1 %arg) local_unnamed_addr #0 {
 entry:
   br label %for.body453.i
 
 for.body453.i:                                    ; preds = %for.body453.i, %entry
-  br i1 undef, label %for.body453.i, label %for.end705.i
+  br i1 %arg, label %for.body453.i, label %for.end705.i
 
 for.end705.i:                                     ; preds = %for.body453.i
   %0 = load ptr, ptr @q, align 8
diff --git a/llvm/test/CodeGen/AArch64/br-to-eh-lpad.ll b/llvm/test/CodeGen/AArch64/br-to-eh-lpad.ll
index 3ca6bab31c955..f17f7ff0f5f74 100644
--- a/llvm/test/CodeGen/AArch64/br-to-eh-lpad.ll
+++ b/llvm/test/CodeGen/AArch64/br-to-eh-lpad.ll
@@ -7,16 +7,16 @@
 ; that case, the machine verifier, which relies on analyzing branches for this
 ; kind of verification, is unable to check anything, so accepts the CFG.
 
-define void @test_branch_to_landingpad() personality ptr @__objc_personality_v0 {
+define void @test_branch_to_landingpad(i1 %arg) personality ptr @__objc_personality_v0 {
 entry:
-  br i1 undef, label %if.end50.thread, label %if.then6
+  br i1 %arg, label %if.end50.thread, label %if.then6
 
 lpad:
   %0 = landingpad { ptr, i32 }
           catch ptr @"OBJC_EHTYPE_$_NSString"
           catch ptr @OBJC_EHTYPE_id
           catch ptr null
-  br i1 undef, label %invoke.cont33, label %catch.fallthrough
+  br i1 %arg, label %invoke.cont33, label %catch.fallthrough
 
 catch.fallthrough:
   %matches31 = icmp eq i32 undef, 0
diff --git a/llvm/test/CodeGen/AArch64/br-undef-cond.ll b/llvm/test/CodeGen/AArch64/br-undef-cond.ll
index 785d1c883cdb9..de0fbfc10caa7 100644
--- a/llvm/test/CodeGen/AArch64/br-undef-cond.ll
+++ b/llvm/test/CodeGen/AArch64/br-undef-cond.ll
@@ -9,7 +9,7 @@ declare void @bar(ptr)
 
 define void @foo(ptr %m, i32 %off0) {
 .thread1653:
-  br i1 undef, label %0, label %.thread1880
+  br i1 poison, label %0, label %.thread1880
 
   %1 = icmp eq i32 undef, 0
   %.not = xor i1 %1, true
diff --git a/llvm/test/CodeGen/AArch64/gep-nullptr.ll b/llvm/test/CodeGen/AArch64/gep-nullptr.ll
index 8ac4314324ef4..d36d88236158e 100644
--- a/llvm/test/CodeGen/AArch64/gep-nullptr.ll
+++ b/llvm/test/CodeGen/AArch64/gep-nullptr.ll
@@ -6,9 +6,9 @@ target triple = "aarch64--linux-gnu"
 %unionMV = type { i32 }
 
 ; Function Attrs: nounwind
-define void @test(ptr %mi_block) {
+define void @test(ptr %mi_block, i1 %arg) {
 entry:
-  br i1 undef, label %for.body13.us, label %if.else
+  br i1 %arg, label %for.body13.us, label %if.else
 
 ; Just make sure we don't get a compiler ICE due to dereferncing a nullptr.
 ; CHECK-LABEL: test
diff --git a/llvm/test/CodeGen/AArch64/machine-sink-getmemoperandwithoffset.mir b/llvm/test/CodeGen/AArch64/machine-sink-getmemoperandwithoffset.mir
index 3a21333a2b570..4b157eb6176cc 100644
--- a/llvm/test/CodeGen/AArch64/machine-sink-getmemoperandwithoffset.mir
+++ b/llvm/test/CodeGen/AArch64/machine-sink-getmemoperandwithoffset.mir
@@ -1,8 +1,8 @@
 # RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass machine-sink -o - %s | FileCheck %s
 --- |
-  define i8 @g() {
+  define i8 @g(i1 %arg) {
   else.7:
-    br i1 undef, label %then.8, label %else.8, !make.implicit !0
+    br i1 %arg, label %then.8, label %else.8, !make.implicit !0
 
   then.8:                                           ; preds = %else.8, %else.7
     %merge = phi i8 [ 1, %else.7 ], [ 0, %else.8 ]
diff --git a/llvm/test/CodeGen/AArch64/madd-combiner.ll b/llvm/test/CodeGen/AArch64/madd-combiner.ll
index cfdeb3d97a5df..6e510712fbd21 100644
--- a/llvm/test/CodeGen/AArch64/madd-combiner.ll
+++ b/llvm/test/CodeGen/AArch64/madd-combiner.ll
@@ -53,7 +53,7 @@ define void @mul_add_imm2() {
 entry:
   br label %for.body
 for.body:
-  br i1 undef, label %for.body, label %for.body8
+  br i1 poison, label %for.body, label %for.body8
 for.body8:
   %0 = mul i64 undef, -3
   %mul1971 = add i64 %0, -3
diff --git a/llvm/test/CodeGen/AArch64/optimize-cond-branch.ll b/llvm/test/CodeGen/AArch64/optimize-cond-branch.ll
index ceed45489402e..fdf972990e745 100644
--- a/llvm/test/CodeGen/AArch64/optimize-cond-branch.ll
+++ b/llvm/test/CodeGen/AArch64/optimize-cond-branch.ll
@@ -38,7 +38,7 @@ define void @func() uwtable {
   br i1 %c0, label %b1, label %b6
 
 b1:
-  br i1 undef, label %b3, label %b2
+  br i1 poison, label %b3, label %b2
 
 b2:
   %v0 = tail call i32 @extfunc()
diff --git a/llvm/test/CodeGen/AArch64/shrink-wrap.ll b/llvm/test/CodeGen/AArch64/shrink-wrap.ll
index 518a0126e23d6..0caba809c1331 100644
--- a/llvm/test/CodeGen/AArch64/shrink-wrap.ll
+++ b/llvm/test/CodeGen/AArch64/shrink-wrap.ll
@@ -31,7 +31,7 @@ declare fastcc i32 @foo()
 
 declare fastcc i32 @bar()
 
-define internal fastcc i32 @func(i32 %alpha, i32 %beta) {
+define internal fastcc i32 @func(i32 %alpha, i32 %beta, i1 %arg) {
 entry:
   %v1 = alloca [2 x [11 x i32]], align 4
   %v2 = alloca [11 x i32], align 16
@@ -69,7 +69,7 @@ for.body:
   %a.0983 = phi i32 [ 1, %if.end.9 ], [ %a.1, %for.inc ]
   %arrayidx = getelementptr inbounds [62 x i32], ptr @g17, i64 0, i64 undef
   %tmp5 = load i32, ptr %arrayidx, align 4
-  br i1 undef, label %for.inc, label %if.else.51
+  br i1 %arg, label %for.inc, label %if.else.51
 
 if.else.51:
   %idxprom53 = sext i32 %tmp5 to i64
diff --git a/llvm/test/CodeGen/AArch64/tail-call-unused-zext.ll b/llvm/test/CodeGen/AArch64/tail-call-unused-zext.ll
index c38fb7b8c750d..0fae26781fd18 100644
--- a/llvm/test/CodeGen/AArch64/tail-call-unused-zext.ll
+++ b/llvm/test/CodeGen/AArch64/tail-call-unused-zext.ll
@@ -6,10 +6,10 @@
 ; the attributes of the caller and the callee match.
 
 declare zeroext i1 @zcallee()
-define void @zcaller() {
+define void @zcaller(i1 %arg) {
 ; CHECK-LABEL: name: zcaller
 entry:
-  br i1 undef, label %calllabel, label %retlabel
+  br i1 %arg, label %calllabel, label %retlabel
 calllabel:
 ; CHECK: bb.1.calllabel:
 ; CHECK-NOT: BL @zcallee
@@ -21,10 +21,10 @@ retlabel:
 }
 
 declare signext i1 @scallee()
-define void @scaller() {
+define void @scaller(i1 %arg) {
 ; CHECK-LABEL: name: scaller
 entry:
-  br i1 undef, label %calllabel, label %retlabel
+  br i1 %arg, label %calllabel, label %retlabel
 calllabel:
 ; CHECK: bb.1.calllabel:
 ; CHECK-NOT: BL @scallee
diff --git a/llvm/test/CodeGen/AArch64/tailcall-ssp-split-debug.ll b/llvm/test/CodeGen/AArch64/tailcall-ssp-split-debug.ll
index 3338485bb5a55..bf8bb6ae16ef2 100644
--- a/llvm/test/CodeGen/AArch64/tailcall-ssp-split-debug.ll
+++ b/llvm/test/CodeGen/AArch64/tailcall-ssp-split-debug.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -mtriple=arm64-apple-ios %s -o - | FileCheck %s
 
-define swifttailcc void @foo(ptr %call) ssp {
+define swifttailcc void @foo(ptr %call, i1 %arg) ssp {
 ; CHECK-LABEL: foo:
   %var = alloca [28 x i8], align 16
-  br i1 undef, label %if.then, label %if.end
+  br i1 %arg, label %if.then, label %if.end
 
 if.then:
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index f6611c6160fd1..14a96ac5c6338 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -35,10 +35,10 @@
 
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define amdgpu_kernel void @sink_ubfe_i32(ptr addrspace(1) %out, i32 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i32(ptr addrspace(1) %out, i32 %arg1, i1 %arg) #0 {
 entry:
   %shr = lshr i32 %arg1, 8
-  br i1 undef, label %bb0, label %bb1
+  br i1 %arg, label %bb0, label %bb1
 
 bb0:
   %val0 = and i32 %shr, 255
@@ -75,10 +75,10 @@ ret:
 ; OPT: ret
 
 ; GCN-LABEL: {{^}}sink_sbfe_i32:
-define amdgpu_kernel void @sink_sbfe_i32(ptr addrspace(1) %out, i32 %arg1) #0 {
+define amdgpu_kernel void @sink_sbfe_i32(ptr addrspace(1) %out, i32 %arg1, i1 %arg) #0 {
 entry:
   %shr = ashr i32 %arg1, 8
-  br i1 undef, label %bb0, label %bb1
+  br i1 %arg, label %bb0, label %bb1
 
 bb0:
   %val0 = and i32 %shr, 255
@@ -183,10 +183,10 @@ ret:
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xff, v[[LO]]
 
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(ptr addrspace(1) %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(ptr addrspace(1) %out, i64 %arg1, i1 %arg) #0 {
 entry:
   %shr = lshr i64 %arg1, 30
-  br i1 undef, label %bb0, label %bb1
+  br i1 %arg, label %bb0, label %bb1
 
 bb0:
   %val0 = and i64 %shr, 255
@@ -231,10 +231,10 @@ ret:
 ; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000f
 
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @sink_ubfe_i64_low32(ptr addrspace(1) %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_low32(ptr addrspace(1) %out, i64 %arg1, i1 %arg) #0 {
 entry:
   %shr = lshr i64 %arg1, 15
-  br i1 undef, label %bb0, label %bb1
+  br i1 %arg, label %bb0, label %bb1
 
 bb0:
   %val0 = and i64 %shr, 255
@@ -277,10 +277,10 @@ ret:
 ; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80003
 
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @sink_ubfe_i64_high32(ptr addrspace(1) %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_high32(ptr addrspace(1) %out, i64 %arg1, i1 %arg) #0 {
 entry:
   %shr = lshr i64 %arg1, 35
-  br i1 undef, label %bb0, label %bb1
+  br i1 %arg, label %bb0, label %bb1
 
 bb0:
   %val0 = and i64 %shr, 255
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-shuffle-vecextend-non2.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-shuffle-vecextend-non2.ll
index 873fa6436b3de..b5c1f1bc52fd4 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-shuffle-vecextend-non2.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-shuffle-vecextend-non2.ll
@@ -10,9 +10,9 @@
 ;
 ; GCN: s_endpgm
 
-define amdgpu_ps void @main(i32 %in1) local_unnamed_addr {
+define amdgpu_ps void @main(i32 %in1, i1 %arg) local_unnamed_addr {
 .entry:
-  br i1 undef, label %bb12, label %bb
+  br i1 %arg, label %bb12, label %bb
 
 bb:
   %__llpc_global_proxy_r5.12.vec.insert = insertelement <4 x i32> undef, i32 %in1, i32 3

From e080366a76b78a746c53caccf84661b109ccbc20 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 16 Feb 2025 19:56:12 +0100
Subject: [PATCH 072/109] [LAA] Inline hasComputableBounds in only caller,
 simplify isNoWrap.

Inline hasComputableBounds into createCheckForAccess. This removes a
level of indirection and allows for passing the AddRec directly to
isNoWrap, removing the need to retrieve the AddRec for the pointer
again.

The early continue for invariant SCEVs now also applies to forked
pointers (i.e. when there's more than one entry in TranslatedPtrs) when
ShouldCheckWrap is true, as those trivially won't wrap.

The change is NFC otherwise. replaceSymbolicStrideSCEV is now called
earlier.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 59 +++++++------------
 ...ter-dependence-analysis-forked-pointers.ll | 44 +++++++++++++-
 2 files changed, 63 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 43380b59ac49f..7d6dbd51a404d 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -793,26 +793,6 @@ class AccessAnalysis {
 
 } // end anonymous namespace
 
-/// Check whether a pointer can participate in a runtime bounds check.
-/// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
-/// by adding run-time checks (overflow checks) if necessary.
-static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
-                                const SCEV *PtrScev, Loop *L, bool Assume) {
-  // The bounds for loop-invariant pointer is trivial.
-  if (PSE.getSE()->isLoopInvariant(PtrScev, L))
-    return true;
-
-  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
-
-  if (!AR && Assume)
-    AR = PSE.getAsAddRec(Ptr);
-
-  if (!AR)
-    return false;
-
-  return AR->isAffine();
-}
-
 /// Try to compute the stride for \p AR. Used by getPtrStride.
 static std::optional<int64_t>
 getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
@@ -859,21 +839,9 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
                            PredicatedScalarEvolution &PSE, const Loop *L);
 
 /// Check whether a pointer address cannot wrap.
-static bool isNoWrap(PredicatedScalarEvolution &PSE,
-                     const DenseMap<Value *, const SCEV *> &Strides, Value *Ptr,
-                     Type *AccessTy, const Loop *L, bool Assume,
+static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
+                     Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
                      std::optional<int64_t> Stride = std::nullopt) {
-  const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
-  if (PSE.getSE()->isLoopInvariant(PtrScev, L))
-    return true;
-
-  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
-  if (!AR) {
-    if (!Assume)
-      return false;
-    AR = PSE.getAsAddRec(Ptr);
-  }
-
   // The address calculation must not wrap. Otherwise, a dependence could be
   // inverted.
   if (isNoWrapAddRec(Ptr, AR, PSE, L))
@@ -1143,14 +1111,27 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
   SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
       findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
 
+  /// Check whether all pointers can participate in a runtime bounds check. They
+  /// must either be invariant or AddRecs. If ShouldCheckWrap is true, they also
+  /// must not wrap.
   for (auto &P : TranslatedPtrs) {
-    if (!hasComputableBounds(PSE, Ptr, P.getPointer(), TheLoop, Assume))
+    // The bounds for a loop-invariant pointer are trivial.
+    if (PSE.getSE()->isLoopInvariant(P.getPointer(), TheLoop))
+      continue;
+
+    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(P.getPointer());
+    if (!AR && Assume)
+      AR = PSE.getAsAddRec(Ptr);
+    if (!AR || !AR->isAffine())
       return false;
 
     // If there's only one option for Ptr, look it up after bounds and wrap
     // checking, because assumptions might have been added to PSE.
-    if (TranslatedPtrs.size() == 1)
-      P.setPointer(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr));
+    if (TranslatedPtrs.size() == 1) {
+      AR =
+          cast<SCEVAddRecExpr>(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr));
+      P.setPointer(AR);
+    }
 
     // When we run after a failing dependency check we have to make sure
     // we don't have wrapping pointers.
@@ -1159,7 +1140,7 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
       if (TranslatedPtrs.size() > 1)
         return false;
 
-      if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop, Assume))
+      if (!isNoWrap(PSE, AR, Ptr, AccessTy, TheLoop, Assume))
         return false;
     }
   }
@@ -1548,7 +1529,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
   if (!ShouldCheckWrap || !Stride)
     return Stride;
 
-  if (isNoWrap(PSE, StridesMap, Ptr, AccessTy, Lp, Assume, Stride))
+  if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride))
     return Stride;
 
   LLVM_DEBUG(
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll
index 006ce430249fd..5e9dc7f2b91cc 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll
@@ -5,10 +5,52 @@
 define void @dependency_check_and_runtime_checks_needed_select_of_invariant_ptrs(ptr %a, ptr %b, ptr %c, i64 %offset, i64 %n) {
 ; CHECK-LABEL: 'dependency_check_and_runtime_checks_needed_select_of_invariant_ptrs'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: cannot check memory dependencies at runtime
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP1:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP2:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %select = select i1 %cmp, ptr %b, ptr %c
+; CHECK-NEXT:      Check 1:
+; CHECK-NEXT:        Comparing group ([[GRP1]]):
+; CHECK-NEXT:          %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP3:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %select = select i1 %cmp, ptr %b, ptr %c
+; CHECK-NEXT:      Check 2:
+; CHECK-NEXT:        Comparing group ([[GRP1]]):
+; CHECK-NEXT:          %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP4:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset
+; CHECK-NEXT:      Check 3:
+; CHECK-NEXT:        Comparing group ([[GRP2]]):
+; CHECK-NEXT:          %select = select i1 %cmp, ptr %b, ptr %c
+; CHECK-NEXT:        Against group ([[GRP3]]):
+; CHECK-NEXT:          %select = select i1 %cmp, ptr %b, ptr %c
+; CHECK-NEXT:      Check 4:
+; CHECK-NEXT:        Comparing group ([[GRP2]]):
+; CHECK-NEXT:          %select = select i1 %cmp, ptr %b, ptr %c
+; CHECK-NEXT:        Against group ([[GRP4]]):
+; CHECK-NEXT:          %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset
+; CHECK-NEXT:      Check 5:
+; CHECK-NEXT:        Comparing group ([[GRP3]]):
+; CHECK-NEXT:          %select = select i1 %cmp, ptr %b, ptr %c
+; CHECK-NEXT:        Against group ([[GRP4]]):
+; CHECK-NEXT:          %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset
 ; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP1]]:
+; CHECK-NEXT:          (Low: %a High: ((4 * %n) + %a))
+; CHECK-NEXT:            Member: {%a,+,4}<nuw><%loop>
+; CHECK-NEXT:        Group [[GRP2]]:
+; CHECK-NEXT:          (Low: %b High: (4 + %b))
+; CHECK-NEXT:            Member: %b
+; CHECK-NEXT:        Group [[GRP3]]:
+; CHECK-NEXT:          (Low: %c High: (4 + %c))
+; CHECK-NEXT:            Member: %c
+; CHECK-NEXT:        Group [[GRP4]]:
+; CHECK-NEXT:          (Low: ((4 * %offset) + %a) High: ((4 * %offset) + (4 * %n) + %a))
+; CHECK-NEXT:            Member: {((4 * %offset) + %a),+,4}<%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:

From f5d63ccb22bf98ec28785fea432fc5a19b1913c4 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 16 Feb 2025 20:55:08 +0100
Subject: [PATCH 073/109] [LICM] Add test with deref assumption of GEP.

---
 .../LICM/hoist-speculatable-load.ll           | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/llvm/test/Transforms/LICM/hoist-speculatable-load.ll b/llvm/test/Transforms/LICM/hoist-speculatable-load.ll
index 85411428a402f..a4a38c2eaadc3 100644
--- a/llvm/test/Transforms/LICM/hoist-speculatable-load.ll
+++ b/llvm/test/Transforms/LICM/hoist-speculatable-load.ll
@@ -164,4 +164,60 @@ exit:                                         ; preds = %if.end, %entry
   ret void
 }
 
+define void @f_chained_gep_with_nofree_nosync(ptr %ptr, ptr %ptr2, i1 %cond) nofree nosync {
+; CHECK-LABEL: @f_chained_gep_with_nofree_nosync(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 1
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[GEP]], i32 16), "dereferenceable"(ptr [[GEP]], i32 16) ]
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR_BODY_LR_PH:%.*]], label [[IF0:%.*]]
+; CHECK:       if0:
+; CHECK-NEXT:    store i32 0, ptr [[PTR2:%.*]], align 4
+; CHECK-NEXT:    br label [[FOR_BODY_LR_PH]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_END]], label [[IF:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[PTR2]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], 2
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %gep = getelementptr inbounds i32, ptr %ptr, i64 1
+  call void @llvm.assume(i1 true) [ "align"(ptr %gep, i32 16), "dereferenceable"(ptr %gep, i32 16) ]
+  br i1 %cond, label %for.body.lr.ph, label %if0
+
+if0:
+  store i32 0, ptr %ptr2, align 4
+  br label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %if0, %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %if.end
+  %i.08 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ]
+  br i1 %cond, label %if.end, label %if
+
+if:
+  %0 = load i32, ptr %gep, align 4, !invariant.load !{}
+  store i32 %0, ptr %ptr2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if, %for.body
+  %inc = add nuw nsw i32 %i.08, 1
+  %cmp = icmp slt i32 %inc, 2
+  br i1 %cmp, label %for.body, label %exit
+
+exit:                                         ; preds = %if.end
+  ret void
+}
+
+
 declare void @llvm.assume(i1 noundef)

From c22d84f7bb58005638b24f976582acf62a56d19d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 16 Feb 2025 12:12:00 -0800
Subject: [PATCH 074/109] [ELF] Refine ctx.arg.exportDynamic condition

--export-dynamic should be a no-op when ctx.hasDynsym is false.

* Drop unneeded ctx.hasDynsym checks.
* Static linking with --export-dynamic does not prevent devirtualization.
---
 lld/ELF/Driver.cpp                                  |  4 ++--
 lld/ELF/Symbols.cpp                                 |  5 +----
 lld/ELF/Writer.cpp                                  |  8 +++-----
 lld/test/ELF/lto/devirt_vcall_vis_export_dynamic.ll | 11 ++++++++---
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 7d14180a49926..70a293875f27b 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -2617,8 +2617,7 @@ void LinkerDriver::compileBitcodeFiles(bool skipLinkedOutput) {
       for (Symbol *sym : obj->getGlobalSymbols()) {
         if (!sym->isDefined())
           continue;
-        if (ctx.hasDynsym && ctx.arg.exportDynamic &&
-            sym->computeBinding(ctx) != STB_LOCAL)
+        if (ctx.arg.exportDynamic && sym->computeBinding(ctx) != STB_LOCAL)
           sym->isExported = true;
         if (sym->hasVersionSuffix)
           sym->parseSymbolVersion(ctx);
@@ -2965,6 +2964,7 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
 
   // Create dynamic sections for dynamic linking and static PIE.
   ctx.hasDynsym = !ctx.sharedFiles.empty() || ctx.arg.isPic;
+  ctx.arg.exportDynamic &= ctx.hasDynsym;
 
   // If an entry symbol is in a static archive, pull out that file now.
   if (Symbol *sym = ctx.symtab->find(ctx.arg.entry))
diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp
index 80b0691428007..fe7ba370c9f5d 100644
--- a/lld/ELF/Symbols.cpp
+++ b/lld/ELF/Symbols.cpp
@@ -361,13 +361,10 @@ void elf::parseVersionAndComputeIsPreemptible(Ctx &ctx) {
   // can contain versions in the form of <name>@<version>.
   // Let them parse and update their names to exclude version suffix.
   // In addition, compute isExported and isPreemptible.
-  bool hasDynsym = ctx.hasDynsym;
   bool maybePreemptible = ctx.sharedFiles.size() || ctx.arg.shared;
   for (Symbol *sym : ctx.symtab->getSymbols()) {
     if (sym->hasVersionSuffix)
       sym->parseSymbolVersion(ctx);
-    if (!hasDynsym)
-      continue;
     if (sym->computeBinding(ctx) == STB_LOCAL) {
       sym->isExported = false;
       continue;
@@ -377,7 +374,7 @@ void elf::parseVersionAndComputeIsPreemptible(Ctx &ctx) {
     } else if (ctx.arg.exportDynamic &&
                (sym->isUsedInRegularObj || !sym->ltoCanOmit)) {
       sym->isExported = true;
-      sym->isPreemptible = computeIsPreemptible(ctx, *sym);
+      sym->isPreemptible = maybePreemptible && computeIsPreemptible(ctx, *sym);
     }
   }
 }
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 858f92c001158..a2c49343e5c8d 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -284,7 +284,6 @@ static void demoteDefined(Defined &sym, DenseMap<SectionBase *, size_t> &map) {
 static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
   llvm::TimeTraceScope timeScope("Demote symbols");
   DenseMap<InputFile *, DenseMap<SectionBase *, size_t>> sectionIndexMap;
-  bool hasDynsym = ctx.hasDynsym;
   bool maybePreemptible = ctx.sharedFiles.size() || ctx.arg.shared;
   for (Symbol *sym : ctx.symtab->getSymbols()) {
     if (auto *d = dyn_cast<Defined>(sym)) {
@@ -301,9 +300,8 @@ static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
       }
     }
 
-    if (hasDynsym)
-      sym->isPreemptible = maybePreemptible &&
-                           (sym->isUndefined() || sym->isExported) &&
+    if (maybePreemptible)
+      sym->isPreemptible = (sym->isUndefined() || sym->isExported) &&
                            computeIsPreemptible(ctx, *sym);
   }
 }
@@ -1853,7 +1851,7 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
 
   // If the previous code block defines any non-hidden symbols (e.g.
   // __global_pointer$), they may be exported.
-  if (ctx.hasDynsym && ctx.arg.exportDynamic)
+  if (ctx.arg.exportDynamic)
     for (Symbol *sym : ctx.synthesizedSymbols)
       if (sym->computeBinding(ctx) != STB_LOCAL)
         sym->isExported = true;
diff --git a/lld/test/ELF/lto/devirt_vcall_vis_export_dynamic.ll b/lld/test/ELF/lto/devirt_vcall_vis_export_dynamic.ll
index 189e3c0b821bd..bcb92a1beb17b 100644
--- a/lld/test/ELF/lto/devirt_vcall_vis_export_dynamic.ll
+++ b/lld/test/ELF/lto/devirt_vcall_vis_export_dynamic.ll
@@ -9,6 +9,10 @@
 ; RUN: ld.lld %t2.o -o %t3 -save-temps --lto-whole-program-visibility \
 ; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
 ; RUN: llvm-dis %t2.o.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR
+;; --export-dynamic without .dynsym does not prevent devirtualization.
+; RUN: ld.lld %t2.o -o %t3 -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. \
+; RUN:   --export-dynamic 2>&1 | FileCheck %s --check-prefix=REMARK
 
 ;; Hybrid WPD
 ;; Generate split module with summary for hybrid Thin/Regular LTO WPD.
@@ -33,19 +37,20 @@
 ; RUN: ld.lld -shared -soname=ta %ta.o -o %ta.so
 
 ;; Index based WPD
-; RUN: ld.lld %t2.o -o %t3 -save-temps --lto-whole-program-visibility \
+
+; RUN: ld.lld %t2.o %ta.so -o %t3 -save-temps --lto-whole-program-visibility \
 ; RUN:   -mllvm -pass-remarks=. \
 ; RUN:   --export-dynamic 2>&1 | FileCheck /dev/null --implicit-check-not single-impl --allow-empty
 ; RUN: llvm-dis %t2.o.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-NODEVIRT-IR
 
 ;; Hybrid WPD
-; RUN: ld.lld %t.o -o %t3 -save-temps --lto-whole-program-visibility \
+; RUN: ld.lld %t.o %ta.so -o %t3 -save-temps --lto-whole-program-visibility \
 ; RUN:   -mllvm -pass-remarks=. \
 ; RUN:   --export-dynamic 2>&1 | FileCheck /dev/null --implicit-check-not single-impl --allow-empty
 ; RUN: llvm-dis %t.o.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-NODEVIRT-IR
 
 ;; Regular LTO WPD
-; RUN: ld.lld %t4.o -o %t3 -save-temps --lto-whole-program-visibility \
+; RUN: ld.lld %t4.o %ta.so -o %t3 -save-temps --lto-whole-program-visibility \
 ; RUN:   -mllvm -pass-remarks=. \
 ; RUN:   --export-dynamic 2>&1 | FileCheck /dev/null --implicit-check-not single-impl --allow-empty
 ; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-NODEVIRT-IR

From 627387722469a358a80d77488509fb23d890d402 Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental@gmail.com>
Date: Sun, 16 Feb 2025 14:18:09 -0600
Subject: [PATCH 075/109] [lld] enable installing lld headers and libraries as
 part of distribution (#127123)

This patch allows `lld-headers` and `lld-libraries` to be specified in
`LLVM_DISTRIBUTION_COMPONENTS`, enabling piecewise installation of
`lld/**/*.h` headers and/or lld libraries (in both shared and static
builds).
This is similar to use cases such as
`clang;clang-headers;clang-libraries`. Note that when `lld-libraries` is
present, `llvm-libraries` must be present as well because various lld
libraries depend on various llvm libraries.
---
 lld/CMakeLists.txt             | 45 ++++++++++++++++++++++++++++------
 lld/cmake/modules/AddLLD.cmake |  5 ++--
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/lld/CMakeLists.txt b/lld/CMakeLists.txt
index 64c9f23805509..012a943d5bcec 100644
--- a/lld/CMakeLists.txt
+++ b/lld/CMakeLists.txt
@@ -180,14 +180,6 @@ include_directories(BEFORE
   ${CMAKE_CURRENT_SOURCE_DIR}/include
   )
 
-if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
-  install(DIRECTORY include/
-    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
-    FILES_MATCHING
-    PATTERN "*.h"
-    )
-endif()
-
 add_subdirectory(Common)
 add_subdirectory(tools/lld)
 
@@ -207,4 +199,41 @@ add_subdirectory(MachO)
 add_subdirectory(MinGW)
 add_subdirectory(wasm)
 
+add_custom_target(lld-headers)
+set_target_properties(lld-headers PROPERTIES FOLDER "Misc")
+if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
+  install(DIRECTORY include/lld
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+    COMPONENT lld-headers
+    FILES_MATCHING
+    PATTERN "*.h"
+    )
+
+  if (NOT LLVM_ENABLE_IDE)
+    add_llvm_install_targets(install-lld-headers
+                             DEPENDS lld-headers
+                             COMPONENT lld-headers)
+  endif()
+endif()
+
+# Custom target to install all lld libraries
+add_custom_target(lld-libraries)
+if (NOT LLVM_ENABLE_IDE)
+  add_llvm_install_targets(install-lld-libraries
+                           DEPENDS lld-libraries
+                           COMPONENT lld-libraries)
+endif()
+
+get_property(LLD_LIBS GLOBAL PROPERTY LLD_ALL_LIBS)
+if(LLD_LIBS)
+  list(REMOVE_DUPLICATES LLD_LIBS)
+  foreach(lib ${LLD_LIBS})
+    add_dependencies(lld-libraries ${lib})
+    if(NOT LLVM_ENABLE_IDE)
+      add_dependencies(install-lld-libraries install-${lib})
+      add_dependencies(install-lld-libraries-stripped install-${lib}-stripped)
+    endif()
+  endforeach()
+endif()
+
 add_subdirectory(cmake/modules)
diff --git a/lld/cmake/modules/AddLLD.cmake b/lld/cmake/modules/AddLLD.cmake
index 9f2684b6f933e..1de373ff860c4 100644
--- a/lld/cmake/modules/AddLLD.cmake
+++ b/lld/cmake/modules/AddLLD.cmake
@@ -13,7 +13,7 @@ macro(add_lld_library name)
   llvm_add_library(${name} ${ARG_ENABLE_SHARED} ${ARG_UNPARSED_ARGUMENTS})
 
   if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
-    get_target_export_arg(${name} LLD export_to_lldtargets)
+    get_target_export_arg(${name} LLD export_to_lldtargets UMBRELLA lld-libraries)
     install(TARGETS ${name}
       COMPONENT ${name}
       ${export_to_lldtargets}
@@ -21,11 +21,12 @@ macro(add_lld_library name)
       ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX}
       RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
 
-    if (${ARG_SHARED} AND NOT CMAKE_CONFIGURATION_TYPES)
+    if (NOT CMAKE_CONFIGURATION_TYPES)
       add_llvm_install_targets(install-${name}
         DEPENDS ${name}
         COMPONENT ${name})
     endif()
+    set_property(GLOBAL APPEND PROPERTY LLD_ALL_LIBS ${name})
     set_property(GLOBAL APPEND PROPERTY LLD_EXPORTS ${name})
   endif()
 endmacro(add_lld_library)

From b4f91b007fa4df0923b92c88103dab37c576150b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 16 Feb 2025 21:25:07 +0100
Subject: [PATCH 076/109] [LV] Use IRBuilder::insert to insert VPWidenRecipe
 (NFC).

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2cdb87fdd3f8d..4685064407f08 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8937,15 +8937,12 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
 
   unsigned ReductionOpcode = Reduction->getOpcode();
   if (ReductionOpcode == Instruction::Sub) {
-    VPBasicBlock *ParentBlock = Builder.getInsertBlock();
-    assert(ParentBlock && "Builder must have an insert block.");
-
     auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
     SmallVector<VPValue *, 2> Ops;
     Ops.push_back(Plan.getOrAddLiveIn(Zero));
     Ops.push_back(BinOp);
     BinOp = new VPWidenRecipe(*Reduction, make_range(Ops.begin(), Ops.end()));
-    ParentBlock->appendRecipe(BinOp->getDefiningRecipe());
+    Builder.insert(BinOp->getDefiningRecipe());
     ReductionOpcode = Instruction::Add;
   }
 

From f75126eeabba13ce2aab53c2e4296fca12b9da0d Mon Sep 17 00:00:00 2001
From: Alexander Richardson <alexrichardson@google.com>
Date: Sun, 16 Feb 2025 12:32:51 -0800
Subject: [PATCH 077/109] [FreeBSD] Support -stdlib=libstdc++

The experimental-library-flag.cpp test was failing on FreeBSD builders,
which turned out to be caused by missing support for -stdlib=libstdc++
(and just using a hardcoded libc++ in all cases).
Simplify FreeBSD::AddCXXStdlibLibArgs() by deferring to the parent class
and dealing with the FreeBSD < 14 profiling support as a special case.

While touching the test file also drop the unnecessary `-o %t.o`. This is
not needed since the RUN lines use -### and don't produce any output.

Reviewed By: DimitryAndric, MaskRay

Pull Request: https://github.com/llvm/llvm-project/pull/126302
---
 clang/lib/Driver/ToolChains/FreeBSD.cpp        | 10 +++++-----
 .../test/Driver/experimental-library-flag.cpp  |  5 +++++
 clang/test/Driver/freebsd.cpp                  | 18 +++++++++++-------
 3 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp
index a6d859f0ebfec..bd6ab0f8aba57 100644
--- a/clang/lib/Driver/ToolChains/FreeBSD.cpp
+++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp
@@ -452,12 +452,12 @@ void FreeBSD::addLibCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
 
 void FreeBSD::AddCXXStdlibLibArgs(const ArgList &Args,
                                   ArgStringList &CmdArgs) const {
+  Generic_ELF::AddCXXStdlibLibArgs(Args, CmdArgs);
   unsigned Major = getTriple().getOSMajorVersion();
-  bool Profiling = Args.hasArg(options::OPT_pg) && Major != 0 && Major < 14;
-
-  CmdArgs.push_back(Profiling ? "-lc++_p" : "-lc++");
-  if (Args.hasArg(options::OPT_fexperimental_library))
-    CmdArgs.push_back("-lc++experimental");
+  bool SuffixedLib = Args.hasArg(options::OPT_pg) && Major != 0 && Major < 14;
+  if (SuffixedLib && GetCXXStdlibType(Args) == CST_Libcxx)
+    llvm::replace(CmdArgs, static_cast<const char *>("-lc++"),
+                  static_cast<const char *>("-lc++_p"));
 }
 
 void FreeBSD::AddCudaIncludeArgs(const ArgList &DriverArgs,
diff --git a/clang/test/Driver/experimental-library-flag.cpp b/clang/test/Driver/experimental-library-flag.cpp
index db6a90b50f255..62b007516897e 100644
--- a/clang/test/Driver/experimental-library-flag.cpp
+++ b/clang/test/Driver/experimental-library-flag.cpp
@@ -9,6 +9,11 @@
 // RUN: %clangxx -fexperimental-library -stdlib=libstdc++ -### %s 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIBSTDCXX %s
 // RUN: %clangxx -fexperimental-library -stdlib=libc++ -nostdlib++ -### %s 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-NOSTDLIB %s
 
+/// The FreeBSD driver did not support -stdlib=libstdc++ previously, check that it does the right thing here.
+// RUN: %clangxx --target=x86_64-unknown-freebsd -fexperimental-library -stdlib=libc++ -### %s 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIBCXX %s
+// RUN: %clangxx --target=x86_64-unknown-freebsd -fexperimental-library -stdlib=libstdc++ -### %s 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-LIBSTDCXX %s
+// RUN: %clangxx --target=x86_64-unknown-freebsd -fexperimental-library -stdlib=libc++ -nostdlib++ -### %s 2>&1 | FileCheck --check-prefixes=CHECK,CHECK-NOSTDLIB %s
+
 // -fexperimental-library must be passed to CC1.
 // CHECK: -fexperimental-library
 
diff --git a/clang/test/Driver/freebsd.cpp b/clang/test/Driver/freebsd.cpp
index dc8c98d3c3cb7..e2f76cd013f7f 100644
--- a/clang/test/Driver/freebsd.cpp
+++ b/clang/test/Driver/freebsd.cpp
@@ -1,15 +1,19 @@
-// RUN: %clangxx %s -### -o %t.o --target=amd64-unknown-freebsd -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### --target=amd64-unknown-freebsd -stdlib=platform 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-DEFAULT %s
-// RUN: %clangxx %s -### -o %t.o --target=amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-TEN %s
+// RUN: %clangxx %s -### --target=amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clangxx %s -### --target=amd64-unknown-freebsd -stdlib=libc++ 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clangxx %s -### --target=amd64-unknown-freebsd -stdlib=libstdc++ 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-STDLIBCXX %s
 // CHECK-DEFAULT: "-lc++" "-lm"
-// CHECK-TEN: "-lc++" "-lm"
+// CHECK-STDLIBCXX: "-lstdc++" "-lm"
 
-// RUN: %clangxx %s -### -pg -o %t.o --target=amd64-unknown-freebsd -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -pg --target=amd64-unknown-freebsd -stdlib=platform 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PG-DEFAULT %s
-// RUN: %clangxx %s -### -pg -o %t.o --target=amd64-unknown-freebsd14.0 -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -pg --target=amd64-unknown-freebsd14.0 -stdlib=platform 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PG-FOURTEEN %s
-// RUN: %clangxx %s -### -pg -o %t.o --target=amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -pg --target=amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PG-TEN %s
 // CHECK-PG-DEFAULT: "-lc++" "-lm"
 // CHECK-PG-FOURTEEN: "-lc++" "-lm"

From 7817045e5c5cfbcbf3428ace7a4b3bfb5281a641 Mon Sep 17 00:00:00 2001
From: dong-miao <miaozhendong24@mails.ucas.ac.cn>
Date: Mon, 17 Feb 2025 05:41:46 +0800
Subject: [PATCH 078/109] [RISCV] Support [mh]edelegh CSRs (#121634)

These RV32-only CSRs are defined in privileged spec v1.13.
---
 llvm/lib/Target/RISCV/RISCVSystemOperands.td   |  6 +++++-
 llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s | 18 ++++++++++++++++++
 llvm/test/MC/RISCV/rv32-machine-csr-names.s    | 14 ++++++++++++++
 llvm/test/MC/RISCV/rv32-only-csr-names.s       |  4 ++++
 4 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index cabcb9eda06b1..5b46e7df25fc8 100644
--- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -157,6 +157,8 @@ def : SysReg<"hideleg", 0x603>;
 def : SysReg<"hie", 0x604>;
 def : SysReg<"hcounteren", 0x606>;
 def : SysReg<"hgeie", 0x607>;
+let isRV32Only = 1 in
+def : SysReg<"hedelegh", 0x612>;
 
 //===----------------------------------------------------------------------===//
 // Hypervisor Trap Handling
@@ -233,8 +235,10 @@ def : SysReg<"mideleg", 0x303>;
 def : SysReg<"mie", 0x304>;
 def : SysReg<"mtvec", 0x305>;
 def : SysReg<"mcounteren", 0x306>;
-let isRV32Only = 1 in
+let isRV32Only = 1 in {
 def : SysReg<"mstatush", 0x310>;
+def : SysReg<"medelegh", 0x312>;
+} // isRV32Only
 
 //===----------------------------------------------------------------------===//
 // Machine Trap Handling
diff --git a/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s b/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s
index aadee4fb4f3ad..79d87b3f2471c 100644
--- a/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-hypervisor-csr-names.s
@@ -219,3 +219,21 @@ csrrs t2, 0x214, zero
 csrrs t1, vsiph, zero
 # uimm12
 csrrs t2, 0x254, zero
+
+##################################
+# Hypervisor Trap Setup
+##################################
+
+# hedelegh
+# name
+# CHECK-INST: csrrs t1, hedelegh, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x20,0x61]
+# CHECK-INST-ALIAS: csrr t1, hedelegh
+# uimm12
+# CHECK-INST: csrrs t2, hedelegh, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x20,0x61]
+# CHECK-INST-ALIAS: csrr t2, hedelegh
+# name
+csrrs t1, hedelegh, zero
+# uimm12
+csrrs t2, 0x612, zero
diff --git a/llvm/test/MC/RISCV/rv32-machine-csr-names.s b/llvm/test/MC/RISCV/rv32-machine-csr-names.s
index 3d527e382376e..9e929b7eddeed 100644
--- a/llvm/test/MC/RISCV/rv32-machine-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-machine-csr-names.s
@@ -22,6 +22,20 @@ csrrs t1, mstatush, zero
 # uimm12
 csrrs t2, 0x310, zero
 
+# medelegh
+# name
+# CHECK-INST: csrrs t1, medelegh, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x20,0x31]
+# CHECK-INST-ALIAS: csrr t1, medelegh
+# uimm12
+# CHECK-INST: csrrs t2, medelegh, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x20,0x31]
+# CHECK-INST-ALIAS: csrr t2, medelegh
+# name
+csrrs t1, medelegh, zero
+# uimm12
+csrrs t2, 0x312, zero
+
 #########################
 # Machine Configuration
 #########################
diff --git a/llvm/test/MC/RISCV/rv32-only-csr-names.s b/llvm/test/MC/RISCV/rv32-only-csr-names.s
index db88eacf9396b..1604469210193 100644
--- a/llvm/test/MC/RISCV/rv32-only-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-only-csr-names.s
@@ -41,12 +41,16 @@ csrrs t1, henvcfgh, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system registe
 
 csrrs t1, htimedeltah, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'htimedeltah' is RV32 only
 
+csrrs t1, hedelegh, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'hedelegh' is RV32 only
+
 csrrs t1, mstatush, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mstatush' is RV32 only
 
 csrrs t1, menvcfgh, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'menvcfgh' is RV32 only
 
 csrrs t1, mseccfgh, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'mseccfgh' is RV32 only
 
+csrrs t1, medelegh, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'medelegh' is RV32 only
+
 csrrs t1, pmpcfg1, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'pmpcfg1' is RV32 only
 csrrs t1, pmpcfg3, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'pmpcfg3' is RV32 only
 csrrs t1, pmpcfg5, zero # CHECK-NEED-RV32: :[[@LINE]]:11: error: system register 'pmpcfg5' is RV32 only

From 01b7e65c9197d64531133c5890f076de6c1ae793 Mon Sep 17 00:00:00 2001
From: Alex Richardson <alexrichardson@google.com>
Date: Sun, 16 Feb 2025 13:45:14 -0800
Subject: [PATCH 079/109] [FreeBSD] Fix comparison in
 f75126eeabba13ce2aab53c2e4296fca12b9da0d

We have to compare the string contents and not the const char* pointer.
This happened to work in my testing but is not reliable.
---
 clang/lib/Driver/ToolChains/FreeBSD.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp
index bd6ab0f8aba57..62206c5fb3c59 100644
--- a/clang/lib/Driver/ToolChains/FreeBSD.cpp
+++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp
@@ -456,8 +456,9 @@ void FreeBSD::AddCXXStdlibLibArgs(const ArgList &Args,
   unsigned Major = getTriple().getOSMajorVersion();
   bool SuffixedLib = Args.hasArg(options::OPT_pg) && Major != 0 && Major < 14;
   if (SuffixedLib && GetCXXStdlibType(Args) == CST_Libcxx)
-    llvm::replace(CmdArgs, static_cast<const char *>("-lc++"),
-                  static_cast<const char *>("-lc++_p"));
+    std::replace_if(
+        CmdArgs.begin(), CmdArgs.end(),
+        [](const char *S) { return StringRef(S) == "-lc++"; }, "-lc++_p");
 }
 
 void FreeBSD::AddCudaIncludeArgs(const ArgList &DriverArgs,

From d150101160b7d518e1329abb578c4ca4d4224621 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Sun, 16 Feb 2025 12:57:05 -0800
Subject: [PATCH 080/109] [Hexagon] Use MCRegister. NFC

---
 llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp    | 16 ++++++++--------
 llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp |  4 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index df25bf9531a17..f10122fdacfcd 100644
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -460,8 +460,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
       TmpInst.setOpcode(Hexagon::A2_combinew);
       TmpInst.addOperand(MappedInst.getOperand(0));
       MCOperand &MO1 = MappedInst.getOperand(1);
-      unsigned High = RI->getSubReg(MO1.getReg(), Hexagon::isub_hi);
-      unsigned Low = RI->getSubReg(MO1.getReg(), Hexagon::isub_lo);
+      MCRegister High = RI->getSubReg(MO1.getReg(), Hexagon::isub_hi);
+      MCRegister Low = RI->getSubReg(MO1.getReg(), Hexagon::isub_lo);
       // Add a new operand for the second register in the pair.
       TmpInst.addOperand(MCOperand::createReg(High));
       TmpInst.addOperand(MCOperand::createReg(Low));
@@ -537,8 +537,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
   // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)"
   case Hexagon::A2_tfrp: {
     MCOperand &MO = MappedInst.getOperand(1);
-    unsigned High = RI->getSubReg(MO.getReg(), Hexagon::isub_hi);
-    unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::isub_lo);
+    MCRegister High = RI->getSubReg(MO.getReg(), Hexagon::isub_hi);
+    MCRegister Low = RI->getSubReg(MO.getReg(), Hexagon::isub_lo);
     MO.setReg(High);
     // Add a new operand for the second register in the pair.
     MappedInst.addOperand(MCOperand::createReg(Low));
@@ -549,8 +549,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
   case Hexagon::A2_tfrpt:
   case Hexagon::A2_tfrpf: {
     MCOperand &MO = MappedInst.getOperand(2);
-    unsigned High = RI->getSubReg(MO.getReg(), Hexagon::isub_hi);
-    unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::isub_lo);
+    MCRegister High = RI->getSubReg(MO.getReg(), Hexagon::isub_hi);
+    MCRegister Low = RI->getSubReg(MO.getReg(), Hexagon::isub_lo);
     MO.setReg(High);
     // Add a new operand for the second register in the pair.
     MappedInst.addOperand(MCOperand::createReg(Low));
@@ -563,8 +563,8 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
   case Hexagon::A2_tfrptnew:
   case Hexagon::A2_tfrpfnew: {
     MCOperand &MO = MappedInst.getOperand(2);
-    unsigned High = RI->getSubReg(MO.getReg(), Hexagon::isub_hi);
-    unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::isub_lo);
+    MCRegister High = RI->getSubReg(MO.getReg(), Hexagon::isub_hi);
+    MCRegister Low = RI->getSubReg(MO.getReg(), Hexagon::isub_lo);
     MO.setReg(High);
     // Add a new operand for the second register in the pair.
     MappedInst.addOperand(MCOperand::createReg(Low));
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 7b681fa44f4d6..3b157006d9224 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -593,8 +593,8 @@ void HexagonCopyToCombine::combine(MachineInstr &I1, MachineInstr &I2,
     llvm_unreachable("Unexpected register class");
 
   // Get the double word register.
-  unsigned DoubleRegDest = TRI->getMatchingSuperReg(LoRegDef, SubLo, SuperRC);
-  assert(DoubleRegDest != 0 && "Expect a valid register");
+  MCRegister DoubleRegDest = TRI->getMatchingSuperReg(LoRegDef, SubLo, SuperRC);
+  assert(DoubleRegDest.isValid() && "Expect a valid register");
 
   // Setup source operands.
   MachineOperand &LoOperand = IsI1Loreg ? I1.getOperand(1) : I2.getOperand(1);

From 26fc2e90fc54313bfe3e0e1fbbb14251ed8afe29 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Sun, 16 Feb 2025 13:24:28 -0800
Subject: [PATCH 081/109] [Mips] Use MCRegisterClass::getRegister() instead of
 begin()+RegNo. NFC

---
 llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index ae1f4545c6a49..ff84a5e3d2b3b 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -6404,7 +6404,7 @@ unsigned MipsAsmParser::getATReg(SMLoc Loc) {
 }
 
 unsigned MipsAsmParser::getReg(int RC, int RegNo) {
-  return *(getContext().getRegisterInfo()->getRegClass(RC).begin() + RegNo);
+  return getContext().getRegisterInfo()->getRegClass(RC).getRegister(RegNo);
 }
 
 bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {

From a7a02083acf39b9f1ad7edec3b7e344afc6cac49 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Sun, 16 Feb 2025 15:03:59 -0800
Subject: [PATCH 082/109] [flang] Assert the Options fit into the storage bits
 (#126169)

---
 flang/include/flang/Support/LangOptions.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/flang/include/flang/Support/LangOptions.h b/flang/include/flang/Support/LangOptions.h
index fac6fb92df85a..1dd676e62a9e5 100644
--- a/flang/include/flang/Support/LangOptions.h
+++ b/flang/include/flang/Support/LangOptions.h
@@ -62,7 +62,10 @@ class LangOptions : public LangOptionsBase {
 #define LANGOPT(Name, Bits, Default)
 #define ENUM_LANGOPT(Name, Type, Bits, Default) \
   Type get##Name() const { return static_cast<Type>(Name); } \
-  void set##Name(Type Value) { Name = static_cast<unsigned>(Value); }
+  void set##Name(Type Value) { \
+    assert(static_cast<unsigned>(Value) < (1u << Bits)); \
+    Name = static_cast<unsigned>(Value); \
+  }
 #include "LangOptions.def"
 
   /// Name of the IR file that contains the result of the OpenMP target

From ecb7f5aaeed2de738a79f1bb78b2196718007176 Mon Sep 17 00:00:00 2001
From: Longsheng Mou <moulongsheng@huawei.com>
Date: Mon, 17 Feb 2025 09:29:56 +0800
Subject: [PATCH 083/109] [mlir][linalg] Update docs for `linalg.generic`(NFC)
 (#127178)

The mixed tensor/buffer semantics has been disallowed in #80660. Closes
#124090.
---
 .../mlir/Dialect/Linalg/IR/LinalgStructuredOps.td   | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index 29cb8035b583b..6a439bfb09078 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -138,19 +138,6 @@ def GenericOp : LinalgStructuredBase_Op<"generic", [
       }
     }
     ```
-
-    To allow progressive lowering from the value world (a.k.a tensor values) to
-    the buffer world (a.k.a memref values), a `linalg.generic` op allows mixing
-    tensors and buffers operands and tensor results.
-
-    ```mlir
-    %C = linalg.generic #trait_attribute
-      ins(%A, %B : tensor<?x?xf32>, memref<?x?xf32, stride_specification>)
-      outs(%C : tensor<?x?xf32>)
-      {other-optional-attributes}
-      {region}
-      -> (tensor<?x?xf32>)
-    ```
   }];
 
   let arguments = (ins Variadic<AnyType>:$inputs,

From 9e8cd733c2643c92807a23b9b65099d9bb6bc560 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Sun, 16 Feb 2025 14:05:21 -0800
Subject: [PATCH 084/109] [Mips] Use MCRegister. NFC

Use id() to get rid of some implicit conversions.
---
 .../Target/Mips/AsmParser/MipsAsmParser.cpp   | 406 +++++++++---------
 .../Target/Mips/MCTargetDesc/MipsBaseInfo.h   |   2 +-
 .../Mips/MCTargetDesc/MipsELFStreamer.cpp     |   2 +-
 .../Mips/MCTargetDesc/MipsMCCodeEmitter.cpp   |  10 +-
 .../lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h |   2 +-
 .../Mips/MCTargetDesc/MipsNaClELFStreamer.cpp |  12 +-
 .../Mips/MCTargetDesc/MipsTargetStreamer.cpp  |  61 +--
 llvm/lib/Target/Mips/MipsAsmPrinter.cpp       |   2 +-
 llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp   |   2 +-
 llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h     |   2 +-
 llvm/lib/Target/Mips/MipsTargetStreamer.h     |  52 +--
 11 files changed, 276 insertions(+), 277 deletions(-)

diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index ff84a5e3d2b3b..d108564e128c0 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -224,12 +224,12 @@ class MipsAsmParser : public MCTargetAsmParser {
   bool expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                          const MCSubtargetInfo *STI);
 
-  bool loadImmediate(int64_t ImmValue, unsigned DstReg, unsigned SrcReg,
+  bool loadImmediate(int64_t ImmValue, MCRegister DstReg, MCRegister SrcReg,
                      bool Is32BitImm, bool IsAddress, SMLoc IDLoc,
                      MCStreamer &Out, const MCSubtargetInfo *STI);
 
-  bool loadAndAddSymbolAddress(const MCExpr *SymExpr, unsigned DstReg,
-                               unsigned SrcReg, bool Is32BitSym, SMLoc IDLoc,
+  bool loadAndAddSymbolAddress(const MCExpr *SymExpr, MCRegister DstReg,
+                               MCRegister SrcReg, bool Is32BitSym, SMLoc IDLoc,
                                MCStreamer &Out, const MCSubtargetInfo *STI);
 
   bool emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc, MCSymbol *Sym);
@@ -246,7 +246,7 @@ class MipsAsmParser : public MCTargetAsmParser {
   bool expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU, SMLoc IDLoc,
                                 MCStreamer &Out, const MCSubtargetInfo *STI);
 
-  bool expandLoadAddress(unsigned DstReg, unsigned BaseReg,
+  bool expandLoadAddress(MCRegister DstReg, MCRegister BaseReg,
                          const MCOperand &Offset, bool Is32BitAddress,
                          SMLoc IDLoc, MCStreamer &Out,
                          const MCSubtargetInfo *STI);
@@ -428,12 +428,12 @@ class MipsAsmParser : public MCTargetAsmParser {
 
   int matchMSA128CtrlRegisterName(StringRef Name);
 
-  unsigned getReg(int RC, int RegNo);
+  MCRegister getReg(int RC, int RegNo);
 
   /// Returns the internal register number for the current AT. Also checks if
   /// the current AT is unavailable (set to $0) and gives an error if it is.
   /// This should be used in pseudo-instruction expansions which need AT.
-  unsigned getATReg(SMLoc Loc);
+  MCRegister getATReg(SMLoc Loc);
 
   bool canUseATReg();
 
@@ -735,7 +735,7 @@ class MipsAsmParser : public MCTargetAsmParser {
   void onEndOfFile() override;
 
   /// Warn if RegIndex is the same as the current AT.
-  void warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc);
+  void warnIfRegIndexIsAT(MCRegister RegIndex, SMLoc Loc);
 
   void warnIfNoMacro(SMLoc Loc);
 
@@ -2123,7 +2123,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
     //        of the assembler. We ought to leave it to those later stages.
     const MCSymbol *JalSym = getSingleMCSymbol(JalExpr);
 
-    if (expandLoadAddress(Mips::T9, Mips::NoRegister, Inst.getOperand(0),
+    if (expandLoadAddress(Mips::T9, MCRegister(), Inst.getOperand(0),
                           !isGP64bit(), IDLoc, Out, STI))
       return true;
 
@@ -2303,8 +2303,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
         break;
       case Mips::MOVEP_MM:
       case Mips::MOVEP_MMR6: {
-        unsigned R0 = Inst.getOperand(0).getReg();
-        unsigned R1 = Inst.getOperand(1).getReg();
+        MCRegister R0 = Inst.getOperand(0).getReg();
+        MCRegister R1 = Inst.getOperand(1).getReg();
         bool RegPair = ((R0 == Mips::A1 && R1 == Mips::A2) ||
                         (R0 == Mips::A1 && R1 == Mips::A3) ||
                         (R0 == Mips::A2 && R1 == Mips::A3) ||
@@ -2451,10 +2451,9 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
     assert((Inst.getOperand(1).isImm() || Inst.getOperand(1).isExpr()) &&
            "expected immediate operand kind");
 
-    return expandLoadAddress(Inst.getOperand(0).getReg(), Mips::NoRegister,
-                             Inst.getOperand(1),
-                             Inst.getOpcode() == Mips::LoadAddrImm32, IDLoc,
-                             Out, STI)
+    return expandLoadAddress(
+               Inst.getOperand(0).getReg(), MCRegister(), Inst.getOperand(1),
+               Inst.getOpcode() == Mips::LoadAddrImm32, IDLoc, Out, STI)
                ? MER_Fail
                : MER_Success;
   case Mips::LoadAddrReg32:
@@ -2753,14 +2752,14 @@ template <unsigned N> static bool isShiftedUIntAtAnyPosition(uint64_t x) {
 ///
 /// @param ImmValue     The immediate to load.
 /// @param DstReg       The register that will hold the immediate.
-/// @param SrcReg       A register to add to the immediate or Mips::NoRegister
+/// @param SrcReg       A register to add to the immediate or MCRegister()
 ///                     for a simple initialization.
 /// @param Is32BitImm   Is ImmValue 32-bit or 64-bit?
 /// @param IsAddress    True if the immediate represents an address. False if it
 ///                     is an integer.
 /// @param IDLoc        Location of the immediate in the source file.
-bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
-                                  unsigned SrcReg, bool Is32BitImm,
+bool MipsAsmParser::loadImmediate(int64_t ImmValue, MCRegister DstReg,
+                                  MCRegister SrcReg, bool Is32BitImm,
                                   bool IsAddress, SMLoc IDLoc, MCStreamer &Out,
                                   const MCSubtargetInfo *STI) {
   MipsTargetStreamer &TOut = getTargetStreamer();
@@ -2782,19 +2781,19 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
     }
   }
 
-  unsigned ZeroReg = IsAddress ? ABI.GetNullPtr() : ABI.GetZeroReg();
+  MCRegister ZeroReg = IsAddress ? ABI.GetNullPtr() : ABI.GetZeroReg();
   unsigned AdduOp = !Is32BitImm ? Mips::DADDu : Mips::ADDu;
 
   bool UseSrcReg = false;
-  if (SrcReg != Mips::NoRegister)
+  if (SrcReg)
     UseSrcReg = true;
 
-  unsigned TmpReg = DstReg;
+  MCRegister TmpReg = DstReg;
   if (UseSrcReg &&
       getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) {
     // At this point we need AT to perform the expansions and we exit if it is
     // not available.
-    unsigned ATReg = getATReg(IDLoc);
+    MCRegister ATReg = getATReg(IDLoc);
     if (!ATReg)
       return true;
     TmpReg = ATReg;
@@ -2817,7 +2816,7 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
   }
 
   if (isUInt<16>(ImmValue)) {
-    unsigned TmpReg = DstReg;
+    MCRegister TmpReg = DstReg;
     if (SrcReg == DstReg) {
       TmpReg = getATReg(IDLoc);
       if (!TmpReg)
@@ -2896,8 +2895,8 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
   // The highest 32-bit's are equivalent to a 32-bit immediate load.
 
   // Load bits 32-63 of ImmValue into bits 0-31 of the temporary register.
-  if (loadImmediate(ImmValue >> 32, TmpReg, Mips::NoRegister, true, false,
-                    IDLoc, Out, STI))
+  if (loadImmediate(ImmValue >> 32, TmpReg, MCRegister(), true, false, IDLoc,
+                    Out, STI))
     return false;
 
   // Shift and accumulate into the register. If a 16-bit chunk is zero, then
@@ -2933,14 +2932,14 @@ bool MipsAsmParser::expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
   const MCOperand &DstRegOp = Inst.getOperand(0);
   assert(DstRegOp.isReg() && "expected register operand kind");
 
-  if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister,
-                    Is32BitImm, false, IDLoc, Out, STI))
+  if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), MCRegister(), Is32BitImm,
+                    false, IDLoc, Out, STI))
     return true;
 
   return false;
 }
 
-bool MipsAsmParser::expandLoadAddress(unsigned DstReg, unsigned BaseReg,
+bool MipsAsmParser::expandLoadAddress(MCRegister DstReg, MCRegister BaseReg,
                                       const MCOperand &Offset,
                                       bool Is32BitAddress, SMLoc IDLoc,
                                       MCStreamer &Out,
@@ -2972,13 +2971,13 @@ bool MipsAsmParser::expandLoadAddress(unsigned DstReg, unsigned BaseReg,
 }
 
 bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
-                                            unsigned DstReg, unsigned SrcReg,
-                                            bool Is32BitSym, SMLoc IDLoc,
-                                            MCStreamer &Out,
+                                            MCRegister DstReg,
+                                            MCRegister SrcReg, bool Is32BitSym,
+                                            SMLoc IDLoc, MCStreamer &Out,
                                             const MCSubtargetInfo *STI) {
   MipsTargetStreamer &TOut = getTargetStreamer();
-  bool UseSrcReg = SrcReg != Mips::NoRegister && SrcReg != Mips::ZERO &&
-                   SrcReg != Mips::ZERO_64;
+  bool UseSrcReg =
+      SrcReg.isValid() && SrcReg != Mips::ZERO && SrcReg != Mips::ZERO_64;
   warnIfNoMacro(IDLoc);
 
   if (inPicMode()) {
@@ -3032,13 +3031,13 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       return false;
     }
 
-    unsigned TmpReg = DstReg;
+    MCRegister TmpReg = DstReg;
     if (UseSrcReg &&
         getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
                                                                SrcReg)) {
       // If $rs is the same as $rd, we need to use AT.
       // If it is not available we exit.
-      unsigned ATReg = getATReg(IDLoc);
+      MCRegister ATReg = getATReg(IDLoc);
       if (!ATReg)
         return true;
       TmpReg = ATReg;
@@ -3171,7 +3170,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
         getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg);
 
     if (canUseATReg() && UseSrcReg && RdRegIsRsReg) {
-      unsigned ATReg = getATReg(IDLoc);
+      MCRegister ATReg = getATReg(IDLoc);
 
       // If $rs is the same as $rd:
       // (d)la $rd, sym($rd) => lui    $at, %highest(sym)
@@ -3195,7 +3194,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
 
       return false;
     } else if (canUseATReg() && !RdRegIsRsReg && DstReg != getATReg(IDLoc)) {
-      unsigned ATReg = getATReg(IDLoc);
+      MCRegister ATReg = getATReg(IDLoc);
 
       // If the $rs is different from $rd or if $rs isn't specified and we
       // have $at available:
@@ -3265,12 +3264,12 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
   // (d)la $rd, sym/sym($rs) => lui   $rd, %hi(sym)
   //                            ori   $rd, $rd, %lo(sym)
   //                            (addu $rd, $rd, $rs)
-  unsigned TmpReg = DstReg;
+  MCRegister TmpReg = DstReg;
   if (UseSrcReg &&
       getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) {
     // If $rs is the same as $rd, we need to use AT.
     // If it is not available we exit.
-    unsigned ATReg = getATReg(IDLoc);
+    MCRegister ATReg = getATReg(IDLoc);
     if (!ATReg)
       return true;
     TmpReg = ATReg;
@@ -3292,10 +3291,10 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
 // Each double-precision register DO-D15 overlaps with two of the single
 // precision registers F0-F31. As an example, all of the following hold true:
 // D0 + 1 == F1, F1 + 1 == D1, F1 + 1 == F2, depending on the context.
-static unsigned nextReg(unsigned Reg) {
+static MCRegister nextReg(MCRegister Reg) {
   if (MipsMCRegisterClasses[Mips::FGR32RegClassID].contains(Reg))
     return Reg == (unsigned)Mips::F31 ? (unsigned)Mips::F0 : Reg + 1;
-  switch (Reg) {
+  switch (Reg.id()) {
   default: llvm_unreachable("Unknown register in assembly macro expansion!");
   case Mips::ZERO: return Mips::AT;
   case Mips::AT:   return Mips::V0;
@@ -3356,7 +3355,7 @@ static unsigned nextReg(unsigned Reg) {
 // address to load a 64 bit value.
 bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
                                        MCSymbol *Sym) {
-  unsigned ATReg = getATReg(IDLoc);
+  MCRegister ATReg = getATReg(IDLoc);
   if (!ATReg)
     return true;
 
@@ -3435,13 +3434,13 @@ bool MipsAsmParser::expandLoadSingleImmToGPR(MCInst &Inst, SMLoc IDLoc,
   assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() &&
          "Invalid instruction operand.");
 
-  unsigned FirstReg = Inst.getOperand(0).getReg();
+  MCRegister FirstReg = Inst.getOperand(0).getReg();
   uint64_t ImmOp64 = Inst.getOperand(1).getImm();
 
   uint32_t ImmOp32 = covertDoubleImmToSingleImm(convertIntToDoubleImm(ImmOp64));
 
-  return loadImmediate(ImmOp32, FirstReg, Mips::NoRegister, true, false, IDLoc,
-                       Out, STI);
+  return loadImmediate(ImmOp32, FirstReg, MCRegister(), true, false, IDLoc, Out,
+                       STI);
 }
 
 bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc,
@@ -3452,14 +3451,14 @@ bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc,
   assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() &&
          "Invalid instruction operand.");
 
-  unsigned FirstReg = Inst.getOperand(0).getReg();
+  MCRegister FirstReg = Inst.getOperand(0).getReg();
   uint64_t ImmOp64 = Inst.getOperand(1).getImm();
 
   ImmOp64 = convertIntToDoubleImm(ImmOp64);
 
   uint32_t ImmOp32 = covertDoubleImmToSingleImm(ImmOp64);
 
-  unsigned TmpReg = Mips::ZERO;
+  MCRegister TmpReg = Mips::ZERO;
   if (ImmOp32 != 0) {
     TmpReg = getATReg(IDLoc);
     if (!TmpReg)
@@ -3467,7 +3466,7 @@ bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc,
   }
 
   if (Lo_32(ImmOp64) == 0) {
-    if (TmpReg != Mips::ZERO && loadImmediate(ImmOp32, TmpReg, Mips::NoRegister,
+    if (TmpReg != Mips::ZERO && loadImmediate(ImmOp32, TmpReg, MCRegister(),
                                               true, false, IDLoc, Out, STI))
       return true;
     TOut.emitRR(Mips::MTC1, FirstReg, TmpReg, IDLoc, STI);
@@ -3506,23 +3505,23 @@ bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc,
   assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() &&
          "Invalid instruction operand.");
 
-  unsigned FirstReg = Inst.getOperand(0).getReg();
+  MCRegister FirstReg = Inst.getOperand(0).getReg();
   uint64_t ImmOp64 = Inst.getOperand(1).getImm();
 
   ImmOp64 = convertIntToDoubleImm(ImmOp64);
 
   if (Lo_32(ImmOp64) == 0) {
     if (isGP64bit()) {
-      if (loadImmediate(ImmOp64, FirstReg, Mips::NoRegister, false, false,
-                        IDLoc, Out, STI))
+      if (loadImmediate(ImmOp64, FirstReg, MCRegister(), false, false, IDLoc,
+                        Out, STI))
         return true;
     } else {
-      if (loadImmediate(Hi_32(ImmOp64), FirstReg, Mips::NoRegister, true, false,
+      if (loadImmediate(Hi_32(ImmOp64), FirstReg, MCRegister(), true, false,
                         IDLoc, Out, STI))
         return true;
 
-      if (loadImmediate(0, nextReg(FirstReg), Mips::NoRegister, true, false,
-                        IDLoc, Out, STI))
+      if (loadImmediate(0, nextReg(FirstReg), MCRegister(), true, false, IDLoc,
+                        Out, STI))
         return true;
     }
     return false;
@@ -3544,7 +3543,7 @@ bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc,
   getStreamer().emitIntValue(ImmOp64, 8);
   getStreamer().switchSection(CS);
 
-  unsigned TmpReg = getATReg(IDLoc);
+  MCRegister TmpReg = getATReg(IDLoc);
   if (!TmpReg)
     return true;
 
@@ -3571,12 +3570,12 @@ bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU,
   assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() &&
          "Invalid instruction operand.");
 
-  unsigned FirstReg = Inst.getOperand(0).getReg();
+  MCRegister FirstReg = Inst.getOperand(0).getReg();
   uint64_t ImmOp64 = Inst.getOperand(1).getImm();
 
   ImmOp64 = convertIntToDoubleImm(ImmOp64);
 
-  unsigned TmpReg = Mips::ZERO;
+  MCRegister TmpReg = Mips::ZERO;
   if (ImmOp64 != 0) {
     TmpReg = getATReg(IDLoc);
     if (!TmpReg)
@@ -3586,17 +3585,16 @@ bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU,
   if ((Lo_32(ImmOp64) == 0) &&
       !((Hi_32(ImmOp64) & 0xffff0000) && (Hi_32(ImmOp64) & 0x0000ffff))) {
     if (isGP64bit()) {
-      if (TmpReg != Mips::ZERO &&
-          loadImmediate(ImmOp64, TmpReg, Mips::NoRegister, false, false, IDLoc,
-                        Out, STI))
+      if (TmpReg != Mips::ZERO && loadImmediate(ImmOp64, TmpReg, MCRegister(),
+                                                false, false, IDLoc, Out, STI))
         return true;
       TOut.emitRR(Mips::DMTC1, FirstReg, TmpReg, IDLoc, STI);
       return false;
     }
 
     if (TmpReg != Mips::ZERO &&
-        loadImmediate(Hi_32(ImmOp64), TmpReg, Mips::NoRegister, true, false,
-                      IDLoc, Out, STI))
+        loadImmediate(Hi_32(ImmOp64), TmpReg, MCRegister(), true, false, IDLoc,
+                      Out, STI))
       return true;
 
     if (hasMips32r2()) {
@@ -3729,12 +3727,12 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   } else {
     warnIfNoMacro(IDLoc);
 
-    unsigned ATReg = getATReg(IDLoc);
+    MCRegister ATReg = getATReg(IDLoc);
     if (!ATReg)
       return true;
 
-    if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), true,
-                      IDLoc, Out, STI))
+    if (loadImmediate(ImmValue, ATReg, MCRegister(), !isGP64bit(), true, IDLoc,
+                      Out, STI))
       return true;
 
     if (IsLikely) {
@@ -3761,9 +3759,9 @@ void MipsAsmParser::expandMem16Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
 
   MipsTargetStreamer &TOut = getTargetStreamer();
   unsigned OpCode = Inst.getOpcode();
-  unsigned DstReg = DstRegOp.getReg();
-  unsigned BaseReg = BaseRegOp.getReg();
-  unsigned TmpReg = DstReg;
+  MCRegister DstReg = DstRegOp.getReg();
+  MCRegister BaseReg = BaseRegOp.getReg();
+  MCRegister TmpReg = DstReg;
 
   const MCInstrDesc &Desc = MII.get(OpCode);
   int16_t DstRegClass = Desc.operands()[StartOp].RegClass;
@@ -3800,8 +3798,8 @@ void MipsAsmParser::expandMem16Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
 
     if (IsLargeOffset) {
       bool Is32BitImm = isInt<32>(OffsetOp.getImm());
-      if (loadImmediate(HiOffset, TmpReg, Mips::NoRegister, Is32BitImm, true,
-                        IDLoc, Out, STI))
+      if (loadImmediate(HiOffset, TmpReg, MCRegister(), Is32BitImm, true, IDLoc,
+                        Out, STI))
         return;
     }
 
@@ -3888,9 +3886,9 @@ void MipsAsmParser::expandMem9Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
 
   MipsTargetStreamer &TOut = getTargetStreamer();
   unsigned OpCode = Inst.getOpcode();
-  unsigned DstReg = DstRegOp.getReg();
-  unsigned BaseReg = BaseRegOp.getReg();
-  unsigned TmpReg = DstReg;
+  MCRegister DstReg = DstRegOp.getReg();
+  MCRegister BaseReg = BaseRegOp.getReg();
+  MCRegister TmpReg = DstReg;
 
   const MCInstrDesc &Desc = MII.get(OpCode);
   int16_t DstRegClass = Desc.operands()[StartOp].RegClass;
@@ -3967,14 +3965,14 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
   MipsTargetStreamer &TOut = getTargetStreamer();
   bool EmittedNoMacroWarning = false;
   unsigned PseudoOpcode = Inst.getOpcode();
-  unsigned SrcReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(0).getReg();
   const MCOperand &TrgOp = Inst.getOperand(1);
   const MCExpr *OffsetExpr = Inst.getOperand(2).getExpr();
 
   unsigned ZeroSrcOpcode, ZeroTrgOpcode;
   bool ReverseOrderSLT, IsUnsigned, IsLikely, AcceptsEquality;
 
-  unsigned TrgReg;
+  MCRegister TrgReg;
   if (TrgOp.isReg())
     TrgReg = TrgOp.getReg();
   else if (TrgOp.isImm()) {
@@ -4038,8 +4036,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
       break;
     }
 
-    if (loadImmediate(TrgOp.getImm(), TrgReg, Mips::NoRegister, !isGP64bit(),
-                      false, IDLoc, Out, STI))
+    if (loadImmediate(TrgOp.getImm(), TrgReg, MCRegister(), !isGP64bit(), false,
+                      IDLoc, Out, STI))
       return true;
   }
 
@@ -4191,7 +4189,7 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
 
   // If neither the SrcReg nor the TrgReg are $0, we need AT to perform the
   // expansions. If it is not available, we return.
-  unsigned ATRegNum = getATReg(IDLoc);
+  MCRegister ATRegNum = getATReg(IDLoc);
   if (!ATRegNum)
     return true;
 
@@ -4241,13 +4239,13 @@ bool MipsAsmParser::expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
 
   const MCOperand &RdRegOp = Inst.getOperand(0);
   assert(RdRegOp.isReg() && "expected register operand kind");
-  unsigned RdReg = RdRegOp.getReg();
+  MCRegister RdReg = RdRegOp.getReg();
 
   const MCOperand &RsRegOp = Inst.getOperand(1);
   assert(RsRegOp.isReg() && "expected register operand kind");
-  unsigned RsReg = RsRegOp.getReg();
+  MCRegister RsReg = RsRegOp.getReg();
 
-  unsigned RtReg;
+  MCRegister RtReg;
   int64_t ImmValue;
 
   const MCOperand &RtOp = Inst.getOperand(2);
@@ -4286,7 +4284,7 @@ bool MipsAsmParser::expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                Opcode == Mips::DURemMacro || Opcode == Mips::DURemIMacro;
 
   if (RtOp.isImm()) {
-    unsigned ATReg = getATReg(IDLoc);
+    MCRegister ATReg = getATReg(IDLoc);
     if (!ATReg)
       return true;
 
@@ -4308,7 +4306,7 @@ bool MipsAsmParser::expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
       TOut.emitRRR(SubOp, RdReg, ZeroReg, RsReg, IDLoc, STI);
       return false;
     } else {
-      if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, isInt<32>(ImmValue),
+      if (loadImmediate(ImmValue, ATReg, MCRegister(), isInt<32>(ImmValue),
                         false, Inst.getLoc(), Out, STI))
         return true;
       TOut.emitRR(DivOp, RsReg, ATReg, IDLoc, STI);
@@ -4365,7 +4363,7 @@ bool MipsAsmParser::expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
     return false;
   }
 
-  unsigned ATReg = getATReg(IDLoc);
+  MCRegister ATReg = getATReg(IDLoc);
   if (!ATReg)
     return true;
 
@@ -4412,12 +4410,12 @@ bool MipsAsmParser::expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU,
   assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() &&
          Inst.getOperand(2).isReg() && "Invalid instruction operand.");
 
-  unsigned FirstReg = Inst.getOperand(0).getReg();
-  unsigned SecondReg = Inst.getOperand(1).getReg();
-  unsigned ThirdReg = Inst.getOperand(2).getReg();
+  MCRegister FirstReg = Inst.getOperand(0).getReg();
+  MCRegister SecondReg = Inst.getOperand(1).getReg();
+  MCRegister ThirdReg = Inst.getOperand(2).getReg();
 
   if (hasMips1() && !hasMips2()) {
-    unsigned ATReg = getATReg(IDLoc);
+    MCRegister ATReg = getATReg(IDLoc);
     if (!ATReg)
       return true;
     TOut.emitRR(Mips::CFC1, ThirdReg, Mips::RA, IDLoc, STI);
@@ -4456,14 +4454,14 @@ bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc,
   assert(OffsetImmOp.isImm() && "expected immediate operand kind");
 
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned DstReg = DstRegOp.getReg();
-  unsigned SrcReg = SrcRegOp.getReg();
+  MCRegister DstReg = DstRegOp.getReg();
+  MCRegister SrcReg = SrcRegOp.getReg();
   int64_t OffsetValue = OffsetImmOp.getImm();
 
   // NOTE: We always need AT for ULHU, as it is always used as the source
   // register for one of the LBu's.
   warnIfNoMacro(IDLoc);
-  unsigned ATReg = getATReg(IDLoc);
+  MCRegister ATReg = getATReg(IDLoc);
   if (!ATReg)
     return true;
 
@@ -4479,11 +4477,11 @@ bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc,
   if (isLittle())
     std::swap(FirstOffset, SecondOffset);
 
-  unsigned FirstLbuDstReg = IsLargeOffset ? DstReg : ATReg;
-  unsigned SecondLbuDstReg = IsLargeOffset ? ATReg : DstReg;
+  MCRegister FirstLbuDstReg = IsLargeOffset ? DstReg : ATReg;
+  MCRegister SecondLbuDstReg = IsLargeOffset ? ATReg : DstReg;
 
-  unsigned LbuSrcReg = IsLargeOffset ? ATReg : SrcReg;
-  unsigned SllReg = IsLargeOffset ? DstReg : ATReg;
+  MCRegister LbuSrcReg = IsLargeOffset ? ATReg : SrcReg;
+  MCRegister SllReg = IsLargeOffset ? DstReg : ATReg;
 
   TOut.emitRRI(Signed ? Mips::LB : Mips::LBu, FirstLbuDstReg, LbuSrcReg,
                FirstOffset, IDLoc, STI);
@@ -4508,12 +4506,12 @@ bool MipsAsmParser::expandUsh(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   assert(OffsetImmOp.isImm() && "expected immediate operand kind");
 
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned DstReg = DstRegOp.getReg();
-  unsigned SrcReg = SrcRegOp.getReg();
+  MCRegister DstReg = DstRegOp.getReg();
+  MCRegister SrcReg = SrcRegOp.getReg();
   int64_t OffsetValue = OffsetImmOp.getImm();
 
   warnIfNoMacro(IDLoc);
-  unsigned ATReg = getATReg(IDLoc);
+  MCRegister ATReg = getATReg(IDLoc);
   if (!ATReg)
     return true;
 
@@ -4559,8 +4557,8 @@ bool MipsAsmParser::expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   assert(OffsetImmOp.isImm() && "expected immediate operand kind");
 
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned DstReg = DstRegOp.getReg();
-  unsigned SrcReg = SrcRegOp.getReg();
+  MCRegister DstReg = DstRegOp.getReg();
+  MCRegister SrcReg = SrcRegOp.getReg();
   int64_t OffsetValue = OffsetImmOp.getImm();
 
   // Compute left/right load/store offsets.
@@ -4572,7 +4570,7 @@ bool MipsAsmParser::expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
 
   bool IsLoadInst = (Inst.getOpcode() == Mips::Ulw);
   bool DoMove = IsLoadInst && (SrcReg == DstReg) && !IsLargeOffset;
-  unsigned TmpReg = SrcReg;
+  MCRegister TmpReg = SrcReg;
   if (IsLargeOffset || DoMove) {
     warnIfNoMacro(IDLoc);
     TmpReg = getATReg(IDLoc);
@@ -4609,9 +4607,9 @@ bool MipsAsmParser::expandSge(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
          Inst.getOperand(1).isReg() &&
          Inst.getOperand(2).isReg() && "Invalid instruction operand.");
 
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
-  unsigned OpReg = Inst.getOperand(2).getReg();
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
+  MCRegister OpReg = Inst.getOperand(2).getReg();
   unsigned OpCode;
 
   warnIfNoMacro(IDLoc);
@@ -4643,8 +4641,8 @@ bool MipsAsmParser::expandSgeImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
          Inst.getOperand(1).isReg() &&
          Inst.getOperand(2).isImm() && "Invalid instruction operand.");
 
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
   int64_t ImmValue = Inst.getOperand(2).getImm();
   unsigned OpRegCode, OpImmCode;
 
@@ -4671,15 +4669,15 @@ bool MipsAsmParser::expandSgeImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
     TOut.emitRRI(OpImmCode, DstReg, SrcReg, ImmValue, IDLoc, STI);
     TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI);
   } else {
-    unsigned ImmReg = DstReg;
+    MCRegister ImmReg = DstReg;
     if (DstReg == SrcReg) {
-      unsigned ATReg = getATReg(Inst.getLoc());
+      MCRegister ATReg = getATReg(Inst.getLoc());
       if (!ATReg)
         return true;
       ImmReg = ATReg;
     }
 
-    if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue),
+    if (loadImmediate(ImmValue, ImmReg, MCRegister(), isInt<32>(ImmValue),
                       false, IDLoc, Out, STI))
       return true;
 
@@ -4699,9 +4697,9 @@ bool MipsAsmParser::expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
          Inst.getOperand(1).isReg() &&
          Inst.getOperand(2).isImm() && "Invalid instruction operand.");
 
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
-  unsigned ImmReg = DstReg;
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
+  MCRegister ImmReg = DstReg;
   int64_t ImmValue = Inst.getOperand(2).getImm();
   unsigned OpCode;
 
@@ -4721,14 +4719,14 @@ bool MipsAsmParser::expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   }
 
   if (DstReg == SrcReg) {
-    unsigned ATReg = getATReg(Inst.getLoc());
+    MCRegister ATReg = getATReg(Inst.getLoc());
     if (!ATReg)
       return true;
     ImmReg = ATReg;
   }
 
-  if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue),
-                    false, IDLoc, Out, STI))
+  if (loadImmediate(ImmValue, ImmReg, MCRegister(), isInt<32>(ImmValue), false,
+                    IDLoc, Out, STI))
     return true;
 
   // $SrcReg > $ImmReg is equal to $ImmReg < $SrcReg
@@ -4746,9 +4744,9 @@ bool MipsAsmParser::expandSle(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
          Inst.getOperand(1).isReg() &&
          Inst.getOperand(2).isReg() && "Invalid instruction operand.");
 
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
-  unsigned OpReg = Inst.getOperand(2).getReg();
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
+  MCRegister OpReg = Inst.getOperand(2).getReg();
   unsigned OpCode;
 
   warnIfNoMacro(IDLoc);
@@ -4780,8 +4778,8 @@ bool MipsAsmParser::expandSleImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
          Inst.getOperand(1).isReg() &&
          Inst.getOperand(2).isImm() && "Invalid instruction operand.");
 
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
   int64_t ImmValue = Inst.getOperand(2).getImm();
   unsigned OpRegCode;
 
@@ -4801,16 +4799,16 @@ bool MipsAsmParser::expandSleImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   }
 
   // $SrcReg <= Imm is equal to (not (Imm < $SrcReg))
-  unsigned ImmReg = DstReg;
+  MCRegister ImmReg = DstReg;
   if (DstReg == SrcReg) {
-    unsigned ATReg = getATReg(Inst.getLoc());
+    MCRegister ATReg = getATReg(Inst.getLoc());
     if (!ATReg)
       return true;
     ImmReg = ATReg;
   }
 
-  if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue),
-                    false, IDLoc, Out, STI))
+  if (loadImmediate(ImmValue, ImmReg, MCRegister(), isInt<32>(ImmValue), false,
+                    IDLoc, Out, STI))
     return true;
 
   TOut.emitRRR(OpRegCode, DstReg, ImmReg, SrcReg, IDLoc, STI);
@@ -4829,10 +4827,10 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
          Inst.getOperand(1).isReg() &&
          Inst.getOperand(2).isImm() && "Invalid instruction operand.");
 
-  unsigned ATReg = Mips::NoRegister;
-  unsigned FinalDstReg = Mips::NoRegister;
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
+  MCRegister ATReg;
+  MCRegister FinalDstReg;
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
   int64_t ImmValue = Inst.getOperand(2).getImm();
 
   bool Is32Bit = isInt<32>(ImmValue) || (!isGP64bit() && isUInt<32>(ImmValue));
@@ -4847,7 +4845,7 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
     DstReg = ATReg;
   }
 
-  if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false,
+  if (!loadImmediate(ImmValue, DstReg, MCRegister(), Is32Bit, false,
                      Inst.getLoc(), Out, STI)) {
     switch (FinalOpcode) {
     default:
@@ -4917,7 +4915,7 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
       break;
     }
 
-    if (FinalDstReg == Mips::NoRegister)
+    if (!FinalDstReg)
       TOut.emitRRR(FinalOpcode, DstReg, DstReg, SrcReg, IDLoc, STI);
     else
       TOut.emitRRR(FinalOpcode, FinalDstReg, FinalDstReg, DstReg, IDLoc, STI);
@@ -4929,11 +4927,11 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
 bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                                    const MCSubtargetInfo *STI) {
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned ATReg = Mips::NoRegister;
-  unsigned DReg = Inst.getOperand(0).getReg();
-  unsigned SReg = Inst.getOperand(1).getReg();
-  unsigned TReg = Inst.getOperand(2).getReg();
-  unsigned TmpReg = DReg;
+  MCRegister ATReg;
+  MCRegister DReg = Inst.getOperand(0).getReg();
+  MCRegister SReg = Inst.getOperand(1).getReg();
+  MCRegister TReg = Inst.getOperand(2).getReg();
+  MCRegister TmpReg = DReg;
 
   unsigned FirstShift = Mips::NOP;
   unsigned SecondShift = Mips::NOP;
@@ -4992,9 +4990,9 @@ bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
                                       MCStreamer &Out,
                                       const MCSubtargetInfo *STI) {
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned ATReg = Mips::NoRegister;
-  unsigned DReg = Inst.getOperand(0).getReg();
-  unsigned SReg = Inst.getOperand(1).getReg();
+  MCRegister ATReg;
+  MCRegister DReg = Inst.getOperand(0).getReg();
+  MCRegister SReg = Inst.getOperand(1).getReg();
   int64_t ImmValue = Inst.getOperand(2).getImm();
 
   unsigned FirstShift = Mips::NOP;
@@ -5054,11 +5052,11 @@ bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
 bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                                     const MCSubtargetInfo *STI) {
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned ATReg = Mips::NoRegister;
-  unsigned DReg = Inst.getOperand(0).getReg();
-  unsigned SReg = Inst.getOperand(1).getReg();
-  unsigned TReg = Inst.getOperand(2).getReg();
-  unsigned TmpReg = DReg;
+  MCRegister ATReg;
+  MCRegister DReg = Inst.getOperand(0).getReg();
+  MCRegister SReg = Inst.getOperand(1).getReg();
+  MCRegister TReg = Inst.getOperand(2).getReg();
+  MCRegister TmpReg = DReg;
 
   unsigned FirstShift = Mips::NOP;
   unsigned SecondShift = Mips::NOP;
@@ -5117,9 +5115,9 @@ bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
                                        MCStreamer &Out,
                                        const MCSubtargetInfo *STI) {
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned ATReg = Mips::NoRegister;
-  unsigned DReg = Inst.getOperand(0).getReg();
-  unsigned SReg = Inst.getOperand(1).getReg();
+  MCRegister ATReg;
+  MCRegister DReg = Inst.getOperand(0).getReg();
+  MCRegister SReg = Inst.getOperand(1).getReg();
   int64_t ImmValue = Inst.getOperand(2).getImm() % 64;
 
   unsigned FirstShift = Mips::NOP;
@@ -5211,8 +5209,8 @@ bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
 bool MipsAsmParser::expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                               const MCSubtargetInfo *STI) {
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned FirstRegOp = Inst.getOperand(0).getReg();
-  unsigned SecondRegOp = Inst.getOperand(1).getReg();
+  MCRegister FirstRegOp = Inst.getOperand(0).getReg();
+  MCRegister SecondRegOp = Inst.getOperand(1).getReg();
 
   TOut.emitRI(Mips::BGEZ, SecondRegOp, 8, IDLoc, STI);
   if (FirstRegOp != SecondRegOp)
@@ -5227,17 +5225,16 @@ bool MipsAsmParser::expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
 bool MipsAsmParser::expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                                  const MCSubtargetInfo *STI) {
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned ATReg = Mips::NoRegister;
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
+  MCRegister ATReg;
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
   int32_t ImmValue = Inst.getOperand(2).getImm();
 
   ATReg = getATReg(IDLoc);
   if (!ATReg)
     return true;
 
-  loadImmediate(ImmValue, ATReg, Mips::NoRegister, true, false, IDLoc, Out,
-                STI);
+  loadImmediate(ImmValue, ATReg, MCRegister(), true, false, IDLoc, Out, STI);
 
   TOut.emitRR(Inst.getOpcode() == Mips::MULImmMacro ? Mips::MULT : Mips::DMULT,
               SrcReg, ATReg, IDLoc, STI);
@@ -5250,10 +5247,10 @@ bool MipsAsmParser::expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
 bool MipsAsmParser::expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                                const MCSubtargetInfo *STI) {
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned ATReg = Mips::NoRegister;
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
-  unsigned TmpReg = Inst.getOperand(2).getReg();
+  MCRegister ATReg;
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
+  MCRegister TmpReg = Inst.getOperand(2).getReg();
 
   ATReg = getATReg(Inst.getLoc());
   if (!ATReg)
@@ -5292,10 +5289,10 @@ bool MipsAsmParser::expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
 bool MipsAsmParser::expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                                 const MCSubtargetInfo *STI) {
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned ATReg = Mips::NoRegister;
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
-  unsigned TmpReg = Inst.getOperand(2).getReg();
+  MCRegister ATReg;
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
+  MCRegister TmpReg = Inst.getOperand(2).getReg();
 
   ATReg = getATReg(IDLoc);
   if (!ATReg)
@@ -5328,9 +5325,9 @@ bool MipsAsmParser::expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
 bool MipsAsmParser::expandDMULMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                                     const MCSubtargetInfo *STI) {
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
-  unsigned TmpReg = Inst.getOperand(2).getReg();
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
+  MCRegister TmpReg = Inst.getOperand(2).getReg();
 
   TOut.emitRR(Mips::DMULTu, SrcReg, TmpReg, IDLoc, STI);
   TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
@@ -5354,9 +5351,9 @@ bool MipsAsmParser::expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc,
 
   MipsTargetStreamer &TOut = getTargetStreamer();
   unsigned Opcode = IsLoad ? Mips::LW : Mips::SW;
-  unsigned FirstReg = Inst.getOperand(0).getReg();
-  unsigned SecondReg = nextReg(FirstReg);
-  unsigned BaseReg = Inst.getOperand(1).getReg();
+  MCRegister FirstReg = Inst.getOperand(0).getReg();
+  MCRegister SecondReg = nextReg(FirstReg);
+  MCRegister BaseReg = Inst.getOperand(1).getReg();
   if (!SecondReg)
     return true;
 
@@ -5401,9 +5398,9 @@ bool MipsAsmParser::expandStoreDM1Macro(MCInst &Inst, SMLoc IDLoc,
 
   MipsTargetStreamer &TOut = getTargetStreamer();
   unsigned Opcode = Mips::SWC1;
-  unsigned FirstReg = Inst.getOperand(0).getReg();
-  unsigned SecondReg = nextReg(FirstReg);
-  unsigned BaseReg = Inst.getOperand(1).getReg();
+  MCRegister FirstReg = Inst.getOperand(0).getReg();
+  MCRegister SecondReg = nextReg(FirstReg);
+  MCRegister BaseReg = Inst.getOperand(1).getReg();
   if (!SecondReg)
     return true;
 
@@ -5437,9 +5434,9 @@ bool MipsAsmParser::expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
          Inst.getOperand(1).isReg() &&
          Inst.getOperand(2).isReg() && "Invalid instruction operand.");
 
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
-  unsigned OpReg = Inst.getOperand(2).getReg();
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
+  MCRegister OpReg = Inst.getOperand(2).getReg();
 
   warnIfNoMacro(IDLoc);
 
@@ -5449,7 +5446,7 @@ bool MipsAsmParser::expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
     return false;
   }
 
-  unsigned Reg = SrcReg == Mips::ZERO ? OpReg : SrcReg;
+  MCRegister Reg = SrcReg == Mips::ZERO ? OpReg : SrcReg;
   TOut.emitRRI(Mips::SLTiu, DstReg, Reg, 1, IDLoc, STI);
   return false;
 }
@@ -5463,8 +5460,8 @@ bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
          Inst.getOperand(1).isReg() &&
          Inst.getOperand(2).isImm() && "Invalid instruction operand.");
 
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
   int64_t Imm = Inst.getOperand(2).getImm();
 
   warnIfNoMacro(IDLoc);
@@ -5490,12 +5487,12 @@ bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   }
 
   if (!isUInt<16>(Imm)) {
-    unsigned ATReg = getATReg(IDLoc);
+    MCRegister ATReg = getATReg(IDLoc);
     if (!ATReg)
       return true;
 
-    if (loadImmediate(Imm, ATReg, Mips::NoRegister, true, isGP64bit(), IDLoc,
-                      Out, STI))
+    if (loadImmediate(Imm, ATReg, MCRegister(), true, isGP64bit(), IDLoc, Out,
+                      STI))
       return true;
 
     TOut.emitRRR(Mips::XOR, DstReg, SrcReg, ATReg, IDLoc, STI);
@@ -5518,9 +5515,9 @@ bool MipsAsmParser::expandSne(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
          Inst.getOperand(1).isReg() &&
          Inst.getOperand(2).isReg() && "Invalid instruction operand.");
 
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
-  unsigned OpReg = Inst.getOperand(2).getReg();
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
+  MCRegister OpReg = Inst.getOperand(2).getReg();
 
   warnIfNoMacro(IDLoc);
 
@@ -5530,7 +5527,7 @@ bool MipsAsmParser::expandSne(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
     return false;
   }
 
-  unsigned Reg = SrcReg == Mips::ZERO ? OpReg : SrcReg;
+  MCRegister Reg = SrcReg == Mips::ZERO ? OpReg : SrcReg;
   TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, Reg, IDLoc, STI);
   return false;
 }
@@ -5544,8 +5541,8 @@ bool MipsAsmParser::expandSneI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
          Inst.getOperand(1).isReg() &&
          Inst.getOperand(2).isImm() && "Invalid instruction operand.");
 
-  unsigned DstReg = Inst.getOperand(0).getReg();
-  unsigned SrcReg = Inst.getOperand(1).getReg();
+  MCRegister DstReg = Inst.getOperand(0).getReg();
+  MCRegister SrcReg = Inst.getOperand(1).getReg();
   int64_t ImmValue = Inst.getOperand(2).getImm();
 
   warnIfNoMacro(IDLoc);
@@ -5557,8 +5554,7 @@ bool MipsAsmParser::expandSneI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
 
   if (SrcReg == Mips::ZERO) {
     Warning(IDLoc, "comparison is always true");
-    if (loadImmediate(1, DstReg, Mips::NoRegister, true, false, IDLoc, Out,
-                      STI))
+    if (loadImmediate(1, DstReg, MCRegister(), true, false, IDLoc, Out, STI))
       return true;
     return false;
   }
@@ -5577,12 +5573,12 @@ bool MipsAsmParser::expandSneI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
     return false;
   }
 
-  unsigned ATReg = getATReg(IDLoc);
+  MCRegister ATReg = getATReg(IDLoc);
   if (!ATReg)
     return true;
 
-  if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, isInt<32>(ImmValue),
-                    false, IDLoc, Out, STI))
+  if (loadImmediate(ImmValue, ATReg, MCRegister(), isInt<32>(ImmValue), false,
+                    IDLoc, Out, STI))
     return true;
 
   TOut.emitRRR(Mips::XOR, DstReg, SrcReg, ATReg, IDLoc, STI);
@@ -5597,7 +5593,7 @@ static unsigned getRegisterForMxtrDSP(MCInst &Inst, bool IsMFDSP) {
   switch (Inst.getOpcode()) {
     case Mips::MFTLO:
     case Mips::MTTLO:
-      switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) {
+      switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg().id()) {
         case Mips::AC0:
           return Mips::ZERO;
         case Mips::AC1:
@@ -5611,7 +5607,7 @@ static unsigned getRegisterForMxtrDSP(MCInst &Inst, bool IsMFDSP) {
     }
     case Mips::MFTHI:
     case Mips::MTTHI:
-      switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) {
+      switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg().id()) {
         case Mips::AC0:
           return Mips::AT;
         case Mips::AC1:
@@ -5625,7 +5621,7 @@ static unsigned getRegisterForMxtrDSP(MCInst &Inst, bool IsMFDSP) {
     }
     case Mips::MFTACX:
     case Mips::MTTACX:
-      switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) {
+      switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg().id()) {
         case Mips::AC0:
           return Mips::V0;
         case Mips::AC1:
@@ -5648,7 +5644,7 @@ static unsigned getRegisterForMxtrDSP(MCInst &Inst, bool IsMFDSP) {
 // Map the floating point register operand to the corresponding register
 // operand.
 static unsigned getRegisterForMxtrFP(MCInst &Inst, bool IsMFTC1) {
-  switch (Inst.getOperand(IsMFTC1 ? 1 : 0).getReg()) {
+  switch (Inst.getOperand(IsMFTC1 ? 1 : 0).getReg().id()) {
     case Mips::F0:  return Mips::ZERO;
     case Mips::F1:  return Mips::AT;
     case Mips::F2:  return Mips::V0;
@@ -5687,7 +5683,7 @@ static unsigned getRegisterForMxtrFP(MCInst &Inst, bool IsMFTC1) {
 
 // Map the coprocessor operand the corresponding gpr register operand.
 static unsigned getRegisterForMxtrC0(MCInst &Inst, bool IsMFTC0) {
-  switch (Inst.getOperand(IsMFTC0 ? 1 : 0).getReg()) {
+  switch (Inst.getOperand(IsMFTC0 ? 1 : 0).getReg().id()) {
     case Mips::COP00:  return Mips::ZERO;
     case Mips::COP01:  return Mips::AT;
     case Mips::COP02:  return Mips::V0;
@@ -5729,7 +5725,7 @@ static unsigned getRegisterForMxtrC0(MCInst &Inst, bool IsMFTC0) {
 bool MipsAsmParser::expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                                     const MCSubtargetInfo *STI) {
   MipsTargetStreamer &TOut = getTargetStreamer();
-  unsigned rd = 0;
+  MCRegister rd;
   unsigned u = 1;
   unsigned sel = 0;
   unsigned h = 0;
@@ -5806,8 +5802,8 @@ bool MipsAsmParser::expandSaaAddr(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
 
   MipsTargetStreamer &TOut = getTargetStreamer();
   unsigned Opcode = Inst.getOpcode() == Mips::SaaAddr ? Mips::SAA : Mips::SAAD;
-  unsigned RtReg = Inst.getOperand(0).getReg();
-  unsigned BaseReg = Inst.getOperand(1).getReg();
+  MCRegister RtReg = Inst.getOperand(0).getReg();
+  MCRegister BaseReg = Inst.getOperand(1).getReg();
   const MCOperand &BaseOp = Inst.getOperand(2);
 
   if (BaseOp.isImm()) {
@@ -5818,7 +5814,7 @@ bool MipsAsmParser::expandSaaAddr(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
     }
   }
 
-  unsigned ATReg = getATReg(IDLoc);
+  MCRegister ATReg = getATReg(IDLoc);
   if (!ATReg)
     return true;
 
@@ -6197,9 +6193,9 @@ bool MipsAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   llvm_unreachable("Implement any new match types added!");
 }
 
-void MipsAsmParser::warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc) {
-  if (RegIndex != 0 && AssemblerOptions.back()->getATRegIndex() == RegIndex)
-    Warning(Loc, "used $at (currently $" + Twine(RegIndex) +
+void MipsAsmParser::warnIfRegIndexIsAT(MCRegister RegIndex, SMLoc Loc) {
+  if (RegIndex && AssemblerOptions.back()->getATRegIndex() == RegIndex)
+    Warning(Loc, "used $at (currently $" + Twine(RegIndex.id()) +
                      ") without \".set noat\"");
 }
 
@@ -6214,7 +6210,7 @@ void MipsAsmParser::ConvertXWPOperands(MCInst &Inst,
       (Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM) &&
       "Unexpected instruction!");
   ((MipsOperand &)*Operands[1]).addGPR32ZeroAsmRegOperands(Inst, 1);
-  int NextReg = nextReg(((MipsOperand &)*Operands[1]).getGPR32Reg());
+  MCRegister NextReg = nextReg(((MipsOperand &)*Operands[1]).getGPR32Reg());
   Inst.addOperand(MCOperand::createReg(NextReg));
   ((MipsOperand &)*Operands[2]).addMemOperands(Inst, 2);
 }
@@ -6391,19 +6387,19 @@ bool MipsAsmParser::canUseATReg() {
   return AssemblerOptions.back()->getATRegIndex() != 0;
 }
 
-unsigned MipsAsmParser::getATReg(SMLoc Loc) {
+MCRegister MipsAsmParser::getATReg(SMLoc Loc) {
   unsigned ATIndex = AssemblerOptions.back()->getATRegIndex();
   if (ATIndex == 0) {
     reportParseError(Loc,
                      "pseudo-instruction requires $at, which is not available");
     return 0;
   }
-  unsigned AT = getReg(
+  MCRegister AT = getReg(
       (isGP64bit()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, ATIndex);
   return AT;
 }
 
-unsigned MipsAsmParser::getReg(int RC, int RegNo) {
+MCRegister MipsAsmParser::getReg(int RC, int RegNo) {
   return getContext().getRegisterInfo()->getRegClass(RC).getRegister(RegNo);
 }
 
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
index b9a2af3341236..bd4c5d35ddfbe 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -147,7 +147,7 @@ inline static MCRegister getMSARegFromFReg(MCRegister Reg) {
   else if (Reg >= Mips::D0_64 && Reg <= Mips::D31_64)
     return Reg - Mips::D0_64 + Mips::W0;
   else
-    return Mips::NoRegister;
+    return MCRegister();
 }
 }
 
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index f861268c00158..e8b9746da467c 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -46,7 +46,7 @@ void MipsELFStreamer::emitInstruction(const MCInst &Inst,
     if (!Op.isReg())
       continue;
 
-    unsigned Reg = Op.getReg();
+    MCRegister Reg = Op.getReg();
     RegInfoRecord->SetPhysRegUsed(Reg, MCRegInfo);
   }
 
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 73ee44eec22cd..097b3cf8aa723 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -91,8 +91,8 @@ static void LowerLargeShift(MCInst& Inst) {
 void MipsMCCodeEmitter::LowerCompactBranch(MCInst& Inst) const {
   // Encoding may be illegal !(rs < rt), but this situation is
   // easily fixed.
-  unsigned RegOp0 = Inst.getOperand(0).getReg();
-  unsigned RegOp1 = Inst.getOperand(1).getReg();
+  MCRegister RegOp0 = Inst.getOperand(0).getReg();
+  MCRegister RegOp1 = Inst.getOperand(1).getReg();
 
   unsigned Reg0 =  Ctx.getRegisterInfo()->getEncodingValue(RegOp0);
   unsigned Reg1 =  Ctx.getRegisterInfo()->getEncodingValue(RegOp1);
@@ -724,7 +724,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                   SmallVectorImpl<MCFixup> &Fixups,
                   const MCSubtargetInfo &STI) const {
   if (MO.isReg()) {
-    unsigned Reg = MO.getReg();
+    MCRegister Reg = MO.getReg();
     unsigned RegNo = Ctx.getRegisterInfo()->getEncodingValue(Reg);
     return RegNo;
   } else if (MO.isImm()) {
@@ -1033,7 +1033,7 @@ MipsMCCodeEmitter::getRegisterListOpValue(const MCInst &MI, unsigned OpNo,
   // placed before memory operand (register + imm).
 
   for (unsigned I = OpNo, E = MI.getNumOperands() - 2; I < E; ++I) {
-    unsigned Reg = MI.getOperand(I).getReg();
+    MCRegister Reg = MI.getOperand(I).getReg();
     unsigned RegNo = Ctx.getRegisterInfo()->getEncodingValue(Reg);
     if (RegNo != 31)
       res++;
@@ -1093,7 +1093,7 @@ MipsMCCodeEmitter::getMovePRegSingleOpValue(const MCInst &MI, unsigned OpNo,
 
   MCOperand Op = MI.getOperand(OpNo);
   assert(Op.isReg() && "Operand of movep is not a register!");
-  switch (Op.getReg()) {
+  switch (Op.getReg().id()) {
   default:
     llvm_unreachable("Unknown register for movep!");
   case Mips::ZERO:  return 0;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
index 2722e34b3f624..94b2f412c8cdb 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
@@ -19,7 +19,7 @@ static const Align MIPS_NACL_BUNDLE_ALIGN = Align(16);
 
 bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
                                   bool *IsStore = nullptr);
-bool baseRegNeedsLoadStoreMask(unsigned Reg);
+bool baseRegNeedsLoadStoreMask(MCRegister Reg);
 
 // This function creates an MCELFStreamer for Mips NaCl.
 MCELFStreamer *
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index 8bd2b2ac231bf..3410726c8e553 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -97,7 +97,7 @@ class MipsNaClELFStreamer : public MipsELFStreamer {
     }
   }
 
-  void emitMask(unsigned AddrReg, unsigned MaskReg,
+  void emitMask(MCRegister AddrReg, unsigned MaskReg,
                 const MCSubtargetInfo &STI) {
     MCInst MaskInst;
     MaskInst.setOpcode(Mips::AND);
@@ -110,7 +110,7 @@ class MipsNaClELFStreamer : public MipsELFStreamer {
   // Sandbox indirect branch or return instruction by inserting mask operation
   // before it.
   void sandboxIndirectJump(const MCInst &MI, const MCSubtargetInfo &STI) {
-    unsigned AddrReg = MI.getOperand(0).getReg();
+    MCRegister AddrReg = MI.getOperand(0).getReg();
 
     emitBundleLock(false);
     emitMask(AddrReg, IndirectBranchMaskReg, STI);
@@ -126,13 +126,13 @@ class MipsNaClELFStreamer : public MipsELFStreamer {
     emitBundleLock(false);
     if (MaskBefore) {
       // Sandbox memory access.
-      unsigned BaseReg = MI.getOperand(AddrIdx).getReg();
+      MCRegister BaseReg = MI.getOperand(AddrIdx).getReg();
       emitMask(BaseReg, LoadStoreStackMaskReg, STI);
     }
     MipsELFStreamer::emitInstruction(MI, STI);
     if (MaskAfter) {
       // Sandbox SP change.
-      unsigned SPReg = MI.getOperand(0).getReg();
+      MCRegister SPReg = MI.getOperand(0).getReg();
       assert((Mips::SP == SPReg) && "Unexpected stack-pointer register.");
       emitMask(SPReg, LoadStoreStackMaskReg, STI);
     }
@@ -182,7 +182,7 @@ class MipsNaClELFStreamer : public MipsELFStreamer {
       // Start the sandboxing sequence by emitting call.
       emitBundleLock(true);
       if (IsIndirectCall) {
-        unsigned TargetReg = Inst.getOperand(1).getReg();
+        MCRegister TargetReg = Inst.getOperand(1).getReg();
         emitMask(TargetReg, IndirectBranchMaskReg, STI);
       }
       MipsELFStreamer::emitInstruction(Inst, STI);
@@ -253,7 +253,7 @@ bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
   }
 }
 
-bool baseRegNeedsLoadStoreMask(unsigned Reg) {
+bool baseRegNeedsLoadStoreMask(MCRegister Reg) {
   // The contents of SP and thread pointer register do not require masking.
   return Reg != Mips::SP && Reg != Mips::T8;
 }
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index e547e62094404..670dc71b00e5f 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -81,8 +81,8 @@ void MipsTargetStreamer::emitDirectiveNaNLegacy() {}
 void MipsTargetStreamer::emitDirectiveOptionPic0() {}
 void MipsTargetStreamer::emitDirectiveOptionPic2() {}
 void MipsTargetStreamer::emitDirectiveInsn() { forbidModuleDirective(); }
-void MipsTargetStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
-                                   unsigned ReturnReg) {}
+void MipsTargetStreamer::emitFrame(MCRegister StackReg, unsigned StackSize,
+                                   MCRegister ReturnReg) {}
 void MipsTargetStreamer::emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) {}
 void MipsTargetStreamer::emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) {
 }
@@ -173,7 +173,7 @@ void MipsTargetStreamer::emitDirectiveSetNoOddSPReg() {
   forbidModuleDirective();
 }
 
-void MipsTargetStreamer::emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
+void MipsTargetStreamer::emitR(unsigned Opcode, MCRegister Reg0, SMLoc IDLoc,
                                const MCSubtargetInfo *STI) {
   MCInst TmpInst;
   TmpInst.setOpcode(Opcode);
@@ -182,7 +182,7 @@ void MipsTargetStreamer::emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
   getStreamer().emitInstruction(TmpInst, *STI);
 }
 
-void MipsTargetStreamer::emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1,
+void MipsTargetStreamer::emitRX(unsigned Opcode, MCRegister Reg0, MCOperand Op1,
                                 SMLoc IDLoc, const MCSubtargetInfo *STI) {
   MCInst TmpInst;
   TmpInst.setOpcode(Opcode);
@@ -192,13 +192,14 @@ void MipsTargetStreamer::emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1,
   getStreamer().emitInstruction(TmpInst, *STI);
 }
 
-void MipsTargetStreamer::emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm,
+void MipsTargetStreamer::emitRI(unsigned Opcode, MCRegister Reg0, int32_t Imm,
                                 SMLoc IDLoc, const MCSubtargetInfo *STI) {
   emitRX(Opcode, Reg0, MCOperand::createImm(Imm), IDLoc, STI);
 }
 
-void MipsTargetStreamer::emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1,
-                                SMLoc IDLoc, const MCSubtargetInfo *STI) {
+void MipsTargetStreamer::emitRR(unsigned Opcode, MCRegister Reg0,
+                                MCRegister Reg1, SMLoc IDLoc,
+                                const MCSubtargetInfo *STI) {
   emitRX(Opcode, Reg0, MCOperand::createReg(Reg1), IDLoc, STI);
 }
 
@@ -212,8 +213,8 @@ void MipsTargetStreamer::emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2,
   getStreamer().emitInstruction(TmpInst, *STI);
 }
 
-void MipsTargetStreamer::emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1,
-                                 MCOperand Op2, SMLoc IDLoc,
+void MipsTargetStreamer::emitRRX(unsigned Opcode, MCRegister Reg0,
+                                 MCRegister Reg1, MCOperand Op2, SMLoc IDLoc,
                                  const MCSubtargetInfo *STI) {
   MCInst TmpInst;
   TmpInst.setOpcode(Opcode);
@@ -224,14 +225,15 @@ void MipsTargetStreamer::emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1,
   getStreamer().emitInstruction(TmpInst, *STI);
 }
 
-void MipsTargetStreamer::emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1,
-                                 unsigned Reg2, SMLoc IDLoc,
+void MipsTargetStreamer::emitRRR(unsigned Opcode, MCRegister Reg0,
+                                 MCRegister Reg1, MCRegister Reg2, SMLoc IDLoc,
                                  const MCSubtargetInfo *STI) {
   emitRRX(Opcode, Reg0, Reg1, MCOperand::createReg(Reg2), IDLoc, STI);
 }
 
-void MipsTargetStreamer::emitRRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1,
-                                  unsigned Reg2, MCOperand Op3, SMLoc IDLoc,
+void MipsTargetStreamer::emitRRRX(unsigned Opcode, MCRegister Reg0,
+                                  MCRegister Reg1, MCRegister Reg2,
+                                  MCOperand Op3, SMLoc IDLoc,
                                   const MCSubtargetInfo *STI) {
   MCInst TmpInst;
   TmpInst.setOpcode(Opcode);
@@ -243,14 +245,14 @@ void MipsTargetStreamer::emitRRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1,
   getStreamer().emitInstruction(TmpInst, *STI);
 }
 
-void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1,
-                                 int16_t Imm, SMLoc IDLoc,
+void MipsTargetStreamer::emitRRI(unsigned Opcode, MCRegister Reg0,
+                                 MCRegister Reg1, int16_t Imm, SMLoc IDLoc,
                                  const MCSubtargetInfo *STI) {
   emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc, STI);
 }
 
-void MipsTargetStreamer::emitRRIII(unsigned Opcode, unsigned Reg0,
-                                   unsigned Reg1, int16_t Imm0, int16_t Imm1,
+void MipsTargetStreamer::emitRRIII(unsigned Opcode, MCRegister Reg0,
+                                   MCRegister Reg1, int16_t Imm0, int16_t Imm1,
                                    int16_t Imm2, SMLoc IDLoc,
                                    const MCSubtargetInfo *STI) {
   MCInst TmpInst;
@@ -264,14 +266,14 @@ void MipsTargetStreamer::emitRRIII(unsigned Opcode, unsigned Reg0,
   getStreamer().emitInstruction(TmpInst, *STI);
 }
 
-void MipsTargetStreamer::emitAddu(unsigned DstReg, unsigned SrcReg,
-                                  unsigned TrgReg, bool Is64Bit,
+void MipsTargetStreamer::emitAddu(MCRegister DstReg, MCRegister SrcReg,
+                                  MCRegister TrgReg, bool Is64Bit,
                                   const MCSubtargetInfo *STI) {
   emitRRR(Is64Bit ? Mips::DADDu : Mips::ADDu, DstReg, SrcReg, TrgReg, SMLoc(),
           STI);
 }
 
-void MipsTargetStreamer::emitDSLL(unsigned DstReg, unsigned SrcReg,
+void MipsTargetStreamer::emitDSLL(MCRegister DstReg, MCRegister SrcReg,
                                   int16_t ShiftAmount, SMLoc IDLoc,
                                   const MCSubtargetInfo *STI) {
   if (ShiftAmount >= 32) {
@@ -313,7 +315,7 @@ void MipsTargetStreamer::emitGPRestore(int Offset, SMLoc IDLoc,
 
 /// Emit a store instruction with an immediate offset.
 void MipsTargetStreamer::emitStoreWithImmOffset(
-    unsigned Opcode, unsigned SrcReg, unsigned BaseReg, int64_t Offset,
+    unsigned Opcode, MCRegister SrcReg, MCRegister BaseReg, int64_t Offset,
     function_ref<unsigned()> GetATReg, SMLoc IDLoc,
     const MCSubtargetInfo *STI) {
   if (isInt<16>(Offset)) {
@@ -325,7 +327,7 @@ void MipsTargetStreamer::emitStoreWithImmOffset(
   //                      add $at, $at, $8
   //                      sw $8, %lo(offset)($at)
 
-  unsigned ATReg = GetATReg();
+  MCRegister ATReg = GetATReg();
   if (!ATReg)
     return;
 
@@ -349,10 +351,9 @@ void MipsTargetStreamer::emitStoreWithImmOffset(
 /// permitted to be the same register iff DstReg is distinct from BaseReg and
 /// DstReg is a GPR. It is the callers responsibility to identify such cases
 /// and pass the appropriate register in TmpReg.
-void MipsTargetStreamer::emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg,
-                                               unsigned BaseReg, int64_t Offset,
-                                               unsigned TmpReg, SMLoc IDLoc,
-                                               const MCSubtargetInfo *STI) {
+void MipsTargetStreamer::emitLoadWithImmOffset(
+    unsigned Opcode, MCRegister DstReg, MCRegister BaseReg, int64_t Offset,
+    MCRegister TmpReg, SMLoc IDLoc, const MCSubtargetInfo *STI) {
   if (isInt<16>(Offset)) {
     emitRRI(Opcode, DstReg, BaseReg, Offset, IDLoc, STI);
     return;
@@ -519,8 +520,8 @@ void MipsTargetAsmStreamer::emitDirectiveInsn() {
   OS << "\t.insn\n";
 }
 
-void MipsTargetAsmStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
-                                      unsigned ReturnReg) {
+void MipsTargetAsmStreamer::emitFrame(MCRegister StackReg, unsigned StackSize,
+                                      MCRegister ReturnReg) {
   OS << "\t.frame\t$"
      << StringRef(MipsInstPrinter::getRegisterName(StackReg)).lower() << ","
      << StackSize << ",$"
@@ -1113,8 +1114,8 @@ void MipsTargetELFStreamer::emitDirectiveInsn() {
   MEF.createPendingLabelRelocs();
 }
 
-void MipsTargetELFStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
-                                      unsigned ReturnReg_) {
+void MipsTargetELFStreamer::emitFrame(MCRegister StackReg, unsigned StackSize,
+                                      MCRegister ReturnReg_) {
   MCContext &Context = getStreamer().getAssembler().getContext();
   const MCRegisterInfo *RegInfo = Context.getRegisterInfo();
 
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 6b07999d862d9..b0b7b5dc7a31d 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -380,7 +380,7 @@ void MipsAsmPrinter::emitFrameDirective() {
   const TargetRegisterInfo &RI = *MF->getSubtarget().getRegisterInfo();
 
   Register stackReg = RI.getFrameRegister(*MF);
-  unsigned returnReg = RI.getRARegister();
+  MCRegister returnReg = RI.getRARegister();
   unsigned stackSize = MF->getFrameInfo().getStackSize();
 
   getTargetStreamer().emitFrame(stackReg, stackSize, returnReg);
diff --git a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index c3e21e0ff7a0f..738fabea25bf7 100644
--- a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -70,7 +70,7 @@ void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
     MIB.addReg(Mips::DSPEFI, Flag);
 }
 
-unsigned MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const {
+MCRegister MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const {
   uint64_t RegNum = RegIdx->getAsZExtVal();
   return Mips::MSACtrlRegClass.getRegister(RegNum);
 }
diff --git a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
index 366c352d5cc0c..4122de7646f36 100644
--- a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -30,7 +30,7 @@ class MipsSEDAGToDAGISel : public MipsDAGToDAGISel {
   void addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
                              MachineFunction &MF);
 
-  unsigned getMSACtrlReg(const SDValue RegIdx) const;
+  MCRegister getMSACtrlReg(const SDValue RegIdx) const;
 
   bool replaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
 
diff --git a/llvm/lib/Target/Mips/MipsTargetStreamer.h b/llvm/lib/Target/Mips/MipsTargetStreamer.h
index 0603379d2e4fa..c73013baa4f05 100644
--- a/llvm/lib/Target/Mips/MipsTargetStreamer.h
+++ b/llvm/lib/Target/Mips/MipsTargetStreamer.h
@@ -58,8 +58,8 @@ class MipsTargetStreamer : public MCTargetStreamer {
   virtual void emitDirectiveOptionPic0();
   virtual void emitDirectiveOptionPic2();
   virtual void emitDirectiveInsn();
-  virtual void emitFrame(unsigned StackReg, unsigned StackSize,
-                         unsigned ReturnReg);
+  virtual void emitFrame(MCRegister StackReg, unsigned StackSize,
+                         MCRegister ReturnReg);
   virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff);
   virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff);
 
@@ -118,30 +118,31 @@ class MipsTargetStreamer : public MCTargetStreamer {
   virtual void emitDirectiveModuleGINV();
   virtual void emitDirectiveModuleNoGINV();
 
-  void emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
+  void emitR(unsigned Opcode, MCRegister Reg0, SMLoc IDLoc,
              const MCSubtargetInfo *STI);
   void emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2, SMLoc IDLoc,
               const MCSubtargetInfo *STI);
-  void emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1, SMLoc IDLoc,
+  void emitRX(unsigned Opcode, MCRegister Reg0, MCOperand Op1, SMLoc IDLoc,
               const MCSubtargetInfo *STI);
-  void emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm, SMLoc IDLoc,
+  void emitRI(unsigned Opcode, MCRegister Reg0, int32_t Imm, SMLoc IDLoc,
               const MCSubtargetInfo *STI);
-  void emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, SMLoc IDLoc,
+  void emitRR(unsigned Opcode, MCRegister Reg0, MCRegister Reg1, SMLoc IDLoc,
               const MCSubtargetInfo *STI);
-  void emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, MCOperand Op2,
+  void emitRRX(unsigned Opcode, MCRegister Reg0, MCRegister Reg1, MCOperand Op2,
                SMLoc IDLoc, const MCSubtargetInfo *STI);
-  void emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2,
-               SMLoc IDLoc, const MCSubtargetInfo *STI);
-  void emitRRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2,
-                MCOperand Op3, SMLoc IDLoc, const MCSubtargetInfo *STI);
-  void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm,
+  void emitRRR(unsigned Opcode, MCRegister Reg0, MCRegister Reg1,
+               MCRegister Reg2, SMLoc IDLoc, const MCSubtargetInfo *STI);
+  void emitRRRX(unsigned Opcode, MCRegister Reg0, MCRegister Reg1,
+                MCRegister Reg2, MCOperand Op3, SMLoc IDLoc,
+                const MCSubtargetInfo *STI);
+  void emitRRI(unsigned Opcode, MCRegister Reg0, MCRegister Reg1, int16_t Imm,
                SMLoc IDLoc, const MCSubtargetInfo *STI);
-  void emitRRIII(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm0,
-                 int16_t Imm1, int16_t Imm2, SMLoc IDLoc,
+  void emitRRIII(unsigned Opcode, MCRegister Reg0, MCRegister Reg1,
+                 int16_t Imm0, int16_t Imm1, int16_t Imm2, SMLoc IDLoc,
                  const MCSubtargetInfo *STI);
-  void emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit,
-                const MCSubtargetInfo *STI);
-  void emitDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount,
+  void emitAddu(MCRegister DstReg, MCRegister SrcReg, MCRegister TrgReg,
+                bool Is64Bit, const MCSubtargetInfo *STI);
+  void emitDSLL(MCRegister DstReg, MCRegister SrcReg, int16_t ShiftAmount,
                 SMLoc IDLoc, const MCSubtargetInfo *STI);
   void emitEmptyDelaySlot(bool hasShortDelaySlot, SMLoc IDLoc,
                           const MCSubtargetInfo *STI);
@@ -154,12 +155,13 @@ class MipsTargetStreamer : public MCTargetStreamer {
   /// temporary and is only called when the assembler temporary is required. It
   /// must handle the case where no assembler temporary is available (typically
   /// by reporting an error).
-  void emitStoreWithImmOffset(unsigned Opcode, unsigned SrcReg,
-                              unsigned BaseReg, int64_t Offset,
+  void emitStoreWithImmOffset(unsigned Opcode, MCRegister SrcReg,
+                              MCRegister BaseReg, int64_t Offset,
                               function_ref<unsigned()> GetATReg, SMLoc IDLoc,
                               const MCSubtargetInfo *STI);
-  void emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg,
-                             int64_t Offset, unsigned TmpReg, SMLoc IDLoc,
+  void emitLoadWithImmOffset(unsigned Opcode, MCRegister DstReg,
+                             MCRegister BaseReg, int64_t Offset,
+                             MCRegister TmpReg, SMLoc IDLoc,
                              const MCSubtargetInfo *STI);
   void emitGPRestore(int Offset, SMLoc IDLoc, const MCSubtargetInfo *STI);
 
@@ -240,8 +242,8 @@ class MipsTargetAsmStreamer : public MipsTargetStreamer {
   void emitDirectiveOptionPic0() override;
   void emitDirectiveOptionPic2() override;
   void emitDirectiveInsn() override;
-  void emitFrame(unsigned StackReg, unsigned StackSize,
-                 unsigned ReturnReg) override;
+  void emitFrame(MCRegister StackReg, unsigned StackSize,
+                 MCRegister ReturnReg) override;
   void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override;
   void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
 
@@ -340,8 +342,8 @@ class MipsTargetELFStreamer : public MipsTargetStreamer {
   void emitDirectiveOptionPic0() override;
   void emitDirectiveOptionPic2() override;
   void emitDirectiveInsn() override;
-  void emitFrame(unsigned StackReg, unsigned StackSize,
-                 unsigned ReturnReg) override;
+  void emitFrame(MCRegister StackReg, unsigned StackSize,
+                 MCRegister ReturnReg) override;
   void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override;
   void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
 

From 69f3e003bfef75d28af09e2822cb5750fa45c38d Mon Sep 17 00:00:00 2001
From: Uday Bondhugula <uday@polymagelabs.com>
Date: Mon, 17 Feb 2025 08:58:06 +0530
Subject: [PATCH 085/109] [MLIR] NFC. Refactor IntegerRelation getSliceBounds
 (#127308)

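Refactor FlatLinearConstraints::getSliceBounds. The method was too long and
deeply nested, so the loop that detects unknown variables is factored out into
a new static helper, computeUnknownVars. NFC.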
---
 .../Analysis/Presburger/IntegerRelation.h     | 12 +--
 .../Analysis/FlatLinearValueConstraints.cpp   | 79 +++++++++++--------
 2 files changed, 51 insertions(+), 40 deletions(-)

diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h
index a27fc8c37eeda..ddc18038e869c 100644
--- a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h
+++ b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h
@@ -738,6 +738,12 @@ class IntegerRelation {
   /// Same as findSymbolicIntegerLexMin but produces lexmax instead of lexmin
   SymbolicLexOpt findSymbolicIntegerLexMax() const;
 
+  /// Searches for a constraint with a non-zero coefficient at `colIdx` in
+  /// equality (isEq=true) or inequality (isEq=false) constraints. If one is
+  /// found, stores its row index in `rowIdx` and returns true, else false.
+  bool findConstraintWithNonZeroAt(unsigned colIdx, bool isEq,
+                                   unsigned *rowIdx) const;
+
   /// Return the set difference of this set and the given set, i.e.,
   /// return `this \ set`.
   PresburgerRelation subtract(const PresburgerRelation &set) const;
@@ -820,12 +826,6 @@ class IntegerRelation {
   /// Normalized each constraints by the GCD of its coefficients.
   void normalizeConstraintsByGCD();
 
-  /// Searches for a constraint with a non-zero coefficient at `colIdx` in
-  /// equality (isEq=true) or inequality (isEq=false) constraints.
-  /// Returns true and sets row found in search in `rowIdx`, false otherwise.
-  bool findConstraintWithNonZeroAt(unsigned colIdx, bool isEq,
-                                   unsigned *rowIdx) const;
-
   /// Returns true if the pos^th column is all zero for both inequalities and
   /// equalities.
   bool isColZero(unsigned pos) const;
diff --git a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
index 631b1c7ca895c..4653eca9887ce 100644
--- a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
+++ b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
@@ -581,48 +581,35 @@ std::pair<AffineMap, AffineMap> FlatLinearConstraints::getLowerAndUpperBound(
   return {lbMap, ubMap};
 }
 
-/// Computes the lower and upper bounds of the first 'num' dimensional
-/// variables (starting at 'offset') as affine maps of the remaining
-/// variables (dimensional and symbolic variables). Local variables are
-/// themselves explicitly computed as affine functions of other variables in
-/// this process if needed.
-void FlatLinearConstraints::getSliceBounds(unsigned offset, unsigned num,
-                                           MLIRContext *context,
-                                           SmallVectorImpl<AffineMap> *lbMaps,
-                                           SmallVectorImpl<AffineMap> *ubMaps,
-                                           bool closedUB) {
-  assert(offset + num <= getNumDimVars() && "invalid range");
-
-  // Basic simplification.
-  normalizeConstraintsByGCD();
-
-  LLVM_DEBUG(llvm::dbgs() << "getSliceBounds for variables at positions ["
-                          << offset << ", " << offset + num << ")\n");
-  LLVM_DEBUG(dumpPretty());
-
-  // Record computed/detected variables.
-  SmallVector<AffineExpr, 8> memo(getNumVars());
+/// Expresses the yet-unknown variables of `cst` (the `num` dimensional ones
+/// starting at `offset`, plus any locals) as affine expressions of the known
+/// variables where possible; each result is stored in `memo` at its position.
+static void computeUnknownVars(const FlatLinearConstraints &cst,
+                               MLIRContext *context, unsigned offset,
+                               unsigned num,
+                               SmallVectorImpl<AffineExpr> &memo) {
   // Initialize dimensional and symbolic variables.
-  for (unsigned i = 0, e = getNumDimVars(); i < e; i++) {
+  for (unsigned i = 0, e = cst.getNumDimVars(); i < e; i++) {
     if (i < offset)
       memo[i] = getAffineDimExpr(i, context);
     else if (i >= offset + num)
       memo[i] = getAffineDimExpr(i - num, context);
   }
-  for (unsigned i = getNumDimVars(), e = getNumDimAndSymbolVars(); i < e; i++)
-    memo[i] = getAffineSymbolExpr(i - getNumDimVars(), context);
+  for (unsigned i = cst.getNumDimVars(), e = cst.getNumDimAndSymbolVars();
+       i < e; i++)
+    memo[i] = getAffineSymbolExpr(i - cst.getNumDimVars(), context);
 
   bool changed;
   do {
     changed = false;
     // Identify yet unknown variables as constants or mod's / floordiv's of
     // other variables if possible.
-    for (unsigned pos = 0; pos < getNumVars(); pos++) {
+    for (unsigned pos = 0, f = cst.getNumVars(); pos < f; pos++) {
       if (memo[pos])
         continue;
 
-      auto lbConst = getConstantBound64(BoundType::LB, pos);
-      auto ubConst = getConstantBound64(BoundType::UB, pos);
+      auto lbConst = cst.getConstantBound64(BoundType::LB, pos);
+      auto ubConst = cst.getConstantBound64(BoundType::UB, pos);
       if (lbConst.has_value() && ubConst.has_value()) {
         // Detect equality to a constant.
         if (*lbConst == *ubConst) {
@@ -633,7 +620,7 @@ void FlatLinearConstraints::getSliceBounds(unsigned offset, unsigned num,
 
         // Detect a variable as modulo of another variable w.r.t a
         // constant.
-        if (detectAsMod(*this, pos, offset, num, *lbConst, *ubConst, context,
+        if (detectAsMod(cst, pos, offset, num, *lbConst, *ubConst, context,
                         memo)) {
           changed = true;
           continue;
@@ -642,24 +629,24 @@ void FlatLinearConstraints::getSliceBounds(unsigned offset, unsigned num,
 
       // Detect a variable as a floordiv of an affine function of other
       // variables (divisor is a positive constant).
-      if (detectAsFloorDiv(*this, pos, context, memo)) {
+      if (detectAsFloorDiv(cst, pos, context, memo)) {
         changed = true;
         continue;
       }
 
       // Detect a variable as an expression of other variables.
       unsigned idx;
-      if (!findConstraintWithNonZeroAt(pos, /*isEq=*/true, &idx)) {
+      if (!cst.findConstraintWithNonZeroAt(pos, /*isEq=*/true, &idx)) {
         continue;
       }
 
       // Build AffineExpr solving for variable 'pos' in terms of all others.
       auto expr = getAffineConstantExpr(0, context);
       unsigned j, e;
-      for (j = 0, e = getNumVars(); j < e; ++j) {
+      for (j = 0, e = cst.getNumVars(); j < e; ++j) {
         if (j == pos)
           continue;
-        int64_t c = atEq64(idx, j);
+        int64_t c = cst.atEq64(idx, j);
         if (c == 0)
           continue;
         // If any of the involved IDs hasn't been found yet, we can't proceed.
@@ -673,8 +660,8 @@ void FlatLinearConstraints::getSliceBounds(unsigned offset, unsigned num,
         continue;
 
       // Add constant term to AffineExpr.
-      expr = expr + atEq64(idx, getNumVars());
-      int64_t vPos = atEq64(idx, pos);
+      expr = expr + cst.atEq64(idx, cst.getNumVars());
+      int64_t vPos = cst.atEq64(idx, pos);
       assert(vPos != 0 && "expected non-zero here");
       if (vPos > 0)
         expr = (-expr).floorDiv(vPos);
@@ -689,6 +676,30 @@ void FlatLinearConstraints::getSliceBounds(unsigned offset, unsigned num,
     // variable's explicit form is computed (in memo[pos]), it's not updated
     // again.
   } while (changed);
+}
+
+/// Computes the lower and upper bounds of the first 'num' dimensional
+/// variables (starting at 'offset') as affine maps of the remaining
+/// variables (dimensional and symbolic variables). Local variables are
+/// themselves explicitly computed as affine functions of other variables in
+/// this process if needed.
+void FlatLinearConstraints::getSliceBounds(unsigned offset, unsigned num,
+                                           MLIRContext *context,
+                                           SmallVectorImpl<AffineMap> *lbMaps,
+                                           SmallVectorImpl<AffineMap> *ubMaps,
+                                           bool closedUB) {
+  assert(offset + num <= getNumDimVars() && "invalid range");
+
+  // Basic simplification.
+  normalizeConstraintsByGCD();
+
+  LLVM_DEBUG(llvm::dbgs() << "getSliceBounds for variables at positions ["
+                          << offset << ", " << offset + num << ")\n");
+  LLVM_DEBUG(dumpPretty());
+
+  // Record computed/detected variables.
+  SmallVector<AffineExpr, 8> memo(getNumVars());
+  computeUnknownVars(*this, context, offset, num, memo);
 
   int64_t ubAdjustment = closedUB ? 0 : 1;
 

From 885382f4379b3b8060213606a8f7bd8a1750f33a Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Sun, 16 Feb 2025 19:30:33 -0800
Subject: [PATCH 086/109] [clang-format] Fix a bug in annotating braces
 (#127306)

Fixes #107616.
---
 clang/lib/Format/UnwrappedLineParser.cpp      |  2 +-
 clang/unittests/Format/TokenAnnotatorTest.cpp | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 4e040183f2f0a..3a24d72d83e27 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -510,7 +510,7 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) {
           break;
         do {
           NextTok = Tokens->getNextToken();
-        } while (NextTok->NewlinesBefore == 0 && NextTok->isNot(tok::eof));
+        } while (!NextTok->HasUnescapedNewline && NextTok->isNot(tok::eof));
 
         while (NextTok->is(tok::comment))
           NextTok = Tokens->getNextToken();
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 7b489b1764cb2..1d0870c818acc 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3519,6 +3519,19 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
   EXPECT_BRACE_KIND(Tokens[11], BK_Block);
   EXPECT_BRACE_KIND(Tokens[12], BK_Block);
 
+  Tokens = annotate("class foo {\n"
+                    "  foo() {}\n"
+                    "#if defined(_MSC_VER__clang____GNUC__FOO_) || \\\n"
+                    "    (defined(__GNUC__) && defined(FOO))\n"
+                    "  foo() {}\n"
+                    "#endif\n"
+                    "};");
+  ASSERT_EQ(Tokens.size(), 36u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_FunctionLBrace);
+  EXPECT_BRACE_KIND(Tokens[7], BK_Block);
+  EXPECT_TOKEN(Tokens[26], tok::identifier, TT_CtorDtorDeclName);
+  EXPECT_TOKEN(Tokens[27], tok::l_paren, TT_FunctionDeclarationLParen);
+
   Tokens = annotate("a = class extends goog.a {};",
                     getGoogleStyle(FormatStyle::LK_JavaScript));
   ASSERT_EQ(Tokens.size(), 11u) << Tokens;

From 5d62a79bb79fee20f92f26dc55fd78440b9945ca Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 16 Feb 2025 20:03:34 -0800
Subject: [PATCH 087/109] [Serialization] Remove getMacroID (#127413)

The last use was removed in:

  commit ee977933f7df9cef13cc06ac7fa3e4a22b72e41f
  Author: Richard Smith <richard-llvm@metafoo.co.uk>
  Date:   Fri May 1 21:22:17 2015 +0000
---
 clang/include/clang/Serialization/ASTWriter.h | 3 ---
 clang/lib/Serialization/ASTWriter.cpp         | 8 --------
 2 files changed, 11 deletions(-)

diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index 079e39a9fb678..ad291d0948b57 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -751,9 +751,6 @@ class ASTWriter : public ASTDeserializationListener,
   /// Get the unique number used to refer to the given macro.
   serialization::MacroID getMacroRef(MacroInfo *MI, const IdentifierInfo *Name);
 
-  /// Determine the ID of an already-emitted macro.
-  serialization::MacroID getMacroID(MacroInfo *MI);
-
   uint32_t getMacroDirectivesOffset(const IdentifierInfo *Name);
 
   /// Emit a reference to a type.
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 903a165ee75c6..64791300fe722 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -6666,14 +6666,6 @@ MacroID ASTWriter::getMacroRef(MacroInfo *MI, const IdentifierInfo *Name) {
   return ID;
 }
 
-MacroID ASTWriter::getMacroID(MacroInfo *MI) {
-  if (!MI || MI->isBuiltinMacro())
-    return 0;
-
-  assert(MacroIDs.contains(MI) && "Macro not emitted!");
-  return MacroIDs[MI];
-}
-
 uint32_t ASTWriter::getMacroDirectivesOffset(const IdentifierInfo *Name) {
   return IdentMacroDirectivesOffsetMap.lookup(Name);
 }

From 9b7282e545d5e47315e3ffb9e5e00d0fb547c8e3 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Sun, 16 Feb 2025 20:40:09 -0800
Subject: [PATCH 088/109] [RISCV] Recognize de-interleave shuffles with 2
 sources. (#127272)

We can use vnsrl+trunc on each source and concatenate the results
with vslideup.

For low LMUL it would be better to concat first, but I'm leaving
this for later.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  14 +
 .../rvv/fixed-vectors-deinterleave-load.ll    |  34 +-
 .../rvv/fixed-vectors-shuffle-deinterleave.ll |  86 ++++
 .../rvv/fixed-vectors-shufflevector-vnsrl.ll  | 417 ++++++++++++++++++
 .../RISCV/rvv/vector-deinterleave-fixed.ll    |  33 +-
 5 files changed, 543 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 43d32987da95a..c40ab0d09bdf6 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5593,6 +5593,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
           1 < count_if(Mask, [](int Idx) { return Idx != -1; })) {
         if (SDValue Src = getSingleShuffleSrc(VT, V1, V2))
           return getDeinterleaveShiftAndTrunc(DL, VT, Src, Factor, Index, DAG);
+        if (1 < count_if(Mask,
+                         [&Mask](int Idx) { return Idx < (int)Mask.size(); }) &&
+            1 < count_if(Mask, [&Mask](int Idx) {
+              return Idx >= (int)Mask.size();
+            })) {
+          // Narrow each source and concatenate them.
+          // FIXME: For small LMUL it is better to concatenate first.
+          MVT HalfVT = VT.getHalfNumVectorElementsVT();
+          SDValue Lo =
+              getDeinterleaveShiftAndTrunc(DL, HalfVT, V1, Factor, Index, DAG);
+          SDValue Hi =
+              getDeinterleaveShiftAndTrunc(DL, HalfVT, V2, Factor, Index, DAG);
+          return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+        }
       }
     }
   }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index b4634dbf5a5e8..e53dfc23a84bb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -10,34 +10,26 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_v16i1_v32i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vlm.v v8, (a0)
-; CHECK-NEXT:    li a0, -256
+; CHECK-NEXT:    vlm.v v0, (a0)
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v11, v9, v9
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v9, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v11, -16
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v9, v8, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v8, 2
+; CHECK-NEXT:    vslidedown.vi v0, v0, 2
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v9, 0
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v11, v11, -15
-; CHECK-NEXT:    vmerge.vim v13, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vnsrl.wi v9, v9, 8
+; CHECK-NEXT:    vnsrl.wi v11, v8, 0
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 8
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
-; CHECK-NEXT:    vrgather.vv v10, v13, v12, v0.t
-; CHECK-NEXT:    vrgather.vv v8, v13, v11, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v11, 8
+; CHECK-NEXT:    vslideup.vi v9, v8, 8
 ; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
+; CHECK-NEXT:    vmsne.vi v8, v9, 0
 ; CHECK-NEXT:    ret
   %vec = load <32 x i1>, ptr %p
   %deinterleaved.results = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
index 10dadbc022e02..ad18c801069f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
@@ -369,3 +369,89 @@ entry:
   store <2 x i8> %shuffle.i5, ptr %out, align 1
   ret void
 }
+
+define void @deinterleave4_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: deinterleave4_0_i8_two_source:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a1)
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v9, v8, 4
+; CHECK-NEXT:    vse8.v v9, (a2)
+; CHECK-NEXT:    ret
+entry:
+  %0 = load <8 x i8>, ptr %in0, align 1
+  %1 = load <8 x i8>, ptr %in1, align 1
+  %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <8 x i8> %shuffle.i5, ptr %out, align 1
+  ret void
+}
+
+define void @deinterleave4_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: deinterleave4_8_i8_two_source:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a1)
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 8
+; CHECK-NEXT:    vnsrl.wi v9, v9, 8
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v9, v8, 4
+; CHECK-NEXT:    vse8.v v9, (a2)
+; CHECK-NEXT:    ret
+entry:
+  %0 = load <8 x i8>, ptr %in0, align 1
+  %1 = load <8 x i8>, ptr %in1, align 1
+  %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <8 x i8> %shuffle.i5, ptr %out, align 1
+  ret void
+}
+
+define void @deinterleave8_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: deinterleave8_0_i8_two_source:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf2, ta, ma
+; CHECK-NEXT:    vle8.v v9, (a1)
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a2)
+; CHECK-NEXT:    ret
+entry:
+  %0 = load <8 x i8>, ptr %in0, align 1
+  %1 = load <8 x i8>, ptr %in1, align 1
+  %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <8 x i8> %shuffle.i5, ptr %out, align 1
+  ret void
+}
+
+define void @deinterleave8_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: deinterleave8_8_i8_two_source:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a1)
+; CHECK-NEXT:    vmv.v.i v0, -3
+; CHECK-NEXT:    vrgather.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    vse8.v v9, (a2)
+; CHECK-NEXT:    ret
+entry:
+  %0 = load <8 x i8>, ptr %in0, align 1
+  %1 = load <8 x i8>, ptr %in1, align 1
+  %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 1, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <8 x i8> %shuffle.i5, ptr %out, align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
index 4e5ef9c002f1a..180579e47d075 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -584,3 +584,420 @@ entry:
   store <64 x i32> %shuffle.i5, ptr %out, align 4
   ret void
 }
+
+define void @vnsrl_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_i8_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT:    vle8.v v8, (a1)
+; V-NEXT:    vle8.v v9, (a0)
+; V-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
+; V-NEXT:    vnsrl.wi v8, v8, 0
+; V-NEXT:    vnsrl.wi v9, v9, 0
+; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 4
+; V-NEXT:    vse8.v v9, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_0_i8_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT:    vle8.v v8, (a1)
+; ZVE32F-NEXT:    vle8.v v9, (a0)
+; ZVE32F-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 4
+; ZVE32F-NEXT:    vse8.v v9, (a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <8 x i8>, ptr %in0, align 1
+  %1 = load <8 x i8>, ptr %in1, align 1
+  %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  store <8 x i8> %shuffle.i5, ptr %out, align 1
+  ret void
+}
+
+define void @vnsrl_8_8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_8_8_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT:    vle8.v v8, (a1)
+; V-NEXT:    vle8.v v9, (a0)
+; V-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
+; V-NEXT:    vnsrl.wi v8, v8, 8
+; V-NEXT:    vnsrl.wi v9, v9, 8
+; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 4
+; V-NEXT:    vse8.v v9, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_8_8_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT:    vle8.v v8, (a1)
+; ZVE32F-NEXT:    vle8.v v9, (a0)
+; ZVE32F-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32F-NEXT:    vnsrl.wi v8, v8, 8
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 8
+; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 4
+; ZVE32F-NEXT:    vse8.v v9, (a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <8 x i8>, ptr %in0, align 1
+  %1 = load <8 x i8>, ptr %in1, align 1
+  %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  store <8 x i8> %shuffle.i5, ptr %out, align 1
+  ret void
+}
+
+define void @vnsrl_0_i16_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_i16_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT:    vle16.v v8, (a1)
+; V-NEXT:    vle16.v v9, (a0)
+; V-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; V-NEXT:    vnsrl.wi v8, v8, 0
+; V-NEXT:    vnsrl.wi v9, v9, 0
+; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 2
+; V-NEXT:    vse16.v v9, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_0_i16_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vle16.v v8, (a1)
+; ZVE32F-NEXT:    vle16.v v9, (a0)
+; ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 2
+; ZVE32F-NEXT:    vse16.v v9, (a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <4 x i16>, ptr %in0, align 2
+  %1 = load <4 x i16>, ptr %in1, align 2
+  %shuffle.i5 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  store <4 x i16> %shuffle.i5, ptr %out, align 2
+  ret void
+}
+
+define void @vnsrl_16_i16_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_16_i16_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT:    vle16.v v8, (a1)
+; V-NEXT:    vle16.v v9, (a0)
+; V-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; V-NEXT:    vnsrl.wi v8, v8, 16
+; V-NEXT:    vnsrl.wi v9, v9, 16
+; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 2
+; V-NEXT:    vse16.v v9, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_16_i16_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vle16.v v8, (a1)
+; ZVE32F-NEXT:    vle16.v v9, (a0)
+; ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vnsrl.wi v8, v8, 16
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 16
+; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 2
+; ZVE32F-NEXT:    vse16.v v9, (a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <4 x i16>, ptr %in0, align 2
+  %1 = load <4 x i16>, ptr %in1, align 2
+  %shuffle.i5 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  store <4 x i16> %shuffle.i5, ptr %out, align 2
+  ret void
+}
+
+define void @vnsrl_0_half_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_half_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT:    vle16.v v8, (a1)
+; V-NEXT:    vle16.v v9, (a0)
+; V-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; V-NEXT:    vnsrl.wi v8, v8, 0
+; V-NEXT:    vnsrl.wi v9, v9, 0
+; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 2
+; V-NEXT:    vse16.v v9, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_0_half_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vle16.v v8, (a1)
+; ZVE32F-NEXT:    vle16.v v9, (a0)
+; ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 2
+; ZVE32F-NEXT:    vse16.v v9, (a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <4 x half>, ptr %in0, align 2
+  %1 = load <4 x half>, ptr %in1, align 2
+  %shuffle.i5 = shufflevector <4 x half> %0, <4 x half> %1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  store <4 x half> %shuffle.i5, ptr %out, align 2
+  ret void
+}
+
+define void @vnsrl_16_half_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_16_half_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT:    vle16.v v8, (a1)
+; V-NEXT:    vle16.v v9, (a0)
+; V-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; V-NEXT:    vnsrl.wi v8, v8, 16
+; V-NEXT:    vnsrl.wi v9, v9, 16
+; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 2
+; V-NEXT:    vse16.v v9, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_16_half_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vle16.v v8, (a1)
+; ZVE32F-NEXT:    vle16.v v9, (a0)
+; ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vnsrl.wi v8, v8, 16
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 16
+; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 2
+; ZVE32F-NEXT:    vse16.v v9, (a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <4 x half>, ptr %in0, align 2
+  %1 = load <4 x half>, ptr %in1, align 2
+  %shuffle.i5 = shufflevector <4 x half> %0, <4 x half> %1, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  store <4 x half> %shuffle.i5, ptr %out, align 2
+  ret void
+}
+
+define void @vnsrl_0_i32_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_i32_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; V-NEXT:    vle32.v v8, (a0)
+; V-NEXT:    vle32.v v9, (a1)
+; V-NEXT:    vslideup.vi v8, v9, 1
+; V-NEXT:    vse32.v v8, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_0_i32_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; ZVE32F-NEXT:    vle32.v v8, (a0)
+; ZVE32F-NEXT:    vle32.v v9, (a1)
+; ZVE32F-NEXT:    vslideup.vi v8, v9, 1
+; ZVE32F-NEXT:    vse32.v v8, (a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <2 x i32>, ptr %in0, align 4
+  %1 = load <2 x i32>, ptr %in1, align 4
+  %shuffle.i5 = shufflevector <2 x i32> %0, <2 x i32> %1, <2 x i32> <i32 0, i32 2>
+  store <2 x i32> %shuffle.i5, ptr %out, align 4
+  ret void
+}
+
+define void @vnsrl_32_i32_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_32_i32_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; V-NEXT:    vle32.v v8, (a0)
+; V-NEXT:    vle32.v v9, (a1)
+; V-NEXT:    vmv.v.i v0, 1
+; V-NEXT:    vrgather.vi v9, v8, 1, v0.t
+; V-NEXT:    vse32.v v9, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_32_i32_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; ZVE32F-NEXT:    vle32.v v8, (a0)
+; ZVE32F-NEXT:    vle32.v v9, (a1)
+; ZVE32F-NEXT:    vmv.v.i v0, 1
+; ZVE32F-NEXT:    vrgather.vi v9, v8, 1, v0.t
+; ZVE32F-NEXT:    vse32.v v9, (a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <2 x i32>, ptr %in0, align 4
+  %1 = load <2 x i32>, ptr %in1, align 4
+  %shuffle.i5 = shufflevector <2 x i32> %0, <2 x i32> %1, <2 x i32> <i32 1, i32 3>
+  store <2 x i32> %shuffle.i5, ptr %out, align 4
+  ret void
+}
+
+define void @vnsrl_0_float_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_float_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; V-NEXT:    vle32.v v8, (a0)
+; V-NEXT:    vle32.v v9, (a1)
+; V-NEXT:    vslideup.vi v8, v9, 1
+; V-NEXT:    vse32.v v8, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_0_float_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; ZVE32F-NEXT:    vle32.v v8, (a0)
+; ZVE32F-NEXT:    vle32.v v9, (a1)
+; ZVE32F-NEXT:    vslideup.vi v8, v9, 1
+; ZVE32F-NEXT:    vse32.v v8, (a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <2 x float>, ptr %in0, align 4
+  %1 = load <2 x float>, ptr %in1, align 4
+  %shuffle.i5 = shufflevector <2 x float> %0, <2 x float> %1, <2 x i32> <i32 0, i32 2>
+  store <2 x float> %shuffle.i5, ptr %out, align 4
+  ret void
+}
+
+define void @vnsrl_32_float_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_32_float_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; V-NEXT:    vle32.v v8, (a0)
+; V-NEXT:    vle32.v v9, (a1)
+; V-NEXT:    vmv.v.i v0, 1
+; V-NEXT:    vrgather.vi v9, v8, 1, v0.t
+; V-NEXT:    vse32.v v9, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_32_float_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; ZVE32F-NEXT:    vle32.v v8, (a0)
+; ZVE32F-NEXT:    vle32.v v9, (a1)
+; ZVE32F-NEXT:    vmv.v.i v0, 1
+; ZVE32F-NEXT:    vrgather.vi v9, v8, 1, v0.t
+; ZVE32F-NEXT:    vse32.v v9, (a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <2 x float>, ptr %in0, align 4
+  %1 = load <2 x float>, ptr %in1, align 4
+  %shuffle.i5 = shufflevector <2 x float> %0, <2 x float> %1, <2 x i32> <i32 1, i32 3>
+  store <2 x float> %shuffle.i5, ptr %out, align 4
+  ret void
+}
+
+define void @vnsrl_0_i64_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_i64_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; V-NEXT:    vle64.v v8, (a0)
+; V-NEXT:    vle64.v v9, (a1)
+; V-NEXT:    vslideup.vi v8, v9, 1
+; V-NEXT:    vse64.v v8, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_0_i64_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; ZVE32F-NEXT:    vle32.v v8, (a0)
+; ZVE32F-NEXT:    vle32.v v9, (a1)
+; ZVE32F-NEXT:    vslideup.vi v8, v9, 2
+; ZVE32F-NEXT:    vse32.v v8, (a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <2 x i64>, ptr %in0, align 8
+  %1 = load <2 x i64>, ptr %in1, align 8
+  %shuffle.i5 = shufflevector <2 x i64> %0, <2 x i64> %1, <2 x i32> <i32 0, i32 2>
+  store <2 x i64> %shuffle.i5, ptr %out, align 8
+  ret void
+}
+
+define void @vnsrl_64_i64_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_64_i64_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; V-NEXT:    vle64.v v8, (a0)
+; V-NEXT:    vle64.v v9, (a1)
+; V-NEXT:    vmv.v.i v0, 1
+; V-NEXT:    vrgather.vi v9, v8, 1, v0.t
+; V-NEXT:    vse64.v v9, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_64_i64_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; ZVE32F-NEXT:    vle32.v v8, (a0)
+; ZVE32F-NEXT:    vle32.v v9, (a1)
+; ZVE32F-NEXT:    vmv.v.i v0, 3
+; ZVE32F-NEXT:    vslidedown.vi v9, v8, 2, v0.t
+; ZVE32F-NEXT:    vse32.v v9, (a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <2 x i64>, ptr %in0, align 8
+  %1 = load <2 x i64>, ptr %in1, align 8
+  %shuffle.i5 = shufflevector <2 x i64> %0, <2 x i64> %1, <2 x i32> <i32 1, i32 3>
+  store <2 x i64> %shuffle.i5, ptr %out, align 8
+  ret void
+}
+
+define void @vnsrl_0_double_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_double_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; V-NEXT:    vle64.v v8, (a0)
+; V-NEXT:    vle64.v v9, (a1)
+; V-NEXT:    vslideup.vi v8, v9, 1
+; V-NEXT:    vse64.v v8, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_0_double_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    ld a0, 0(a0)
+; ZVE32F-NEXT:    ld a1, 0(a1)
+; ZVE32F-NEXT:    sd a0, 0(a2)
+; ZVE32F-NEXT:    sd a1, 8(a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <2 x double>, ptr %in0, align 8
+  %1 = load <2 x double>, ptr %in1, align 8
+  %shuffle.i5 = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 0, i32 2>
+  store <2 x double> %shuffle.i5, ptr %out, align 8
+  ret void
+}
+
+define void @vnsrl_64_double_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_64_double_two_source:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; V-NEXT:    vle64.v v8, (a0)
+; V-NEXT:    vle64.v v9, (a1)
+; V-NEXT:    vmv.v.i v0, 1
+; V-NEXT:    vrgather.vi v9, v8, 1, v0.t
+; V-NEXT:    vse64.v v9, (a2)
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: vnsrl_64_double_two_source:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    ld a0, 8(a0)
+; ZVE32F-NEXT:    ld a1, 8(a1)
+; ZVE32F-NEXT:    sd a0, 0(a2)
+; ZVE32F-NEXT:    sd a1, 8(a2)
+; ZVE32F-NEXT:    ret
+entry:
+  %0 = load <2 x double>, ptr %in0, align 8
+  %1 = load <2 x double>, ptr %in1, align 8
+  %shuffle.i5 = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> <i32 1, i32 3>
+  store <2 x double> %shuffle.i5, ptr %out, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index 9f0b2b3914836..8b41febced065 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -7,31 +7,24 @@
 define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) {
 ; CHECK-LABEL: vector_deinterleave_v16i1_v32i1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv1r.v v8, v0
-; CHECK-NEXT:    vslidedown.vi v0, v0, 2
+; CHECK-NEXT:    vslidedown.vi v8, v0, 2
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    li a0, -256
-; CHECK-NEXT:    vmerge.vim v11, v10, 1, v0
-; CHECK-NEXT:    vadd.vv v12, v9, v9
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vmerge.vim v10, v9, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vadd.vi v10, v12, -16
-; CHECK-NEXT:    vadd.vi v12, v12, -15
+; CHECK-NEXT:    vmerge.vim v8, v9, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v13, v8, 0
+; CHECK-NEXT:    vnsrl.wi v9, v10, 0
+; CHECK-NEXT:    vnsrl.wi v11, v8, 0
+; CHECK-NEXT:    vnsrl.wi v10, v10, 8
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 8
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
-; CHECK-NEXT:    vrgather.vv v13, v11, v10, v0.t
-; CHECK-NEXT:    vrgather.vv v8, v11, v12, v0.t
-; CHECK-NEXT:    vmsne.vi v0, v13, 0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v9, v11, 8
+; CHECK-NEXT:    vslideup.vi v10, v8, 8
+; CHECK-NEXT:    vmsne.vi v0, v9, 0
+; CHECK-NEXT:    vmsne.vi v8, v10, 0
 ; CHECK-NEXT:    ret
 %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
 ret {<16 x i1>, <16 x i1>} %retval

From c3cae9d6fccc96297e832a09f5230346ef4c42f3 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Mon, 17 Feb 2025 06:24:30 +0100
Subject: [PATCH 089/109] [clang][bytecode] Fix const-ness of local primitive
 temporary (#127405)

This used to cause certain std::ranges tests in libc++ to be diagnosed as
modifying a const-qualified field, because the IsConst flag of the local
primitive temporary was set to true unconditionally. Derive it from the
const-qualification of the sub-expression's type instead.
---
 clang/lib/AST/ByteCode/Compiler.cpp           |  3 +-
 .../ByteCode/non-const-local-temporary.cpp    | 53 +++++++++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/AST/ByteCode/non-const-local-temporary.cpp

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index cf39209819ade..c8ace39d56fd0 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -2707,7 +2707,8 @@ bool Compiler<Emitter>::VisitMaterializeTemporaryExpr(
 
   // For everyhing else, use local variables.
   if (SubExprT) {
-    unsigned LocalIndex = allocateLocalPrimitive(E, *SubExprT, /*IsConst=*/true,
+    bool IsConst = SubExpr->getType().isConstQualified();
+    unsigned LocalIndex = allocateLocalPrimitive(E, *SubExprT, IsConst,
                                                  /*IsExtended=*/true);
     if (!this->visit(SubExpr))
       return false;
diff --git a/clang/test/AST/ByteCode/non-const-local-temporary.cpp b/clang/test/AST/ByteCode/non-const-local-temporary.cpp
new file mode 100644
index 0000000000000..11d4b5383dbf5
--- /dev/null
+++ b/clang/test/AST/ByteCode/non-const-local-temporary.cpp
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -std=c++2c -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -std=c++2c  -verify=ref,both %s
+
+// both-no-diagnostics
+
+namespace std {
+template <class, int __v> struct integral_constant {
+  static const int value = __v;
+};
+using size_t = decltype(sizeof(int));
+template <class _Tp, class>
+concept __weakly_equality_comparable_with = requires(_Tp __t) { __t; };
+template <size_t, class> struct tuple_element;
+template <class> struct tuple_size;
+template <class _Ip>
+concept input_or_output_iterator = requires(_Ip __i) { __i; };
+template <class _Sp, class _Ip>
+concept sentinel_for = __weakly_equality_comparable_with<_Sp, _Ip>;
+namespace ranges {
+enum subrange_kind { unsized };
+template <input_or_output_iterator _Iter, sentinel_for<_Iter> _Sent,
+          subrange_kind = unsized>
+struct subrange {
+  _Iter __begin_;
+  _Sent __end_;
+  constexpr _Sent end() { return __end_; }
+};
+template <int, class _Iter, class _Sent, subrange_kind _Kind>
+constexpr auto get(subrange<_Iter, _Sent, _Kind> __subrange) {
+  return __subrange.end();
+}
+} // namespace ranges
+template <class _Ip, class _Sp, ranges::subrange_kind _Kp>
+struct tuple_size<ranges::subrange<_Ip, _Sp, _Kp>>
+    : integral_constant<size_t, 2> {};
+template <class _Ip, class _Sp, ranges::subrange_kind _Kp>
+struct tuple_element<0, ranges::subrange<_Ip, _Sp, _Kp>> {
+  using type = _Ip;
+};
+template <class _Ip, class _Sp, ranges::subrange_kind _Kp>
+struct tuple_element<1, ranges::subrange<_Ip, _Sp, _Kp>> {
+  using type = _Sp;
+};
+} // namespace std
+constexpr bool test() {
+  int a[1];
+  auto r = std::ranges::subrange(a, a);
+  auto [first, last] = r;
+  last = a;
+  return true;
+}
+static_assert(test());
+

From 06a3abd9e85d89a3b2b1b5024b328d1047d40c2a Mon Sep 17 00:00:00 2001
From: Vikram Hegde <115221833+vikramRH@users.noreply.github.com>
Date: Mon, 17 Feb 2025 11:07:17 +0530
Subject: [PATCH 090/109] [AMDGPU][NewPM] Port "SIFormMemoryClauses" to NPM
 (#127181)

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   4 +-
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   2 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   3 +-
 .../lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 102 ++++++++++--------
 llvm/lib/Target/AMDGPU/SIFormMemoryClauses.h  |  22 ++++
 .../AMDGPU/limit-soft-clause-reg-pressure.mir |   1 +
 llvm/test/CodeGen/AMDGPU/memory_clause.mir    |   1 +
 .../CodeGen/AMDGPU/reserved-reg-in-clause.mir |   1 +
 .../AMDGPU/smem-no-clause-coalesced.mir       |   1 +
 .../CodeGen/AMDGPU/soft-clause-dbg-value.mir  |   1 +
 10 files changed, 92 insertions(+), 46 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/SIFormMemoryClauses.h

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index de3253e64b978..4a0e5ef58ac93 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -50,7 +50,7 @@ FunctionPass *createLowerWWMCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIPreAllocateWWMRegsLegacyPass();
-FunctionPass *createSIFormMemoryClausesPass();
+FunctionPass *createSIFormMemoryClausesLegacyPass();
 
 FunctionPass *createSIPostRABundlerPass();
 FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
@@ -425,7 +425,7 @@ extern char &SIInsertHardClausesID;
 void initializeSIInsertWaitcntsPass(PassRegistry&);
 extern char &SIInsertWaitcntsID;
 
-void initializeSIFormMemoryClausesPass(PassRegistry&);
+void initializeSIFormMemoryClausesLegacyPass(PassRegistry &);
 extern char &SIFormMemoryClausesID;
 
 void initializeSIPostRABundlerPass(PassRegistry&);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 14b35a4fd8327..225f84725874b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -104,6 +104,7 @@ MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass())
 MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass())
 MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass())
 MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
+MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass())
 MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
 MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass())
 MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass())
@@ -124,7 +125,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizations
 DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass())
 DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
 
-DUMMY_MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass())
 DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass())
 DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass())
 DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 7c0f1040a8156..eb488843b53e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -42,6 +42,7 @@
 #include "SIFixSGPRCopies.h"
 #include "SIFixVGPRCopies.h"
 #include "SIFoldOperands.h"
+#include "SIFormMemoryClauses.h"
 #include "SILoadStoreOptimizer.h"
 #include "SILowerControlFlow.h"
 #include "SILowerSGPRSpills.h"
@@ -540,7 +541,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIMemoryLegalizerPass(*PR);
   initializeSIOptimizeExecMaskingLegacyPass(*PR);
   initializeSIPreAllocateWWMRegsLegacyPass(*PR);
-  initializeSIFormMemoryClausesPass(*PR);
+  initializeSIFormMemoryClausesLegacyPass(*PR);
   initializeSIPostRABundlerPass(*PR);
   initializeGCNCreateVOPDPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index 926657b8a1e7b..bbc0280aed42e 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -14,6 +14,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#include "SIFormMemoryClauses.h"
 #include "AMDGPU.h"
 #include "GCNRegPressure.h"
 #include "SIMachineFunctionInfo.h"
@@ -31,15 +32,37 @@ MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15),
 
 namespace {
 
-class SIFormMemoryClauses : public MachineFunctionPass {
+class SIFormMemoryClausesImpl {
   using RegUse = DenseMap<unsigned, std::pair<unsigned, LaneBitmask>>;
 
+  bool canBundle(const MachineInstr &MI, const RegUse &Defs,
+                 const RegUse &Uses) const;
+  bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT);
+  void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
+  bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses,
+                      GCNDownwardRPTracker &RPT);
+
+  const GCNSubtarget *ST;
+  const SIRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+  SIMachineFunctionInfo *MFI;
+  LiveIntervals *LIS;
+
+  unsigned LastRecordedOccupancy;
+  unsigned MaxVGPRs;
+  unsigned MaxSGPRs;
+
 public:
-  static char ID;
+  SIFormMemoryClausesImpl(LiveIntervals *LS) : LIS(LS) {}
+  bool run(MachineFunction &MF);
+};
 
+class SIFormMemoryClausesLegacy : public MachineFunctionPass {
 public:
-  SIFormMemoryClauses() : MachineFunctionPass(ID) {
-    initializeSIFormMemoryClausesPass(*PassRegistry::getPassRegistry());
+  static char ID;
+
+  SIFormMemoryClausesLegacy() : MachineFunctionPass(ID) {
+    initializeSIFormMemoryClausesLegacyPass(*PassRegistry::getPassRegistry());
   }
 
   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -58,40 +81,22 @@ class SIFormMemoryClauses : public MachineFunctionPass {
     return MachineFunctionProperties().set(
         MachineFunctionProperties::Property::IsSSA);
   }
-
-private:
-  bool canBundle(const MachineInstr &MI, const RegUse &Defs,
-                 const RegUse &Uses) const;
-  bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT);
-  void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
-  bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses,
-                      GCNDownwardRPTracker &RPT);
-
-  const GCNSubtarget *ST;
-  const SIRegisterInfo *TRI;
-  const MachineRegisterInfo *MRI;
-  SIMachineFunctionInfo *MFI;
-
-  unsigned LastRecordedOccupancy;
-  unsigned MaxVGPRs;
-  unsigned MaxSGPRs;
 };
 
 } // End anonymous namespace.
 
-INITIALIZE_PASS_BEGIN(SIFormMemoryClauses, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(SIFormMemoryClausesLegacy, DEBUG_TYPE,
                       "SI Form memory clauses", false, false)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
-INITIALIZE_PASS_END(SIFormMemoryClauses, DEBUG_TYPE,
+INITIALIZE_PASS_END(SIFormMemoryClausesLegacy, DEBUG_TYPE,
                     "SI Form memory clauses", false, false)
 
+char SIFormMemoryClausesLegacy::ID = 0;
 
-char SIFormMemoryClauses::ID = 0;
-
-char &llvm::SIFormMemoryClausesID = SIFormMemoryClauses::ID;
+char &llvm::SIFormMemoryClausesID = SIFormMemoryClausesLegacy::ID;
 
-FunctionPass *llvm::createSIFormMemoryClausesPass() {
-  return new SIFormMemoryClauses();
+FunctionPass *llvm::createSIFormMemoryClausesLegacyPass() {
+  return new SIFormMemoryClausesLegacy();
 }
 
 static bool isVMEMClauseInst(const MachineInstr &MI) {
@@ -147,8 +152,9 @@ static unsigned getMopState(const MachineOperand &MO) {
 
 // Returns false if there is a use of a def already in the map.
 // In this case we must break the clause.
-bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, const RegUse &Defs,
-                                    const RegUse &Uses) const {
+bool SIFormMemoryClausesImpl::canBundle(const MachineInstr &MI,
+                                        const RegUse &Defs,
+                                        const RegUse &Uses) const {
   // Check interference with defs.
   for (const MachineOperand &MO : MI.operands()) {
     // TODO: Prologue/Epilogue Insertion pass does not process bundled
@@ -184,8 +190,8 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, const RegUse &Defs,
 // Since all defs in the clause are early clobber we can run out of registers.
 // Function returns false if pressure would hit the limit if instruction is
 // bundled into a memory clause.
-bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI,
-                                        GCNDownwardRPTracker &RPT) {
+bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
+                                            GCNDownwardRPTracker &RPT) {
   // NB: skip advanceBeforeNext() call. Since all defs will be marked
   // early-clobber they will all stay alive at least to the end of the
   // clause. Therefor we should not decrease pressure even if load
@@ -213,8 +219,8 @@ bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI,
 }
 
 // Collect register defs and uses along with their lane masks and states.
-void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
-                                         RegUse &Defs, RegUse &Uses) const {
+void SIFormMemoryClausesImpl::collectRegUses(const MachineInstr &MI,
+                                             RegUse &Defs, RegUse &Uses) const {
   for (const MachineOperand &MO : MI.operands()) {
     if (!MO.isReg())
       continue;
@@ -239,9 +245,9 @@ void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
 // Check register def/use conflicts, occupancy limits and collect def/use maps.
 // Return true if instruction can be bundled with previous. If it cannot
 // def/use maps are not updated.
-bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI,
-                                         RegUse &Defs, RegUse &Uses,
-                                         GCNDownwardRPTracker &RPT) {
+bool SIFormMemoryClausesImpl::processRegUses(const MachineInstr &MI,
+                                             RegUse &Defs, RegUse &Uses,
+                                             GCNDownwardRPTracker &RPT) {
   if (!canBundle(MI, Defs, Uses))
     return false;
 
@@ -252,10 +258,7 @@ bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI,
   return true;
 }
 
-bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
-  if (skipFunction(MF.getFunction()))
-    return false;
-
+bool SIFormMemoryClausesImpl::run(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   if (!ST->isXNACKEnabled())
     return false;
@@ -264,7 +267,6 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
   TRI = ST->getRegisterInfo();
   MRI = &MF.getRegInfo();
   MFI = MF.getInfo<SIMachineFunctionInfo>();
-  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
   SlotIndexes *Ind = LIS->getSlotIndexes();
   bool Changed = false;
 
@@ -416,3 +418,19 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
 
   return Changed;
 }
+
+bool SIFormMemoryClausesLegacy::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+  return SIFormMemoryClausesImpl(LIS).run(MF);
+}
+
+PreservedAnalyses
+SIFormMemoryClausesPass::run(MachineFunction &MF,
+                             MachineFunctionAnalysisManager &MFAM) {
+  LiveIntervals &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
+  SIFormMemoryClausesImpl(&LIS).run(MF);
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.h b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.h
new file mode 100644
index 0000000000000..c50a46f9ac2fb
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.h
@@ -0,0 +1,22 @@
+//===- SIFormMemoryClauses.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIFORMMEMORYCLAUSES_H
+#define LLVM_LIB_TARGET_AMDGPU_SIFORMMEMORYCLAUSES_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+class SIFormMemoryClausesPass : public PassInfoMixin<SIFormMemoryClausesPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIFORMMEMORYCLAUSES_H
diff --git a/llvm/test/CodeGen/AMDGPU/limit-soft-clause-reg-pressure.mir b/llvm/test/CodeGen/AMDGPU/limit-soft-clause-reg-pressure.mir
index 46fe85a5f13f3..bd46754d10683 100644
--- a/llvm/test/CodeGen/AMDGPU/limit-soft-clause-reg-pressure.mir
+++ b/llvm/test/CodeGen/AMDGPU/limit-soft-clause-reg-pressure.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+xnack -run-pass=si-form-memory-clauses -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+xnack -passes="si-form-memory-clauses" -o - %s | FileCheck %s
 
 # This previously would produce a bundle that could not be satisfied
 # due to using nearly the entire register budget and not considering
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.mir b/llvm/test/CodeGen/AMDGPU/memory_clause.mir
index 4b0226a0f6586..e50c3146068f3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass=si-form-memory-clauses %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx902 -passes="si-form-memory-clauses" %s -o - | FileCheck -check-prefix=GCN %s
 
 # GCN-LABEL: {{^}}name: vector_clause{{$}}
 # GCN:      %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir b/llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir
index 5346aea592348..fb1da2da0a8ff 100644
--- a/llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir
+++ b/llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass si-form-memory-clauses %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -passes="si-form-memory-clauses" %s -o - | FileCheck -check-prefix=GCN %s
 
 # Make sure we do not produce early-clobber list with odd subregs.
 
diff --git a/llvm/test/CodeGen/AMDGPU/smem-no-clause-coalesced.mir b/llvm/test/CodeGen/AMDGPU/smem-no-clause-coalesced.mir
index 8a4450926471e..7608e066d1169 100644
--- a/llvm/test/CodeGen/AMDGPU/smem-no-clause-coalesced.mir
+++ b/llvm/test/CodeGen/AMDGPU/smem-no-clause-coalesced.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx902 -o - %s -run-pass si-form-memory-clauses -verify-machineinstrs | FileCheck -check-prefix=XNACK %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx902 -o - %s -passes="si-form-memory-clauses" | FileCheck -check-prefix=XNACK %s
 
 # The SIFormMemoryClauses pass must not form a clause (indicated by BUNDLE)
 # from the two adjacent smem instructions, because the first one has its
diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/soft-clause-dbg-value.mir
index 728f3874a5be3..af9ff4bae8292 100644
--- a/llvm/test/CodeGen/AMDGPU/soft-clause-dbg-value.mir
+++ b/llvm/test/CodeGen/AMDGPU/soft-clause-dbg-value.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+xnack -run-pass=si-form-memory-clauses -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+xnack -passes="si-form-memory-clauses" -o - %s | FileCheck %s
 
 # Make sure that debug instructions do not change the bundling, and
 # the dbg_values which break the clause are inserted after the new

From f1627e1a9e2482106ba2ea3bcd22ecaff257950d Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Mon, 17 Feb 2025 07:02:54 +0100
Subject: [PATCH 091/109] [clang][bytecode][NFC] Move reduced libcxx tests to a
 subdir (#127438)

---
 clang/test/AST/ByteCode/{ => libcxx}/allocate-arrays.cpp          | 0
 .../test/AST/ByteCode/{ => libcxx}/non-const-local-temporary.cpp  | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename clang/test/AST/ByteCode/{ => libcxx}/allocate-arrays.cpp (100%)
 rename clang/test/AST/ByteCode/{ => libcxx}/non-const-local-temporary.cpp (100%)

diff --git a/clang/test/AST/ByteCode/allocate-arrays.cpp b/clang/test/AST/ByteCode/libcxx/allocate-arrays.cpp
similarity index 100%
rename from clang/test/AST/ByteCode/allocate-arrays.cpp
rename to clang/test/AST/ByteCode/libcxx/allocate-arrays.cpp
diff --git a/clang/test/AST/ByteCode/non-const-local-temporary.cpp b/clang/test/AST/ByteCode/libcxx/non-const-local-temporary.cpp
similarity index 100%
rename from clang/test/AST/ByteCode/non-const-local-temporary.cpp
rename to clang/test/AST/ByteCode/libcxx/non-const-local-temporary.cpp

From 02d4aac55cdd1760ba9cda4aa512fe1a0240bf86 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 16 Feb 2025 22:47:14 -0800
Subject: [PATCH 092/109] [AMDGPU] Remove materializeImmediate (#127420)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The last use was removed in:

  commit cbf34a5f7701148d68951320a72f483849b22eaf
  Author: Juan Manuel Martinez Caamaño <jmartinezcaamao@gmail.com>
  Date:   Fri Aug 23 14:06:17 2024 +0200
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 56 --------------------------
 llvm/lib/Target/AMDGPU/SIInstrInfo.h   |  4 --
 2 files changed, 60 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index baacb5d3d5455..8481c6333f479 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1171,62 +1171,6 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
   return Opcode;
 }
 
-void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
-                                       MachineBasicBlock::iterator MI,
-                                       const DebugLoc &DL, Register DestReg,
-                                       int64_t Value) const {
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
-  if (RegClass == &AMDGPU::SReg_32RegClass ||
-      RegClass == &AMDGPU::SGPR_32RegClass ||
-      RegClass == &AMDGPU::SReg_32_XM0RegClass ||
-      RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
-    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
-      .addImm(Value);
-    return;
-  }
-
-  if (RegClass == &AMDGPU::SReg_64RegClass ||
-      RegClass == &AMDGPU::SGPR_64RegClass ||
-      RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
-    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
-      .addImm(Value);
-    return;
-  }
-
-  if (RegClass == &AMDGPU::VGPR_32RegClass) {
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
-      .addImm(Value);
-    return;
-  }
-  if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
-      .addImm(Value);
-    return;
-  }
-
-  unsigned EltSize = 4;
-  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
-  if (RI.isSGPRClass(RegClass)) {
-    if (RI.getRegSizeInBits(*RegClass) > 32) {
-      Opcode =  AMDGPU::S_MOV_B64;
-      EltSize = 8;
-    } else {
-      Opcode = AMDGPU::S_MOV_B32;
-      EltSize = 4;
-    }
-  }
-
-  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
-  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
-    int64_t IdxValue = Idx == 0 ? Value : 0;
-
-    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
-      get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
-    Builder.addImm(IdxValue);
-  }
-}
-
 const TargetRegisterClass *
 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
   return &AMDGPU::VGPR_32RegClass;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 6b0de138251ab..811e4fcbebf57 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -267,10 +267,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                    bool KillSrc, bool RenamableDest = false,
                    bool RenamableSrc = false) const override;
 
-  void materializeImmediate(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator MI, const DebugLoc &DL,
-                            Register DestReg, int64_t Value) const;
-
   const TargetRegisterClass *getPreferredSelectRegClass(
                                unsigned Size) const;
 

From 36f8c8b43836775c3d9d8da63b97d984b19853d1 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Mon, 17 Feb 2025 08:06:12 +0100
Subject: [PATCH 093/109] [clang][bytecode] Fix rejecting non-constexpr array
 ctors (#127448)

We shouldn't abort here when compiling; this already happens (and is properly
diagnosed) when interpreting the bytecode.
---
 clang/lib/AST/ByteCode/Compiler.cpp    |  2 +-
 clang/test/AST/ByteCode/new-delete.cpp | 12 +++++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index c8ace39d56fd0..59c236c9da8c8 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -3029,7 +3029,7 @@ bool Compiler<Emitter>::VisitCXXConstructExpr(const CXXConstructExpr *E) {
 
     size_t NumElems = CAT->getZExtSize();
     const Function *Func = getFunction(E->getConstructor());
-    if (!Func || !Func->isConstexpr())
+    if (!Func)
       return false;
 
     // FIXME(perf): We're calling the constructor once per array element here,
diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp
index 31f066b37858d..e9850d27666e5 100644
--- a/clang/test/AST/ByteCode/new-delete.cpp
+++ b/clang/test/AST/ByteCode/new-delete.cpp
@@ -865,7 +865,6 @@ constexpr unsigned short ssmall = SS<unsigned short>(100)[42];
 constexpr auto Ss = SS<S>()[0];
 
 
-
 namespace IncompleteArray {
   struct A {
     int b = 10;
@@ -908,8 +907,19 @@ namespace IncompleteArray {
     return c;
   }
   static_assert(test4() == 12);
+}
 
+namespace NonConstexprArrayCtor {
+  struct S {
+    S() {} // both-note 2{{declared here}}
+  };
 
+  constexpr bool test() { // both-error {{never produces a constant expression}}
+     auto s = new S[1]; // both-note 2{{non-constexpr constructor}}
+     return true;
+  }
+  static_assert(test()); // both-error {{not an integral constant expression}} \
+                         // both-note {{in call to}}
 }
 
 #else

From de06978ebcff5f75913067b019d2d522d0be0872 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 16 Feb 2025 23:24:15 -0800
Subject: [PATCH 094/109] [AMDGPU] Avoid repeated hash lookups (NFC) (#127445)

---
 llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index ac11526da0919..17f724c3173d9 100644
--- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -270,9 +270,10 @@ bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI,
       MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) {
     if (!MOp->isReg())
       continue;
-    if (PreviousRegSeqByReg[MOp->getReg()].empty())
+    auto &Insts = PreviousRegSeqByReg[MOp->getReg()];
+    if (Insts.empty())
       continue;
-    for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) {
+    for (MachineInstr *MI : Insts) {
       CompatibleRSI = PreviousRegSeq[MI];
       if (RSI == CompatibleRSI)
         continue;

From 86d82228a58071a68d7ac450af18eadd641f3477 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 16 Feb 2025 23:44:26 -0800
Subject: [PATCH 095/109] [dsymutil] Avoid repeated hash lookups (NFC)
 (#127449)

---
 llvm/tools/dsymutil/BinaryHolder.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/tools/dsymutil/BinaryHolder.cpp b/llvm/tools/dsymutil/BinaryHolder.cpp
index 7588a33eb46b2..9429afeefa642 100644
--- a/llvm/tools/dsymutil/BinaryHolder.cpp
+++ b/llvm/tools/dsymutil/BinaryHolder.cpp
@@ -288,15 +288,13 @@ void BinaryHolder::eraseObjectEntry(StringRef Filename) {
   if (isArchive(Filename)) {
     StringRef ArchiveFilename = getArchiveAndObjectName(Filename).first;
     std::lock_guard<std::mutex> Lock(ArchiveCacheMutex);
-    ArchiveRefCounter[ArchiveFilename]--;
-    if (ArchiveRefCounter[ArchiveFilename] == 0)
+    if (--ArchiveRefCounter[ArchiveFilename] == 0)
       ArchiveCache.erase(ArchiveFilename);
     return;
   }
 
   std::lock_guard<std::mutex> Lock(ObjectCacheMutex);
-  ObjectRefCounter[Filename]--;
-  if (ObjectRefCounter[Filename] == 0)
+  if (--ObjectRefCounter[Filename] == 0)
     ObjectCache.erase(Filename);
 }
 

From 153dd19e3057169e6935027ffbc84b62b392aa35 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 17 Feb 2025 00:06:48 -0800
Subject: [PATCH 096/109] [SelectionDAG] Remove lowerCallToExternalSymbol
 (#127408)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The last use was removed in:

  commit 05e6bb40ebfd285cc87f7ce326b7ba76c3c7f870
  Author: Roger Ferrer Ibáñez <rofirrim@gmail.com>
  Date:   Thu May 30 14:55:32 2024 +0200
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 9 ---------
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h   | 3 ---
 2 files changed, 12 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index cac25fd7c1025..78a6e24e5b8d2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6281,15 +6281,6 @@ static unsigned FixedPointIntrinsicToOpcode(unsigned Intrinsic) {
   }
 }
 
-void SelectionDAGBuilder::lowerCallToExternalSymbol(const CallInst &I,
-                                           const char *FunctionName) {
-  assert(FunctionName && "FunctionName must not be nullptr");
-  SDValue Callee = DAG.getExternalSymbol(
-      FunctionName,
-      DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));
-  LowerCallTo(I, Callee, I.isTailCall(), I.isMustTailCall());
-}
-
 /// Given a @llvm.call.preallocated.setup, return the corresponding
 /// preallocated call.
 static const CallBase *FindPreallocatedCall(const Value *PreallocatedSetup) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index ece48c9bedf72..8496f8ae78ce6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -705,9 +705,6 @@ class SelectionDAGBuilder {
                           DIExpression *Expr, const DebugLoc &dl,
                           unsigned DbgSDNodeOrder);
 
-  /// Lowers CallInst to an external symbol.
-  void lowerCallToExternalSymbol(const CallInst &I, const char *FunctionName);
-
   SDValue lowerStartEH(SDValue Chain, const BasicBlock *EHPadBB,
                        MCSymbol *&BeginLabel);
   SDValue lowerEndEH(SDValue Chain, const InvokeInst *II,

From b16ce8fc24f32aa0614562de0a2d0916118398fb Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 17 Feb 2025 08:20:25 +0000
Subject: [PATCH 097/109] [X86] getFauxShuffleMask - match 256-bit CONCAT(SUB0,
 SUB1) 64-bit elt patterns as well as 512-bit (#127392)

The 512-bit filter was there to prevent AVX1/2 regressions, but most of those are now handled by canonicalizeShuffleWithOp.

Ideally we need to support smaller element widths as well.

Noticed while triaging #116931
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   6 +-
 .../vector-interleaved-store-i16-stride-3.ll  |  86 ++++---
 .../vector-interleaved-store-i16-stride-4.ll  |  74 +++---
 .../vector-interleaved-store-i16-stride-7.ll  |  70 +++---
 .../vector-interleaved-store-i16-stride-8.ll  | 196 +++++++--------
 .../vector-interleaved-store-i32-stride-3.ll  | 152 ++++++------
 .../vector-interleaved-store-i32-stride-4.ll  |  76 +++---
 .../vector-interleaved-store-i8-stride-3.ll   |  66 +++---
 .../vector-interleaved-store-i8-stride-7.ll   | 224 +++++++++---------
 .../vector-interleaved-store-i8-stride-8.ll   | 188 +++++++--------
 .../test/CodeGen/X86/vector-shuffle-256-v4.ll |  68 +++++-
 llvm/test/CodeGen/X86/widen_fadd.ll           |  84 ++-----
 llvm/test/CodeGen/X86/widen_fdiv.ll           |  45 +---
 llvm/test/CodeGen/X86/widen_fmul.ll           |  84 ++-----
 llvm/test/CodeGen/X86/widen_fsub.ll           |  84 ++-----
 15 files changed, 676 insertions(+), 827 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9592137b34842..21b08a4a93fc7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6130,11 +6130,9 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
       return true;
     }
     // Handle CONCAT(SUB0, SUB1).
-    // Limit this to vXi64 512-bit vector cases to make the most of AVX512
-    // cross lane shuffles.
+    // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
     if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
-        NumBitsPerElt == 64 && NumSizeInBits == 512 &&
-        Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        NumBitsPerElt == 64 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
         Src.getOperand(0).isUndef() &&
         Src.getOperand(1).getValueType() == SubVT &&
         Src.getConstantOperandVal(2) == 0) {
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 0beb304a5673d..4a2e7d55d3e88 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -211,10 +211,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -228,10 +228,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-FP-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -245,10 +245,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-FCP-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -262,10 +262,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -279,10 +279,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-FCP-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -296,10 +296,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -313,10 +313,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -330,12 +330,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512BW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT:    vmovq %xmm1, 16(%rcx)
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rcx)
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, 16(%rcx)
+; AVX512BW-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -345,12 +344,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-FCP-NEXT:    vmovq %xmm1, 16(%rcx)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-FCP-NEXT:    vmovq %xmm0, 16(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -360,12 +358,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512DQ-BW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-NEXT:    vmovq %xmm1, 16(%rcx)
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, (%rcx)
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-BW-NEXT:    vmovq %xmm0, 16(%rcx)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -375,12 +372,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm1, 16(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, 16(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
index 704c92924abfb..71eb606a8665d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
@@ -217,12 +217,12 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31]
-; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT:    vmovdqa %ymm0, (%r8)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -235,12 +235,12 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31]
-; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, (%r8)
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
@@ -269,12 +269,12 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31]
-; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX512-NEXT:    vmovdqa %ymm0, (%r8)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -287,10 +287,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
 ; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%r8)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
@@ -303,12 +302,12 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,2,3,10,11,2,3,10,11,20,21,28,29,20,21,28,29,22,23,30,31,22,23,30,31]
-; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%r8)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -321,10 +320,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%r8)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
@@ -337,10 +335,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
-; AVX512BW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vmovdqa %ymm0, (%r8)
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23]
+; AVX512BW-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BW-NEXT:    vmovdqa %ymm2, (%r8)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -352,10 +349,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%r8)
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, (%r8)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -367,10 +363,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
-; AVX512DQ-BW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%r8)
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm2, (%r8)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -382,10 +377,9 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, (%r8)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index f135b2f1577ec..351d98540c2a5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -584,22 +584,22 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
-; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm3
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3],zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
+; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
+; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm5, %ymm5
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT:    vpor %ymm5, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[24,25,20,21,22,23,20,21,28,29]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3],zero,zero,ymm4[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
 ; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
@@ -667,28 +667,28 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpermi2q %ymm5, %ymm0, %ymm1
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
-; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
+; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1)
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[0,1,8,9],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,2,3],zero,zero,ymm4[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29]
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
 ; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512-FCP-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
 ; AVX512-FCP-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
@@ -750,28 +750,28 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermi2q %ymm5, %ymm0, %ymm1
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
 ; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
-; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7]
+; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1)
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[0,1,8,9],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,2,3],zero,zero,ymm4[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29]
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
 ; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-FCP-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
 ; AVX512DQ-FCP-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
index ccd2d58702de0..4be6ccd2e3575 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
@@ -456,34 +456,34 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
-; AVX2-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
-; AVX2-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm5[0,2,2,3,4,6,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,4]
+; AVX2-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,4]
+; AVX2-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm4[0,2,2,3,4,6,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[0,1,1,3,4,5,5,7]
+; AVX2-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,3,7,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[3,1,2,3,7,5,6,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[3,1,2,3,7,5,6,7]
+; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
 ; AVX2-NEXT:    vmovdqa %ymm0, 32(%rax)
-; AVX2-NEXT:    vmovdqa %ymm2, (%rax)
+; AVX2-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -504,30 +504,30 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,8,9,0,1,8,9,16,17,18,19,20,21,22,23,18,19,26,27,18,19,26,27]
-; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm1, %ymm3
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,8,9,0,1,8,9,8,9,10,11,12,13,14,15,18,19,26,27,18,19,26,27,24,25,26,27,28,29,30,31]
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm0, %ymm5
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm6, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,0,1,8,9,0,1,8,9,16,17,18,19,20,21,22,23,18,19,26,27,18,19,26,27]
+; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm2, %ymm6
+; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm5, %ymm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,8,9,0,1,8,9,8,9,10,11,12,13,14,15,18,19,26,27,18,19,26,27,24,25,26,27,28,29,30,31]
+; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm0, %ymm6
+; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,4,5,12,13,4,5,12,13,16,17,18,19,20,21,22,23,22,23,30,31,22,23,30,31]
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
+; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm5, %ymm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,5,12,13,4,5,12,13,8,9,10,11,12,13,14,15,22,23,30,31,22,23,30,31,24,25,26,27,28,29,30,31]
 ; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm6, %ymm3
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7]
-; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7]
+; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, 32(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm2, (%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
@@ -588,33 +588,33 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm5[3,1,2,3,7,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm4[3,1,2,3,7,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[0,1,2,0,4,5,6,4]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -676,33 +676,33 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm5[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm4[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7]
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
index f9228707182f7..a2ebecd3e0f87 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
@@ -95,121 +95,113 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512-LABEL: store_i32_stride3_vf2:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vmovlps %xmm1, 16(%rcx)
-; AVX512-NEXT:    vmovaps %xmm0, (%rcx)
+; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, 16(%rcx)
+; AVX512-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i32_stride3_vf2:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT:    vmovlps %xmm1, 16(%rcx)
-; AVX512-FCP-NEXT:    vmovaps %xmm0, (%rcx)
+; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512-FCP-NEXT:    vmovq %xmm0, 16(%rcx)
+; AVX512-FCP-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i32_stride3_vf2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512DQ-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT:    vmovlps %xmm1, 16(%rcx)
-; AVX512DQ-NEXT:    vmovaps %xmm0, (%rcx)
+; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512DQ-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-NEXT:    vmovq %xmm0, 16(%rcx)
+; AVX512DQ-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i32_stride3_vf2:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512DQ-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT:    vmovlps %xmm1, 16(%rcx)
-; AVX512DQ-FCP-NEXT:    vmovaps %xmm0, (%rcx)
+; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-FCP-NEXT:    vmovq %xmm0, 16(%rcx)
+; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
 ; AVX512BW-LABEL: store_i32_stride3_vf2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512BW-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT:    vmovlps %xmm1, 16(%rcx)
-; AVX512BW-NEXT:    vmovaps %xmm0, (%rcx)
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512BW-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, 16(%rcx)
+; AVX512BW-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: store_i32_stride3_vf2:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512BW-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512BW-FCP-NEXT:    vmovlps %xmm1, 16(%rcx)
-; AVX512BW-FCP-NEXT:    vmovaps %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-FCP-NEXT:    vmovq %xmm0, 16(%rcx)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: store_i32_stride3_vf2:
 ; AVX512DQ-BW:       # %bb.0:
-; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512DQ-BW-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-NEXT:    vmovlps %xmm1, 16(%rcx)
-; AVX512DQ-BW-NEXT:    vmovaps %xmm0, (%rcx)
+; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512DQ-BW-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-BW-NEXT:    vmovq %xmm0, 16(%rcx)
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: store_i32_stride3_vf2:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,1,3,5,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vmovlps %xmm1, 16(%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, 16(%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
index 22040e0cdb791..c58352e503ae1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll
@@ -110,16 +110,15 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512-FCP-LABEL: store_i32_stride4_vf2:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vmovaps %ymm0, (%r8)
+; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%r8)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -140,16 +139,15 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512DQ-FCP-LABEL: store_i32_stride4_vf2:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512DQ-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vmovaps %ymm0, (%r8)
+; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512DQ-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%r8)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -170,16 +168,15 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512BW-FCP-LABEL: store_i32_stride4_vf2:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512BW-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT:    vmovaps %ymm0, (%r8)
+; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512BW-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, (%r8)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -200,16 +197,15 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512DQ-BW-FCP-LABEL: store_i32_stride4_vf2:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vmovaps %ymm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, (%r8)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index 3d26171054f2e..ba1621c67f480 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -384,10 +384,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -401,10 +401,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-FP-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -418,10 +418,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-FCP-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -435,10 +435,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -452,10 +452,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-FCP-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -469,10 +469,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -486,10 +486,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -503,10 +503,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512BW-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -520,10 +520,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512BW-FCP-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -537,10 +537,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-BW-NEXT:    vmovq %xmm1, 16(%rcx)
@@ -554,10 +554,10 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,ymm2[1,9],zero,ymm2[2,10],zero,ymm2[3,11],zero,ymm2[4,12],zero,ymm2[5],zero,ymm2[21],zero,zero,ymm2[22],zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm1, 16(%rcx)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index ab968b91153a9..be83db26aa7ed 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -912,24 +912,24 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX2-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpor %ymm5, %ymm3, %ymm3
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT:    vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero
-; AVX2-NEXT:    vpor %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
+; AVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
@@ -954,24 +954,24 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT:    vpor %ymm5, %ymm3, %ymm3
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm5, %ymm6, %ymm5
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero
-; AVX2-FP-NEXT:    vpor %ymm4, %ymm2, %ymm2
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
+; AVX2-FP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm0
@@ -996,22 +996,22 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
 ; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7]
 ; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
-; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm3
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
+; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
+; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm5, %ymm5
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,4,8],zero,zero,zero,zero,ymm5[1,5,9],zero,zero,zero,zero,ymm5[2,6,18],zero,zero,zero,zero,ymm5[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT:    vpor %ymm5, %ymm1, %ymm1
+; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
 ; AVX2-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
 ; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
@@ -1038,18 +1038,18 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,5,13],zero,zero,ymm2[u,u,u,6,14],zero,zero,ymm2[u,u,u],zero,zero,ymm2[23,31,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u],zero,zero,ymm2[5,13,u,u,u],zero,zero,ymm2[6,14,u,u,u,23,31],zero,zero,ymm2[u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28]
+; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,5,13],zero,zero,ymm4[u,u,u,6,14],zero,zero,ymm4[u,u,u],zero,zero,ymm4[23,31,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
 ; AVX512-NEXT:    vporq %zmm2, %zmm3, %zmm2
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u]
@@ -1079,28 +1079,28 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
+; AVX512-FCP-NEXT:    vpermi2q %ymm5, %ymm0, %ymm1
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
 ; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
-; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
+; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28]
-; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,zero,zero,ymm3[u,u,u,19,27],zero,zero,ymm3[u,u,u,20,28],zero,zero
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1)
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28]
+; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
 ; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7]
 ; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,ymm1[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512-FCP-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
 ; AVX512-FCP-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
@@ -1121,18 +1121,18 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,5,13],zero,zero,ymm2[u,u,u,6,14],zero,zero,ymm2[u,u,u],zero,zero,ymm2[23,31,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u],zero,zero,ymm2[5,13,u,u,u],zero,zero,ymm2[6,14,u,u,u,23,31],zero,zero,ymm2[u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,5,13],zero,zero,ymm4[u,u,u,6,14],zero,zero,ymm4[u,u,u],zero,zero,ymm4[23,31,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
 ; AVX512DQ-NEXT:    vporq %zmm2, %zmm3, %zmm2
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u]
@@ -1162,28 +1162,28 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512DQ-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermi2q %ymm5, %ymm0, %ymm1
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
 ; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
+; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28]
-; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,zero,zero,ymm3[u,u,u,19,27],zero,zero,ymm3[u,u,u,20,28],zero,zero
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1)
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28]
+; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1)
 ; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7]
 ; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,ymm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-FCP-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
 ; AVX512DQ-FCP-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
@@ -1241,33 +1241,33 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512BW-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero
-; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512BW-FCP-NEXT:    vpermi2q %ymm5, %ymm0, %ymm1
+; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7]
+; AVX512BW-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vpermd %ymm4, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
+; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm5
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,4,8],zero,zero,zero,zero,ymm5[1,5,9],zero,zero,zero,zero,ymm5[2,6,18],zero,zero,zero,zero,ymm5[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm5, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX512BW-FCP-NEXT:    vpor %ymm4, %ymm2, %ymm2
 ; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
 ; AVX512BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
 ; AVX512BW-FCP-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
-; AVX512BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
-; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-FCP-NEXT:    vmovq %xmm2, 48(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm1, 32(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, (%rax)
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-FCP-NEXT:    vmovq %xmm1, 48(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, 32(%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -1320,33 +1320,33 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpermi2q %ymm5, %ymm0, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7]
+; AVX512DQ-BW-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,3,5,0,5,1,3,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,4,8],zero,zero,zero,zero,ymm5[1,5,9],zero,zero,zero,zero,ymm5[2,6,18],zero,zero,zero,zero,ymm5[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm5, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %ymm4, %ymm2, %ymm2
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
 ; AVX512DQ-BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
 ; AVX512DQ-BW-FCP-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm2, 48(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm1, 32(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm1, 48(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, 32(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index 3a70df7617f18..675412defbb24 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -869,26 +869,26 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,zero,ymm1[19,27]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[18,26],zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero
-; AVX2-NEXT:    vpor %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpor %ymm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[18,26],zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27]
+; AVX2-NEXT:    vpor %ymm3, %ymm6, %ymm3
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm4[0,8],zero,zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero
+; AVX2-NEXT:    vpor %ymm1, %ymm6, %ymm1
 ; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm2[4,12],zero,zero,zero,zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,ymm2[22,30],zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31]
+; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[4,12],zero,zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[22,30],zero,zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero
+; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, 32(%rax)
-; AVX2-NEXT:    vmovdqa %ymm2, (%rax)
+; AVX2-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -909,26 +909,26 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,zero,ymm1[19,27]
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[18,26],zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero
-; AVX2-FP-NEXT:    vpor %ymm4, %ymm2, %ymm2
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero
-; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT:    vpor %ymm6, %ymm4, %ymm4
-; AVX2-FP-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[18,26],zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27]
+; AVX2-FP-NEXT:    vpor %ymm3, %ymm6, %ymm3
+; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm4[0,8],zero,zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero
+; AVX2-FP-NEXT:    vpor %ymm1, %ymm6, %ymm1
 ; AVX2-FP-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT:    vpor %ymm3, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm2[4,12],zero,zero,zero,zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,ymm2[22,30],zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31]
+; AVX2-FP-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[4,12],zero,zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[22,30],zero,zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero
+; AVX2-FP-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX2-FP-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, 32(%rax)
-; AVX2-FP-NEXT:    vmovdqa %ymm2, (%rax)
+; AVX2-FP-NEXT:    vmovdqa %ymm1, (%rax)
 ; AVX2-FP-NEXT:    vzeroupper
 ; AVX2-FP-NEXT:    retq
 ;
@@ -989,29 +989,29 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm1, %ymm3
-; AVX512-NEXT:    vpshufb %ymm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
-; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
-; AVX512-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512-NEXT:    vpord %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm6
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[3],ymm1[4],ymm6[5],ymm1[6],ymm6[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31]
+; AVX512-NEXT:    vpshufb %ymm1, %ymm5, %ymm2
+; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27]
+; AVX512-NEXT:    vpshufb %ymm2, %ymm5, %ymm3
+; AVX512-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512-NEXT:    vpord %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1073,29 +1073,29 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31]
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm1, %ymm4
-; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128]
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpord %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm6
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[3],ymm1[4],ymm6[5],ymm1[6],ymm6[7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128]
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm5, %ymm2
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27]
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm5, %ymm3
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm4, %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512DQ-NEXT:    vpord %zmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -1198,18 +1198,14 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15]
-; AVX512BW-FCP-NEXT:    vpermd %zmm1, %zmm2, %zmm1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7]
-; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,2,16,18,0,2,16,18,1,3,17,19,1,3,17,19]
+; AVX512BW-FCP-NEXT:    vpermt2d %zmm3, %zmm4, %zmm2
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63]
+; AVX512BW-FCP-NEXT:    vpermt2d %zmm1, %zmm4, %zmm0
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movw $-21846, %cx # imm = 0xAAAA
 ; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
@@ -1272,18 +1268,14 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm1, %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,2,16,18,0,2,16,18,1,3,17,19,1,3,17,19]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm3, %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm1, %zmm4, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movw $-21846, %cx # imm = 0xAAAA
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 81f79f3b1399a..da65fecba773b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -776,12 +776,34 @@ define <4 x double> @shuffle_v4f64_0044(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @shuffle_v4f64_0044_v2f64(<2 x double> %a, <2 x double> %b) {
-; ALL-LABEL: shuffle_v4f64_0044_v2f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; ALL-NEXT:    retq
+; AVX1OR2-LABEL: shuffle_v4f64_0044_v2f64:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4f64_0044_v2f64:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_0044_v2f64:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512VL-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,0,4,4]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_0044_v2f64:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-FAST-PERLANE-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %1 = shufflevector <2 x double> %a, <2 x double> poison, <2 x i32> <i32 0, i32 0>
   %2 = shufflevector <2 x double> %b, <2 x double> poison, <2 x i32> <i32 0, i32 0>
   %3 = shufflevector <2 x double> %1, <2 x double> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -789,12 +811,34 @@ define <4 x double> @shuffle_v4f64_0044_v2f64(<2 x double> %a, <2 x double> %b)
 }
 
 define <4 x double> @shuffle_v4f64_1032_v2f64(<2 x double> %a, <2 x double> %b) {
-; ALL-LABEL: shuffle_v4f64_1032_v2f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; ALL-NEXT:    retq
+; AVX1OR2-LABEL: shuffle_v4f64_1032_v2f64:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4f64_1032_v2f64:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_1032_v2f64:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512VL-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [1,0,5,4]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_1032_v2f64:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-FAST-PERLANE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-FAST-PERLANE-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
   %1 = shufflevector <2 x double> %a, <2 x double> poison, <2 x i32> <i32 1, i32 0>
   %2 = shufflevector <2 x double> %b, <2 x double> poison, <2 x i32> <i32 1, i32 0>
   %3 = shufflevector <2 x double> %1, <2 x double> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/widen_fadd.ll b/llvm/test/CodeGen/X86/widen_fadd.ll
index e2c36393da2f6..825ee34561c79 100644
--- a/llvm/test/CodeGen/X86/widen_fadd.ll
+++ b/llvm/test/CodeGen/X86/widen_fadd.ll
@@ -65,70 +65,26 @@ define void @widen_fadd_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) {
 ; SSE-NEXT:    movlps %xmm2, 24(%rdx)
 ; SSE-NEXT:    retq
 ;
-; AVX1OR2-LABEL: widen_fadd_v2f32_v8f32:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT:    vaddps %xmm4, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT:    vaddps %xmm4, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT:    vaddps %xmm4, %xmm2, %xmm2
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT:    vaddps %xmm4, %xmm3, %xmm3
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1OR2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1OR2-NEXT:    vmovups %ymm0, (%rdx)
-; AVX1OR2-NEXT:    vzeroupper
-; AVX1OR2-NEXT:    retq
-;
-; AVX512F-LABEL: widen_fadd_v2f32_v8f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vaddps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vaddps %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vaddps %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vaddps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512F-NEXT:    vmovups %ymm0, (%rdx)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: widen_fadd_v2f32_v8f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vaddps %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vaddps %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vaddps %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vaddps %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512VL-NEXT:    vmovups %ymm0, (%rdx)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: widen_fadd_v2f32_v8f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vaddps %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vaddps %xmm4, %xmm1, %xmm1
+; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vaddps %xmm4, %xmm2, %xmm2
+; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vaddps %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT:    vmovups %ymm0, (%rdx)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %a2 = getelementptr inbounds i8, ptr %a0, i64 8
   %b2 = getelementptr inbounds i8, ptr %b0, i64 8
   %c2 = getelementptr inbounds i8, ptr %c0, i64 8
diff --git a/llvm/test/CodeGen/X86/widen_fdiv.ll b/llvm/test/CodeGen/X86/widen_fdiv.ll
index 4e5695500fbff..c0ec41237d301 100644
--- a/llvm/test/CodeGen/X86/widen_fdiv.ll
+++ b/llvm/test/CodeGen/X86/widen_fdiv.ll
@@ -65,44 +65,13 @@ define void @widen_fdiv_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) {
 ; SSE-NEXT:    movlps %xmm3, 24(%rdx)
 ; SSE-NEXT:    retq
 ;
-; AVX1OR2-LABEL: widen_fdiv_v2f32_v8f32:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovups (%rdi), %ymm0
-; AVX1OR2-NEXT:    vdivps (%rsi), %ymm0, %ymm0
-; AVX1OR2-NEXT:    vmovups %ymm0, (%rdx)
-; AVX1OR2-NEXT:    vzeroupper
-; AVX1OR2-NEXT:    retq
-;
-; AVX512F-LABEL: widen_fdiv_v2f32_v8f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovups (%rdi), %ymm0
-; AVX512F-NEXT:    vdivps (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovups %ymm0, (%rdx)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: widen_fdiv_v2f32_v8f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512VL-NEXT:    vdivps %xmm5, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512VL-NEXT:    vdivps %xmm6, %xmm3, %xmm3
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm3
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vdivps %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512VL-NEXT:    vmovups %ymm0, (%rdx)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: widen_fdiv_v2f32_v8f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovups (%rdi), %ymm0
+; AVX-NEXT:    vdivps (%rsi), %ymm0, %ymm0
+; AVX-NEXT:    vmovups %ymm0, (%rdx)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %a2 = getelementptr inbounds i8, ptr %a0, i64 8
   %b2 = getelementptr inbounds i8, ptr %b0, i64 8
   %c2 = getelementptr inbounds i8, ptr %c0, i64 8
diff --git a/llvm/test/CodeGen/X86/widen_fmul.ll b/llvm/test/CodeGen/X86/widen_fmul.ll
index fc099e7c68969..7011419fbc6fc 100644
--- a/llvm/test/CodeGen/X86/widen_fmul.ll
+++ b/llvm/test/CodeGen/X86/widen_fmul.ll
@@ -65,70 +65,26 @@ define void @widen_fmul_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) {
 ; SSE-NEXT:    movlps %xmm2, 24(%rdx)
 ; SSE-NEXT:    retq
 ;
-; AVX1OR2-LABEL: widen_fmul_v2f32_v8f32:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT:    vmulps %xmm4, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT:    vmulps %xmm4, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT:    vmulps %xmm4, %xmm2, %xmm2
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT:    vmulps %xmm4, %xmm3, %xmm3
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1OR2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1OR2-NEXT:    vmovups %ymm0, (%rdx)
-; AVX1OR2-NEXT:    vzeroupper
-; AVX1OR2-NEXT:    retq
-;
-; AVX512F-LABEL: widen_fmul_v2f32_v8f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vmulps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vmulps %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vmulps %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vmulps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512F-NEXT:    vmovups %ymm0, (%rdx)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: widen_fmul_v2f32_v8f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vmulps %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vmulps %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vmulps %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vmulps %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512VL-NEXT:    vmovups %ymm0, (%rdx)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: widen_fmul_v2f32_v8f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vmulps %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vmulps %xmm4, %xmm1, %xmm1
+; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vmulps %xmm4, %xmm2, %xmm2
+; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vmulps %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT:    vmovups %ymm0, (%rdx)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %a2 = getelementptr inbounds i8, ptr %a0, i64 8
   %b2 = getelementptr inbounds i8, ptr %b0, i64 8
   %c2 = getelementptr inbounds i8, ptr %c0, i64 8
diff --git a/llvm/test/CodeGen/X86/widen_fsub.ll b/llvm/test/CodeGen/X86/widen_fsub.ll
index 3256d5c6f5e3f..915f122b50386 100644
--- a/llvm/test/CodeGen/X86/widen_fsub.ll
+++ b/llvm/test/CodeGen/X86/widen_fsub.ll
@@ -65,70 +65,26 @@ define void @widen_fsub_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) {
 ; SSE-NEXT:    movlps %xmm3, 24(%rdx)
 ; SSE-NEXT:    retq
 ;
-; AVX1OR2-LABEL: widen_fsub_v2f32_v8f32:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT:    vsubps %xmm4, %xmm0, %xmm0
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT:    vsubps %xmm4, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT:    vsubps %xmm4, %xmm2, %xmm2
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX1OR2-NEXT:    vsubps %xmm4, %xmm3, %xmm3
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1OR2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1OR2-NEXT:    vmovups %ymm0, (%rdx)
-; AVX1OR2-NEXT:    vzeroupper
-; AVX1OR2-NEXT:    retq
-;
-; AVX512F-LABEL: widen_fsub_v2f32_v8f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vsubps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vsubps %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vsubps %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT:    vsubps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512F-NEXT:    vmovups %ymm0, (%rdx)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: widen_fsub_v2f32_v8f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vsubps %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vsubps %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vsubps %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT:    vsubps %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512VL-NEXT:    vmovups %ymm0, (%rdx)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: widen_fsub_v2f32_v8f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vsubps %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vsubps %xmm4, %xmm1, %xmm1
+; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vsubps %xmm4, %xmm2, %xmm2
+; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT:    vsubps %xmm4, %xmm3, %xmm3
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT:    vmovups %ymm0, (%rdx)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %a2 = getelementptr inbounds i8, ptr %a0, i64 8
   %b2 = getelementptr inbounds i8, ptr %b0, i64 8
   %c2 = getelementptr inbounds i8, ptr %c0, i64 8

From 6684a5970e74b8b4c0c83361a90e25dae9646db0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy@ericsson.com>
Date: Mon, 17 Feb 2025 09:37:29 +0100
Subject: [PATCH 098/109] [analyzer][NFC] Trivial cleanup in ArrayBoundChecker
 (#126941)

Two small stylistic improvements in code that I wrote ~a year ago:
1. fix a typo in a comment; and
2. simplify the code of `tryDividePair` by swapping the true and the
false branches.
---
 .../Checkers/ArrayBoundChecker.cpp              | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp
index 109faacf1726a..f56e9192d1d66 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp
@@ -323,7 +323,7 @@ compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
   // we want to ensure that assumptions coming from this precondition and
   // assumptions coming from regular C/C++ operator calls are represented by
   // constraints on the same symbolic expression. A solution that would
-  // evaluate these "mathematical" compariosns through a separate pathway would
+  // evaluate these "mathematical" comparisons through a separate pathway would
   // be a step backwards in this sense.
 
   const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
@@ -394,14 +394,13 @@ static bool tryDividePair(std::optional<int64_t> &Val1,
     return false;
   const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
   const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
-  if (!Val1HasRemainder && !Val2HasRemainder) {
-    if (Val1)
-      *Val1 /= Divisor;
-    if (Val2)
-      *Val2 /= Divisor;
-    return true;
-  }
-  return false;
+  if (Val1HasRemainder || Val2HasRemainder)
+    return false;
+  if (Val1)
+    *Val1 /= Divisor;
+  if (Val2)
+    *Val2 /= Divisor;
+  return true;
 }
 
 static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,

From 94585dc59de23c52274fc542df7b821c7e6bb326 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 17 Feb 2025 08:50:38 +0000
Subject: [PATCH 099/109] [X86] Add test coverage for #116931

---
 .../CodeGen/X86/vector-shuffle-256-v32.ll     | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 15e287d66754b..176ba696e6540 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -5045,6 +5045,64 @@ define <32 x i8> @shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_
   ret <32 x i8> %5
 }
 
+; PR116931
+define void @shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31(ptr %out, <32 x i8> %a0) {
+; AVX1-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vmovdqa %xmm0, 16(%rdi)
+; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2OR512VL-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
+; AVX2OR512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX2OR512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX2OR512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2OR512VL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2OR512VL-NEXT:    vmovdqa %xmm0, 16(%rdi)
+; AVX2OR512VL-NEXT:    vmovdqa %xmm2, (%rdi)
+; AVX2OR512VL-NEXT:    vzeroupper
+; AVX2OR512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
+; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; XOPAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; XOPAVX1-NEXT:    vmovdqa %xmm0, 16(%rdi)
+; XOPAVX1-NEXT:    vmovdqa %xmm2, (%rdi)
+; XOPAVX1-NEXT:    vzeroupper
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
+; XOPAVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; XOPAVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; XOPAVX2-NEXT:    vmovdqa %xmm0, 16(%rdi)
+; XOPAVX2-NEXT:    vmovdqa %xmm2, (%rdi)
+; XOPAVX2-NEXT:    vzeroupper
+; XOPAVX2-NEXT:    retq
+  %r = shufflevector <32 x i8> %a0, <32 x i8> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i8> %r, ptr %out, align 32
+  ret void
+}
+
 define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX1-LABEL: PR28136:
 ; AVX1:       # %bb.0:

From 948a8477c6a966ee8509400d2857706e933f4149 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Mon, 17 Feb 2025 09:04:29 +0000
Subject: [PATCH 100/109] [WebAssembly] Recognise EXTEND_HIGH (#123325)

When lowering EXTEND_VECTOR_INREG, check whether the operand is a
shuffle that is moving the top half of a vector into the lower half. If
so, we can EXTEND_HIGH the input to the shuffle instead.
---
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  32 +++
 .../CodeGen/WebAssembly/extend-shuffles.ll    | 227 ++++++++++++++++++
 .../WebAssembly/int-mac-reduction-loops.ll    |  12 +-
 3 files changed, 263 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/CodeGen/WebAssembly/extend-shuffles.ll

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index fedad25c775e2..4fc79b3d6e3f8 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2266,6 +2266,32 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                      Op.getOperand(1));
 }
 
+static SDValue GetExtendHigh(SDValue Op, unsigned UserOpc, EVT VT,
+                             SelectionDAG &DAG) {
+  if (Op.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return SDValue();
+
+  assert((UserOpc == WebAssemblyISD::EXTEND_LOW_U ||
+          UserOpc == WebAssemblyISD::EXTEND_LOW_S) &&
+         "expected extend_low");
+  auto *Shuffle = cast<ShuffleVectorSDNode>(Op.getNode());
+
+  ArrayRef<int> Mask = Shuffle->getMask();
+  // Look for a shuffle which moves from the high half to the low half.
+  size_t FirstIdx = Mask.size() / 2;
+  for (size_t i = 0; i < Mask.size() / 2; ++i) {
+    if (Mask[i] != static_cast<int>(FirstIdx + i)) {
+      return SDValue();
+    }
+  }
+
+  SDLoc DL(Op);
+  unsigned Opc = UserOpc == WebAssemblyISD::EXTEND_LOW_S
+                     ? WebAssemblyISD::EXTEND_HIGH_S
+                     : WebAssemblyISD::EXTEND_HIGH_U;
+  return DAG.getNode(Opc, DL, VT, Shuffle->getOperand(0));
+}
+
 SDValue
 WebAssemblyTargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
                                                     SelectionDAG &DAG) const {
@@ -2295,6 +2321,12 @@ WebAssemblyTargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
     break;
   }
 
+  if (Scale == 2) {
+    // See if we can use EXTEND_HIGH.
+    if (auto ExtendHigh = GetExtendHigh(Op.getOperand(0), Ext, VT, DAG))
+      return ExtendHigh;
+  }
+
   SDValue Ret = Src;
   while (Scale != 1) {
     Ret = DAG.getNode(Ext, DL,
diff --git a/llvm/test/CodeGen/WebAssembly/extend-shuffles.ll b/llvm/test/CodeGen/WebAssembly/extend-shuffles.ll
new file mode 100644
index 0000000000000..7736e78271e55
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/extend-shuffles.ll
@@ -0,0 +1,227 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -mtriple=wasm32 -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=SIMD128
+
+target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-n32:64-S128-ni:1:10:20"
+target triple = "wasm32"
+
+define <4 x i32> @sext_high_v4i8(<8 x i8> %in) {
+; SIMD128-LABEL: sext_high_v4i8:
+; SIMD128:         .functype sext_high_v4i8 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i16x8.extend_low_i8x16_s $push1=, $pop0
+; SIMD128-NEXT:    i32x4.extend_low_i16x8_s $push2=, $pop1
+; SIMD128-NEXT:    return $pop2
+ %shuffle = shufflevector <8 x i8> %in, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %res = sext <4 x i8> %shuffle to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @zext_high_v4i8(<8 x i8> %in) {
+; SIMD128-LABEL: zext_high_v4i8:
+; SIMD128:         .functype zext_high_v4i8 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT:    i16x8.extend_low_i8x16_u $push1=, $pop0
+; SIMD128-NEXT:    i32x4.extend_low_i16x8_u $push2=, $pop1
+; SIMD128-NEXT:    return $pop2
+ %shuffle = shufflevector <8 x i8> %in, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %res = zext <4 x i8> %shuffle to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @sext_high_v8i8(<16 x i8> %in) {
+; SIMD128-LABEL: sext_high_v8i8:
+; SIMD128:         .functype sext_high_v8i8 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extend_high_i8x16_s $push0=, $0
+; SIMD128-NEXT:    return $pop0
+ %shuffle = shufflevector <16 x i8> %in, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %res = sext <8 x i8> %shuffle to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @zext_high_v8i8(<16 x i8> %in) {
+; SIMD128-LABEL: zext_high_v8i8:
+; SIMD128:         .functype zext_high_v8i8 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extend_high_i8x16_u $push0=, $0
+; SIMD128-NEXT:    return $pop0
+ %shuffle = shufflevector <16 x i8> %in, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %res = zext <8 x i8> %shuffle to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <2 x i32> @sext_high_v2i16(<4 x i16> %in) {
+; SIMD128-LABEL: sext_high_v2i16:
+; SIMD128:         .functype sext_high_v2i16 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i32x4.extend_low_i16x8_s $push1=, $pop0
+; SIMD128-NEXT:    return $pop1
+ %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+ %res = sext <2 x i16> %shuffle to <2 x i32>
+ ret <2 x i32> %res
+}
+
+define <2 x i32> @zext_high_v2i16(<4 x i16> %in) {
+; SIMD128-LABEL: zext_high_v2i16:
+; SIMD128:         .functype zext_high_v2i16 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; SIMD128-NEXT:    i32x4.extend_low_i16x8_u $push1=, $pop0
+; SIMD128-NEXT:    return $pop1
+ %shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+ %res = zext <2 x i16> %shuffle to <2 x i32>
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @sext_high_v4i16(<8 x i16> %in) {
+; SIMD128-LABEL: sext_high_v4i16:
+; SIMD128:         .functype sext_high_v4i16 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extend_high_i16x8_s $push0=, $0
+; SIMD128-NEXT:    return $pop0
+ %shuffle = shufflevector <8 x i16> %in, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %res = sext <4 x i16> %shuffle to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @zext_high_v4i16(<8 x i16> %in) {
+; SIMD128-LABEL: zext_high_v4i16:
+; SIMD128:         .functype zext_high_v4i16 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extend_high_i16x8_u $push0=, $0
+; SIMD128-NEXT:    return $pop0
+ %shuffle = shufflevector <8 x i16> %in, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %res = zext <4 x i16> %shuffle to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @sext_high_v2i32(<4 x i32> %in) {
+; SIMD128-LABEL: sext_high_v2i32:
+; SIMD128:         .functype sext_high_v2i32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extend_high_i32x4_s $push0=, $0
+; SIMD128-NEXT:    return $pop0
+ %shuffle = shufflevector <4 x i32> %in, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+ %res = sext <2 x i32> %shuffle to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @zext_high_v2i32(<4 x i32> %in) {
+; SIMD128-LABEL: zext_high_v2i32:
+; SIMD128:         .functype zext_high_v2i32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extend_high_i32x4_u $push0=, $0
+; SIMD128-NEXT:    return $pop0
+ %shuffle = shufflevector <4 x i32> %in, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+ %res = zext <2 x i32> %shuffle to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <4 x i32> @sext_low_v4i8(<8 x i8> %in) {
+; SIMD128-LABEL: sext_low_v4i8:
+; SIMD128:         .functype sext_low_v4i8 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extend_low_i8x16_s $push0=, $0
+; SIMD128-NEXT:    i32x4.extend_low_i16x8_s $push1=, $pop0
+; SIMD128-NEXT:    return $pop1
+ %shuffle = shufflevector <8 x i8> %in, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = sext <4 x i8> %shuffle to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @zext_low_v4i8(<8 x i8> %in) {
+; SIMD128-LABEL: zext_low_v4i8:
+; SIMD128:         .functype zext_low_v4i8 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extend_low_i8x16_u $push0=, $0
+; SIMD128-NEXT:    i32x4.extend_low_i16x8_u $push1=, $pop0
+; SIMD128-NEXT:    return $pop1
+ %shuffle = shufflevector <8 x i8> %in, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = zext <4 x i8> %shuffle to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @sext_low_v8i8(<16 x i8> %in) {
+; SIMD128-LABEL: sext_low_v8i8:
+; SIMD128:         .functype sext_low_v8i8 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extend_low_i8x16_s $push0=, $0
+; SIMD128-NEXT:    return $pop0
+ %shuffle = shufflevector <16 x i8> %in, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = sext <8 x i8> %shuffle to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @zext_low_v8i8(<16 x i8> %in) {
+; SIMD128-LABEL: zext_low_v8i8:
+; SIMD128:         .functype zext_low_v8i8 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i16x8.extend_low_i8x16_u $push0=, $0
+; SIMD128-NEXT:    return $pop0
+ %shuffle = shufflevector <16 x i8> %in, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = zext <8 x i8> %shuffle to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @sext_low_v4i16(<8 x i16> %in) {
+; SIMD128-LABEL: sext_low_v4i16:
+; SIMD128:         .functype sext_low_v4i16 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extend_low_i16x8_s $push0=, $0
+; SIMD128-NEXT:    return $pop0
+ %shuffle = shufflevector <8 x i16> %in, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = sext <4 x i16> %shuffle to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @zext_low_v4i16(<8 x i16> %in) {
+; SIMD128-LABEL: zext_low_v4i16:
+; SIMD128:         .functype zext_low_v4i16 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i32x4.extend_low_i16x8_u $push0=, $0
+; SIMD128-NEXT:    return $pop0
+ %shuffle = shufflevector <8 x i16> %in, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = zext <4 x i16> %shuffle to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @sext_low_v2i32(<4 x i32> %in) {
+; SIMD128-LABEL: sext_low_v2i32:
+; SIMD128:         .functype sext_low_v2i32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extend_low_i32x4_s $push0=, $0
+; SIMD128-NEXT:    return $pop0
+ %shuffle = shufflevector <4 x i32> %in, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+ %res = sext <2 x i32> %shuffle to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @zext_low_v2i32(<4 x i32> %in) {
+; SIMD128-LABEL: zext_low_v2i32:
+; SIMD128:         .functype zext_low_v2i32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extend_low_i32x4_u $push0=, $0
+; SIMD128-NEXT:    return $pop0
+ %shuffle = shufflevector <4 x i32> %in, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+ %res = zext <2 x i32> %shuffle to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @multi_use_ext_v2i32(<4 x i32> %in) {
+; SIMD128-LABEL: multi_use_ext_v2i32:
+; SIMD128:         .functype multi_use_ext_v2i32 (v128) -> (v128)
+; SIMD128-NEXT:  # %bb.0:
+; SIMD128-NEXT:    i64x2.extend_high_i32x4_u $push1=, $0
+; SIMD128-NEXT:    i64x2.extend_high_i32x4_s $push0=, $0
+; SIMD128-NEXT:    i64x2.add $push2=, $pop1, $pop0
+; SIMD128-NEXT:    return $pop2
+ %shuffle = shufflevector <4 x i32> %in, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+ %zext = zext <2 x i32> %shuffle to <2 x i64>
+ %sext = sext <2 x i32> %shuffle to <2 x i64>
+ %res = add <2 x i64> %zext, %sext
+ ret <2 x i64> %res
+}
diff --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
index c9af8e2268f1c..0184e22a3b40d 100644
--- a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
+++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
@@ -151,9 +151,8 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: v128.load
 ; MAX-BANDWIDTH: v128.load
 ; MAX-BANDWIDTH: i32x4.mul
-; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
+; MAX-BANDWIDTH: i64x2.extend_high_i32x4_s
 ; MAX-BANDWIDTH: i64x2.add
-; MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
 ; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
 ; MAX-BANDWIDTH: i64x2.add
 
@@ -272,11 +271,9 @@ define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapt
 ; CHECK:    i32x4.add
 
 ; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
 ; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
 ; MAX-BANDWIDTH: i32x4.mul
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
@@ -377,9 +374,8 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: v128.load
 ; MAX-BANDWIDTH: v128.load
 ; MAX-BANDWIDTH: i32x4.mul
-; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
+; MAX-BANDWIDTH: i64x2.extend_high_i32x4_u
 ; MAX-BANDWIDTH: i64x2.add
-; MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
 ; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
 ; MAX-BANDWIDTH: i64x2.add
 

From ea7897a617b897f87f148db48cda9fcc7c1c53dc Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker@arm.com>
Date: Mon, 17 Feb 2025 09:09:52 +0000
Subject: [PATCH 101/109] [WebAssembly] Enable interleaved memory accesses
 (#125696)

Enable the vectorizer to access interleaved memory. This means that,
when it's decided to be profitable, the memory accesses can be
vectorized instead of the value being built up by a sequence of
load_lane instructions. This will often increase the vectorization
factor of the loop, leading to significantly better performance.

I run a reasonably large collection of benchmarks and most are not
affected by this change, with most performance changes <1%. But I see a
2.5% speedup for the total run time of TSVC, 1% speedup for SPEC2017
x265, 28% speedup for a ResNet workload and 95% for libyuv. This is
running V8 on an AArch64 box.
---
 .../WebAssemblyTargetTransformInfo.h          |   2 +
 llvm/test/CodeGen/WebAssembly/interleave.ll   | 361 ++++++++++++++++++
 2 files changed, 363 insertions(+)
 create mode 100644 llvm/test/CodeGen/WebAssembly/interleave.ll

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 4d498b154c521..ba66306374c6c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -57,6 +57,8 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
   /// \name Vector TTI Implementations
   /// @{
 
+  bool enableInterleavedAccessVectorization() { return true; }
+
   unsigned getNumberOfRegisters(unsigned ClassID) const;
   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
   InstructionCost getArithmeticInstrCost(
diff --git a/llvm/test/CodeGen/WebAssembly/interleave.ll b/llvm/test/CodeGen/WebAssembly/interleave.ll
new file mode 100644
index 0000000000000..c20b5e42c4850
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/interleave.ll
@@ -0,0 +1,361 @@
+; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
+
+target triple = "wasm32"
+target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-n32:64-S128-ni:1:10:20"
+
+%struct.Output32x2 = type { i32, i32 }
+%struct.Input8x2 = type { i8, i8 }
+%struct.Output32x4 = type { i32, i32, i32, i32 }
+%struct.Input8x4 = type { i8, i8, i8, i8 }
+%struct.Input16x2 = type { i16, i16 }
+%struct.Input16x4 = type { i16, i16, i16, i16 }
+%struct.Input32x2 = type { i32, i32 }
+%struct.Input32x4 = type { i32, i32, i32, i32 }
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define hidden void @accumulate8x2(ptr dead_on_unwind noalias writable sret(%struct.Output32x2) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: accumulate8x2:
+; CHECK: loop
+; CHECK: v128.load64_zero
+; CHECK: i8x16.shuffle 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+  %4 = load i32, ptr %0, align 4
+  %5 = icmp eq i32 %2, 0
+  br i1 %5, label %10, label %6
+
+6:                                                ; preds = %3
+  %7 = getelementptr inbounds nuw i8, ptr %0, i32 4
+  %8 = load i32, ptr %7, align 4
+  br label %12
+
+9:                                                ; preds = %12
+  store i32 %23, ptr %7, align 4
+  br label %10
+
+10:                                               ; preds = %9, %3
+  %11 = phi i32 [ %21, %9 ], [ %4, %3 ]
+  store i32 %11, ptr %0, align 4
+  ret void
+
+12:                                               ; preds = %6, %12
+  %13 = phi i32 [ %8, %6 ], [ %23, %12 ]
+  %14 = phi i32 [ 0, %6 ], [ %24, %12 ]
+  %15 = phi i32 [ %4, %6 ], [ %21, %12 ]
+  %16 = getelementptr inbounds nuw %struct.Input8x2, ptr %1, i32 %14
+  %17 = load i8, ptr %16, align 1
+  %18 = getelementptr inbounds nuw i8, ptr %16, i32 1
+  %19 = load i8, ptr %18, align 1
+  %20 = zext i8 %17 to i32
+  %21 = add i32 %15, %20
+  %22 = zext i8 %19 to i32
+  %23 = add i32 %13, %22
+  %24 = add nuw i32 %14, 1
+  %25 = icmp eq i32 %24, %2
+  br i1 %25, label %9, label %12
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define hidden void @accumulate8x4(ptr dead_on_unwind noalias writable sret(%struct.Output32x4) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: accumulate8x4
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i8x16.shuffle 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+  %4 = load i32, ptr %0, align 4
+  %5 = icmp eq i32 %2, 0
+  br i1 %5, label %14, label %6
+
+6:                                                ; preds = %3
+  %7 = getelementptr inbounds nuw i8, ptr %0, i32 4
+  %8 = getelementptr inbounds nuw i8, ptr %0, i32 8
+  %9 = getelementptr inbounds nuw i8, ptr %0, i32 12
+  %10 = load i32, ptr %7, align 4
+  %11 = load i32, ptr %8, align 4
+  %12 = load i32, ptr %9, align 4
+  br label %16
+
+13:                                               ; preds = %16
+  store i32 %33, ptr %7, align 4
+  store i32 %35, ptr %8, align 4
+  store i32 %37, ptr %9, align 4
+  br label %14
+
+14:                                               ; preds = %13, %3
+  %15 = phi i32 [ %31, %13 ], [ %4, %3 ]
+  store i32 %15, ptr %0, align 4
+  ret void
+
+16:                                               ; preds = %6, %16
+  %17 = phi i32 [ %12, %6 ], [ %37, %16 ]
+  %18 = phi i32 [ %11, %6 ], [ %35, %16 ]
+  %19 = phi i32 [ %10, %6 ], [ %33, %16 ]
+  %20 = phi i32 [ 0, %6 ], [ %38, %16 ]
+  %21 = phi i32 [ %4, %6 ], [ %31, %16 ]
+  %22 = getelementptr inbounds nuw %struct.Input8x4, ptr %1, i32 %20
+  %23 = load i8, ptr %22, align 1
+  %24 = getelementptr inbounds nuw i8, ptr %22, i32 1
+  %25 = load i8, ptr %24, align 1
+  %26 = getelementptr inbounds nuw i8, ptr %22, i32 2
+  %27 = load i8, ptr %26, align 1
+  %28 = getelementptr inbounds nuw i8, ptr %22, i32 3
+  %29 = load i8, ptr %28, align 1
+  %30 = zext i8 %23 to i32
+  %31 = add i32 %21, %30
+  %32 = zext i8 %25 to i32
+  %33 = add i32 %19, %32
+  %34 = zext i8 %27 to i32
+  %35 = add i32 %18, %34
+  %36 = zext i8 %29 to i32
+  %37 = add i32 %17, %36
+  %38 = add nuw i32 %20, 1
+  %39 = icmp eq i32 %38, %2
+  br i1 %39, label %13, label %16
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define hidden void @accumulate16x2(ptr dead_on_unwind noalias writable sret(%struct.Output32x2) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: accumulate16x2
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i8x16.shuffle 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+  %4 = load i32, ptr %0, align 4
+  %5 = icmp eq i32 %2, 0
+  br i1 %5, label %10, label %6
+
+6:                                                ; preds = %3
+  %7 = getelementptr inbounds nuw i8, ptr %0, i32 4
+  %8 = load i32, ptr %7, align 4
+  br label %12
+
+9:                                                ; preds = %12
+  store i32 %23, ptr %7, align 4
+  br label %10
+
+10:                                               ; preds = %9, %3
+  %11 = phi i32 [ %21, %9 ], [ %4, %3 ]
+  store i32 %11, ptr %0, align 4
+  ret void
+
+12:                                               ; preds = %6, %12
+  %13 = phi i32 [ %8, %6 ], [ %23, %12 ]
+  %14 = phi i32 [ 0, %6 ], [ %24, %12 ]
+  %15 = phi i32 [ %4, %6 ], [ %21, %12 ]
+  %16 = getelementptr inbounds nuw %struct.Input16x2, ptr %1, i32 %14
+  %17 = load i16, ptr %16, align 2
+  %18 = getelementptr inbounds nuw i8, ptr %16, i32 2
+  %19 = load i16, ptr %18, align 2
+  %20 = zext i16 %17 to i32
+  %21 = add i32 %15, %20
+  %22 = zext i16 %19 to i32
+  %23 = add i32 %13, %22
+  %24 = add nuw i32 %14, 1
+  %25 = icmp eq i32 %24, %2
+  br i1 %25, label %9, label %12
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define hidden void @accumulate16x4(ptr dead_on_unwind noalias writable sret(%struct.Output32x4) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: accumulate16x4
+; CHECK: loop
+; CHECK: v128.load 0:p2align=1
+; CHECK: v128.load 16:p2align=1
+; CHECK: i8x16.shuffle 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: i32x4.add
+  %4 = load i32, ptr %0, align 4
+  %5 = icmp eq i32 %2, 0
+  br i1 %5, label %14, label %6
+
+6:                                                ; preds = %3
+  %7 = getelementptr inbounds nuw i8, ptr %0, i32 4
+  %8 = getelementptr inbounds nuw i8, ptr %0, i32 8
+  %9 = getelementptr inbounds nuw i8, ptr %0, i32 12
+  %10 = load i32, ptr %7, align 4
+  %11 = load i32, ptr %8, align 4
+  %12 = load i32, ptr %9, align 4
+  br label %16
+
+13:                                               ; preds = %16
+  store i32 %33, ptr %7, align 4
+  store i32 %35, ptr %8, align 4
+  store i32 %37, ptr %9, align 4
+  br label %14
+
+14:                                               ; preds = %13, %3
+  %15 = phi i32 [ %31, %13 ], [ %4, %3 ]
+  store i32 %15, ptr %0, align 4
+  ret void
+
+16:                                               ; preds = %6, %16
+  %17 = phi i32 [ %12, %6 ], [ %37, %16 ]
+  %18 = phi i32 [ %11, %6 ], [ %35, %16 ]
+  %19 = phi i32 [ %10, %6 ], [ %33, %16 ]
+  %20 = phi i32 [ 0, %6 ], [ %38, %16 ]
+  %21 = phi i32 [ %4, %6 ], [ %31, %16 ]
+  %22 = getelementptr inbounds nuw %struct.Input16x4, ptr %1, i32 %20
+  %23 = load i16, ptr %22, align 2
+  %24 = getelementptr inbounds nuw i8, ptr %22, i32 2
+  %25 = load i16, ptr %24, align 2
+  %26 = getelementptr inbounds nuw i8, ptr %22, i32 4
+  %27 = load i16, ptr %26, align 2
+  %28 = getelementptr inbounds nuw i8, ptr %22, i32 6
+  %29 = load i16, ptr %28, align 2
+  %30 = zext i16 %23 to i32
+  %31 = add i32 %21, %30
+  %32 = zext i16 %25 to i32
+  %33 = add i32 %19, %32
+  %34 = zext i16 %27 to i32
+  %35 = add i32 %18, %34
+  %36 = zext i16 %29 to i32
+  %37 = add i32 %17, %36
+  %38 = add nuw i32 %20, 1
+  %39 = icmp eq i32 %38, %2
+  br i1 %39, label %13, label %16
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define hidden void @accumulate32x2(ptr dead_on_unwind noalias writable sret(%struct.Output32x2) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: accumulate32x2
+; CHECK: loop
+; CHECK: v128.load 0:p2align=2
+; CHECK: v128.load 16:p2align=2
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: i32x4.add
+  %4 = load i32, ptr %0, align 4
+  %5 = icmp eq i32 %2, 0
+  br i1 %5, label %10, label %6
+
+6:                                                ; preds = %3
+  %7 = getelementptr inbounds nuw i8, ptr %0, i32 4
+  %8 = load i32, ptr %7, align 4
+  br label %12
+
+9:                                                ; preds = %12
+  store i32 %21, ptr %7, align 4
+  br label %10
+
+10:                                               ; preds = %9, %3
+  %11 = phi i32 [ %20, %9 ], [ %4, %3 ]
+  store i32 %11, ptr %0, align 4
+  ret void
+
+12:                                               ; preds = %6, %12
+  %13 = phi i32 [ %8, %6 ], [ %21, %12 ]
+  %14 = phi i32 [ 0, %6 ], [ %22, %12 ]
+  %15 = phi i32 [ %4, %6 ], [ %20, %12 ]
+  %16 = getelementptr inbounds nuw %struct.Input32x2, ptr %1, i32 %14
+  %17 = load i32, ptr %16, align 4
+  %18 = getelementptr inbounds nuw i8, ptr %16, i32 4
+  %19 = load i32, ptr %18, align 4
+  %20 = add i32 %15, %17
+  %21 = add i32 %13, %19
+  %22 = add nuw i32 %14, 1
+  %23 = icmp eq i32 %22, %2
+  br i1 %23, label %9, label %12
+}
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define hidden void @accumulate32x4(ptr dead_on_unwind noalias writable sret(%struct.Output32x4) align 4 captures(none) %0, ptr noundef readonly captures(none) %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: accumulate32x4
+; CHECK: v128.load 0:p2align=2
+; CHECK: v128.load 16:p2align=2
+; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load 32:p2align=2
+; CHECK: v128.load 48:p2align=2
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.add
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i32x4.add
+  %4 = load i32, ptr %0, align 4
+  %5 = icmp eq i32 %2, 0
+  br i1 %5, label %14, label %6
+
+6:                                                ; preds = %3
+  %7 = getelementptr inbounds nuw i8, ptr %0, i32 4
+  %8 = getelementptr inbounds nuw i8, ptr %0, i32 8
+  %9 = getelementptr inbounds nuw i8, ptr %0, i32 12
+  %10 = load i32, ptr %7, align 4
+  %11 = load i32, ptr %8, align 4
+  %12 = load i32, ptr %9, align 4
+  br label %16
+
+13:                                               ; preds = %16
+  store i32 %31, ptr %7, align 4
+  store i32 %32, ptr %8, align 4
+  store i32 %33, ptr %9, align 4
+  br label %14
+
+14:                                               ; preds = %13, %3
+  %15 = phi i32 [ %30, %13 ], [ %4, %3 ]
+  store i32 %15, ptr %0, align 4
+  ret void
+
+16:                                               ; preds = %6, %16
+  %17 = phi i32 [ %12, %6 ], [ %33, %16 ]
+  %18 = phi i32 [ %11, %6 ], [ %32, %16 ]
+  %19 = phi i32 [ %10, %6 ], [ %31, %16 ]
+  %20 = phi i32 [ 0, %6 ], [ %34, %16 ]
+  %21 = phi i32 [ %4, %6 ], [ %30, %16 ]
+  %22 = getelementptr inbounds nuw %struct.Input32x4, ptr %1, i32 %20
+  %23 = load i32, ptr %22, align 4
+  %24 = getelementptr inbounds nuw i8, ptr %22, i32 4
+  %25 = load i32, ptr %24, align 4
+  %26 = getelementptr inbounds nuw i8, ptr %22, i32 8
+  %27 = load i32, ptr %26, align 4
+  %28 = getelementptr inbounds nuw i8, ptr %22, i32 12
+  %29 = load i32, ptr %28, align 4
+  %30 = add i32 %21, %23
+  %31 = add i32 %19, %25
+  %32 = add i32 %18, %27
+  %33 = add i32 %17, %29
+  %34 = add nuw i32 %20, 1
+  %35 = icmp eq i32 %34, %2
+  br i1 %35, label %13, label %16
+}

From e0545b5c6d54dcf3e3ef84cbf9695bb1aecd87db Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 17 Feb 2025 01:22:39 -0800
Subject: [PATCH 102/109] [Analysis] Remove getGuaranteedWellDefinedOps
 (#127453)

The last use was removed in:

  commit ac9e67756e0157793d565c2cceaf82e4403f58ba
  Author: Yingwei Zheng <dtcxzyw2333@gmail.com>
  Date:   Mon Feb 26 01:53:16 2024 +0800
---
 llvm/include/llvm/Analysis/ValueTracking.h | 6 ------
 llvm/lib/Analysis/ValueTracking.cpp        | 8 --------
 2 files changed, 14 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index dba54be4c92f8..1b49f8a3e85b1 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -1004,12 +1004,6 @@ bool propagatesPoison(const Use &PoisonOp);
 void getGuaranteedNonPoisonOps(const Instruction *I,
                                SmallVectorImpl<const Value *> &Ops);
 
-/// Insert operands of I into Ops such that I will trigger undefined behavior
-/// if I is executed and that operand is not a well-defined value
-/// (i.e. has undef bits or poison).
-void getGuaranteedWellDefinedOps(const Instruction *I,
-                                 SmallVectorImpl<const Value *> &Ops);
-
 /// Return true if the given instruction must trigger undefined behavior
 /// when I is executed with any operands which appear in KnownPoison holding
 /// a poison value at the point of execution.
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 2a49a10447e0b..e4454c42c7857 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -8175,14 +8175,6 @@ static bool handleGuaranteedWellDefinedOps(const Instruction *I,
   return false;
 }
 
-void llvm::getGuaranteedWellDefinedOps(
-    const Instruction *I, SmallVectorImpl<const Value *> &Operands) {
-  handleGuaranteedWellDefinedOps(I, [&](const Value *V) {
-    Operands.push_back(V);
-    return false;
-  });
-}
-
 /// Enumerates all operands of \p I that are guaranteed to not be poison.
 template <typename CallableT>
 static bool handleGuaranteedNonPoisonOps(const Instruction *I,

From b9c6d3ed26789c33fc7f959198e4459ec4e1d3ac Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 17 Feb 2025 01:31:24 -0800
Subject: [PATCH 103/109] [clang-linker-wrapper] Avoid repeated hash lookups
 (NFC) (#127443)

---
 clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index e78763faad73e..7db8f3e27d704 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -1060,8 +1060,9 @@ Expected<bool> getSymbolsFromBitcode(MemoryBufferRef Buffer, OffloadKind Kind,
       if (Sym.isFormatSpecific() || !Sym.isGlobal())
         continue;
 
-      bool NewSymbol = Syms.count(Sym.getName()) == 0;
-      auto OldSym = NewSymbol ? Sym_None : Syms[Sym.getName()];
+      auto It = Syms.find(Sym.getName());
+      bool NewSymbol = It == Syms.end();
+      auto OldSym = NewSymbol ? Sym_None : It->second;
 
       // We will extract if it defines a currenlty undefined non-weak
       // symbol.

From ff4e21fccc439085f6381076a2ac7d9fa371ab29 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 17 Feb 2025 01:31:52 -0800
Subject: [PATCH 104/109] [clang-tidy] Avoid repeated hash lookups (NFC)
 (#127444)

---
 .../clang-tidy/modernize/DeprecatedHeadersCheck.cpp       | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp
index 6a467910521f5..3d3a79d8eaf21 100644
--- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.cpp
@@ -199,10 +199,10 @@ void IncludeModernizePPCallbacks::InclusionDirective(
   // 2. Insert `using namespace std;` to the beginning of TU.
   // 3. Do nothing and let the user deal with the migration himself.
   SourceLocation DiagLoc = FilenameRange.getBegin();
-  if (CStyledHeaderToCxx.count(FileName) != 0) {
-    IncludesToBeProcessed.emplace_back(
-        IncludeMarker{CStyledHeaderToCxx[FileName], FileName,
-                      FilenameRange.getAsRange(), DiagLoc});
+  if (auto It = CStyledHeaderToCxx.find(FileName);
+      It != CStyledHeaderToCxx.end()) {
+    IncludesToBeProcessed.emplace_back(IncludeMarker{
+        It->second, FileName, FilenameRange.getAsRange(), DiagLoc});
   } else if (DeleteHeaders.count(FileName) != 0) {
     IncludesToBeProcessed.emplace_back(
         // NOLINTNEXTLINE(modernize-use-emplace) - false-positive

From fb14638817004dc96c9401d7f704d7e5cd0ef3fc Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 17 Feb 2025 01:32:25 -0800
Subject: [PATCH 105/109] [DebugInfo] Avoid repeated hash lookups (NFC)
 (#127446)

---
 .../DebugInfo/LogicalView/Readers/LVBinaryReader.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
index 932346e1b011b..513b0d312345e 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
@@ -65,20 +65,24 @@ LVSectionIndex LVSymbolTable::update(LVScope *Function) {
     Name = Function->getName();
   std::string SymbolName(Name);
 
-  if (SymbolName.empty() || (SymbolNames.find(SymbolName) == SymbolNames.end()))
+  if (SymbolName.empty())
+    return SectionIndex;
+
+  auto It = SymbolNames.find(SymbolName);
+  if (It == SymbolNames.end())
     return SectionIndex;
 
   // Update a recorded entry with its logical scope, only if the scope has
   // ranges. That is the case when in DWARF there are 2 DIEs connected via
   // the DW_AT_specification.
   if (Function->getHasRanges()) {
-    SymbolNames[SymbolName].Scope = Function;
-    SectionIndex = SymbolNames[SymbolName].SectionIndex;
+    It->second.Scope = Function;
+    SectionIndex = It->second.SectionIndex;
   } else {
     SectionIndex = UndefinedSectionIndex;
   }
 
-  if (SymbolNames[SymbolName].IsComdat)
+  if (It->second.IsComdat)
     Function->setIsComdat();
 
   LLVM_DEBUG({ print(dbgs()); });

From d49776634e3c4bd62649e8e0fc0ff44951413c69 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 17 Feb 2025 01:32:47 -0800
Subject: [PATCH 106/109] [Hexagon] Avoid repeated map lookups (NFC) (#127447)

---
 llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index d95d4c0d20318..ea7a429056be9 100644
--- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -604,8 +604,10 @@ void HexagonCommonGEP::common() {
       uint32_t NF = N->Flags;
       // If N is used, append all original values of N to the list of
       // original values of Min.
-      if (NF & GepNode::Used)
-        MinUs.insert(Uses[N].begin(), Uses[N].end());
+      if (NF & GepNode::Used) {
+        auto &U = Uses[N];
+        MinUs.insert(U.begin(), U.end());
+      }
       Flags |= NF;
     }
     if (MinUs.empty())

From 82dc2d403066a84ef0051b06f1d179e00331f319 Mon Sep 17 00:00:00 2001
From: Haojian Wu <hokein.wu@gmail.com>
Date: Mon, 17 Feb 2025 10:43:58 +0100
Subject: [PATCH 107/109] [clang][Modules] Remove a resolved issue from
 StandardCPlusPlusModules.rst

The issue has been fixed in https://github.com/llvm/llvm-project/pull/122726
---
 clang/docs/StandardCPlusPlusModules.rst | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst
index 93edce0cf90b7..2720b40382717 100644
--- a/clang/docs/StandardCPlusPlusModules.rst
+++ b/clang/docs/StandardCPlusPlusModules.rst
@@ -1378,14 +1378,6 @@ merging happening in the semantic analyzer. This is due to a divergence in the
 implementation path. This is tracked by
 `#61465 <https://github.com/llvm/llvm-project/issues/61465>`_.
 
-Ignored ``preferred_name`` Attribute
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-When Clang writes BMIs, it will ignore the ``preferred_name`` attribute on
-declarations which use it. Thus, the preferred name will not be displayed in
-the debugger as expected. This is tracked by
-`#56490 <https://github.com/llvm/llvm-project/issues/56490>`_.
-
 Don't emit macros about module declaration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

From 262e4c19878175780c88da867e88fc4e202d4788 Mon Sep 17 00:00:00 2001
From: Haojian Wu <hokein.wu@gmail.com>
Date: Mon, 17 Feb 2025 10:47:43 +0100
Subject: [PATCH 108/109] Revert "[clang][Modules] Remove a resolved issue from
 StandardCPlusPlusModules.rst"

This reverts commit 82dc2d403066a84ef0051b06f1d179e00331f319.

The fix has been reverted in f63e8ed16ef1fd2deb80cd88b5ca9d5b631b1c36
---
 clang/docs/StandardCPlusPlusModules.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst
index 2720b40382717..93edce0cf90b7 100644
--- a/clang/docs/StandardCPlusPlusModules.rst
+++ b/clang/docs/StandardCPlusPlusModules.rst
@@ -1378,6 +1378,14 @@ merging happening in the semantic analyzer. This is due to a divergence in the
 implementation path. This is tracked by
 `#61465 <https://github.com/llvm/llvm-project/issues/61465>`_.
 
+Ignored ``preferred_name`` Attribute
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When Clang writes BMIs, it will ignore the ``preferred_name`` attribute on
+declarations which use it. Thus, the preferred name will not be displayed in
+the debugger as expected. This is tracked by
+`#56490 <https://github.com/llvm/llvm-project/issues/56490>`_.
+
 Don't emit macros about module declaration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

From e0e67a62076ad56f48c64a7cd2ebf5754b8326b7 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Mon, 17 Feb 2025 09:51:35 +0000
Subject: [PATCH 109/109] [LV] Add initial support for vectorizing literal
 struct return values (#109833)

This patch adds initial support for vectorizing literal struct return
values. Currently, this is limited to the case where the struct is
homogeneous (all elements have the same type) and not packed. The users
of the call also must all be `extractvalue` instructions.

The intended use case for this is vectorizing intrinsics such as:

```
declare { float, float } @llvm.sincos.f32(float %x)
```

Mapping them to structure-returning library calls such as:

```
declare { <4 x float>, <4 x float> } @Sleef_sincosf4_u10advsimd(<4 x float>)
```

Or their widened form (such as `@llvm.sincos.v4f32` in this case).

Implementing this required two main changes:

1. Supporting widening `extractvalue`
2. Adding support for vectorized struct types in LV
  * This is mostly limited to parts of the cost model and scalarization

Since the supported use case is narrow, the required changes are
relatively small.
---
 .../llvm/Analysis/TargetTransformInfo.h       |  14 ++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  15 +-
 .../Vectorize/LoopVectorizationLegality.h     |  10 -
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  10 +
 .../Vectorize/LoopVectorizationLegality.cpp   |  13 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 101 +++++----
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  27 ++-
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |   6 +
 llvm/lib/Transforms/Vectorize/VPlanHelpers.h  |   7 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  14 +-
 .../AArch64/scalable-struct-return.ll         |  34 ++-
 .../AArch64/struct-return-cost.ll             | 199 ++++++++++++++++++
 .../Transforms/LoopVectorize/struct-return.ll | 111 ++++++++--
 .../vplan-widen-struct-return.ll              | 122 +++++++++++
 14 files changed, 580 insertions(+), 103 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 08ab4ee2ec1cf..9048481b49189 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1473,6 +1473,12 @@ class TargetTransformInfo {
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index = -1) const;
 
+  /// \return The expected cost of aggregate inserts and extracts. This is
+  /// used when the instruction is not available; a typical use case is to
+  /// provision the cost of vectorization/scalarization in vectorizer passes.
+  InstructionCost getInsertExtractValueCost(unsigned Opcode,
+                                            TTI::TargetCostKind CostKind) const;
+
   /// \return The cost of replication shuffle of \p VF elements typed \p EltTy
   /// \p ReplicationFactor times.
   ///
@@ -2223,6 +2229,9 @@ class TargetTransformInfo::Concept {
                             const APInt &DemandedDstElts,
                             TTI::TargetCostKind CostKind) = 0;
 
+  virtual InstructionCost
+  getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) = 0;
+
   virtual InstructionCost
   getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                   unsigned AddressSpace, TTI::TargetCostKind CostKind,
@@ -2950,6 +2959,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
                                           DemandedDstElts, CostKind);
   }
+  InstructionCost
+  getInsertExtractValueCost(unsigned Opcode,
+                            TTI::TargetCostKind CostKind) override {
+    return Impl.getInsertExtractValueCost(Opcode, CostKind);
+  }
   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                   unsigned AddressSpace,
                                   TTI::TargetCostKind CostKind,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 5128c6b86a5f0..a8d6dd18266bb 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -745,6 +745,17 @@ class TargetTransformInfoImplBase {
     return 1;
   }
 
+  InstructionCost
+  getInsertExtractValueCost(unsigned Opcode,
+                            TTI::TargetCostKind CostKind) const {
+    // Note: The `insertvalue` cost here is chosen to match the default case of
+    // getInstructionCost() -- as prior to adding this helper `insertvalue` was
+    // not handled.
+    if (Opcode == Instruction::InsertValue)
+      return CostKind == TTI::TCK_RecipThroughput ? -1 : TTI::TCC_Basic;
+    return TTI::TCC_Free;
+  }
+
   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                   unsigned AddressSpace,
                                   TTI::TargetCostKind CostKind,
@@ -1306,9 +1317,11 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
     case Instruction::PHI:
     case Instruction::Switch:
       return TargetTTI->getCFInstrCost(Opcode, CostKind, I);
-    case Instruction::ExtractValue:
     case Instruction::Freeze:
       return TTI::TCC_Free;
+    case Instruction::ExtractValue:
+    case Instruction::InsertValue:
+      return TargetTTI->getInsertExtractValueCost(Opcode, CostKind);
     case Instruction::Alloca:
       if (cast<AllocaInst>(U)->isStaticAlloca())
         return TTI::TCC_Free;
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 3c5cf1ebe6ba2..e959d93b57275 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -416,10 +416,6 @@ class LoopVectorizationLegality {
   /// has a vectorized variant available.
   bool hasVectorCallVariants() const { return VecCallVariantsFound; }
 
-  /// Returns true if there is at least one function call in the loop which
-  /// returns a struct type and needs to be vectorized.
-  bool hasStructVectorCall() const { return StructVecCallFound; }
-
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
 
@@ -639,12 +635,6 @@ class LoopVectorizationLegality {
   /// the use of those function variants.
   bool VecCallVariantsFound = false;
 
-  /// If we find a call (to be vectorized) that returns a struct type, record
-  /// that so we can bail out until this is supported.
-  /// TODO: Remove this flag once vectorizing calls with struct returns is
-  /// supported.
-  bool StructVecCallFound = false;
-
   /// Keep track of all the countable and uncountable exiting blocks if
   /// the exact backedge taken count is not computable.
   SmallVector<BasicBlock *, 4> CountableExitingBlocks;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index dc066099bdc1d..1ca9a16b18112 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1113,6 +1113,16 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getInsertExtractValueCost(
+    unsigned Opcode, TTI::TargetCostKind CostKind) const {
+  assert((Opcode == Instruction::InsertValue ||
+          Opcode == Instruction::ExtractValue) &&
+         "Expecting Opcode to be insertvalue/extractvalue.");
+  InstructionCost Cost = TTIImpl->getInsertExtractValueCost(Opcode, CostKind);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getReplicationShuffleCost(
     Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts,
     TTI::TargetCostKind CostKind) const {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index e3599315e224f..420cbc5384ce4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -954,7 +954,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       if (CI && !VFDatabase::getMappings(*CI).empty())
         VecCallVariantsFound = true;
 
-      auto CanWidenInstructionTy = [this](Instruction const &Inst) {
+      auto CanWidenInstructionTy = [](Instruction const &Inst) {
         Type *InstTy = Inst.getType();
         if (!isa<StructType>(InstTy))
           return canVectorizeTy(InstTy);
@@ -962,15 +962,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // For now, we only recognize struct values returned from calls where
         // all users are extractvalue as vectorizable. All element types of the
         // struct must be types that can be widened.
-        if (isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
-            all_of(Inst.users(), IsaPred<ExtractValueInst>)) {
-          // TODO: Remove the `StructVecCallFound` flag once vectorizing calls
-          // with struct returns is supported.
-          StructVecCallFound = true;
-          return true;
-        }
-
-        return false;
+        return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
+               all_of(Inst.users(), IsaPred<ExtractValueInst>);
       };
 
       // Check that the instruction return type is vectorizable.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4685064407f08..8c41f896ad622 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2390,7 +2390,9 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
                                                VPReplicateRecipe *RepRecipe,
                                                const VPLane &Lane,
                                                VPTransformState &State) {
-  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+  assert((!Instr->getType()->isAggregateType() ||
+          canVectorizeTy(Instr->getType())) &&
+         "Expected vectorizable or non-aggregate type.");
 
   // Does this instruction return a value ?
   bool IsVoidRetTy = Instr->getType()->isVoidTy();
@@ -2900,10 +2902,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
   return ScalarCallCost;
 }
 
-static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
-  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
-    return Elt;
-  return VectorType::get(Elt, VF);
+static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
+  if (VF.isScalar() || !canVectorizeTy(Ty))
+    return Ty;
+  return toVectorizedTy(Ty, VF);
 }
 
 InstructionCost
@@ -3650,13 +3652,15 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
         }
       }
 
-      // ExtractValue instructions must be uniform, because the operands are
-      // known to be loop-invariant.
       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
-        assert(IsOutOfScope(EVI->getAggregateOperand()) &&
-               "Expected aggregate value to be loop invariant");
-        AddToWorklistIfAllowed(EVI);
-        continue;
+        if (IsOutOfScope(EVI->getAggregateOperand())) {
+          AddToWorklistIfAllowed(EVI);
+          continue;
+        }
+        // Only ExtractValue instructions where the aggregate value comes from a
+        // call are allowed to be non-uniform.
+        assert(isa<CallInst>(EVI->getAggregateOperand()) &&
+               "Expected aggregate value to be call return value");
       }
 
       // If there's no pointer operand, there's nothing to do.
@@ -4526,8 +4530,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
         llvm_unreachable("unhandled recipe");
       }
 
-      auto WillWiden = [&TTI, VF](Type *ScalarTy) {
-        Type *VectorTy = toVectorTy(ScalarTy, VF);
+      auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
         unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
         if (!NumLegalParts)
           return false;
@@ -4539,7 +4542,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
           // explicitly ask TTI about the register class uses for each part.
           return NumLegalParts <= VF.getKnownMinValue();
         }
-        // Two or more parts that share a register - are vectorized.
+        // Two or more elements that share a register - are vectorized.
         return NumLegalParts < VF.getKnownMinValue();
       };
 
@@ -4558,7 +4561,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
       if (!Visited.insert({ScalarTy}).second)
         continue;
-      if (WillWiden(ScalarTy))
+      Type *WideTy = toVectorizedTy(ScalarTy, VF);
+      if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
         return true;
     }
   }
@@ -5515,10 +5519,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
-      ScalarCost += TTI.getScalarizationOverhead(
-          cast<VectorType>(toVectorTy(I->getType(), VF)),
-          APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
-          /*Extract*/ false, CostKind);
+      Type *WideTy = toVectorizedTy(I->getType(), VF);
+      for (Type *VectorTy : getContainedTypes(WideTy)) {
+        ScalarCost += TTI.getScalarizationOverhead(
+            cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
+            /*Insert=*/true,
+            /*Extract=*/false, CostKind);
+      }
       ScalarCost +=
           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
     }
@@ -5529,15 +5536,18 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
     // overhead.
     for (Use &U : I->operands())
       if (auto *J = dyn_cast<Instruction>(U.get())) {
-        assert(VectorType::isValidElementType(J->getType()) &&
+        assert(canVectorizeTy(J->getType()) &&
                "Instruction has non-scalar type");
         if (CanBeScalarized(J))
           Worklist.push_back(J);
         else if (needsExtract(J, VF)) {
-          ScalarCost += TTI.getScalarizationOverhead(
-              cast<VectorType>(toVectorTy(J->getType(), VF)),
-              APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
-              /*Extract*/ true, CostKind);
+          Type *WideTy = toVectorizedTy(J->getType(), VF);
+          for (Type *VectorTy : getContainedTypes(WideTy)) {
+            ScalarCost += TTI.getScalarizationOverhead(
+                cast<VectorType>(VectorTy),
+                APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
+                /*Extract*/ true, CostKind);
+          }
         }
       }
 
@@ -6016,13 +6026,17 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
     return 0;
 
   InstructionCost Cost = 0;
-  Type *RetTy = toVectorTy(I->getType(), VF);
+  Type *RetTy = toVectorizedTy(I->getType(), VF);
   if (!RetTy->isVoidTy() &&
-      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
-    Cost += TTI.getScalarizationOverhead(
-        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
-        /*Insert*/ true,
-        /*Extract*/ false, CostKind);
+      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
+
+    for (Type *VectorTy : getContainedTypes(RetTy)) {
+      Cost += TTI.getScalarizationOverhead(
+          cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
+          /*Insert=*/true,
+          /*Extract=*/false, CostKind);
+    }
+  }
 
   // Some targets keep addresses scalar.
   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6280,9 +6294,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
 
       bool MaskRequired = Legal->isMaskRequired(CI);
       // Compute corresponding vector type for return value and arguments.
-      Type *RetTy = toVectorTy(ScalarRetTy, VF);
+      Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
       for (Type *ScalarTy : ScalarTys)
-        Tys.push_back(toVectorTy(ScalarTy, VF));
+        Tys.push_back(toVectorizedTy(ScalarTy, VF));
 
       // An in-loop reduction using an fmuladd intrinsic is a special case;
       // we don't want the normal cost for that intrinsic.
@@ -6459,7 +6473,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
            HasSingleCopyAfterVectorization(I, VF));
     VectorTy = RetTy;
   } else
-    VectorTy = toVectorTy(RetTy, VF);
+    VectorTy = toVectorizedTy(RetTy, VF);
 
   if (VF.isVector() && VectorTy->isVectorTy() &&
       !TTI.getNumberOfParts(VectorTy))
@@ -8601,7 +8615,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
   case Instruction::Shl:
   case Instruction::Sub:
   case Instruction::Xor:
-  case Instruction::Freeze:
+  case Instruction::Freeze: {
     SmallVector<VPValue *> NewOps(Operands);
     if (Instruction::isBinaryOp(I->getOpcode())) {
       // The legacy cost model uses SCEV to check if some of the operands are
@@ -8626,6 +8640,16 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
       NewOps[1] = GetConstantViaSCEV(NewOps[1]);
     }
     return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
+  }
+  case Instruction::ExtractValue: {
+    SmallVector<VPValue *> NewOps(Operands);
+    Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
+    auto *EVI = cast<ExtractValueInst>(I);
+    assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
+    unsigned Idx = EVI->getIndices()[0];
+    NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
+    return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
+  }
   };
 }
 
@@ -9928,7 +9952,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
             VectorType::get(UI->getType(), State.VF));
         State.set(this, Poison);
       }
-      State.packScalarIntoVectorValue(this, *State.Lane);
+      State.packScalarIntoVectorizedValue(this, *State.Lane);
     }
     return;
   }
@@ -10445,13 +10469,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
-  if (LVL.hasStructVectorCall()) {
-    reportVectorizationFailure("Auto-vectorization of calls that return struct "
-                               "types is not yet supported",
-                               "StructCallVectorizationUnsupported", ORE, L);
-    return false;
-  }
-
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before
   // even evaluating whether vectorization is profitable. Since we cannot modify
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index b0cac4f78ff3c..1332e50252978 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -336,10 +336,10 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
   } else {
     // Initialize packing with insertelements to start from undef.
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
-    Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
+    Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
     set(Def, Undef);
     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
-      packScalarIntoVectorValue(Def, Lane);
+      packScalarIntoVectorizedValue(Def, Lane);
     VectorValue = get(Def);
   }
   Builder.restoreIP(OldIP);
@@ -392,13 +392,24 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
     Builder.SetCurrentDebugLocation(DIL);
 }
 
-void VPTransformState::packScalarIntoVectorValue(VPValue *Def,
-                                                 const VPLane &Lane) {
+void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def,
+                                                     const VPLane &Lane) {
   Value *ScalarInst = get(Def, Lane);
-  Value *VectorValue = get(Def);
-  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
-                                            Lane.getAsRuntimeExpr(Builder, VF));
-  set(Def, VectorValue);
+  Value *WideValue = get(Def);
+  Value *LaneExpr = Lane.getAsRuntimeExpr(Builder, VF);
+  if (auto *StructTy = dyn_cast<StructType>(WideValue->getType())) {
+    // We must handle each element of a vectorized struct type.
+    for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) {
+      Value *ScalarValue = Builder.CreateExtractValue(ScalarInst, I);
+      Value *VectorValue = Builder.CreateExtractValue(WideValue, I);
+      VectorValue =
+          Builder.CreateInsertElement(VectorValue, ScalarValue, LaneExpr);
+      WideValue = Builder.CreateInsertValue(WideValue, VectorValue, I);
+    }
+  } else {
+    WideValue = Builder.CreateInsertElement(WideValue, ScalarInst, LaneExpr);
+  }
+  set(Def, WideValue);
 }
 
 BasicBlock *VPBasicBlock::createEmptyBasicBlock(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 71fb6d42116cf..bf61251fc9133 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -125,6 +125,12 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) {
   case Instruction::FNeg:
   case Instruction::Freeze:
     return inferScalarType(R->getOperand(0));
+  case Instruction::ExtractValue: {
+    assert(R->getNumOperands() == 2 && "expected single level extractvalue");
+    auto *StructTy = cast<StructType>(inferScalarType(R->getOperand(0)));
+    auto *CI = cast<ConstantInt>(R->getOperand(1)->getLiveInIRValue());
+    return StructTy->getTypeAtIndex(CI->getZExtValue());
+  }
   default:
     break;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 74713daf904f0..cd1ad9bec91f7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -241,7 +241,7 @@ struct VPTransformState {
       set(Def, V, VPLane(0));
       return;
     }
-    assert((VF.isScalar() || V->getType()->isVectorTy()) &&
+    assert((VF.isScalar() || isVectorizedTy(V->getType())) &&
            "scalar values must be stored as (0, 0)");
     Data.VPV2Vector[Def] = V;
   }
@@ -290,8 +290,9 @@ struct VPTransformState {
   /// Set the debug location in the builder using the debug location \p DL.
   void setDebugLocFrom(DebugLoc DL);
 
-  /// Construct the vector value of a scalarized value \p V one lane at a time.
-  void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane);
+  /// Construct the vectorized value of a scalarized value \p Def one lane at a
+  /// time.
+  void packScalarIntoVectorizedValue(VPValue *Def, const VPLane &Lane);
 
   /// Hold state information used when constructing the CFG of the output IR,
   /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f5d5e12b1c85d..1bba667c206cf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1169,7 +1169,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
     Arguments.push_back(V);
   }
 
-  Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
+  Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
   SmallVector<Type *> ParamTys;
   for (unsigned I = 0; I != getNumOperands(); ++I)
     ParamTys.push_back(
@@ -1475,6 +1475,14 @@ void VPWidenRecipe::execute(VPTransformState &State) {
     State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
     break;
   }
+  case Instruction::ExtractValue: {
+    assert(getNumOperands() == 2 && "expected single level extractvalue");
+    Value *Op = State.get(getOperand(0));
+    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
+    Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue());
+    State.set(this, Extract);
+    break;
+  }
   case Instruction::Freeze: {
     Value *Op = State.get(getOperand(0));
 
@@ -1576,6 +1584,10 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
     return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
                                           Ctx.CostKind);
   }
+  case Instruction::ExtractValue: {
+    return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
+                                             Ctx.CostKind);
+  }
   case Instruction::ICmp:
   case Instruction::FCmp: {
     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
index 77781f95b0858..2fde624624ee9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
@@ -1,15 +1,18 @@
-; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S -pass-remarks-analysis=loop-vectorize 2>%t | FileCheck %s
-; RUN: cat %t | FileCheck --check-prefix=CHECK-REMARKS %s
+; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
 ; Tests basic vectorization of scalable homogeneous struct literal returns.
 
-; TODO: Support vectorization in this case.
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
 define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
 ; CHECK-LABEL: define void @struct_return_f32_widen
-; CHECK-NOT:   vector.body:
+; CHECK-SAME:  (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; CHECK:       vector.body:
+; CHECK:         [[WIDE_CALL:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; CHECK:         [[WIDE_A:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[WIDE_CALL]], 0
+; CHECK:         [[WIDE_B:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[WIDE_CALL]], 1
+; CHECK:         call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[WIDE_A]], ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK:         call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[WIDE_B]], ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
 entry:
   br label %for.body
 
@@ -32,11 +35,15 @@ exit:
   ret void
 }
 
-; TODO: Support vectorization in this case.
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
 define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
 ; CHECK-LABEL: define void @struct_return_f64_widen
-; CHECK-NOT:   vector.body:
+; CHECK-SAME:  (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; CHECK:       vector.body:
+; CHECK:         [[WIDE_CALL:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @scalable_vec_masked_bar(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; CHECK:         [[WIDE_A:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[WIDE_CALL]], 0
+; CHECK:         [[WIDE_B:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[WIDE_CALL]], 1
+; CHECK:         call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[WIDE_A]], ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK:         call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[WIDE_B]], ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 entry:
   br label %for.body
 
@@ -59,11 +66,16 @@ exit:
   ret void
 }
 
-; TODO: Support vectorization in this case.
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
 define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) {
 ; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks
-; CHECK-NOT:   vector.body:
+; CHECK-SAME:  (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]])
+; CHECK:       entry:
+; CHECK:         br i1 false, label %scalar.ph, label %vector.memcheck
+; CHECK:       vector.memcheck:
+; CHECK:       vector.body:
+; CHECK:         call { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; CHECK:       for.body:
+; CHECK:         call { float, float } @foo(float [[LOAD:%.*]])
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll
new file mode 100644
index 0000000000000..c721493243734
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll
@@ -0,0 +1,199 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|@)" --version 5
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize < %s -S -o - 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-COST
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-COST-LABEL: struct_return_widen
+; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction:   %call = tail call { half, half } @foo(half %in_val)
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_a = extractvalue { half, half } %call, 0
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_b = extractvalue { half, half } %call, 1
+;
+; CHECK-COST: Cost of 10 for VF 2: WIDEN-CALL ir<%call> = call  @foo(ir<%in_val>) (using library function: fixed_vec_foo)
+; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+;
+; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+;
+; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+
+define void @struct_return_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_widen(
+; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) {
+; CHECK:  [[ENTRY:.*:]]
+; CHECK:  [[VECTOR_PH:.*:]]
+; CHECK:  [[VECTOR_BODY:.*:]]
+; CHECK:    [[TMP2:%.*]] = call { <2 x half>, <2 x half> } @fixed_vec_foo(<2 x half> [[WIDE_LOAD:%.*]])
+; CHECK:    [[TMP3:%.*]] = call { <2 x half>, <2 x half> } @fixed_vec_foo(<2 x half> [[WIDE_LOAD1:%.*]])
+; CHECK:  [[MIDDLE_BLOCK:.*:]]
+; CHECK:  [[SCALAR_PH:.*:]]
+; CHECK:  [[FOR_BODY:.*:]]
+; CHECK:    [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR2:[0-9]+]]
+; CHECK:  [[EXIT:.*:]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds half, ptr %in, i64 %iv
+  %in_val = load half, ptr %arrayidx, align 2
+  %call = tail call { half, half } @foo(half %in_val) #0
+  %extract_a = extractvalue { half, half } %call, 0
+  %extract_b = extractvalue { half, half } %call, 1
+  %arrayidx2 = getelementptr inbounds half, ptr %out_a, i64 %iv
+  store half %extract_a, ptr %arrayidx2, align 2
+  %arrayidx4 = getelementptr inbounds half, ptr %out_b, i64 %iv
+  store half %extract_b, ptr %arrayidx4, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; CHECK-COST-LABEL: struct_return_replicate
+; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction:   %call = tail call { half, half } @foo(half %in_val)
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_a = extractvalue { half, half } %call, 0
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_b = extractvalue { half, half } %call, 1
+;
+; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+;
+; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+;
+; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+
+define void @struct_return_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_replicate(
+; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) {
+; CHECK:  [[ENTRY:.*:]]
+; CHECK:  [[VECTOR_PH:.*:]]
+; CHECK:  [[VECTOR_BODY:.*:]]
+; CHECK:    [[TMP4:%.*]] = tail call { half, half } @foo(half [[TMP3:%.*]]) #[[ATTR3:[0-9]+]]
+; CHECK:    [[TMP6:%.*]] = tail call { half, half } @foo(half [[TMP5:%.*]]) #[[ATTR3]]
+; CHECK:  [[MIDDLE_BLOCK:.*:]]
+; CHECK:  [[SCALAR_PH:.*:]]
+; CHECK:  [[FOR_BODY:.*:]]
+; CHECK:    [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR3]]
+; CHECK:  [[EXIT:.*:]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds half, ptr %in, i64 %iv
+  %in_val = load half, ptr %arrayidx, align 2
+  ; #1 does not have a fixed-size vector mapping (so replication is used)
+  %call = tail call { half, half } @foo(half %in_val) #1
+  %extract_a = extractvalue { half, half } %call, 0
+  %extract_b = extractvalue { half, half } %call, 1
+  %arrayidx2 = getelementptr inbounds half, ptr %out_a, i64 %iv
+  store half %extract_a, ptr %arrayidx2, align 2
+  %arrayidx4 = getelementptr inbounds half, ptr %out_b, i64 %iv
+  store half %extract_b, ptr %arrayidx4, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; CHECK-COST-LABEL: struct_return_scalable
+; CHECK-COST: LV: Found an estimated cost of 10 for VF 1 For instruction:   %call = tail call { half, half } @foo(half %in_val)
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_a = extractvalue { half, half } %call, 0
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %extract_b = extractvalue { half, half } %call, 1
+;
+; CHECK-COST: Cost of 26 for VF 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+;
+; CHECK-COST: Cost of 58 for VF 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+;
+; CHECK-COST: Cost of 122 for VF 8: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+;
+; CHECK-COST: Cost of Invalid for VF vscale x 1: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF vscale x 1: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+;
+; CHECK-COST: Cost of Invalid for VF vscale x 2: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF vscale x 2: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+;
+; CHECK-COST: Cost of Invalid for VF vscale x 4: REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF vscale x 4: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+;
+; CHECK-COST: Cost of 10 for VF vscale x 8: WIDEN-CALL ir<%call> = call  @foo(ir<%in_val>, ir<true>) (using library function: scalable_vec_masked_foo)
+; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-COST: Cost of 0 for VF vscale x 8: WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+
+define void @struct_return_scalable(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) #2 {
+; CHECK-LABEL: define void @struct_return_scalable(
+; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK:  [[ENTRY:.*:]]
+; CHECK:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK:  [[VECTOR_PH:.*:]]
+; CHECK:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK:  [[VECTOR_BODY:.*:]]
+; CHECK:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK:    [[TMP12:%.*]] = call { <vscale x 8 x half>, <vscale x 8 x half> } @scalable_vec_masked_foo(<vscale x 8 x half> [[WIDE_LOAD:%.*]], <vscale x 8 x i1> splat (i1 true))
+; CHECK:    [[TMP13:%.*]] = call { <vscale x 8 x half>, <vscale x 8 x half> } @scalable_vec_masked_foo(<vscale x 8 x half> [[WIDE_LOAD1:%.*]], <vscale x 8 x i1> splat (i1 true))
+; CHECK:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK:  [[MIDDLE_BLOCK:.*:]]
+; CHECK:  [[SCALAR_PH:.*:]]
+; CHECK:  [[FOR_BODY:.*:]]
+; CHECK:    [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR3]]
+; CHECK:  [[EXIT:.*:]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds half, ptr %in, i64 %iv
+  %in_val = load half, ptr %arrayidx, align 2
+  %call = tail call { half, half } @foo(half %in_val) #1
+  %extract_a = extractvalue { half, half } %call, 0
+  %extract_b = extractvalue { half, half } %call, 1
+  %arrayidx2 = getelementptr inbounds half, ptr %out_a, i64 %iv
+  store half %extract_a, ptr %arrayidx2, align 2
+  %arrayidx4 = getelementptr inbounds half, ptr %out_b, i64 %iv
+  store half %extract_b, ptr %arrayidx4, align 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+
+declare { half, half } @foo(half)
+
+declare { <2 x half>, <2 x half> } @fixed_vec_foo(<2 x half>)
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @scalable_vec_masked_foo(<vscale x 8 x half>, <vscale x 8 x i1>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" }
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" }
+attributes #2 = { "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll
index 9f98e8af2e98c..1b2a809a552d8 100644
--- a/llvm/test/Transforms/LoopVectorize/struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -1,15 +1,20 @@
-; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S -pass-remarks-analysis=loop-vectorize 2>%t | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize 2>%t | FileCheck %s
 ; RUN: cat %t | FileCheck --check-prefix=CHECK-REMARKS %s
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 
 ; Tests basic vectorization of homogeneous struct literal returns.
 
-; TODO: Support vectorization in this case.
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+; CHECK-REMARKS: remark: {{.*}} vectorized loop
 define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
 ; CHECK-LABEL: define void @struct_return_f32_widen
-; CHECK-NOT:   vector.body:
+; CHECK-SAME:  (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; CHECK:       vector.body:
+; CHECK:         [[WIDE_CALL:%.*]] = call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]])
+; CHECK:         [[WIDE_A:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 0
+; CHECK:         [[WIDE_B:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 1
+; CHECK:         store <2 x float> [[WIDE_A]], ptr {{%.*}}, align 4
+; CHECK:         store <2 x float> [[WIDE_B]], ptr {{%.*}}, align 4
 entry:
   br label %for.body
 
@@ -32,11 +37,16 @@ exit:
   ret void
 }
 
-; TODO: Support vectorization in this case.
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+; CHECK-REMARKS: remark: {{.*}} vectorized loop
 define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
 ; CHECK-LABEL: define void @struct_return_f64_widen
-; CHECK-NOT:   vector.body:
+; CHECK-SAME:  (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; CHECK:        vector.body:
+; CHECK:          [[WIDE_CALL:%.*]] = call { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double> [[WIDE_LOAD:%.*]])
+; CHECK:          [[WIDE_A:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 0
+; CHECK:          [[WIDE_B:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 1
+; CHECK:          store <2 x double> [[WIDE_A]], ptr {{%.*}}, align 8
+; CHECK:          store <2 x double> [[WIDE_B]], ptr {{%.*}}, align 8
 entry:
   br label %for.body
 
@@ -59,11 +69,36 @@ exit:
   ret void
 }
 
-; TODO: Support vectorization in this case.
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+; CHECK-REMARKS: remark: {{.*}} vectorized loop
+; Note: Later instcombines reduce this down quite a lot.
 define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
 ; CHECK-LABEL: define void @struct_return_f32_replicate
-; CHECK-NOT:   vector.body:
+; CHECK-SAME:  (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; CHECK:       vector.body:
+; CHECK:         [[CALL_LANE_0:%.*]] = tail call { float, float } @foo(float {{%.*}})
+; CHECK:         [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}})
+;                // Lane 0
+; CHECK:         [[A_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0
+; CHECK:         [[VEC_A_0:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
+; CHECK:         [[WIDE_A_0:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VEC_A_0]], 0
+; CHECK:         [[B_0:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1
+; CHECK:         [[UNDEF_B_0:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], 1
+; CHECK:         [[VEC_B_0:%.*]] = insertelement <2 x float> [[UNDEF_B_0]], float [[B_0]], i32 0
+; CHECK:         [[WIDE_0:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A_0]], <2 x float> [[VEC_B_0]], 1
+;                // Lane 1
+; CHECK:         [[A_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0
+; CHECK:         [[VEC_A_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_0]], 0
+; CHECK:         [[VEC_A:%.*]] = insertelement <2 x float> [[VEC_A_0_EXT]], float [[A_1]], i32 1
+; CHECK:         [[WIDE_A:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_0]], <2 x float> [[VEC_A]], 0
+; CHECK:         [[B_1:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1
+; CHECK:         [[VEC_B_0_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_A]], 1
+; CHECK:         [[VEC_B:%.*]] = insertelement <2 x float> [[VEC_B_0_EXT]], float [[B_1]], i32 1
+; CHECK:         [[WIDE:%.*]] = insertvalue { <2 x float>, <2 x float> } [[WIDE_A]], <2 x float> [[VEC_B]], 1
+;                // Store wide values:
+; CHECK:         [[VEC_A_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 0
+; CHECK:         [[VEC_B_EXT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE]], 1
+; CHECK:         store <2 x float> [[VEC_A_EXT]], ptr {{%.*}}, align 4
+; CHECK:         store <2 x float> [[VEC_B_EXT]], ptr {{%.*}}, align 4
 entry:
   br label %for.body
 
@@ -87,11 +122,17 @@ exit:
   ret void
 }
 
-; TODO: Support vectorization in this case.
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+; CHECK-REMARKS: remark: {{.*}} vectorized loop
 define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) {
 ; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks
-; CHECK-NOT:   vector.body:
+; CHECK-SAME:  (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]])
+; CHECK:       entry:
+; CHECK:         br i1 false, label %scalar.ph, label %vector.memcheck
+; CHECK:       vector.memcheck:
+; CHECK:       vector.body:
+; CHECK:         call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]])
+; CHECK:       for.body:
+; CHECK:         call { float, float } @foo(float [[LOAD:%.*]])
 entry:
   br label %for.body
 
@@ -143,11 +184,11 @@ exit:
   ret void
 }
 
-; TODO: Support vectorization in this case.
-; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+; CHECK-REMARKS: remark: {{.*}} vectorized loop
 define void @struct_return_i32_three_results_widen(ptr noalias %in, ptr noalias writeonly %out_a) {
 ; CHECK-LABEL: define void @struct_return_i32_three_results_widen
-; CHECK-NOT:   vector.body:
+; CHECK:   vector.body:
+; CHECK:     call { <2 x i32>, <2 x i32>, <2 x i32> } @fixed_vec_qux(<2 x i32> [[WIDE_LOAD:%.*]])
 entry:
   br label %for.body
 
@@ -167,6 +208,40 @@ exit:
   ret void
 }
 
+; Test crafted to exercise computePredInstDiscount with struct results
+; (mainly to check that it does not crash).
+; CHECK-REMARKS: remark: {{.*}} vectorized loop
+define void @scalarized_predicated_struct_return(ptr %a) optsize {
+; CHECK-LABEL: define void @scalarized_predicated_struct_return
+; CHECK:  vector.body:
+; CHECK:  pred.store.if:
+; CHECK:     tail call { i64, i64 } @bar_i64(i64 {{%.*}})
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
+  %in_val = load i64, ptr %arrayidx, align 8
+  %sgt_zero = icmp sgt i64 %in_val, 0
+  br i1 %sgt_zero, label %if.then, label %for.inc
+
+if.then:
+  %call = tail call { i64, i64 } @bar_i64(i64 %in_val) #6
+  %extract_a = extractvalue { i64, i64 } %call, 0
+  %div = udiv i64 %extract_a, %in_val
+  store i64 %div, ptr %arrayidx, align 8
+  br label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
 ; Negative test. Widening structs of vectors is not supported.
 ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
 define void @negative_struct_of_vectors(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
@@ -390,13 +465,14 @@ declare { [2 x float] } @foo_arrays(float)
 declare { float, [1 x float] } @foo_one_non_widenable_element(float)
 declare { <1 x float>, <1 x float> } @foo_vectors(<1 x float>)
 declare { i32, i32, i32 } @qux(i32)
+declare { i64, i64 } @bar_i64(i64)
 
 declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>)
 declare { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double>)
 declare { <2 x float>, <2 x i32> } @fixed_vec_baz(<2 x float>)
 declare { <2 x i32>, <2 x i32>, <2 x i32> } @fixed_vec_qux(<2 x i32>)
-
 declare { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float>, <vscale x 4 x i1>)
+declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @scalable_vec_masked_bar_i64(<vscale x 4 x i64>, <vscale x 4 x i1>)
 
 attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" }
 attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar(fixed_vec_bar)" }
@@ -404,3 +480,4 @@ attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_baz(fixed_vec
 attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" }
 attributes #4 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar_named(fixed_vec_bar)" }
 attributes #5 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_qux(fixed_vec_qux)" }
+attributes #6 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_bar_i64(scalable_vec_masked_bar_i64)" }
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll
new file mode 100644
index 0000000000000..bb61398ae5a6d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll
@@ -0,0 +1,122 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -debug-only=loop-vectorize -disable-output -S 2>&1 | FileCheck %s
+
+define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: LV: Checking a loop in 'struct_return_f32_widen'
+; CHECK:       VPlan 'Initial VPlan for VF={2},UF>=1' {
+; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT:  Live-in ir<1024> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<entry>:
+; CHECK-NEXT:  Successor(s): vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%in>, vp<[[STEPS]]>
+; CHECK-NEXT:      vp<[[IN_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:      WIDEN ir<%in_val> = load vp<[[IN_VEC_PTR]]>
+; CHECK-NEXT:      WIDEN-CALL ir<%call> = call  @foo(ir<%in_val>) (using library function: fixed_vec_foo)
+; CHECK-NEXT:      WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-NEXT:      WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-NEXT:      CLONE ir<%arrayidx2> = getelementptr inbounds ir<%out_a>, vp<[[STEPS]]>
+; CHECK-NEXT:      vp<[[OUT_A_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT:      WIDEN store vp<[[OUT_A_VEC_PTR]]>, ir<%extract_a>
+; CHECK-NEXT:      CLONE ir<%arrayidx4> = getelementptr inbounds ir<%out_b>, vp<[[STEPS]]>
+; CHECK-NEXT:      vp<[[OUT_B_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx4>
+; CHECK-NEXT:      WIDEN store vp<[[OUT_B_VEC_PTR]]>, ir<%extract_b>
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, float } @foo(float %in_val) #0
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: LV: Checking a loop in 'struct_return_f32_replicate'
+; CHECK:       VPlan 'Initial VPlan for VF={2},UF>=1' {
+; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT:  Live-in ir<1024> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<entry>:
+; CHECK-NEXT:  Successor(s): vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%in>, vp<[[STEPS]]>
+; CHECK-NEXT:      vp<[[IN_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:      WIDEN ir<%in_val> = load vp<[[IN_VEC_PTR]]>
+; CHECK-NEXT:      REPLICATE ir<%call> = call @foo(ir<%in_val>)
+; CHECK-NEXT:      WIDEN ir<%extract_a> = extractvalue ir<%call>, ir<0>
+; CHECK-NEXT:      WIDEN ir<%extract_b> = extractvalue ir<%call>, ir<1>
+; CHECK-NEXT:      CLONE ir<%arrayidx2> = getelementptr inbounds ir<%out_a>, vp<[[STEPS]]>
+; CHECK-NEXT:      vp<[[OUT_A_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT:      WIDEN store vp<[[OUT_A_VEC_PTR]]>, ir<%extract_a>
+; CHECK-NEXT:      CLONE ir<%arrayidx4> = getelementptr inbounds ir<%out_b>, vp<[[STEPS]]>
+; CHECK-NEXT:      vp<[[OUT_B_VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx4>
+; CHECK-NEXT:      WIDEN store vp<[[OUT_B_VEC_PTR]]>, ir<%extract_b>
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  ; #1 does not have a fixed-size vector mapping (so replication is used)
+  %call = tail call { float, float } @foo(float %in_val) #1
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+
+declare { float, float } @foo(float)
+
+declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float>, <vscale x 4 x i1>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" }
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" }