From 3063153b5643e5ed04e8a9d7b50feecf3eba325e Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 14 Feb 2025 21:10:08 -0800 Subject: [PATCH 001/109] [clang-format] Fix a bug in annotating ObjCMethodSpecifier (#127159) Fixes #58202. --- clang/lib/Format/TokenAnnotator.cpp | 2 +- clang/unittests/Format/FormatTestObjC.cpp | 7 +++++++ clang/unittests/Format/TokenAnnotatorTest.cpp | 7 +++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index b3540f39e6f69..069fd40e2834c 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -1313,7 +1313,7 @@ class AnnotatingParser { switch (bool IsIf = false; Tok->Tok.getKind()) { case tok::plus: case tok::minus: - if (!Tok->Previous && Line.MustBeDeclaration) + if (!Tok->getPreviousNonComment() && Line.MustBeDeclaration) Tok->setType(TT_ObjCMethodSpecifier); break; case tok::colon: diff --git a/clang/unittests/Format/FormatTestObjC.cpp b/clang/unittests/Format/FormatTestObjC.cpp index 9b6f0c396d4db..f7f73db62045c 100644 --- a/clang/unittests/Format/FormatTestObjC.cpp +++ b/clang/unittests/Format/FormatTestObjC.cpp @@ -567,6 +567,13 @@ TEST_F(FormatTestObjC, FormatObjCMethodDeclarations) { " error:(NSError **)theError {\n" "}"); verifyFormat("+ (instancetype)new;"); + + verifyFormat("/*\n" + " */\n" + "- (void)foo;", + "/*\n" + " */- (void)foo;"); + Style.ColumnLimit = 60; verifyFormat("- (instancetype)initXxxxxx:(id)x\n" " y:(id)y\n" diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 5ab0867490122..7b489b1764cb2 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -1849,6 +1849,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsObjCMethodExpr) { EXPECT_TOKEN(Tokens[15], tok::greater, TT_BinaryOperator); } +TEST_F(TokenAnnotatorTest, UnderstandsObjCMethodDecl) { + auto Tokens = annotate("/**/ 
- (void)foo;"); + ASSERT_EQ(Tokens.size(), 8u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::minus, TT_ObjCMethodSpecifier); + EXPECT_TOKEN(Tokens[5], tok::identifier, TT_SelectorName); +} + TEST_F(TokenAnnotatorTest, UnderstandsLambdas) { auto Tokens = annotate("[]() constexpr {}"); ASSERT_EQ(Tokens.size(), 8u) << Tokens; From 66465c3b0ab1b32403ad5a1c3114174d87830f54 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 14 Feb 2025 21:23:33 -0800 Subject: [PATCH 002/109] Revert "Make llvm::telemetry::Manager::preDispatch protected. (#127114)" This reverts commit f7a2d70bd91094e7a85f7e189602c826a3eeb6cd. Multiple buildbot failures have been reported. See: https://github.com/llvm/llvm-project/pull/127114 --- llvm/include/llvm/Telemetry/Telemetry.h | 9 ++++----- llvm/lib/Telemetry/Telemetry.cpp | 13 ------------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/Telemetry/Telemetry.h b/llvm/include/llvm/Telemetry/Telemetry.h index 8efea645ab51c..344a49df5cbf0 100644 --- a/llvm/include/llvm/Telemetry/Telemetry.h +++ b/llvm/include/llvm/Telemetry/Telemetry.h @@ -138,6 +138,10 @@ class Manager { public: virtual ~Manager() = default; + // Optional callback for subclasses to perform additional tasks before + // dispatching to Destinations. + virtual Error preDispatch(TelemetryInfo *Entry) = 0; + // Dispatch Telemetry data to the Destination(s). // The argument is non-const because the Manager may add or remove // data from the entry. @@ -146,11 +150,6 @@ class Manager { // Register a Destination. void addDestination(std::unique_ptr Destination); -protected: - // Optional callback for subclasses to perform additional tasks before - // dispatching to Destinations. 
- virtual Error preDispatch(TelemetryInfo *Entry) {} - private: std::vector> Destinations; }; diff --git a/llvm/lib/Telemetry/Telemetry.cpp b/llvm/lib/Telemetry/Telemetry.cpp index badb07bff5477..9e13d08334e3b 100644 --- a/llvm/lib/Telemetry/Telemetry.cpp +++ b/llvm/lib/Telemetry/Telemetry.cpp @@ -1,16 +1,3 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file provides the basic framework for Telemetry. -/// Refer to its documentation at llvm/docs/Telemetry.rst for more details. -//===---------------------------------------------------------------------===// - #include "llvm/Telemetry/Telemetry.h" namespace llvm { From 8fff0c181f26a5e8b2344c061ebf2559118b1160 Mon Sep 17 00:00:00 2001 From: Jordan R AW Date: Fri, 14 Feb 2025 21:37:39 -0800 Subject: [PATCH 003/109] [lldb] Add terminfo dependency for ncurses support (#126810) For some operating systems (e.g. chromiumos), terminfo is a separate package and library from ncurses. Both are still requirements for curses support in lldb, individually. This is a rework of this original spack commit: https://github.com/spack/spack/commit/9ea261265010eacd250691a8361f661d0576f25c Instead though, this PR uses CMake to detect whether the symbol is present and defined in the curses library, and only falls back to a separate tinfo if not found. Without this fix, LLDB cannot be built on these systems. 
Fixes #101368 --- lldb/cmake/modules/FindCursesAndPanel.cmake | 42 ++++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/lldb/cmake/modules/FindCursesAndPanel.cmake b/lldb/cmake/modules/FindCursesAndPanel.cmake index aaadf214bf54b..75ebaa35d7ea1 100644 --- a/lldb/cmake/modules/FindCursesAndPanel.cmake +++ b/lldb/cmake/modules/FindCursesAndPanel.cmake @@ -2,23 +2,55 @@ # FindCursesAndPanel # ----------- # -# Find the curses and panel library as a whole. +# Find the curses, terminfo, and panel library as a whole. -if(CURSES_INCLUDE_DIRS AND CURSES_LIBRARIES AND PANEL_LIBRARIES) +include(CMakePushCheckState) + +function(lldb_check_curses_tinfo CURSES_LIBRARIES CURSES_HAS_TINFO) + cmake_reset_check_state() + set(CMAKE_REQUIRED_LIBRARIES "${CURSES_LIBRARIES}") + # acs_map is one of many symbols that are part of tinfo but could + # be bundled in curses. + check_symbol_exists(acs_map "curses.h" CURSES_HAS_TINFO) +endfunction() + +if(CURSES_INCLUDE_DIRS AND CURSES_LIBRARIES AND TINFO_LIBRARIES AND PANEL_LIBRARIES) set(CURSESANDPANEL_FOUND TRUE) else() find_package(Curses QUIET) find_library(PANEL_LIBRARIES NAMES panel DOC "The curses panel library" QUIET) include(FindPackageHandleStandardArgs) + + if(CURSES_FOUND AND PANEL_LIBRARIES) + # Sometimes the curses libraries define their own terminfo symbols, + # other times they're extern and are defined by a separate terminfo library. + # Auto-detect which. 
+ lldb_check_curses_tinfo("${CURSES_LIBRARIES}" CURSES_HAS_TINFO) + if (NOT CURSES_HAS_TINFO) + message(STATUS "curses library missing terminfo symbols, looking for tinfo separately") + find_library(TINFO_LIBRARIES NAMES tinfo DOC "The curses tinfo library" QUIET) + list(APPEND CURSES_LIBRARIES "${TINFO_LIBRARIES}") + endif() + set(HAS_TERMINFO_SYMBOLS "$,$>") + endif() + find_package_handle_standard_args(CursesAndPanel FOUND_VAR CURSESANDPANEL_FOUND REQUIRED_VARS CURSES_INCLUDE_DIRS CURSES_LIBRARIES - PANEL_LIBRARIES) - if(CURSES_FOUND AND PANEL_LIBRARIES) - mark_as_advanced(CURSES_INCLUDE_DIRS CURSES_LIBRARIES PANEL_LIBRARIES) + PANEL_LIBRARIES + HAS_TERMINFO_SYMBOLS) + + if(CURSES_FOUND AND PANEL_LIBRARIES AND HAS_TERMINFO_SYMBOLS) + mark_as_advanced(CURSES_INCLUDE_DIRS + PANEL_LIBRARIES + HAS_TERMINFO_SYMBOLS + CURSES_HAS_TINFO) + endif() + if(TINFO_LIBRARIES) + mark_as_advanced(TINFO_LIBRARIES) endif() endif() From ed32d85d31999756602a7d5c4647cb6771d8f857 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 14 Feb 2025 22:01:29 -0800 Subject: [PATCH 004/109] [lldb] Use async output & error stream for EvaluateExpression Similar to #126821, in support of #126630. 
--- lldb/source/Commands/CommandObjectExpression.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/lldb/source/Commands/CommandObjectExpression.cpp b/lldb/source/Commands/CommandObjectExpression.cpp index 13491b5c79442..7e26381c92405 100644 --- a/lldb/source/Commands/CommandObjectExpression.cpp +++ b/lldb/source/Commands/CommandObjectExpression.cpp @@ -500,19 +500,17 @@ bool CommandObjectExpression::EvaluateExpression(llvm::StringRef expr, void CommandObjectExpression::IOHandlerInputComplete(IOHandler &io_handler, std::string &line) { io_handler.SetIsDone(true); - StreamFileSP output_sp = io_handler.GetOutputStreamFileSP(); - StreamFileSP error_sp = io_handler.GetErrorStreamFileSP(); + StreamSP output_stream = + GetCommandInterpreter().GetDebugger().GetAsyncOutputStream(); + StreamSP error_stream = + GetCommandInterpreter().GetDebugger().GetAsyncErrorStream(); CommandReturnObject return_obj( GetCommandInterpreter().GetDebugger().GetUseColor()); - EvaluateExpression(line.c_str(), *output_sp, *error_sp, return_obj); + EvaluateExpression(line.c_str(), *output_stream, *error_stream, return_obj); - if (output_sp) - output_sp->Flush(); - if (error_sp) { - *error_sp << return_obj.GetErrorString(); - error_sp->Flush(); - } + output_stream->Flush(); + *error_stream << return_obj.GetErrorString(); } bool CommandObjectExpression::IOHandlerIsInputComplete(IOHandler &io_handler, From b24e14093dae04440f22a2da128ba29576f5b3c3 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sat, 15 Feb 2025 06:14:21 +0000 Subject: [PATCH 005/109] [CI] Keep Track of Workflow Name Instead of Job Name The metrics script includes some logic to only look at workflows up to the most recent workflow it has seen previously. This was broken in a previous patch when workflow metrics began to be emitted per job. The logic ending the metrics gathering would never trigger, so we would continually fetch more and more workflows until OOM. 
--- .ci/metrics/metrics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py index d219c9e55169e..354b5058100e7 100644 --- a/.ci/metrics/metrics.py +++ b/.ci/metrics/metrics.py @@ -24,6 +24,7 @@ class JobMetrics: status: int created_at_ns: int workflow_id: int + workflow_name: str @dataclass @@ -199,6 +200,7 @@ def get_per_workflow_metrics( job_result, created_at_ns, workflow_run.id, + workflow_run.name, ) ) @@ -278,7 +280,7 @@ def main(): for workflow_metric in reversed(current_metrics): if isinstance(workflow_metric, JobMetrics): workflows_to_track[ - workflow_metric.job_name + workflow_metric.workflow_name ] = workflow_metric.workflow_id time.sleep(SCRAPE_INTERVAL_SECONDS) From 776fa2d731c17d6ba0afad2554ebc89cf5e3e5ef Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 14 Feb 2025 22:23:46 -0800 Subject: [PATCH 006/109] [lldb] Gardening in IOHandlerCurses (NFC) - Replace _ap (auto_ptr) suffix with _up (unique_ptr) suffix - Move forward declaration from IOHandler.h to IOHandlerCursesGUI.h - Move curses namespace under lldb_private Motivated by Alex' comment in #126630. 
--- lldb/include/lldb/Core/IOHandler.h | 5 ----- lldb/include/lldb/Core/IOHandlerCursesGUI.h | 5 ++++- lldb/source/Core/IOHandlerCursesGUI.cpp | 20 +++++++++++--------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/lldb/include/lldb/Core/IOHandler.h b/lldb/include/lldb/Core/IOHandler.h index cb14d72413209..d6ac1cc8b5a14 100644 --- a/lldb/include/lldb/Core/IOHandler.h +++ b/lldb/include/lldb/Core/IOHandler.h @@ -32,11 +32,6 @@ namespace lldb_private { class Debugger; } // namespace lldb_private -namespace curses { -class Application; -typedef std::unique_ptr ApplicationAP; -} // namespace curses - namespace lldb_private { class IOHandler { diff --git a/lldb/include/lldb/Core/IOHandlerCursesGUI.h b/lldb/include/lldb/Core/IOHandlerCursesGUI.h index 22ca735063ba1..e9871e0532194 100644 --- a/lldb/include/lldb/Core/IOHandlerCursesGUI.h +++ b/lldb/include/lldb/Core/IOHandlerCursesGUI.h @@ -12,6 +12,9 @@ #include "lldb/Core/IOHandler.h" namespace lldb_private { +namespace curses { +class Application; +} // namespace curses class IOHandlerCursesGUI : public IOHandler { public: @@ -34,7 +37,7 @@ class IOHandlerCursesGUI : public IOHandler { void TerminalSizeChanged() override; protected: - curses::ApplicationAP m_app_ap; + std::unique_ptr m_app_up; }; } // namespace lldb_private diff --git a/lldb/source/Core/IOHandlerCursesGUI.cpp b/lldb/source/Core/IOHandlerCursesGUI.cpp index 456ce7d16e102..c5eed0c0b4089 100644 --- a/lldb/source/Core/IOHandlerCursesGUI.cpp +++ b/lldb/source/Core/IOHandlerCursesGUI.cpp @@ -94,6 +94,7 @@ using llvm::StringRef; #define KEY_SHIFT_TAB (KEY_MAX + 1) #define KEY_ALT_ENTER (KEY_MAX + 2) +namespace lldb_private { namespace curses { class Menu; class MenuDelegate; @@ -4479,8 +4480,9 @@ class Application { }; } // namespace curses +} // namespace lldb_private -using namespace curses; +using namespace lldb_private::curses; struct Row { ValueObjectUpdater value; @@ -7573,12 +7575,12 @@ IOHandlerCursesGUI::IOHandlerCursesGUI(Debugger 
&debugger) void IOHandlerCursesGUI::Activate() { IOHandler::Activate(); - if (!m_app_ap) { - m_app_ap = std::make_unique(GetInputFILE(), GetOutputFILE()); + if (!m_app_up) { + m_app_up = std::make_unique(GetInputFILE(), GetOutputFILE()); // This is both a window and a menu delegate std::shared_ptr app_delegate_sp( - new ApplicationDelegate(*m_app_ap, m_debugger)); + new ApplicationDelegate(*m_app_up, m_debugger)); MenuDelegateSP app_menu_delegate_sp = std::static_pointer_cast(app_delegate_sp); @@ -7652,8 +7654,8 @@ void IOHandlerCursesGUI::Activate() { help_menu_sp->AddSubmenu(MenuSP(new Menu( "GUI Help", nullptr, 'g', ApplicationDelegate::eMenuID_HelpGUIHelp))); - m_app_ap->Initialize(); - WindowSP &main_window_sp = m_app_ap->GetMainWindow(); + m_app_up->Initialize(); + WindowSP &main_window_sp = m_app_up->GetMainWindow(); MenuSP menubar_sp(new Menu(Menu::Type::Bar)); menubar_sp->AddSubmenu(lldb_menu_sp); @@ -7734,10 +7736,10 @@ void IOHandlerCursesGUI::Activate() { } } -void IOHandlerCursesGUI::Deactivate() { m_app_ap->Terminate(); } +void IOHandlerCursesGUI::Deactivate() { m_app_up->Terminate(); } void IOHandlerCursesGUI::Run() { - m_app_ap->Run(m_debugger); + m_app_up->Run(m_debugger); SetIsDone(true); } @@ -7752,7 +7754,7 @@ bool IOHandlerCursesGUI::Interrupt() { void IOHandlerCursesGUI::GotEOF() {} void IOHandlerCursesGUI::TerminalSizeChanged() { - m_app_ap->TerminalSizeChanged(); + m_app_up->TerminalSizeChanged(); } #endif // LLDB_ENABLE_CURSES From c30a7f459452d5766da244564bc1d5888346c364 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Sat, 15 Feb 2025 08:15:00 +0100 Subject: [PATCH 007/109] [flang] Fix standalone builds against installed MLIR (#126387) 1. Add a new `MLIR_DEPS` argument group to `flang_add_library()`, and move MLIR-specific dependencies to that group. 
These dependencies are added as usual in regular builds, and are skipped in standalone builds, since MLIR targets are not visible there (and were already built and installed). 2. Fix the value of `MLIR_MAIN_SRC_DIR` to refer to the current source directory rather than the directory written into MLIR CMake files. The latter refers to the directory used to build the MLIR package, and is no longer valid. 3. Fix non-dylib friendly linking of `LLVMTargetParser` in `Optimizer` unittests. With these changes, I can successfully run Flang's regression tests. --- flang/CMakeLists.txt | 5 +++++ flang/cmake/modules/AddFlang.cmake | 5 ++++- flang/lib/Frontend/CMakeLists.txt | 8 ++++--- flang/lib/Lower/CMakeLists.txt | 6 +++-- flang/lib/Optimizer/Analysis/CMakeLists.txt | 6 +++-- flang/lib/Optimizer/Builder/CMakeLists.txt | 6 +++-- flang/lib/Optimizer/Dialect/CMakeLists.txt | 4 +++- .../Dialect/CUF/Attributes/CMakeLists.txt | 4 +++- .../lib/Optimizer/Dialect/CUF/CMakeLists.txt | 4 +++- .../Optimizer/Dialect/Support/CMakeLists.txt | 4 +++- flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt | 4 +++- .../Optimizer/HLFIR/Transforms/CMakeLists.txt | 4 +++- flang/lib/Optimizer/OpenACC/CMakeLists.txt | 4 +++- flang/lib/Optimizer/OpenMP/CMakeLists.txt | 4 +++- flang/lib/Optimizer/Support/CMakeLists.txt | 8 ++++--- flang/test/CMakeLists.txt | 22 +++++++++++-------- .../lib/Analysis/AliasAnalysis/CMakeLists.txt | 4 +++- flang/test/lib/OpenACC/CMakeLists.txt | 10 +++++---- flang/unittests/Optimizer/CMakeLists.txt | 13 ++++++++--- 19 files changed, 87 insertions(+), 38 deletions(-) diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index c012b884ae3be..cca56bfdc88e6 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -230,6 +230,11 @@ if (FLANG_STANDALONE_BUILD) add_custom_target(doxygen ALL) endif() + # Override the value from installed CMake files, as they refer + # to the directory used during the original MLIR package build, + # which may be no longer available. 
Instead, use the current checkout. + set(MLIR_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../mlir ) + else() option(FLANG_INCLUDE_TESTS "Generate build targets for the Flang unit tests." diff --git a/flang/cmake/modules/AddFlang.cmake b/flang/cmake/modules/AddFlang.cmake index c9f65eb73fef0..badbd4e7b964b 100644 --- a/flang/cmake/modules/AddFlang.cmake +++ b/flang/cmake/modules/AddFlang.cmake @@ -18,7 +18,7 @@ endmacro() function(add_flang_library name) set(options SHARED STATIC INSTALL_WITH_TOOLCHAIN) - set(multiValueArgs ADDITIONAL_HEADERS CLANG_LIBS MLIR_LIBS) + set(multiValueArgs ADDITIONAL_HEADERS CLANG_LIBS MLIR_LIBS MLIR_DEPS) cmake_parse_arguments(ARG "${options}" "" @@ -69,6 +69,9 @@ function(add_flang_library name) if (ARG_MLIR_LIBS) mlir_target_link_libraries(${name} PRIVATE ${ARG_MLIR_LIBS}) endif() + if (ARG_MLIR_DEPS AND NOT FLANG_STANDALONE_BUILD) + add_dependencies(${name} ${ARG_MLIR_DEPS}) + endif() if (TARGET ${name}) diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt index 81eef2d468d8c..80d63fca6fb76 100644 --- a/flang/lib/Frontend/CMakeLists.txt +++ b/flang/lib/Frontend/CMakeLists.txt @@ -18,9 +18,6 @@ add_flang_library(flangFrontend FIROptCodeGenPassIncGen FIROptTransformsPassIncGen HLFIRDialect - MLIRIR - ${dialect_libs} - ${extension_libs} LINK_LIBS CUFDialect @@ -56,6 +53,11 @@ add_flang_library(flangFrontend FrontendOpenACC FrontendOpenMP + MLIR_DEPS + MLIRIR + ${dialect_libs} + ${extension_libs} + MLIR_LIBS MLIRTransforms MLIRBuiltinToLLVMIRTranslation diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt index c9b249781552e..87dc2a052796a 100644 --- a/flang/lib/Lower/CMakeLists.txt +++ b/flang/lib/Lower/CMakeLists.txt @@ -44,8 +44,6 @@ add_flang_library(FortranLower FIRDialect FIRTransforms HLFIRDialect - ${dialect_libs} - ${extension_libs} LINK_LIBS CUFAttrs @@ -64,6 +62,10 @@ add_flang_library(FortranLower LINK_COMPONENTS Support + MLIR_DEPS + ${dialect_libs} + ${extension_libs} + 
MLIR_LIBS ${dialect_libs} ${extension_libs} diff --git a/flang/lib/Optimizer/Analysis/CMakeLists.txt b/flang/lib/Optimizer/Analysis/CMakeLists.txt index c4dae898f8e57..4d4ad882c27d3 100644 --- a/flang/lib/Optimizer/Analysis/CMakeLists.txt +++ b/flang/lib/Optimizer/Analysis/CMakeLists.txt @@ -6,8 +6,6 @@ add_flang_library(FIRAnalysis FIRDialect FIRSupport HLFIRDialect - MLIRIR - MLIROpenMPDialect LINK_LIBS FIRBuilder @@ -15,6 +13,10 @@ add_flang_library(FIRAnalysis FIRSupport HLFIRDialect + MLIR_DEPS + MLIRIR + MLIROpenMPDialect + MLIR_LIBS MLIRFuncDialect MLIRLLVMDialect diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt index f8faeaa81c90c..f0563d092e3dc 100644 --- a/flang/lib/Optimizer/Builder/CMakeLists.txt +++ b/flang/lib/Optimizer/Builder/CMakeLists.txt @@ -40,8 +40,6 @@ add_flang_library(FIRBuilder CUFDialect FIRDialect HLFIRDialect - ${dialect_libs} - ${extension_libs} LINK_LIBS CUFAttrs @@ -52,6 +50,10 @@ add_flang_library(FIRBuilder FortranEvaluate HLFIRDialect + MLIR_DEPS + ${dialect_libs} + ${extension_libs} + MLIR_LIBS ${dialect_libs} ${extension_libs} diff --git a/flang/lib/Optimizer/Dialect/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CMakeLists.txt index d39dca8ed0000..61f9c6110491e 100644 --- a/flang/lib/Optimizer/Dialect/CMakeLists.txt +++ b/flang/lib/Optimizer/Dialect/CMakeLists.txt @@ -12,7 +12,6 @@ add_flang_library(FIRDialect DEPENDS CanonicalizationPatternsIncGen - MLIRIR FIROpsIncGen CUFAttrsIncGen intrinsics_gen @@ -26,6 +25,9 @@ add_flang_library(FIRDialect AsmPrinter Remarks + MLIR_DEPS + MLIRIR + MLIR_LIBS MLIRArithDialect MLIRBuiltinToLLVMIRTranslation diff --git a/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt index a0f58504eff05..713bd0e97bac3 100644 --- a/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt +++ b/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt @@ -3,7 +3,6 @@ 
add_flang_library(CUFAttrs CUFAttr.cpp DEPENDS - MLIRIR CUFAttrsIncGen CUFOpsIncGen @@ -12,6 +11,9 @@ add_flang_library(CUFAttrs AsmPrinter Remarks + MLIR_DEPS + MLIRIR + MLIR_LIBS MLIRTargetLLVMIRExport ) diff --git a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt index e483b4a164113..5b398f2ad506a 100644 --- a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt +++ b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt @@ -6,7 +6,6 @@ add_flang_library(CUFDialect CUFToLLVMIRTranslation.cpp DEPENDS - MLIRIR CUFAttrsIncGen CUFOpsIncGen @@ -20,6 +19,9 @@ add_flang_library(CUFDialect AsmPrinter Remarks + MLIR_DEPS + MLIRIR + MLIR_LIBS MLIRIR MLIRGPUDialect diff --git a/flang/lib/Optimizer/Dialect/Support/CMakeLists.txt b/flang/lib/Optimizer/Dialect/Support/CMakeLists.txt index bfdd5279b6f29..a85d9521af1c4 100644 --- a/flang/lib/Optimizer/Dialect/Support/CMakeLists.txt +++ b/flang/lib/Optimizer/Dialect/Support/CMakeLists.txt @@ -5,9 +5,11 @@ add_flang_library(FIRDialectSupport FIRContext.cpp DEPENDS - MLIRIR intrinsics_gen + MLIR_DEPS + MLIRIR + MLIR_LIBS ${dialect_libs} ) diff --git a/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt index 8a646bedf94b8..99e31a43e01e5 100644 --- a/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt +++ b/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt @@ -8,7 +8,6 @@ add_flang_library(HLFIRDialect CUFAttrsIncGen FIRDialect HLFIROpsIncGen - ${dialect_libs} LINK_LIBS CUFAttrs @@ -19,6 +18,9 @@ add_flang_library(HLFIRDialect AsmPrinter Remarks + MLIR_DEPS + ${dialect_libs} + MLIR_LIBS MLIRIR ${dialect_libs} diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt index 09286aced6089..7eb3cb4001d5f 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt @@ -15,7 +15,6 @@ add_flang_library(HLFIRTransforms CUFAttrsIncGen FIRDialect HLFIROpsIncGen - 
${dialect_libs} LINK_LIBS CUFAttrs @@ -33,6 +32,9 @@ add_flang_library(HLFIRTransforms AsmPrinter Remarks + MLIR_DEPS + ${dialect_libs} + MLIR_LIBS MLIRIR ${dialect_libs} diff --git a/flang/lib/Optimizer/OpenACC/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/CMakeLists.txt index 1bfae603fd80d..4a09133fc110d 100644 --- a/flang/lib/Optimizer/OpenACC/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenACC/CMakeLists.txt @@ -11,7 +11,6 @@ add_flang_library(FIROpenACCSupport FIRDialectSupport FIRSupport HLFIRDialect - MLIROpenACCDialect LINK_LIBS FIRBuilder @@ -21,6 +20,9 @@ add_flang_library(FIROpenACCSupport FIRSupport HLFIRDialect + MLIR_DEPS + MLIROpenACCDialect + MLIR_LIBS MLIROpenACCDialect ) diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt index 86ae93f3207cc..4a48d6e0936db 100644 --- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -12,7 +12,6 @@ add_flang_library(FlangOpenMPTransforms FIRDialect HLFIROpsIncGen FlangOpenMPPassesIncGen - ${dialect_libs} LINK_LIBS FIRAnalysis @@ -24,6 +23,9 @@ add_flang_library(FlangOpenMPTransforms FortranSupport HLFIRDialect + MLIR_DEPS + ${dialect_libs} + MLIR_LIBS MLIRFuncDialect MLIROpenMPDialect diff --git a/flang/lib/Optimizer/Support/CMakeLists.txt b/flang/lib/Optimizer/Support/CMakeLists.txt index f8e4fc5bcefea..7ccdd4fd9c25c 100644 --- a/flang/lib/Optimizer/Support/CMakeLists.txt +++ b/flang/lib/Optimizer/Support/CMakeLists.txt @@ -10,9 +10,6 @@ add_flang_library(FIRSupport DEPENDS FIROpsIncGen HLFIROpsIncGen - MLIRIR - ${dialect_libs} - ${extension_libs} LINK_LIBS FIRDialect @@ -20,6 +17,11 @@ add_flang_library(FIRSupport LINK_COMPONENTS TargetParser + MLIR_DEPS + MLIRIR + ${dialect_libs} + ${extension_libs} + MLIR_LIBS ${dialect_libs} ${extension_libs} diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt index 3fac8717e9bd9..777cf5fc5433b 100644 --- a/flang/test/CMakeLists.txt +++ b/flang/test/CMakeLists.txt @@ -59,26 
+59,30 @@ set(FLANG_TEST_PARAMS set(FLANG_TEST_DEPENDS flang - llvm-config - FileCheck - count - not module_files fir-opt tco bbc - llvm-dis - llvm-objdump - llvm-readobj - split-file FortranDecimal ) +if (NOT FLANG_STANDALONE_BUILD) + list(APPEND FLANG_TEST_DEPENDS + llvm-config + FileCheck + count + not + llvm-dis + llvm-objdump + llvm-readobj + split-file + ) +endif () if (FLANG_INCLUDE_RUNTIME) list(APPEND FLANG_TEST_DEPENDS flang_rt.runtime) endif () -if (LLVM_ENABLE_PLUGINS AND NOT WIN32) +if (LLVM_ENABLE_PLUGINS AND NOT WIN32 AND NOT FLANG_STANDALONE_BUILD) list(APPEND FLANG_TEST_DEPENDS Bye) endif() diff --git a/flang/test/lib/Analysis/AliasAnalysis/CMakeLists.txt b/flang/test/lib/Analysis/AliasAnalysis/CMakeLists.txt index cba47a4114517..16df2b607ca93 100644 --- a/flang/test/lib/Analysis/AliasAnalysis/CMakeLists.txt +++ b/flang/test/lib/Analysis/AliasAnalysis/CMakeLists.txt @@ -8,7 +8,6 @@ add_flang_library(FIRTestAnalysis FIRSupport FIRTransforms FIRAnalysis - ${dialect_libs} LINK_LIBS FIRDialect @@ -18,6 +17,9 @@ add_flang_library(FIRTestAnalysis FIRAnalysis MLIRTestAnalysis + MLIR_DEPS + ${dialect_libs} + MLIR_LIBS ${dialect_libs} MLIRFuncDialect diff --git a/flang/test/lib/OpenACC/CMakeLists.txt b/flang/test/lib/OpenACC/CMakeLists.txt index e296827ef53be..1c0ac748f85e8 100644 --- a/flang/test/lib/OpenACC/CMakeLists.txt +++ b/flang/test/lib/OpenACC/CMakeLists.txt @@ -5,16 +5,18 @@ add_flang_library(FIRTestOpenACCInterfaces FIRDialect FIROpenACCSupport FIRSupport - MLIRIR - MLIROpenACCDialect - MLIRPass - MLIRSupport LINK_LIBS FIRDialect FIROpenACCSupport FIRSupport + MLIR_DEPS + MLIRIR + MLIROpenACCDialect + MLIRPass + MLIRSupport + MLIR_LIBS MLIRIR MLIROpenACCDialect diff --git a/flang/unittests/Optimizer/CMakeLists.txt b/flang/unittests/Optimizer/CMakeLists.txt index f535677c19fd2..1289341619118 100644 --- a/flang/unittests/Optimizer/CMakeLists.txt +++ b/flang/unittests/Optimizer/CMakeLists.txt @@ -1,6 +1,10 @@ get_property(dialect_libs GLOBAL 
PROPERTY MLIR_DIALECT_LIBS) get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) +set(LLVM_LINK_COMPONENTS + TargetParser +) + set(LIBS CUFDialect FIRBuilder @@ -9,7 +13,6 @@ set(LIBS FIRDialectSupport FIRSupport HLFIRDialect - LLVMTargetParser ) add_flang_unittest(FlangOptimizerTests @@ -39,8 +42,12 @@ DEPENDS CUFDialect FIRDialect FIRSupport - HLFIRDialect - ${dialect_libs}) + HLFIRDialect) + +if(NOT FLANG_STANDALONE_BUILD) + add_dependencies(FlangOptimizerTests + ${dialect_libs}) +endif() target_link_libraries(FlangOptimizerTests PRIVATE From b6be53d4cb92592940618555ba5fbf412c0cfca8 Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Sat, 15 Feb 2025 09:19:20 +0100 Subject: [PATCH 008/109] [ValueTracking] Test for not cond to assume (NFC) --- llvm/test/Transforms/InstCombine/assume.ll | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index c21f8457e82d1..0007cc1518730 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -977,6 +977,24 @@ define i32 @range_15_31_top27(i32 %x) { ret i32 %res } +define i1 @not_cond_use(i8 %x) { +; CHECK-LABEL: @not_cond_use( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X:%.*]], 0 +; CHECK-NEXT: tail call void @use(i1 [[CMP]]) +; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[CMP]], true +; CHECK-NEXT: tail call void @llvm.assume(i1 [[NOT]]) +; CHECK-NEXT: [[RVAL:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: ret i1 [[RVAL]] +; + %cmp = icmp eq i8 %x, 0 + tail call void @use(i1 %cmp) + %not = xor i1 %cmp, true + tail call void @llvm.assume(i1 %not) + %rval = icmp eq i8 %x, 0 + ret i1 %rval +} + +declare void @use(i1) declare void @llvm.dbg.value(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} From 77b309d0721b70f7e2e646f50317478fa76b1ba5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 15 Feb 2025 01:35:01 -0800 Subject: [PATCH 009/109] [AST] Avoid repeated hash lookups 
(NFC) (#127299) --- clang/lib/AST/VTableBuilder.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/VTableBuilder.cpp b/clang/lib/AST/VTableBuilder.cpp index 19d76df99dbe3..18893b996b5d6 100644 --- a/clang/lib/AST/VTableBuilder.cpp +++ b/clang/lib/AST/VTableBuilder.cpp @@ -2115,8 +2115,8 @@ void ItaniumVTableBuilder::dumpLayout(raw_ostream &Out) { // Dump the next address point. uint64_t NextIndex = Index + 1; - if (AddressPointsByIndex.count(NextIndex)) { - if (AddressPointsByIndex.count(NextIndex) == 1) { + if (unsigned Count = AddressPointsByIndex.count(NextIndex)) { + if (Count == 1) { const BaseSubobject &Base = AddressPointsByIndex.find(NextIndex)->second; From 8bdc312272543e8fb21868e57a6c1592668b49a4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 15 Feb 2025 01:35:33 -0800 Subject: [PATCH 010/109] [Index] Avoid repeated hash lookups (NFC) (#127300) --- clang/lib/Index/USRGeneration.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/clang/lib/Index/USRGeneration.cpp b/clang/lib/Index/USRGeneration.cpp index 1e54b413dc59c..0a5a1bcc74865 100644 --- a/clang/lib/Index/USRGeneration.cpp +++ b/clang/lib/Index/USRGeneration.cpp @@ -859,16 +859,12 @@ void USRGenerator::VisitType(QualType T) { } // If we have already seen this (non-built-in) type, use a substitution - // encoding. - llvm::DenseMap::iterator Substitution - = TypeSubstitutions.find(T.getTypePtr()); - if (Substitution != TypeSubstitutions.end()) { + // encoding. Otherwise, record this as a substitution. + auto [Substitution, Inserted] = + TypeSubstitutions.try_emplace(T.getTypePtr(), TypeSubstitutions.size()); + if (!Inserted) { Out << 'S' << Substitution->second << '_'; return; - } else { - // Record this as a substitution. 
- unsigned Number = TypeSubstitutions.size(); - TypeSubstitutions[T.getTypePtr()] = Number; } if (const PointerType *PT = T->getAs()) { From 42e0ee4d7eaafd86a27418cd8c752229ce90c8e2 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 15 Feb 2025 01:36:16 -0800 Subject: [PATCH 011/109] [Sema] Avoid repeated hash lookups (NFC) (#127301) --- clang/lib/Sema/SemaDecl.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 98c245cdea78f..362df485a025c 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -16014,7 +16014,8 @@ static void diagnoseImplicitlyRetainedSelf(Sema &S) { llvm::DenseMap EscapeInfo; auto IsOrNestedInEscapingBlock = [&](const BlockDecl *BD) { - if (auto It = EscapeInfo.find(BD); It != EscapeInfo.end()) + auto [It, Inserted] = EscapeInfo.try_emplace(BD); + if (!Inserted) return It->second; bool R = false; @@ -16027,7 +16028,7 @@ static void diagnoseImplicitlyRetainedSelf(Sema &S) { CurBD = CurBD->getParent()->getInnermostBlockDecl(); } while (CurBD); - return EscapeInfo[BD] = R; + return It->second = R; }; // If the location where 'self' is implicitly retained is inside a escaping From 9453b38ac74f0d6797f12213996eac40d56537d9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 15 Feb 2025 01:36:39 -0800 Subject: [PATCH 012/109] [clang-offload-packager] Avoid repeated hash lookups (NFC) (#127302) --- .../tools/clang-offload-packager/ClangOffloadPackager.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp b/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp index c6d5b31ab512c..49cb0d70f492b 100644 --- a/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp +++ b/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp @@ -70,10 +70,9 @@ static DenseMap getImageArguments(StringRef Image, DenseMap Args; for (StringRef Arg : llvm::split(Image, 
",")) { auto [Key, Value] = Arg.split("="); - if (Args.count(Key)) - Args[Key] = Saver.save(Args[Key] + "," + Value); - else - Args[Key] = Value; + auto [It, Inserted] = Args.try_emplace(Key, Value); + if (!Inserted) + It->second = Saver.save(It->second + "," + Value); } return Args; From 05209f1e598f73913bf0284bfbbb88131149bbcf Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 15 Feb 2025 01:37:02 -0800 Subject: [PATCH 013/109] [ExecutionEngine] Avoid repeated hash lookups (NFC) (#127303) --- llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index d4e341a96f5b1..380a173c1d7ed 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -635,11 +635,12 @@ void MachOPlatform::pushInitializersLoop( Worklist.pop_back(); // If we've already visited this JITDylib on this iteration then continue. - if (JDDepMap.count(DepJD)) + auto [It, Inserted] = JDDepMap.try_emplace(DepJD); + if (!Inserted) continue; // Add dep info. 
- auto &DM = JDDepMap[DepJD]; + auto &DM = It->second; DepJD->withLinkOrderDo([&](const JITDylibSearchOrder &O) { for (auto &KV : O) { if (KV.first == DepJD) From 7e7a3623b44da5019878b91d8334d4c16d7b86a9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 15 Feb 2025 01:38:00 -0800 Subject: [PATCH 014/109] [Hexagon] Avoid repeated map lookups (NFC) (#127304) --- llvm/lib/Target/Hexagon/RDFCopy.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp index fdd7e4cf99e35..fafdad08909dd 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.cpp +++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -133,8 +133,8 @@ bool CopyPropagation::run() { for (NodeId I : Copies) { dbgs() << "Instr: " << *DFG.addr(I).Addr->getCode(); dbgs() << " eq: {"; - if (CopyMap.count(I)) { - for (auto J : CopyMap.at(I)) + if (auto It = CopyMap.find(I); It != CopyMap.end()) { + for (auto J : It->second) dbgs() << ' ' << Print(J.first, DFG) << '=' << Print(J.second, DFG); } From 4887e41055686eede9c155e6b3296b92fe86c2d5 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Sat, 15 Feb 2025 10:38:59 +0100 Subject: [PATCH 015/109] [libc++][NFC] Make enable_ifs in consistent (#127184) We've documented the preferred `enable_if` style in the coding guidelines. This updates `` to conform to them --- libcxx/include/optional | 235 +++++++++++++++++++--------------------- 1 file changed, 114 insertions(+), 121 deletions(-) diff --git a/libcxx/include/optional b/libcxx/include/optional index c325140ee66f2..db236f86e74dd 100644 --- a/libcxx/include/optional +++ b/libcxx/include/optional @@ -672,44 +672,41 @@ public: _LIBCPP_HIDE_FROM_ABI constexpr optional(optional&&) = default; _LIBCPP_HIDE_FROM_ABI constexpr optional(nullopt_t) noexcept {} - template < - class _InPlaceT, - class... 
_Args, - class = enable_if_t< _And< _IsSame<_InPlaceT, in_place_t>, is_constructible >::value > > + template , is_constructible>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(_InPlaceT, _Args&&... __args) : __base(in_place, std::forward<_Args>(__args)...) {} template &, _Args...>> > + enable_if_t&, _Args...>, int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(in_place_t, initializer_list<_Up> __il, _Args&&... __args) : __base(in_place, __il, std::forward<_Args>(__args)...) {} - template ::template __enable_implicit<_Up>(), int> = 0> + template ::template __enable_implicit<_Up>(), int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr optional(_Up&& __v) : __base(in_place, std::forward<_Up>(__v)) {} - template ::template __enable_explicit<_Up>(), int> = 0> + template ::template __enable_explicit<_Up>(), int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(_Up&& __v) : __base(in_place, std::forward<_Up>(__v)) {} // LWG2756: conditionally explicit conversion from const optional<_Up>& - template ::template __enable_implicit<_Up>(), int> = 0> + template ::template __enable_implicit<_Up>(), int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 optional(const optional<_Up>& __v) { this->__construct_from(__v); } - template ::template __enable_explicit<_Up>(), int> = 0> + template ::template __enable_explicit<_Up>(), int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit optional(const optional<_Up>& __v) { this->__construct_from(__v); } // LWG2756: conditionally explicit conversion from optional<_Up>&& - template ::template __enable_implicit<_Up>(), int> = 0> + template ::template __enable_implicit<_Up>(), int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 optional(optional<_Up>&& __v) { this->__construct_from(std::move(__v)); } - template ::template __enable_explicit<_Up>(), int> = 0> + template ::template __enable_explicit<_Up>(), int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit 
optional(optional<_Up>&& __v) { this->__construct_from(std::move(__v)); } @@ -718,7 +715,7 @@ public: template ::value, int> = 0> + enable_if_t<_IsSame<_Tag, __optional_construct_from_invoke_tag>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr explicit optional(_Tag, _Fp&& __f, _Args&&... __args) : __base(__optional_construct_from_invoke_tag{}, std::forward<_Fp>(__f), std::forward<_Args>(__args)...) {} # endif @@ -732,12 +729,12 @@ public: _LIBCPP_HIDE_FROM_ABI constexpr optional& operator=(optional&&) = default; // LWG2756 - template < - class _Up = value_type, - class = enable_if_t< _And< _IsNotSame<__remove_cvref_t<_Up>, optional>, - _Or< _IsNotSame<__remove_cvref_t<_Up>, value_type>, _Not> >, - is_constructible, - is_assignable >::value> > + template , optional>, + _Or<_IsNotSame<__remove_cvref_t<_Up>, value_type>, _Not>>, + is_constructible, + is_assignable>::value, + int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 optional& operator=(_Up&& __v) { if (this->has_value()) this->__get() = std::forward<_Up>(__v); @@ -747,21 +744,20 @@ public: } // LWG2756 - template ::template __enable_assign<_Up>(), int> = 0> + template ::template __enable_assign<_Up>(), int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 optional& operator=(const optional<_Up>& __v) { this->__assign_from(__v); return *this; } // LWG2756 - template ::template __enable_assign<_Up>(), int> = 0> + template ::template __enable_assign<_Up>(), int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 optional& operator=(optional<_Up>&& __v) { this->__assign_from(std::move(__v)); return *this; } - template > > + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& emplace(_Args&&... __args) { reset(); this->__construct(std::forward<_Args>(__args)...); @@ -770,7 +766,7 @@ public: template &, _Args...> > > + enable_if_t&, _Args...>, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& emplace(initializer_list<_Up> __il, _Args&&... 
__args) { reset(); this->__construct(__il, std::forward<_Args>(__args)...); @@ -982,17 +978,15 @@ public: using __base::reset; }; -# if _LIBCPP_STD_VER >= 17 template optional(_Tp) -> optional<_Tp>; -# endif // Comparisons between optionals -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() == std::declval()), bool>, - bool > -operator==(const optional<_Tp>& __x, const optional<_Up>& __y) { +template < + class _Tp, + class _Up, + enable_if_t() == std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const optional<_Tp>& __x, const optional<_Up>& __y) { if (static_cast(__x) != static_cast(__y)) return false; if (!static_cast(__x)) @@ -1000,11 +994,11 @@ operator==(const optional<_Tp>& __x, const optional<_Up>& __y) { return *__x == *__y; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() != std::declval()), bool>, - bool > -operator!=(const optional<_Tp>& __x, const optional<_Up>& __y) { +template < + class _Tp, + class _Up, + enable_if_t() != std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const optional<_Tp>& __x, const optional<_Up>& __y) { if (static_cast(__x) != static_cast(__y)) return true; if (!static_cast(__x)) @@ -1012,11 +1006,11 @@ operator!=(const optional<_Tp>& __x, const optional<_Up>& __y) { return *__x != *__y; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() < std::declval()), bool>, - bool > -operator<(const optional<_Tp>& __x, const optional<_Up>& __y) { +template < + class _Tp, + class _Up, + enable_if_t() < std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const optional<_Tp>& __x, const optional<_Up>& __y) { if (!static_cast(__y)) return false; if (!static_cast(__x)) @@ -1024,11 +1018,11 @@ operator<(const optional<_Tp>& __x, const optional<_Up>& __y) { return *__x < *__y; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() > 
std::declval()), bool>, - bool > -operator>(const optional<_Tp>& __x, const optional<_Up>& __y) { +template < + class _Tp, + class _Up, + enable_if_t() > std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const optional<_Tp>& __x, const optional<_Up>& __y) { if (!static_cast(__x)) return false; if (!static_cast(__y)) @@ -1036,11 +1030,11 @@ operator>(const optional<_Tp>& __x, const optional<_Up>& __y) { return *__x > *__y; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() <= std::declval()), bool>, - bool > -operator<=(const optional<_Tp>& __x, const optional<_Up>& __y) { +template < + class _Tp, + class _Up, + enable_if_t() <= std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const optional<_Tp>& __x, const optional<_Up>& __y) { if (!static_cast(__x)) return true; if (!static_cast(__y)) @@ -1048,11 +1042,11 @@ operator<=(const optional<_Tp>& __x, const optional<_Up>& __y) { return *__x <= *__y; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() >= std::declval()), bool>, - bool > -operator>=(const optional<_Tp>& __x, const optional<_Up>& __y) { +template < + class _Tp, + class _Up, + enable_if_t() >= std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const optional<_Tp>& __x, const optional<_Up>& __y) { if (!static_cast(__y)) return true; if (!static_cast(__x)) @@ -1145,99 +1139,99 @@ _LIBCPP_HIDE_FROM_ABI constexpr strong_ordering operator<=>(const optional<_Tp>& # endif // _LIBCPP_STD_VER <= 17 // Comparisons with T -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() == std::declval()), bool>, - bool > -operator==(const optional<_Tp>& __x, const _Up& __v) { +template < + class _Tp, + class _Up, + enable_if_t() == std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const optional<_Tp>& __x, const _Up& __v) { return static_cast(__x) ? 
*__x == __v : false; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() == std::declval()), bool>, - bool > -operator==(const _Tp& __v, const optional<_Up>& __x) { +template < + class _Tp, + class _Up, + enable_if_t() == std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const _Tp& __v, const optional<_Up>& __x) { return static_cast(__x) ? __v == *__x : false; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() != std::declval()), bool>, - bool > -operator!=(const optional<_Tp>& __x, const _Up& __v) { +template < + class _Tp, + class _Up, + enable_if_t() != std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const optional<_Tp>& __x, const _Up& __v) { return static_cast(__x) ? *__x != __v : true; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() != std::declval()), bool>, - bool > -operator!=(const _Tp& __v, const optional<_Up>& __x) { +template < + class _Tp, + class _Up, + enable_if_t() != std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const _Tp& __v, const optional<_Up>& __x) { return static_cast(__x) ? __v != *__x : true; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() < std::declval()), bool>, - bool > -operator<(const optional<_Tp>& __x, const _Up& __v) { +template < + class _Tp, + class _Up, + enable_if_t() < std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const optional<_Tp>& __x, const _Up& __v) { return static_cast(__x) ? *__x < __v : true; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() < std::declval()), bool>, - bool > -operator<(const _Tp& __v, const optional<_Up>& __x) { +template < + class _Tp, + class _Up, + enable_if_t() < std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const _Tp& __v, const optional<_Up>& __x) { return static_cast(__x) ? 
__v < *__x : false; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() <= std::declval()), bool>, - bool > -operator<=(const optional<_Tp>& __x, const _Up& __v) { +template < + class _Tp, + class _Up, + enable_if_t() <= std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const optional<_Tp>& __x, const _Up& __v) { return static_cast(__x) ? *__x <= __v : true; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() <= std::declval()), bool>, - bool > -operator<=(const _Tp& __v, const optional<_Up>& __x) { +template < + class _Tp, + class _Up, + enable_if_t() <= std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const _Tp& __v, const optional<_Up>& __x) { return static_cast(__x) ? __v <= *__x : false; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() > std::declval()), bool>, - bool > -operator>(const optional<_Tp>& __x, const _Up& __v) { +template < + class _Tp, + class _Up, + enable_if_t() > std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const optional<_Tp>& __x, const _Up& __v) { return static_cast(__x) ? *__x > __v : false; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() > std::declval()), bool>, - bool > -operator>(const _Tp& __v, const optional<_Up>& __x) { +template < + class _Tp, + class _Up, + enable_if_t() > std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const _Tp& __v, const optional<_Up>& __x) { return static_cast(__x) ? __v > *__x : true; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() >= std::declval()), bool>, - bool > -operator>=(const optional<_Tp>& __x, const _Up& __v) { +template < + class _Tp, + class _Up, + enable_if_t() >= std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const optional<_Tp>& __x, const _Up& __v) { return static_cast(__x) ? 
*__x >= __v : false; } -template -_LIBCPP_HIDE_FROM_ABI constexpr enable_if_t< - is_convertible_v() >= std::declval()), bool>, - bool > -operator>=(const _Tp& __v, const optional<_Up>& __x) { +template < + class _Tp, + class _Up, + enable_if_t() >= std::declval()), bool>, int> = 0> +_LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const _Tp& __v, const optional<_Up>& __x) { return static_cast(__x) ? __v >= *__x : true; } @@ -1252,9 +1246,8 @@ operator<=>(const optional<_Tp>& __x, const _Up& __v) { # endif // _LIBCPP_STD_VER >= 20 -template -inline _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX20 enable_if_t< is_move_constructible_v<_Tp> && is_swappable_v<_Tp>, void > +template && is_swappable_v<_Tp>, int> = 0> +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(optional<_Tp>& __x, optional<_Tp>& __y) noexcept(noexcept(__x.swap(__y))) { __x.swap(__y); } From cffc1ac3491c891ef4f80bcbfa685710e477eeac Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Sat, 15 Feb 2025 10:54:00 +0100 Subject: [PATCH 016/109] [libc++] Avoid including on arbitrary platforms (#125587) This partially reverts commit 5f2389d4. That commit started checking whether was a valid include unconditionally, however codebases are free to have such a header on their search path, which breaks compilation. LLVM libc now provides a more standard way of getting configuration macros like __LLVM_LIBC__. After this patch, we only include when we're on Linux or when we're compiling for GPUs. --- libcxx/include/__configuration/platform.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/libcxx/include/__configuration/platform.h b/libcxx/include/__configuration/platform.h index 2a92ce209b91f..cff99376ee24b 100644 --- a/libcxx/include/__configuration/platform.h +++ b/libcxx/include/__configuration/platform.h @@ -30,12 +30,9 @@ // ... add new file formats here ... 
#endif -// To detect which libc we're using -#if __has_include() +// Need to detect which libc we're using if we're on Linux. +#if defined(__linux__) || defined(__AMDGPU__) || defined(__NVPTX__) # include -#endif - -#if defined(__linux__) # if defined(__GLIBC_PREREQ) # define _LIBCPP_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b) # else From 8f3a070db9bffe78d86d24b583effe4032baa4db Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Sat, 15 Feb 2025 10:45:22 +0100 Subject: [PATCH 017/109] [Clang] Add new WG21 papers(Hagenberg) papers to the C++ status page --- clang/www/cxx_status.html | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html index 0fc3b1d314698..2d5b96b47fe2d 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -251,6 +251,42 @@

C++2c implementation status

P3176R1 Clang 20 + + + Trivial unions + P3074R7 + No + + + Partial program correctness + P1494R5 + No + + + Contracts + P2900R14 + No + + + Defang and deprecate memory_order::consume + P3475R2 + No + + + Concept and variable-template template-parameters + P2841R7 + No + + + Trivial Relocatability + P2786R13 + No + + +
#embed
+ P1967R14 + No + From 70b95ca6dbee7036dcfa5995ff804471fd7e8c2a Mon Sep 17 00:00:00 2001 From: lntue Date: Sat, 15 Feb 2025 05:11:54 -0500 Subject: [PATCH 018/109] [libc][math] Fix sqrtf128 implicit conversions. (#127154) This fixes rv32 buildbot failure from https://github.com/llvm/llvm-project/pull/122578 --- libc/src/math/generic/sqrtf128.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/libc/src/math/generic/sqrtf128.cpp b/libc/src/math/generic/sqrtf128.cpp index c844d3afa11c8..3aa7db8362734 100644 --- a/libc/src/math/generic/sqrtf128.cpp +++ b/libc/src/math/generic/sqrtf128.cpp @@ -383,25 +383,26 @@ LLVM_LIBC_FUNCTION(float128, sqrtf128, (float128 x)) { // 1 so just need to add shifted m and 1. Int128 t1 = t0; Int128 sgn = t0 >> 127; // sign of the difference - t1 -= (m << 1) ^ sgn; - t1 += 1 + sgn; + Int128 m_xor_sgn = static_cast(m << 1) ^ sgn; + t1 -= m_xor_sgn; + t1 += Int128(1) + sgn; Int128 sgn1 = t1 >> 127; if (LIBC_UNLIKELY(sgn == sgn1)) { t0 = t1; v -= sgn << 15; - t1 -= (m << 1) ^ sgn; - t1 += 1 + sgn; + t1 -= m_xor_sgn; + t1 += Int128(1) + sgn; } if (t1 == 0) { // 1 ulp offset brings again an exact root - v = (m - (2 * sgn + 1)) << 15; + v = (m - static_cast((sgn << 1) + 1)) << 15; } else { t1 += t0; Int128 side = t1 >> 127; // select what is closer m or m+-1 v &= ~UInt128(0) << 15; // wipe the fractional bits - v -= ((sgn & side) | (~sgn & 1)) << (15 + side); + v -= ((sgn & side) | (~sgn & 1)) << (15 + static_cast(side)); v |= 1; // add sticky bit since we cannot have an exact mid-point // situation } From 2db262886f0c06c079e1b2808c4c14c16f8861b5 Mon Sep 17 00:00:00 2001 From: Edgar Date: Sat, 15 Feb 2025 12:21:20 +0100 Subject: [PATCH 019/109] [MLIR] Fix mlirExecutionEngineLookup throwing assert on lookup fail (#123924) Apparently trying to lookup a function pointer using the C api `mlirExecutionEngineLookup` will throw an assert instead of just returning a nullptr on builds with asserts. 
The docs itself says it returns a nullptr when no function is found so it should be sensible to not throw an assert in this case. --- mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp b/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp index 507be9171d328..306cebd236be9 100644 --- a/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp +++ b/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp @@ -85,18 +85,20 @@ mlirExecutionEngineInvokePacked(MlirExecutionEngine jit, MlirStringRef name, extern "C" void *mlirExecutionEngineLookupPacked(MlirExecutionEngine jit, MlirStringRef name) { - auto expectedFPtr = unwrap(jit)->lookupPacked(unwrap(name)); - if (!expectedFPtr) + auto optionalFPtr = + llvm::expectedToOptional(unwrap(jit)->lookupPacked(unwrap(name))); + if (!optionalFPtr) return nullptr; - return reinterpret_cast(*expectedFPtr); + return reinterpret_cast(*optionalFPtr); } extern "C" void *mlirExecutionEngineLookup(MlirExecutionEngine jit, MlirStringRef name) { - auto expectedFPtr = unwrap(jit)->lookup(unwrap(name)); - if (!expectedFPtr) + auto optionalFPtr = + llvm::expectedToOptional(unwrap(jit)->lookup(unwrap(name))); + if (!optionalFPtr) return nullptr; - return reinterpret_cast(*expectedFPtr); + return *optionalFPtr; } extern "C" void mlirExecutionEngineRegisterSymbol(MlirExecutionEngine jit, From 42ff31aea5828a491269b4db1ba5cff6fef7ca60 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 15 Feb 2025 11:59:52 +0000 Subject: [PATCH 020/109] [X86] combineTargetShuffle - fold VPERMV3(HI,MASK,LO) -> VPERMV(COMMUTE(MASK),CONCAT(LO,HI)) (#127199) We already handle the simpler VPERMV3(LO,MASK,HI) fold which can reuse the (widened) mask, this attempts to match the flipped concatenation, and commutes the mask to handle the flip. 
I've limited this to cases where we can extract the constant mask for commutation, a more general solution would XOR the MSB of the shuffle mask indices to commute, but this almost never constant folds away after lowering so the benefit was minimal. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 25 +- .../any_extend_vector_inreg_of_broadcast.ll | 46 +- ...d_vector_inreg_of_broadcast_from_memory.ll | 14 +- .../X86/avx512-shuffles/partial_permute.ll | 498 +++++++++--------- .../vector-interleaved-load-i16-stride-6.ll | 184 ++++--- .../vector-interleaved-load-i32-stride-3.ll | 76 ++- .../vector-interleaved-load-i32-stride-7.ll | 36 +- .../zero_extend_vector_inreg_of_broadcast.ll | 14 +- ...d_vector_inreg_of_broadcast_from_memory.ll | 14 +- 9 files changed, 439 insertions(+), 468 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1d2d90d543c05..9592137b34842 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -42513,10 +42513,12 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, case X86ISD::VPERMV3: { // Combine VPERMV3 to widened VPERMV if the two source operands can be // freely concatenated. 
- if (VT.is128BitVector() || - (VT.is256BitVector() && Subtarget.useAVX512Regs())) { + MVT WideVT = VT.getDoubleNumVectorElementsVT(); + MVT MaskVT = N.getOperand(1).getSimpleValueType(); + bool CanConcat = VT.is128BitVector() || + (VT.is256BitVector() && Subtarget.useAVX512Regs()); + if (CanConcat) { SDValue Ops[] = {N.getOperand(0), N.getOperand(2)}; - MVT WideVT = VT.getDoubleNumVectorElementsVT(); if (SDValue ConcatSrc = combineConcatVectorOps(DL, WideVT, Ops, DAG, DCI, Subtarget)) { SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG, @@ -42530,9 +42532,24 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SmallVector Mask; if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) { assert(Mask.size() == NumElts && "Unexpected shuffle mask size"); + // See if we can concatenate the commuted operands. + if (CanConcat) { + if (SDValue ConcatSrc = combineConcatVectorOps( + DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG, DCI, + Subtarget)) { + ShuffleVectorSDNode::commuteMask(Mask); + SDValue NewMask = + getConstVector(Mask, MaskVT, DAG, DL, /*IsMask=*/true); + NewMask = widenSubVector(NewMask, false, Subtarget, DAG, DL, + WideVT.getSizeInBits()); + SDValue Perm = + DAG.getNode(X86ISD::VPERMV, DL, WideVT, NewMask, ConcatSrc); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm, + DAG.getVectorIdxConstant(0, DL)); + } + } SDValue V1 = peekThroughBitcasts(N.getOperand(0)); SDValue V2 = peekThroughBitcasts(N.getOperand(2)); - MVT MaskVT = N.getOperand(1).getSimpleValueType(); // Canonicalize to VPERMV if both sources are the same. 
if (V1 == V2) { for (int &M : Mask) diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index 4c4d5cb3166a8..951a2b4cafa26 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1754,10 +1754,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1870,10 +1869,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3776,12 +3774,11 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,16,13,14,15,16,9,10,11,16,13,14,15] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -3911,11 +3908,10 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,16,15,0,0,0,0,16,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -4037,11 +4033,10 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,16,9,10,11,12,13,14,15] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: 
vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -4151,10 +4146,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,0,0,0,0,16,0,0,0] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 16f0614743463..c0afc0cfe2c0a 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1415,10 +1415,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. 
; ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] -; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1502,10 +1501,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index aac5847061cbe..fd9b46e82e0b1 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -227,11 +227,12 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1] -; CHECK-NEXT: 
vpermi2w (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,8,11,8,13,8,15,9] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -243,11 +244,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1] +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [1,8,11,8,13,8,15,9] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -304,10 +305,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16 define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] -; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18] +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <32 x i16> 
%vec, <32 x i16> undef, <16 x i32> ret <16 x i16> %res @@ -315,11 +315,11 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) { define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -330,11 +330,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -344,11 +343,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <1 define <16 x i16> 
@test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26] -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [14,21,31,29,25,2,19,15,20,27,7,23,3,7,25,10] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -359,11 +358,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [14,21,31,29,25,2,19,15,20,27,7,23,3,7,25,10] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -373,11 +371,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <1 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, 
%zmm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15] -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [26,3,4,22,1,18,29,17,21,0,20,19,18,12,11,31] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -388,11 +386,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [26,3,4,22,1,18,29,17,21,0,20,19,18,12,11,31] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 -; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm2, %ymm0 +; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -440,10 +437,9 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <1 define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14] -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [6,11,23,26,29,5,21,30] +; CHECK-NEXT: vpermw %zmm0, %zmm1, 
%zmm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> @@ -452,11 +448,11 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) { define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14] -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,11,23,26,29,5,21,30] ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 -; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> @@ -468,11 +464,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14] -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,11,23,26,29,5,21,30] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> @@ -657,11 +652,11 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x 
i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0] -; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> @@ -673,11 +668,10 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i1 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0] +; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> @@ -731,10 +725,9 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17] -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 -; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm0 -; CHECK-NEXT: # kill: def 
$xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,1,21,17,30,30,29,1] +; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -744,11 +737,11 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) { define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 -; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,21,17,30,30,29,1] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -761,11 +754,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,21,17,30,30,29,1] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -778,11 +770,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16 define <8 x i16> 
@test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 -; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [23,22,20,22,28,20,11,17] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -795,11 +787,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [23,22,20,22,28,20,11,17] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -1114,11 +1105,12 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,7,0] -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,3,3,4] ; 
CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1130,11 +1122,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> % define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,7,7,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,3,3,4] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1503,11 +1495,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15] -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [15,11,14,3,8,9,13,7] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ 
-1519,11 +1511,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,11,14,3,8,9,13,7] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1535,11 +1526,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10] -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [12,6,9,13,12,10,0,2] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1551,11 +1542,10 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd 
{{.*#+}} ymm1 = [4,14,1,5,4,2,8,10] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [12,6,9,13,12,10,0,2] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1654,11 +1644,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [15,5,3,2,0,0,0,0] -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,13,11,10,7,13,15,14] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1671,11 +1661,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,5,3,2,0,0,0,0] +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,13,11,10,7,13,15,14] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; 
CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1721,9 +1710,10 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,4,3,6] -; CHECK-NEXT: vpermi2d (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [6,0,7,2] +; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1732,11 +1722,12 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) { define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,4,3,6] -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,0,7,2] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1748,11 +1739,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,4,3,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,0,7,2] ; 
CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -2374,11 +2365,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,3,2,4] -; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,7,6,0] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: @@ -2398,11 +2389,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,3,2,4] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,6,0] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; 
CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: @@ -2422,11 +2412,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,5,5,1] -; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,1,1,5] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: @@ -2446,11 +2436,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,5,5,1] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,1,1,5] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: @@ -2535,11 +2524,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: 
test_masked_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,0,2,5] -; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,6,1] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2551,11 +2540,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,0,2,5] +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,4,6,1] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2656,11 +2644,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,3,1,5] -; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,7,5,1] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; 
CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: @@ -2680,11 +2668,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,3,1,5] +; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,7,5,1] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: @@ -2946,9 +2933,10 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,6,0,1] -; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [6,2,4,5] +; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2957,12 +2945,13 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) { define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: 
test_masked_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,0,1] -; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,2,4,5] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2974,12 +2963,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,6,0,1] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,2,4,5] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2991,12 +2980,13 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1: 
; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,7,7,2] -; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,3,3,6] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -3008,12 +2998,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,7,7,2] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,3,3,6] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -3497,12 +3487,12 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; 
CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 -; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovaps %ymm3, %ymm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,13,10,11,10,0,0,9] +; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm1, %k1 +; CHECK-FAST-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: @@ -3524,12 +3514,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) { ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9] +; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm0, %k1 +; CHECK-FAST-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: @@ -3551,9 +3540,9 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vmovaps 32(%rdi), %ymm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9] -; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [15,13,11,11,3,12,4,1] +; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3562,12 +3551,12 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) { define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9] -; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [15,13,11,11,3,12,4,1] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3579,12 +3568,11 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: 
vpmovsxbd {{.*#+}} ymm1 = [15,13,11,11,3,12,4,1] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3644,12 +3632,12 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,10,6,15,0,0,0,0] -; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,2,14,7,12,6,14,7] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp @@ -3662,12 +3650,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,10,6,15,0,0,0,0] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,2,14,7,12,6,14,7] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp @@ -3680,13 +3667,12 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14] -; CHECK-NEXT: # xmm2 = mem[0,0] -; CHECK-NEXT: vmovaps 32(%rdi), %ymm3 -; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm2 = [12,6,12,6,12,6,12,6] +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp @@ -3699,13 +3685,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14] -; CHECK-NEXT: # xmm2 = mem[0,0] -; CHECK-NEXT: vmovaps 32(%rdi), %ymm1 -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 -; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [12,6,12,6,12,6,12,6] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 +; CHECK-NEXT: vpermps (%rdi), %zmm1, 
%zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp @@ -4527,12 +4511,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,4,1,5] -; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,0,5,1] +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4544,12 +4528,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [2,4,1,5] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [6,0,5,1] +; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 +; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr 
%vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4593,9 +4576,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd 32(%rdi), %ymm1 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,1] -; CHECK-NEXT: vpermi2pd (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,6,0,5] +; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4604,12 +4587,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) { define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,4,1] -; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,6,0,5] +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4621,12 +4604,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6: ; 
CHECK: # %bb.0: -; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,1] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,6,0,5] +; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 +; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index c3b53211978ae..9d0183c816b12 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -582,20 +582,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-NEXT: vpermd (%rdi), %zmm1, %zmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] +; 
AVX512-NEXT: vpermd %zmm2, %zmm5, %zmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vmovq %xmm0, (%rdx) -; AVX512-NEXT: vmovq %xmm2, (%rcx) +; AVX512-NEXT: vmovq %xmm4, (%rcx) ; AVX512-NEXT: vmovq %xmm1, (%r8) -; AVX512-NEXT: vmovq %xmm4, (%r9) -; AVX512-NEXT: vmovq %xmm5, (%rax) +; AVX512-NEXT: vmovq %xmm5, (%r9) +; AVX512-NEXT: vmovq %xmm2, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -613,20 +613,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx) +; 
AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) ; AVX512-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm4, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rax) +; AVX512-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512-FCP-NEXT: vmovq %xmm2, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -645,20 +645,20 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-NEXT: vpermd (%rdi), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] +; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-NEXT: vmovq %xmm4, (%r9) -; AVX512DQ-NEXT: vmovq %xmm5, (%rax) +; AVX512DQ-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-NEXT: vmovq %xmm2, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -676,20 +676,20 @@ define void 
@load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,13,10,3] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] +; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rax) +; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -2876,22 +2876,20 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX512BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm5, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] +; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermw %zmm5, %zmm6, %zmm6 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 @@ -2933,22 +2931,20 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] ; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] +; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7 @@ -2990,22 +2986,20 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] ; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-BW-NEXT: # ymm8 = 
mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] +; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 @@ -3047,22 +3041,20 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] ; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm7, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm8[3,4,5,6,7],ymm6[8,9,10],ymm8[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] +; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; 
AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index d9383f524f1d1..34f23213500c1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -103,16 +103,15 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i32_stride3_vf2: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0] -; AVX512-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 -; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,3,2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7] +; AVX512-FCP-NEXT: vpermps (%rdi), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vbroadcastss 8(%rdi), %xmm2 +; AVX512-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX512-FCP-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-FCP-NEXT: vmovlps %xmm1, (%rdx) +; AVX512-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride3_vf2: @@ -131,16 +130,15 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf2: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 
-; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,3,2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7] +; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vbroadcastss 8(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride3_vf2: @@ -159,16 +157,15 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i32_stride3_vf2: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 -; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,3,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7] +; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vbroadcastss 8(%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rdx) +; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride3_vf2: @@ -187,16 +184,15 
@@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf2: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcastss 8(%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <6 x i32>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index 955a7ffcec795..7948141f6becd 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -239,17 +239,16 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] -; AVX512-FCP-NEXT: vmovdqa 
32(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 ; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) ; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX512-FCP-NEXT: vmovlps %xmm2, (%r10) ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -304,17 +303,16 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 ; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) ; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%r10) ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -369,17 +367,16 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 ; AVX512BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 ; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) ; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%r10) ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -434,17 +431,16 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,4,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d (%rdi), %ymm2, %ymm7 ; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%r10) ; 
AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index c9b10d9cc8668..ec7a708fc0b02 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -1754,10 +1754,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1870,10 +1869,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll 
b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index 5ba2257e2b49e..14c2a60a5b998 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1415,10 +1415,9 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] -; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1502,10 +1501,9 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2w (%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq From a6093d30348d7116b1112f7532743fda50258d67 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Sat, 15 Feb 2025 14:56:19 +0100 Subject: [PATCH 021/109] [libc++] Explicitly mention vector_bool in the name of benchmarks (#127313) We have some benchmarks that were benchmarking very specific functionality, namely 
the optimizations in vector<bool>::iterator. Call this out in the benchmarks by renaming them appropriately. In the future we will also increase the coverage of these benchmarks to test other containers. --- libcxx/test/benchmarks/algorithms/fill.bench.cpp | 16 ++++++++-------- .../algorithms/ranges_contains.bench.cpp | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/libcxx/test/benchmarks/algorithms/fill.bench.cpp b/libcxx/test/benchmarks/algorithms/fill.bench.cpp index c157b5e5c9862..6a48b25b7eb63 100644 --- a/libcxx/test/benchmarks/algorithms/fill.bench.cpp +++ b/libcxx/test/benchmarks/algorithms/fill.bench.cpp @@ -12,40 +12,40 @@ #include #include -static void bm_fill_n(benchmark::State& state) { +static void bm_fill_n_vector_bool(benchmark::State& state) { std::vector vec1(state.range()); for (auto _ : state) { benchmark::DoNotOptimize(vec1); benchmark::DoNotOptimize(std::fill_n(vec1.begin(), vec1.size(), false)); } } -BENCHMARK(bm_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20); +BENCHMARK(bm_fill_n_vector_bool)->DenseRange(1, 8)->Range(16, 1 << 20); -static void bm_ranges_fill_n(benchmark::State& state) { +static void bm_ranges_fill_n_vector_bool(benchmark::State& state) { std::vector vec1(state.range()); for (auto _ : state) { benchmark::DoNotOptimize(vec1); benchmark::DoNotOptimize(std::ranges::fill_n(vec1.begin(), vec1.size(), false)); } } -BENCHMARK(bm_ranges_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20); +BENCHMARK(bm_ranges_fill_n_vector_bool)->DenseRange(1, 8)->Range(16, 1 << 20); -static void bm_fill(benchmark::State& state) { +static void bm_fill_vector_bool(benchmark::State& state) { std::vector vec1(state.range()); for (auto _ : state) { benchmark::DoNotOptimize(vec1); std::fill(vec1.begin(), vec1.end(), false); } } -BENCHMARK(bm_fill)->DenseRange(1, 8)->Range(16, 1 << 20); +BENCHMARK(bm_fill_vector_bool)->DenseRange(1, 8)->Range(16, 1 << 20); -static void 
bm_ranges_fill_vector_bool(benchmark::State& state) { std::vector vec1(state.range()); for (auto _ : state) { benchmark::DoNotOptimize(vec1); benchmark::DoNotOptimize(std::ranges::fill(vec1, false)); } } -BENCHMARK(bm_ranges_fill)->DenseRange(1, 8)->Range(16, 1 << 20); +BENCHMARK(bm_ranges_fill_vector_bool)->DenseRange(1, 8)->Range(16, 1 << 20); BENCHMARK_MAIN(); diff --git a/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp index b98e17a00ef83..c9a10202c8cfc 100644 --- a/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp +++ b/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp @@ -15,7 +15,7 @@ #include "test_iterators.h" -static void bm_contains_char(benchmark::State& state) { +static void bm_contains_vector_char(benchmark::State& state) { std::vector a(state.range(), 'a'); for (auto _ : state) { @@ -24,9 +24,9 @@ static void bm_contains_char(benchmark::State& state) { benchmark::DoNotOptimize(std::ranges::contains(a.begin(), a.end(), 'B')); } } -BENCHMARK(bm_contains_char)->RangeMultiplier(16)->Range(16, 16 << 20); +BENCHMARK(bm_contains_vector_char)->RangeMultiplier(16)->Range(16, 16 << 20); -static void bm_contains_int(benchmark::State& state) { +static void bm_contains_vector_int(benchmark::State& state) { std::vector a(state.range(), 1); for (auto _ : state) { @@ -35,9 +35,9 @@ static void bm_contains_int(benchmark::State& state) { benchmark::DoNotOptimize(std::ranges::contains(a.begin(), a.end(), 2)); } } -BENCHMARK(bm_contains_int)->RangeMultiplier(16)->Range(16, 16 << 20); +BENCHMARK(bm_contains_vector_int)->RangeMultiplier(16)->Range(16, 16 << 20); -static void bm_contains_bool(benchmark::State& state) { +static void bm_contains_vector_bool(benchmark::State& state) { std::vector a(state.range(), true); for (auto _ : state) { @@ -46,6 +46,6 @@ static void bm_contains_bool(benchmark::State& state) { benchmark::DoNotOptimize(std::ranges::contains(a.begin(), a.end(), 
false)); } } -BENCHMARK(bm_contains_bool)->RangeMultiplier(16)->Range(16, 16 << 20); +BENCHMARK(bm_contains_vector_bool)->RangeMultiplier(16)->Range(16, 16 << 20); BENCHMARK_MAIN(); From 88284e4efce09b0c9f46c3893554481815badf01 Mon Sep 17 00:00:00 2001 From: realqhc Date: Sat, 15 Feb 2025 22:26:02 +0800 Subject: [PATCH 022/109] [RISCV] Support Zb*/P Shared Instructions (#127160) This enables shared instructions between Zb* and Base-P extension. Documentation: https://jhauser.us/RISCV/ext-P/RVP-baseInstrs-014.pdf https://jhauser.us/RISCV/ext-P/RVP-instrEncodings-014.pdf --- .../Driver/print-supported-extensions-riscv.c | 1 + llvm/lib/Target/RISCV/RISCVFeatures.td | 33 ++++++++++++++++ llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 26 +++++++------ llvm/test/MC/RISCV/attribute-arch.s | 6 +++ llvm/test/MC/RISCV/rv32i-invalid.s | 4 +- llvm/test/MC/RISCV/rv32p-valid.s | 36 +++++++++++++++++ llvm/test/MC/RISCV/rv64p-valid.s | 39 +++++++++++++++++++ .../TargetParser/RISCVISAInfoTest.cpp | 1 + 8 files changed, 133 insertions(+), 13 deletions(-) create mode 100644 llvm/test/MC/RISCV/rv32p-valid.s create mode 100644 llvm/test/MC/RISCV/rv64p-valid.s diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 3443ff0b69de9..49c5bfca2716f 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -182,6 +182,7 @@ // CHECK-NEXT: xwchc 2.2 'Xwchc' (WCH/QingKe additional compressed opcodes) // CHECK-EMPTY: // CHECK-NEXT: Experimental extensions +// CHECK-NEXT: p 0.14 'P' ('Base P' (Packed SIMD)) // CHECK-NEXT: zicfilp 1.0 'Zicfilp' (Landing pad) // CHECK-NEXT: zicfiss 1.0 'Zicfiss' (Shadow stack) // CHECK-NEXT: zalasr 0.1 'Zalasr' (Load-Acquire and Store-Release Instructions) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 51aa8d7d307e4..30595119e37bf 100644 --- 
a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1016,6 +1016,39 @@ def HasStdExtSmctrOrSsctr : Predicate<"Subtarget->hasStdExtSmctrOrSsctr()">, "'Smctr' (Control Transfer Records Machine Level) or " "'Ssctr' (Control Transfer Records Supervisor Level)">; +// Packed SIMD Extensions +def FeatureStdExtP + : RISCVExperimentalExtension<0, 14, + "'Base P' (Packed SIMD)">; +def HasStdExtP : Predicate<"Subtarget->hasStdExtP()">, + AssemblerPredicate<(all_of FeatureStdExtP), + "'Base P' (Packed SIMD)">; + +def HasStdExtZbaOrP + : Predicate<"Subtarget->hasStdExtZba() || Subtarget->hasStdExtP()">, + AssemblerPredicate<(any_of FeatureStdExtZba, FeatureStdExtP), + "'Zba' (Address Generation Instructions) or " + "'Base P' (Packed-SIMD)">; + +def HasStdExtZbbOrP + : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtP()">, + AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtP), + "'Zbb' (Basic Bit-Manipulation) or " + "'Base P' (Packed-SIMD)">; + +def HasStdExtZbkbOrP + : Predicate<"Subtarget->hasStdExtZbkb() || Subtarget->hasStdExtP()">, + AssemblerPredicate<(any_of FeatureStdExtZbkb, FeatureStdExtP), + "'Zbkb' (Bitmanip instructions for Cryptography) or " + "'Base P' (Packed-SIMD)">; + +def HasStdExtZbbOrZbkbOrP + : Predicate<"Subtarget->HasStdExtZbbOrZbkb()|| Subtarget->hasStdExtP()">, + AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbkb, FeatureStdExtP), + "'Zbb' (Basic Bit-Manipulation) or " + "'Zbkb' (Bitmanip instructions for Cryptography) or " + "'Base P' (Packed-SIMD)">; + //===----------------------------------------------------------------------===// // Vendor extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 124caa3b69d31..2ce909c5d0e21 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -263,9 
+263,10 @@ def XNOR : ALU_rr<0b0100000, 0b100, "xnor">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; } // Predicates = [HasStdExtZbbOrZbkb] -let Predicates = [HasStdExtZba] in { +let Predicates = [HasStdExtZbaOrP] in def SH1ADD : ALU_rr<0b0010000, 0b010, "sh1add">, Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>; +let Predicates = [HasStdExtZba] in { def SH2ADD : ALU_rr<0b0010000, 0b100, "sh2add">, Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>; def SH3ADD : ALU_rr<0b0010000, 0b110, "sh3add">, @@ -337,30 +338,32 @@ def XPERM8 : ALU_rr<0b0010100, 0b100, "xperm8">, Sched<[WriteXPERM, ReadXPERM, ReadXPERM]>; } // Predicates = [HasStdExtZbkx] -let Predicates = [HasStdExtZbb], IsSignExtendingOpW = 1 in { +let Predicates = [HasStdExtZbbOrP], IsSignExtendingOpW = 1 in def CLZ : Unary_r<0b011000000000, 0b001, "clz">, Sched<[WriteCLZ, ReadCLZ]>; +let Predicates = [HasStdExtZbb], IsSignExtendingOpW = 1 in { def CTZ : Unary_r<0b011000000001, 0b001, "ctz">, Sched<[WriteCTZ, ReadCTZ]>; def CPOP : Unary_r<0b011000000010, 0b001, "cpop">, Sched<[WriteCPOP, ReadCPOP]>; } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbb, IsRV64], IsSignExtendingOpW = 1 in { +let Predicates = [HasStdExtZbbOrP, IsRV64], IsSignExtendingOpW = 1 in def CLZW : UnaryW_r<0b011000000000, 0b001, "clzw">, Sched<[WriteCLZ32, ReadCLZ32]>; +let Predicates = [HasStdExtZbb, IsRV64], IsSignExtendingOpW = 1 in { def CTZW : UnaryW_r<0b011000000001, 0b001, "ctzw">, Sched<[WriteCTZ32, ReadCTZ32]>; def CPOPW : UnaryW_r<0b011000000010, 0b001, "cpopw">, Sched<[WriteCPOP32, ReadCPOP32]>; } // Predicates = [HasStdExtZbb, IsRV64] -let Predicates = [HasStdExtZbb], IsSignExtendingOpW = 1 in { +let Predicates = [HasStdExtZbbOrP], IsSignExtendingOpW = 1 in { def SEXT_B : Unary_r<0b011000000100, 0b001, "sext.b">, Sched<[WriteIALU, ReadIALU]>; def SEXT_H : Unary_r<0b011000000101, 0b001, "sext.h">, Sched<[WriteIALU, ReadIALU]>; -} // Predicates = [HasStdExtZbb] +} // Predicates = [HasStdExtZbbOrP] let Predicates = [HasStdExtZbc] 
in { def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr", Commutable=1>, @@ -374,7 +377,7 @@ def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh", Commutable=1>, Sched<[WriteCLMUL, ReadCLMUL, ReadCLMUL]>; } // Predicates = [HasStdExtZbcOrZbkc] -let Predicates = [HasStdExtZbb] in { +let Predicates = [HasStdExtZbbOrP] in { def MIN : ALU_rr<0b0000101, 0b100, "min", Commutable=1>, Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>; def MINU : ALU_rr<0b0000101, 0b101, "minu", Commutable=1>, @@ -385,9 +388,10 @@ def MAXU : ALU_rr<0b0000101, 0b111, "maxu", Commutable=1>, Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>; } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbkb] in { +let Predicates = [HasStdExtZbkbOrP] in def PACK : ALU_rr<0b0000100, 0b100, "pack">, Sched<[WritePACK, ReadPACK, ReadPACK]>; +let Predicates = [HasStdExtZbkb] in { let IsSignExtendingOpW = 1 in def PACKH : ALU_rr<0b0000100, 0b111, "packh">, Sched<[WritePACK, ReadPACK, ReadPACK]>; @@ -407,15 +411,15 @@ def ZEXT_H_RV64 : RVBUnaryR<0b0000100, 0b100, OPC_OP_32, "zext.h">, Sched<[WriteIALU, ReadIALU]>; } // Predicates = [HasStdExtZbb, IsRV64] -let Predicates = [HasStdExtZbbOrZbkb, IsRV32] in { +let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV32] in { def REV8_RV32 : Unary_r<0b011010011000, 0b101, "rev8">, Sched<[WriteREV8, ReadREV8]>; -} // Predicates = [HasStdExtZbbOrZbkb, IsRV32] +} // Predicates = [HasStdExtZbbOrZbkbOrP, IsRV32] -let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in { +let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV64] in { def REV8_RV64 : Unary_r<0b011010111000, 0b101, "rev8">, Sched<[WriteREV8, ReadREV8]>; -} // Predicates = [HasStdExtZbbOrZbkb, IsRV64] +} // Predicates = [HasStdExtZbbOrZbkbOrP, IsRV64] let Predicates = [HasStdExtZbb] in { def ORC_B : Unary_r<0b001010000111, 0b101, "orc.b">, diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index 4e77a53bd706c..a8bb9b7e6cef1 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ 
b/llvm/test/MC/RISCV/attribute-arch.s @@ -473,3 +473,9 @@ .attribute arch, "rv32i_sdtrig1p0" # CHECK: attribute 5, "rv32i2p1_sdtrig1p0" + +.attribute arch, "rv32i_p0p14" +# CHECK: attribute 5, "rv32i2p1_p0p14" + +.attribute arch, "rv64i_p0p14" +# CHECK: attribute 5, "rv64i2p1_p0p14" \ No newline at end of file diff --git a/llvm/test/MC/RISCV/rv32i-invalid.s b/llvm/test/MC/RISCV/rv32i-invalid.s index ac0e3c6c1bdbf..1ffb10789bbbd 100644 --- a/llvm/test/MC/RISCV/rv32i-invalid.s +++ b/llvm/test/MC/RISCV/rv32i-invalid.s @@ -191,8 +191,8 @@ fadd.s a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the followi fadd.d a0, a2, a4 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zdinx' (Double in Integer){{$}} fadd.h a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zhinx' (Half Float in Integer){{$}} flh ft0, (a0) # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) or 'Zfhmin' (Half-Precision Floating-Point Minimal){{$}} -sh1add a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zba' (Address Generation Instructions){{$}} -clz a0, a1 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zbb' (Basic Bit-Manipulation){{$}} +sh1add a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zba' (Address Generation Instructions) or 'Base P' (Packed-SIMD){{$}} +clz a0, a1 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zbb' (Basic Bit-Manipulation) or 'Base P' (Packed-SIMD){{$}} clmul a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zbc' (Carry-Less Multiplication) or 'Zbkc' (Carry-less multiply instructions for Cryptography){{$}} bset a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zbs' (Single-Bit Instructions){{$}} pause # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zihintpause' (Pause Hint){{$}} diff --git 
a/llvm/test/MC/RISCV/rv32p-valid.s b/llvm/test/MC/RISCV/rv32p-valid.s new file mode 100644 index 0000000000000..011de0c0d1579 --- /dev/null +++ b/llvm/test/MC/RISCV/rv32p-valid.s @@ -0,0 +1,36 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-p -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-p < %s \ +# RUN: | llvm-objdump --mattr=+experimental-p -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s + +# CHECK-ASM-AND-OBJ: sh1add a0, a1, a2 +# CHECK-ASM: encoding: [0x33,0xa5,0xc5,0x20] +sh1add a0, a1, a2 +# CHECK-ASM-AND-OBJ: clz a0, a1 +# CHECK-ASM: encoding: [0x13,0x95,0x05,0x60] +clz a0, a1 +# CHECK-ASM-AND-OBJ: sext.b a2, a3 +# CHECK-ASM: encoding: [0x13,0x96,0x46,0x60] +sext.b a2, a3 +# CHECK-ASM-AND-OBJ: sext.h t0, t1 +# CHECK-ASM: encoding: [0x93,0x12,0x53,0x60] +sext.h t0, t1 +# CHECK-ASM-AND-OBJ: min t0, t1, t2 +# CHECK-ASM: encoding: [0xb3,0x42,0x73,0x0a] +min t0, t1, t2 +# CHECK-ASM-AND-OBJ: minu t0, t1, t2 +# CHECK-ASM: encoding: [0xb3,0x52,0x73,0x0a] +minu t0, t1, t2 +# CHECK-ASM-AND-OBJ: max t3, t4, t5 +# CHECK-ASM: encoding: [0x33,0xee,0xee,0x0b] +max t3, t4, t5 +# CHECK-ASM-AND-OBJ: maxu a4, a5, a6 +# CHECK-ASM: encoding: [0x33,0xf7,0x07,0x0b] +maxu a4, a5, a6 +# CHECK-ASM-AND-OBJ: pack s0, s1, s2 +# CHECK-ASM: encoding: [0x33,0xc4,0x24,0x09] +pack s0, s1, s2 +# CHECK-ASM-AND-OBJ: rev8 s0, s1 +# CHECK-ASM: encoding: [0x13,0xd4,0x84,0x69] +rev8 s0, s1 diff --git a/llvm/test/MC/RISCV/rv64p-valid.s b/llvm/test/MC/RISCV/rv64p-valid.s new file mode 100644 index 0000000000000..48fa26aaaffe4 --- /dev/null +++ b/llvm/test/MC/RISCV/rv64p-valid.s @@ -0,0 +1,39 @@ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-p -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-p < %s \ +# RUN: | 
llvm-objdump --mattr=+experimental-p -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s + +# CHECK-ASM-AND-OBJ: sh1add a0, a1, a2 +# CHECK-ASM: encoding: [0x33,0xa5,0xc5,0x20] +sh1add a0, a1, a2 +# CHECK-ASM-AND-OBJ: clz a0, a1 +# CHECK-ASM: encoding: [0x13,0x95,0x05,0x60] +clz a0, a1 +# CHECK-ASM-AND-OBJ: clzw s0, s1 +# CHECK-ASM: encoding: [0x1b,0x94,0x04,0x60] +clzw s0, s1 +# CHECK-ASM-AND-OBJ: sext.b a2, a3 +# CHECK-ASM: encoding: [0x13,0x96,0x46,0x60] +sext.b a2, a3 +# CHECK-ASM-AND-OBJ: sext.h t0, t1 +# CHECK-ASM: encoding: [0x93,0x12,0x53,0x60] +sext.h t0, t1 +# CHECK-ASM-AND-OBJ: min t0, t1, t2 +# CHECK-ASM: encoding: [0xb3,0x42,0x73,0x0a] +min t0, t1, t2 +# CHECK-ASM-AND-OBJ: minu t0, t1, t2 +# CHECK-ASM: encoding: [0xb3,0x52,0x73,0x0a] +minu t0, t1, t2 +# CHECK-ASM-AND-OBJ: max t3, t4, t5 +# CHECK-ASM: encoding: [0x33,0xee,0xee,0x0b] +max t3, t4, t5 +# CHECK-ASM-AND-OBJ: maxu a4, a5, a6 +# CHECK-ASM: encoding: [0x33,0xf7,0x07,0x0b] +maxu a4, a5, a6 +# CHECK-ASM-AND-OBJ: pack s0, s1, s2 +# CHECK-ASM: encoding: [0x33,0xc4,0x24,0x09] +pack s0, s1, s2 +# CHECK-ASM-AND-OBJ: rev8 s0, s1 +# CHECK-ASM: encoding: [0x13,0xd4,0x84,0x6b] +rev8 s0, s1 diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 7ebfcf915a7c5..563f587d9d1c0 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1108,6 +1108,7 @@ R"(All available -march extensions for RISC-V xwchc 2.2 Experimental extensions + p 0.14 zicfilp 1.0 This is a long dummy description zicfiss 1.0 zalasr 0.1 From 21e956df9b2b283c2f2ed710c542ebeebf8473ff Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Mon, 3 Feb 2025 09:36:44 -0500 Subject: [PATCH 023/109] [CodeGen] Remove two dead pass initializer decls. NFC - After #97727 and #101652, `LowerConstantIntrinsics` and `ExpandVectorPredicationPass` are no longer dedicated passes. 
--- llvm/include/llvm/InitializePasses.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index b8df4d1ecab1d..da4ffcd83213a 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -111,7 +111,6 @@ void initializeExpandMemCmpLegacyPassPass(PassRegistry &); void initializeExpandPostRAPass(PassRegistry &); void initializeExpandReductionsPass(PassRegistry &); void initializeExpandVariadicsPass(PassRegistry &); -void initializeExpandVectorPredicationPass(PassRegistry &); void initializeExternalAAWrapperPassPass(PassRegistry &); void initializeFEntryInserterPass(PassRegistry &); void initializeFinalizeISelPass(PassRegistry &); @@ -174,7 +173,6 @@ void initializeLoopStrengthReducePass(PassRegistry &); void initializeLoopTermFoldPass(PassRegistry &); void initializeLoopUnrollPass(PassRegistry &); void initializeLowerAtomicLegacyPassPass(PassRegistry &); -void initializeLowerConstantIntrinsicsPass(PassRegistry &); void initializeLowerEmuTLSPass(PassRegistry &); void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &); void initializeLowerIntrinsicsPass(PassRegistry &); From 4664a4c66b816af53f596935c3aaa2eca143ae9c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 15 Feb 2025 16:17:42 +0100 Subject: [PATCH 024/109] [LAA] Use getPointer/setPointer in createCheckForAccess (NFC). Use getPointer/setPointer to clarify we are accessing/modifying the current value. 
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 4bdcccdae0b7e..e5b87d2d16230 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1143,9 +1143,8 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, SmallVector> TranslatedPtrs = findForkedPointer(PSE, StridesMap, Ptr, TheLoop); - for (const auto &P : TranslatedPtrs) { - const SCEV *PtrExpr = get<0>(P); - if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume)) + for (auto &P : TranslatedPtrs) { + if (!hasComputableBounds(PSE, Ptr, P.getPointer(), TheLoop, Assume)) return false; // When we run after a failing dependency check we have to make sure @@ -1161,8 +1160,7 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, // If there's only one option for Ptr, look it up after bounds and wrap // checking, because assumptions might have been added to PSE. if (TranslatedPtrs.size() == 1) - TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), - false}; + P.setPointer(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr)); } for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) { From bfdf30e9b3d0b49344a651a5c7cd87be31d255c4 Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 15 Feb 2025 17:04:32 +0000 Subject: [PATCH 025/109] [AArch64] Add patterns for addv(sext) and addv(zext) This adds patterns for v8i8->i16 vaddlv and v4i16->i32 vaddlv, for both signed and unsigned extends. 
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 15 ++- llvm/test/CodeGen/AArch64/arm64-vabs.ll | 4 +- llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 3 +- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 129 ++++++-------------- 4 files changed, 56 insertions(+), 95 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index c45b311b6ebb2..c9549f12769d1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -7357,6 +7357,19 @@ defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", AArch64fmaxv>; defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", AArch64fminnmv>; defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", AArch64fminv>; +def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (sext (v8i8 V64:$op))))), (i64 0))), + (EXTRACT_SUBREG (v8i16 (SUBREG_TO_REG (i64 0), (SADDLVv8i8v V64:$op), hsub)), ssub)>; +def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (zext (v8i8 V64:$op))))), (i64 0))), + (EXTRACT_SUBREG (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$op), hsub)), ssub)>; +def : Pat<(v8i16 (AArch64uaddv (v8i16 (sext (v8i8 V64:$op))))), + (v8i16 (SUBREG_TO_REG (i64 0), (SADDLVv8i8v V64:$op), hsub))>; +def : Pat<(v8i16 (AArch64uaddv (v8i16 (zext (v8i8 V64:$op))))), + (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$op), hsub))>; +def : Pat<(v4i32 (AArch64uaddv (v4i32 (sext (v4i16 V64:$op))))), + (v4i32 (SUBREG_TO_REG (i64 0), (SADDLVv4i16v V64:$op), ssub))>; +def : Pat<(v4i32 (AArch64uaddv (v4i32 (zext (v4i16 V64:$op))))), + (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv4i16v V64:$op), ssub))>; + multiclass SIMDAcrossLaneLongPairIntrinsic { // Patterns for addv(addlp(x)) ==> addlv def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef, @@ -7370,7 +7383,7 @@ multiclass SIMDAcrossLaneLongPairIntrinsic def : Pat<(v4i32 (AArch64uaddv (v4i32 (addlp (v8i16 V128:$op))))), (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (!cast(Opc#"v8i16v") V128:$op), 
ssub)>; - // Patterns for addp(addlp(x))) ==> addlv + // Patterns for addp(addlp(x)) ==> addlv def : Pat<(v2i32 (AArch64uaddv (v2i32 (addlp (v4i16 V64:$op))))), (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (!cast(Opc#"v4i16v") V64:$op), ssub)>; def : Pat<(v2i64 (AArch64uaddv (v2i64 (addlp (v4i32 V128:$op))))), diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll index cc8568709ea21..fe4657186cd2a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -443,8 +443,8 @@ define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) { define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) { ; CHECK-SD-LABEL: uabdl4s_rdx_i32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: uabdl.4s v0, v0, v1 -; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: uabd.4h v0, v0, v1 +; CHECK-SD-NEXT: uaddlv.4h s0, v0 ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 8e12446164e89..6fb4e219d39f4 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -87,8 +87,7 @@ define i32 @test_udot_v4i8_nomla(ptr nocapture readonly %a1) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr s0, [x0] ; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: uaddlv s0, v0.4h ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index c72d00e65fcab..fd24282366282 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -141,18 +141,11 @@ entry: } define i32 @add_v4i16_v4i32_zext(<4 x i16> %x) { -; CHECK-SD-LABEL: add_v4i16_v4i32_zext: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: addv s0, v0.4s -; CHECK-SD-NEXT: fmov 
w0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_v4i16_v4i32_zext: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: uaddlv s0, v0.4h -; CHECK-GI-NEXT: fmov w0, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_v4i16_v4i32_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uaddlv s0, v0.4h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret entry: %xx = zext <4 x i16> %x to <4 x i32> %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) @@ -160,18 +153,11 @@ entry: } define i32 @add_v4i16_v4i32_sext(<4 x i16> %x) { -; CHECK-SD-LABEL: add_v4i16_v4i32_sext: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: addv s0, v0.4s -; CHECK-SD-NEXT: fmov w0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_v4i16_v4i32_sext: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: saddlv s0, v0.4h -; CHECK-GI-NEXT: fmov w0, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_v4i16_v4i32_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: saddlv s0, v0.4h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret entry: %xx = sext <4 x i16> %x to <4 x i32> %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) @@ -483,8 +469,7 @@ define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) { ; CHECK-SD-LABEL: add_v4i8_v4i32_zext: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: uaddlv s0, v0.4h ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ret ; @@ -589,8 +574,7 @@ entry: define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) { ; CHECK-SD-LABEL: add_v8i8_v8i16_sext: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: addv h0, v0.8h +; CHECK-SD-NEXT: saddlv h0, v0.8b ; CHECK-SD-NEXT: smov w0, v0.h[0] ; CHECK-SD-NEXT: ret ; @@ -939,20 +923,12 @@ entry: } define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) { -; CHECK-SD-LABEL: add_v4i16_v4i32_acc_zext: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll v0.4s, 
v0.4h, #0 -; CHECK-SD-NEXT: addv s0, v0.4s -; CHECK-SD-NEXT: fmov w8, s0 -; CHECK-SD-NEXT: add w0, w8, w0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_v4i16_v4i32_acc_zext: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: uaddlv s0, v0.4h -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: add w0, w8, w0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_v4i16_v4i32_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uaddlv s0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w0, w8, w0 +; CHECK-NEXT: ret entry: %xx = zext <4 x i16> %x to <4 x i32> %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) @@ -961,20 +937,12 @@ entry: } define i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) { -; CHECK-SD-LABEL: add_v4i16_v4i32_acc_sext: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: addv s0, v0.4s -; CHECK-SD-NEXT: fmov w8, s0 -; CHECK-SD-NEXT: add w0, w8, w0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_v4i16_v4i32_acc_sext: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: saddlv s0, v0.4h -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: add w0, w8, w0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_v4i16_v4i32_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: saddlv s0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w0, w8, w0 +; CHECK-NEXT: ret entry: %xx = sext <4 x i16> %x to <4 x i32> %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) @@ -1324,8 +1292,7 @@ define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) { ; CHECK-SD-LABEL: add_v4i8_v4i32_acc_zext: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: uaddlv s0, v0.4h ; CHECK-SD-NEXT: fmov w8, s0 ; CHECK-SD-NEXT: add w0, w8, w0 ; CHECK-SD-NEXT: ret @@ -1402,22 +1369,13 @@ entry: } define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) { -; CHECK-SD-LABEL: add_v8i8_v8i16_acc_zext: -; CHECK-SD: // %bb.0: // %entry -; 
CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: addv h0, v0.8h -; CHECK-SD-NEXT: fmov w8, s0 -; CHECK-SD-NEXT: add w8, w8, w0 -; CHECK-SD-NEXT: and w0, w8, #0xffff -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_v8i8_v8i16_acc_zext: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: uaddlv h0, v0.8b -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: add w8, w8, w0 -; CHECK-GI-NEXT: and w0, w8, #0xffff -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_v8i8_v8i16_acc_zext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uaddlv h0, v0.8b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: ret entry: %xx = zext <8 x i8> %x to <8 x i16> %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) @@ -1426,22 +1384,13 @@ entry: } define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) { -; CHECK-SD-LABEL: add_v8i8_v8i16_acc_sext: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: addv h0, v0.8h -; CHECK-SD-NEXT: fmov w8, s0 -; CHECK-SD-NEXT: add w8, w8, w0 -; CHECK-SD-NEXT: sxth w0, w8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: add_v8i8_v8i16_acc_sext: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: saddlv h0, v0.8b -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: add w8, w8, w0 -; CHECK-GI-NEXT: sxth w0, w8 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: add_v8i8_v8i16_acc_sext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: saddlv h0, v0.8b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: sxth w0, w8 +; CHECK-NEXT: ret entry: %xx = sext <8 x i8> %x to <8 x i16> %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) From b4030040359656ed20cb29de7b3912b6b249e98e Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sat, 15 Feb 2025 17:25:03 +0000 Subject: [PATCH 026/109] ConstRange: factor and introduce splitPosNeg (NFC) (#126528) Factor out some code that splits a ConstantRange into positive and negative components, introducing 
ConstantRange::splitPosNeg. --- llvm/include/llvm/IR/ConstantRange.h | 4 ++++ llvm/lib/IR/ConstantRange.cpp | 27 +++++++++++++++---------- llvm/unittests/IR/ConstantRangeTest.cpp | 10 +++++++++ 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/IR/ConstantRange.h b/llvm/include/llvm/IR/ConstantRange.h index d086c25390fd2..3561513212ce2 100644 --- a/llvm/include/llvm/IR/ConstantRange.h +++ b/llvm/include/llvm/IR/ConstantRange.h @@ -92,6 +92,10 @@ class [[nodiscard]] ConstantRange { /// unsigned domain. static ConstantRange fromKnownBits(const KnownBits &Known, bool IsSigned); + /// Split the ConstantRange into positive and negative components, ignoring + /// zero values. + std::pair splitPosNeg() const; + /// Produce the smallest range such that all values that may satisfy the given /// predicate with any value contained within Other is contained in the /// returned range. Formally, this returns a superset of diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index 3566435398992..41e40cdf365d2 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -95,6 +95,17 @@ KnownBits ConstantRange::toKnownBits() const { return Known; } +std::pair ConstantRange::splitPosNeg() const { + uint32_t BW = getBitWidth(); + APInt Zero = APInt::getZero(BW), One = APInt(BW, 1); + APInt SignedMin = APInt::getSignedMinValue(BW); + // There are no positive 1-bit values. The 1 would get interpreted as -1. + ConstantRange PosFilter = + BW == 1 ? 
getEmpty() : ConstantRange(One, SignedMin); + ConstantRange NegFilter(SignedMin, Zero); + return {intersectWith(PosFilter), intersectWith(NegFilter)}; +} + ConstantRange ConstantRange::makeAllowedICmpRegion(CmpInst::Predicate Pred, const ConstantRange &CR) { if (CR.isEmptySet()) @@ -1356,20 +1367,14 @@ ConstantRange::udiv(const ConstantRange &RHS) const { } ConstantRange ConstantRange::sdiv(const ConstantRange &RHS) const { + APInt Zero = APInt::getZero(getBitWidth()); + APInt SignedMin = APInt::getSignedMinValue(getBitWidth()); + // We split up the LHS and RHS into positive and negative components // and then also compute the positive and negative components of the result // separately by combining division results with the appropriate signs. - APInt Zero = APInt::getZero(getBitWidth()); - APInt SignedMin = APInt::getSignedMinValue(getBitWidth()); - // There are no positive 1-bit values. The 1 would get interpreted as -1. - ConstantRange PosFilter = - getBitWidth() == 1 ? getEmpty() - : ConstantRange(APInt(getBitWidth(), 1), SignedMin); - ConstantRange NegFilter(SignedMin, Zero); - ConstantRange PosL = intersectWith(PosFilter); - ConstantRange NegL = intersectWith(NegFilter); - ConstantRange PosR = RHS.intersectWith(PosFilter); - ConstantRange NegR = RHS.intersectWith(NegFilter); + auto [PosL, NegL] = splitPosNeg(); + auto [PosR, NegR] = RHS.splitPosNeg(); ConstantRange PosRes = getEmpty(); if (!PosL.isEmptySet() && !PosR.isEmptySet()) diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index c390ffea1c352..daa07bf7d840d 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -2126,6 +2126,16 @@ TEST(ConstantRange, GetEquivalentICmp) { }); } +TEST(ConstantRange, SplitPosNeg) { + EnumerateInterestingConstantRanges([](const ConstantRange &CR) { + auto [Pos, Neg] = CR.splitPosNeg(); + EXPECT_TRUE(Pos.isAllPositive()); + EXPECT_TRUE(Neg.isAllNegative()); + if (CR.getBitWidth() == 
1) + EXPECT_TRUE(Pos.isEmptySet()); + }); +} + #define EXPECT_MAY_OVERFLOW(op) \ EXPECT_EQ(ConstantRange::OverflowResult::MayOverflow, (op)) #define EXPECT_ALWAYS_OVERFLOWS_LOW(op) \ From 948e97a40eba6c176183e8e7aefb994681b593ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Sat, 15 Feb 2025 19:36:20 +0100 Subject: [PATCH 027/109] [flang] Revert MLIR_MAIN_SRC_DIR override (#127337) This change is no longer necessary after #125842. Thanks to @nikic for letting me know. --- flang/CMakeLists.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index cca56bfdc88e6..c012b884ae3be 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -230,11 +230,6 @@ if (FLANG_STANDALONE_BUILD) add_custom_target(doxygen ALL) endif() - # Override the value from installed CMake files, as they refer - # to the directory used during the original MLIR package build, - # which may be no longer available. Instead, use the current checkout. - set(MLIR_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../mlir ) - else() option(FLANG_INCLUDE_TESTS "Generate build targets for the Flang unit tests." From e60de25c4e9a6d59b7fd868e803cfe3cd77d4078 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 15 Feb 2025 19:44:39 +0100 Subject: [PATCH 028/109] [LAA] Replace symbolic strides for translated pointers earlier (NFC). Move up replaceSymbolicStrideSCEV before isNoWrap. It needs to be called after hasComputableBounds, as this may create an AddRec via PSE, which replaceSymbolicStrideSCEV will look up. This is in preparation for simplifying isNoWrap. 
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index e5b87d2d16230..43380b59ac49f 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1147,6 +1147,11 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, if (!hasComputableBounds(PSE, Ptr, P.getPointer(), TheLoop, Assume)) return false; + // If there's only one option for Ptr, look it up after bounds and wrap + // checking, because assumptions might have been added to PSE. + if (TranslatedPtrs.size() == 1) + P.setPointer(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr)); + // When we run after a failing dependency check we have to make sure // we don't have wrapping pointers. if (ShouldCheckWrap) { @@ -1157,10 +1162,6 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop, Assume)) return false; } - // If there's only one option for Ptr, look it up after bounds and wrap - // checking, because assumptions might have been added to PSE. - if (TranslatedPtrs.size() == 1) - P.setPointer(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr)); } for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) { From c17df0af23c941cd4fc97851ea51c91eee7c49e4 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Sat, 15 Feb 2025 11:04:06 -0800 Subject: [PATCH 029/109] [webkit.UncountedLambdaCapturesChecker] Fix a crash in declProtectsThis (#127309) Add a missing nullptr check to declProtectsThis. 
--- .../WebKit/UncountedLambdaCapturesChecker.cpp | 7 +++- ...mbda-captures-decl-protects-this-crash.cpp | 38 +++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-decl-protects-this-crash.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp index 4ffdac5ca4873..9527993d0edeb 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp @@ -267,6 +267,8 @@ class UncountedLambdaCapturesChecker auto OpCode = OpCE->getOperator(); if (OpCode == OO_Star || OpCode == OO_Amp) { auto *Callee = OpCE->getDirectCallee(); + if (!Callee) + return false; auto clsName = safeGetName(Callee->getParent()); if (!isRefType(clsName) || !OpCE->getNumArgs()) return false; @@ -276,9 +278,10 @@ class UncountedLambdaCapturesChecker } if (auto *UO = dyn_cast(Arg)) { auto OpCode = UO->getOpcode(); - if (OpCode == UO_Deref || OpCode == UO_AddrOf) + if (OpCode == UO_Deref || OpCode == UO_AddrOf) { Arg = UO->getSubExpr()->IgnoreParenCasts(); - continue; + continue; + } } break; } while (Arg); diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-decl-protects-this-crash.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-decl-protects-this-crash.cpp new file mode 100644 index 0000000000000..840433db5133a --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures-decl-protects-this-crash.cpp @@ -0,0 +1,38 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=webkit.UncountedLambdaCapturesChecker -verify %s + +struct Foo { + int x; + int y; + Foo(int x, int y) : x(x) , y(y) { } +}; + +template +struct Baz { + void ref() const; + void deref() const; + Foo operator*(); + bool operator!(); +}; + +inline Foo operator*(const Foo& 
a, const Foo& b); +Baz someFunction(); +template void bar(CallbackType callback) { + auto baz = someFunction(); + callback(baz); +} + +struct Obj { + void ref() const; + void deref() const; + + void foo(Foo foo) { + bar([this](auto baz) { + // expected-warning@-1{{Captured raw-pointer 'this' to ref-counted type or CheckedPtr-capable type is unsafe [webkit.UncountedLambdaCapturesChecker]}} + bar([this, foo = *baz, foo2 = !baz](auto&&) { + // expected-warning@-1{{Captured raw-pointer 'this' to ref-counted type or CheckedPtr-capable type is unsafe [webkit.UncountedLambdaCapturesChecker]}} + someFunction(); + }); + }); + } +}; From 2472d38338aed9a9cca41a0ca0921b39765256c1 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Sat, 15 Feb 2025 20:11:48 +0100 Subject: [PATCH 030/109] [libc++] Move unused basic_string function definition to the dylib sources (#126219) `__init(const value_type*, size_type, size_type)` is part of our ABI, but we don't actually use the function anymore in the dylib. This moves the definition to the `src/` directory to make it clear that the code is unused. This also allows us to remove it entirely in the unstable ABI. 
--- .../include/__string/extern_template_lists.h | 2 - libcxx/include/string | 31 +++------------ libcxx/src/string.cpp | 38 +++++++++++++++++++ 3 files changed, 44 insertions(+), 27 deletions(-) diff --git a/libcxx/include/__string/extern_template_lists.h b/libcxx/include/__string/extern_template_lists.h index cc536e514d4ff..dc66fa512b8bd 100644 --- a/libcxx/include/__string/extern_template_lists.h +++ b/libcxx/include/__string/extern_template_lists.h @@ -32,7 +32,6 @@ #define _LIBCPP_STRING_V1_EXTERN_TEMPLATE_LIST(_Func, _CharType) \ _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, value_type const*, size_type)) \ _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>::size_type basic_string<_CharType>::rfind(value_type const*, size_type, size_type) const) \ - _Func(_LIBCPP_EXPORTED_FROM_ABI void basic_string<_CharType>::__init(value_type const*, size_type, size_type)) \ _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>::basic_string(basic_string const&)) \ _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, value_type const*)) \ _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>::basic_string(basic_string const&, allocator<_CharType> const&)) \ @@ -82,7 +81,6 @@ #define _LIBCPP_STRING_UNSTABLE_EXTERN_TEMPLATE_LIST(_Func, _CharType) \ _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, value_type const*, size_type)) \ _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>::size_type basic_string<_CharType>::rfind(value_type const*, size_type, size_type) const) \ - _Func(_LIBCPP_EXPORTED_FROM_ABI void basic_string<_CharType>::__init(value_type const*, size_type, size_type)) \ _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>& basic_string<_CharType>::replace(size_type, size_type, value_type const*)) \ _Func(_LIBCPP_EXPORTED_FROM_ABI 
basic_string<_CharType>::size_type basic_string<_CharType>::find_last_not_of(value_type const*, size_type, size_type) const) \ _Func(_LIBCPP_EXPORTED_FROM_ABI basic_string<_CharType>::~basic_string()) \ diff --git a/libcxx/include/string b/libcxx/include/string index b280f5f458459..396e73522d3e7 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -2254,7 +2254,6 @@ private: return __guess; } - inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(const value_type* __s, size_type __sz, size_type __reserve); inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(const value_type* __s, size_type __sz); inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(size_type __n, value_type __c); @@ -2439,6 +2438,12 @@ private: template friend inline _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool operator==(const basic_string<_CharT2, _Traits2, _Allocator2>&, const _CharT2*) _NOEXCEPT; + + // These functions aren't used anymore but are part of our ABI, so we need to provide them in the dylib for backwards + // compatibility +# ifdef _LIBCPP_BUILDING_LIBRARY + void __init(const value_type* __s, size_type __sz, size_type __reserve); +# endif }; // These declarations must appear before any functions are implicitly used @@ -2490,30 +2495,6 @@ basic_string(from_range_t, _Range&&, _Allocator = _Allocator()) -> basic_string, char_traits>, _Allocator>; # endif -template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -basic_string<_CharT, _Traits, _Allocator>::__init(const value_type* __s, size_type __sz, size_type __reserve) { - if (__libcpp_is_constant_evaluated()) - __rep_ = __rep(); - if (__reserve > max_size()) - __throw_length_error(); - pointer __p; - if (__fits_in_sso(__reserve)) { - __set_short_size(__sz); - __p = __get_short_pointer(); - } else { - auto __allocation = std::__allocate_at_least(__alloc_, __recommend(__reserve) + 1); - __p = __allocation.ptr; - __begin_lifetime(__p, __allocation.count); - __set_long_pointer(__p); - __set_long_cap(__allocation.count); - 
__set_long_size(__sz); - } - traits_type::copy(std::__to_address(__p), __s, __sz); - traits_type::assign(__p[__sz], value_type()); - __annotate_new(__sz); -} - template _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__init(const value_type* __s, size_type __sz) { diff --git a/libcxx/src/string.cpp b/libcxx/src/string.cpp index dc16ce781f76b..e335639883dba 100644 --- a/libcxx/src/string.cpp +++ b/libcxx/src/string.cpp @@ -37,6 +37,44 @@ void __basic_string_common::__throw_out_of_range() const { std::__throw_ou #endif // _LIBCPP_ABI_DO_NOT_EXPORT_BASIC_STRING_COMMON +// Define legacy ABI functions +// --------------------------- + +#ifndef _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION + +template +void basic_string<_CharT, _Traits, _Allocator>::__init(const value_type* __s, size_type __sz, size_type __reserve) { + if (__libcpp_is_constant_evaluated()) + __rep_ = __rep(); + if (__reserve > max_size()) + __throw_length_error(); + pointer __p; + if (__fits_in_sso(__reserve)) { + __set_short_size(__sz); + __p = __get_short_pointer(); + } else { + auto __allocation = std::__allocate_at_least(__alloc_, __recommend(__reserve) + 1); + __p = __allocation.ptr; + __begin_lifetime(__p, __allocation.count); + __set_long_pointer(__p); + __set_long_cap(__allocation.count); + __set_long_size(__sz); + } + traits_type::copy(std::__to_address(__p), __s, __sz); + traits_type::assign(__p[__sz], value_type()); + __annotate_new(__sz); +} + +# define STRING_LEGACY_API(CharT) \ + template _LIBCPP_EXPORTED_FROM_ABI void basic_string::__init(const value_type*, size_type, size_type) + +STRING_LEGACY_API(char); +# if _LIBCPP_HAS_WIDE_CHARACTERS +STRING_LEGACY_API(wchar_t); +# endif + +#endif // _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION + #define _LIBCPP_EXTERN_TEMPLATE_DEFINE(...) 
template __VA_ARGS__; #ifdef _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION _LIBCPP_STRING_UNSTABLE_EXTERN_TEMPLATE_LIST(_LIBCPP_EXTERN_TEMPLATE_DEFINE, char) From 248716f814d1d1fef88911d01a0b551d53c87c7a Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sat, 15 Feb 2025 20:15:32 +0100 Subject: [PATCH 031/109] [libc++] Fixes (|multi)_set spaceship operator. (#127326) The operators did not have a _Compare template argument. The fix updates the generic container test to use allocators for all types used. No other issues were found. Fixes: #127095 --- libcxx/include/set | 8 +- .../test/support/test_container_comparisons.h | 266 +++++++++++------- 2 files changed, 168 insertions(+), 106 deletions(-) diff --git a/libcxx/include/set b/libcxx/include/set index 2784e82760d7e..3c6ea360bd06c 100644 --- a/libcxx/include/set +++ b/libcxx/include/set @@ -1003,9 +1003,9 @@ operator<=(const set<_Key, _Compare, _Allocator>& __x, const set<_Key, _Compare, # else // _LIBCPP_STD_VER <= 17 -template +template _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Key> -operator<=>(const set<_Key, _Allocator>& __x, const set<_Key, _Allocator>& __y) { +operator<=>(const set<_Key, _Compare, _Allocator>& __x, const set<_Key, _Compare, _Allocator>& __y) { return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } @@ -1470,9 +1470,9 @@ operator<=(const multiset<_Key, _Compare, _Allocator>& __x, const multiset<_Key, # else // _LIBCPP_STD_VER <= 17 -template +template _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Key> -operator<=>(const multiset<_Key, _Allocator>& __x, const multiset<_Key, _Allocator>& __y) { +operator<=>(const multiset<_Key, _Compare, _Allocator>& __x, const multiset<_Key, _Compare, _Allocator>& __y) { return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), __synth_three_way); } diff --git a/libcxx/test/support/test_container_comparisons.h 
b/libcxx/test/support/test_container_comparisons.h index 543c5899922d0..f7bf78e48a1f8 100644 --- a/libcxx/test/support/test_container_comparisons.h +++ b/libcxx/test/support/test_container_comparisons.h @@ -13,51 +13,52 @@ #include #include +#include "test_allocator.h" #include "test_comparisons.h" // Implementation detail of `test_sequence_container_spaceship` -template