From 190e09e1535d88b6f7c908b2413ca07fbb6511fe Mon Sep 17 00:00:00 2001 From: Gang Tao Date: Fri, 22 May 2026 01:55:58 +0000 Subject: [PATCH] [Celestica][Ladakh800bcls] Fix PAI concurrency crash by passing in sync callbacks --- CMakeLists.txt | 1 + fboss/agent/hw/sai/api/SwitchApi.h | 32 ++++++++++++++- fboss/agent/hw/sai/api/oss/SwitchApi.cpp | 13 +++++++ fboss/agent/platforms/sai/SaiPlatform.cpp | 4 ++ fboss/lib/phy/SaiPhyRetimer.cpp | 47 +++++++++++++++++++++++ fboss/lib/phy/SaiPhyRetimer.h | 7 ++++ 6 files changed, 103 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8102b35c76c5a..682e233c30f8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,6 +94,7 @@ if ($ENV{SAI_BRCM_PAI_IMPL}) # epdm headers include_directories("${SAI_BRCM_PAI_IMPL_INC_DIR}/epdm") + add_definitions (-DSAI_BRCM_PAI_IMPL) # SDK artifacts directory which hosts libpai.a, EPDM SW artifact, PHY Driver # SW artifact set(SAI_BRCM_PAI_IMPL_LIB_DIR "/var/FBOSS/pai_impl/lib") diff --git a/fboss/agent/hw/sai/api/SwitchApi.h b/fboss/agent/hw/sai/api/SwitchApi.h index 403d883513ec4..69e2a4e44f217 100644 --- a/fboss/agent/hw/sai/api/SwitchApi.h +++ b/fboss/agent/hw/sai/api/SwitchApi.h @@ -873,7 +873,26 @@ struct SaiSwitchTraits { SAI_SWITCH_ATTR_SWITCHING_MODE, sai_int32_t, SaiIntDefault>; + +#if defined(SAI_BRCM_PAI_IMPL) + struct AttributeSyncLockWrapper { + std::optional operator()(); + }; + using SyncLock = SaiExtensionAttribute< + sai_pointer_t, + AttributeSyncLockWrapper, + SaiPointerDefault>; + + struct AttributeSyncUnlockWrapper { + std::optional operator()(); + }; + using SyncUnlock = SaiExtensionAttribute< + sai_pointer_t, + AttributeSyncUnlockWrapper, + SaiPointerDefault>; +#endif }; + using AdapterKey = SwitchSaiId; using AdapterHostKey = std::monostate; using CreateAttributes = std::tuple< @@ -976,7 +995,14 @@ struct SaiSwitchTraits { #endif std::optional, std::optional, - std::optional>; + std::optional + +#if defined(SAI_BRCM_PAI_IMPL) + , + std::optional, + std::optional +#endif + >; // Avoid using SAI_SWITCH_STAT_PACKET_INTEGRITY_DROP as that counts // both DramPacketError and EgressRcvPacketError. As we now have a @@ -1176,6 +1202,10 @@ SAI_ATTRIBUTE_NAME(Switch, ModuleIdFabricPortList) SAI_ATTRIBUTE_NAME(Switch, LocalSystemPortIdRangeList) #endif +#if defined(SAI_BRCM_PAI_IMPL) +SAI_ATTRIBUTE_NAME(Switch, SyncLock) +SAI_ATTRIBUTE_NAME(Switch, SyncUnlock) +#endif template <> struct SaiObjectHasStats : public std::true_type {}; diff --git a/fboss/agent/hw/sai/api/oss/SwitchApi.cpp b/fboss/agent/hw/sai/api/oss/SwitchApi.cpp index 69720fde6c7e9..aa383fec2564a 100644 --- a/fboss/agent/hw/sai/api/oss/SwitchApi.cpp +++ b/fboss/agent/hw/sai/api/oss/SwitchApi.cpp @@ -400,4 +400,17 @@ const std::vector& SaiSwitchTraits::deviceWatermarkBytes() { return stats; } +#if defined(SAI_BRCM_PAI_IMPL) +#include + +std::optional +SaiSwitchTraits::Attributes::AttributeSyncLockWrapper::operator()() { + return BRCM_PAI_SWITCH_ATTR_SYNC_LOCK; +} + +std::optional +SaiSwitchTraits::Attributes::AttributeSyncUnlockWrapper::operator()() { + return BRCM_PAI_SWITCH_ATTR_SYNC_UNLOCK; +} +#endif } // namespace facebook::fboss diff --git a/fboss/agent/platforms/sai/SaiPlatform.cpp b/fboss/agent/platforms/sai/SaiPlatform.cpp index 29ad6e70183e5..17ee7f42d4790 100644 --- a/fboss/agent/platforms/sai/SaiPlatform.cpp +++ b/fboss/agent/platforms/sai/SaiPlatform.cpp @@ -1091,6 +1091,10 @@ SaiSwitchTraits::CreateAttributes SaiPlatform::getSwitchAttributes( std::nullopt, // enable PFC monitoring for the switch measureCableLengths, // enable cable propagation delay measurement std::nullopt, // switching mode (store-and-forward / cut-through) +#if defined(SAI_BRCM_PAI_IMPL) + std::nullopt, // SyncLock + std::nullopt, // SyncUnlock +#endif }; } diff --git a/fboss/lib/phy/SaiPhyRetimer.cpp b/fboss/lib/phy/SaiPhyRetimer.cpp index 58035a6330759..29f4fbc7185a7 100644 --- a/fboss/lib/phy/SaiPhyRetimer.cpp +++ b/fboss/lib/phy/SaiPhyRetimer.cpp @@ -21,6 +21,11 @@ #include +DEFINE_bool( + enable_xphy_global_lock, + false, + "Enable/disable global lock for all XPHY chips"); + namespace facebook::fboss::phy { namespace { @@ -34,6 +39,44 @@ FecMode getFecMode(sai_port_fec_mode_t fec, cfg::PortSpeed /* speed */) { static_cast(fec), ". Only SAI_PORT_FEC_MODE_NONE is supported for now"); } + +#if defined(SAI_BRCM_PAI_IMPL) +// Global mutex for fallback, controlled by environment variable +std::mutex gPaiGlobalMutex; + +sai_status_t pai_lock_callback(uint64_t platform_context) { + if (FLAGS_enable_xphy_global_lock) { + gPaiGlobalMutex.lock(); + XLOG(DBG5) << "PAI Sync Lock acquired (GLOBAL)"; + } else { + if (platform_context == 0) { + XLOG(ERR) << "PAI lock callback invoked with a null context."; + return SAI_STATUS_FAILURE; + } + auto* retimer = reinterpret_cast(platform_context); + retimer->getPaiMutex().lock(); + XLOG(DBG5) << "PAI Sync Lock acquired for xphy:" << retimer->getPhyAddr(); + } + return SAI_STATUS_SUCCESS; +} + +sai_status_t pai_unlock_callback(uint64_t platform_context) { + if (FLAGS_enable_xphy_global_lock) { + gPaiGlobalMutex.unlock(); + XLOG(DBG5) << "PAI Sync Lock released (GLOBAL)"; + } else { + if (platform_context == 0) { + // This might happen on a failed lock, so don't log an error + return SAI_STATUS_SUCCESS; + } + auto* retimer = reinterpret_cast(platform_context); + retimer->getPaiMutex().unlock(); + XLOG(DBG5) << "PAI Sync Lock released for xphy:" << retimer->getPhyAddr(); + } + return SAI_STATUS_SUCCESS; +} +#endif + /* * This is a static function for reading a Phy register. This function will be * passed to the SAI layer. The SAI driver will use this function to read a @@ -322,6 +365,10 @@ SaiSwitchTraits::CreateAttributes SaiPhyRetimer::getSwitchAttributes() { std::nullopt, // enable PFC monitoring for the switch std::nullopt, // enable cable propagation delay measurement std::nullopt, // switching mode +#if defined(SAI_BRCM_PAI_IMPL) + (sai_pointer_t)pai_lock_callback, // user sync_lock for pai + (sai_pointer_t)pai_unlock_callback, // user sync_unlock for pai +#endif }; } diff --git a/fboss/lib/phy/SaiPhyRetimer.h b/fboss/lib/phy/SaiPhyRetimer.h index ed0a45353df63..4e376d8e9a8ac 100644 --- a/fboss/lib/phy/SaiPhyRetimer.h +++ b/fboss/lib/phy/SaiPhyRetimer.h @@ -26,6 +26,8 @@ #include #include +#include + namespace facebook::fboss { class SaiPlatform; class StateObserver; @@ -171,6 +173,10 @@ class SaiPhyRetimer : public ExternalPhy, public HwSwitchCallback { void* getRegisterWriteFuncPtr(); SaiSwitchTraits::CreateAttributes getSwitchAttributes(); + std::mutex& getPaiMutex() { + return paiMutex_; + } + protected: void dumpImpl() const; @@ -186,6 +192,7 @@ class SaiPhyRetimer : public ExternalPhy, public HwSwitchCallback { SaiPlatform* platform_; BspPhyIO* xphyIO_; std::optional switchId_; + std::mutex paiMutex_; }; } // namespace facebook::fboss::phy