From f6c1d8ab05adaec16a0e217c12625a1f6b0458f3 Mon Sep 17 00:00:00 2001 From: Bram Oosterhuis Date: Tue, 12 May 2026 21:34:17 +0200 Subject: [PATCH 1/2] Implement StateMachine for plugin lifecycle management - Introduced a StateMachine class to manage the lifecycle states of plugins, including DEACTIVATED, PRECONDITION, ACTIVATED, HIBERNATED, and UNAVAILABLE. - Defined stable and transient states with clear guarantees for callbacks during state transitions. - Added detailed documentation for state transitions, callback guarantees, reentrancy, and crash handling. - Refactored existing plugin state handling to utilize the new StateMachine, improving clarity and maintainability. - Updated the Service class to integrate the StateMachine and handle state change notifications. - Removed redundant locks and streamlined the Evaluate method to leverage the StateMachine's re-evaluation capabilities. --- Source/Thunder/PluginServer.cpp | 872 ++++++++++++++++++-------------- Source/Thunder/PluginServer.h | 444 ++++++++++++++-- 2 files changed, 894 insertions(+), 422 deletions(-) diff --git a/Source/Thunder/PluginServer.cpp b/Source/Thunder/PluginServer.cpp index b86da1bbd..6180f0e63 100644 --- a/Source/Thunder/PluginServer.cpp +++ b/Source/Thunder/PluginServer.cpp @@ -294,18 +294,9 @@ namespace PluginHost { asIUnknown == false ? result = static_cast(this) : result = static_cast(this); } else { - _queryInterfaceLock.Lock(); - if ((State() == IShell::state::ACTIVATED) || (State() == IShell::state::DEACTIVATION)) { //needed as we only want to send plugin state notifications when the plugin is active or deactivating which is not guaranteed by the lock itself as it comes from the same thread handling the activation and deactivation - if (id == PluginHost::IDispatcher::ID) { - if (_jsonrpc != nullptr) { - _jsonrpc->AddRef(); - asIUnknown == false ? result = _jsonrpc : result = static_cast(_jsonrpc); - } - } else if (_handler != nullptr) { - result = _handler->QueryInterface(id, asIUnknown); - } - } - _queryInterfaceLock.Unlock(); + // Route through the state machine — only ACTIVATED and states that + // explicitly override QueryInterface will return a non-null result. + result = _stateMachine.QueryInterface(id, asIUnknown); } return (result); @@ -338,485 +329,625 @@ namespace PluginHost { // Methods to stop/start/update the service. Core::hresult Server::Service::Activate(const PluginHost::IShell::reason why) /* override */ { - Core::hresult result = Core::ERROR_NONE; + return _stateMachine.Activate(why); + } - Lock(); + uint32_t Server::Service::Resume(const reason why) /* override */ { + uint32_t result = Core::ERROR_NONE; + // Read state snapshot and release Lock() before calling Activate(). + // Holding Lock() across Activate() inverts the lock order: + // Resume: Lock() → _transitionLock (via Activate) + // Transitions: _transitionLock → Lock() + // That inversion is a deadlock. The snapshot is acceptable — Resume() + // was already operating on a snapshot since state could change between + // the check and the Activate() call anyway. + Lock(); IShell::state currentState(State()); + Unlock(); if (currentState == IShell::state::ACTIVATION) { - Unlock(); result = Core::ERROR_INPROGRESS; - } - else if ((currentState == IShell::state::UNAVAILABLE) || (currentState == IShell::state::DEACTIVATION) || (currentState == IShell::state::DESTROYED) ) { - Unlock(); + } else if ((currentState == IShell::state::DEACTIVATION) || (currentState == IShell::state::DESTROYED) || (currentState == IShell::state::HIBERNATED)) { result = Core::ERROR_ILLEGAL_STATE; - } else if (currentState == IShell::state::HIBERNATED) { - result = Wakeup(3000); + } else if (currentState == IShell::state::DEACTIVATED) { + result = Activate(why); + Lock(); + currentState = State(); Unlock(); - } else if ((currentState == IShell::state::DEACTIVATED) || (currentState == IShell::state::PRECONDITION)) { - - _reason = why; - - _queryInterfaceLock.Lock(); + } - // Load the interfaces, If we did not load them yet... - if (_handler == nullptr) { - AcquireInterfaces(); + if (currentState == IShell::ACTIVATED) { + Lock(); + // See if we need can and should RESUME. + if (_stateControl == nullptr) { + result = Core::ERROR_BAD_REQUEST; } + else { + // We have a StateControl interface, so at least start resuming, if not already resumed :-) + if (_stateControl->State() == PluginHost::IStateControl::SUSPENDED) { + result = _stateControl->Request(PluginHost::IStateControl::RESUME); + } + } + Unlock(); + } - const string callSign(PluginHost::Service::Configuration().Callsign.Value()); - const string className(PluginHost::Service::Configuration().ClassName.Value()); - - if (_handler == nullptr) { - SYSLOG(Logging::Startup, (_T("Loading of plugin [%s]:[%s], failed. Error [%s]"), className.c_str(), callSign.c_str(), ErrorMessage().c_str())); - result = Core::ERROR_UNAVAILABLE; - _reason = reason::INSTANTIATION_FAILED; - State(DEACTIVATED); - - _queryInterfaceLock.Unlock(); - - Unlock(); - - // See if the preconditions have been met.. - } else if (_precondition.IsMet() == false) { - SYSLOG(Logging::Startup, (_T("Activation of plugin [%s]:[%s], postponed, preconditions have not been met, yet."), className.c_str(), callSign.c_str())); - result = Core::ERROR_PENDING_CONDITIONS; - State(PRECONDITION); + return (result); + } - _queryInterfaceLock.Unlock(); + Core::hresult Server::Service::Deactivate(const reason why) /* override */ + { + return _stateMachine.Deactivate(why); + } - if (Thunder::Messaging::LocalLifetimeType::IsEnabled() == true) { + uint32_t Server::Service::Suspend(const reason why) { - string feedback; - uint8_t index = 1; - uint32_t delta(_precondition.Delta(_administrator.SubSystemInfo().Value())); + uint32_t result = Core::ERROR_NONE; - while (delta != 0) { - if ((delta & 0x01) != 0) { - if (feedback.empty() == false) { - feedback += ','; - } + if (StartMode() == PluginHost::IShell::startmode::DEACTIVATED) { + // We need to shutdown completely + result = Deactivate(why); + } + else { + Lock(); - PluginHost::ISubSystem::subsystem element(static_cast(index)); - feedback += string(Core::EnumerateType(element).Data()); - } + IShell::state currentState(State()); - delta = (delta >> 1); - index++; + if (currentState == IShell::state::DEACTIVATION) { + result = Core::ERROR_INPROGRESS; + } else if ((currentState == IShell::state::ACTIVATION) || (currentState == IShell::state::DESTROYED) || (currentState == IShell::state::HIBERNATED)) { + result = Core::ERROR_ILLEGAL_STATE; + } else if ((currentState == IShell::state::ACTIVATED) || (currentState == IShell::state::PRECONDITION)) { + // See if we need can and should SUSPEND. + if (_stateControl == nullptr) { + result = Core::ERROR_BAD_REQUEST; + } + else { + // We have a StateControl interface, so at least start suspending, if not already suspended :-) + if (_stateControl->State() == PluginHost::IStateControl::RESUMED) { + result = _stateControl->Request(PluginHost::IStateControl::SUSPEND); } - - TRACE(Activity, (_T("Delta preconditions: %s"), feedback.c_str())); } + } - Unlock(); + Unlock(); + } - } else { + return (result); + } - // Before we dive into the "new" initialize lets see if this has a pending OOP running, if so forcefully kill it now, no time to wait ! - if (_lastId != 0) { - _administrator.Destroy(_lastId); - _lastId = 0; - } + Core::hresult Server::Service::Unavailable(const reason why) /* override */ { + return _stateMachine.Unavailable(why); + } - TRACE(Activity, (_T("Activation plugin [%s]:[%s]"), className.c_str(), callSign.c_str())); + Core::hresult Server::Service::Hibernate(const uint32_t timeout VARIABLE_IS_NOT_USED) /* override */ { + return _stateMachine.Hibernate(timeout); + } - _administrator.Initialize(callSign, this); + // ------------------------------------------------------------------------- + // StateMachine — static state instance definitions + // ------------------------------------------------------------------------- - State(ACTIVATION); + Server::Service::StateMachine::DeactivatedState Server::Service::StateMachine::_stateDeactivated; + Server::Service::StateMachine::PreconditionState Server::Service::StateMachine::_statePrecondition; + Server::Service::StateMachine::ActivationState Server::Service::StateMachine::_stateActivation; + Server::Service::StateMachine::ActivatedState Server::Service::StateMachine::_stateActivated; + Server::Service::StateMachine::DeactivationState Server::Service::StateMachine::_stateDeactivation; + Server::Service::StateMachine::HibernatedState Server::Service::StateMachine::_stateHibernated; + Server::Service::StateMachine::UnavailableState Server::Service::StateMachine::_stateUnavailable; - Unlock(); + // ------------------------------------------------------------------------- + // DeactivatedState + // ------------------------------------------------------------------------- - REPORT_DURATION_WARNING( { ErrorMessage(_handler->Initialize(this)); }, WarningReporting::TooLongPluginState, WarningReporting::TooLongPluginState::StateChange::ACTIVATION, callSign.c_str()); + Core::hresult Server::Service::StateMachine::DeactivatedState::Activate(StateMachine& sm, const reason why) + { + const string callSign(sm._parent.PluginHost::Service::Configuration().Callsign.Value()); - if (HasError() == true) { - result = Core::ERROR_GENERAL; + sm._parent.Lock(); + sm._parent._reason = why; - SYSLOG(Logging::Startup, (_T("Activation of plugin [%s]:[%s], failed. Error [%s]"), className.c_str(), callSign.c_str(), ErrorMessage().c_str())); + Core::hresult result = sm._parent.LoadPlugin(); - _reason = reason::INITIALIZATION_FAILED; + if (result == Core::ERROR_UNAVAILABLE) { + sm.SetState(_stateDeactivated); + sm._parent.Unlock(); + return result; + } + if (result == Core::ERROR_PENDING_CONDITIONS) { + sm.SetState(_statePrecondition); + sm._parent.Unlock(); + return result; + } - if( _administrator.Configuration().LegacyInitialize() == false ) { - REPORT_DURATION_WARNING({ _handler->Deinitialize(this); }, WarningReporting::TooLongPluginState, WarningReporting::TooLongPluginState::StateChange::DEACTIVATION, callSign.c_str()); - } + sm._parent._administrator.Initialize(callSign, &sm._parent); + sm.SetState(_stateActivation); + sm._parent.Unlock(); - Lock(); - ReleaseInterfaces(); - State(DEACTIVATED); - Unlock(); + result = sm._parent.InitializePlugin(); - _queryInterfaceLock.Unlock(); + if (result != Core::ERROR_NONE) { + sm._parent.Lock(); + sm._parent._reason = IShell::reason::INITIALIZATION_FAILED; + sm._parent.UnloadPlugin(); + sm.SetState(_stateDeactivated); + sm._parent.Unlock(); + sm._callback(IShell::DEACTIVATED); + return result; + } - _administrator.Deinitialized(callSign, this); + sm._parent.Attach(); - } else { + sm._parent.Lock(); + sm.SetState(_stateActivated); + sm._parent.Unlock(); - const Core::EnumerateType textReason(why); - const string webUI(PluginHost::Service::Configuration().WebUI.Value()); - if ((PluginHost::Service::Configuration().WebUI.IsSet()) || (webUI.empty() == false)) { - EnableWebServer(webUI, EMPTY_STRING); - } + sm._callback(IShell::ACTIVATED); + return Core::ERROR_NONE; + } - if (_jsonrpc != nullptr) { - PluginHost::IShell::IConnectionServer::INotification* sink = nullptr; - _jsonrpc->Attach(sink, this); - if (sink != nullptr) { - Register(sink); - sink->Release(); - } - } + Core::hresult Server::Service::StateMachine::DeactivatedState::Unavailable(StateMachine& sm, const reason why) + { + if (sm._parent.AllowedUnavailable() == false) { + return Core::ERROR_NOT_SUPPORTED; + } - if (_external.Connector().empty() == false) { - uint32_t result = _external.Open(0); - if ((result != Core::ERROR_NONE) && (result != Core::ERROR_INPROGRESS)) { - TRACE(Trace::Error, (_T("Could not open the external connector for %s"), Callsign().c_str())); - } - } + const string callSign(sm._parent.PluginHost::Service::Configuration().Callsign.Value()); + const string className(sm._parent.PluginHost::Service::Configuration().ClassName.Value()); - SYSLOG(Logging::Startup, (_T("Activated plugin [%s]:[%s]"), className.c_str(), callSign.c_str())); - Lock(); - State(ACTIVATED); + sm._parent.Lock(); + sm._parent._reason = why; + SYSLOG(Logging::Shutdown, (_T("Unavailable plugin [%s]:[%s]"), className.c_str(), callSign.c_str())); + TRACE(Activity, (Core::Format(_T("Unavailable plugin [%s]:[%s]"), className.c_str(), callSign.c_str()))); + sm.SetState(_stateUnavailable); + sm._parent.Unlock(); - _queryInterfaceLock.Unlock(); + sm._callback(IShell::UNAVAILABLE); + return Core::ERROR_NONE; + } - _administrator.Activated(callSign, this); + // ------------------------------------------------------------------------- + // PreconditionState + // ------------------------------------------------------------------------- - _stateControl = _handler->QueryInterface(); - if (_stateControl != nullptr) { - _stateControl->Register(&_composit); + Core::hresult Server::Service::StateMachine::PreconditionState::Deactivate(StateMachine& sm, const reason why) + { + sm._parent.Lock(); + sm._parent._reason = why; + const IShell::state finalState(why == IShell::CONDITIONS ? IShell::PRECONDITION : IShell::DEACTIVATED); + sm.SetState(why == IShell::CONDITIONS ? static_cast(_statePrecondition) : static_cast(_stateDeactivated)); + sm._parent.Unlock(); - if (Resumed() == true) { - _stateControl->Request(PluginHost::IStateControl::RESUME); - } - } + sm._parent.UnloadPlugin(); - Unlock(); + sm._callback(finalState); + return Core::ERROR_NONE; + } - #ifdef THUNDER_RESTFULL_API - Notify(EMPTY_STRING, string(_T("{\"state\":\"activated\",\"reason\":\"")) + textReason.Data() + _T("\"}")); - #endif - Notify(_T("statechange"), string(_T("{\"state\":\"activated\",\"reason\":\"")) + textReason.Data() + _T("\"}")); - } - } + void Server::Service::StateMachine::PreconditionState::Reevaluate(StateMachine& sm) + { + sm._parent.Lock(); + const uint32_t subsystems(sm._parent._administrator.SubSystemInfo().Value()); + + if ((sm._parent._precondition.Evaluate(subsystems) == true) && (sm._parent._precondition.IsMet() == true)) { + sm._parent.Unlock(); + sm._Activate(sm._parent._reason); } else { - Unlock(); + sm._parent.Unlock(); } - - return (result); } - uint32_t Server::Service::Resume(const reason why) /* override */ { - uint32_t result = Core::ERROR_NONE; - Lock(); + // ------------------------------------------------------------------------- + // ActivationState — only INITIALIZATION_FAILED deactivation is legal + // ------------------------------------------------------------------------- - IShell::state currentState(State()); - - if (currentState == IShell::state::ACTIVATION) { - result = Core::ERROR_INPROGRESS; - } else if ((currentState == IShell::state::DEACTIVATION) || (currentState == IShell::state::DESTROYED) || (currentState == IShell::state::HIBERNATED)) { - result = Core::ERROR_ILLEGAL_STATE; - } else if (currentState == IShell::state::DEACTIVATED) { - result = Activate(why); - currentState = State(); - } - - if (currentState == IShell::ACTIVATED) { - // See if we need can and should RESUME. - if (_stateControl == nullptr) { - result = Core::ERROR_BAD_REQUEST; - } - else { - // We have a StateControl interface, so at least start resuming, if not already resumed :-) - if (_stateControl->State() == PluginHost::IStateControl::SUSPENDED) { - result = _stateControl->Request(PluginHost::IStateControl::RESUME); - } - } + Core::hresult Server::Service::StateMachine::ActivationState::Deactivate(StateMachine& sm, const reason why) + { + if (why != IShell::reason::INITIALIZATION_FAILED) { + return Core::ERROR_ILLEGAL_STATE; } - Unlock(); + const string callSign(sm._parent.PluginHost::Service::Configuration().Callsign.Value()); - return (result); - } - - Core::hresult Server::Service::Deactivate(const reason why) /* override */ - { - Core::hresult result = Core::ERROR_NONE; + sm._parent.Lock(); + sm._parent._reason = why; + sm.SetState(_stateDeactivation); + sm._parent.Unlock(); - Lock(); + sm.Evaluate(); // temporarily releases _transitionLock — safe, DEACTIVATION is set + Server::PostMortem(sm._parent, why, sm._parent._connection); + sm._parent.Detach(); - IShell::state currentState(State()); + sm._parent.DeinitializePlugin(); + sm._parent.Lock(); - if (currentState == IShell::state::DEACTIVATION) { - result = Core::ERROR_INPROGRESS; - Unlock(); - } - else if ( ((currentState == IShell::state::ACTIVATION) && (why != IShell::reason::INITIALIZATION_FAILED)) || (currentState == IShell::state::DESTROYED)) { - result = Core::ERROR_ILLEGAL_STATE; - Unlock(); - } - else if ( ((currentState == IShell::state::ACTIVATION) && (why == IShell::reason::INITIALIZATION_FAILED)) || (currentState == IShell::state::UNAVAILABLE) || (currentState == IShell::state::ACTIVATED) || (currentState == IShell::state::PRECONDITION) || (currentState == IShell::state::HIBERNATED) ) { - const Core::EnumerateType textReason(why); + sm.SetState(_stateDeactivated); + sm._parent.UnloadPlugin(); + sm._parent.Unlock(); - const string className(PluginHost::Service::Configuration().ClassName.Value()); - const string callSign(PluginHost::Service::Configuration().Callsign.Value()); + sm._callback(IShell::DEACTIVATED); + return Core::ERROR_NONE; + } - _reason = why; + // ------------------------------------------------------------------------- + // ActivatedState + // ------------------------------------------------------------------------- - if(currentState == IShell::state::HIBERNATED) - { - uint32_t wakeupResult = Wakeup(3000); - if(wakeupResult != Core::ERROR_NONE) - { - //Force Activated state - State(ACTIVATED); - } - currentState = ACTIVATED; - } + Core::hresult Server::Service::StateMachine::ActivatedState::Deactivate(StateMachine& sm, const reason why) + { + const string callSign(sm._parent.PluginHost::Service::Configuration().Callsign.Value()); - if ( (currentState == IShell::ACTIVATION) || (currentState == IShell::ACTIVATED)) { - ASSERT(_handler != nullptr); + sm._parent.Lock(); + sm._parent._reason = why; - State(DEACTIVATION); + SystemInfo& systeminfo = sm._parent._administrator.SubSystemInfo(); + for (const PluginHost::ISubSystem::subsystem sys : sm._parent.SubSystemControl()) { + systeminfo.Unset(sys); + } - SystemInfo& systeminfo = _administrator.SubSystemInfo(); + // Set transient state before Evaluate() so RecursiveNotification sees + // DEACTIVATION and short-circuits — no-op in DeactivationState::Reevaluate. + sm.SetState(_stateDeactivation); + sm._parent.Unlock(); - // On behalf of the plugin stop all subsystems it was controlling. - for (const PluginHost::ISubSystem::subsystem sys : SubSystemControl()) { - systeminfo.Unset(sys); - } + // Temporarily releases _transitionLock — safe because DEACTIVATION is set. + sm.Evaluate(); - Unlock(); + // Fire Deactivated() while _handler is still alive — subscribers may + // safely call QueryInterface during this notification. + sm._parent._administrator.Deactivated(callSign, &sm._parent); - // Reevaluate status. In case some plugins were dependant on the subsystems - // that have been just disabled, deactivate them too, recursively. - _administrator.Evaluate(); + Server::PostMortem(sm._parent, why, sm._parent._connection); + sm._parent.Detach(); - // And finally start tearing down this plugin... + sm._parent.DeinitializePlugin(); + sm._parent.Lock(); - if (_stateControl != nullptr) { - _stateControl->Unregister(&_composit); - _stateControl->Release(); - _stateControl = nullptr; - } + const IShell::state finalState(why == IShell::CONDITIONS ? IShell::PRECONDITION : IShell::DEACTIVATED); + sm.SetState(why == IShell::CONDITIONS ? static_cast(_statePrecondition) : static_cast(_stateDeactivated)); + sm._parent.UnloadPlugin(); + sm._parent.Unlock(); - if (currentState == IShell::ACTIVATED) { - TRACE(Activity, (_T("Deactivating plugin [%s]:[%s]"), className.c_str(), callSign.c_str())); - _administrator.Deactivated(callSign, this); - } - - // We might require PostMortem analyses if the reason is not really clear. Call the PostMortum installed so it can generate - // required logs/OS information before we start to kill it. - Server::PostMortem(*this, why, _connection); + sm._callback(finalState); + return Core::ERROR_NONE; + } - // If we enabled the webserver, we should also disable it. - if ((PluginHost::Service::Configuration().WebUI.IsSet()) || (PluginHost::Service::Configuration().WebUI.Value().empty() == false)) { - DisableWebServer(); - } + Core::hresult Server::Service::StateMachine::ActivatedState::Hibernate(StateMachine& sm, const uint32_t timeout VARIABLE_IS_NOT_USED) + { + if (sm._parent.AllowedHibernate() == false) { + return Core::ERROR_NOT_SUPPORTED; + } - _queryInterfaceLock.Lock(); + sm._parent.Lock(); - REPORT_DURATION_WARNING( { _handler->Deinitialize(this); }, WarningReporting::TooLongPluginState, WarningReporting::TooLongPluginState::StateChange::DEACTIVATION, callSign.c_str()); + if (sm._parent._connection == nullptr) { + sm._parent.Unlock(); + return Core::ERROR_INPROC; + } - Lock(); + RPC::IMonitorableProcess* local = sm._parent._connection->QueryInterface(); - if (currentState != IShell::state::ACTIVATION) { - SYSLOG(Logging::Shutdown, (_T("Deactivated plugin [%s]:[%s]"), className.c_str(), callSign.c_str())); + if (local == nullptr) { + sm._parent.Unlock(); + return Core::ERROR_BAD_REQUEST; + } -#ifdef THUNDER_RESTFULL_API - Notify(EMPTY_STRING, string(_T("{\"state\":\"deactivated\",\"reason\":\"")) + textReason.Data() + _T("\"}")); -#endif - Notify(_T("statechange"), string(_T("{\"state\":\"deactivated\",\"reason\":\"")) + textReason.Data() + _T("\"}")); - } + // Set HIBERNATED under lock as the in-progress guard. + // Note: this is set before the blocking HibernateProcess() call — the process + // is still running at this point. A HIBERNATING transient state would be + // more accurate but requires an IShell::state enum change (tracked as debt). + sm.SetState(_stateHibernated); - if (_external.Connector().empty() == false) { - _external.Close(0); - } +#ifdef HIBERNATE_SUPPORT_ENABLED + pid_t parentPID = local->ParentPID(); + local->Release(); + sm._parent.Unlock(); + + TRACE(Activity, (_T("Hibernation of plugin [%s] process [%u]"), sm._parent.Callsign().c_str(), parentPID)); + Core::hresult result = HibernateProcess(timeout, parentPID, sm._parent._administrator.Configuration().HibernateLocator().c_str(), _T(""), &sm._parent._hibernateStorage); + + sm._parent.Lock(); + // Note: in the original design this checked whether a concurrent Wakeup() or + // Deactivate() had changed state during HibernateProcess(). Under the new design + // _transitionLock is held throughout — no concurrent transition can run — so this + // check can never fire. Retained as a defensive guard only. + if (sm._parent.State() != IShell::HIBERNATED) { + SYSLOG(Logging::Startup, (_T("Hibernation aborted of plugin [%s] process [%u]"), sm._parent.Callsign().c_str(), parentPID)); + sm._parent.Unlock(); + return Core::ERROR_ABORTED; + } + sm._parent.Unlock(); - if (_jsonrpc != nullptr) { - PluginHost::IShell::IConnectionServer::INotification* sink = nullptr; + if (result == HIBERNATE_ERROR_NONE) { + result = sm._parent.HibernateChildren(parentPID, timeout); + } - _jsonrpc->Detach(sink); + if (result != Core::ERROR_NONE && result != Core::ERROR_ABORTED) { + // Rollback — try to wake the parent process to recover from failed hibernation. + TRACE(Activity, (_T("Wakeup plugin [%s] process [%u] on Hibernate error [%d]"), sm._parent.Callsign().c_str(), parentPID, result)); + WakeupProcess(timeout, parentPID, sm._parent._administrator.Configuration().HibernateLocator().c_str(), _T(""), &sm._parent._hibernateStorage); + } - if (sink != nullptr) { - Unregister(sink); - sink->Release(); - } - } + sm._parent.Lock(); +#else + local->Release(); + Core::hresult result = Core::ERROR_NONE; +#endif + // coverity[DEADCODE] — on the non-HIBERNATE_ENABLED path result is always + // ERROR_NONE here, making the else-if appear unreachable to Coverity. + // Both branches are reachable when HIBERNATE_SUPPORT_ENABLED is defined. + if (result == Core::ERROR_NONE) { + if (sm._parent.State() == IShell::state::HIBERNATED) { + SYSLOG(Logging::Startup, ("Hibernated plugin [%s]:[%s]", sm._parent.ClassName().c_str(), sm._parent.Callsign().c_str())); + sm._parent.Unlock(); + sm._callback(IShell::HIBERNATED); + } else { + // Wakeup occurred right after hibernation finished. + SYSLOG(Logging::Startup, ("Hibernation aborted of plugin [%s]:[%s]", sm._parent.ClassName().c_str(), sm._parent.Callsign().c_str())); + sm._parent.Unlock(); + result = Core::ERROR_ABORTED; } + } else if (sm._parent.State() == IShell::state::HIBERNATED) { + // Hibernation failed — roll back state to ACTIVATED. + sm.SetState(_stateActivated); + SYSLOG(Logging::Startup, (_T("Hibernation error [%d] of [%s]:[%s]"), result, sm._parent.ClassName().c_str(), sm._parent.Callsign().c_str())); + sm._parent.Unlock(); + sm._callback(IShell::ACTIVATED); + } else { + sm._parent.Unlock(); + } - State(why == CONDITIONS ? PRECONDITION : DEACTIVATED); - - // We have no need for his module anymore.. - ReleaseInterfaces(); - Unlock(); + return result; + } - if ((currentState == IShell::ACTIVATION) || (currentState == IShell::ACTIVATED)) { - _queryInterfaceLock.Unlock(); - } + void Server::Service::StateMachine::ActivatedState::Reevaluate(StateMachine& sm) + { + sm._parent.Lock(); + const uint32_t subsystems(sm._parent._administrator.SubSystemInfo().Value()); - _administrator.Deinitialized(callSign, this); + if ((sm._parent._termination.Evaluate(subsystems) == true) && (sm._parent._termination.IsMet() == false)) { + sm._parent.Unlock(); + sm._Deactivate(IShell::CONDITIONS); } else { - Unlock(); + sm._parent.Unlock(); } - - return (result); } - uint32_t Server::Service::Suspend(const reason why) { + void* Server::Service::StateMachine::ActivatedState::QueryInterface(StateMachine& sm, const uint32_t id, const bool asIUnknown) + { + // Take a ref-counted local copy of _handler under _pluginHandling. + // This keeps _handler alive across the lock boundary even if + // UnloadPlugin() runs concurrently on another thread. + sm._parent._pluginHandling.Lock(); + + if (id == PluginHost::IDispatcher::ID) { + // Fast path: return the cached _jsonrpc pointer directly rather than + // routing through _handler->QueryInterface(). This is intentional. + // _jsonrpc is set once in AcquireInterfaces() via + // _handler->QueryInterface() and is the same pointer the + // plugin would return. Bypassing the plugin's own QueryInterface avoids + // a potential lock inversion if the plugin's QI acquires internal locks. + // Assumption: plugins do not conditionally expose IDispatcher at runtime. + // If a plugin gates IDispatcher on runtime state, this fast path will + // return the cached pointer regardless — revisit if that pattern emerges. + IDispatcher* jsonrpc = sm._parent._jsonrpc; + if (jsonrpc != nullptr) { + jsonrpc->AddRef(); + } + sm._parent._pluginHandling.Unlock(); + if (jsonrpc != nullptr) { + return asIUnknown == false ? static_cast(jsonrpc) : static_cast(static_cast(jsonrpc)); + } + return nullptr; + } - uint32_t result = Core::ERROR_NONE; + IPlugin* handler = sm._parent._handler; + if (handler != nullptr) { + handler->AddRef(); + } + sm._parent._pluginHandling.Unlock(); - if (StartMode() == PluginHost::IShell::startmode::DEACTIVATED) { - // We need to shutdown completely - result = Deactivate(why); + if (handler == nullptr) { + return nullptr; } - else { - Lock(); - IShell::state currentState(State()); + void* result = handler->QueryInterface(id, asIUnknown); + handler->Release(); + return result; + } - if (currentState == IShell::state::DEACTIVATION) { - result = Core::ERROR_INPROGRESS; - } else if ((currentState == IShell::state::ACTIVATION) || (currentState == IShell::state::DESTROYED) || (currentState == IShell::state::HIBERNATED)) { - result = Core::ERROR_ILLEGAL_STATE; - } else if ((currentState == IShell::state::ACTIVATED) || (currentState == IShell::state::PRECONDITION)) { - // See if we need can and should SUSPEND. - if (_stateControl == nullptr) { - result = Core::ERROR_BAD_REQUEST; - } - else { - // We have a StateControl interface, so at least start suspending, if not already suspended :-) - if (_stateControl->State() == PluginHost::IStateControl::RESUMED) { - result = _stateControl->Request(PluginHost::IStateControl::SUSPEND); - } - } - } + // ------------------------------------------------------------------------- + // HibernatedState + // ------------------------------------------------------------------------- - Unlock(); + Core::hresult Server::Service::StateMachine::HibernatedState::Activate(StateMachine& sm, const reason why VARIABLE_IS_NOT_USED) + { + const Core::hresult result = sm._parent.Wakeup(3000); + if (result == Core::ERROR_NONE) { + sm.SetState(_stateActivated); + sm._callback(IShell::ACTIVATED); } + return result; + } - return (result); + Core::hresult Server::Service::StateMachine::HibernatedState::Deactivate(StateMachine& sm, const reason why) + { + // Always update _current before dispatching regardless of Wakeup result. + // If _current is not updated and Wakeup succeeds, _Deactivate dispatches + // back to HibernatedState::Deactivate → infinite recursion → stack overflow. + sm._parent.Wakeup(3000); // best-effort, errors are non-fatal here + sm.SetState(_stateActivated); // _current must be updated before _Deactivate + return sm._Deactivate(why); } - Core::hresult Server::Service::Unavailable(const reason why) /* override */ { - Core::hresult result = Core::ERROR_NONE; + // ------------------------------------------------------------------------- + // UnavailableState + // ------------------------------------------------------------------------- - if (AllowedUnavailable() == true) { + Core::hresult Server::Service::StateMachine::UnavailableState::Deactivate(StateMachine& sm, const reason why) + { + sm._parent.Lock(); + sm._parent._reason = why; + sm.SetState(_stateDeactivated); + sm._parent.Unlock(); + sm._callback(IShell::DEACTIVATED); + return Core::ERROR_NONE; + } - Lock(); + // ------------------------------------------------------------------------- + // Work methods — called by StateMachine, one concern each + // ------------------------------------------------------------------------- - IShell::state currentState(State()); + Core::hresult Server::Service::LoadPlugin() + { + ASSERT(_stateMachine.IsTransitionThread()); + // Pre: Lock() held by caller (state transition in progress) + const string callSign(PluginHost::Service::Configuration().Callsign.Value()); + const string className(PluginHost::Service::Configuration().ClassName.Value()); - if ((currentState == IShell::state::DEACTIVATION) || (currentState == IShell::state::ACTIVATION) || (currentState == IShell::state::DESTROYED) || (currentState == IShell::state::ACTIVATED) || (currentState == IShell::state::PRECONDITION) || (currentState == IShell::state::HIBERNATED)) { - result = Core::ERROR_ILLEGAL_STATE; - Unlock(); - } else if (currentState == IShell::state::DEACTIVATED) { + if (_handler == nullptr) { + AcquireInterfaces(); + } + + if (_handler == nullptr) { + SYSLOG(Logging::Startup, (_T("Loading of plugin [%s]:[%s], failed. Error [%s]"), className.c_str(), callSign.c_str(), ErrorMessage().c_str())); + _reason = reason::INSTANTIATION_FAILED; + return Core::ERROR_UNAVAILABLE; + } + + if (_precondition.IsMet() == false) { + SYSLOG(Logging::Startup, (_T("Activation of plugin [%s]:[%s], postponed, preconditions have not been met, yet."), className.c_str(), callSign.c_str())); + + if (Thunder::Messaging::LocalLifetimeType::IsEnabled() == true) { + string feedback; + uint8_t index = 1; + uint32_t delta(_precondition.Delta(_administrator.SubSystemInfo().Value())); - const Core::EnumerateType textReason(why); + while (delta != 0) { + if ((delta & 0x01) != 0) { + if (feedback.empty() == false) { + feedback += ','; + } + PluginHost::ISubSystem::subsystem element(static_cast(index)); + feedback += string(Core::EnumerateType(element).Data()); + } + delta = (delta >> 1); + index++; + } + TRACE(Activity, (_T("Delta preconditions: %s"), feedback.c_str())); + } - const string className(PluginHost::Service::Configuration().ClassName.Value()); - const string callSign(PluginHost::Service::Configuration().Callsign.Value()); + return Core::ERROR_PENDING_CONDITIONS; + } - _reason = why; + if (_lastId != 0) { + _administrator.Destroy(_lastId); + _lastId = 0; + } - SYSLOG(Logging::Shutdown, (_T("Unavailable plugin [%s]:[%s]"), className.c_str(), callSign.c_str())); + TRACE(Activity, (_T("Activation plugin [%s]:[%s]"), className.c_str(), callSign.c_str())); + return Core::ERROR_NONE; + } - TRACE(Activity, (Core::Format(_T("Unavailable plugin [%s]:[%s]"), className.c_str(), callSign.c_str()))); + Core::hresult Server::Service::InitializePlugin() + { + ASSERT(_stateMachine.IsTransitionThread()); + // Pre: no service Lock() — blocking call + const string callSign(PluginHost::Service::Configuration().Callsign.Value()); + const string className(PluginHost::Service::Configuration().ClassName.Value()); - State(UNAVAILABLE); - _administrator.Unavailable(callSign, this); + REPORT_DURATION_WARNING({ ErrorMessage(_handler->Initialize(this)); }, WarningReporting::TooLongPluginState, WarningReporting::TooLongPluginState::StateChange::ACTIVATION, callSign.c_str()); - Unlock(); + if (HasError() == true) { + SYSLOG(Logging::Startup, (_T("Activation of plugin [%s]:[%s], failed. Error [%s]"), className.c_str(), callSign.c_str(), ErrorMessage().c_str())); -#ifdef THUNDER_RESTFULL_API - Notify(EMPTY_STRING, string(_T("{\"state\":\"unavailable\",\"reason\":\"")) + textReason.Data() + _T("\"}")); -#endif - Notify(_T("statechange"), string(_T("{\"state\":\"unavailable\",\"reason\":\"")) + textReason.Data() + _T("\"}")); - } else { - Unlock(); + if (_administrator.Configuration().LegacyInitialize() == false) { + REPORT_DURATION_WARNING({ _handler->Deinitialize(this); }, WarningReporting::TooLongPluginState, WarningReporting::TooLongPluginState::StateChange::DEACTIVATION, callSign.c_str()); } - } else { - result = Core::ERROR_NOT_SUPPORTED; + + return Core::ERROR_GENERAL; } - return (result); + SYSLOG(Logging::Startup, (_T("Activated plugin [%s]:[%s]"), className.c_str(), callSign.c_str())); + return Core::ERROR_NONE; } - Core::hresult Server::Service::Hibernate(const uint32_t timeout VARIABLE_IS_NOT_USED) /* override */ { - Core::hresult result = Core::ERROR_NONE; + void Server::Service::DeinitializePlugin() + { + ASSERT(_stateMachine.IsTransitionThread()); + // Pre: no service Lock() — blocking call + const string callSign(PluginHost::Service::Configuration().Callsign.Value()); + REPORT_DURATION_WARNING({ _handler->Deinitialize(this); }, WarningReporting::TooLongPluginState, WarningReporting::TooLongPluginState::StateChange::DEACTIVATION, callSign.c_str()); + SYSLOG(Logging::Shutdown, (_T("Deactivated plugin [%s]:[%s]"), ClassName().c_str(), callSign.c_str())); + } - if (AllowedHibernate() == true) { + void Server::Service::UnloadPlugin() + { + ASSERT(_stateMachine.IsTransitionThread()); + // Pre: Lock() held by caller + ReleaseInterfaces(); + } - Lock(); + void Server::Service::Attach() + { + ASSERT(_stateMachine.IsTransitionThread()); + // Pre: no locks held + const string webUI(PluginHost::Service::Configuration().WebUI.Value()); + if ((PluginHost::Service::Configuration().WebUI.IsSet()) || (webUI.empty() == false)) { + EnableWebServer(webUI, EMPTY_STRING); + } - IShell::state currentState(State()); + if (_jsonrpc != nullptr) { + PluginHost::IShell::IConnectionServer::INotification* sink = nullptr; + _jsonrpc->Attach(sink, this); + if (sink != nullptr) { + Register(sink); + sink->Release(); + } + } - if (currentState != IShell::state::ACTIVATED) { - result = Core::ERROR_ILLEGAL_STATE; - } else if (_connection == nullptr) { - result = Core::ERROR_INPROC; - } else { - // Oke we have an Connection so there is something to Hibernate.. - RPC::IMonitorableProcess* local = _connection->QueryInterface(); + if (_external.Connector().empty() == false) { + uint32_t result = _external.Open(0); + if ((result != Core::ERROR_NONE) && (result != Core::ERROR_INPROGRESS)) { + TRACE(Trace::Error, (_T("Could not open the external connector for %s"), Callsign().c_str())); + } + } - if (local == nullptr) { - result = Core::ERROR_BAD_REQUEST; - } else { - State(IShell::HIBERNATED); -#ifdef HIBERNATE_SUPPORT_ENABLED - pid_t parentPID = local->ParentPID(); - local->Release(); - Unlock(); + _stateControl = _handler->QueryInterface(); + if (_stateControl != nullptr) { + _stateControl->Register(&_composit); + if (Resumed() == true) { + _stateControl->Request(PluginHost::IStateControl::RESUME); + } + } + } - TRACE(Activity, (_T("Hibernation of plugin [%s] process [%u]"), Callsign().c_str(), parentPID)); - result = HibernateProcess(timeout, parentPID, _administrator.Configuration().HibernateLocator().c_str(), _T(""), &_hibernateStorage); - Lock(); - if (State() != IShell::HIBERNATED) { - SYSLOG(Logging::Startup, (_T("Hibernation aborted of plugin [%s] process [%u]"), Callsign().c_str(), parentPID)); - result = Core::ERROR_ABORTED; - } - Unlock(); + void Server::Service::Detach() + { + ASSERT(_stateMachine.IsTransitionThread()); + // Pre: no locks held — reverse of Attach() + if (_stateControl != nullptr) { + _stateControl->Unregister(&_composit); + _stateControl->Release(); + _stateControl = nullptr; + } - if (result == HIBERNATE_ERROR_NONE) { - result = HibernateChildren(parentPID, timeout); - } + if ((PluginHost::Service::Configuration().WebUI.IsSet()) || (PluginHost::Service::Configuration().WebUI.Value().empty() == false)) { + DisableWebServer(); + } - if (result != Core::ERROR_NONE && result != Core::ERROR_ABORTED) { - // try to wakeup Parent process to revert Hibernation and recover - TRACE(Activity, (_T("Wakeup plugin [%s] process [%u] on Hibernate error [%d]"), Callsign().c_str(), parentPID, result)); - WakeupProcess(timeout, parentPID, _administrator.Configuration().HibernateLocator().c_str(), _T(""), &_hibernateStorage); - } + if (_external.Connector().empty() == false) { + _external.Close(0); + } - Lock(); -#else - local->Release(); - result = Core::ERROR_NONE; -#endif - // coverity[DEADCODE] - On the non-HIBERNATE_ENABLED path result is always - // ERROR_NONE here, making the else-if appear unreachable to Coverity. - // Both branches are reachable when HIBERNATE_ENABLED is defined. - if (result == Core::ERROR_NONE) { - if (State() == IShell::state::HIBERNATED) { - SYSLOG(Logging::Startup, ("Hibernated plugin [%s]:[%s]", ClassName().c_str(), Callsign().c_str())); - } else { - // wakeup occured right after hibernation finished - SYSLOG(Logging::Startup, ("Hibernation aborted of plugin [%s]:[%s]", ClassName().c_str(), Callsign().c_str())); - result = Core::ERROR_ABORTED; - } - } else if (State() == IShell::state::HIBERNATED) { - State(IShell::ACTIVATED); - SYSLOG(Logging::Startup, (_T("Hibernation error [%d] of [%s]:[%s]"), result, ClassName().c_str(), Callsign().c_str())); - } - } + if (_jsonrpc != nullptr) { + PluginHost::IShell::IConnectionServer::INotification* sink = nullptr; + _jsonrpc->Detach(sink); + if (sink != nullptr) { + Unregister(sink); + sink->Release(); } - Unlock(); - } else { - - result = Core::ERROR_NOT_SUPPORTED; } - - return (result); - } uint32_t Server::Service::Wakeup(const uint32_t timeout VARIABLE_IS_NOT_USED) { @@ -849,6 +980,11 @@ namespace PluginHost { result = Core::ERROR_NONE; #endif if (result == Core::ERROR_NONE) { + // Updates base class _state only — _current in StateMachine is NOT updated. + // Callers (HibernatedState::Activate and HibernatedState::Deactivate) are + // responsible for calling SetState(_stateActivated) immediately after Wakeup() + // returns to close the window where State() == ACTIVATED but _current still + // points to HibernatedState (which returns nullptr from QueryInterface). State(ACTIVATED); SYSLOG(Logging::Startup, (_T("Activated plugin from hibernation [%s]:[%s]"), ClassName().c_str(), Callsign().c_str())); } @@ -1214,7 +1350,7 @@ namespace PluginHost { { TRACE(Activity, (_T("Destruct a link with ID [%d] to [%s]"), Id(), RemoteId().c_str())); - // If we are still atatched to a service, detach, we are out of scope... + // If we are still attached to a service, detach, we are out of scope... CleanupService(); if (_security != nullptr) { diff --git a/Source/Thunder/PluginServer.h b/Source/Thunder/PluginServer.h index 56ec288ee..2b6cef155 100644 --- a/Source/Thunder/PluginServer.h +++ b/Source/Thunder/PluginServer.h @@ -683,6 +683,297 @@ namespace PluginHost { uint32_t _mask; state _state; }; + // ----------------------------------------------------------------------- + // StateMachine — lifecycle contract + // + // This documents the guarantees the state machine makes to plugin code, + // subscribers, and future maintainers. + // + // STATES + // Stable states (externally observable): + // DEACTIVATED — plugin is not loaded. _handler is nullptr. + // PRECONDITION — plugin is loaded but subsystem conditions not met. + // _handler exists, Initialize() has not been called. + // ACTIVATED — plugin is running. _handler, _jsonrpc, _stateControl + // are all valid. + // HIBERNATED — OOP process is frozen. Interfaces are valid but + // the process is suspended. + // UNAVAILABLE — explicitly marked not available. _handler is nullptr. + // + // Transient states (in-progress guards): + // ACTIVATION — Initialize() is running. All triggers return + // ERROR_INPROGRESS except Deactivate(INITIALIZATION_FAILED). + // DEACTIVATION — Deinitialize() is running. All triggers return + // ERROR_INPROGRESS. + // + // CALLBACK GUARANTEES + // The Callback registered at construction fires once per stable state + // transition, after the transition is complete and all locks are released. + // The state machine is in the new stable state when the callback fires. + // + // What is observable from within a callback: + // ACTIVATION fires: Initialize() has not yet been called. + // QueryInterface returns nullptr (ACTIVATION state). + // ACTIVATED fires: Initialize() completed successfully. + // Attach() completed. QueryInterface is valid. + // DEACTIVATION fires: Deinitialize() has not yet been called. + // QueryInterface is still valid (_handler alive). + // DEACTIVATED fires: Deinitialize() and UnloadPlugin() completed. + // QueryInterface returns nullptr. + // UNAVAILABLE fires: state set. No plugin lifecycle calls were made. + // + // What is legal from within a callback: + // - QueryInterface on this service: legal for ACTIVATED and DEACTIVATION + // callbacks. Returns nullptr for all other states. + // - Triggers on OTHER services: legal. Each service has its own + // _transitionLock. + // - Triggers on THIS service: illegal. _transitionLock is still held. + // Calling a public trigger from within a callback on the same thread + // fires ASSERT in debug builds and deadlocks in release builds. + // + // REENTRANCY + // Public triggers (Activate, Deactivate, Hibernate, Unavailable, + // Reevaluate) are serialized by _transitionLock. At most one transition + // executor exists per Service at any time. + // + // Internal triggers (_Activate, _Deactivate) bypass _transitionLock and + // are only legal from within an active transition (state class methods). + // ASSERT(_transitionThread == Core::Thread::ThreadId()) guards misuse. + // + // QueryInterface never holds _transitionLock and is never blocked by + // an active transition. It uses _pluginHandling only. + // + // PROCESS CRASH HANDLING + // When an OOP plugin process crashes during ACTIVATED state: + // - The RPC connection detects the disconnect and calls back into + // Service via the IConnectionServer::INotification interface. + // - Deactivate(FAILURE) is triggered, which follows the normal + // ACTIVATED -> DEACTIVATION -> DEACTIVATED path. + // - DeinitializePlugin() calls _handler->Deinitialize() on the + // now-dead proxy — this is safe because Thunder's COM-RPC layer + // handles disconnected proxies without crashing. + // - ReleaseInterfaces() terminates and releases the connection. + // - _lastId records the connection ID so the next Activate() can + // forcefully kill any zombie OOP process via _administrator.Destroy(). + // + // During ACTIVATION (crash mid-Initialize()): + // - REPORT_DURATION_WARNING detects the failure via HasError(). + // - Deactivate(INITIALIZATION_FAILED) is the expected recovery path. + // - ActivationState::Deactivate handles this as the sole legal + // deactivation from the ACTIVATION transient state. + // + // During DEACTIVATION (crash mid-Deinitialize()): + // - DeinitializePlugin() completes without crashing (dead proxy). + // - Normal teardown continues. State reaches DEACTIVATED. + // - The zombie process is cleaned up on the next Activate() via + // _administrator.Destroy(_lastId). + // ----------------------------------------------------------------------- + + // ----------------------------------------------------------------------- + // StateMachine — lock ordering + // + // Three locks are in play within Service. They must always be acquired + // in the order listed below. Acquiring in any other order is a deadlock. + // + // 1. _transitionLock (StateMachine) + // Outermost. Held for the full duration of a lifecycle transition + // including the post-transition callback. Ensures exactly one + // transition executor exists at any time. + // NOT held during QueryInterface. + // + // 2. Service::Lock() (_adminLock in PluginHost::Service base) + // Held briefly inside a transition to guard state mutation and + // subsystem bit changes. Never held across blocking plugin calls. + // + // 3. _pluginHandling (Service) + // Innermost. Held briefly in QueryInterface and ReleaseInterfaces + // to guard _handler lifetime. Never held while calling into plugin. + // + // Special cases: + // - _transitionLock is intentionally released around sm.Evaluate() + // to allow RecursiveNotification to call Reevaluate() on this + // service without deadlocking. Safe because SetState(DEACTIVATION) + // is always called before the release — the transient state rejects + // all concurrent operations. + // + // - _notificationLock (Notifiers) is independent of all three locks + // above. It is never held during foreign code — observer lists are + // snapshotted under the lock and callbacks fire after it is released. + // + // - QueryInterface never holds _transitionLock. It uses _pluginHandling + // only, and only ActivatedState::QueryInterface reaches plugin code. + // ----------------------------------------------------------------------- + class StateMachine { + public: + // Base state — default implementations return ERROR_ILLEGAL_STATE. + // Concrete states only override what is legal for them. + class StateBase { + public: + virtual ~StateBase() = default; + virtual IShell::state Id() const = 0; + virtual Core::hresult Activate(StateMachine&, const reason) { return Core::ERROR_ILLEGAL_STATE; } + virtual Core::hresult Deactivate(StateMachine&, const reason) { return Core::ERROR_ILLEGAL_STATE; } + virtual Core::hresult Hibernate(StateMachine&, const uint32_t) { return Core::ERROR_ILLEGAL_STATE; } + virtual Core::hresult Unavailable(StateMachine&, const reason) { return Core::ERROR_ILLEGAL_STATE; } + virtual void Reevaluate(StateMachine&) {} + virtual void* QueryInterface(StateMachine&, const uint32_t, const bool) { return nullptr; } + }; + + class DeactivatedState : public StateBase { + public: + IShell::state Id() const override { return IShell::DEACTIVATED; } + Core::hresult Activate(StateMachine&, const reason) override; + Core::hresult Unavailable(StateMachine&, const reason) override; + }; + + class PreconditionState : public StateBase { + public: + IShell::state Id() const override { return IShell::PRECONDITION; } + Core::hresult Deactivate(StateMachine&, const reason) override; + void Reevaluate(StateMachine&) override; + }; + + class ActivationState : public StateBase { + public: + IShell::state Id() const override { return IShell::ACTIVATION; } + Core::hresult Activate(StateMachine&, const reason) override { return Core::ERROR_INPROGRESS; } + Core::hresult Deactivate(StateMachine&, const reason) override; + Core::hresult Hibernate(StateMachine&, const uint32_t) override { return Core::ERROR_INPROGRESS; } + Core::hresult Unavailable(StateMachine&, const reason) override { return Core::ERROR_INPROGRESS; } + }; + + class ActivatedState : public StateBase { + public: + IShell::state Id() const override { return IShell::ACTIVATED; } + Core::hresult Deactivate(StateMachine&, const reason) override; + Core::hresult Hibernate(StateMachine&, const uint32_t timeout) override; + void Reevaluate(StateMachine&) override; + void* QueryInterface(StateMachine&, const uint32_t id, const bool asIUnknown) override; + }; + + class DeactivationState : public StateBase { + public: + IShell::state Id() const override { return IShell::DEACTIVATION; } + Core::hresult Activate(StateMachine&, const reason) override { return Core::ERROR_INPROGRESS; } + Core::hresult Deactivate(StateMachine&, const reason) override { return Core::ERROR_INPROGRESS; } + Core::hresult Hibernate(StateMachine&, const uint32_t) override { return Core::ERROR_INPROGRESS; } + Core::hresult Unavailable(StateMachine&, const reason) override { return Core::ERROR_INPROGRESS; } + }; + + class HibernatedState : public StateBase { + public: + IShell::state Id() const override { return IShell::HIBERNATED; } + Core::hresult Activate(StateMachine&, const reason) override; + Core::hresult Deactivate(StateMachine&, const reason) override; + }; + + class UnavailableState : public StateBase { + public: + IShell::state Id() const override { return IShell::UNAVAILABLE; } + Core::hresult Deactivate(StateMachine&, const reason) override; + }; + + public: + using Callback = std::function; + + StateMachine() = delete; + StateMachine(StateMachine&&) = delete; + StateMachine(const StateMachine&) = delete; + StateMachine& operator=(StateMachine&&) = delete; + StateMachine& operator=(const StateMachine&) = delete; + + StateMachine(Service& parent, Callback callback) + : _parent(parent) + , _callback(std::move(callback)) + , _transitionLock() + , _transitionThread(0) + , _current(&_stateDeactivated) + { + } + ~StateMachine() = default; + + public: + inline IShell::state Current() const { + return _current.load(std::memory_order_acquire)->Id(); + } + + inline void SetState(StateBase& newState) { + _current.store(&newState, std::memory_order_release); + // PluginHost::Service::State(value) is a plain _state = value assignment + // with no lock — calling this while holding _parent.Lock() is safe. + _parent.State(newState.Id()); + } + + inline bool IsTransitionThread() const { + return _transitionThread.load(std::memory_order_acquire) == Core::Thread::ThreadId(); + } + + // Public triggers — acquire transition lock before dispatching. + // Guarantees exactly one transition executor at a time. + // QueryInterface intentionally does NOT take this lock. + Core::hresult Activate(const reason why) { ASSERT(!IsTransitionThread()); Core::SafeSyncType guard(_transitionLock); _transitionThread = Core::Thread::ThreadId(); Core::hresult result = _Activate(why); _transitionThread = 0; return result; } + Core::hresult Deactivate(const reason why) { ASSERT(!IsTransitionThread()); Core::SafeSyncType guard(_transitionLock); _transitionThread = Core::Thread::ThreadId(); Core::hresult result = _Deactivate(why); _transitionThread = 0; return result; } + Core::hresult Hibernate(const uint32_t timeout) { ASSERT(!IsTransitionThread()); Core::SafeSyncType guard(_transitionLock); _transitionThread = Core::Thread::ThreadId(); Core::hresult result = _current.load(std::memory_order_acquire)->Hibernate(*this, timeout); _transitionThread = 0; return result; } + Core::hresult Unavailable(const reason why) { ASSERT(!IsTransitionThread()); Core::SafeSyncType guard(_transitionLock); _transitionThread = Core::Thread::ThreadId(); Core::hresult result = _current.load(std::memory_order_acquire)->Unavailable(*this, why); _transitionThread = 0; return result; } + void Reevaluate() { ASSERT(_transitionThread != Core::Thread::ThreadId()); Core::SafeSyncType guard(_transitionLock); _transitionThread = Core::Thread::ThreadId(); _current.load(std::memory_order_acquire)->Reevaluate(*this); _transitionThread = 0; } + void* QueryInterface(const uint32_t id, const bool asIUnknown) { return _current.load(std::memory_order_acquire)->QueryInterface(*this, id, asIUnknown); } + + // Internal triggers — no lock. Called by state class methods + // that are already executing under _transitionLock. + Core::hresult _Activate(const reason why) { ASSERT(IsTransitionThread()); return _current.load(std::memory_order_acquire)->Activate(*this, why); } + Core::hresult _Deactivate(const reason why) { ASSERT(IsTransitionThread()); return _current.load(std::memory_order_acquire)->Deactivate(*this, why); } + + // Called from within a state method to trigger cascading re-evaluation + // of dependent services. Temporarily releases _transitionLock so + // RecursiveNotification can call Reevaluate() on this service without + // deadlocking. Safe because SetState(DEACTIVATION) must be called + // before this — the transient state rejects concurrent operations. + // + // DEBT: this is controlled lock borrowing, not RAII-managed ownership. + // The ASSERT(IsTransitionThread()) guards against misuse in debug builds + // but compiles out in release. If _transitionLock is ever not held when + // this is called in release, Unlock() on an unheld mutex is UB. + // Mitigation: IsTransitionThread() is also asserted on all internal + // triggers (_Activate, _Deactivate) that are the only legal callers of + // state methods — making it structurally difficult to reach Evaluate() + // outside a transition without triggering an earlier assert. + void Evaluate() { + ASSERT(IsTransitionThread()); + _transitionThread = 0; + _transitionLock.Unlock(); + _parent._administrator.Evaluate(); + _transitionLock.Lock(); + _transitionThread = Core::Thread::ThreadId(); + } + + // Static state instances — shared across all plugins. + // Safe because state objects carry no per-plugin data. + static DeactivatedState _stateDeactivated; + static PreconditionState _statePrecondition; + static ActivationState _stateActivation; + static ActivatedState _stateActivated; + static DeactivationState _stateDeactivation; + static HibernatedState _stateHibernated; + static UnavailableState _stateUnavailable; + + // State classes access Service members via _parent (C++11 nested class access rules) + private: + Service& _parent; + Callback _callback; + mutable Core::CriticalSection _transitionLock; + // Debug: tracks which thread owns the current transition. + // ASSERT fires if a public trigger is called re-entrantly + // from within a callback or state method on the same thread. + // Atomic because the ASSERT reads it before _transitionLock + // is acquired — concurrent writes from other threads must be safe. + // Compiled out in release builds. + std::atomic _transitionThread; + // Atomic pointer — written by SetState() during transitions, + // read concurrently by QueryInterface() and trigger methods. + // Preserves State pattern dispatch without a mutex on reads. + std::atomic _current; + }; + class ControlData { public: ControlData() = delete; @@ -831,7 +1122,6 @@ namespace PluginHost { Service(const PluginHost::Config& server, const Plugin::Config& plugin, ServiceMap& administrator, const mode type, const Core::ProxyType& handler) : PluginHost::Service(plugin, server.WebPrefix(), server.PersistentPath(), server.DataPath(), server.VolatilePath()) , _pluginHandling() - , _queryInterfaceLock() , _handler(nullptr) , _extended(nullptr) , _webRequest(nullptr) @@ -854,6 +1144,38 @@ namespace PluginHost { , _composit(*this) , _jobs(administrator) , _type(type) + , _stateMachine(*this, [this](const IShell::state newState) { + const string callSign(PluginHost::Service::Configuration().Callsign.Value()); + const Core::EnumerateType textReason(_reason); + switch (newState) { + case IShell::ACTIVATED: + _administrator.Activated(callSign, this); + #ifdef THUNDER_RESTFULL_API + Notify(EMPTY_STRING, string(_T("{\"state\":\"activated\",\"reason\":\"")) + textReason.Data() + _T("\"}")); + #endif + Notify(_T("statechange"), string(_T("{\"state\":\"activated\",\"reason\":\"")) + textReason.Data() + _T("\"}")); + break; + case IShell::DEACTIVATED: + _administrator.Deinitialized(callSign, this); + #ifdef THUNDER_RESTFULL_API + Notify(EMPTY_STRING, string(_T("{\"state\":\"deactivated\",\"reason\":\"")) + textReason.Data() + _T("\"}")); + #endif + Notify(_T("statechange"), string(_T("{\"state\":\"deactivated\",\"reason\":\"")) + textReason.Data() + _T("\"}")); + break; + case IShell::PRECONDITION: + _administrator.Deinitialized(callSign, this); + break; + case IShell::UNAVAILABLE: + _administrator.Unavailable(callSign, this); + #ifdef THUNDER_RESTFULL_API + Notify(EMPTY_STRING, string(_T("{\"state\":\"unavailable\",\"reason\":\"")) + textReason.Data() + _T("\"}")); + #endif + Notify(_T("statechange"), string(_T("{\"state\":\"unavailable\",\"reason\":\"")) + textReason.Data() + _T("\"}")); + break; + default: + break; + } + }) { _jobs.Slots(_metadata.MaxRequests()); } @@ -1221,36 +1543,7 @@ namespace PluginHost { } inline void Evaluate() { - Lock(); - - uint32_t subsystems = _administrator.SubSystemInfo().Value(); - - IShell::state current(State()); - - // Active or not, update the condition state !!!! - if ((_precondition.Evaluate(subsystems) == true) && (current == IShell::PRECONDITION)) { - if (_precondition.IsMet() == true) { - - Unlock(); - - Activate(_reason); - - Lock(); - } - } - - if ((_termination.Evaluate(subsystems) == true) && (current == IShell::ACTIVATED)) { - if (_termination.IsMet() == false) { - - Unlock(); - - Deactivate(IShell::CONDITIONS); - - Lock(); - } - } - - Unlock(); + _stateMachine.Reevaluate(); } inline bool PostMortemAllowed(PluginHost::IShell::reason why) const { return (_administrator.Configuration().PostMortemAllowed(why)); @@ -1438,6 +1731,16 @@ namespace PluginHost { } private: + // Work methods called by StateMachine during transitions. + // Each does exactly one thing. No guard logic, no state changes, + // no notifications — those are all owned by StateMachine. + Core::hresult LoadPlugin(); // AcquireInterfaces + Core::hresult InitializePlugin(); // _handler->Initialize() + void DeinitializePlugin(); // _handler->Deinitialize() + void UnloadPlugin(); // ReleaseInterfaces + void Attach(); // WebServer + JSONRPC + external connector + StateControl + void Detach(); // reverse of Attach + uint32_t Wakeup(const uint32_t timeout); #ifdef HIBERNATE_SUPPORT_ENABLED @@ -1691,8 +1994,6 @@ namespace PluginHost { private: mutable Core::CriticalSection _pluginHandling; - mutable Core::CriticalSection _queryInterfaceLock; // a little shortcut to protect both the QueryInterface from Plugin deinitializing which could make the QueryInterface to the OOP part of a plugin crash (and to facilitate sending the Plugin Deactivated notifications while still allow QueryInterface to be called) - // the whole plugin state handling including locking needs a redesign... // The handlers that implement the actual logic behind the service IPlugin* _handler; @@ -1720,6 +2021,7 @@ namespace PluginHost { Core::SinkType _composit; Jobs _jobs; mode _type; + StateMachine _stateMachine; static Core::ProxyType _unavailableHandler; static Core::ProxyType _missingHandler; @@ -2766,55 +3068,89 @@ namespace PluginHost { } void Notify(const string& callsign, PluginHost::IShell* entry, Core::hresult (PluginHost::IPlugin::INotification::*notificatonmethod)(const string& callsign, IShell* plugin)) { + // Phase 1 — snapshot observer list under lock. + // Observers are AddRef'd so they stay alive across the unlock boundary. + // Foreign code never runs while _notificationLock is held. + std::vector snapshot; + _notificationLock.Lock(); + snapshot.reserve(_notifiers.size()); + for (auto& notifier : _notifiers) { + if (notifier.second.SendNotification(callsign, entry) == true) { + notifier.first->AddRef(); + snapshot.push_back(notifier.first); + } + } + _notificationLock.Unlock(); - auto it = _notifiers.begin(); - while (it != _notifiers.end()) { - if (it->second.SendNotification(callsign, entry) == true) { - Core::hresult result = (it->first->*notificatonmethod)(callsign, entry); - if (result == Core::ERROR_CANCEL) { - PluginHost::IPlugin::INotification* foundnotification = it->first; + // Phase 2 — fire callbacks outside lock. + // Observers may now safely call Unregister() or trigger transitions. + std::vector toRemove; + for (PluginHost::IPlugin::INotification* observer : snapshot) { + if ((observer->*notificatonmethod)(callsign, entry) == Core::ERROR_CANCEL) { + toRemove.push_back(observer); + } + observer->Release(); + } + + // Phase 3 — remove ERROR_CANCEL observers under lock. + // Preserves the ERROR_CANCEL API contract without holding + // _notificationLock during foreign code execution. + if (toRemove.empty() == false) { + _notificationLock.Lock(); + for (PluginHost::IPlugin::INotification* observer : toRemove) { + auto range = _notifiers.equal_range(observer); + for (auto it = range.first; it != range.second; ++it) { it = _notifiers.erase(it); - foundnotification->Release(); - foundnotification = nullptr; - } else { - ++it; } - } else { - ++it; + observer->Release(); } + _notificationLock.Unlock(); } - - _notificationLock.Unlock(); } void Notify(const string& callsign, PluginHost::IShell* entry, void (PluginHost::IPlugin::ILifeTime::*notificatonmethod)(const string& callsign, IShell* plugin)) const { - _notificationLock.Lock(); + // Phase 1 — snapshot under lock. + std::vector snapshot; + _notificationLock.Lock(); + snapshot.reserve(_notifiers.size()); for (const auto& notifier : _notifiers) { if (notifier.second.SendNotification(callsign, entry) == true) { PluginHost::IPlugin::ILifeTime* lifeTime = notifier.first->QueryInterface(); if (lifeTime != nullptr) { - (lifeTime->*notificatonmethod)(callsign, entry); - lifeTime->Release(); + snapshot.push_back(lifeTime); } } } - _notificationLock.Unlock(); + + // Phase 2 — fire outside lock. ILifeTime has no ERROR_CANCEL — no removal needed. + for (PluginHost::IPlugin::ILifeTime* lifeTime : snapshot) { + (lifeTime->*notificatonmethod)(callsign, entry); + lifeTime->Release(); + } } template void Visit(FUNCTION function) const { - _notificationLock.Lock(); + // Snapshot under lock — function may call back into ServiceMap. + std::vector snapshot; + _notificationLock.Lock(); + snapshot.reserve(_notifiers.size()); for (const auto& notifier : _notifiers) { - function(notifier.first); + notifier.first->AddRef(); + snapshot.push_back(notifier.first); } - _notificationLock.Unlock(); + + for (PluginHost::IPlugin::INotification* observer : snapshot) { + function(observer); + observer->Release(); + } } private: From 1eb1ac30e420f59f18f1a3fc15d06f5dbf085c4c Mon Sep 17 00:00:00 2001 From: Bram Oosterhuis Date: Wed, 13 May 2026 16:29:05 +0200 Subject: [PATCH 2/2] Use RAII to safely yield transition ownership in Evaluate() --- Source/Thunder/PluginServer.h | 44 +++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/Source/Thunder/PluginServer.h b/Source/Thunder/PluginServer.h index 2b6cef155..6bdb8b70f 100644 --- a/Source/Thunder/PluginServer.h +++ b/Source/Thunder/PluginServer.h @@ -923,27 +923,37 @@ namespace PluginHost { Core::hresult _Activate(const reason why) { ASSERT(IsTransitionThread()); return _current.load(std::memory_order_acquire)->Activate(*this, why); } Core::hresult _Deactivate(const reason why) { ASSERT(IsTransitionThread()); return _current.load(std::memory_order_acquire)->Deactivate(*this, why); } + struct TransitionYield { + TransitionYield() = delete; + TransitionYield(const TransitionYield&) = delete; + TransitionYield(TransitionYield&&) = delete; + TransitionYield& operator=(const TransitionYield&) = delete; + TransitionYield& operator=(TransitionYield&&) = delete; + + explicit TransitionYield(StateMachine& sm) + : _sm(sm) + { + _sm._transitionThread = 0; + _sm._transitionLock.Unlock(); + } + ~TransitionYield() + { + _sm._transitionLock.Lock(); + _sm._transitionThread = Core::Thread::ThreadId(); + } + StateMachine& _sm; + }; + // Called from within a state method to trigger cascading re-evaluation - // of dependent services. Temporarily releases _transitionLock so - // RecursiveNotification can call Reevaluate() on this service without - // deadlocking. Safe because SetState(DEACTIVATION) must be called - // before this — the transient state rejects concurrent operations. - // - // DEBT: this is controlled lock borrowing, not RAII-managed ownership. - // The ASSERT(IsTransitionThread()) guards against misuse in debug builds - // but compiles out in release. If _transitionLock is ever not held when - // this is called in release, Unlock() on an unheld mutex is UB. - // Mitigation: IsTransitionThread() is also asserted on all internal - // triggers (_Activate, _Deactivate) that are the only legal callers of - // state methods — making it structurally difficult to reach Evaluate() - // outside a transition without triggering an earlier assert. + // of dependent services. Temporarily yields _transitionLock via + // TransitionYield so RecursiveNotification can call Reevaluate() on + // this service without deadlocking. Safe because SetState(DEACTIVATION) + // must be called before this — the transient state rejects all + // concurrent operations during the yield window. void Evaluate() { ASSERT(IsTransitionThread()); - _transitionThread = 0; - _transitionLock.Unlock(); + TransitionYield yield(*this); _parent._administrator.Evaluate(); - _transitionLock.Lock(); - _transitionThread = Core::Thread::ThreadId(); } // Static state instances — shared across all plugins.