From 7188b88dc3b323e8dc7511edd6937dd7e16a0a58 Mon Sep 17 00:00:00 2001 From: "david.zagury" Date: Thu, 30 Apr 2026 19:53:14 +0300 Subject: [PATCH 1/2] featured: fix stale FEATURE entry after concurrent package uninstall When sonic-package-manager uninstalls a package it deletes the FEATURE entry from CONFIG_DB. Because featured unblocks spm (via STATE_DB) before finishing its own handler, the entry can be gone by the time featured tries to write back to it. Two code paths independently recreate the deleted entry, leaving a ghost record that survives the uninstall and appears in "show feature status": 1. sync_feature_scope(): called after a successful disable_feature(), it calls mod_entry({has_per_asic_scope, has_global_scope}) on the already- deleted key. Redis HSET recreates the hash with only those two fields (no 'state'), so the next notification arrives with state=None and featured logs "ERR: Unexpected state value 'None'". There is also a TOCTOU gap: spm can delete the entry between the mod_entry calls and the post-write read, producing the same partial entry. 2. resync_feature_state(): called when update_feature_state() fails (e.g. systemctl start fails because the service file was already removed by the concurrent uninstall). get_entry() returns {} for a deleted key, _feature_state_is_template(None) evaluates True, and mod_entry writes {state: disabled} back - recreating the entry with no other fields. The same function also completes partial entries left by (1): a non- empty entry with no 'state' key passes the deleted-entry guard and receives a state field, producing a 3-field ghost record. Fix all paths: - sync_feature_scope: after the two mod_entry writes, re-read the entry. If it exists but has no 'state' key, the write raced with deregister; delete the partial entry immediately. - resync_feature_state: return early if the entry is fully absent. If the entry exists but has no 'state' key, delete it rather than completing the ghost record. - update_feature_state: suppress the ERR log when state is None, since that is a normal consequence of the race rather than a programming error. Signed-off-by: david.zagury --- scripts/featured | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/scripts/featured b/scripts/featured index 24cfa77..1d443d2 100644 --- a/scripts/featured +++ b/scripts/featured @@ -266,8 +266,12 @@ class FeatureHandler(object): return False if not enable and not disable: - syslog.syslog(syslog.LOG_ERR, "Unexpected state value '{}' for feature {}" - .format(feature.state, feature.name)) + # state=None means the FEATURE entry was deleted by a concurrent uninstall + # while this notification was queued — not an error. Any other non-None + # value here is a genuinely unexpected state and should be flagged. + if feature.state is not None: + syslog.syslog(syslog.LOG_ERR, "Unexpected state value '{}' for feature {}" + .format(feature.state, feature.name)) return False if feature.delayed and not self.is_delayed_enabled: @@ -323,6 +327,12 @@ class FeatureHandler(object): return self._config_db.mod_entry(FEATURE_TBL, feature_config.name, {'has_per_asic_scope': str(feature_config.has_per_asic_scope)}) self._config_db.mod_entry(FEATURE_TBL, feature_config.name, {'has_global_scope': str(feature_config.has_global_scope)}) + # If the entry was deleted by a concurrent deregister, mod_entry recreates it + # without a 'state' field. Remove the partial entry to avoid a ghost record. + post_entry = self._config_db.get_entry(FEATURE_TBL, feature_config.name) + if post_entry and 'state' not in post_entry: + self._config_db.set_entry(FEATURE_TBL, feature_config.name, None) + return # sync has_per_asic_scope to CONFIG_DB in namespaces in multi-asic platform for ns, db in self.ns_cfg_db.items(): @@ -524,7 +534,17 @@ class FeatureHandler(object): def resync_feature_state(self, feature): current_entry = self._config_db.get_entry('FEATURE', feature.name) - current_feature_state = current_entry.get('state') if current_entry else None + + # Entry was deleted by a concurrent deregister — do not recreate it. + if not current_entry: + return + + # Partial entry left by sync_feature_scope racing with deregister — remove it. + if 'state' not in current_entry: + self._config_db.set_entry('FEATURE', feature.name, None) + return + + current_feature_state = current_entry.get('state') if feature.state == current_feature_state: return From dd9a46e393e328aabc90dae16a053619a060a9a1 Mon Sep 17 00:00:00 2001 From: "david.zagury" Date: Wed, 6 May 2026 10:33:12 +0300 Subject: [PATCH 2/2] featured: update and extend tests for deregister race guards test_feature_resync: the sub-case where get_entry returns None previously asserted that mod_entry would be called with the rendered state. That behaviour was the root of the bug -- resync_feature_state was recreating a FEATURE row that had been deleted by a concurrent deregister. Update the assertion to assert_not_called(), matching the new guard that returns early when the entry is absent. Add a further sub-case for a partial entry (row exists but has no 'state' key, the TOCTOU artifact left by sync_feature_scope racing with deregister): resync_feature_state must call set_entry(None) to remove the ghost record rather than completing it with mod_entry. test_sync_feature_scope_toctou: new test for the post-write TOCTOU guard in sync_feature_scope. Uses side_effect to return a live entry on the pre-write check and a partial entry (no 'state' key) on the post-write check, simulating the window where the FEATURE row is deleted and recreated by mod_entry during the race. sync_feature_scope must call set_entry(None) to clean it up. Signed-off-by: david.zagury --- tests/featured/featured_test.py | 44 +++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/tests/featured/featured_test.py b/tests/featured/featured_test.py index a563075..b2444f1 100644 --- a/tests/featured/featured_test.py +++ b/tests/featured/featured_test.py @@ -282,7 +282,8 @@ def test_feature_resync(self): } mock_db.get_entry.return_value = None feature_handler.sync_state_field(feature_table) - mock_db.mod_entry.assert_called_with('FEATURE', 'sflow', {'state': 'enabled'}) + # Missing FEATURE row: resync_feature_state skips mod_entry (avoid recreating a row removed concurrently). + mock_db.mod_entry.assert_not_called() mock_db.mod_entry.reset_mock() feature_handler = featured.FeatureHandler(mock_db, mock_feature_state_table, {}, False) @@ -321,7 +322,46 @@ def test_feature_resync(self): } feature_handler.sync_state_field(feature_table) mock_db.mod_entry.assert_called_with('FEATURE', 'sflow', {'state': 'enabled'}) - + mock_db.mod_entry.reset_mock() + + # Partial entry (no 'state' key): left behind by sync_feature_scope racing with + # deregister. resync_feature_state must delete it rather than completing it. + feature_handler = featured.FeatureHandler(mock_db, mock_feature_state_table, {}, False) + mock_db.get_entry.return_value = { + 'has_per_asic_scope': 'False', + 'has_global_scope': 'True', + } + mock_db.set_entry = mock.MagicMock() + feature_handler.sync_state_field(feature_table) + mock_db.set_entry.assert_called_with('FEATURE', 'sflow', None) + mock_db.mod_entry.assert_not_called() + + def test_sync_feature_scope_toctou(self): + """sync_feature_scope must delete a partial entry it accidentally creates when + the FEATURE row is deleted by a concurrent deregister between the mod_entry + calls and the post-write existence check (TOCTOU).""" + mock_db = mock.MagicMock() + mock_feature_state_table = mock.MagicMock() + + feature_handler = featured.FeatureHandler(mock_db, mock_feature_state_table, {}, False) + feature = featured.Feature('sflow', { + 'state': 'disabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + 'delayed': 'False', + }) + + # Post-write check sees a partial entry (no 'state' key): the FEATURE row was + # deleted by deregister and then recreated by mod_entry during the race + # window, leaving only the scope fields. + mock_db.get_entry.return_value = { + 'has_per_asic_scope': 'False', + 'has_global_scope': 'True', + } + + feature_handler.sync_feature_scope(feature) + mock_db.set_entry.assert_called_with(featured.FEATURE_TBL, 'sflow', None) + def test_port_init_done_twice(self): """There could be multiple "PortInitDone" event in case of swss restart(either due to crash or due to manual operation). swss