From 8c1737fff679eaf8242ce3d7dcb6b01ef10a4536 Mon Sep 17 00:00:00 2001 From: "david.zagury" Date: Thu, 30 Apr 2026 19:53:14 +0300 Subject: [PATCH 1/2] featured: fix stale FEATURE entry after concurrent package uninstall When sonic-package-manager uninstalls a package it deletes the FEATURE entry from CONFIG_DB. Because featured unblocks spm (via STATE_DB) before finishing its own handler, the entry can be gone by the time featured tries to write back to it. Two code paths independently recreate the deleted entry, leaving a ghost record that survives the uninstall and appears in "show feature status": 1. sync_feature_scope(): called after a successful disable_feature(), it calls mod_entry({has_per_asic_scope, has_global_scope}) on the already- deleted key. Redis HSET recreates the hash with only those two fields (no 'state'), so the next notification arrives with state=None and featured logs "ERR: Unexpected state value 'None'". There is also a TOCTOU gap: spm can delete the entry between the mod_entry calls and the post-write read, producing the same partial entry. 2. resync_feature_state(): called when update_feature_state() fails (e.g. systemctl start fails because the service file was already removed by the concurrent uninstall). get_entry() returns {} for a deleted key, _feature_state_is_template(None) evaluates True, and mod_entry writes {state: disabled} back - recreating the entry with no other fields. The same function also completes partial entries left by (1): a non- empty entry with no 'state' key passes the deleted-entry guard and receives a state field, producing a 3-field ghost record. Fix all paths: - sync_feature_scope: after the two mod_entry writes, re-read the entry. If it exists but has no 'state' key, the write raced with deregister; delete the partial entry immediately. - resync_feature_state: return early if the entry is fully absent. If the entry exists but has no 'state' key, delete it rather than completing the ghost record. - update_feature_state: suppress the ERR log when state is None, since that is a normal consequence of the race rather than a programming error. Signed-off-by: david.zagury --- scripts/featured | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/scripts/featured b/scripts/featured index 636c24c6..fb35d5a2 100644 --- a/scripts/featured +++ b/scripts/featured @@ -266,8 +266,12 @@ class FeatureHandler(object): return False if not enable and not disable: - syslog.syslog(syslog.LOG_ERR, "Unexpected state value '{}' for feature {}" - .format(feature.state, feature.name)) + # state=None means the FEATURE entry was deleted by a concurrent uninstall + # while this notification was queued — not an error. Any other non-None + # value here is a genuinely unexpected state and should be flagged. + if feature.state is not None: + syslog.syslog(syslog.LOG_ERR, "Unexpected state value '{}' for feature {}" + .format(feature.state, feature.name)) return False if feature.delayed and not self.is_delayed_enabled: @@ -349,6 +353,13 @@ class FeatureHandler(object): new_per_asic = str(feature_config.has_per_asic_scope) new_global = str(feature_config.has_global_scope) self._conditional_update_scope(self._config_db, feature_config.name, new_per_asic, new_global) + # If the entry was deleted by a concurrent deregister, _conditional_update_scope + # may have recreated it without a 'state' field. Remove the partial entry to + # avoid a ghost record. + post_entry = self._config_db.get_entry(FEATURE_TBL, feature_config.name) + if post_entry and 'state' not in post_entry: + self._config_db.set_entry(FEATURE_TBL, feature_config.name, None) + return # Sync both has_per_asic_scope and has_global_scope to CONFIG_DB in namespaces on multi-asic platforms for ns, db in self.ns_cfg_db.items(): @@ -549,7 +560,17 @@ class FeatureHandler(object): def resync_feature_state(self, feature): current_entry = self._config_db.get_entry('FEATURE', feature.name) - current_feature_state = current_entry.get('state') if current_entry else None + + # Entry was deleted by a concurrent deregister — do not recreate it. + if not current_entry: + return + + # Partial entry left by sync_feature_scope racing with deregister — remove it. + if 'state' not in current_entry: + self._config_db.set_entry('FEATURE', feature.name, None) + return + + current_feature_state = current_entry.get('state') if feature.state == current_feature_state: return From b10757c31cc8eb5bb1518fd564a7e741e41a8d94 Mon Sep 17 00:00:00 2001 From: "david.zagury" Date: Wed, 6 May 2026 10:33:12 +0300 Subject: [PATCH 2/2] featured: update and extend tests for deregister race guards test_feature_resync: the sub-case where get_entry returns None previously asserted that mod_entry would be called with the rendered state. That behaviour was the root of the bug -- resync_feature_state was recreating a FEATURE row that had been deleted by a concurrent deregister. Update the assertion to assert_not_called(), matching the new guard that returns early when the entry is absent. Add a further sub-case for a partial entry (row exists but has no 'state' key, the TOCTOU artifact left by sync_feature_scope racing with deregister): resync_feature_state must call set_entry(None) to remove the ghost record rather than completing it with mod_entry. test_sync_feature_scope_toctou: new test for the post-write TOCTOU guard in sync_feature_scope. Uses side_effect to return a live entry on the pre-write check and a partial entry (no 'state' key) on the post-write check, simulating the window where the FEATURE row is deleted and recreated by mod_entry during the race. sync_feature_scope must call set_entry(None) to clean it up. Signed-off-by: david.zagury --- scripts/featured | 1 + tests/featured/featured_test.py | 44 +++++++++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/scripts/featured b/scripts/featured index fb35d5a2..9800a548 100644 --- a/scripts/featured +++ b/scripts/featured @@ -353,6 +353,7 @@ class FeatureHandler(object): new_per_asic = str(feature_config.has_per_asic_scope) new_global = str(feature_config.has_global_scope) self._conditional_update_scope(self._config_db, feature_config.name, new_per_asic, new_global) + # If the entry was deleted by a concurrent deregister, _conditional_update_scope # may have recreated it without a 'state' field. Remove the partial entry to # avoid a ghost record. diff --git a/tests/featured/featured_test.py b/tests/featured/featured_test.py index ccf5c69a..272f06f4 100644 --- a/tests/featured/featured_test.py +++ b/tests/featured/featured_test.py @@ -282,7 +282,8 @@ def test_feature_resync(self): } mock_db.get_entry.return_value = None feature_handler.sync_state_field(feature_table) - mock_db.mod_entry.assert_called_with('FEATURE', 'sflow', {'state': 'enabled'}) + # Missing FEATURE row: resync_feature_state skips mod_entry (avoid recreating a row removed concurrently). + mock_db.mod_entry.assert_not_called() mock_db.mod_entry.reset_mock() feature_handler = featured.FeatureHandler(mock_db, mock_feature_state_table, {}, False) @@ -321,7 +322,46 @@ def test_feature_resync(self): } feature_handler.sync_state_field(feature_table) mock_db.mod_entry.assert_called_with('FEATURE', 'sflow', {'state': 'enabled'}) - + mock_db.mod_entry.reset_mock() + + # Partial entry (no 'state' key): left behind by sync_feature_scope racing with + # deregister. resync_feature_state must delete it rather than completing it. + feature_handler = featured.FeatureHandler(mock_db, mock_feature_state_table, {}, False) + mock_db.get_entry.return_value = { + 'has_per_asic_scope': 'False', + 'has_global_scope': 'True', + } + mock_db.set_entry = mock.MagicMock() + feature_handler.sync_state_field(feature_table) + mock_db.set_entry.assert_called_with('FEATURE', 'sflow', None) + mock_db.mod_entry.assert_not_called() + + def test_sync_feature_scope_toctou(self): + """sync_feature_scope must delete a partial entry it accidentally creates when + the FEATURE row is deleted by a concurrent deregister between the mod_entry + calls and the post-write existence check (TOCTOU).""" + mock_db = mock.MagicMock() + mock_feature_state_table = mock.MagicMock() + + feature_handler = featured.FeatureHandler(mock_db, mock_feature_state_table, {}, False) + feature = featured.Feature('sflow', { + 'state': 'disabled', + 'has_global_scope': 'True', + 'has_per_asic_scope': 'False', + 'delayed': 'False', + }) + + # Post-write check sees a partial entry (no 'state' key): the FEATURE row was + # deleted by deregister and then recreated by mod_entry during the race + # window, leaving only the scope fields. + mock_db.get_entry.return_value = { + 'has_per_asic_scope': 'False', + 'has_global_scope': 'True', + } + + feature_handler.sync_feature_scope(feature) + mock_db.set_entry.assert_called_with(featured.FEATURE_TBL, 'sflow', None) + def test_port_init_done_twice(self): """There could be multiple "PortInitDone" event in case of swss restart(either due to crash or due to manual operation). swss