Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
### Breaking Changes

### Bugs Fixed

- Fix a bug with crashing on import if psutil gets accessDenied.
### Other Changes

## 1.0.0b48 (2026-02-05)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,8 @@
# _PROCESS.io_counters() is not available on Mac OS and some Linux distros.
_IO_AVAILABLE = hasattr(_PROCESS, "io_counters")
_IO_LAST_COUNT = 0
if _IO_AVAILABLE:
_io_counters_initial = _PROCESS.io_counters()
_IO_LAST_COUNT = _io_counters_initial.read_bytes + _io_counters_initial.write_bytes
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just add a try catch around this initial call, and if it fails, set _IO_AVAILABLE to false. That would make this a ~3 line change.

Copy link
Member

@rads-1996 rads-1996 Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tbequinor I’ll open a separate PR with this fix and follow up once it’s included in a new release. We can close this PR for now, and I’ll create an issue to track the change. Link to the issue - #45283

_IO_LAST_TIME = datetime.now()
_IO_INITIALIZED = False
# Processor Time %
_LAST_CPU_TIMES = psutil.cpu_times()
# Request Rate
Expand Down Expand Up @@ -168,25 +166,55 @@ def _get_process_io(options: CallbackOptions) -> Iterable[Observation]:
:rtype: ~typing.Iterable[~opentelemetry.metrics.Observation]
"""
try:
if not _IO_AVAILABLE:
yield Observation(0, {})
return
# pylint: disable=global-statement
global _IO_LAST_COUNT
# pylint: disable=global-statement
global _IO_LAST_TIME
# RSS is non-swapped physical memory a process has used
io_counters = _PROCESS.io_counters()
rw_count = io_counters.read_bytes + io_counters.write_bytes
# pylint: disable=global-statement
global _IO_INITIALIZED
# pylint: disable=global-statement
global _IO_AVAILABLE

# Lazily initialize counters to avoid import-time failures in restricted environments.
if not _IO_INITIALIZED:
_IO_INITIALIZED = True
if _IO_AVAILABLE:
io_counters_func = getattr(_PROCESS, "io_counters", None)
if not callable(io_counters_func):
_IO_AVAILABLE = False
yield Observation(0, {})
return
io_counters_initial = io_counters_func()
_IO_LAST_COUNT = int(getattr(io_counters_initial, "read_bytes", 0)) + int(
getattr(io_counters_initial, "write_bytes", 0)
)
_IO_LAST_TIME = datetime.now()

if not _IO_AVAILABLE:
yield Observation(0, {})
return

io_counters_func = getattr(_PROCESS, "io_counters", None)
if not callable(io_counters_func):
_IO_AVAILABLE = False
yield Observation(0, {})
return

io_counters = io_counters_func()
rw_count = int(getattr(io_counters, "read_bytes", 0)) + int(getattr(io_counters, "write_bytes", 0))
Comment on lines +178 to +204
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When _IO_INITIALIZED is False, the callback initializes _IO_LAST_COUNT/_IO_LAST_TIME using an io_counters() call, but then continues in the same invocation to call io_counters() again and compute a rate over a near-zero elapsed time. This can create a large spike on the first observed value and does an extra syscall. Consider treating the first call as a baseline: after setting _IO_LAST_COUNT/_IO_LAST_TIME, yield Observation(0, {}) (or return) so rate calculation starts on the next collection interval.

Copilot uses AI. Check for mistakes.
rw_diff = rw_count - _IO_LAST_COUNT
_IO_LAST_COUNT = rw_count
current_time = datetime.now()
elapsed_time_s = (current_time - _IO_LAST_TIME).total_seconds()
_IO_LAST_TIME = current_time
if elapsed_time_s <= 0:
yield Observation(0, {})
return
io_rate = rw_diff / elapsed_time_s
yield Observation(io_rate, {})
except (psutil.NoSuchProcess, psutil.AccessDenied, Exception) as e: # pylint: disable=broad-except
_logger.exception("Error getting process I/O rate: %s", e)
_IO_AVAILABLE = False
_logger.debug("Disabling process I/O counter due to inaccessible io_counters: %s", e)
yield Observation(0, {})
Comment on lines 215 to 218
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The except (psutil.NoSuchProcess, psutil.AccessDenied, Exception) block effectively catches all exceptions (because Exception is included) and permanently sets _IO_AVAILABLE = False. This means a transient/unknown failure will disable the metric for the lifetime of the process and may hide real bugs. Consider only disabling on expected non-recoverable cases (e.g., AccessDenied, missing/non-callable io_counters, NotImplementedError), and for unexpected exceptions keep logging (ideally with stack trace) without flipping _IO_AVAILABLE to False.

Copilot uses AI. Check for mistakes.


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def setUp(self):
manager_module._IO_AVAILABLE = True
manager_module._IO_LAST_COUNT = 0
manager_module._IO_LAST_TIME = datetime.now()
manager_module._IO_INITIALIZED = False
manager_module._REQUESTS_COUNT = 0
manager_module._EXCEPTIONS_COUNT = 0
manager_module._LAST_REQUEST_RATE_TIME = datetime.now()
Expand Down Expand Up @@ -167,6 +168,7 @@ def test_get_process_io_success(self, mock_process, mock_datetime):
# Import and modify global variables
import azure.monitor.opentelemetry.exporter._performance_counters._manager as manager_module

manager_module._IO_INITIALIZED = True
manager_module._IO_LAST_COUNT = 3000 # Previous total
manager_module._IO_LAST_TIME = start_time

Expand All @@ -193,6 +195,7 @@ def test_get_process_io_unavailable(self, mock_process, mock_datetime):
import azure.monitor.opentelemetry.exporter._performance_counters._manager as manager_module

manager_module._IO_AVAILABLE = 0 # Previous total
manager_module._IO_INITIALIZED = True
manager_module._IO_LAST_COUNT = 0 # Previous total
manager_module._IO_LAST_TIME = start_time

Expand All @@ -201,6 +204,37 @@ def test_get_process_io_unavailable(self, mock_process, mock_datetime):
self.assertEqual(len(result), 1)
self.assertAlmostEqual(result[0].value, 0.0)

@mock.patch("azure.monitor.opentelemetry.exporter._performance_counters._manager._PROCESS")
def test_get_process_io_access_denied_disables_io(self, mock_process):
"""Test process I/O retrieval disables I/O counters when access is denied."""
mock_process.io_counters.side_effect = psutil.AccessDenied(1)

import azure.monitor.opentelemetry.exporter._performance_counters._manager as manager_module

manager_module._IO_AVAILABLE = True
manager_module._IO_INITIALIZED = False

result = list(_get_process_io(None))

self.assertEqual(len(result), 1)
self.assertEqual(result[0].value, 0)
self.assertFalse(manager_module._IO_AVAILABLE)
mock_process.io_counters.assert_called_once_with()

@mock.patch("azure.monitor.opentelemetry.exporter._performance_counters._manager._PROCESS")
def test_get_process_io_disabled_does_not_call_io_counters(self, mock_process):
"""Test process I/O retrieval does not call io_counters when unavailable."""
import azure.monitor.opentelemetry.exporter._performance_counters._manager as manager_module

manager_module._IO_AVAILABLE = False
manager_module._IO_INITIALIZED = True

result = list(_get_process_io(None))

self.assertEqual(len(result), 1)
self.assertEqual(result[0].value, 0)
mock_process.io_counters.assert_not_called()
Comment on lines +207 to +236
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new tests cover AccessDenied and the disabled path, but there’s no coverage for the new lazy-initialization behavior on the first _get_process_io() invocation (when _IO_INITIALIZED is False). Adding a test for that path would catch issues like first-sample spikes / double io_counters() calls, and would directly validate the regression described in the PR (no import-time crash / baseline initialization behavior).

Copilot uses AI. Check for mistakes.

@mock.patch("psutil.cpu_times")
def test_get_processor_time_success(self, mock_cpu_times):
"""Test successful processor time retrieval."""
Expand Down
Loading