Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
6371162
Set up Cgroup, CpuStats, and CpuMetricsCollector structs, and cgroup …
kathiehuang Feb 13, 2026
7a00846
Add cpu collector into loop with dogstatsd
kathiehuang Feb 13, 2026
6f8fb7b
Fix license
kathiehuang Feb 13, 2026
446b939
Move metrics_collector into its own crate
kathiehuang Feb 23, 2026
917db49
Submit cpu usage and limit metrics and fix units
kathiehuang Feb 23, 2026
e49db13
Test more precise time interval, add instance ID as a tag
kathiehuang Feb 25, 2026
80ee02d
Categorize metrics with azure.functions prefix as enhanced metrics
kathiehuang Feb 26, 2026
33366f4
Refactor to make CpuMetricsCollector, CpuStats, and metrics submissio…
kathiehuang Feb 26, 2026
c46819e
Testing different cpu collection methods
kathiehuang Mar 4, 2026
48bc444
Clean up and emit cpu usage and host-level cpu usage metrics
kathiehuang Mar 4, 2026
21feb5c
Clean up and emit cpu usage and host-level cpu usage metrics
kathiehuang Mar 4, 2026
c30203d
Add tags to metrics
kathiehuang Mar 5, 2026
2774101
Ensure tags match cloud integration metrics
kathiehuang Mar 6, 2026
66c9858
Separate Windows CPU metrics collection into separate PR
kathiehuang Mar 6, 2026
f22e570
Separate CPU host usage metrics collection into separate PR
kathiehuang Mar 6, 2026
48e3d43
Remove functionname tag
kathiehuang Mar 6, 2026
b5fa7bf
Send enhanced metrics even if custom metrics are turned off
kathiehuang Mar 6, 2026
4991d66
Pull out building metrics tags into function
kathiehuang Mar 7, 2026
454d20a
Add unit tests
kathiehuang Mar 7, 2026
28e5c3b
Clean up
kathiehuang Mar 7, 2026
86a46e6
Refactor
kathiehuang Mar 7, 2026
f272433
Remove last_collection_time
kathiehuang Mar 7, 2026
15b5bd9
Only send enhanced metrics for Azure Functions
kathiehuang Mar 7, 2026
c1eec7b
Add back last_collection_time
kathiehuang Mar 7, 2026
4aca646
Only enable enhanced metrics for Azure Functions
kathiehuang Mar 9, 2026
234ca72
Only create CPUMetricsCollector when metrics flusher is successfully …
kathiehuang Mar 9, 2026
afa07cd
Launch metrics flusher as independent task from collector
kathiehuang Mar 9, 2026
989266e
Create windows-enhanced-metrics feature for Windows-specific logic
kathiehuang Mar 10, 2026
d78bcc1
Add unit to collection interval variable
kathiehuang Mar 10, 2026
29740a7
Make last_usage_ns an Option and keep CPU total as u64 until f64 is n…
kathiehuang Mar 10, 2026
a9ae22d
Change collection interval to 1 for precision and remove unneeded logs
kathiehuang Mar 11, 2026
fe264b3
Add comment to clarify shared aggregator between dogstatsd and cpu co…
kathiehuang Mar 11, 2026
78fbcf4
Move tag building logic from datadog-serverless-compat to datadog-met…
kathiehuang Mar 11, 2026
20dc196
Remove unused dependencies from datadog-trace-agent
kathiehuang Mar 11, 2026
9603aad
Formatting
kathiehuang Mar 11, 2026
ba23441
Turn off DD_ENHANCED_METRICS in Windows for now to prevent metrics co…
kathiehuang Mar 30, 2026
002aa75
Handle malformed cpuset.cpus file
kathiehuang Mar 31, 2026
592385a
Skip collection when elapsed_secs is less than or equal to 0
kathiehuang Mar 31, 2026
a998457
Update comments to clarify that Windows is not supported yet
kathiehuang Mar 31, 2026
bb4f3b0
Add unit test for metric classification
kathiehuang Apr 1, 2026
76cd53e
Log when scheduler quota can't be parsed
kathiehuang Apr 1, 2026
723d121
nit: address clippy warning
kathiehuang Apr 1, 2026
250dfcd
Remove resource_id tag and add check for invalid CPU set range
kathiehuang Apr 1, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build-datadog-serverless-compat.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ jobs:
retention-days: 3
- if: ${{ inputs.runner == 'windows-2022' }}
shell: bash
run: cargo build --release -p datadog-serverless-compat --features windows-pipes
run: cargo build --release -p datadog-serverless-compat --features windows-pipes,windows-enhanced-metrics
- if: ${{ inputs.runner == 'windows-2022' }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cargo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ jobs:
- shell: bash
run: |
if [[ "${{ inputs.runner }}" == "windows-2022" ]]; then
cargo nextest run --workspace --features datadog-serverless-compat/windows-pipes
cargo nextest run --workspace --features datadog-serverless-compat/windows-pipes,datadog-serverless-compat/windows-enhanced-metrics
else
cargo nextest run --workspace
fi
56 changes: 56 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions LICENSE-3rdparty.csv
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ headers,https://github.com/hyperium/headers,MIT,Sean McArthur <sean@seanmonstar.
headers-core,https://github.com/hyperium/headers,MIT,Sean McArthur <sean@seanmonstar.com>
heck,https://github.com/withoutboats/heck,MIT OR Apache-2.0,The heck Authors
heck,https://github.com/withoutboats/heck,MIT OR Apache-2.0,Without Boats <woboats@gmail.com>
hermit-abi,https://github.com/hermit-os/hermit-rs,MIT OR Apache-2.0,Stefan Lankes
hex,https://github.com/KokaKiwi/rust-hex,MIT OR Apache-2.0,KokaKiwi <kokakiwi@kokakiwi.net>
home,https://github.com/rust-lang/cargo,MIT OR Apache-2.0,Brian Anderson <andersrb@gmail.com>
http,https://github.com/hyperium/http,MIT OR Apache-2.0,"Alex Crichton <alex@alexcrichton.com>, Carl Lerche <me@carllerche.com>, Sean McArthur <sean@seanmonstar.com>"
Expand Down Expand Up @@ -147,6 +148,7 @@ nix,https://github.com/nix-rust/nix,MIT,The nix-rust Project Developers
nom,https://github.com/Geal/nom,MIT,contact@geoffroycouprie.com
nu-ansi-term,https://github.com/nushell/nu-ansi-term,MIT,"ogham@bsago.me, Ryan Scheel (Havvy) <ryan.havvy@gmail.com>, Josh Triplett <josh@joshtriplett.org>, The Nushell Project Developers"
num-traits,https://github.com/rust-num/num-traits,MIT OR Apache-2.0,The Rust Project Developers
num_cpus,https://github.com/seanmonstar/num_cpus,MIT OR Apache-2.0,Sean McArthur <sean@seanmonstar.com>
once_cell,https://github.com/matklad/once_cell,MIT OR Apache-2.0,Aleksey Kladov <aleksey.kladov@gmail.com>
openssl-probe,https://github.com/rustls/openssl-probe,MIT OR Apache-2.0,Alex Crichton <alex@alexcrichton.com>
opentelemetry,https://github.com/open-telemetry/opentelemetry-rust/tree/main/opentelemetry,Apache-2.0,The opentelemetry Authors
Expand Down
15 changes: 15 additions & 0 deletions crates/datadog-metrics-collector/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[package]
name = "datadog-metrics-collector"
version = "0.1.0"
edition.workspace = true
license.workspace = true
description = "Collector to read, compute, and submit enhanced metrics in Serverless environments"

[dependencies]
# Shares the DogStatsD aggregator so collected metrics flush through the
# same pipeline as custom metrics.
dogstatsd = { path = "../dogstatsd", default-features = true }
# Host CPU count, used when no explicit CPU limit is configured.
num_cpus = "1.16"
tracing = { version = "0.1", default-features = false }
# Provides Azure App Services metadata for metric tagging.
libdd-common = { git = "https://github.com/DataDog/libdatadog", rev = "d52ee90209cb12a28bdda0114535c1a985a29d95", default-features = false }

[features]
# Compiles the Windows CPU stats reader instead of the Linux (cgroup) one.
windows-enhanced-metrics = []
170 changes: 170 additions & 0 deletions crates/datadog-metrics-collector/src/cpu.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
// Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/
// SPDX-License-Identifier: Apache-2.0

//! CPU metrics collector for Azure Functions
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we have azure in the path name for this, in the directory or the filename?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this crate is only used in Azure Functions environments right now, I'm thinking of leaving the name as-is and renaming it in the future if we expand enhanced metrics to Cloud Functions Gen 1

//!
//! This module provides OS-agnostic CPU stats collection, CPU usage
//! and limit computation, and metrics submission to Datadog.
//!
//! All CPU metrics are reported in nanocores (1 core = 1,000,000,000 nanocores).

use dogstatsd::aggregator::AggregatorHandle;
use dogstatsd::metric::{Metric, MetricValue, SortedTags};
use libdd_common::azure_app_services;
use std::env;
use tracing::{debug, error};

/// Distribution metric name for CPU usage rate, reported in nanocores.
const CPU_USAGE_METRIC: &str = "azure.functions.enhanced.cpu.usage";
/// Distribution metric name for the CPU limit, reported in nanocores.
const CPU_LIMIT_METRIC: &str = "azure.functions.enhanced.cpu.limit";

/// Computed CPU total and limit metrics, as produced by a platform reader.
pub struct CpuStats {
    /// Cumulative CPU usage in nanoseconds.
    pub total: u64,
    /// CPU limit in nanocores, if one could be determined.
    pub limit: Option<f64>,
    /// Whether the CPU limit was defaulted to the host CPU count.
    pub defaulted_limit: bool,
}

/// Platform-specific source of raw CPU statistics.
pub trait CpuStatsReader {
    /// Reads current CPU stats; returns `None` when the underlying data
    /// source is unavailable or cannot be parsed.
    fn read(&self) -> Option<CpuStats>;
}

/// Periodically collects CPU stats and submits enhanced metrics through the
/// DogStatsD aggregator.
pub struct CpuMetricsCollector {
    // Platform-specific stats source (selected at compile time by the
    // `windows-enhanced-metrics` feature).
    reader: Box<dyn CpuStatsReader>,
    // Handle to the aggregator shared with DogStatsD.
    aggregator: AggregatorHandle,
    // Tags attached to every submitted metric.
    tags: Option<SortedTags>,
    // Cumulative usage observed at the previous collection; `None` until the
    // first successful read (rates need two samples).
    last_usage_ns: Option<u64>,
    // Wall-clock instant of the previous collection.
    last_collection_time: std::time::Instant,
}

impl CpuMetricsCollector {
/// Creates a new CpuMetricsCollector
///
/// # Arguments
///
/// * `aggregator` - The aggregator handle to submit metrics to
/// * `tags` - Optional tags to attach to all metrics
pub fn new(aggregator: AggregatorHandle, tags: Option<SortedTags>) -> Self {
#[cfg(feature = "windows-enhanced-metrics")]
let reader: Box<dyn CpuStatsReader> = Box::new(crate::windows::WindowsCpuStatsReader);
#[cfg(not(feature = "windows-enhanced-metrics"))]
let reader: Box<dyn CpuStatsReader> = Box::new(crate::linux::LinuxCpuStatsReader);
Self {
reader,
aggregator,
tags,
last_usage_ns: None,
last_collection_time: std::time::Instant::now(),
}
}

pub fn collect_and_submit(&mut self) {
if let Some(cpu_stats) = self.reader.read() {
let current_usage_ns = cpu_stats.total;
let now_instant = std::time::Instant::now();

// Skip first collection
let Some(last_usage_ns) = self.last_usage_ns else {
debug!("First CPU collection, skipping interval");
self.last_usage_ns = Some(current_usage_ns);
self.last_collection_time = now_instant;
return;
};

if current_usage_ns < last_usage_ns {
debug!("Current CPU usage is less than last usage, skipping interval");
self.last_usage_ns = Some(current_usage_ns);
self.last_collection_time = now_instant;
return;
}

let delta_ns = (current_usage_ns - last_usage_ns) as f64;
self.last_usage_ns = Some(current_usage_ns);
let elapsed_secs = now_instant
.duration_since(self.last_collection_time)
.as_secs_f64();
self.last_collection_time = now_instant;
if elapsed_secs <= 0.0 {
debug!("Elapsed time is less than or equal to 0, skipping interval");
return;
}

Comment on lines +82 to +90
Copy link

Copilot AI Mar 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

elapsed_secs can be 0 or extremely small (e.g., if tokio::time::Interval catches up missed ticks and collect_and_submit runs back-to-back). Without a guard, the later delta_ns / elapsed_secs can yield inf/spiky usage rates. Consider skipping/clamping when elapsed_secs <= 0.0 (or below a small threshold).

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated to check this in 1b9f9d2

// Divide nanoseconds delta by elapsed time to get usage rate in nanocores
let usage_rate_nc = delta_ns / elapsed_secs;

let now = std::time::UNIX_EPOCH
.elapsed()
.map(|d| d.as_secs())
.unwrap_or(0)
.try_into()
.unwrap_or(0);

let usage_metric = Metric::new(
CPU_USAGE_METRIC.into(),
MetricValue::distribution(usage_rate_nc),
self.tags.clone(),
Some(now),
);

if let Err(e) = self.aggregator.insert_batch(vec![usage_metric]) {
error!("Failed to insert CPU usage metric: {}", e);
Copy link
Copy Markdown
Contributor

@Lewis-E Lewis-E Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In what situations would we see this error? Would we hit this repeatedly or can the aggregator recover from errors quickly? (Also applies to line 111)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

insert_batch calls tx.send, which is on an unbounded channel that has infinite capacity. An error will only happen if the receive half of the channel is closed or dropped, which means the aggregator service isn't working anymore and every subsequent call should also fail. This means that metrics would stop sending, with error logs on every attempted insert. It seems the only way to recover would be for the customer to stop and start their function app to restart the agent

Error logging but continuing is what the lambda extension does

If we're worried about log spam, I could change this to return early on the CPU usage metric insert failure - this would halve the error logs

Or maybe a better solution would be to have collect_and_submit return a Result, and main.rs could set cpu_collector=None on error?

Copy link
Copy Markdown
Contributor

@Lewis-E Lewis-E Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I personally like the return a Result option but am also curious why the lambda extension would send a repeating error log. Also, do you think this bit is unit testable?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Digging into it more, it seems like dogstatsd does the same thing

if let Err(e) = aggregator.insert_batch(all_valid_metrics) {
error!("Failed to send metrics to aggregator: {}", e);
}

I think I should be able to create a handle with a dead receiver to unit test this if we want to do this! I talked with Shreya and it doesn't seem like customers have been running into this - this pattern came from the existing enhanced metrics

}

if let Some(limit) = cpu_stats.limit {
if cpu_stats.defaulted_limit {
debug!("CPU limit defaulted to host CPU count");
}
let limit_metric = Metric::new(
CPU_LIMIT_METRIC.into(),
MetricValue::distribution(limit),
self.tags.clone(),
Some(now),
);
if let Err(e) = self.aggregator.insert_batch(vec![limit_metric]) {
error!("Failed to insert CPU limit metric: {}", e);
}
}
} else {
debug!(
"Skipping CPU metrics collection - could not find data to generate CPU usage and limit enhanced metrics"
);
}
}
}

/// Builds the tag set attached to every CPU enhanced metric.
///
/// Combines Azure App Services metadata (resource group, subscription id,
/// site name) from `libdd_common` with values read from well-known
/// environment variables. Returns `None` when no tag value could be
/// resolved or the assembled tag string fails to parse.
pub fn build_cpu_metrics_tags() -> Option<SortedTags> {
    let mut parts: Vec<String> = Vec::new();

    // Azure tags from libdd_common; "unknown" is the metadata sentinel for
    // an unresolved value and is not emitted as a tag.
    if let Some(metadata) = &*azure_app_services::AAS_METADATA_FUNCTION {
        let candidates = [
            ("resource_group", metadata.get_resource_group()),
            ("subscription_id", metadata.get_subscription_id()),
            ("name", metadata.get_site_name()),
        ];
        parts.extend(candidates.into_iter().filter_map(|(name, value)| {
            (value != "unknown").then(|| format!("{}:{}", name, value))
        }));
    }

    // Tags from env vars (not in libdd_common) - origin tag is added by DogStatsD
    let env_tags = [
        ("region", "REGION_NAME"),
        ("plan_tier", "WEBSITE_SKU"),
        ("service", "DD_SERVICE"),
        ("env", "DD_ENV"),
        ("version", "DD_VERSION"),
        ("serverless_compat_version", "DD_SERVERLESS_COMPAT_VERSION"),
    ];
    for (tag_name, var_name) in env_tags {
        match env::var(var_name) {
            Ok(value) if !value.is_empty() => parts.push(format!("{}:{}", tag_name, value)),
            _ => {}
        }
    }

    if parts.is_empty() {
        None
    } else {
        SortedTags::parse(&parts.join(",")).ok()
    }
}
14 changes: 14 additions & 0 deletions crates/datadog-metrics-collector/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/
// SPDX-License-Identifier: Apache-2.0

// Deny panic-prone constructs outside of tests so the collector cannot
// bring down the host process at runtime.
#![cfg_attr(not(test), deny(clippy::panic))]
#![cfg_attr(not(test), deny(clippy::unwrap_used))]
#![cfg_attr(not(test), deny(clippy::expect_used))]
#![cfg_attr(not(test), deny(clippy::todo))]
#![cfg_attr(not(test), deny(clippy::unimplemented))]

// OS-agnostic collector, usage/limit computation, and metric submission.
pub mod cpu;
// Exactly one platform-specific stats reader is compiled in, selected by
// the `windows-enhanced-metrics` feature.
#[cfg(not(feature = "windows-enhanced-metrics"))]
pub(crate) mod linux;
#[cfg(feature = "windows-enhanced-metrics")]
pub(crate) mod windows;
Loading
Loading