-
Notifications
You must be signed in to change notification settings - Fork 1
[SVLS-8351] Add CPU Enhanced Metrics in Linux Azure Functions #77
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
6371162
7a00846
6f8fb7b
446b939
917db49
e49db13
80ee02d
33366f4
c46819e
48bc444
21feb5c
c30203d
2774101
66c9858
f22e570
48e3d43
b5fa7bf
4991d66
454d20a
28e5c3b
86a46e6
f272433
15b5bd9
c1eec7b
4aca646
234ca72
afa07cd
989266e
d78bcc1
29740a7
a9ae22d
fe264b3
78fbcf4
20dc196
9603aad
ba23441
002aa75
592385a
a998457
bb4f3b0
76cd53e
723d121
250dfcd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| [package] | ||
| name = "datadog-metrics-collector" | ||
| version = "0.1.0" | ||
| edition.workspace = true | ||
| license.workspace = true | ||
| description = "Collector to read, compute, and submit enhanced metrics in Serverless environments" | ||
|
|
||
| [dependencies] | ||
| dogstatsd = { path = "../dogstatsd", default-features = true } | ||
| num_cpus = "1.16" | ||
| tracing = { version = "0.1", default-features = false } | ||
| libdd-common = { git = "https://github.com/DataDog/libdatadog", rev = "d52ee90209cb12a28bdda0114535c1a985a29d95", default-features = false } | ||
|
|
||
| [features] | ||
| windows-enhanced-metrics = [] |
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,170 @@ | ||||||||
| // Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/ | ||||||||
| // SPDX-License-Identifier: Apache-2.0 | ||||||||
|
|
||||||||
| //! CPU metrics collector for Azure Functions | ||||||||
| //! | ||||||||
| //! This module provides OS-agnostic CPU stats collection, CPU usage | ||||||||
| //! and limit computation, and metrics submission to Datadog. | ||||||||
| //! | ||||||||
| //! All CPU metrics are reported in nanocores (1 core = 1,000,000,000 nanocores). | ||||||||
|
|
||||||||
| use dogstatsd::aggregator::AggregatorHandle; | ||||||||
| use dogstatsd::metric::{Metric, MetricValue, SortedTags}; | ||||||||
| use libdd_common::azure_app_services; | ||||||||
| use std::env; | ||||||||
| use tracing::{debug, error}; | ||||||||
|
|
||||||||
| const CPU_USAGE_METRIC: &str = "azure.functions.enhanced.cpu.usage"; | ||||||||
| const CPU_LIMIT_METRIC: &str = "azure.functions.enhanced.cpu.limit"; | ||||||||
|
|
||||||||
| /// Computed CPU total and limit metrics | ||||||||
| pub struct CpuStats { | ||||||||
| pub total: u64, // Cumulative CPU usage in nanoseconds | ||||||||
| pub limit: Option<f64>, // CPU limit in nanocores | ||||||||
| pub defaulted_limit: bool, // Whether CPU limit was defaulted to host CPU count | ||||||||
| } | ||||||||
|
|
||||||||
| pub trait CpuStatsReader { | ||||||||
| fn read(&self) -> Option<CpuStats>; | ||||||||
| } | ||||||||
|
|
||||||||
| pub struct CpuMetricsCollector { | ||||||||
| reader: Box<dyn CpuStatsReader>, | ||||||||
| aggregator: AggregatorHandle, | ||||||||
| tags: Option<SortedTags>, | ||||||||
| last_usage_ns: Option<u64>, | ||||||||
| last_collection_time: std::time::Instant, | ||||||||
| } | ||||||||
|
|
||||||||
| impl CpuMetricsCollector { | ||||||||
| /// Creates a new CpuMetricsCollector | ||||||||
| /// | ||||||||
| /// # Arguments | ||||||||
| /// | ||||||||
| /// * `aggregator` - The aggregator handle to submit metrics to | ||||||||
| /// * `tags` - Optional tags to attach to all metrics | ||||||||
| pub fn new(aggregator: AggregatorHandle, tags: Option<SortedTags>) -> Self { | ||||||||
| #[cfg(feature = "windows-enhanced-metrics")] | ||||||||
| let reader: Box<dyn CpuStatsReader> = Box::new(crate::windows::WindowsCpuStatsReader); | ||||||||
| #[cfg(not(feature = "windows-enhanced-metrics"))] | ||||||||
| let reader: Box<dyn CpuStatsReader> = Box::new(crate::linux::LinuxCpuStatsReader); | ||||||||
| Self { | ||||||||
| reader, | ||||||||
| aggregator, | ||||||||
| tags, | ||||||||
| last_usage_ns: None, | ||||||||
| last_collection_time: std::time::Instant::now(), | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| pub fn collect_and_submit(&mut self) { | ||||||||
| if let Some(cpu_stats) = self.reader.read() { | ||||||||
| let current_usage_ns = cpu_stats.total; | ||||||||
| let now_instant = std::time::Instant::now(); | ||||||||
|
|
||||||||
| // Skip first collection | ||||||||
| let Some(last_usage_ns) = self.last_usage_ns else { | ||||||||
| debug!("First CPU collection, skipping interval"); | ||||||||
| self.last_usage_ns = Some(current_usage_ns); | ||||||||
| self.last_collection_time = now_instant; | ||||||||
| return; | ||||||||
| }; | ||||||||
|
|
||||||||
| if current_usage_ns < last_usage_ns { | ||||||||
| debug!("Current CPU usage is less than last usage, skipping interval"); | ||||||||
| self.last_usage_ns = Some(current_usage_ns); | ||||||||
| self.last_collection_time = now_instant; | ||||||||
| return; | ||||||||
| } | ||||||||
|
|
||||||||
| let delta_ns = (current_usage_ns - last_usage_ns) as f64; | ||||||||
| self.last_usage_ns = Some(current_usage_ns); | ||||||||
| let elapsed_secs = now_instant | ||||||||
| .duration_since(self.last_collection_time) | ||||||||
| .as_secs_f64(); | ||||||||
| self.last_collection_time = now_instant; | ||||||||
| if elapsed_secs <= 0.0 { | ||||||||
| debug!("Elapsed time is less than or equal to 0, skipping interval"); | ||||||||
| return; | ||||||||
| } | ||||||||
|
|
||||||||
|
Comment on lines
+82
to
+90
|
||||||||
| // Divide nanoseconds delta by elapsed time to get usage rate in nanocores | ||||||||
| let usage_rate_nc = delta_ns / elapsed_secs; | ||||||||
|
|
||||||||
| let now = std::time::UNIX_EPOCH | ||||||||
| .elapsed() | ||||||||
| .map(|d| d.as_secs()) | ||||||||
| .unwrap_or(0) | ||||||||
| .try_into() | ||||||||
| .unwrap_or(0); | ||||||||
|
|
||||||||
| let usage_metric = Metric::new( | ||||||||
| CPU_USAGE_METRIC.into(), | ||||||||
| MetricValue::distribution(usage_rate_nc), | ||||||||
| self.tags.clone(), | ||||||||
| Some(now), | ||||||||
| ); | ||||||||
|
|
||||||||
| if let Err(e) = self.aggregator.insert_batch(vec![usage_metric]) { | ||||||||
| error!("Failed to insert CPU usage metric: {}", e); | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In what situations would we see this error? Would we hit this repeatedly or can the aggregator recover from errors quickly? (Also applies to line 111)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Error logging but continuing is what the lambda extension does If we're worried about log spam, I could change this to return early on the CPU usage metric insert failure - this would halve the error logs Or maybe a better solution would be to have
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I personally like the return a
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Digging into it more, it seems like dogstatsd does the same thing serverless-components/crates/dogstatsd/src/dogstatsd.rs Lines 567 to 569 in 05e5c26
I think I should be able to create a handle with a dead receiver to unit test this if we want to do this! I talked with Shreya and it doesn't seem like customers have been running into this - this pattern came from the existing enhanced metrics |
||||||||
| } | ||||||||
|
|
||||||||
| if let Some(limit) = cpu_stats.limit { | ||||||||
| if cpu_stats.defaulted_limit { | ||||||||
| debug!("CPU limit defaulted to host CPU count"); | ||||||||
| } | ||||||||
| let limit_metric = Metric::new( | ||||||||
| CPU_LIMIT_METRIC.into(), | ||||||||
| MetricValue::distribution(limit), | ||||||||
| self.tags.clone(), | ||||||||
| Some(now), | ||||||||
| ); | ||||||||
| if let Err(e) = self.aggregator.insert_batch(vec![limit_metric]) { | ||||||||
| error!("Failed to insert CPU limit metric: {}", e); | ||||||||
| } | ||||||||
| } | ||||||||
| } else { | ||||||||
| debug!( | ||||||||
| "Skipping CPU metrics collection - could not find data to generate CPU usage and limit enhanced metrics" | ||||||||
| ); | ||||||||
| } | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| pub fn build_cpu_metrics_tags() -> Option<SortedTags> { | ||||||||
| let mut tag_parts = Vec::new(); | ||||||||
| // Azure tags from libdd_common | ||||||||
| if let Some(aas_metadata) = &*azure_app_services::AAS_METADATA_FUNCTION { | ||||||||
| let aas_tags = [ | ||||||||
| ("resource_group", aas_metadata.get_resource_group()), | ||||||||
| ("subscription_id", aas_metadata.get_subscription_id()), | ||||||||
| ("name", aas_metadata.get_site_name()), | ||||||||
| ]; | ||||||||
| for (name, value) in aas_tags { | ||||||||
| if value != "unknown" { | ||||||||
| tag_parts.push(format!("{}:{}", name, value)); | ||||||||
| } | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| // Tags from env vars (not in libdd_common) - origin tag is added by DogStatsD | ||||||||
| for (tag_name, env_var) in [ | ||||||||
| ("region", "REGION_NAME"), | ||||||||
| ("plan_tier", "WEBSITE_SKU"), | ||||||||
| ("service", "DD_SERVICE"), | ||||||||
| ("env", "DD_ENV"), | ||||||||
| ("version", "DD_VERSION"), | ||||||||
| ("serverless_compat_version", "DD_SERVERLESS_COMPAT_VERSION"), | ||||||||
| ] { | ||||||||
| if let Ok(val) = env::var(env_var) | ||||||||
| && !val.is_empty() | ||||||||
| { | ||||||||
| tag_parts.push(format!("{}:{}", tag_name, val)); | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| if tag_parts.is_empty() { | ||||||||
| return None; | ||||||||
| } | ||||||||
| SortedTags::parse(&tag_parts.join(",")).ok() | ||||||||
| } | ||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| // Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| #![cfg_attr(not(test), deny(clippy::panic))] | ||
| #![cfg_attr(not(test), deny(clippy::unwrap_used))] | ||
| #![cfg_attr(not(test), deny(clippy::expect_used))] | ||
| #![cfg_attr(not(test), deny(clippy::todo))] | ||
| #![cfg_attr(not(test), deny(clippy::unimplemented))] | ||
|
|
||
| pub mod cpu; | ||
| #[cfg(not(feature = "windows-enhanced-metrics"))] | ||
| pub(crate) mod linux; | ||
| #[cfg(feature = "windows-enhanced-metrics")] | ||
| pub(crate) mod windows; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should we have azure in the path name for this, in the directory or the filename?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since this crate is only used in Azure Functions environments right now, I'm thinking of leaving the name as-is and renaming it in the future if we expand enhanced metrics to Cloud Functions Gen 1