Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
e7ea99d
initial implementation
NachoEchevarria Mar 6, 2026
43c805c
update snapshots
NachoEchevarria Mar 6, 2026
45a3914
update snapshots
NachoEchevarria Mar 6, 2026
fa1e92c
fix duplicated
NachoEchevarria Mar 6, 2026
dfd5800
Avoid two calls to GetServiceName
NachoEchevarria Mar 9, 2026
891cd46
Add warning comments for enum sync
NachoEchevarria Mar 9, 2026
92b5de7
rename param
NachoEchevarria Mar 9, 2026
9495c55
Remove not needed
NachoEchevarria Mar 9, 2026
1603764
Remove not needed
NachoEchevarria Mar 10, 2026
83c03be
Refactor: Use ServiceNameMetadata to decrease double method calls.
NachoEchevarria Mar 10, 2026
fb7e24a
Fix unit tests
NachoEchevarria Mar 11, 2026
b487692
remove integration arrays
NachoEchevarria Mar 11, 2026
76423c9
Refactor schemas
NachoEchevarria Mar 11, 2026
2e0044c
Use opt.service_mapping even if default
NachoEchevarria Mar 11, 2026
d76c65e
Fix dbscopefactory
NachoEchevarria Mar 11, 2026
6ef7bb5
Final check serviceNameEqualsDefault
NachoEchevarria Mar 11, 2026
18105ba
SpanMessagePackFormatter opt case
NachoEchevarria Mar 11, 2026
42401b0
Add unit tests
NachoEchevarria Mar 11, 2026
564ff19
Minor refactor
NachoEchevarria Mar 12, 2026
19abb0a
remove hardcoded
NachoEchevarria Mar 12, 2026
3df9559
Initial implementation
NachoEchevarria Mar 13, 2026
571a6e8
Fix unit. Azure Aws add source.
NachoEchevarria Mar 13, 2026
cbf0b86
fix unit tests and integration tests
NachoEchevarria Mar 16, 2026
02ecf55
Update v1 rules
NachoEchevarria Mar 16, 2026
1721030
Update OTEL
NachoEchevarria Mar 17, 2026
5bc3666
Remove commas
NachoEchevarria Mar 17, 2026
337c5fe
service source stats. Initial implementation.
NachoEchevarria Mar 18, 2026
c80fcaf
protect against empty strings
NachoEchevarria Mar 20, 2026
d6e12c2
Merge branch 'master' into nacho/ServiceSourceStats
NachoEchevarria Mar 20, 2026
7add326
Add Linux docker cgroup retry wrapper
datadog-official[bot] Mar 24, 2026
c2cdfd3
fixes
NachoEchevarria Mar 24, 2026
241ef8c
Match Windows approach
NachoEchevarria Mar 24, 2026
5b73840
cover more cases
NachoEchevarria Mar 25, 2026
ffdf087
Add ensure docker ready
NachoEchevarria Mar 26, 2026
a0ae1f7
Temporary change to test the stat system test
NachoEchevarria Mar 26, 2026
9fd8d2e
Merge branch 'master' into nacho/ServiceSourceStats
NachoEchevarria Mar 26, 2026
6415ecb
Fix compilation errors from merge.
NachoEchevarria Mar 26, 2026
a78fcd8
Improve logging
NachoEchevarria Mar 26, 2026
134aa23
Update .azure-pipelines/steps/ensure-docker-ready-linux.sh
NachoEchevarria Mar 27, 2026
974b4fe
Apply suggestion from @andrewlock
NachoEchevarria Mar 27, 2026
bf76fc4
Apply suggestion from @andrewlock
NachoEchevarria Mar 27, 2026
16f424f
Merge branch 'dd/ci/linux-docker-cgroup-retries' of https://github.co…
NachoEchevarria Mar 27, 2026
d98d1f7
undo
NachoEchevarria Mar 27, 2026
f4f367f
Merge branch 'master' into dd/ci/linux-docker-cgroup-retries
NachoEchevarria Apr 6, 2026
49bb57d
Add log line
NachoEchevarria Apr 6, 2026
9cc81c3
Merge branch 'dd/ci/linux-docker-cgroup-retries' of https://github.co…
NachoEchevarria Apr 6, 2026
a66e824
Use POSIX shell
NachoEchevarria Apr 6, 2026
47d9cc7
Merge branch 'master' into dd/ci/linux-docker-cgroup-retries
NachoEchevarria Apr 9, 2026
88d5788
Early return
NachoEchevarria Apr 9, 2026
4e1cdcd
Simplified try_restart_docker
NachoEchevarria Apr 9, 2026
ee16ba0
Make ensure-docker-ready-linux.sh executable
NachoEchevarria Apr 9, 2026
9388163
address nits
NachoEchevarria Apr 9, 2026
26d3048
Early return
NachoEchevarria Apr 9, 2026
eea7deb
Avoid too verbose output
NachoEchevarria Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions .azure-pipelines/steps/ensure-docker-ready-linux.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/bin/sh
# Linux Docker readiness check — mirrors the Windows PowerShell logic in ensure-docker-ready.yml.
# Waits for the Docker daemon, attempts service restarts if needed, and fails fast
# to avoid wasting time on a broken agent.
#
# Environment variables (all optional):
#   DOCKER_READY_TIMEOUT_SECONDS         total wait budget in seconds (default 300)
#   DOCKER_READY_CHECK_INTERVAL_SECONDS  delay between readiness checks in seconds (default 10)
#   DOCKER_MAX_RESTARTS                  maximum number of service restart attempts (default 3)
#
# Exit status: the return status of wait_for_docker, which is the last command run.

# Only -u is enabled. -e is deliberately left off: some commands in this script
# may fail (for example `docker info` while the daemon is down) and those
# failures are handled explicitly instead of aborting the script.
set -u
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should probably exit on failing commands too? Or does that break things? Meh, maybe best to leave it 😅

Suggested change
set -u
set -eu

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would break things: some commands in this script can legitimately fail, so we should avoid early exits and keep `set -u` only.


# Tunables. Each one may be supplied through the environment; the value after
# ":=" is assigned only when the variable is unset or empty.
: "${DOCKER_READY_TIMEOUT_SECONDS:=300}"
: "${DOCKER_READY_CHECK_INTERVAL_SECONDS:=10}"
: "${DOCKER_MAX_RESTARTS:=3}"

# Write one log line to stdout, prefixed with the script name.
# Arguments: the message words; they are joined with single spaces via "$*".
# printf '%s\n' is used instead of echo because this script runs under /bin/sh,
# where echo may interpret backslash sequences in the message (e.g. in captured
# systemctl output) instead of printing them verbatim.
log()
{
  printf '%s\n' "[ensure-docker-ready-linux] $*"
}

# Print a best-effort snapshot of host and Docker state to help debug a failed
# readiness check. Every probe is followed by `|| true`, so a failing probe
# never prevents the remaining ones from running.
# Returns: 0 (the last command is guarded by `|| true`).
log_diagnostics()
{
  local cgroup_kind="unknown"
  local unit
  local probe

  log "--- Diagnostics ---"

  # The cgroup.controllers file is treated as the v2 marker; a bare
  # /sys/fs/cgroup directory without it is reported as v1.
  if [ -d "/sys/fs/cgroup" ]; then
    cgroup_kind="v1"
  fi
  if [ -f "/sys/fs/cgroup/cgroup.controllers" ]; then
    cgroup_kind="v2"
  fi
  log "cgroup version: ${cgroup_kind}"
  log "kernel: $(uname -a)"

  # systemd details are only collected when systemctl is on PATH.
  if command -v systemctl >/dev/null 2>&1; then
    log "systemd state:"
    systemctl is-system-running || true
    for unit in docker dbus; do
      log "${unit} service status:"
      systemctl status "${unit}" --no-pager || true
    done
  fi

  if command -v journalctl >/dev/null 2>&1; then
    log "docker journal logs (last 50 lines):"
    journalctl -u docker --no-pager -n 50 || true
  fi

  for probe in version info; do
    log "docker ${probe}:"
    docker "${probe}" || true
  done
}

# Ask systemd to restart the docker unit and log the outcome.
# The combined stdout/stderr of systemctl is captured and only logged on failure.
# Returns: 0 when `systemctl restart docker` succeeds, 1 otherwise.
try_restart_docker()
{
  local restart_output

  log "Attempting Docker service restart..."

  # Test the command directly; the assignment's status is the status of systemctl.
  if ! restart_output=$(systemctl restart docker 2>&1); then
    log "systemctl restart docker failed: ${restart_output}"
    return 1
  fi

  log "systemctl restart docker completed"
  return 0
}

# Block until `docker info` succeeds, restarting the docker service when needed.
#
# Reads:
#   DOCKER_READY_TIMEOUT_SECONDS         wait budget; only the per-check sleeps are
#                                        counted towards it
#   DOCKER_READY_CHECK_INTERVAL_SECONDS  sleep between readiness checks
#   DOCKER_MAX_RESTARTS                  cap on service restart attempts
#   DOCKER_READY_FORCE_RESTART_AFTER     optional; number of consecutive failed checks
#                                        after which a service that still reports
#                                        "active" is restarted anyway (default 3)
# Returns:
#   0 once `docker info` succeeds.
#   1 when systemctl is unavailable, when a restart is needed but the restart
#     budget is exhausted, or when the timeout elapses. Every failure path logs
#     diagnostics and emits an Azure Pipelines error logissue so the failure is
#     surfaced on the pipeline run.
wait_for_docker()
{
  local elapsed=0
  local restart_count=0
  local consecutive_failures=0
  local force_restart_after="${DOCKER_READY_FORCE_RESTART_AFTER:-3}"
  local initial_status
  local svc_status
  local should_restart

  log "Waiting up to ${DOCKER_READY_TIMEOUT_SECONDS}s for Docker daemon (will attempt up to ${DOCKER_MAX_RESTARTS} service restarts)..."

  # Quick check — if Docker is already healthy, nothing to do
  if docker info >/dev/null 2>&1; then
    log "Docker daemon is ready"
    return 0
  fi

  # If we can't restart Docker, there's no point looping
  if ! command -v systemctl >/dev/null 2>&1; then
    log "Docker is not responding and systemctl is not available — cannot recover"
    echo "##vso[task.logissue type=error]Docker is not responding and systemctl is not available — cannot recover"
    log_diagnostics
    return 1
  fi

  # Log initial service state (is-active exits non-zero for non-active units, hence || true)
  initial_status=$(systemctl is-active docker 2>&1 || true)
  log "Docker service initial state: ${initial_status}"

  while [ "${elapsed}" -lt "${DOCKER_READY_TIMEOUT_SECONDS}" ]; do
    if docker info >/dev/null 2>&1; then
      log "Docker daemon is ready (waited ${elapsed}s, ${restart_count} restart(s) performed)"
      return 0
    fi

    consecutive_failures=$((consecutive_failures + 1))

    # Try restarting if the service is down, or if it reports active but is unresponsive
    svc_status=$(systemctl is-active docker 2>&1 || true)
    should_restart=false
    if [ "${svc_status}" != "active" ]; then
      should_restart=true
    elif [ "${consecutive_failures}" -ge "${force_restart_after}" ]; then
      log "Docker service reports active but has been unresponsive for ${consecutive_failures} checks"
      should_restart=true
    fi

    if [ "${should_restart}" = true ] && [ "${restart_count}" -lt "${DOCKER_MAX_RESTARTS}" ]; then
      restart_count=$((restart_count + 1))
      log "Docker service is ${svc_status}. Attempting restart ${restart_count}/${DOCKER_MAX_RESTARTS}..."
      # The restart result is intentionally not checked: the next loop iteration
      # re-probes Docker and decides what to do.
      try_restart_docker
      sleep 2
      # Give the freshly restarted service a new grace period before forcing another restart
      consecutive_failures=0
    elif [ "${should_restart}" = true ] && [ "${restart_count}" -ge "${DOCKER_MAX_RESTARTS}" ]; then
      log "Docker service is ${svc_status} but max restarts (${DOCKER_MAX_RESTARTS}) exhausted — giving up"
      echo "##vso[task.logissue type=error]Docker service is ${svc_status} but max restarts (${DOCKER_MAX_RESTARTS}) exhausted"
      log_diagnostics
      return 1
    fi

    log "Docker not ready yet (${elapsed}s elapsed), retrying in ${DOCKER_READY_CHECK_INTERVAL_SECONDS}s..."
    sleep "${DOCKER_READY_CHECK_INTERVAL_SECONDS}"
    elapsed=$((elapsed + DOCKER_READY_CHECK_INTERVAL_SECONDS))
  done

  log "Docker daemon did not become ready within ${DOCKER_READY_TIMEOUT_SECONDS}s after ${restart_count} restart(s)"
  echo "##vso[task.logissue type=error]Docker daemon did not become ready within ${DOCKER_READY_TIMEOUT_SECONDS}s after ${restart_count} restart(s)"
  log_diagnostics
  return 1
}

# Entry point. As the last command in the script, its return status becomes
# the script's exit status (0 = Docker ready, 1 = could not be made ready).
wait_for_docker
7 changes: 7 additions & 0 deletions .azure-pipelines/steps/ensure-docker-ready.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ parameters:
default: 3

steps:
# Runs the shell readiness check on every agent whose OS is not Windows_NT.
# Only the timeout and the restart cap are forwarded from the template parameters;
# the check interval is not passed, so the script's own default applies.
# NOTE(review): the condition matches any non-Windows agent (macOS included) while
# the script relies on systemctl for recovery — confirm this is intended.
# NOTE(review): a custom condition replaces the implicit succeeded() check — confirm
# this step should still run after an earlier step has failed.
- bash: ./.azure-pipelines/steps/ensure-docker-ready-linux.sh
displayName: Ensure Docker daemon is ready (Linux)
condition: ne(variables['Agent.OS'], 'Windows_NT')
env:
DOCKER_READY_TIMEOUT_SECONDS: ${{ parameters.timeoutSeconds }}
DOCKER_MAX_RESTARTS: ${{ parameters.maxRestarts }}

- powershell: |
# Prevent PowerShell from treating native command stderr as a terminating error
$ErrorActionPreference = 'Continue'
Expand Down
4 changes: 3 additions & 1 deletion .azure-pipelines/steps/run-in-docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ parameters:
default: 0

steps:
- template: ensure-docker-ready.yml

- ${{ if eq(parameters.build, true) }}:
- script: |
if [ -z "${{ parameters.useNativeSdkVersion }}" ]; then
Expand Down Expand Up @@ -111,4 +113,4 @@ steps:
displayName: Run '${{ parameters.command }}' in Docker
retryCountOnTaskFailure: ${{ parameters.retryCountForRunCommand }}
env:
DD_LOGGER_DD_API_KEY: ${{ parameters.apiKey }}
DD_LOGGER_DD_API_KEY: ${{ parameters.apiKey }}
2 changes: 2 additions & 0 deletions .azure-pipelines/ultimate-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2143,6 +2143,8 @@ stages:
displayName: BuildWindowsIntegrationTests
retryCountOnTaskFailure: 3

- template: steps/ensure-docker-ready.yml
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch 👍


- powershell: |
mkdir -Force ./artifacts/build_data/snapshots
mkdir -Force ./artifacts/build_data/logs/LoaderOptimizationStartup
Expand Down
Loading