From bec5898b8e3cb737f99704fd3ff791552a6fcb9a Mon Sep 17 00:00:00 2001
From: Ying Zhang
Date: Fri, 20 Mar 2026 09:56:10 +0800
Subject: [PATCH] HYPERFLEET-752 | ci: Improve E2E CI Test deployment logic

---
 deploy-scripts/deploy-clm.sh   |  9 ++++
 deploy-scripts/lib/adapter.sh  | 93 ++++++++++++++++++++++++++--------
 deploy-scripts/lib/api.sh      | 28 +++++++++-
 deploy-scripts/lib/common.sh   | 87 +++++++++++++++++++++++++++++++
 deploy-scripts/lib/sentinel.sh | 28 +++++++++-
 5 files changed, 221 insertions(+), 24 deletions(-)

diff --git a/deploy-scripts/deploy-clm.sh b/deploy-scripts/deploy-clm.sh
index 26ee72e..dc4853e 100755
--- a/deploy-scripts/deploy-clm.sh
+++ b/deploy-scripts/deploy-clm.sh
@@ -98,6 +98,9 @@ DELETE_K8S_RESOURCES="${DELETE_K8S_RESOURCES:-false}"
 DELETE_CLOUD_RESOURCES="${DELETE_CLOUD_RESOURCES:-false}"
 DELETE_ALL="${DELETE_ALL:-false}"
 
+# Debug logging
+DEBUG_LOG_DIR="${DEBUG_LOG_DIR:-${PROJECT_ROOT}/.debug-work}"
+
 # ============================================================================
 # Load Library Modules
 # ============================================================================
@@ -162,6 +165,8 @@ OPTIONAL FLAGS:
   # Execution Options
   --dry-run                Print commands without executing
   --verbose                Enable verbose logging
+  --debug-log-dir          Directory to save debug logs on deployment failures
+                           (default: ${PROJECT_ROOT}/.debug-work)
   --help                   Show this help message
 
 ENVIRONMENT VARIABLES:
@@ -322,6 +327,10 @@ parse_arguments() {
                 VERBOSE=true
                 shift
                 ;;
+            --debug-log-dir)
+                DEBUG_LOG_DIR="$2"
+                shift 2
+                ;;
             --help|-h)
                 print_usage
                 exit 0
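Note for reviewers: the new flag's precedence is default < environment variable < flag. The `:-` expansion applies the default when the script loads, an exported DEBUG_LOG_DIR overrides that default, and --debug-log-dir wins over both because argument parsing runs afterwards. A minimal standalone sketch of that resolution order (variable names mirror the patch; the trailing echo is illustrative only):

    #!/usr/bin/env bash
    # Default applies only when DEBUG_LOG_DIR is unset or empty.
    PROJECT_ROOT="${PROJECT_ROOT:-$(pwd)}"
    DEBUG_LOG_DIR="${DEBUG_LOG_DIR:-${PROJECT_ROOT}/.debug-work}"

    # Flag parsing runs after the defaults, so the flag wins over both.
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --debug-log-dir) DEBUG_LOG_DIR="$2"; shift 2 ;;
            *) shift ;;
        esac
    done

    echo "debug logs -> ${DEBUG_LOG_DIR}"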
--labels "adapter-resource-type=${resource_type},adapter-name=${adapter_name}" ) log_info "Executing Helm command:" @@ -200,12 +220,36 @@ install_adapter_instance() { log_success "Adapter ${adapter_name} for ${resource_type} is running and healthy" else log_error "Adapter ${adapter_name} for ${resource_type} deployment failed health check" - log_info "Checking pod logs for troubleshooting:" - kubectl logs -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${release_name}" --tail=50 2>/dev/null || true + + # Capture debug logs before cleanup + local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}" + capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}" + + # Cleanup failed deployment + log_warning "Cleaning up failed adapter deployment: ${release_name}" + if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then + log_info "Failed adapter deployment cleaned up successfully" + else + log_warning "Failed to cleanup adapter deployment, it may need manual cleanup" + fi return 1 fi else log_error "Failed to install adapter ${adapter_name} for ${resource_type}" + + # Check if release was created (partial deployment) and cleanup + if helm list -n "${NAMESPACE}" 2>/dev/null | grep -q "^${release_name}"; then + # Capture debug logs before cleanup + local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}" + capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}" + + log_warning "Cleaning up failed adapter deployment: ${release_name}" + if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then + log_info "Failed adapter deployment cleaned up successfully" + else + log_warning "Failed to cleanup adapter deployment, it may need manual cleanup" + fi + fi return 1 fi } @@ -259,29 +303,38 @@ uninstall_adapter_instance() { log_info "Resource type: ${resource_type}" log_info "Adapter name: ${adapter_name}" - # Construct release name - local release_name="adapter-${resource_type}-${adapter_name}" + # Find all releases by searching for Helm labels (avoids pattern matching issues with truncated names) + log_info "Searching for releases with labels: adapter-resource-type=${resource_type}, adapter-name=${adapter_name}" + local matching_releases + matching_releases=$(helm list -n "${NAMESPACE}" --selector "adapter-resource-type=${resource_type},adapter-name=${adapter_name}" -q 2>/dev/null) - # Check if release exists - if ! helm list -n "${NAMESPACE}" 2>/dev/null | grep -q "^${release_name}"; then - log_warning "Release '${release_name}' not found in namespace '${NAMESPACE}'" + if [[ -z "${matching_releases}" ]]; then + log_warning "No releases found with labels adapter-resource-type=${resource_type}, adapter-name=${adapter_name} in namespace '${NAMESPACE}'" return 0 fi - if [[ "${DRY_RUN}" == "true" ]]; then - log_info "[DRY-RUN] Would uninstall adapter (release: ${release_name})" - return 0 - fi + # Uninstall all matching releases + local uninstall_errors=0 + while IFS= read -r release_name; do + if [[ "${DRY_RUN}" == "true" ]]; then + log_info "[DRY-RUN] Would uninstall adapter (release: ${release_name})" + else + log_info "Uninstalling adapter ${adapter_name} for ${resource_type} (release: ${release_name})..." + log_info "Executing: helm uninstall ${release_name} -n ${NAMESPACE} --wait --timeout 5m" - log_info "Uninstalling adapter ${adapter_name} for ${resource_type}..." 
- log_info "Executing: helm uninstall ${release_name} -n ${NAMESPACE} --wait --timeout 5m" + if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m; then + log_success "Adapter ${adapter_name} for ${resource_type} (release: ${release_name}) uninstalled successfully" + else + log_error "Failed to uninstall adapter ${adapter_name} for ${resource_type} (release: ${release_name})" + ((uninstall_errors++)) + fi + fi + done <<< "${matching_releases}" - if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m; then - log_success "Adapter ${adapter_name} for ${resource_type} uninstalled successfully" - else - log_error "Failed to uninstall adapter ${adapter_name} for ${resource_type}" + if [[ ${uninstall_errors} -gt 0 ]]; then return 1 fi + return 0 } uninstall_adapters() { diff --git a/deploy-scripts/lib/api.sh b/deploy-scripts/lib/api.sh index f903719..418f8b7 100755 --- a/deploy-scripts/lib/api.sh +++ b/deploy-scripts/lib/api.sh @@ -83,12 +83,36 @@ install_api() { log_success "API is running and healthy" else log_error "API deployment failed health check" - log_info "Checking pod logs for troubleshooting:" - kubectl logs -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${release_name}" --tail=50 2>/dev/null || true + + # Capture debug logs before cleanup + local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}" + capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}" + + # Cleanup failed deployment + log_warning "Cleaning up failed API deployment: ${release_name}" + if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then + log_info "Failed API deployment cleaned up successfully" + else + log_warning "Failed to cleanup API deployment, it may need manual cleanup" + fi return 1 fi else log_error "Failed to install API" + + # Check if release was created (partial deployment) and cleanup + if helm list -n "${NAMESPACE}" 2>/dev/null | grep -q "^${release_name}"; then + # Capture debug logs before cleanup + local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}" + capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}" + + log_warning "Cleaning up failed API deployment: ${release_name}" + if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then + log_info "Failed API deployment cleaned up successfully" + else + log_warning "Failed to cleanup API deployment, it may need manual cleanup" + fi + fi return 1 fi } diff --git a/deploy-scripts/lib/common.sh b/deploy-scripts/lib/common.sh index ac15a72..7890618 100755 --- a/deploy-scripts/lib/common.sh +++ b/deploy-scripts/lib/common.sh @@ -184,6 +184,93 @@ verify_pod_health() { return 1 } +# ============================================================================ +# Debug Log Capture +# ============================================================================ + +capture_debug_logs() { + local namespace="$1" + local selector="$2" + local component_name="$3" + local output_dir="${4:-${WORK_DIR:-${PWD}}/debug-logs}" + local capture_failed=false + + log_section "Capturing Debug Logs for ${component_name}" + + # Create output directory + if ! 
mkdir -p "${output_dir}"; then + log_error "Failed to create debug log directory: ${output_dir}" + return 1 + fi + + local timestamp + timestamp=$(date +"%Y%m%d-%H%M%S") + local log_prefix="${output_dir}/${component_name}-${timestamp}-$$-${RANDOM}" + + log_info "Saving debug logs to: ${log_prefix}-*" + + # Capture pod logs + log_info "Capturing pod logs..." + kubectl logs -n "${namespace}" -l "${selector}" --all-containers=true --prefix=true > "${log_prefix}-pods.log" 2>&1 || { log_warning "Failed to capture current pod logs"; capture_failed=true; } + + # Capture previous pod logs (for crashed containers) + log_info "Capturing previous pod logs..." + kubectl logs -n "${namespace}" -l "${selector}" --all-containers=true --prefix=true --previous > "${log_prefix}-pods-previous.log" 2>&1 || true + + # Capture pod descriptions + log_info "Capturing pod descriptions..." + kubectl describe pods -n "${namespace}" -l "${selector}" > "${log_prefix}-pods-describe.txt" 2>&1 || { log_warning "Failed to capture pod descriptions"; capture_failed=true; } + + # Capture pod status + log_info "Capturing pod status..." + kubectl get pods -n "${namespace}" -l "${selector}" -o wide > "${log_prefix}-pods-status.txt" 2>&1 || { log_warning "Failed to capture pod status"; capture_failed=true; } + kubectl get pods -n "${namespace}" -l "${selector}" -o yaml > "${log_prefix}-pods-yaml.yaml" 2>&1 || { log_warning "Failed to capture pod YAML"; capture_failed=true; } + + # Capture events + log_info "Capturing namespace events..." + kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' > "${log_prefix}-events.txt" 2>&1 || { log_warning "Failed to capture namespace events"; capture_failed=true; } + + # Capture deployment/statefulset status if exists + log_info "Capturing deployment/statefulset status..." + kubectl get deployments,statefulsets -n "${namespace}" -l "${selector}" -o wide > "${log_prefix}-workloads-status.txt" 2>&1 || { log_warning "Failed to capture workload status"; capture_failed=true; } + kubectl get deployments,statefulsets -n "${namespace}" -l "${selector}" -o yaml > "${log_prefix}-workloads-yaml.yaml" 2>&1 || { log_warning "Failed to capture workload YAML"; capture_failed=true; } + + # Capture services and endpoints + log_info "Capturing services and endpoints..." 
diff --git a/deploy-scripts/lib/sentinel.sh b/deploy-scripts/lib/sentinel.sh
--- a/deploy-scripts/lib/sentinel.sh
+++ b/deploy-scripts/lib/sentinel.sh
@@ … @@
             log_success "${component_name} is running and healthy"
         else
             log_error "${component_name} deployment failed health check"
-            log_info "Checking pod logs for troubleshooting:"
-            kubectl logs -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${release_name}" --tail=50 2>/dev/null || true
+
+            # Capture debug logs before cleanup
+            local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
+            capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"
+
+            # Cleanup failed deployment
+            log_warning "Cleaning up failed ${component_name} deployment: ${release_name}"
+            if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
+                log_info "Failed ${component_name} deployment cleaned up successfully"
+            else
+                log_warning "Failed to clean up ${component_name} deployment; it may need manual cleanup"
+            fi
             return 1
         fi
     else
         log_error "Failed to install ${component_name}"
+
+        # Check if a release was created (partial deployment) and clean it up
+        if helm list -n "${NAMESPACE}" 2>/dev/null | grep -q "^${release_name}"; then
+            # Capture debug logs before cleanup
+            local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
+            capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"
+
+            log_warning "Cleaning up failed ${component_name} deployment: ${release_name}"
+            if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
+                log_info "Failed ${component_name} deployment cleaned up successfully"
+            else
+                log_warning "Failed to clean up ${component_name} deployment; it may need manual cleanup"
+            fi
+        fi
         return 1
     fi
 }
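Since the point of all this is post-mortem debugging in CI, the debug directory should be archived as a job artifact. A rough sketch of a CI wrapper step (CI_ARTIFACTS_DIR is a hypothetical variable; substitute the actual artifact mechanism of the CI system in use):

    # Run the deployment; on failure, bundle any captured debug logs for upload.
    DEBUG_DIR="${PWD}/.debug-work"
    if ! ./deploy-scripts/deploy-clm.sh --debug-log-dir "${DEBUG_DIR}"; then
        if [[ -d "${DEBUG_DIR}" ]]; then
            tar -czf "${CI_ARTIFACTS_DIR:-.}/debug-logs.tar.gz" \
                -C "$(dirname "${DEBUG_DIR}")" "$(basename "${DEBUG_DIR}")"
        fi
        exit 1
    fi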