Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions deploy-scripts/deploy-clm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ DELETE_K8S_RESOURCES="${DELETE_K8S_RESOURCES:-false}"
DELETE_CLOUD_RESOURCES="${DELETE_CLOUD_RESOURCES:-false}"
DELETE_ALL="${DELETE_ALL:-false}"

# Debug logging
DEBUG_LOG_DIR="${DEBUG_LOG_DIR:-${PROJECT_ROOT}/.debug-work}"

# ============================================================================
# Load Library Modules
# ============================================================================
Expand Down Expand Up @@ -162,6 +165,8 @@ OPTIONAL FLAGS:
# Execution Options
--dry-run Print commands without executing
--verbose Enable verbose logging
--debug-log-dir <path> Directory to save debug logs on deployment failures
(default: ${PROJECT_ROOT}/.debug-work)
--help Show this help message

ENVIRONMENT VARIABLES:
Expand Down Expand Up @@ -322,6 +327,10 @@ parse_arguments() {
VERBOSE=true
shift
;;
--debug-log-dir)
DEBUG_LOG_DIR="$2"
shift 2
;;
--help|-h)
print_usage
exit 0
Expand Down
93 changes: 73 additions & 20 deletions deploy-scripts/lib/adapter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,27 @@ install_adapter_instance() {
log_info "Resource type: ${resource_type}"
log_info "Adapter name: ${adapter_name}"

# Construct release name
local release_name="adapter-${resource_type}-${adapter_name}"
# Generate random suffix to prevent namespace conflicts
local random_suffix
random_suffix=$(head /dev/urandom | LC_ALL=C tr -dc 'a-z0-9' | head -c 8)

# Construct release name with random suffix
# Kubernetes resource names have a 63-character limit
# Reserve ~15 characters for Helm's deployment/pod suffixes
local max_release_name_length=48
local base_without_suffix="adapter-${resource_type}-${adapter_name}"

# Calculate max base length (reserve space for "-" + suffix)
local max_base_length=$((max_release_name_length - ${#random_suffix} - 1))

# Truncate base if necessary, but always keep the suffix
if [[ ${#base_without_suffix} -gt ${max_base_length} ]]; then
base_without_suffix="${base_without_suffix:0:${max_base_length}}"
log_warning "Release name base truncated to ${max_base_length} chars to stay within Kubernetes limits"
fi

local release_name="${base_without_suffix}-${random_suffix}"
log_info "Release name (with random suffix): ${release_name} (length: ${#release_name})"

# Source adapter config directory (using ADAPTERS_FILE_DIR env var)
local adapter_configs_dir="${ADAPTERS_FILE_DIR:-${TESTDATA_DIR}/adapter-configs}"
Expand Down Expand Up @@ -165,7 +184,7 @@ install_adapter_instance() {
fi


# Build helm command
# Build helm command with labels to track adapter metadata
local helm_cmd=(
helm upgrade --install
"${release_name}"
Expand All @@ -185,6 +204,7 @@ install_adapter_instance() {
--set "broker.googlepubsub.subscriptionId=${subscription_id}"
--set "broker.googlepubsub.topic=${topic}"
--set "broker.googlepubsub.deadLetterTopic=${dead_letter_topic}"
--labels "adapter-resource-type=${resource_type},adapter-name=${adapter_name}"
)

log_info "Executing Helm command:"
Expand All @@ -200,12 +220,36 @@ install_adapter_instance() {
log_success "Adapter ${adapter_name} for ${resource_type} is running and healthy"
else
log_error "Adapter ${adapter_name} for ${resource_type} deployment failed health check"
log_info "Checking pod logs for troubleshooting:"
kubectl logs -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${release_name}" --tail=50 2>/dev/null || true

# Capture debug logs before cleanup
local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"

# Cleanup failed deployment
log_warning "Cleaning up failed adapter deployment: ${release_name}"
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
log_info "Failed adapter deployment cleaned up successfully"
else
log_warning "Failed to cleanup adapter deployment, it may need manual cleanup"
fi
return 1
fi
else
log_error "Failed to install adapter ${adapter_name} for ${resource_type}"

# Check if release was created (partial deployment) and cleanup
if helm list -n "${NAMESPACE}" 2>/dev/null | grep -q "^${release_name}"; then
# Capture debug logs before cleanup
local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"

log_warning "Cleaning up failed adapter deployment: ${release_name}"
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
log_info "Failed adapter deployment cleaned up successfully"
else
log_warning "Failed to cleanup adapter deployment, it may need manual cleanup"
fi
fi
return 1
fi
}
Expand Down Expand Up @@ -259,29 +303,38 @@ uninstall_adapter_instance() {
log_info "Resource type: ${resource_type}"
log_info "Adapter name: ${adapter_name}"

# Construct release name
local release_name="adapter-${resource_type}-${adapter_name}"
# Find all releases by searching for Helm labels (avoids pattern matching issues with truncated names)
log_info "Searching for releases with labels: adapter-resource-type=${resource_type}, adapter-name=${adapter_name}"
local matching_releases
matching_releases=$(helm list -n "${NAMESPACE}" --selector "adapter-resource-type=${resource_type},adapter-name=${adapter_name}" -q 2>/dev/null)

# Check if release exists
if ! helm list -n "${NAMESPACE}" 2>/dev/null | grep -q "^${release_name}"; then
log_warning "Release '${release_name}' not found in namespace '${NAMESPACE}'"
if [[ -z "${matching_releases}" ]]; then
log_warning "No releases found with labels adapter-resource-type=${resource_type}, adapter-name=${adapter_name} in namespace '${NAMESPACE}'"
return 0
fi

if [[ "${DRY_RUN}" == "true" ]]; then
log_info "[DRY-RUN] Would uninstall adapter (release: ${release_name})"
return 0
fi
# Uninstall all matching releases
local uninstall_errors=0
while IFS= read -r release_name; do
if [[ "${DRY_RUN}" == "true" ]]; then
log_info "[DRY-RUN] Would uninstall adapter (release: ${release_name})"
else
log_info "Uninstalling adapter ${adapter_name} for ${resource_type} (release: ${release_name})..."
log_info "Executing: helm uninstall ${release_name} -n ${NAMESPACE} --wait --timeout 5m"

log_info "Uninstalling adapter ${adapter_name} for ${resource_type}..."
log_info "Executing: helm uninstall ${release_name} -n ${NAMESPACE} --wait --timeout 5m"
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m; then
log_success "Adapter ${adapter_name} for ${resource_type} (release: ${release_name}) uninstalled successfully"
else
log_error "Failed to uninstall adapter ${adapter_name} for ${resource_type} (release: ${release_name})"
((uninstall_errors++))
fi
fi
done <<< "${matching_releases}"

if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m; then
log_success "Adapter ${adapter_name} for ${resource_type} uninstalled successfully"
else
log_error "Failed to uninstall adapter ${adapter_name} for ${resource_type}"
if [[ ${uninstall_errors} -gt 0 ]]; then
return 1
fi
return 0
}

uninstall_adapters() {
Expand Down
28 changes: 26 additions & 2 deletions deploy-scripts/lib/api.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,36 @@ install_api() {
log_success "API is running and healthy"
else
log_error "API deployment failed health check"
log_info "Checking pod logs for troubleshooting:"
kubectl logs -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${release_name}" --tail=50 2>/dev/null || true

# Capture debug logs before cleanup
local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"

# Cleanup failed deployment
log_warning "Cleaning up failed API deployment: ${release_name}"
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
log_info "Failed API deployment cleaned up successfully"
else
log_warning "Failed to cleanup API deployment, it may need manual cleanup"
fi
return 1
fi
else
log_error "Failed to install API"

# Check if release was created (partial deployment) and cleanup
if helm list -n "${NAMESPACE}" 2>/dev/null | grep -q "^${release_name}"; then
# Capture debug logs before cleanup
local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"

log_warning "Cleaning up failed API deployment: ${release_name}"
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
log_info "Failed API deployment cleaned up successfully"
else
log_warning "Failed to cleanup API deployment, it may need manual cleanup"
fi
fi
return 1
fi
}
Expand Down
87 changes: 87 additions & 0 deletions deploy-scripts/lib/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,93 @@ verify_pod_health() {
return 1
}

# ============================================================================
# Debug Log Capture
# ============================================================================

#######################################
# Capture Kubernetes debug artifacts (pod logs, descriptions, events,
# workload/service manifests) for a failing component so the evidence
# survives the subsequent `helm uninstall` cleanup.
# Globals:
#   WORK_DIR (read, optional) - fallback base for the default output dir
# Arguments:
#   $1 - namespace to inspect
#   $2 - label selector identifying the component's resources
#   $3 - component name, used as the output file-name prefix
#   $4 - output directory (default: ${WORK_DIR:-$PWD}/debug-logs)
# Outputs:
#   One file per artifact plus a *-summary.txt index in the output dir.
# Returns:
#   0 on full capture; 1 if the directory cannot be created or any
#   non-optional capture step failed (partial artifacts may still exist).
#######################################
capture_debug_logs() {
  local namespace="$1"
  local selector="$2"
  local component_name="$3"
  local output_dir="${4:-${WORK_DIR:-${PWD}}/debug-logs}"
  local capture_failed=false

  log_section "Capturing Debug Logs for ${component_name}"

  # Create output directory
  if ! mkdir -p "${output_dir}"; then
    log_error "Failed to create debug log directory: ${output_dir}"
    return 1
  fi

  local timestamp
  timestamp=$(date +"%Y%m%d-%H%M%S")
  # PID + RANDOM suffix keeps concurrent captures (e.g. parallel deploys
  # failing in the same second) from clobbering each other's files.
  local log_prefix="${output_dir}/${component_name}-${timestamp}-$$-${RANDOM}"

  log_info "Saving debug logs to: ${log_prefix}-*"

  # Current pod logs from all containers; a failure here is surfaced.
  log_info "Capturing pod logs..."
  kubectl logs -n "${namespace}" -l "${selector}" --all-containers=true --prefix=true > "${log_prefix}-pods.log" 2>&1 || { log_warning "Failed to capture current pod logs"; capture_failed=true; }

  # Previous-container logs only exist after a restart/crash, so a failure
  # here is expected and deliberately ignored.
  log_info "Capturing previous pod logs..."
  kubectl logs -n "${namespace}" -l "${selector}" --all-containers=true --prefix=true --previous > "${log_prefix}-pods-previous.log" 2>&1 || true

  # Pod descriptions (conditions, probe failures, scheduling events).
  log_info "Capturing pod descriptions..."
  kubectl describe pods -n "${namespace}" -l "${selector}" > "${log_prefix}-pods-describe.txt" 2>&1 || { log_warning "Failed to capture pod descriptions"; capture_failed=true; }

  # Pod status, both human-readable (-o wide) and full YAML manifests.
  log_info "Capturing pod status..."
  kubectl get pods -n "${namespace}" -l "${selector}" -o wide > "${log_prefix}-pods-status.txt" 2>&1 || { log_warning "Failed to capture pod status"; capture_failed=true; }
  kubectl get pods -n "${namespace}" -l "${selector}" -o yaml > "${log_prefix}-pods-yaml.yaml" 2>&1 || { log_warning "Failed to capture pod YAML"; capture_failed=true; }

  # Namespace events (not filtered by selector: events reference pods by
  # name, so the selector would miss them).
  log_info "Capturing namespace events..."
  kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' > "${log_prefix}-events.txt" 2>&1 || { log_warning "Failed to capture namespace events"; capture_failed=true; }

  # Deployment/StatefulSet status and manifests, if any match the selector.
  log_info "Capturing deployment/statefulset status..."
  kubectl get deployments,statefulsets -n "${namespace}" -l "${selector}" -o wide > "${log_prefix}-workloads-status.txt" 2>&1 || { log_warning "Failed to capture workload status"; capture_failed=true; }
  kubectl get deployments,statefulsets -n "${namespace}" -l "${selector}" -o yaml > "${log_prefix}-workloads-yaml.yaml" 2>&1 || { log_warning "Failed to capture workload YAML"; capture_failed=true; }

  # Services and endpoints for networking diagnostics.
  log_info "Capturing services and endpoints..."
  kubectl get svc,endpoints -n "${namespace}" -l "${selector}" -o wide > "${log_prefix}-network.txt" 2>&1 || { log_warning "Failed to capture services and endpoints"; capture_failed=true; }

  # Index file so a human (or CI artifact collector) knows what was captured.
  cat > "${log_prefix}-summary.txt" <<EOF
Debug Log Capture Summary
=========================
Component: ${component_name}
Namespace: ${namespace}
Selector: ${selector}
Timestamp: ${timestamp}

Files Generated:
- ${log_prefix}-pods.log (current pod logs)
- ${log_prefix}-pods-previous.log (previous pod logs for crashed containers)
- ${log_prefix}-pods-describe.txt (pod descriptions)
- ${log_prefix}-pods-status.txt (pod status)
- ${log_prefix}-pods-yaml.yaml (pod YAML manifests)
- ${log_prefix}-events.txt (namespace events)
- ${log_prefix}-workloads-status.txt (deployment/statefulset status)
- ${log_prefix}-workloads-yaml.yaml (deployment/statefulset YAML manifests)
- ${log_prefix}-network.txt (services and endpoints)
EOF

  if [[ "${capture_failed}" == "true" ]]; then
    log_warning "Debug logs captured with partial failures"
    # Still report the location: partial artifacts are better than none.
    log_info "Debug log location: ${output_dir}/"
    return 1
  fi
  log_success "Debug logs captured successfully"
  log_info "Debug log location: ${output_dir}/"
  # Report the real prefix (includes the PID/RANDOM suffix) so the glob
  # printed here actually matches the generated file names.
  log_info "Log prefix: ${log_prefix##*/}-*"

  return 0
}

# ============================================================================
# Namespace Management
# ============================================================================
Expand Down
28 changes: 26 additions & 2 deletions deploy-scripts/lib/sentinel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,36 @@ install_sentinel_instance() {
log_success "${component_name} is running and healthy"
else
log_error "${component_name} deployment failed health check"
log_info "Checking pod logs for troubleshooting:"
kubectl logs -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${release_name}" --tail=50 2>/dev/null || true

# Capture debug logs before cleanup
local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"

# Cleanup failed deployment
log_warning "Cleaning up failed ${component_name} deployment: ${release_name}"
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
log_info "Failed ${component_name} deployment cleaned up successfully"
else
log_warning "Failed to cleanup ${component_name} deployment, it may need manual cleanup"
fi
return 1
fi
else
log_error "Failed to install ${component_name}"

# Check if release was created (partial deployment) and cleanup
if helm list -n "${NAMESPACE}" 2>/dev/null | grep -q "^${release_name}"; then
# Capture debug logs before cleanup
local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"

log_warning "Cleaning up failed ${component_name} deployment: ${release_name}"
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
log_info "Failed ${component_name} deployment cleaned up successfully"
else
log_warning "Failed to cleanup ${component_name} deployment, it may need manual cleanup"
fi
fi
return 1
fi
}
Expand Down