Skip to content

Commit 4558a57

Browse files
HYPERFLEET-752 | ci: Improve E2E CI Test deployment logic
1 parent f73bb04 commit 4558a57

5 files changed

Lines changed: 213 additions & 24 deletions

File tree

deploy-scripts/deploy-clm.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,9 @@ DELETE_K8S_RESOURCES="${DELETE_K8S_RESOURCES:-false}"
9898
DELETE_CLOUD_RESOURCES="${DELETE_CLOUD_RESOURCES:-false}"
9999
DELETE_ALL="${DELETE_ALL:-false}"
100100

101+
# Debug logging
102+
DEBUG_LOG_DIR="${DEBUG_LOG_DIR:-${PROJECT_ROOT}/.debug-work}"
103+
101104
# ============================================================================
102105
# Load Library Modules
103106
# ============================================================================
@@ -162,6 +165,8 @@ OPTIONAL FLAGS:
162165
# Execution Options
163166
--dry-run Print commands without executing
164167
--verbose Enable verbose logging
168+
--debug-log-dir <path> Directory to save debug logs on deployment failures
169+
(default: ${PROJECT_ROOT}/.debug-work)
165170
--help Show this help message
166171
167172
ENVIRONMENT VARIABLES:
@@ -322,6 +327,10 @@ parse_arguments() {
322327
VERBOSE=true
323328
shift
324329
;;
330+
--debug-log-dir)
331+
DEBUG_LOG_DIR="$2"
332+
shift 2
333+
;;
325334
--help|-h)
326335
print_usage
327336
exit 0

deploy-scripts/lib/adapter.sh

Lines changed: 73 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,27 @@ install_adapter_instance() {
107107
log_info "Resource type: ${resource_type}"
108108
log_info "Adapter name: ${adapter_name}"
109109

110-
# Construct release name
111-
local release_name="adapter-${resource_type}-${adapter_name}"
110+
# Generate random suffix to prevent namespace conflicts
111+
local random_suffix
112+
random_suffix=$(head /dev/urandom | LC_ALL=C tr -dc 'a-z0-9' | head -c 8)
113+
114+
# Construct release name with random suffix
115+
# Kubernetes resource names have a 63-character limit
116+
# Reserve ~15 characters for Helm's deployment/pod suffixes
117+
local max_release_name_length=48
118+
local base_without_suffix="adapter-${resource_type}-${adapter_name}"
119+
120+
# Calculate max base length (reserve space for "-" + suffix)
121+
local max_base_length=$((max_release_name_length - ${#random_suffix} - 1))
122+
123+
# Truncate base if necessary, but always keep the suffix
124+
if [[ ${#base_without_suffix} -gt ${max_base_length} ]]; then
125+
base_without_suffix="${base_without_suffix:0:${max_base_length}}"
126+
log_warning "Release name base truncated to ${max_base_length} chars to stay within Kubernetes limits"
127+
fi
128+
129+
local release_name="${base_without_suffix}-${random_suffix}"
130+
log_info "Release name (with random suffix): ${release_name} (length: ${#release_name})"
112131

113132
# Source adapter config directory (using ADAPTERS_FILE_DIR env var)
114133
local adapter_configs_dir="${ADAPTERS_FILE_DIR:-${TESTDATA_DIR}/adapter-configs}"
@@ -165,7 +184,7 @@ install_adapter_instance() {
165184
fi
166185

167186

168-
# Build helm command
187+
# Build helm command with labels to track adapter metadata
169188
local helm_cmd=(
170189
helm upgrade --install
171190
"${release_name}"
@@ -185,6 +204,7 @@ install_adapter_instance() {
185204
--set "broker.googlepubsub.subscriptionId=${subscription_id}"
186205
--set "broker.googlepubsub.topic=${topic}"
187206
--set "broker.googlepubsub.deadLetterTopic=${dead_letter_topic}"
207+
--labels "adapter-resource-type=${resource_type},adapter-name=${adapter_name}"
188208
)
189209

190210
log_info "Executing Helm command:"
@@ -200,12 +220,36 @@ install_adapter_instance() {
200220
log_success "Adapter ${adapter_name} for ${resource_type} is running and healthy"
201221
else
202222
log_error "Adapter ${adapter_name} for ${resource_type} deployment failed health check"
203-
log_info "Checking pod logs for troubleshooting:"
204-
kubectl logs -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${release_name}" --tail=50 2>/dev/null || true
223+
224+
# Capture debug logs before cleanup
225+
local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
226+
capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"
227+
228+
# Cleanup failed deployment
229+
log_warning "Cleaning up failed adapter deployment: ${release_name}"
230+
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
231+
log_info "Failed adapter deployment cleaned up successfully"
232+
else
233+
log_warning "Failed to cleanup adapter deployment, it may need manual cleanup"
234+
fi
205235
return 1
206236
fi
207237
else
208238
log_error "Failed to install adapter ${adapter_name} for ${resource_type}"
239+
240+
# Check if release was created (partial deployment) and cleanup
241+
if helm list -n "${NAMESPACE}" 2>/dev/null | grep -q "^${release_name}"; then
242+
# Capture debug logs before cleanup
243+
local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
244+
capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"
245+
246+
log_warning "Cleaning up failed adapter deployment: ${release_name}"
247+
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
248+
log_info "Failed adapter deployment cleaned up successfully"
249+
else
250+
log_warning "Failed to cleanup adapter deployment, it may need manual cleanup"
251+
fi
252+
fi
209253
return 1
210254
fi
211255
}
@@ -259,29 +303,38 @@ uninstall_adapter_instance() {
259303
log_info "Resource type: ${resource_type}"
260304
log_info "Adapter name: ${adapter_name}"
261305

262-
# Construct release name
263-
local release_name="adapter-${resource_type}-${adapter_name}"
306+
# Find all releases by searching for Helm labels (avoids pattern matching issues with truncated names)
307+
log_info "Searching for releases with labels: adapter-resource-type=${resource_type}, adapter-name=${adapter_name}"
308+
local matching_releases
309+
matching_releases=$(helm list -n "${NAMESPACE}" --selector "adapter-resource-type=${resource_type},adapter-name=${adapter_name}" -q 2>/dev/null)
264310

265-
# Check if release exists
266-
if ! helm list -n "${NAMESPACE}" 2>/dev/null | grep -q "^${release_name}"; then
267-
log_warning "Release '${release_name}' not found in namespace '${NAMESPACE}'"
311+
if [[ -z "${matching_releases}" ]]; then
312+
log_warning "No releases found with labels adapter-resource-type=${resource_type}, adapter-name=${adapter_name} in namespace '${NAMESPACE}'"
268313
return 0
269314
fi
270315

271-
if [[ "${DRY_RUN}" == "true" ]]; then
272-
log_info "[DRY-RUN] Would uninstall adapter (release: ${release_name})"
273-
return 0
274-
fi
316+
# Uninstall all matching releases
317+
local uninstall_errors=0
318+
while IFS= read -r release_name; do
319+
if [[ "${DRY_RUN}" == "true" ]]; then
320+
log_info "[DRY-RUN] Would uninstall adapter (release: ${release_name})"
321+
else
322+
log_info "Uninstalling adapter ${adapter_name} for ${resource_type} (release: ${release_name})..."
323+
log_info "Executing: helm uninstall ${release_name} -n ${NAMESPACE} --wait --timeout 5m"
275324

276-
log_info "Uninstalling adapter ${adapter_name} for ${resource_type}..."
277-
log_info "Executing: helm uninstall ${release_name} -n ${NAMESPACE} --wait --timeout 5m"
325+
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m; then
326+
log_success "Adapter ${adapter_name} for ${resource_type} (release: ${release_name}) uninstalled successfully"
327+
else
328+
log_error "Failed to uninstall adapter ${adapter_name} for ${resource_type} (release: ${release_name})"
329+
((uninstall_errors++))
330+
fi
331+
fi
332+
done <<< "${matching_releases}"
278333

279-
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m; then
280-
log_success "Adapter ${adapter_name} for ${resource_type} uninstalled successfully"
281-
else
282-
log_error "Failed to uninstall adapter ${adapter_name} for ${resource_type}"
334+
if [[ ${uninstall_errors} -gt 0 ]]; then
283335
return 1
284336
fi
337+
return 0
285338
}
286339

287340
uninstall_adapters() {

deploy-scripts/lib/api.sh

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,36 @@ install_api() {
8383
log_success "API is running and healthy"
8484
else
8585
log_error "API deployment failed health check"
86-
log_info "Checking pod logs for troubleshooting:"
87-
kubectl logs -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${release_name}" --tail=50 2>/dev/null || true
86+
87+
# Capture debug logs before cleanup
88+
local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
89+
capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"
90+
91+
# Cleanup failed deployment
92+
log_warning "Cleaning up failed API deployment: ${release_name}"
93+
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
94+
log_info "Failed API deployment cleaned up successfully"
95+
else
96+
log_warning "Failed to cleanup API deployment, it may need manual cleanup"
97+
fi
8898
return 1
8999
fi
90100
else
91101
log_error "Failed to install API"
102+
103+
# Check if release was created (partial deployment) and cleanup
104+
if helm list -n "${NAMESPACE}" 2>/dev/null | grep -q "^${release_name}"; then
105+
# Capture debug logs before cleanup
106+
local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
107+
capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"
108+
109+
log_warning "Cleaning up failed API deployment: ${release_name}"
110+
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
111+
log_info "Failed API deployment cleaned up successfully"
112+
else
113+
log_warning "Failed to cleanup API deployment, it may need manual cleanup"
114+
fi
115+
fi
92116
return 1
93117
fi
94118
}

deploy-scripts/lib/common.sh

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,85 @@ verify_pod_health() {
184184
return 1
185185
}
186186

187+
# ============================================================================
188+
# Debug Log Capture
189+
# ============================================================================
190+
191+
capture_debug_logs() {
  # Capture diagnostic state for a failing component before it is cleaned
  # up: pod logs (current + previous), pod descriptions/status/manifests,
  # namespace events, workload status, and service/endpoint info.
  #
  # Arguments:
  #   $1 - namespace       Kubernetes namespace to inspect
  #   $2 - selector        label selector identifying the component's pods
  #   $3 - component_name  name used to prefix the generated files
  #   $4 - output_dir      optional destination dir (default: ${WORK_DIR}/debug-logs)
  #
  # Returns: 0 always — capture is best-effort; individual kubectl failures
  # are swallowed so a broken cluster state never masks the original
  # deployment error in the caller.
  local namespace="$1"
  local selector="$2"
  local component_name="$3"
  local output_dir="${4:-${WORK_DIR}/debug-logs}"

  log_section "Capturing Debug Logs for ${component_name}"

  # Bail out gracefully if the directory cannot be created; otherwise every
  # redirection below fails, and the unguarded heredoc near the end would
  # abort the calling cleanup path if the script runs under 'set -e'.
  if ! mkdir -p "${output_dir}"; then
    log_warning "Cannot create debug log directory '${output_dir}'; skipping debug capture"
    return 0
  fi

  local timestamp
  timestamp=$(date +"%Y%m%d-%H%M%S")
  local log_prefix="${output_dir}/${component_name}-${timestamp}"

  log_info "Saving debug logs to: ${log_prefix}-*"

  # Current pod logs from all containers.
  log_info "Capturing pod logs..."
  kubectl logs -n "${namespace}" -l "${selector}" --all-containers=true --prefix=true > "${log_prefix}-pods.log" 2>&1 || true

  # Previous-instance logs — often the only record left by crash-looping containers.
  log_info "Capturing previous pod logs..."
  kubectl logs -n "${namespace}" -l "${selector}" --all-containers=true --prefix=true --previous > "${log_prefix}-pods-previous.log" 2>&1 || true

  log_info "Capturing pod descriptions..."
  kubectl describe pods -n "${namespace}" -l "${selector}" > "${log_prefix}-pods-describe.txt" 2>&1 || true

  log_info "Capturing pod status..."
  kubectl get pods -n "${namespace}" -l "${selector}" -o wide > "${log_prefix}-pods-status.txt" 2>&1 || true
  kubectl get pods -n "${namespace}" -l "${selector}" -o yaml > "${log_prefix}-pods-yaml.yaml" 2>&1 || true

  # Events are captured namespace-wide (events are not filtered by the pod
  # selector); they frequently explain scheduling or image-pull failures
  # that never appear in pod logs.
  log_info "Capturing namespace events..."
  kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' > "${log_prefix}-events.txt" 2>&1 || true

  log_info "Capturing deployment/statefulset status..."
  kubectl get deployments,statefulsets -n "${namespace}" -l "${selector}" -o wide > "${log_prefix}-workloads-status.txt" 2>&1 || true
  kubectl get deployments,statefulsets -n "${namespace}" -l "${selector}" -o yaml > "${log_prefix}-workloads-yaml.yaml" 2>&1 || true

  log_info "Capturing services and endpoints..."
  kubectl get svc,endpoints -n "${namespace}" -l "${selector}" -o wide > "${log_prefix}-network.txt" 2>&1 || true

  # Index file so a human browsing CI artifacts knows what each file holds.
  cat > "${log_prefix}-summary.txt" <<EOF || true
Debug Log Capture Summary
=========================
Component: ${component_name}
Namespace: ${namespace}
Selector: ${selector}
Timestamp: ${timestamp}

Files Generated:
- ${log_prefix}-pods.log (current pod logs)
- ${log_prefix}-pods-previous.log (previous pod logs for crashed containers)
- ${log_prefix}-pods-describe.txt (pod descriptions)
- ${log_prefix}-pods-status.txt (pod status)
- ${log_prefix}-pods-yaml.yaml (pod YAML manifests)
- ${log_prefix}-events.txt (namespace events)
- ${log_prefix}-workloads-status.txt (deployment/statefulset status)
- ${log_prefix}-workloads-yaml.yaml (deployment/statefulset YAML manifests)
- ${log_prefix}-network.txt (services and endpoints)
EOF

  log_success "Debug logs captured successfully"
  log_info "Debug log location: ${output_dir}/"
  log_info "Log prefix: ${component_name}-${timestamp}-*"

  return 0
}
265+
187266
# ============================================================================
188267
# Namespace Management
189268
# ============================================================================

deploy-scripts/lib/sentinel.sh

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,12 +78,36 @@ install_sentinel_instance() {
7878
log_success "${component_name} is running and healthy"
7979
else
8080
log_error "${component_name} deployment failed health check"
81-
log_info "Checking pod logs for troubleshooting:"
82-
kubectl logs -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${release_name}" --tail=50 2>/dev/null || true
81+
82+
# Capture debug logs before cleanup
83+
local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
84+
capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"
85+
86+
# Cleanup failed deployment
87+
log_warning "Cleaning up failed ${component_name} deployment: ${release_name}"
88+
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
89+
log_info "Failed ${component_name} deployment cleaned up successfully"
90+
else
91+
log_warning "Failed to cleanup ${component_name} deployment, it may need manual cleanup"
92+
fi
8393
return 1
8494
fi
8595
else
8696
log_error "Failed to install ${component_name}"
97+
98+
# Check if release was created (partial deployment) and cleanup
99+
if helm list -n "${NAMESPACE}" 2>/dev/null | grep -q "^${release_name}"; then
100+
# Capture debug logs before cleanup
101+
local debug_log_dir="${DEBUG_LOG_DIR:-${WORK_DIR}/debug-logs}"
102+
capture_debug_logs "${NAMESPACE}" "app.kubernetes.io/instance=${release_name}" "${release_name}" "${debug_log_dir}"
103+
104+
log_warning "Cleaning up failed ${component_name} deployment: ${release_name}"
105+
if helm uninstall "${release_name}" -n "${NAMESPACE}" --wait --timeout 5m 2>/dev/null; then
106+
log_info "Failed ${component_name} deployment cleaned up successfully"
107+
else
108+
log_warning "Failed to cleanup ${component_name} deployment, it may need manual cleanup"
109+
fi
110+
fi
87111
return 1
88112
fi
89113
}

0 commit comments

Comments
 (0)