diff --git a/Makefile b/Makefile index 2ea9bd27..bf558f2c 100644 --- a/Makefile +++ b/Makefile @@ -106,6 +106,13 @@ deploy-local: echo "Error: The git submodules have not been initialized. Run 'git submodule update --init --recursive' first."; \ exit 1; \ fi + @if ! docker version >/dev/null 2>&1; then \ + echo "Error: Docker is not accessible. This could be due to:"; \ + echo " 1. Docker not installed - run: make setup-local"; \ + echo " 2. Docker group not active - run: newgrp docker && make deploy-local"; \ + echo " 3. Need to logout/login after setup - exit and re-enter your session"; \ + exit 1; \ + fi cd deployment && make up make crs-instance-id make wait-crs diff --git a/deployment/crs-architecture.sh b/deployment/crs-architecture.sh index afd51584..94e0246c 100755 --- a/deployment/crs-architecture.sh +++ b/deployment/crs-architecture.sh @@ -147,18 +147,24 @@ up() { # Set secrets echo -e "${BLU}Creating ghcr secret${NC}" - kubectl delete secret ghcr --namespace "$BUTTERCUP_NAMESPACE" || true + # Always recreate the ghcr secret to ensure it has the latest values + kubectl delete secret ghcr --namespace "$BUTTERCUP_NAMESPACE" >/dev/null 2>&1 || true kubectl create secret generic ghcr \ --namespace "$BUTTERCUP_NAMESPACE" \ --from-literal=pat="$GHCR_PAT" \ --from-literal=username="$GHCR_USERNAME" \ - --from-literal=scantron_github_pat="$SCANTRON_GITHUB_PAT" || echo -e "${GRN}ghcr secret already exists${NC}" + --from-literal=scantron_github_pat="$SCANTRON_GITHUB_PAT" echo -e "${BLU}Creating CRS_INSTANCE_ID${NC}" - CRS_INSTANCE_ID=$(echo $RANDOM | md5sum | head -c 20) - kubectl create configmap crs-instance-id \ - --namespace "$BUTTERCUP_NAMESPACE" \ - --from-literal=crs-instance-id="$CRS_INSTANCE_ID" || echo -e "${GRN}crs-instance-id configmap already exists${NC}" + # Check if configmap already exists + if kubectl get configmap crs-instance-id --namespace "$BUTTERCUP_NAMESPACE" >/dev/null 2>&1; then + echo -e "${GRN}crs-instance-id configmap already exists${NC}" 
+ else + CRS_INSTANCE_ID=$(echo $RANDOM | md5sum | head -c 20) + kubectl create configmap crs-instance-id \ + --namespace "$BUTTERCUP_NAMESPACE" \ + --from-literal=crs-instance-id="$CRS_INSTANCE_ID" + fi CRS_INSTANCE_ID=$(kubectl get configmap crs-instance-id \ --namespace "$BUTTERCUP_NAMESPACE" \ @@ -166,29 +172,39 @@ up() { echo -e "${GRN}CRS_INSTANCE_ID is $CRS_INSTANCE_ID${NC}" if [[ -n "${DOCKER_USERNAME}" ]] && [[ -n "${DOCKER_PAT}" ]]; then - kubectl create secret docker-registry docker-auth \ - --namespace "$BUTTERCUP_NAMESPACE" \ - --docker-server=docker.io \ - --docker-username="$DOCKER_USERNAME" \ - --docker-password="$DOCKER_PAT" || echo -e "${GRN}docker-registry secret already exists${NC}" + # Check if docker-auth secret already exists + if kubectl get secret docker-auth --namespace "$BUTTERCUP_NAMESPACE" >/dev/null 2>&1; then + echo -e "${GRN}docker-registry secret already exists${NC}" + else + kubectl create secret docker-registry docker-auth \ + --namespace "$BUTTERCUP_NAMESPACE" \ + --docker-server=docker.io \ + --docker-username="$DOCKER_USERNAME" \ + --docker-password="$DOCKER_PAT" + fi else echo -e "${GRN}Docker credentials have not been configured. 
Skipping creating optional docker-registry secret${NC}" fi # Create TLS certificate for registry cache echo -e "${BLU}Creating TLS certificate for registry cache${NC}" - REGISTRY_CACHE_HOST="registry-cache.${BUTTERCUP_NAMESPACE}.svc.cluster.local" - openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ - -keyout /tmp/registry-cache.key \ - -out /tmp/registry-cache.crt \ - -subj "/CN=${REGISTRY_CACHE_HOST}" \ - -addext "subjectAltName=DNS:${REGISTRY_CACHE_HOST},DNS:registry-cache,DNS:registry-cache.${BUTTERCUP_NAMESPACE},DNS:ghcr.io" - - kubectl create secret tls registry-cache-tls \ - --namespace "$BUTTERCUP_NAMESPACE" \ - --key=/tmp/registry-cache.key \ - --cert=/tmp/registry-cache.crt || echo -e "${GRN}registry-cache-tls secret already exists${NC}" - rm -f /tmp/registry-cache.key /tmp/registry-cache.crt + # Check if secret already exists + if kubectl get secret registry-cache-tls --namespace "$BUTTERCUP_NAMESPACE" >/dev/null 2>&1; then + echo -e "${GRN}registry-cache-tls secret already exists${NC}" + else + REGISTRY_CACHE_HOST="registry-cache.${BUTTERCUP_NAMESPACE}.svc.cluster.local" + openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ + -keyout /tmp/registry-cache.key \ + -out /tmp/registry-cache.crt \ + -subj "/CN=${REGISTRY_CACHE_HOST}" \ + -addext "subjectAltName=DNS:${REGISTRY_CACHE_HOST},DNS:registry-cache,DNS:registry-cache.${BUTTERCUP_NAMESPACE},DNS:ghcr.io" + + kubectl create secret tls registry-cache-tls \ + --namespace "$BUTTERCUP_NAMESPACE" \ + --key=/tmp/registry-cache.key \ + --cert=/tmp/registry-cache.crt + rm -f /tmp/registry-cache.key /tmp/registry-cache.crt + fi #deploy kubernetes resources in AKS cluster if [ "$TAILSCALE_ENABLED" = "true" ]; then diff --git a/deployment/env.template b/deployment/env.template index 1209ab89..fc7b72a3 100644 --- a/deployment/env.template +++ b/deployment/env.template @@ -64,6 +64,7 @@ export AZURE_API_BASE="" export AZURE_API_KEY="" export OPENAI_API_KEY="" export ANTHROPIC_API_KEY="" +# Note: At least one API 
key (OpenAI or Anthropic) is required # LangFuse settings, instructing LLM-applications to log their LLM traces # export LANGFUSE_ENABLED=true diff --git a/fix-litellm-secret.sh b/fix-litellm-secret.sh new file mode 100755 index 00000000..b040f4f8 --- /dev/null +++ b/fix-litellm-secret.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Workaround script for litellm-api-user secret issue +# This creates the missing secret that patcher and seed-gen need + +echo "Waiting for litellm secrets to be created..." +while ! kubectl get secret buttercup-litellm-api-secrets -n crs &>/dev/null; do + sleep 2 +done + +echo "Creating litellm-api-user secret..." +LITELLM_KEY=$(kubectl get secret buttercup-litellm-api-secrets -n crs -o jsonpath='{.data.BUTTERCUP_LITELLM_KEY}' | base64 -d) +kubectl create secret generic litellm-api-user -n crs --from-literal=API_KEY="$LITELLM_KEY" + +echo "Secret created successfully!" +kubectl get secret litellm-api-user -n crs \ No newline at end of file diff --git a/litellm-secret-bug-issue.md b/litellm-secret-bug-issue.md new file mode 100644 index 00000000..6a9b73aa --- /dev/null +++ b/litellm-secret-bug-issue.md @@ -0,0 +1,165 @@ +# Helm Deployment Fails: litellm-api-user Secret Not Created, Blocking Patcher and Seed-gen Pods + +## Summary +The Buttercup Helm deployment has a critical bug where the `patcher` and `seed-gen` pods fail to start because they depend on a secret (`litellm-api-user`) that is not reliably created. This causes the deployment to time out and fail, even though all other components start successfully. + +## Environment +- **Platform**: Ubuntu 22.04 ARM64 (via Multipass on macOS) +- **Kubernetes**: Minikube v1.36.0 +- **Deployment Method**: `make deploy-local` +- **Affected Components**: patcher, seed-gen + +## Problem Description + +### 1. 
Root Cause +The `litellm-user-keys-setup` post-install job is responsible for creating the `litellm-api-user` secret, but it frequently fails due to: +- Race condition with LiteLLM service startup +- Insufficient retry/timeout configuration +- No fallback mechanism when the job fails + +### 2. Symptoms +```bash +$ kubectl get pods -n crs +NAME READY STATUS RESTARTS AGE +buttercup-patcher-7d6646f85d-cv7ht 0/1 Init:0/3 0 24m +buttercup-seed-gen-57985f475f-fsz6v 0/1 Init:0/3 0 24m +``` + +Both pods are stuck in `Init:0/3` state with the error: +``` +Warning FailedMount MountVolume.SetUp failed for volume "api-key-secret" : secret "litellm-api-user" not found +``` + +### 3. Investigation Results + +The job that should create the secret fails: +```bash +$ kubectl get jobs -n crs +NAME STATUS COMPLETIONS DURATION AGE +litellm-user-keys-setup Failed 0/1 33m 33m +``` + +The job reaches its backoff limit trying to connect to LiteLLM and create a user API key. + +## Current Workaround + +Users can manually create the missing secret: +```bash +kubectl create secret generic litellm-api-user -n crs \ + --from-literal=API_KEY=$(kubectl get secret buttercup-litellm-api-secrets -n crs \ + -o jsonpath='{.data.BUTTERCUP_LITELLM_KEY}' | base64 -d) +``` + +Then restart the affected pods: +```bash +kubectl delete pod -l app=patcher -n crs +kubectl delete pod -l app=seed-gen -n crs +``` + +## Proposed Solutions + +### Option 1: Improve Job Robustness (Recommended) +Modify `deployment/k8s/templates/litellm-user-keys-job.yaml`: + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: litellm-user-keys-setup + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "10" # Ensure it runs after LiteLLM +spec: + backoffLimit: 10 # Increase from default 6 + activeDeadlineSeconds: 1800 # 30 minutes instead of default + template: + spec: + serviceAccountName: litellm-user-keys-setup + restartPolicy: OnFailure + initContainers: + - name: wait-for-litellm + image: 
curlimages/curl:8.6.0 + command: + - sh + - -c + - | + until curl -f http://{{ .Release.Name }}-litellm:4000/health/readiness; do + echo "Waiting for LiteLLM to be ready..." + sleep 5 + done + containers: + # ... rest of the job spec +``` + +### Option 2: Create Fallback Secret +Add a new template `deployment/k8s/templates/litellm-api-user-fallback.yaml`: + +```yaml +# Only create if the job didn't create it (caveat: `lookup` is evaluated at template render time, before hook jobs run, so on a fresh install it cannot see a secret the job creates later) +{{- if not (lookup "v1" "Secret" .Release.Namespace "litellm-api-user") }} +apiVersion: v1 +kind: Secret +metadata: + name: litellm-api-user + annotations: + "helm.sh/hook": post-install + "helm.sh/hook-weight": "20" # Run after the job +type: Opaque +data: + # Use the master key as fallback + API_KEY: {{ (lookup "v1" "Secret" .Release.Namespace (printf "%s-litellm-api-secrets" .Release.Name)).data.BUTTERCUP_LITELLM_KEY }} +{{- end }} +``` + +### Option 3: Use Existing Secret +Modify `patcher` and `seed-gen` deployments to use the existing `buttercup-litellm-api-secrets` directly: + +In `deployment/k8s/charts/patcher/templates/deployment.yaml` and `deployment/k8s/charts/seed-gen/templates/deployment.yaml`: + +```yaml +# Change from: +{{- include "buttercup.env.llm" (merge (dict "secretName" "litellm-api-user" "secretKey" "API_KEY") .) | nindent 8 }} + +# To: +{{- include "buttercup.env.llm" (merge (dict "secretName" (printf "%s-litellm-api-secrets" .Release.Name) "secretKey" "BUTTERCUP_LITELLM_KEY") .) | nindent 8 }} +``` + +### Option 4: Make Secret Creation Synchronous +Instead of using a post-install hook, make the secret creation part of the regular deployment: + +1. Create a Kubernetes Job (not a Hook) that runs before patcher/seed-gen +2. Add init containers to patcher/seed-gen that wait for the secret to exist +3. 
Use a ConfigMap to track job completion status + +## Impact +- **High severity**: Prevents successful deployment without manual intervention +- **Affects**: All new deployments, especially in resource-constrained environments +- **Frequency**: Appears to be more common in nested virtualization or ARM64 environments + +## Additional Context + +The issue is exacerbated by: +1. No clear error message to users about what failed +2. Helm reports a timeout without indicating which specific component failed +3. The post-install hook failure is silent unless users know to check jobs + +## Recommended Fix Priority +**Option 1** (Improve Job Robustness) is recommended as it: +- Maintains the intended architecture +- Provides better reliability without changing the security model +- Is backward compatible +- Gives a clear failure indication if it still fails + +## Testing Requirements +After implementing the fix, test: +1. Fresh deployment on resource-constrained systems +2. Deployment with slow LiteLLM startup +3. Upgrade scenarios from existing deployments +4. Rollback scenarios + +## Related Files +- `deployment/k8s/templates/litellm-user-keys-job.yaml` +- `deployment/k8s/templates/litellm-user-keys-setup-script.yaml` +- `deployment/k8s/charts/patcher/templates/deployment.yaml` +- `deployment/k8s/charts/seed-gen/templates/deployment.yaml` +- `deployment/k8s/templates/_helpers.tpl` \ No newline at end of file diff --git a/scripts/common.sh b/scripts/common.sh index 2825aecb..d76e0bcb 100755 --- a/scripts/common.sh +++ b/scripts/common.sh @@ -59,6 +59,18 @@ portable_sed() { fi } +# Function to verify Docker access +verify_docker_access() { + if ! docker version >/dev/null 2>&1; then + print_error "Docker is not accessible. This could be due to:" + echo " 1. Docker not installed - run: make setup-local" + echo " 2. Docker group not active - run: newgrp docker" + echo " 3. 
Not logged in after group change - logout and login again" + return 1 + fi + return 0 +} + # Function to install Docker install_docker() { print_status "Installing Docker..." @@ -76,13 +88,39 @@ install_docker() { print_status "Installing Docker buildx plugin..." sudo apt install -y docker-buildx-plugin print_success "Docker buildx plugin installed" + + # Check if Docker daemon is accessible + if ! docker version >/dev/null 2>&1; then + if groups | grep -q docker; then + # User is in docker group but session not refreshed + print_warning "Docker group membership detected but not active in current session" + print_error "Please run one of the following to continue:" + echo " Option 1: Exit and re-login to your session" + echo " Option 2: Run: newgrp docker && cd $(pwd) && make setup-local" + echo " Option 3: Run: sudo -E make setup-local (uses sudo for Docker)" + exit 1 + else + # User not in docker group yet + print_error "Docker permissions not configured properly" + print_error "Please logout and login again, then re-run: make setup-local" + exit 1 + fi + fi } # Function to install kubectl install_kubectl() { print_status "Installing kubectl..." if ! command_exists kubectl; then - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + # Detect architecture + ARCH=$(uname -m) + case $ARCH in + x86_64) KUBECTL_ARCH="amd64";; + aarch64|arm64) KUBECTL_ARCH="arm64";; + *) print_error "Unsupported architecture: $ARCH"; exit 1;; + esac + + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/${KUBECTL_ARCH}/kubectl" sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl rm kubectl print_success "kubectl installed successfully" @@ -109,9 +147,17 @@ install_helm() { install_minikube() { print_status "Installing Minikube..." if ! 
command_exists minikube; then - curl -LO https://github.com/kubernetes/minikube/releases/latest/download/minikube-linux-amd64 - sudo install minikube-linux-amd64 /usr/local/bin/minikube - rm minikube-linux-amd64 + # Detect architecture + ARCH=$(uname -m) + case $ARCH in + x86_64) MINIKUBE_ARCH="amd64";; + aarch64|arm64) MINIKUBE_ARCH="arm64";; + *) print_error "Unsupported architecture: $ARCH"; exit 1;; + esac + + curl -LO https://github.com/kubernetes/minikube/releases/latest/download/minikube-linux-${MINIKUBE_ARCH} + sudo install minikube-linux-${MINIKUBE_ARCH} /usr/local/bin/minikube + rm minikube-linux-${MINIKUBE_ARCH} print_success "Minikube installed successfully" else print_success "Minikube is already installed" @@ -476,34 +522,145 @@ configure_llm_budget_wrapper() { # Function to configure required API keys for local development configure_local_api_keys() { - print_status "Configuring required API keys for local development..." + print_status "Configuring AI API keys for local development..." # Source the env file to check current values if [ -f "deployment/env" ]; then source deployment/env fi - # OpenAI API Key (Optional) + # Check if we already have at least one LLM API key configured + local openai_configured=false + local anthropic_configured=false + + if [ -n "$OPENAI_API_KEY" ] && [ "$OPENAI_API_KEY" != "" ]; then + openai_configured=true + fi + + if [ -n "$ANTHROPIC_API_KEY" ] && [ "$ANTHROPIC_API_KEY" != "" ]; then + anthropic_configured=true + fi + + # If we already have at least one key configured, skip to optional services + if [ "$openai_configured" = true ] || [ "$anthropic_configured" = true ]; then + print_success "At least one LLM API key is already configured" + configure_optional_services + return 0 + fi + + # Main loop for LLM API key configuration + while true; do + print_linebreak + print_error "REQUIRED: At least ONE AI API key is needed for Buttercup to function." 
+ print_status "Buttercup uses AI models for vulnerability discovery and patching." + echo + print_status "Choose your AI provider configuration:" + echo " 1. OpenAI only (GPT-4 for patch generation)" + echo " 2. Anthropic only (Claude for seed generation)" + echo " 3. Both providers (recommended for best results)" + echo + + read -p "Enter your choice (1-3): " llm_choice + + case $llm_choice in + 1) + configure_openai_key + if [ $? -eq 0 ]; then + print_success "OpenAI configured successfully" + break + fi + ;; + 2) + configure_anthropic_key + if [ $? -eq 0 ]; then + print_success "Anthropic configured successfully" + break + fi + ;; + 3) + local openai_result=1 + local anthropic_result=1 + + configure_openai_key + openai_result=$? + + configure_anthropic_key + anthropic_result=$? + + if [ $openai_result -eq 0 ] || [ $anthropic_result -eq 0 ]; then + print_success "At least one AI provider configured successfully" + break + else + print_warning "No API keys were configured. Please try again." + fi + ;; + *) + print_error "Invalid choice. Please enter 1, 2, or 3." + print_status "Note: At least one API key is required." + ;; + esac + done + + # Configure optional services + configure_optional_services + + print_success "API key configuration completed" +} + +# Helper function to configure OpenAI API key +configure_openai_key() { print_linebreak - print_status "OpenAI API Key (Optional): Powers AI-driven vulnerability analysis and patch generation." - print_status "The patcher component performs best with OpenAI models (GPT-4o/GPT-4o-mini)." 
- print_status "Generate your API key at: https://platform.openai.com/settings/organization/api-keys" + print_status "Configuring OpenAI API Key" + print_status "Used for: AI-driven vulnerability analysis and patch generation" + print_status "Best models: GPT-4o/GPT-4o-mini" + print_status "Get your key at: https://platform.openai.com/settings/organization/api-keys" + configure_service "OPENAI_API_KEY" "OpenAI API key" "$OPENAI_API_KEY" "" false - # Anthropic API Key (Optional) + # Verify the key was actually set + if [ -f "deployment/env" ]; then + source deployment/env + fi + + if [ -n "$OPENAI_API_KEY" ] && [ "$OPENAI_API_KEY" != "" ]; then + return 0 + else + print_warning "OpenAI API key was not configured" + return 1 + fi +} + +# Helper function to configure Anthropic API key +configure_anthropic_key() { print_linebreak - print_status "Anthropic API Key (Optional): Powers AI-driven fuzzing seed generation." - print_status "The seed generation component performs best with Anthropic models (Claude 3.5/4 Sonnet)." - print_status "Generate your API key at: https://console.anthropic.com/settings/keys" + print_status "Configuring Anthropic API Key" + print_status "Used for: AI-driven fuzzing seed generation" + print_status "Best models: Claude 3.5/Claude 4 Sonnet" + print_status "Get your key at: https://console.anthropic.com/settings/keys" + configure_service "ANTHROPIC_API_KEY" "Anthropic API key" "$ANTHROPIC_API_KEY" "" false - # Anthropic API Key (Optional) + # Verify the key was actually set + if [ -f "deployment/env" ]; then + source deployment/env + fi + + if [ -n "$ANTHROPIC_API_KEY" ] && [ "$ANTHROPIC_API_KEY" != "" ]; then + return 0 + else + print_warning "Anthropic API key was not configured" + return 1 + fi +} + +# Helper function to configure optional services +configure_optional_services() { + # Gemini API Key (Optional) print_linebreak print_status "Google Gemini API Key (Optional): Fallback model." 
print_status "Use this model as a fallback if other models are not configured or not available." print_status "Generate your API key at: https://aistudio.google.com/apikey" configure_service "GEMINI_API_KEY" "Gemini API key" "$GEMINI_API_KEY" "" false - # GitHub Personal Access Token (Optional) print_linebreak print_status "GitHub Personal Access Token (Optional): Access to private GitHub resources."