diff --git a/.github/workflows/deploy-ai.yml b/.github/workflows/deploy-ai.yml index e4d0755d..6a988ed4 100644 --- a/.github/workflows/deploy-ai.yml +++ b/.github/workflows/deploy-ai.yml @@ -11,6 +11,10 @@ on: types: - closed +permissions: + id-token: write + contents: read + env: AWS_REGION: ap-northeast-2 SERVICE_NAME: machine @@ -28,8 +32,7 @@ jobs: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + role-to-assume: ${{ secrets.AWS_GITHUB_ACTIONS_ROLE_ARN }} aws-region: ${{ env.AWS_REGION }} - name: Login to ECR @@ -73,8 +76,7 @@ jobs: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + role-to-assume: ${{ secrets.AWS_GITHUB_ACTIONS_ROLE_ARN }} aws-region: ${{ env.AWS_REGION }} - name: Deploy AI service to EC2 via SSM @@ -82,7 +84,8 @@ jobs: ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ env.AWS_REGION }}.amazonaws.com IMAGE_TAG_INPUT: ${{ github.event.inputs.image_tag }} PROJECT_NAME: oplust - SSM_MACHINE_ENV_PARAM: /oplust/machine/env + ENV_NAME: dev + SSM_MACHINE_ENV_PARAM: /oplust/dev/machine/env SSM_MACHINE_NODE_EXPORTER_TARGET_PARAM: /oplust/monitoring/targets/machine-node-exporter run: | set -euo pipefail @@ -96,12 +99,12 @@ jobs: MONITORING_PRIVATE_IP=$(aws ec2 describe-instances \ --region "$AWS_REGION" \ - --filters "Name=tag:Name,Values=${PROJECT_NAME}-monitoring-ec2" "Name=instance-state-name,Values=running" \ + --filters "Name=tag:Name,Values=monitoring" "Name=tag:Project,Values=${PROJECT_NAME}" "Name=tag:Env,Values=${ENV_NAME}" "Name=instance-state-name,Values=running" \ --query "Reservations[0].Instances[0].PrivateIpAddress" \ --output text) if [ -z "$MONITORING_PRIVATE_IP" ] || [ "$MONITORING_PRIVATE_IP" = "None" ]; then - echo "No running monitoring instance found for tag: ${PROJECT_NAME}-monitoring-ec2" >&2 + echo "No running monitoring instance found for Name=monitoring, Project=${PROJECT_NAME}, Env=${ENV_NAME}" >&2 exit 1 fi @@ -116,6 +119,26 @@ jobs: exit 1 fi + wait_for_ssm_online() { + local iid="$1" + for _ in $(seq 1 30); do + ping_status=$(aws ssm describe-instance-information \ + --region "$AWS_REGION" \ + --filters "Key=InstanceIds,Values=${iid}" \ + --query 'InstanceInformationList[0].PingStatus' \ + --output text 2>/dev/null || true) + if [ "$ping_status" = "Online" ]; then + return 0 + fi + sleep 5 + done + return 1 + } + + if ! wait_for_ssm_online "$INSTANCE_ID"; then + echo "Instance is running but not SSM Online: $INSTANCE_ID" >&2 + exit 1 + fi MACHINE_PRIVATE_IP=$(aws ec2 describe-instances \ --region "$AWS_REGION" \ --instance-ids "$INSTANCE_ID" \ @@ -124,7 +147,7 @@ jobs: MONITORING_SG_ID=$(aws ec2 describe-instances \ --region "$AWS_REGION" \ - --filters "Name=tag:Name,Values=${PROJECT_NAME}-monitoring-ec2" "Name=instance-state-name,Values=running" \ + --filters "Name=tag:Name,Values=monitoring" "Name=tag:Project,Values=${PROJECT_NAME}" "Name=tag:Env,Values=${ENV_NAME}" "Name=instance-state-name,Values=running" \ --query "Reservations[0].Instances[0].SecurityGroups[0].GroupId" \ --output text) diff --git a/.github/workflows/deploy-ec2-docker.yml b/.github/workflows/deploy-ec2-docker.yml index 160c12ae..0c14fc86 100644 --- a/.github/workflows/deploy-ec2-docker.yml +++ b/.github/workflows/deploy-ec2-docker.yml @@ -1,4 +1,4 @@ -name: Deploy Docker Apps To EC2 +name: Deploy Docker Apps To EC2 on: workflow_dispatch: @@ -11,6 +11,10 @@ on: types: - closed +permissions: + id-token: write + contents: read + env: AWS_REGION: ap-northeast-2 @@ -33,13 +37,32 @@ jobs: - name: Checkout uses: actions/checkout@v4 + - name: Validate OIDC inputs + env: + ROLE_ARN: ${{ secrets.AWS_GITHUB_ACTIONS_ROLE_ARN }} + run: | + set -euo pipefail + if [ -z "${ROLE_ARN:-}" ]; then + echo "AWS_GITHUB_ACTIONS_ROLE_ARN is empty or not injected." >&2 + exit 1 + fi + if ! echo "$ROLE_ARN" | grep -Eq '^arn:aws:iam::[0-9]{12}:role/.+'; then + echo "AWS_GITHUB_ACTIONS_ROLE_ARN format is invalid: expected role ARN." >&2 + exit 1 + fi + echo "GitHub ref: ${GITHUB_REF}" + echo "GitHub ref_name: ${GITHUB_REF_NAME}" + echo "Role ARN format check passed." + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + role-to-assume: ${{ secrets.AWS_GITHUB_ACTIONS_ROLE_ARN }} aws-region: ${{ env.AWS_REGION }} + - name: Verify assumed identity + run: aws sts get-caller-identity + - name: Login to ECR uses: aws-actions/amazon-ecr-login@v2 @@ -55,6 +78,7 @@ jobs: run: | IMAGE_TAG="${IMAGE_TAG_INPUT:-${GITHUB_SHA}}" IMAGE_URI="${ECR_REGISTRY}/${{ matrix.ecr_repo }}:${IMAGE_TAG}" + IMAGE_URI_LATEST="${ECR_REGISTRY}/${{ matrix.ecr_repo }}:latest" docker build \ @@ -72,44 +96,92 @@ jobs: needs: build-and-push steps: + - name: Validate OIDC inputs + env: + ROLE_ARN: ${{ secrets.AWS_GITHUB_ACTIONS_ROLE_ARN }} + run: | + set -euo pipefail + if [ -z "${ROLE_ARN:-}" ]; then + echo "AWS_GITHUB_ACTIONS_ROLE_ARN is empty or not injected." >&2 + exit 1 + fi + if ! echo "$ROLE_ARN" | grep -Eq '^arn:aws:iam::[0-9]{12}:role/.+'; then + echo "AWS_GITHUB_ACTIONS_ROLE_ARN format is invalid: expected role ARN." >&2 + exit 1 + fi + echo "GitHub ref: ${GITHUB_REF}" + echo "GitHub ref_name: ${GITHUB_REF_NAME}" + echo "Role ARN format check passed." + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + role-to-assume: ${{ secrets.AWS_GITHUB_ACTIONS_ROLE_ARN }} aws-region: ${{ env.AWS_REGION }} + - name: Verify assumed identity + run: aws sts get-caller-identity + - name: Deploy to EC2 instances via SSM env: ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ env.AWS_REGION }}.amazonaws.com IMAGE_TAG_INPUT: ${{ github.event.inputs.image_tag }} PROJECT_NAME: oplust DB_NAME: oplust - SSM_RDS_ENDPOINT_PARAM: /oplust/common/rds-endpoint - SSM_DB_USERNAME_PARAM: /oplust/common/db-username - SSM_DB_PASSWORD_PARAM: /oplust/common/db-password + RDS_INSTANCE_IDENTIFIER: oplust-dev-db + MONITORING_INSTANCE_TAG: monitoring NODE_EXPORTER_TARGET_SSM_PARAM: /oplust/monitoring/targets/node-exporter - ECS_CLUSTER_NAME: fluffy-flamingo-5ag1uq - ECS_SERVICE_NAME: oplust-transcoder-service + SSM_ECS_CLUSTER_NAME_PARAM: /oplust/dev/lambda/worker/ecs_cluster_name + SSM_ECS_SERVICE_NAME_PARAM: /oplust/dev/lambda/worker/ecs_service_name TRANSCODER_REPOSITORY: oplust-transcoder TRANSCODER_CONTAINER_NAME: oplust-transcoder - SCALER_LAMBDA_NAME: oplust-transcoder-scaler + ENV_NAME: dev run: | set -euo pipefail IMAGE_TAG="${IMAGE_TAG_INPUT:-${GITHUB_SHA}}" + ECS_CLUSTER_NAME=$(aws ssm get-parameter --region "$AWS_REGION" --name "$SSM_ECS_CLUSTER_NAME_PARAM" --with-decryption --query 'Parameter.Value' --output text) + ECS_SERVICE_NAME=$(aws ssm get-parameter --region "$AWS_REGION" --name "$SSM_ECS_SERVICE_NAME_PARAM" --with-decryption --query 'Parameter.Value' --output text) + SCALER_LAMBDA_NAME="${PROJECT_NAME}-${ENV_NAME}-worker" + + if [ -z "$ECS_CLUSTER_NAME" ] || [ "$ECS_CLUSTER_NAME" = "None" ] || [ -z "$ECS_SERVICE_NAME" ] || [ "$ECS_SERVICE_NAME" = "None" ]; then + echo "Failed to resolve ECS cluster/service from SSM parameters" >&2 + exit 1 + fi + + DB_HOST=$(aws rds describe-db-instances --region "$AWS_REGION" --db-instance-identifier "$RDS_INSTANCE_IDENTIFIER" --query 'DBInstances[0].Endpoint.Address' --output text) + if [ -z "$DB_HOST" ] || [ "$DB_HOST" = "None" ]; then + echo "Failed to resolve DB host from RDS" >&2 + exit 1 + fi + MONITORING_PRIVATE_IP=$(aws ec2 describe-instances \ --region "$AWS_REGION" \ - --filters "Name=tag:Name,Values=${PROJECT_NAME}-monitoring-ec2" "Name=instance-state-name,Values=running" \ + --filters "Name=tag:Name,Values=${MONITORING_INSTANCE_TAG}" "Name=instance-state-name,Values=running" \ --query "Reservations[0].Instances[0].PrivateIpAddress" \ --output text) if [ -z "$MONITORING_PRIVATE_IP" ] || [ "$MONITORING_PRIVATE_IP" = "None" ]; then - echo "No running monitoring instance found for tag: ${PROJECT_NAME}-monitoring-ec2" >&2 + echo "No running monitoring instance found for tag: ${MONITORING_INSTANCE_TAG}" >&2 exit 1 fi + wait_for_ssm_online() { + local iid="$1" + for _ in $(seq 1 30); do + ping_status=$(aws ssm describe-instance-information \ + --region "$AWS_REGION" \ + --filters "Key=InstanceIds,Values=${iid}" \ + --query 'InstanceInformationList[0].PingStatus' \ + --output text 2>/dev/null || true) + if [ "$ping_status" = "Online" ]; then + return 0 + fi + sleep 5 + done + return 1 + } deploy_service() { local target_tag="$1" local image_uri="$2" @@ -130,6 +202,26 @@ jobs: echo "No running instance found for tag: ${target_tag}" >&2 exit 1 fi + local instance_state + instance_state=$(aws ec2 describe-instances \ + --region "$AWS_REGION" \ + --instance-ids "$instance_id" \ + --query "Reservations[0].Instances[0].State.Name" \ + --output text) + + if [ "$instance_state" != "running" ]; then + echo "Instance is not running for tag ${target_tag}: ${instance_id} (${instance_state})" >&2 + exit 1 + fi + + if ! wait_for_ssm_online "$instance_id"; then + echo "Instance is not SSM online for tag ${target_tag}: ${instance_id}" >&2 + aws ssm describe-instance-information \ + --region "$AWS_REGION" \ + --filters "Key=InstanceIds,Values=${instance_id}" \ + --output table || true + exit 1 + fi local private_ip private_ip=$(aws ec2 describe-instances \ @@ -187,11 +279,14 @@ jobs: commands=( "set -e" "sudo mkdir -p /etc/oplust" - "DB_HOST=\$(aws ssm get-parameter --region $AWS_REGION --name '$SSM_RDS_ENDPOINT_PARAM' --with-decryption --query 'Parameter.Value' --output text)" - "DB_USER=\$(aws ssm get-parameter --region $AWS_REGION --name '$SSM_DB_USERNAME_PARAM' --with-decryption --query 'Parameter.Value' --output text)" - "DB_PASS=\$(aws ssm get-parameter --region $AWS_REGION --name '$SSM_DB_PASSWORD_PARAM' --with-decryption --query 'Parameter.Value' --output text)" "SERVICE_ENV=\$(aws ssm get-parameter --region $AWS_REGION --name '${service_env_param}' --with-decryption --query 'Parameter.Value' --output text)" - "echo \"SPRING_DATASOURCE_URL=jdbc:mysql://\$DB_HOST:3306/${DB_NAME}\" | sudo tee ${env_file} >/dev/null" + "if ! command -v jq >/dev/null 2>&1; then if command -v dnf >/dev/null 2>&1; then sudo dnf -y install jq; elif command -v yum >/dev/null 2>&1; then sudo yum -y install jq; elif command -v apt-get >/dev/null 2>&1; then sudo apt-get update -y && sudo apt-get install -y jq; else echo 'jq not found and no supported package manager to install it' >&2; exit 1; fi; fi" + "DB_SECRET_ARN=\$(aws rds describe-db-instances --region $AWS_REGION --db-instance-identifier '$RDS_INSTANCE_IDENTIFIER' --query 'DBInstances[0].MasterUserSecret.SecretArn' --output text)" + "DB_SECRET_JSON=\$(aws secretsmanager get-secret-value --region $AWS_REGION --secret-id \"\$DB_SECRET_ARN\" --query 'SecretString' --output text)" + "DB_USER=\$(echo \"\$DB_SECRET_JSON\" | jq -r '.username')" + "DB_PASS=\$(echo \"\$DB_SECRET_JSON\" | jq -r '.password')" + "if [ -z \"\$DB_USER\" ] || [ \"\$DB_USER\" = \"null\" ] || [ -z \"\$DB_PASS\" ] || [ \"\$DB_PASS\" = \"null\" ]; then echo 'Failed to resolve DB credentials from Secrets Manager' >&2; exit 1; fi" + "echo \"SPRING_DATASOURCE_URL=jdbc:mysql://${DB_HOST}:3306/${DB_NAME}\" | sudo tee ${env_file} >/dev/null" "echo \"SPRING_DATASOURCE_USERNAME=\$DB_USER\" | sudo tee -a ${env_file} >/dev/null" "echo \"SPRING_DATASOURCE_PASSWORD=\$DB_PASS\" | sudo tee -a ${env_file} >/dev/null" "printf '%s\n' \"\$SERVICE_ENV\" | sudo tee -a ${env_file} >/dev/null" @@ -469,6 +564,11 @@ jobs: rm -f "$td_file" "$new_td_file" } - deploy_service "${PROJECT_NAME}-user-ec2" "${ECR_REGISTRY}/oplust-api-user:${IMAGE_TAG}" "oplust-api-user" "/etc/oplust/api-user.env" "8080" "/oplust/api-user/env" "user-api" - deploy_service "${PROJECT_NAME}-admin-ec2" "${ECR_REGISTRY}/oplust-api-admin:${IMAGE_TAG}" "oplust-api-admin" "/etc/oplust/api-admin.env" "8081" "/oplust/api-admin/env" "admin-api" + deploy_service "user" "${ECR_REGISTRY}/oplust-api-user:${IMAGE_TAG}" "oplust-api-user" "/etc/oplust/api-user.env" "8080" "/oplust/${ENV_NAME}/api-user/env" "user-api" + deploy_service "admin" "${ECR_REGISTRY}/oplust-api-admin:${IMAGE_TAG}" "oplust-api-admin" "/etc/oplust/api-admin.env" "8081" "/oplust/${ENV_NAME}/api-admin/env" "admin-api" deploy_transcoder_ecs + + + + + diff --git a/.github/workflows/deploy-monitoring.yml b/.github/workflows/deploy-monitoring.yml index 20d00c72..9103d021 100644 --- a/.github/workflows/deploy-monitoring.yml +++ b/.github/workflows/deploy-monitoring.yml @@ -6,43 +6,28 @@ on: monitoring_instance_tag: description: "EC2 Name tag for monitoring server" required: true - default: "oplust-monitoring-ec2" + default: "monitoring" type: string - user_api_target_ssm_param: - description: "SSM parameter name for user-api target (host:port)" + project_name: + description: "Project name" required: true - default: "/oplust/monitoring/targets/user-api" + default: "oplust" type: string - admin_api_target_ssm_param: - description: "SSM parameter name for admin-api target (host:port)" + environment: + description: "Environment name" required: true - default: "/oplust/monitoring/targets/admin-api" + default: "dev" type: string - transcoder_target_ssm_param: - description: "SSM parameter name for transcoder target (host:port)" + grafana_admin_secret_id: + description: "Secrets Manager secret id/arn for Grafana admin credentials" required: true - default: "/oplust/monitoring/targets/transcoder" - type: string - node_exporter_target_ssm_param: - description: "SSM parameter name for node-exporter target (host:port)" - required: true - default: "/oplust/monitoring/targets/node-exporter" - type: string - machine_node_exporter_target_ssm_param: - description: "SSM parameter name for machine node-exporter target (host:port)" - required: true - default: "/oplust/monitoring/targets/machine-node-exporter" - type: string - grafana_password_ssm_param: - description: "SSM SecureString parameter name for Grafana admin password" - required: true - default: "/oplust/monitoring/grafana-admin-password" - type: string - grafana_admin_password: - description: "Optional override password (leave blank to use SSM)" - required: false + default: "oplust/dev/monitoring/grafana-admin-credentials" type: string +permissions: + id-token: write + contents: read + env: AWS_REGION: ap-northeast-2 MONITORING_ROOT: /opt/oplust-monitoring @@ -58,82 +43,66 @@ jobs: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + role-to-assume: ${{ secrets.AWS_GITHUB_ACTIONS_ROLE_ARN }} aws-region: ${{ env.AWS_REGION }} - - name: Resolve scrape targets from SSM + - name: Resolve targets from Terraform naming convention env: - USER_API_TARGET_SSM_PARAM: ${{ github.event.inputs.user_api_target_ssm_param }} - ADMIN_API_TARGET_SSM_PARAM: ${{ github.event.inputs.admin_api_target_ssm_param }} - TRANSCODER_TARGET_SSM_PARAM: ${{ github.event.inputs.transcoder_target_ssm_param }} - NODE_EXPORTER_TARGET_SSM_PARAM: ${{ github.event.inputs.node_exporter_target_ssm_param }} - MACHINE_NODE_EXPORTER_TARGET_SSM_PARAM: ${{ github.event.inputs.machine_node_exporter_target_ssm_param }} + PROJECT_NAME: ${{ github.event.inputs.project_name }} + ENV_NAME: ${{ github.event.inputs.environment }} + INSTANCE_TAG: ${{ github.event.inputs.monitoring_instance_tag }} run: | set -euo pipefail - USER_API_TARGET=$(aws ssm get-parameter \ - --region "$AWS_REGION" \ - --name "$USER_API_TARGET_SSM_PARAM" \ - --with-decryption \ - --query 'Parameter.Value' \ - --output text) + get_private_ip_by_name() { + local name_tag="$1" + aws ec2 describe-instances \ + --region "$AWS_REGION" \ + --filters "Name=tag:Name,Values=${name_tag}" "Name=instance-state-name,Values=running" \ + --query "Reservations[0].Instances[0].PrivateIpAddress" \ + --output text + } - ADMIN_API_TARGET=$(aws ssm get-parameter \ - --region "$AWS_REGION" \ - --name "$ADMIN_API_TARGET_SSM_PARAM" \ - --with-decryption \ - --query 'Parameter.Value' \ - --output text) + USER_API_IP=$(get_private_ip_by_name "user") + ADMIN_API_IP=$(get_private_ip_by_name "admin") + MONITORING_IP=$(get_private_ip_by_name "${INSTANCE_TAG}") - TRANSCODER_TARGET=$(aws ssm get-parameter \ - --region "$AWS_REGION" \ - --name "$TRANSCODER_TARGET_SSM_PARAM" \ - --with-decryption \ - --query 'Parameter.Value' \ - --output text) + if [ -z "$USER_API_IP" ] || [ "$USER_API_IP" = "None" ] || [ -z "$ADMIN_API_IP" ] || [ "$ADMIN_API_IP" = "None" ] || [ -z "$MONITORING_IP" ] || [ "$MONITORING_IP" = "None" ]; then + echo "Failed to resolve required EC2 private IPs (user/admin/monitoring)." >&2 + exit 1 + fi - NODE_EXPORTER_TARGET=$(aws ssm get-parameter \ - --region "$AWS_REGION" \ - --name "$NODE_EXPORTER_TARGET_SSM_PARAM" \ - --with-decryption \ - --query 'Parameter.Value' \ - --output text) + USER_API_TARGET="${USER_API_IP}:8080" + ADMIN_API_TARGET="${ADMIN_API_IP}:8081" + MACHINE_NODE_EXPORTER_TARGET="${MONITORING_IP}:9100" - MACHINE_NODE_EXPORTER_TARGET=$(aws ssm get-parameter \ - --region "$AWS_REGION" \ - --name "$MACHINE_NODE_EXPORTER_TARGET_SSM_PARAM" \ - --with-decryption \ - --query 'Parameter.Value' \ - --output text) + NODE_EXPORTER_TARGETS_YAML="\"${USER_API_IP}:9100\",\"${ADMIN_API_IP}:9100\",\"${MONITORING_IP}:9100\"" + + ECS_CLUSTER_PARAM="/${PROJECT_NAME}/${ENV_NAME}/lambda/worker/ecs_cluster_name" + ECS_SERVICE_PARAM="/${PROJECT_NAME}/${ENV_NAME}/lambda/worker/ecs_service_name" - if [ -z "$USER_API_TARGET" ] || [ "$USER_API_TARGET" = "None" ] || \ - [ -z "$ADMIN_API_TARGET" ] || [ "$ADMIN_API_TARGET" = "None" ] || \ - [ -z "$TRANSCODER_TARGET" ] || [ "$TRANSCODER_TARGET" = "None" ] || \ - [ -z "$NODE_EXPORTER_TARGET" ] || [ "$NODE_EXPORTER_TARGET" = "None" ] || \ - [ -z "$MACHINE_NODE_EXPORTER_TARGET" ] || [ "$MACHINE_NODE_EXPORTER_TARGET" = "None" ]; then - echo "One or more scrape targets are empty. Check SSM parameter values." >&2 + ECS_CLUSTER_NAME=$(aws ssm get-parameter --region "$AWS_REGION" --name "$ECS_CLUSTER_PARAM" --with-decryption --query 'Parameter.Value' --output text) + ECS_SERVICE_NAME=$(aws ssm get-parameter --region "$AWS_REGION" --name "$ECS_SERVICE_PARAM" --with-decryption --query 'Parameter.Value' --output text) + + if [ -z "$ECS_CLUSTER_NAME" ] || [ "$ECS_CLUSTER_NAME" = "None" ] || [ -z "$ECS_SERVICE_NAME" ] || [ "$ECS_SERVICE_NAME" = "None" ]; then + echo "Failed to resolve ECS cluster/service from SSM." >&2 exit 1 fi - normalize_targets() { - # Accept comma/newline/space separated values and render a YAML inline list payload: - # "host1:port","host2:port" - printf '%s' "$1" \ - | tr ',\r\n\t' ' ' \ - | xargs -n1 \ - | sed '/^$/d' \ - | sort -u \ - | awk '{printf "\"%s\",", $0}' \ - | sed 's/,$//' - } + TASK_ARNS=$(aws ecs list-tasks --region "$AWS_REGION" --cluster "$ECS_CLUSTER_NAME" --service-name "$ECS_SERVICE_NAME" --desired-status RUNNING --query 'taskArns' --output text | tr '\t' ' ' || true) + TRANSCODER_TARGETS_YAML="" - TRANSCODER_TARGETS_YAML=$(normalize_targets "$TRANSCODER_TARGET") - NODE_EXPORTER_TARGETS_YAML=$(normalize_targets "$NODE_EXPORTER_TARGET") + if [ -n "${TASK_ARNS:-}" ]; then + ENI_IDS=$(aws ecs describe-tasks --region "$AWS_REGION" --cluster "$ECS_CLUSTER_NAME" --tasks $TASK_ARNS \ + --query 'tasks[].attachments[].details[?name==`networkInterfaceId`].value' --output text | tr '\t' ' ' || true) - if [ -z "$TRANSCODER_TARGETS_YAML" ] || [ -z "$NODE_EXPORTER_TARGETS_YAML" ]; then - echo "Rendered target list is empty. Check SSM parameter values." >&2 - exit 1 + if [ -n "${ENI_IDS:-}" ]; then + TRANSCODER_IPS=$(aws ec2 describe-network-interfaces --region "$AWS_REGION" --network-interface-ids $ENI_IDS \ + --query 'NetworkInterfaces[].PrivateIpAddress' --output text || true) + if [ -n "${TRANSCODER_IPS:-}" ]; then + TRANSCODER_TARGETS_YAML=$(printf '%s\n' $TRANSCODER_IPS | sed '/^$/d' | sort -u | awk '{printf "\"%s:8082\",", $0}' | sed 's/,$//') + fi + fi fi echo "USER_API_TARGET=$USER_API_TARGET" >> "$GITHUB_ENV" @@ -154,26 +123,22 @@ jobs: -e "s|__MACHINE_NODE_EXPORTER_TARGET__|${MACHINE_NODE_EXPORTER_TARGET}|g" \ apps/monitoring/prometheus/prometheus.prod.yml.tpl > apps/monitoring/prometheus/prometheus.prod.yml - - name: Resolve Grafana admin password + - name: Resolve Grafana admin password from Secrets Manager env: - GRAFANA_ADMIN_PASSWORD_INPUT: ${{ github.event.inputs.grafana_admin_password }} - GRAFANA_PASSWORD_SSM_PARAM: ${{ github.event.inputs.grafana_password_ssm_param }} + GRAFANA_ADMIN_SECRET_ID: ${{ github.event.inputs.grafana_admin_secret_id }} run: | set -euo pipefail - if [ -n "$GRAFANA_ADMIN_PASSWORD_INPUT" ]; then - GRAFANA_PASSWORD="$GRAFANA_ADMIN_PASSWORD_INPUT" - else - GRAFANA_PASSWORD=$(aws ssm get-parameter \ - --region "$AWS_REGION" \ - --name "$GRAFANA_PASSWORD_SSM_PARAM" \ - --with-decryption \ - --query 'Parameter.Value' \ - --output text) - fi + SECRET_JSON=$(aws secretsmanager get-secret-value \ + --region "$AWS_REGION" \ + --secret-id "$GRAFANA_ADMIN_SECRET_ID" \ + --query 'SecretString' \ + --output text) + + GRAFANA_PASSWORD=$(echo "$SECRET_JSON" | jq -r '.password') - if [ -z "$GRAFANA_PASSWORD" ] || [ "$GRAFANA_PASSWORD" = "None" ]; then - echo "Grafana admin password is empty. Check input or SSM parameter." >&2 + if [ -z "$GRAFANA_PASSWORD" ] || [ "$GRAFANA_PASSWORD" = "null" ]; then + echo "Grafana admin password is empty in secret: $GRAFANA_ADMIN_SECRET_ID" >&2 exit 1 fi @@ -196,6 +161,26 @@ jobs: exit 1 fi + wait_for_ssm_online() { + local iid="$1" + for _ in $(seq 1 30); do + ping_status=$(aws ssm describe-instance-information \ + --region "$AWS_REGION" \ + --filters "Key=InstanceIds,Values=${iid}" \ + --query 'InstanceInformationList[0].PingStatus' \ + --output text 2>/dev/null || true) + if [ "$ping_status" = "Online" ]; then + return 0 + fi + sleep 5 + done + return 1 + } + + if ! wait_for_ssm_online "$INSTANCE_ID"; then + echo "Instance is running but not SSM Online: $INSTANCE_ID" >&2 + exit 1 + fi COMPOSE_B64=$(base64 -w 0 apps/monitoring/docker-compose.yml) COMPOSE_PROD_B64=$(base64 -w 0 apps/monitoring/docker-compose.prod.yml) PROM_PROD_B64=$(base64 -w 0 apps/monitoring/prometheus/prometheus.prod.yml) @@ -265,5 +250,3 @@ jobs: echo "Monitoring deployment timed out" >&2 exit 1 - - diff --git a/.github/workflows/deploy-rabbitmq.yml b/.github/workflows/deploy-rabbitmq.yml index f7bf7c47..552db05d 100644 --- a/.github/workflows/deploy-rabbitmq.yml +++ b/.github/workflows/deploy-rabbitmq.yml @@ -6,34 +6,30 @@ on: rabbitmq_instance_tag: description: "EC2 Name tag for RabbitMQ server" required: true - default: "oplust-rabbitmq-ec2" + default: "rabbitmq" type: string - rabbitmq_image_tag: - description: "RabbitMQ image tag" + rabbitmq_admin_secret_id: + description: "Secrets Manager secret id/arn for RabbitMQ admin credentials" required: true - default: "3.13-management" + default: "oplust/dev/rabbitmq/admin-credentials" type: string - rabbitmq_user_ssm_param: - description: "SSM SecureString parameter for RabbitMQ default user" + rabbitmq_app_secret_id: + description: "Secrets Manager secret id/arn for RabbitMQ app credentials" required: true - default: "/oplust/rabbitmq/default-user" + default: "oplust/dev/rabbitmq/credentials" type: string - rabbitmq_password_ssm_param: - description: "SSM SecureString parameter for RabbitMQ default password" + rabbitmq_vhost: + description: "RabbitMQ vhost" required: true - default: "/oplust/rabbitmq/default-password" - type: string - rabbitmq_vhost_ssm_param: - description: "SSM parameter for RabbitMQ default vhost" - required: true - default: "/oplust/rabbitmq/default-vhost" + default: "/" type: string +permissions: + id-token: write + contents: read + env: AWS_REGION: ap-northeast-2 - RABBITMQ_ROOT: /opt/oplust-rabbitmq - RABBITMQ_CONTAINER_NAME: oplust-rabbitmq - RABBITMQ_DATA_VOLUME: oplust-rabbitmq-data jobs: deploy-rabbitmq: @@ -43,20 +39,23 @@ jobs: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + role-to-assume: ${{ secrets.AWS_GITHUB_ACTIONS_ROLE_ARN }} aws-region: ${{ env.AWS_REGION }} - - name: Deploy RabbitMQ via SSM + - name: Sync RabbitMQ users from Secrets Manager via SSM env: INSTANCE_TAG: ${{ github.event.inputs.rabbitmq_instance_tag }} - RABBITMQ_IMAGE_TAG: ${{ github.event.inputs.rabbitmq_image_tag }} - RABBITMQ_USER_SSM_PARAM: ${{ github.event.inputs.rabbitmq_user_ssm_param }} - RABBITMQ_PASSWORD_SSM_PARAM: ${{ github.event.inputs.rabbitmq_password_ssm_param }} - RABBITMQ_VHOST_SSM_PARAM: ${{ github.event.inputs.rabbitmq_vhost_ssm_param }} + RABBITMQ_ADMIN_SECRET_ID: ${{ github.event.inputs.rabbitmq_admin_secret_id }} + RABBITMQ_APP_SECRET_ID: ${{ github.event.inputs.rabbitmq_app_secret_id }} + RABBITMQ_VHOST: ${{ github.event.inputs.rabbitmq_vhost }} run: | set -euo pipefail + if [[ "$RABBITMQ_VHOST" == *"'"* ]] || [[ "$RABBITMQ_VHOST" == *$'\n'* ]] || [[ "$RABBITMQ_VHOST" == *$'\r'* ]]; then + echo "Invalid rabbitmq_vhost input. Single quote and newlines are not allowed." >&2 + exit 1 + fi + INSTANCE_ID=$(aws ec2 describe-instances \ --region "$AWS_REGION" \ --filters "Name=tag:Name,Values=${INSTANCE_TAG}" "Name=instance-state-name,Values=running" \ @@ -68,27 +67,54 @@ jobs: exit 1 fi + wait_for_ssm_online() { + local iid="$1" + for _ in $(seq 1 30); do + ping_status=$(aws ssm describe-instance-information \ + --region "$AWS_REGION" \ + --filters "Key=InstanceIds,Values=${iid}" \ + --query 'InstanceInformationList[0].PingStatus' \ + --output text 2>/dev/null || true) + if [ "$ping_status" = "Online" ]; then + return 0 + fi + sleep 5 + done + return 1 + } + + if ! wait_for_ssm_online "$INSTANCE_ID"; then + echo "Instance is running but not SSM Online: $INSTANCE_ID" >&2 + exit 1 + fi PARAMS=$(jq -nc \ --arg c1 "set -euo pipefail" \ - --arg c2 "sudo mkdir -p ${RABBITMQ_ROOT}" \ - --arg c3 "RABBITMQ_DEFAULT_USER=\$(aws ssm get-parameter --region $AWS_REGION --name '${RABBITMQ_USER_SSM_PARAM}' --with-decryption --query 'Parameter.Value' --output text)" \ - --arg c4 "RABBITMQ_DEFAULT_PASS=\$(aws ssm get-parameter --region $AWS_REGION --name '${RABBITMQ_PASSWORD_SSM_PARAM}' --with-decryption --query 'Parameter.Value' --output text)" \ - --arg c5 "RABBITMQ_DEFAULT_VHOST=\$(aws ssm get-parameter --region $AWS_REGION --name '${RABBITMQ_VHOST_SSM_PARAM}' --with-decryption --query 'Parameter.Value' --output text)" \ - --arg c6 "if [ -z \\\"\$RABBITMQ_DEFAULT_USER\\\" ] || [ \\\"\$RABBITMQ_DEFAULT_USER\\\" = \\\"None\\\" ] || [ -z \\\"\$RABBITMQ_DEFAULT_PASS\\\" ] || [ \\\"\$RABBITMQ_DEFAULT_PASS\\\" = \\\"None\\\" ] || [ -z \\\"\$RABBITMQ_DEFAULT_VHOST\\\" ] || [ \\\"\$RABBITMQ_DEFAULT_VHOST\\\" = \\\"None\\\" ]; then echo 'RabbitMQ env values are empty from SSM' >&2; exit 1; fi" \ - --arg c7 "printf '%s\n' \\\"RABBITMQ_DEFAULT_USER=\$RABBITMQ_DEFAULT_USER\\\" \\\"RABBITMQ_DEFAULT_PASS=\$RABBITMQ_DEFAULT_PASS\\\" \\\"RABBITMQ_DEFAULT_VHOST=\$RABBITMQ_DEFAULT_VHOST\\\" | sudo tee ${RABBITMQ_ROOT}/.env >/dev/null" \ - --arg c8 "sudo chmod 600 ${RABBITMQ_ROOT}/.env" \ - --arg c9 "sudo docker pull rabbitmq:${RABBITMQ_IMAGE_TAG}" \ - --arg c10 "sudo docker rm -f ${RABBITMQ_CONTAINER_NAME} || true" \ - --arg c11 "sudo docker volume create ${RABBITMQ_DATA_VOLUME} >/dev/null" \ - --arg c12 "sudo docker run -d --name ${RABBITMQ_CONTAINER_NAME} --restart unless-stopped -p 5672:5672 -p 15672:15672 --env-file ${RABBITMQ_ROOT}/.env -v ${RABBITMQ_DATA_VOLUME}:/var/lib/rabbitmq rabbitmq:${RABBITMQ_IMAGE_TAG}" \ - --arg c13 "for i in \$(seq 1 30); do if sudo docker exec ${RABBITMQ_CONTAINER_NAME} rabbitmq-diagnostics -q ping >/dev/null 2>&1; then echo 'RabbitMQ is healthy'; exit 0; fi; sleep 2; done; echo 'RabbitMQ health check failed' >&2; exit 1" \ - '{commands:[$c1,$c2,$c3,$c4,$c5,$c6,$c7,$c8,$c9,$c10,$c11,$c12,$c13]}') + --arg c2 "command -v jq >/dev/null 2>&1 || sudo dnf -y install jq" \ + --arg c3 "command -v aws >/dev/null 2>&1 || sudo dnf -y install awscli" \ + --arg c4 "sudo systemctl enable --now rabbitmq-server" \ + --arg c5 "sudo rabbitmq-plugins enable rabbitmq_management" \ + --arg c6 "ADMIN_JSON=\$(aws secretsmanager get-secret-value --region $AWS_REGION --secret-id '${RABBITMQ_ADMIN_SECRET_ID}' --query SecretString --output text)" \ + --arg c7 "APP_JSON=\$(aws secretsmanager get-secret-value --region $AWS_REGION --secret-id '${RABBITMQ_APP_SECRET_ID}' --query SecretString --output text)" \ + --arg c8 "ADMIN_USER=\$(echo \"\$ADMIN_JSON\" | jq -r '.username')" \ + --arg c9 "ADMIN_PASS=\$(echo \"\$ADMIN_JSON\" | jq -r '.password')" \ + --arg c10 "APP_USER=\$(echo \"\$APP_JSON\" | jq -r '.username')" \ + --arg c11 "APP_PASS=\$(echo \"\$APP_JSON\" | jq -r '.password')" \ + --arg c12 "if [ -z \"\$ADMIN_USER\" ] || [ \"\$ADMIN_USER\" = \"null\" ] || [ -z \"\$ADMIN_PASS\" ] || [ \"\$ADMIN_PASS\" = \"null\" ] || [ -z \"\$APP_USER\" ] || [ \"\$APP_USER\" = \"null\" ] || [ -z \"\$APP_PASS\" ] || [ \"\$APP_PASS\" = \"null\" ]; then echo 'Secret payload invalid (username/password missing)' >&2; exit 1; fi" \ + --arg c13 "if sudo rabbitmqctl list_users | awk '{print \$1}' | grep -qx \"\$ADMIN_USER\"; then sudo rabbitmqctl change_password \"\$ADMIN_USER\" \"\$ADMIN_PASS\"; else sudo rabbitmqctl add_user \"\$ADMIN_USER\" \"\$ADMIN_PASS\"; fi" \ + --arg c14 "sudo rabbitmqctl set_user_tags \"\$ADMIN_USER\" administrator" \ + --arg c15 "sudo rabbitmqctl set_permissions -p '${RABBITMQ_VHOST}' \"\$ADMIN_USER\" '.*' '.*' '.*'" \ + --arg c16 "if sudo rabbitmqctl list_users | awk '{print \$1}' | grep -qx \"\$APP_USER\"; then sudo rabbitmqctl change_password \"\$APP_USER\" \"\$APP_PASS\"; else sudo rabbitmqctl add_user \"\$APP_USER\" \"\$APP_PASS\"; fi" \ + --arg c17 "sudo rabbitmqctl set_user_tags \"\$APP_USER\" management" \ + --arg c18 "sudo rabbitmqctl set_permissions -p '${RABBITMQ_VHOST}' \"\$APP_USER\" '.*' '.*' '.*'" \ + --arg c19 "if sudo rabbitmqctl list_users | awk '{print \$1}' | grep -qx guest; then sudo rabbitmqctl delete_user guest || true; fi" \ + --arg c20 "sudo rabbitmq-diagnostics -q ping" \ + '{commands:[$c1,$c2,$c3,$c4,$c5,$c6,$c7,$c8,$c9,$c10,$c11,$c12,$c13,$c14,$c15,$c16,$c17,$c18,$c19,$c20]}') CMD_ID=$(aws ssm send-command \ --region "$AWS_REGION" \ --instance-ids "$INSTANCE_ID" \ --document-name "AWS-RunShellScript" \ - --comment "Deploy RabbitMQ" \ + --comment "Sync RabbitMQ users from Secrets Manager" \ --parameters "$PARAMS" \ --query 'Command.CommandId' \ --output text) @@ -105,11 +131,11 @@ jobs: case "$STATUS" in Success) - echo "RabbitMQ deployment success" + echo "RabbitMQ sync success" exit 0 ;; Failed|Cancelled|TimedOut) - echo "RabbitMQ deployment failed: $STATUS" >&2 + echo "RabbitMQ sync failed: $STATUS" >&2 aws ssm get-command-invocation \ --region "$AWS_REGION" \ --command-id "$CMD_ID" \ @@ -127,5 +153,5 @@ jobs: esac done - echo "RabbitMQ deployment timed out" >&2 + echo "RabbitMQ sync timed out" >&2 exit 1