diff --git a/.gitignore b/.gitignore index 9c1a9f73b8..a4f4cf5f7d 100644 --- a/.gitignore +++ b/.gitignore @@ -30,4 +30,10 @@ test/integration/output.json test/integration/sifchainrelayerdb/* *.log -dist \ No newline at end of file +dist +# Terraform +.terraform/ +*.tfstate +*.tfstate.backup +.terraform.lock.hcl + diff --git a/docs/architecture/gcp-reference-architecture.md b/docs/architecture/gcp-reference-architecture.md new file mode 100644 index 0000000000..b427c42783 --- /dev/null +++ b/docs/architecture/gcp-reference-architecture.md @@ -0,0 +1,202 @@ +# Sifnode GCP Reference Architecture + +**Document:** GCP Architecture for Sifnode Validator Nodes +**Role:** Kael Support Documentation +**Date:** 2026-04-21 +**Status:** Draft (pre-implementation) + +--- + +## Overview + +This document defines the Google Cloud Platform architecture for deploying Sifnode validator nodes with high availability, security, and observability. + +## Architecture Components + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ GCP Project │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ Cloud Load Balancer (Layer 7) │ │ +│ │ - SSL termination │ │ +│ │ - Health checks │ │ +│ └──────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ┌──────────────────────▼──────────────────────────────────┐ │ +│ │ GKE Cluster (Regional) │ │ +│ │ ┌─────────────────────────────────────────────────┐ │ │ +│ │ │ Node Pool: validator-pool │ │ │ +│ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ │ +│ │ │ │ Node 1 │ │ Node 2 │ │ Node 3 │ │ │ │ +│ │ │ │Sifnode │ │Sifnode │ │Sifnode │ │ │ │ +│ │ │ │Pod │ │Pod │ │Pod │ │ │ │ +│ │ │ └─────────┘ └─────────┘ └─────────┘ │ │ │ +│ │ └─────────────────────────────────────────────────┘ │ │ +│ └──────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌──────────────────────▼──────────────────────────────────┐ │ +│ │ Cloud SQL (PostgreSQL) │ │ +│ │ - Chain data persistence │ │ +│ │ - Automated 
backups │ │ +│ │ - Private IP only │ │ +│ └──────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ Cloud Monitoring & Logging │ │ +│ │ - Metrics collection │ │ +│ │ - Alerting policies │ │ +│ │ - Log aggregation │ │ +│ └──────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Module Specifications + +### 1. GKE Module (modules/gke) + +**Purpose:** Container orchestration for Sifnode validator nodes + +**Configuration:** +- **Cluster Type:** Regional (multi-zone for HA) +- **Node Pool:** validator-pool + - Machine: n2-standard-4 (4 vCPU, 16GB RAM) + - Disk: 100GB SSD persistent + - Preemptible: false (validators need stability) + - Autoscaling: 3-5 nodes + - Taints: dedicated=validator:NoSchedule +- **Networking:** Private cluster, VPC-native +- **Security:** Workload Identity, Shielded GKE nodes + +**Terraform Variables:** +```hcl +project_id = string +region = string +cluster_name = string +node_count_min = number (default: 3) +node_count_max = number (default: 5) +machine_type = string (default: "n2-standard-4") +``` + +--- + +### 2. Cloud SQL Module (modules/cloud-sql) + +**Purpose:** Chain data persistence for Sifnode + +**Configuration:** +- **Database:** PostgreSQL 14 +- **Tier:** db-custom-2-4096 (2 vCPU, 4GB RAM) +- **Storage:** 100GB SSD, auto-expand +- **HA:** Regional availability +- **Backup:** Daily automated backups, 7-day retention +- **Access:** Private IP only (no public IP) + +**Terraform Variables:** +```hcl +project_id = string +region = string +instance_name = string +database_version = string (default: "POSTGRES_14") +tier = string (default: "db-custom-2-4096") +``` + +--- + +### 3. 
Load Balancer Module (modules/load-balancer) + +**Purpose:** RPC endpoint distribution and SSL termination + +**Configuration:** +- **Type:** External HTTPS Load Balancer +- **Backend:** GKE service backend +- **SSL:** Managed SSL certificate +- **Health Checks:** /status endpoint +- **CDN:** Disabled (real-time blockchain data) + +**Terraform Variables:** +```hcl +project_id = string +name = string +domain = string # Optional, for SSL +backend_service = string +``` + +--- + +### 4. Monitoring Module (modules/monitoring) + +**Purpose:** Observability for validator infrastructure + +**Configuration:** +- **Metrics:** Node CPU, memory, disk, Sifnode sync status +- **Alerts:** + - Node down > 5 minutes + - Disk usage > 80% + - Sync lag > 10 blocks +- **Dashboards:** Validator health overview + +**Terraform Variables:** +```hcl +project_id = string +notification_email = string +alert_channels = list(string) +``` + +--- + +## Security Best Practices + +1. **Network Security** + - Private GKE cluster (no public control plane) + - VPC peering for Cloud SQL access + - Firewall rules: allow only necessary ports (26656, 26657, 1317) + +2. **IAM** + - Workload Identity for pod-to-GCP service + - Least privilege service accounts + - No default service account usage + +3. **Secrets Management** + - Kubernetes Secrets for node keys + - Cloud KMS for encryption at rest + - No hardcoded credentials + +4. 
**Data Protection** + - Cloud SQL encrypted with customer-managed keys + - Automated backups with point-in-time recovery + - VPC Service Controls for data exfiltration prevention + +--- + +## Cost Estimation (Monthly) + +| Resource | Configuration | Cost | +|----------|--------------|------| +| GKE Cluster | 3x n2-standard-4 | ~$290 | +| Cloud SQL | db-custom-2-4096 | ~$145 | +| Load Balancer | External HTTPS | ~$18 | +| Storage | 100GB SSD x 3 | ~$40 | +| Monitoring | Cloud Monitoring | ~$20 | +| **Total** | | **~$513/month** | + +--- + +## Implementation Checklist + +- [ ] VPC network created with custom subnets +- [ ] GKE cluster provisioned (regional) +- [ ] Cloud SQL instance with private IP +- [ ] Load balancer configured with health checks +- [ ] Monitoring dashboards and alerts created +- [ ] Sifnode Docker image built and pushed to GCR +- [ ] Kubernetes manifests created (StatefulSet, Service, ConfigMap) +- [ ] End-to-end deployment tested +- [ ] Documentation updated +- [ ] PR submitted to Sifnode repo + +--- + +**Prepared by:** Kael (Support Role) +**Review:** William (Lead Implementation) diff --git a/docs/runbooks/deployment-runbook.md b/docs/runbooks/deployment-runbook.md new file mode 100644 index 0000000000..f67e943dda --- /dev/null +++ b/docs/runbooks/deployment-runbook.md @@ -0,0 +1,222 @@ +# Sifnode GCP Deployment Runbook + +**Document:** Step-by-step deployment procedures +**Role:** Kael Support - Operations Documentation +**Date:** 2026-04-21 +**Version:** 1.0 (Draft) + +--- + +## Prerequisites + +- [ ] GCP project created with billing enabled +- [ ] gcloud CLI authenticated: `gcloud auth login` +- [ ] Terraform >= 1.5 installed +- [ ] kubectl installed +- [ ] Docker installed (for image building) +- [ ] Access to Sifnode source code + +--- + +## Phase 1: Environment Setup (15 minutes) + +### 1.1 Configure GCP Project +```bash +export PROJECT_ID="sifnode-gcp-support" +export REGION="us-central1" +export ZONE="us-central1-a" + +gcloud config set 
project $PROJECT_ID +gcloud config set compute/region $REGION +gcloud config set compute/zone $ZONE +``` + +### 1.2 Enable Required APIs +```bash +gcloud services enable container.googleapis.com +gcloud services enable sqladmin.googleapis.com +gcloud services enable compute.googleapis.com +gcloud services enable monitoring.googleapis.com +gcloud services enable logging.googleapis.com +``` + +### 1.3 Create Service Account +```bash +gcloud iam service-accounts create terraform-sa \ + --display-name="Terraform Service Account" + +gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member="serviceAccount:terraform-sa@$PROJECT_ID.iam.gserviceaccount.com" \ + --role="roles/editor" +``` + +--- + +## Phase 2: Terraform Deployment (30 minutes) + +### 2.1 Initialize Terraform +```bash +cd /home/wls/bounty-workspaces/sifnode-gcp/terraform/environments/dev + +terraform init +terraform workspace new dev || terraform workspace select dev +``` + +### 2.2 Configure Variables +Create `terraform.tfvars`: +```hcl +project_id = "sifnode-gcp-support" +region = "us-central1" + +# GKE +gke_cluster_name = "sifnode-dev" +gke_node_count = 3 + +# Cloud SQL +sql_instance_name = "sifnode-db-dev" +sql_database_name = "sifnodedb" + +# Notification +notification_email = "ops@example.com" +``` + +### 2.3 Plan and Apply +```bash +terraform plan -out=tfplan +terraform apply tfplan +``` + +### 2.4 Configure kubectl +```bash +gcloud container clusters get-credentials sifnode-dev \ + --region=us-central1 +``` + +--- + +## Phase 3: Database Setup (10 minutes) + +### 3.1 Create Database User +```bash +export DB_PASSWORD="$(openssl rand -base64 32)" # generated once; reused in step 3.3 +gcloud sql users create sifnode --instance=sifnode-db-dev \ + --password="$DB_PASSWORD" +``` + +### 3.2 Create Database +```bash +gcloud sql databases create sifnodedb \ + --instance=sifnode-db-dev +``` + +### 3.3 Store Secret in Kubernetes +```bash +kubectl create secret generic db-credentials \ + --from-literal=username=sifnode \ + --from-literal=password="$DB_PASSWORD" +``` + 
+--- + +## Phase 4: Sifnode Deployment (20 minutes) + +### 4.1 Build Docker Image +```bash +cd /home/wls/bounty-workspaces/sifnode-gcp + +docker build -t gcr.io/$PROJECT_ID/sifnode:latest . +docker push gcr.io/$PROJECT_ID/sifnode:latest +``` + +### 4.2 Deploy to GKE +```bash +kubectl apply -f k8s/namespace.yaml +kubectl apply -f k8s/configmap.yaml +kubectl apply -f k8s/statefulset.yaml +kubectl apply -f k8s/service.yaml +kubectl apply -f k8s/ingress.yaml +``` + +### 4.3 Verify Deployment +```bash +kubectl get pods -n sifnode -w +kubectl logs -f sifnode-0 -n sifnode +``` + +--- + +## Phase 5: Verification (10 minutes) + +### 5.1 Check Node Status +```bash +kubectl exec -it sifnode-0 -n sifnode -- sifnoded status +``` + +### 5.2 Test RPC Endpoint +```bash +export LB_IP=$(kubectl get svc sifnode-lb -n sifnode -o jsonpath='{.status.loadBalancer.ingress[0].ip}') +curl http://$LB_IP:26657/status | jq +``` + +### 5.3 Verify Metrics +```bash +gcloud monitoring dashboards list +``` + +--- + +## Phase 6: Production Deployment + +Repeat Phases 2-5 with: +- Workspace: `prod` +- Region: `us-central1` (or preferred region) +- Node count: 5 (for HA) +- Cloud SQL HA: Enabled + +--- + +## Rollback Procedures + +### Scenario: Deployment Failure +```bash +# Destroy resources +terraform destroy + +# Verify cleanup +gcloud compute instances list +gcloud container clusters list +gcloud sql instances list +``` + +### Scenario: Application Issue +```bash +# Rollback to previous version +kubectl rollout undo statefulset/sifnode -n sifnode + +# Check rollout status +kubectl rollout status statefulset/sifnode -n sifnode +``` + +--- + +## Post-Deployment Tasks + +- [ ] Document actual resource names +- [ ] Save terraform.tfvars to secure location +- [ ] Configure backup verification +- [ ] Test alert notifications +- [ ] Update monitoring dashboards +- [ ] Schedule security review + +--- + +## Support Contacts + +- **Technical Lead:** William +- **Support/Documentation:** Kael +- 
**Infrastructure:** Sifnode GCP Team + +--- + +**Last Updated:** 2026-04-21 +**Next Review:** Before production deployment diff --git a/docs/testing/validation-checklist.md b/docs/testing/validation-checklist.md new file mode 100644 index 0000000000..ffb1f2c17c --- /dev/null +++ b/docs/testing/validation-checklist.md @@ -0,0 +1,192 @@ +# Sifnode GCP Testing & Validation Checklist + +**Document:** Testing Framework for GCP Implementation +**Role:** Kael Support - Validation & QA +**Date:** 2026-04-21 + +--- + +## Pre-Deployment Validation + +### Terraform Validation +- [ ] `terraform fmt` passes (no formatting issues) +- [ ] `terraform validate` passes (syntax valid) +- [ ] `terraform plan` shows expected resources +- [ ] No hardcoded credentials in `.tf` files +- [ ] All variables have descriptions +- [ ] All outputs defined and documented + +### Security Validation +- [ ] GKE cluster uses private nodes +- [ ] Cloud SQL has no public IP +- [ ] Service accounts use Workload Identity +- [ ] Firewall rules restrict to required ports only +- [ ] Secrets stored in Kubernetes Secrets, not plain text +- [ ] SSL certificate configured for load balancer + +### Cost Validation +- [ ] Resource limits set (prevent runaway costs) +- [ ] Billing alerts configured +- [ ] Preemptible nodes NOT used for validators (require stability) + +--- + +## Deployment Testing + +### GKE Module Tests +1. **Cluster Creation** + ```bash + gcloud container clusters list --filter="name=sifnode-cluster" + ``` + - [ ] Cluster exists in correct region + - [ ] Node pool has 3+ nodes + - [ ] Nodes show "Ready" status + +2. **Node Configuration** + ```bash + kubectl get nodes -o wide + ``` + - [ ] Machine type matches spec (n2-standard-4) + - [ ] Disk size >= 100GB + - [ ] Shielded nodes enabled + +3. **Networking** + ```bash + kubectl get svc + ``` + - [ ] Services accessible within VPC + - [ ] External IPs assigned correctly + +### Cloud SQL Module Tests +1. 
**Instance Creation** + ```bash + gcloud sql instances describe sifnode-db + ``` + - [ ] Instance status is RUNNABLE + - [ ] Private IP configured + - [ ] Public IP disabled + - [ ] Automated backups enabled + +2. **Database Connectivity** + ```bash + # From GKE pod + pg_isready -h [PRIVATE_IP] -p 5432 + ``` + - [ ] Connection succeeds + - [ ] Authentication works + - [ ] Database created + +### Load Balancer Tests +1. **Health Checks** + ```bash + curl https://[DOMAIN]/status + ``` + - [ ] Returns HTTP 200 + - [ ] Response includes sync status + +2. **SSL Certificate** + ```bash + openssl s_client -connect [DOMAIN]:443 -servername [DOMAIN] + ``` + - [ ] Certificate valid + - [ ] Chain complete + - [ ] Not expired + +3. **Backend Service** + - [ ] All backends healthy + - [ ] Traffic distributed across nodes + +--- + +## Sifnode Application Tests + +### Pod Status +```bash +kubectl get pods -n sifnode +``` +- [ ] Pods in "Running" state +- [ ] No CrashLoopBackOff +- [ ] Restarts < 3 + +### Node Synchronization +```bash +kubectl exec -it sifnode-0 -n sifnode -- sifnoded status +``` +- [ ] Latest_block_height increasing +- [ ] Catching_up: false +- [ ] Sync complete within 30 minutes of deployment + +### API Endpoints +```bash +# RPC endpoint +curl http://[LB_IP]:26657/status + +# REST API +curl http://[LB_IP]:1317/cosmos/base/tendermint/v1beta1/blocks/latest +``` +- [ ] RPC responds with valid JSON +- [ ] REST API returns block data +- [ ] No timeout errors + +--- + +## Monitoring Validation + +### Metrics Collection +- [ ] Node CPU metrics visible in Cloud Monitoring +- [ ] Node memory metrics visible +- [ ] Disk usage metrics visible +- [ ] Sifnode-specific metrics (if custom metrics configured) + +### Alert Tests +- [ ] Test alert: Simulate node down → verify notification received +- [ ] Test alert: High CPU usage → verify threshold triggers +- [ ] Test alert: Disk full → verify early warning + +### Dashboards +- [ ] Dashboard exists: "Sifnode Validator Overview" +- [ ] All charts populate with data +- [ ] Time range 
selector works + +--- + +## Failure Scenario Tests + +### Node Failure +- [ ] Kill one pod: `kubectl delete pod sifnode-0` +- [ ] Verify pod auto-recreates +- [ ] Verify chain sync resumes +- [ ] Verify no data loss + +### Database Connection Loss +- [ ] Simulate Cloud SQL restart +- [ ] Verify Sifnode reconnects automatically +- [ ] Verify no corruption + +### Network Partition +- [ ] Block egress from one node +- [ ] Verify load balancer routes traffic away +- [ ] Verify recovery when partition heals + +--- + +## Performance Benchmarks + +| Metric | Target | Actual | Pass | +|--------|--------|--------|------| +| Block sync speed | >100 blocks/sec | | | +| API response time | <200ms p95 | | | +| Node startup time | <5 minutes | | | +| Failover time | <30 seconds | | | + +--- + +## Sign-Off + +**Tested by:** Kael +**Date:** ___________ +**Result:** ☐ PASS / ☐ FAIL + +**Notes:** +_____________________________________________ +_____________________________________________ diff --git a/terraform/.terraform.lock.hcl b/terraform/.terraform.lock.hcl new file mode 100644 index 0000000000..e690978063 --- /dev/null +++ b/terraform/.terraform.lock.hcl @@ -0,0 +1,22 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/google" { + version = "7.28.0" + constraints = ">= 5.0.0" + hashes = [ + "h1:M3DrxwI8FiHJpvq3yVX2QWZeqv5dyLt3nQ1YBm/TNXA=", + "zh:078c16b9c5e9067e72070367846976b58f906d8efab6fc4fc1325661717dc9cc", + "zh:08b839014b428233a3a83d15045e7559b07fc035c7f73cc1ee2694c50c4dea54", + "zh:0c76ea69f75633bdfc67a0cd6ea510332c0cb0f2d4968b8a070e546fb47e444e", + "zh:3a308492ad4c153583f7b8ecc3c80bf0bbc15a32c62b5b3794efb27db01ff26b", + "zh:6754f51373994470f78937856982b0a39648ac302713d07205d320a13ad41d82", + "zh:79d387214f55df16c795f11988a0285a4bfa846c447faa85008b953b77081eb1", + "zh:8de432482d77d1a1077b2dc3db764b8ba6d1b07a4b991a07c960855adc0b031b", + "zh:900daa2435de1928a9868aa4c17d8b7b109ab363c97f7fe274466193af1412b0", + "zh:96c25183a7f13b3de9a5631aa2a13ed1a4285b8393df90c2380c2fe74f350ab5", + "zh:971121626be01245acd9a4520a63e1405e4f528d3c83f39a28f8caaeac235b45", + "zh:e90d5e7d7bf47c8cf5bbf2e5d0bf855ed10350ad3584795a6911f85fdb5c0c3c", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} diff --git a/terraform/docs/architecture/gcp-reference-architecture.md b/terraform/docs/architecture/gcp-reference-architecture.md new file mode 100644 index 0000000000..57d2a4c18a --- /dev/null +++ b/terraform/docs/architecture/gcp-reference-architecture.md @@ -0,0 +1,206 @@ +# Sifnode GCP Reference Architecture + +**Document Version:** 1.0 +**Created:** 2026-04-21 +**Author:** Kael (Support) +**Status:** Ready for Implementation + +--- + +## Overview + +This document defines the Google Cloud Platform (GCP) infrastructure architecture for deploying Sifnode validator nodes using Terraform. 
+ +--- + +## Architecture Diagram + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ GCP Project │ +│ ┌───────────────────────────────────────────────────────────┐ │ +│ │ VPC Network │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ GKE │◄───│ Cloud SQL │ │ Cloud │ │ │ +│ │ │ Cluster │ │ PostgreSQL │ │ Load Balancer│ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ • Validator │ │ • Chain data│ │ • External │ │ │ +│ │ │ nodes │ │ • State │ │ • HTTPS │ │ │ +│ │ │ • Sentry │ │ • Indices │ │ • SSL term │ │ │ +│ │ │ nodes │ │ │ │ │ │ │ +│ │ └──────┬──────┘ └─────────────┘ └──────┬──────┘ │ │ +│ │ │ │ │ │ +│ │ ┌──────▼───────────────────────────────────────▼──────┐ │ │ +│ │ │ Cloud Monitoring │ │ │ +│ │ │ • Metrics • Alerts • Dashboards • Logging │ │ │ +│ │ └─────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Module Specifications + +### 1. GKE Module (modules/gke/) + +**Purpose:** Kubernetes cluster for Sifnode validator and sentry nodes + +**Configuration:** +- **Cluster Type:** Regional (multi-zone for HA) +- **Region:** us-central1 (configurable) +- **Node Pool:** + - Machine type: n2-standard-4 (4 vCPU, 16 GB RAM) + - Min nodes: 3 + - Max nodes: 5 (autoscaling enabled) + - Disk: 100GB SSD per node + - Preemptible: false (validators need stability) + +**Node Types:** +| Role | Count | Resources | Purpose | +|------|-------|-----------|---------| +| Validator | 2 | n2-standard-4 | Core validation, signing | +| Sentry | 2 | n2-standard-2 | Public RPC, peer proxy | + +**Security:** +- Private cluster (master authorized networks) +- Workload Identity enabled +- Network policies for pod isolation + +--- + +### 2. 
Cloud SQL Module (modules/cloud-sql/) + +**Purpose:** PostgreSQL database for chain data and application state + +**Configuration:** +- **Database Version:** PostgreSQL 14 +- **Tier:** db-custom-4-16384 (4 vCPU, 16GB RAM) +- **Storage:** 500GB SSD, auto-expand +- **High Availability:** Zone-redundant (us-central1-a, us-central1-b) +- **Backup:** Automated daily, 30-day retention + +**Network:** +- Private IP only (VPC peering) +- No public IP allocation +- Cloud SQL Auth Proxy for K8s access + +**Databases:** +| Database | Purpose | +|----------|---------| +| sifnode_chain | Chain state, blocks | +| sifnode_indices | Query indices | +| sifnode_app | Application data | + +--- + +### 3. Load Balancer Module (modules/load-balancer/) + +**Purpose:** External HTTPS load balancer for RPC endpoints + +**Configuration:** +- **Type:** External HTTP(S) Load Balancer +- **SSL:** Google-managed certificate (Let's Encrypt) +- **Backend:** GKE service (sifnode-rpc) +- **Health Checks:** HTTP /health endpoint + +**Ports:** +| Port | Service | Description | +|------|---------|-------------| +| 443 | RPC | JSON-RPC over HTTPS | +| 26657 | Tendermint RPC | Node status, queries | +| 1317 | REST API | LCD endpoints | + +**Rate Limiting:** +- Cloud Armor: 1000 req/min per IP +- DDoS protection enabled + +--- + +### 4. 
Monitoring Module (modules/monitoring/) + +**Purpose:** Observability stack for infrastructure and application + +**Components:** +- **Cloud Monitoring:** Metrics, dashboards, alerts +- **Cloud Logging:** Log aggregation, export +- **Cloud Trace:** Distributed tracing +- **Uptime Checks:** External health monitoring + +**Key Metrics:** +| Metric | Alert Threshold | Severity | +|--------|-----------------|----------| +| Node CPU | > 80% for 5 min | Warning | +| Node Memory | > 90% for 5 min | Critical | +| Block height stuck | No change for 2 min | Critical | +| Peer count | < 5 peers | Warning | +| Disk usage | > 85% | Warning | + +**Dashboards:** +- Sifnode Overview (blocks, peers, sync status) +- Infrastructure Health (CPU, memory, disk) +- RPC Performance (latency, error rates) + +--- + +## Security Best Practices + +### Network Security +- [ ] VPC with custom subnets (no default network) +- [ ] Private GKE cluster (master auth networks) +- [ ] Cloud SQL private IP only +- [ ] Firewall rules: least privilege +- [ ] Cloud Armor for DDoS protection + +### IAM & Access +- [ ] Service accounts per module +- [ ] Workload Identity for GKE +- [ ] No service account keys (use IAM roles) +- [ ] Cloud IAM audit logging enabled + +### Data Protection +- [ ] Cloud SQL encrypted at rest +- [ ] Cloud SQL automated backups +- [ ] GKE node disk encryption +- [ ] Secrets in Secret Manager + +--- + +## Cost Estimation + +| Resource | Monthly Cost | +|----------|--------------| +| GKE (4 nodes, n2-standard-4) | ~$350 | +| Cloud SQL (db-custom-4-16384, HA) | ~$120 | +| Load Balancer | ~$18 | +| Cloud Monitoring | ~$25 | +| **Total** | **~$513/month** | + +*Estimate for production deployment. 
Dev environments can use smaller instances.* + +--- + +## Implementation Checklist + +### Pre-Deployment +- [ ] GCP project created with billing +- [ ] Required APIs enabled (GKE, Cloud SQL, LB, Monitoring) +- [ ] Service accounts created with IAM roles +- [ ] Terraform state backend configured (GCS bucket) + +### Deployment Order +1. [ ] Network module (VPC, subnets, firewall) +2. [ ] Cloud SQL module (database provisioning) +3. [ ] GKE module (cluster, node pools) +4. [ ] Load Balancer module (IP, SSL, backend) +5. [ ] Monitoring module (alerts, dashboards) + +### Post-Deployment +- [ ] Verify all services healthy +- [ ] Test RPC endpoints +- [ ] Validate monitoring dashboards +- [ ] Document any deviations + +--- + +**Next:** See `../testing/validation-checklist.md` for test procedures. diff --git a/terraform/docs/runbooks/deployment-runbook.md b/terraform/docs/runbooks/deployment-runbook.md new file mode 100644 index 0000000000..a1f3c58bda --- /dev/null +++ b/terraform/docs/runbooks/deployment-runbook.md @@ -0,0 +1,379 @@ +# Sifnode GCP Deployment Runbook + +**Document Version:** 1.0 +**Created:** 2026-04-21 +**Author:** Kael (Support) +**Status:** Ready for Execution + +--- + +## Overview + +This runbook provides step-by-step procedures for deploying Sifnode validator infrastructure on Google Cloud Platform using Terraform. 
+ +**Estimated Time:** 4-6 hours +**Prerequisites:** GCP project with billing, Terraform >= 1.0 + +--- + +## Phase 1: Environment Setup (30 min) + +### 1.1 GCP Project Configuration + +```bash +# Set project +export PROJECT_ID="sifnode-gcp-mainnet" +gcloud config set project $PROJECT_ID + +# Enable required APIs +gcloud services enable container.googleapis.com +gcloud services enable sqladmin.googleapis.com +gcloud services enable compute.googleapis.com +gcloud services enable monitoring.googleapis.com +gcloud services enable logging.googleapis.com +``` + +### 1.2 Service Account Setup + +```bash +# Create Terraform service account +gcloud iam service-accounts create terraform-deploy \ + --display-name="Terraform Deployer" + +# Grant IAM roles +gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member="serviceAccount:terraform-deploy@$PROJECT_ID.iam.gserviceaccount.com" \ + --role="roles/editor" + +gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member="serviceAccount:terraform-deploy@$PROJECT_ID.iam.gserviceaccount.com" \ + --role="roles/container.admin" + +# Create and download key +gcloud iam service-accounts keys create terraform-key.json \ + --iam-account=terraform-deploy@$PROJECT_ID.iam.gserviceaccount.com +``` + +### 1.3 Terraform Backend + +```bash +# Create GCS bucket for state +gsutil mb -p $PROJECT_ID gs://$PROJECT_ID-tfstate + +gsutil versioning set on gs://$PROJECT_ID-tfstate +``` + +--- + +## Phase 2: Terraform Deployment (2-3 hours) + +### 2.1 Initialize Terraform + +```bash +cd /home/wls/bounty-workspaces/sifnode-gcp/terraform + +# Initialize with backend +cat > backend.tf << 'EOF' +terraform { + backend "gcs" { + bucket = "sifnode-gcp-mainnet-tfstate" + prefix = "terraform/state" + } +} +EOF + +terraform init +``` + +### 2.2 Configure Variables + +```bash +# Create terraform.tfvars +cat > terraform.tfvars << 'EOF' +project_id = "sifnode-gcp-mainnet" +region = "us-central1" + +# GKE Configuration +cluster_name = "sifnode-mainnet" 
+node_pool_name = "validator-pool" +node_count = 3 +node_machine_type = "n2-standard-4" + +# Cloud SQL Configuration +sql_instance_name = "sifnode-db" +database_tier = "db-custom-4-16384" +database_version = "POSTGRES_14" + +# Network +vpc_name = "sifnode-vpc" +subnet_name = "sifnode-subnet" +subnet_cidr = "10.0.0.0/24" + +# Monitoring +enable_monitoring = true +alert_email = "alerts@example.com" +EOF +``` + +### 2.3 Deploy Network Module + +```bash +terraform plan -target=module.network +terraform apply -target=module.network -auto-approve + +# Verify +gcloud compute networks list +gcloud compute networks subnets list +``` + +### 2.4 Deploy Cloud SQL Module + +```bash +terraform plan -target=module.cloud-sql +terraform apply -target=module.cloud-sql -auto-approve + +# Verify +gcloud sql instances list +``` + +### 2.5 Deploy GKE Module + +```bash +terraform plan -target=module.gke +terraform apply -target=module.gke -auto-approve + +# Verify +gcloud container clusters list +kubectl get nodes +``` + +### 2.6 Deploy Load Balancer Module + +```bash +terraform plan -target=module.load-balancer +terraform apply -target=module.load-balancer -auto-approve + +# Verify +gcloud compute forwarding-rules list +``` + +### 2.7 Deploy Monitoring Module + +```bash +terraform plan -target=module.monitoring +terraform apply -target=module.monitoring -auto-approve + +# Verify +gcloud monitoring dashboards list +``` + +--- + +## Phase 3: Database Setup (30 min) + +### 3.1 Create Databases + +```bash +# Get private IP +SQL_IP=$(gcloud sql instances describe sifnode-db --format='value(ipAddresses[0].ipAddress)') + +# Connect and create databases +gcloud sql connect sifnode-db --user=postgres --quiet << 'EOF' +CREATE DATABASE sifnode_chain; +CREATE DATABASE sifnode_indices; +CREATE DATABASE sifnode_app; +CREATE USER sifnode WITH PASSWORD '[SECURE_PASSWORD]'; +GRANT ALL PRIVILEGES ON DATABASE sifnode_chain TO sifnode; +GRANT ALL PRIVILEGES ON DATABASE sifnode_indices TO sifnode; +\q +EOF 
+``` + +### 3.2 Configure Connection + +```bash +# Create Cloud SQL Proxy secret +kubectl create secret generic cloudsql-credentials \ + --from-literal=username=sifnode \ + --from-literal=password='[SECURE_PASSWORD]' +``` + +--- + +## Phase 4: Sifnode Deployment (1-2 hours) + +### 4.1 Build Docker Image + +```bash +cd /home/wls/bounty-workspaces/sifnode-gcp + +# Build +docker build -t gcr.io/$PROJECT_ID/sifnode:latest -f Dockerfile . + +# Push +docker push gcr.io/$PROJECT_ID/sifnode:latest +``` + +### 4.2 Deploy to GKE + +```bash +# Create namespace +kubectl create namespace sifnode + +# Deploy validator nodes +kubectl apply -f k8s/validator-deployment.yaml +kubectl apply -f k8s/validator-service.yaml + +# Deploy sentry nodes +kubectl apply -f k8s/sentry-deployment.yaml +kubectl apply -f k8s/sentry-service.yaml + +# Verify +kubectl get pods -n sifnode +kubectl get services -n sifnode +``` + +### 4.3 Verify Sifnode Status + +```bash +# Check pod logs +kubectl logs -f deployment/sifnode-validator -n sifnode + +# Check sync status +kubectl exec -it deployment/sifnode-validator -n sifnode -- \ + sifnoded status | jq '.SyncInfo.catching_up' + +# Expected: false (when synced) +``` + +--- + +## Phase 5: Verification (30 min) + +### 5.1 Infrastructure Tests + +```bash +# Run validation checklist tests +./scripts/validate-infrastructure.sh + +# Expected: All tests pass +``` + +### 5.2 Endpoint Tests + +```bash +# Get load balancer IP +LB_IP=$(gcloud compute forwarding-rules describe sifnode-lb \ + --global --format='value(IPAddress)') + +# Test RPC +curl -s http://$LB_IP:26657/status | jq '.result.sync_info' + +# Test REST API +curl -s http://$LB_IP:1317/node_info | jq '.node_info' + +# Expected: Valid JSON responses +``` + +### 5.3 Monitoring Verification + +```bash +# Check dashboards +gcloud monitoring dashboards list + +# Check alerts +gcloud alpha monitoring policies list + +# Verify uptime checks +gcloud monitoring uptime list-configs +``` + +--- + +## Rollback Procedures + 
+### Scenario 1: Terraform Destroy (Full Teardown) + +```bash +# Destroy all resources +terraform destroy -auto-approve + +# Verify +gcloud compute instances list # Should be empty +gcloud container clusters list # Should be empty +gcloud sql instances list # Should be empty +``` + +### Scenario 2: Partial Rollback (Single Module) + +```bash +# Example: Rollback GKE only +terraform destroy -target=module.gke -auto-approve + +# Re-deploy +terraform apply -target=module.gke -auto-approve +``` + +### Scenario 3: Application Rollback + +```bash +# Rollback to previous version +kubectl rollout undo deployment/sifnode-validator -n sifnode + +# Verify rollback +kubectl rollout status deployment/sifnode-validator -n sifnode +``` + +--- + +## Post-Deployment Tasks + +### Immediate (Today) +- [ ] Verify all endpoints responding +- [ ] Confirm monitoring dashboards populated +- [ ] Test alert notifications +- [ ] Document any deviations from plan + +### Short-term (This Week) +- [ ] Configure backup verification +- [ ] Document operational procedures +- [ ] Train team on incident response +- [ ] Review and optimize costs + +### Ongoing +- [ ] Monitor sync status daily +- [ ] Review alerts weekly +- [ ] Update documentation monthly +- [ ] Security patches as needed + +--- + +## Troubleshooting + +### Common Issues + +**Issue:** Terraform apply fails with API not enabled +**Fix:** Run API enablement commands in Phase 1.1 + +**Issue:** GKE nodes not joining cluster +**Fix:** Check firewall rules allow master-to-node communication + +**Issue:** Cloud SQL connection refused +**Fix:** Verify private IP peering and service account permissions + +**Issue:** Sifnode not syncing **Fix:** Check peer connections (`sifnoded net_info`), verify genesis file + +--- + +## Contact Information + +| Role | Responsibility | Contact | +|------|---------------|---------| +| William | Lead Implementation | bridge handoff | +| Kael | Support/Documentation | bridge handoff | + +**Emergency:** Escalate 
to Lisa via bridge if deployment blocked > 30 min + +--- + +**Reference:** See `../architecture/gcp-reference-architecture.md` for architecture details. diff --git a/terraform/docs/testing/validation-checklist.md b/terraform/docs/testing/validation-checklist.md new file mode 100644 index 0000000000..1307674261 --- /dev/null +++ b/terraform/docs/testing/validation-checklist.md @@ -0,0 +1,194 @@ +# Sifnode GCP Validation Checklist + +**Document Version:** 1.0 +**Created:** 2026-04-21 +**Author:** Kael (Support) + +--- + +## Pre-Deployment Validation + +### Terraform Validation +| Check | Command | Expected Result | +|-------|---------|-----------------| +| Syntax check | `terraform validate` | ✓ Success | +| Format check | `terraform fmt -check` | ✓ All files formatted | +| Plan review | `terraform plan` | Review output, no errors | +| State lock | Check GCS bucket | No stale locks | + +### Security Validation +| Check | Method | Requirement | +|-------|--------|-------------| +| No hardcoded secrets | `grep -rn -e password -e secret -e key --include="*.tf" .` | Use vars/Secret Manager only | +| Private cluster | Verify GKE config | `enable_private_nodes = true` | +| SQL private IP | Check Cloud SQL config | `ipv4_enabled = false` | +| Firewall rules | Review rules | Least privilege, specific CIDRs | + +--- + +## Module-by-Module Testing + +### 1. 
GKE Module Tests + +| Test | Command/Method | Pass Criteria | +|------|----------------|---------------| +| Cluster reachable | `gcloud container clusters get-credentials` | No errors | +| Nodes ready | `kubectl get nodes` | All nodes Ready | +| Workload Identity | `kubectl auth can-i list pods` | Success | +| Network policies | Deploy test pod | Pod isolation works | +| Autoscaling | Check HPA | Scales to demand | + +**Sifnode-Specific Tests:** +| Test | Method | Pass Criteria | +|------|--------|---------------| +| Validator pod running | `kubectl get pods -n sifnode` | Status: Running | +| Sentry pod running | `kubectl get pods -n sifnode` | Status: Running | +| ConfigMaps mounted | `kubectl describe pod` | Config volume mounted | +| Secrets accessible | Pod logs | No secret access errors | + +--- + +### 2. Cloud SQL Module Tests + +| Test | Command | Pass Criteria | +|------|---------|---------------| +| Instance created | `gcloud sql instances list` | Status: RUNNABLE | +| Private IP working | `gcloud compute networks peerings list` | Peering active | +| Database accessible | `psql -h [PRIVATE_IP] -U postgres` | Connection succeeds | +| Backup configured | `gcloud sql backups list` | Recent backups shown | +| HA configured | `gcloud sql instances describe` | `availabilityType: REGIONAL` | + +**Sifnode Database Tests:** +| Test | Method | Pass Criteria | +|------|--------|---------------| +| Chain DB exists | `\l` in psql | sifnode_chain listed | +| Tables created | `\dt` in sifnode_chain | Expected tables present | +| Indices created | Query performance | Queries use indices | +| Connection from K8s | Test pod connection | Success via Cloud SQL Proxy | + +--- + +### 3. 
Load Balancer Module Tests + +| Test | Command | Pass Criteria | +|------|---------|---------------| +| IP reserved | `gcloud compute addresses list` | Status: RESERVED | +| SSL certificate | `gcloud compute ssl-certificates list` | Status: ACTIVE | +| Backend service | `gcloud compute backend-services list` | Health check passing | +| URL map | `gcloud compute url-maps list` | Routes configured | + +**Endpoint Tests:** +| Test | URL | Expected Response | +|------|-----|-------------------| +| Health check | `https://[DOMAIN]/health` | HTTP 200 | +| RPC endpoint | `https://[DOMAIN]:443` | JSON-RPC response | +| Tendermint RPC | `https://[DOMAIN]:26657/status` | Node status JSON | +| REST API | `https://[DOMAIN]:1317/node_info` | Node info JSON | + +--- + +### 4. Monitoring Module Tests + +| Test | Method | Pass Criteria | +|------|--------|---------------| +| Dashboards created | Cloud Console → Monitoring → Dashboards | Sifnode dashboards present | +| Alert policies | Cloud Console → Alerting | Policies active | +| Uptime checks | Cloud Console → Uptime | Checks passing | +| Log sinks | `gcloud logging sinks list` | Sinks configured | + +**Metric Verification:** +| Metric | Query | Expected | +|--------|-------|----------| +| CPU usage | `compute.googleapis.com/instance/cpu/utilization` | Data flowing | +| Disk I/O | `compute.googleapis.com/instance/disk/write_bytes_count` | Data flowing | +| Custom metrics | Sifnode exporter | Block height metric | + +--- + +## Sifnode Application Tests + +### Sync Status +| Test | Command | Pass Criteria | +|------|---------|---------------| +| Node catching up | `curl localhost:26657/status` | `catching_up: false` | +| Block height | Compare with public explorer | Within 5 blocks | +| Sync rate | Monitor over 10 min | > 100 blocks/min | + +### API Tests +| Test | Command | Pass Criteria | +|------|---------|---------------| +| RPC available | `curl localhost:26657/net_info` | Returns peer info | +| REST API available | `curl 
localhost:1317/bank/balances/[ADDR]` | Returns balance | +| gRPC available | `grpcurl -plaintext localhost:9090 list` | Services listed | + +### Health Checks +| Test | Method | Pass Criteria | +|------|--------|---------------| +| Process running | `pgrep sifnoded` | Process found | +| Port listening | `netstat -tlnp \| grep 26657` | Port open | +| Memory usage | `kubectl top pod` | < 8GB per pod | +| Disk usage | `df -h` | < 85% used | + +--- + +## Failure Scenario Tests + +### Node Failure Simulation +| Scenario | Action | Expected Recovery | +|----------|--------|-------------------| +| Validator pod restart | `kubectl delete pod` | Auto-restart, re-sync | +| Node drain | `kubectl drain` | Pods migrate to other nodes | +| Zone outage | Simulate zone failure | HA keeps service up | + +### Database Failure +| Scenario | Action | Expected Recovery | +|----------|--------|-------------------| +| Primary failover | Trigger failover | Automatic, < 60s downtime | +| Connection loss | Block SQL port | App reconnects automatically | +| Backup restore | Restore from backup | Data integrity maintained | + +### Network Failure +| Scenario | Action | Expected Recovery | +|----------|--------|-------------------| +| LB backend unhealthy | Stop pods | LB routes to healthy backends | +| SSL cert expiry | Check cert rotation | Auto-renewal | +| DDoS attack | Cloud Armor test | Rate limiting triggers | + +--- + +## Performance Benchmarks + +### Load Testing +| Test | Tool | Target | +|------|------|--------| +| RPC throughput | `wrk` | 1000 req/s | +| Tendermint queries | Custom script | 500 req/s | +| Sync speed | Monitor | 100+ blocks/min | + +### Latency Targets +| Metric | Target | Critical | +|--------|--------|----------| +| RPC response | < 100ms | > 500ms | +| Block propagation | < 5s | > 30s | +| DB query | < 50ms | > 200ms | + +--- + +## Sign-Off Checklist + +| Validator | Role | Date | Signature | +|-----------|------|------|-----------| +| | William (Lead) | | | +| | Kael 
(Support) | | | + +**Final Approval:** +- [ ] All module tests passing +- [ ] All application tests passing +- [ ] Failure scenarios validated +- [ ] Performance benchmarks met +- [ ] Documentation updated +- [ ] Ready for production + +--- + +**Next:** See `../runbooks/deployment-runbook.md` for deployment procedures. diff --git a/terraform/environments/dev/.terraform.lock.hcl b/terraform/environments/dev/.terraform.lock.hcl new file mode 100644 index 0000000000..e690978063 --- /dev/null +++ b/terraform/environments/dev/.terraform.lock.hcl @@ -0,0 +1,22 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/hashicorp/google" { + version = "7.28.0" + constraints = ">= 5.0.0" + hashes = [ + "h1:M3DrxwI8FiHJpvq3yVX2QWZeqv5dyLt3nQ1YBm/TNXA=", + "zh:078c16b9c5e9067e72070367846976b58f906d8efab6fc4fc1325661717dc9cc", + "zh:08b839014b428233a3a83d15045e7559b07fc035c7f73cc1ee2694c50c4dea54", + "zh:0c76ea69f75633bdfc67a0cd6ea510332c0cb0f2d4968b8a070e546fb47e444e", + "zh:3a308492ad4c153583f7b8ecc3c80bf0bbc15a32c62b5b3794efb27db01ff26b", + "zh:6754f51373994470f78937856982b0a39648ac302713d07205d320a13ad41d82", + "zh:79d387214f55df16c795f11988a0285a4bfa846c447faa85008b953b77081eb1", + "zh:8de432482d77d1a1077b2dc3db764b8ba6d1b07a4b991a07c960855adc0b031b", + "zh:900daa2435de1928a9868aa4c17d8b7b109ab363c97f7fe274466193af1412b0", + "zh:96c25183a7f13b3de9a5631aa2a13ed1a4285b8393df90c2380c2fe74f350ab5", + "zh:971121626be01245acd9a4520a63e1405e4f528d3c83f39a28f8caaeac235b45", + "zh:e90d5e7d7bf47c8cf5bbf2e5d0bf855ed10350ad3584795a6911f85fdb5c0c3c", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} diff --git a/terraform/environments/dev/main.tf b/terraform/environments/dev/main.tf new file mode 100644 index 0000000000..80a3f65bc4 --- /dev/null +++ b/terraform/environments/dev/main.tf @@ -0,0 +1,23 @@ +# Sifnode GCP - Dev Environment +# Wires root module with 
dev-specific values + +module "sifnode_dev" { + source = "../../" + + project_id = var.project_id + region = "us-central1" + vpc_name = "sifnode-dev-vpc" + subnet_cidr = "10.0.0.0/20" + gke_cluster_name = "sifnode-dev" + gke_node_count_min = 3 + gke_node_count_max = 5 + gke_machine_type = "n2-standard-4" + sql_instance_name = "sifnode-db-dev" + sql_database_version = "POSTGRES_14" + sql_tier = "db-custom-2-4096" + sql_database_name = "sifnodedb" + sql_deletion_protection = false + lb_domain = "" + notification_email = var.notification_email + alert_channels = [] +} \ No newline at end of file diff --git a/terraform/environments/dev/terraform.tfvars b/terraform/environments/dev/terraform.tfvars new file mode 100644 index 0000000000..5007553fc1 --- /dev/null +++ b/terraform/environments/dev/terraform.tfvars @@ -0,0 +1,3 @@ +# Dev environment variables +project_id = "sifnode-gcp-support" +notification_email = "ops@example.com" \ No newline at end of file diff --git a/terraform/environments/dev/variables.tf b/terraform/environments/dev/variables.tf new file mode 100644 index 0000000000..9090abbd9f --- /dev/null +++ b/terraform/environments/dev/variables.tf @@ -0,0 +1,10 @@ +# Dev environment input variables +variable "project_id" { + description = "GCP project ID" + type = string +} + +variable "notification_email" { + description = "Email for alert notifications" + type = string +} \ No newline at end of file diff --git a/terraform/environments/prod/.terraform.lock.hcl b/terraform/environments/prod/.terraform.lock.hcl new file mode 100644 index 0000000000..e690978063 --- /dev/null +++ b/terraform/environments/prod/.terraform.lock.hcl @@ -0,0 +1,22 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/google" { + version = "7.28.0" + constraints = ">= 5.0.0" + hashes = [ + "h1:M3DrxwI8FiHJpvq3yVX2QWZeqv5dyLt3nQ1YBm/TNXA=", + "zh:078c16b9c5e9067e72070367846976b58f906d8efab6fc4fc1325661717dc9cc", + "zh:08b839014b428233a3a83d15045e7559b07fc035c7f73cc1ee2694c50c4dea54", + "zh:0c76ea69f75633bdfc67a0cd6ea510332c0cb0f2d4968b8a070e546fb47e444e", + "zh:3a308492ad4c153583f7b8ecc3c80bf0bbc15a32c62b5b3794efb27db01ff26b", + "zh:6754f51373994470f78937856982b0a39648ac302713d07205d320a13ad41d82", + "zh:79d387214f55df16c795f11988a0285a4bfa846c447faa85008b953b77081eb1", + "zh:8de432482d77d1a1077b2dc3db764b8ba6d1b07a4b991a07c960855adc0b031b", + "zh:900daa2435de1928a9868aa4c17d8b7b109ab363c97f7fe274466193af1412b0", + "zh:96c25183a7f13b3de9a5631aa2a13ed1a4285b8393df90c2380c2fe74f350ab5", + "zh:971121626be01245acd9a4520a63e1405e4f528d3c83f39a28f8caaeac235b45", + "zh:e90d5e7d7bf47c8cf5bbf2e5d0bf855ed10350ad3584795a6911f85fdb5c0c3c", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} diff --git a/terraform/environments/prod/main.tf b/terraform/environments/prod/main.tf new file mode 100644 index 0000000000..03a2af8dff --- /dev/null +++ b/terraform/environments/prod/main.tf @@ -0,0 +1,23 @@ +# Sifnode GCP - Prod Environment +# Wires root module with prod-specific values (HA, more nodes) + +module "sifnode_prod" { + source = "../../" + + project_id = var.project_id + region = "us-central1" + vpc_name = "sifnode-prod-vpc" + subnet_cidr = "10.1.0.0/20" + gke_cluster_name = "sifnode-prod" + gke_node_count_min = 5 + gke_node_count_max = 7 + gke_machine_type = "n2-standard-4" + sql_instance_name = "sifnode-db-prod" + sql_database_version = "POSTGRES_14" + sql_tier = "db-custom-4-8192" + sql_database_name = "sifnodedb" + sql_deletion_protection = true + lb_domain = var.lb_domain + notification_email = var.notification_email + alert_channels = var.alert_channels +} \ No newline at end of file diff --git 
a/terraform/environments/prod/terraform.tfvars b/terraform/environments/prod/terraform.tfvars new file mode 100644 index 0000000000..af60e69517 --- /dev/null +++ b/terraform/environments/prod/terraform.tfvars @@ -0,0 +1,4 @@ +# Prod environment variables +project_id = "sifnode-gcp-support" +notification_email = "ops@example.com" +lb_domain = "" \ No newline at end of file diff --git a/terraform/environments/prod/variables.tf b/terraform/environments/prod/variables.tf new file mode 100644 index 0000000000..27e86fa486 --- /dev/null +++ b/terraform/environments/prod/variables.tf @@ -0,0 +1,22 @@ +# Prod environment input variables +variable "project_id" { + description = "GCP project ID" + type = string +} + +variable "notification_email" { + description = "Email for alert notifications" + type = string +} + +variable "lb_domain" { + description = "Domain for SSL certificate" + type = string + default = "" +} + +variable "alert_channels" { + description = "Additional alert notification channels" + type = list(string) + default = [] +} \ No newline at end of file diff --git a/terraform/main.tf b/terraform/main.tf new file mode 100644 index 0000000000..5fede958e5 --- /dev/null +++ b/terraform/main.tf @@ -0,0 +1,70 @@ +# ----------------------------------------------------------------------------- +# Sifnode GCP - Root Module +# Wires all sub-modules together +# ----------------------------------------------------------------------------- + +# VPC Network +module "vpc" { + source = "./modules/vpc" + + project_id = var.project_id + region = var.region + vpc_name = var.vpc_name + subnet_cidr = var.subnet_cidr +} + +# GKE Cluster +module "gke" { + source = "./modules/gke" + + project_id = var.project_id + region = var.region + cluster_name = var.gke_cluster_name + node_count_min = var.gke_node_count_min + node_count_max = var.gke_node_count_max + machine_type = var.gke_machine_type + network_vpc_id = module.vpc.vpc_id + subnetwork_id = module.vpc.subnet_id + + depends_on = 
[module.vpc] +} + +# Cloud SQL +module "cloud_sql" { + source = "./modules/cloud-sql" + + project_id = var.project_id + region = var.region + instance_name = var.sql_instance_name + database_version = var.sql_database_version + tier = var.sql_tier + database_name = var.sql_database_name + vpc_id = module.vpc.vpc_id + deletion_protection = var.sql_deletion_protection + + depends_on = [module.vpc] +} + +# Load Balancer +module "load_balancer" { + source = "./modules/load-balancer" + + project_id = var.project_id + name = "sifnode-lb" + region = var.region + domain = var.lb_domain + + depends_on = [module.gke] +} + +# Monitoring +module "monitoring" { + source = "./modules/monitoring" + + project_id = var.project_id + notification_email = var.notification_email + alert_channels = var.alert_channels + cluster_name = var.gke_cluster_name + + depends_on = [module.gke] +} \ No newline at end of file diff --git a/terraform/modules/cloud-sql/main.tf b/terraform/modules/cloud-sql/main.tf new file mode 100644 index 0000000000..373f57c1a3 --- /dev/null +++ b/terraform/modules/cloud-sql/main.tf @@ -0,0 +1,72 @@ +# Private services access: allocate IP range for Cloud SQL private IP +resource "google_compute_global_address" "private_ip_range" { + count = var.vpc_id != "" ? 1 : 0 + name = "${var.instance_name}-private-ip-range" + project = var.project_id + purpose = "VPC_PEERING" + address_type = "INTERNAL" + prefix_length = 16 + network = var.vpc_id +} + +# Establish private service networking connection +resource "google_service_networking_connection" "private_vpc_connection" { + count = var.vpc_id != "" ? 
1 : 0
+  network                 = var.vpc_id
+  service                 = "servicenetworking.googleapis.com"
+  reserved_peering_ranges = [google_compute_global_address.private_ip_range[0].name]
+}
+
+# Cloud SQL PostgreSQL instance
+resource "google_sql_database_instance" "instance" {
+  name             = var.instance_name
+  project          = var.project_id
+  region           = var.region
+  database_version = var.database_version
+
+  settings {
+    tier = var.tier
+
+    disk_size       = var.storage_size_gb
+    disk_type       = "PD_SSD"
+    disk_autoresize = true
+
+    availability_type = "REGIONAL"
+
+    backup_configuration {
+      enabled                        = true
+      start_time                     = "02:00"
+      point_in_time_recovery_enabled = true
+
+      backup_retention_settings {
+        retained_backups = 7
+      }
+    }
+
+    ip_configuration {
+      ipv4_enabled    = var.vpc_id != "" ? false : true
+      private_network = var.vpc_id != "" ? var.vpc_id : null
+    }
+  }
+
+  deletion_protection = var.deletion_protection
+
+  depends_on = [
+    google_service_networking_connection.private_vpc_connection
+  ]
+}
+
+# Default database
+resource "google_sql_database" "database" {
+  name     = var.database_name
+  project  = var.project_id
+  instance = google_sql_database_instance.instance.name
+}
+
+# Default user - password is supplied via var.user_password, never hard-coded here
+resource "google_sql_user" "user" {
+  name     = "sifnode"
+  project  = var.project_id
+  instance = google_sql_database_instance.instance.name
+  password = var.user_password
+}
\ No newline at end of file
diff --git a/terraform/modules/cloud-sql/outputs.tf b/terraform/modules/cloud-sql/outputs.tf
new file mode 100644
index 0000000000..27427ccaf6
--- /dev/null
+++ b/terraform/modules/cloud-sql/outputs.tf
@@ -0,0 +1,24 @@
+output "instance_name" {
+  description = "Name of the Cloud SQL instance"
+  value       = google_sql_database_instance.instance.name
+}
+
+output "instance_connection_name" {
+  description = "Connection name of the Cloud SQL instance (project:region:instance)"
+  value       = google_sql_database_instance.instance.connection_name
+}
+
+output "private_ip" {
+  description = "Private IP address of the Cloud SQL instance"
+  value       = google_sql_database_instance.instance.private_ip_address
+}
+
+output "database_name" {
+  description = "Name of the created database"
+  value       = google_sql_database.database.name
+}
+
+output "server_ca_cert" {
+  description = "Server CA certificate of the Cloud SQL instance"
+  value       = google_sql_database_instance.instance.server_ca_cert
+}
\ No newline at end of file
diff --git a/terraform/modules/cloud-sql/variables.tf b/terraform/modules/cloud-sql/variables.tf
new file mode 100644
index 0000000000..15583425fa
--- /dev/null
+++ b/terraform/modules/cloud-sql/variables.tf
@@ -0,0 +1,57 @@
+variable "project_id" {
+  description = "GCP project ID where Cloud SQL will be created"
+  type        = string
+}
+
+variable "region" {
+  description = "GCP region for the Cloud SQL instance"
+  type        = string
+}
+
+variable "instance_name" {
+  description = "Name of the Cloud SQL instance"
+  type        = string
+}
+
+variable "database_version" {
+  description = "PostgreSQL database version"
+  type        = string
+  default     = "POSTGRES_14"
+}
+
+variable "tier" {
+  description = "Machine tier for the Cloud SQL instance"
+  type        = string
+  default     = "db-custom-2-4096"
+}
+
+variable "storage_size_gb" {
+  description = "Storage size in GB for the Cloud SQL instance"
+  type        = number
+  default     = 100
+}
+
+variable "database_name" {
+  description = "Name of the default database to create"
+  type        = string
+  default     = "sifnodedb"
+}
+
+variable "user_password" {
+  description = "Password for the default 'sifnode' SQL user. Supply it from a secret store (e.g. Secret Manager or a TF_VAR_ environment variable); never commit it. Defaults to null (no password set) so existing callers keep working."
+  type        = string
+  default     = null
+  sensitive   = true
+}
+
+variable "vpc_id" {
+  description = "VPC network ID for private services access"
+  type        = string
+  default     = ""
+}
+
+variable "deletion_protection" {
+  description = "Whether to enable deletion protection on the Cloud SQL instance"
+  type        = bool
+  default     = true
+}
\ No newline at end of file
diff --git a/terraform/modules/cloud-sql/versions.tf b/terraform/modules/cloud-sql/versions.tf
new file mode 100644
index 0000000000..3a1bb14de7
--- /dev/null
+++ b/terraform/modules/cloud-sql/versions.tf
@@ -0,0 +1,10 @@
+terraform {
+  required_version = ">= 1.3.0"
+
+
required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = ">= 5.0"
+    }
+  }
+}
\ No newline at end of file
diff --git a/terraform/modules/gke/main.tf b/terraform/modules/gke/main.tf
new file mode 100644
index 0000000000..f4bdb223e0
--- /dev/null
+++ b/terraform/modules/gke/main.tf
@@ -0,0 +1,142 @@
+# -----------------------------------------------------------------------------
+# GKE Module - Main Resources
+# Sifnode Validator Cluster (Regional, Private, VPC-native)
+# -----------------------------------------------------------------------------
+
+# Service account for the GKE validator node pool
+resource "google_service_account" "validator_sa" {
+  account_id   = "${var.cluster_name}-validator"
+  display_name = "Service account for Sifnode validator GKE node pool"
+  project      = var.project_id
+}
+
+# -----------------------------------------------------------------------------
+# GKE Regional Private Cluster
+# -----------------------------------------------------------------------------
+resource "google_container_cluster" "validator" {
+  name     = var.cluster_name
+  project  = var.project_id
+  location = var.region
+
+  # Regional cluster (multi-zone HA) - do not set node locations explicitly
+  # to let GCP pick all zones in the region automatically.
+
+  # VPC-native (IP aliasing enabled) for private cluster support
+  network    = var.network_vpc_id
+  subnetwork = var.subnetwork_id
+
+  # Private cluster configuration
+  private_cluster_config {
+    enable_private_nodes    = true
+    enable_private_endpoint = false
+    master_ipv4_cidr_block  = var.master_ipv4_cidr_block
+  }
+
+  # IP allocation policy (VPC-native / alias IPs)
+  ip_allocation_policy {
+    # Let GKE auto-create secondary ranges for Pods and Services
+  }
+
+  # Workload Identity configuration
+  workload_identity_config {
+    workload_pool = "${var.project_id}.svc.id.goog"
+  }
+
+  # Network Policy enforcement
+  network_policy {
+    enabled  = true
+    provider = "CALICO"
+  }
+
+  # Shielded GKE nodes
+  enable_shielded_nodes = true
+
+  # Remove the default node pool right after creation - validators run on the
+  # dedicated "validator-pool" defined below. GKE still needs one initial node
+  # to create the cluster, hence initial_node_count = 1.
+  remove_default_node_pool = true
+  initial_node_count       = 1
+
+  # Deletion protection is disabled so the cluster can be removed with
+  # `terraform destroy`. NOTE(review): enable (or parameterize) for prod.
+  deletion_protection = false
+
+  lifecycle {
+    prevent_destroy = false
+    ignore_changes  = [initial_node_count]
+  }
+}
+
+# -----------------------------------------------------------------------------
+# Validator Node Pool
+# -----------------------------------------------------------------------------
+resource "google_container_node_pool" "validator_pool" {
+  name     = "validator-pool"
+  project  = var.project_id
+  location = var.region
+  cluster  = google_container_cluster.validator.name
+
+  # Initial node count (starts at min for autoscaling)
+  initial_node_count = var.node_count_min
+
+  # Autoscaling configuration
+  autoscaling {
+    min_node_count = var.node_count_min
+    max_node_count = var.node_count_max
+  }
+
+  # Node configuration
+  node_config {
+    machine_type = var.machine_type
+    disk_size_gb = 100
+    disk_type    = "pd-ssd"
+
+    # Not preemptible - validator nodes must be stable
+    preemptible = false
+
+    # Service account for node pool
+    service_account = google_service_account.validator_sa.email
+
+    # OAuth scopes for GCP API access
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/cloud-platform",
+
"https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring", + ] + + # Taint: dedicate nodes to validator workloads only + taint { + key = "dedicated" + value = "validator" + effect = "NO_SCHEDULE" + } + + # Labels for identification + labels = { + purpose = "validator" + } + + # Shielded instance config + shielded_instance_config { + enable_secure_boot = true + enable_integrity_monitoring = true + } + + # Workload Identity metadata + workload_metadata_config { + mode = "GKE_METADATA" + } + } + + # Management - auto-repair and auto-upgrade for resilience + management { + auto_repair = true + auto_upgrade = true + } + + lifecycle { + create_before_destroy = true + } + + depends_on = [google_container_cluster.validator] +} \ No newline at end of file diff --git a/terraform/modules/gke/outputs.tf b/terraform/modules/gke/outputs.tf new file mode 100644 index 0000000000..e46d7fde84 --- /dev/null +++ b/terraform/modules/gke/outputs.tf @@ -0,0 +1,35 @@ +# ----------------------------------------------------------------------------- +# GKE Module - Outputs +# Sifnode Validator Cluster +# ----------------------------------------------------------------------------- + +output "cluster_name" { + description = "The name of the GKE cluster." + value = google_container_cluster.validator.name +} + +output "cluster_id" { + description = "The unique identifier for the GKE cluster (project/location/name)." + value = google_container_cluster.validator.id +} + +output "endpoint" { + description = "The public endpoint for the GKE cluster API server." + value = google_container_cluster.validator.endpoint +} + +output "ca_certificate" { + description = "The base64-encoded public certificate for the cluster's CA." + value = google_container_cluster.validator.master_auth[0].cluster_ca_certificate + sensitive = true +} + +output "node_pool_name" { + description = "The name of the validator node pool." 
+ value = google_container_node_pool.validator_pool.name +} + +output "service_account" { + description = "The email of the service account created for the validator node pool." + value = google_service_account.validator_sa.email +} \ No newline at end of file diff --git a/terraform/modules/gke/variables.tf b/terraform/modules/gke/variables.tf new file mode 100644 index 0000000000..2f92ccab26 --- /dev/null +++ b/terraform/modules/gke/variables.tf @@ -0,0 +1,53 @@ +# ----------------------------------------------------------------------------- +# GKE Module - Input Variables +# Sifnode Validator Cluster +# ----------------------------------------------------------------------------- + +variable "project_id" { + description = "The GCP project ID where the GKE cluster will be created." + type = string +} + +variable "region" { + description = "The GCP region for the regional GKE cluster. Controls which zones are used for multi-zone HA." + type = string +} + +variable "cluster_name" { + description = "The name of the GKE cluster. Must be unique within the project." + type = string +} + +variable "node_count_min" { + description = "Minimum number of nodes in the validator node pool (autoscaling lower bound)." + type = number + default = 3 +} + +variable "node_count_max" { + description = "Maximum number of nodes in the validator node pool (autoscaling upper bound)." + type = number + default = 5 +} + +variable "machine_type" { + description = "The machine type for validator node pool instances. Default n2-standard-4 provides 4 vCPU and 16 GB RAM." + type = string + default = "n2-standard-4" +} + +variable "network_vpc_id" { + description = "The self-link or name of the VPC network into which the GKE cluster will be deployed." + type = string +} + +variable "subnetwork_id" { + description = "The self-link or name of the subnetwork within the VPC for the GKE cluster nodes." 
+  type        = string
+}
+
+variable "master_ipv4_cidr_block" {
+  description = "The IPv4 CIDR block for the GKE control plane's private endpoint. Must be a /28 that does not overlap any subnet range in the VPC."
+  type        = string
+  default     = "172.16.0.0/28"
+}
\ No newline at end of file
diff --git a/terraform/modules/gke/versions.tf b/terraform/modules/gke/versions.tf
new file mode 100644
index 0000000000..3a1bb14de7
--- /dev/null
+++ b/terraform/modules/gke/versions.tf
@@ -0,0 +1,10 @@
+terraform {
+  required_version = ">= 1.3.0"
+
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = ">= 5.0"
+    }
+  }
+}
\ No newline at end of file
diff --git a/terraform/modules/load-balancer/main.tf b/terraform/modules/load-balancer/main.tf
new file mode 100644
index 0000000000..9c1e142554
--- /dev/null
+++ b/terraform/modules/load-balancer/main.tf
@@ -0,0 +1,123 @@
+# ---------------------------------------------------------------------------
+# Health Check – probes /status on port 1317 (default) via HTTP
+# ---------------------------------------------------------------------------
+resource "google_compute_health_check" "this" {
+  name        = "${var.name}-hc"
+  project     = var.project_id
+  description = "Health check for ${var.name} load balancer"
+
+  http_health_check {
+    port         = var.health_check_port
+    request_path = var.health_check_path
+  }
+
+  check_interval_sec  = 10
+  timeout_sec         = 5
+  healthy_threshold   = 2
+  unhealthy_threshold = 3
+}
+
+# ---------------------------------------------------------------------------
+# Backend Service – no CDN (real-time blockchain data), 30s timeout
+# ---------------------------------------------------------------------------
+resource "google_compute_backend_service" "this" {
+  name    = "${var.name}-backend"
+  project = var.project_id
+
+  protocol    = "HTTP"
+  port_name   = "http"
+  timeout_sec = 30
+
+  # Must match the scheme of the forwarding rules in this module; leaving the
+  # default ("EXTERNAL") makes the API reject the EXTERNAL_MANAGED rules.
+  load_balancing_scheme = "EXTERNAL_MANAGED"
+
+  health_checks = [google_compute_health_check.this.id]
+
+  # CDN is disabled by default; explicitly set to false
+  enable_cdn = false
+
+  # When var.backend_service is set, it is attached as the only backend group
+  # (expected to be the self-link of an instance group or NEG).
+  # When it is empty, the backend list is left empty so that backends can be
+  # attached later / managed outside this module.
+  dynamic "backend" {
+    for_each = var.backend_service != "" ? [var.backend_service] : []
+    content {
+      group = backend.value
+    }
+  }
+}
+
+# ---------------------------------------------------------------------------
+# URL Map – routes all traffic to the backend service
+# ---------------------------------------------------------------------------
+resource "google_compute_url_map" "this" {
+  name            = "${var.name}-url-map"
+  project         = var.project_id
+  default_service = google_compute_backend_service.this.id
+
+  description = "URL map for ${var.name}"
+}
+
+# ---------------------------------------------------------------------------
+# Managed SSL Certificate (conditional – only when domain is provided)
+# ---------------------------------------------------------------------------
+resource "google_compute_managed_ssl_certificate" "this" {
+  count   = var.domain != "" ? 1 : 0
+  name    = "${var.name}-ssl-cert"
+  project = var.project_id
+
+  managed {
+    domains = [var.domain]
+  }
+}
+
+# ---------------------------------------------------------------------------
+# Target HTTPS Proxy (when domain/SSL cert exists)
+# ---------------------------------------------------------------------------
+resource "google_compute_target_https_proxy" "this" {
+  count            = var.domain != "" ?
1 : 0 + name = "${var.name}-https-proxy" + project = var.project_id + url_map = google_compute_url_map.this.id + ssl_certificates = [google_compute_managed_ssl_certificate.this[0].id] +} + +# --------------------------------------------------------------------------- +# Target HTTP Proxy (when NO domain – fallback) +# --------------------------------------------------------------------------- +resource "google_compute_target_http_proxy" "this" { + count = var.domain == "" ? 1 : 0 + name = "${var.name}-http-proxy" + project = var.project_id + url_map = google_compute_url_map.this.id +} + +# --------------------------------------------------------------------------- +# Global Forwarding Rule – HTTPS (when domain provided) +# --------------------------------------------------------------------------- +resource "google_compute_global_forwarding_rule" "https" { + count = var.domain != "" ? 1 : 0 + name = "${var.name}-https-fwd-rule" + project = var.project_id + target = google_compute_target_https_proxy.this[0].id + port_range = "443" + ip_protocol = "TCP" + + load_balancing_scheme = "EXTERNAL_MANAGED" +} + +# --------------------------------------------------------------------------- +# Global Forwarding Rule – HTTP (when no domain) +# --------------------------------------------------------------------------- +resource "google_compute_global_forwarding_rule" "http" { + count = var.domain == "" ? 
1 : 0 + name = "${var.name}-http-fwd-rule" + project = var.project_id + target = google_compute_target_http_proxy.this[0].id + port_range = "80" + ip_protocol = "TCP" + + load_balancing_scheme = "EXTERNAL_MANAGED" +} \ No newline at end of file diff --git a/terraform/modules/load-balancer/outputs.tf b/terraform/modules/load-balancer/outputs.tf new file mode 100644 index 0000000000..b1cb008057 --- /dev/null +++ b/terraform/modules/load-balancer/outputs.tf @@ -0,0 +1,19 @@ +output "load_balancer_ip" { + description = "External IP address of the global forwarding rule" + value = var.domain != "" ? google_compute_global_forwarding_rule.https[0].ip_address : google_compute_global_forwarding_rule.http[0].ip_address +} + +output "forwarding_rule_name" { + description = "Name of the global forwarding rule" + value = var.domain != "" ? google_compute_global_forwarding_rule.https[0].name : google_compute_global_forwarding_rule.http[0].name +} + +output "ssl_certificate_name" { + description = "Name of the managed SSL certificate (empty if no domain provided)" + value = var.domain != "" ? google_compute_managed_ssl_certificate.this[0].name : "" +} + +output "url_map_name" { + description = "Name of the URL map" + value = google_compute_url_map.this.name +} \ No newline at end of file diff --git a/terraform/modules/load-balancer/variables.tf b/terraform/modules/load-balancer/variables.tf new file mode 100644 index 0000000000..83072e16a9 --- /dev/null +++ b/terraform/modules/load-balancer/variables.tf @@ -0,0 +1,38 @@ +variable "project_id" { + description = "GCP project ID" + type = string +} + +variable "name" { + description = "Name prefix for load balancer resources" + type = string +} + +variable "region" { + description = "GCP region for the load balancer resources" + type = string +} + +variable "domain" { + description = "Domain for managed SSL certificate. If empty, an HTTP proxy is created instead of HTTPS." 
+ type = string + default = "" +} + +variable "backend_service" { + description = "Self-link of the backend service or NEG to attach to the load balancer" + type = string + default = "" +} + +variable "health_check_path" { + description = "HTTP path for health check probes" + type = string + default = "/status" +} + +variable "health_check_port" { + description = "Port for health check probes" + type = number + default = 1317 +} \ No newline at end of file diff --git a/terraform/modules/load-balancer/versions.tf b/terraform/modules/load-balancer/versions.tf new file mode 100644 index 0000000000..710ec2ecf0 --- /dev/null +++ b/terraform/modules/load-balancer/versions.tf @@ -0,0 +1,10 @@ +terraform { + required_version = ">= 1.0" + + required_providers { + google = { + source = "hashicorp/google" + version = ">= 5.0" + } + } +} \ No newline at end of file diff --git a/terraform/modules/monitoring/dashboard.json.tpl b/terraform/modules/monitoring/dashboard.json.tpl new file mode 100644 index 0000000000..ecfc29ab5a --- /dev/null +++ b/terraform/modules/monitoring/dashboard.json.tpl @@ -0,0 +1,119 @@ +{ + "displayName": "Sifnode Validator Overview", + "mosaicLayout": { + "columns": 12, + "tiles": [ + { + "width": 6, + "height": 4, + "widget": { + "title": "CPU Utilization", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\"%{if cluster_filter != ""} AND ${cluster_filter}%{endif}", + "plotType": "LINE" + } + } + ], + "timeshift": "0s", + "yAxis": { + "label": "CPU" + } + } + } + }, + { + "width": 6, + "height": 4, + "widget": { + "title": "Memory Utilization", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\"%{if cluster_filter != ""} AND ${cluster_filter}%{endif}", + "plotType": "LINE" + } + } + ], + "timeshift": "0s", + "yAxis": { + "label": 
"Memory (bytes)" + } + } + } + }, + { + "width": 6, + "height": 4, + "widget": { + "title": "Disk Read / Write", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"agent.googleapis.com/disk/read_bytes_count\" resource.type=\"gce_instance\"%{if cluster_filter != ""} AND ${cluster_filter}%{endif}", + "plotType": "LINE" + } + }, + { + "timeSeriesFilter": { + "filter": "metric.type=\"agent.googleapis.com/disk/write_bytes_count\" resource.type=\"gce_instance\"%{if cluster_filter != ""} AND ${cluster_filter}%{endif}", + "plotType": "LINE" + } + } + ], + "timeshift": "0s", + "yAxis": { + "label": "Bytes/sec" + } + } + } + }, + { + "width": 6, + "height": 4, + "widget": { + "title": "Received Bytes (Network)", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"kubernetes.io/pod/network/received_bytes_count\" resource.type=\"k8s_pod\"%{if cluster_filter != ""} AND ${cluster_filter}%{endif}", + "plotType": "LINE" + } + } + ], + "timeshift": "0s", + "yAxis": { + "label": "Bytes/sec" + } + } + } + }, + { + "width": 12, + "height": 4, + "widget": { + "title": "Uptime Check Status", + "xyChart": { + "dataSets": [ + { + "timeSeriesFilter": { + "filter": "metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" resource.type=\"uptime_url\"", + "plotType": "LINE" + } + } + ], + "timeshift": "0s", + "yAxis": { + "label": "Status" + } + } + } + } + ] + } +} \ No newline at end of file diff --git a/terraform/modules/monitoring/main.tf b/terraform/modules/monitoring/main.tf new file mode 100644 index 0000000000..91733e0579 --- /dev/null +++ b/terraform/modules/monitoring/main.tf @@ -0,0 +1,193 @@ +locals { + cluster_filter = var.cluster_name != "" ? 
"resource.labels.cluster_name=\"${var.cluster_name}\"" : "" +} + +# ----------------------------------------------------------------------------- +# Notification Channel +# ----------------------------------------------------------------------------- + +resource "google_monitoring_notification_channel" "email" { + display_name = "Sifnode Alert Email" + type = "email" + + labels = { + email_address = var.notification_email + } +} + +# ----------------------------------------------------------------------------- +# Alert Policy: Node Down (uptime check failure > 5 min) +# ----------------------------------------------------------------------------- + +resource "google_monitoring_alert_policy" "node_down" { + display_name = "Sifnode Node Down" + combiner = "OR" + + conditions { + display_name = "Uptime check failure for > 5 minutes" + + condition_threshold { + filter = "metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" resource.type=\"uptime_url\"" + duration = "300s" + comparison = "COMPARISON_LT" + threshold_value = 1 + + aggregations { + alignment_period = "300s" + per_series_aligner = "ALIGN_NEXT_OLDER" + } + } + } + + notification_channels = concat( + [google_monitoring_notification_channel.email.id], + var.alert_channels, + ) + + severity = "CRITICAL" +} + +# ----------------------------------------------------------------------------- +# Alert Policy: Disk Usage > 80% +# ----------------------------------------------------------------------------- + +resource "google_monitoring_alert_policy" "disk_usage" { + display_name = "Sifnode Disk Usage > 80%" + combiner = "OR" + + conditions { + display_name = "Disk usage above 80 percent" + + condition_threshold { + filter = "metric.type=\"agent.googleapis.com/disk/percent_used\" resource.type=\"gce_instance\"${local.cluster_filter != "" ? 
" AND ${local.cluster_filter}" : ""}" + duration = "300s" + comparison = "COMPARISON_GT" + + threshold_value = 80 + + aggregations { + alignment_period = "300s" + per_series_aligner = "ALIGN_MEAN" + } + } + } + + notification_channels = concat( + [google_monitoring_notification_channel.email.id], + var.alert_channels, + ) + + severity = "WARNING" +} + +# ----------------------------------------------------------------------------- +# Alert Policy: High CPU > 90% for 5 minutes +# ----------------------------------------------------------------------------- + +resource "google_monitoring_alert_policy" "high_cpu" { + display_name = "Sifnode High CPU > 90%" + combiner = "OR" + + conditions { + display_name = "CPU utilization above 90 percent for 5 minutes" + + condition_threshold { + filter = "metric.type=\"kubernetes.io/container/cpu/core_usage_time\" resource.type=\"k8s_container\"${local.cluster_filter != "" ? " AND ${local.cluster_filter}" : ""}" + duration = "300s" + comparison = "COMPARISON_GT" + + threshold_value = 0.9 + + aggregations { + alignment_period = "300s" + per_series_aligner = "ALIGN_RATE" + } + } + } + + notification_channels = concat( + [google_monitoring_notification_channel.email.id], + var.alert_channels, + ) + + severity = "WARNING" +} + +# ----------------------------------------------------------------------------- +# Alert Policy: Memory Usage > 85% +# ----------------------------------------------------------------------------- + +resource "google_monitoring_alert_policy" "memory_usage" { + display_name = "Sifnode Memory Usage > 85%" + combiner = "OR" + + conditions { + display_name = "Memory usage above 85 percent" + + condition_threshold { + filter = "metric.type=\"kubernetes.io/container/memory/used_bytes\" resource.type=\"k8s_container\"${local.cluster_filter != "" ? 
" AND ${local.cluster_filter}" : ""}" + duration = "300s" + comparison = "COMPARISON_GT" + + threshold_value = 0.85 + + aggregations { + alignment_period = "300s" + per_series_aligner = "ALIGN_MEAN" + } + } + } + + notification_channels = concat( + [google_monitoring_notification_channel.email.id], + var.alert_channels, + ) + + severity = "WARNING" +} + +# ----------------------------------------------------------------------------- +# Alert Policy: Sync Lag > 10 Blocks (placeholder — custom metric) +# ----------------------------------------------------------------------------- + +resource "google_monitoring_alert_policy" "sync_lag" { + display_name = "Sifnode Sync Lag > 10 Blocks" + combiner = "OR" + + conditions { + display_name = "Sync lag above 10 blocks" + + condition_threshold { + filter = "metric.type=\"custom.googleapis.com/sifnode/sync_lag\" resource.type=\"generic_node\"" + duration = "300s" + comparison = "COMPARISON_GT" + + threshold_value = 10 + + aggregations { + alignment_period = "300s" + per_series_aligner = "ALIGN_MEAN" + } + } + } + + notification_channels = concat( + [google_monitoring_notification_channel.email.id], + var.alert_channels, + ) + + severity = "WARNING" +} + +# ----------------------------------------------------------------------------- +# Dashboard: Sifnode Validator Overview +# ----------------------------------------------------------------------------- + +resource "google_monitoring_dashboard" "validator_overview" { + dashboard_json = templatefile( + "${path.module}/dashboard.json.tpl", + { + cluster_filter = local.cluster_filter + } + ) +} \ No newline at end of file diff --git a/terraform/modules/monitoring/outputs.tf b/terraform/modules/monitoring/outputs.tf new file mode 100644 index 0000000000..9c496c4b2b --- /dev/null +++ b/terraform/modules/monitoring/outputs.tf @@ -0,0 +1,25 @@ +output "notification_channel_id" { + description = "ID of the created email notification channel" + value = 
google_monitoring_notification_channel.email.id +} + +output "notification_channel_name" { + description = "Full resource name of the email notification channel" + value = google_monitoring_notification_channel.email.name +} + +output "alert_policy_ids" { + description = "Map of alert policy IDs keyed by name" + value = { + node_down = google_monitoring_alert_policy.node_down.id + disk_usage = google_monitoring_alert_policy.disk_usage.id + high_cpu = google_monitoring_alert_policy.high_cpu.id + memory_usage = google_monitoring_alert_policy.memory_usage.id + sync_lag = google_monitoring_alert_policy.sync_lag.id + } +} + +output "dashboard_id" { + description = "ID of the Sifnode Validator Overview dashboard" + value = google_monitoring_dashboard.validator_overview.id +} \ No newline at end of file diff --git a/terraform/modules/monitoring/variables.tf b/terraform/modules/monitoring/variables.tf new file mode 100644 index 0000000000..67c4f540a5 --- /dev/null +++ b/terraform/modules/monitoring/variables.tf @@ -0,0 +1,21 @@ +variable "project_id" { + description = "GCP project ID where monitoring resources will be created" + type = string +} + +variable "notification_email" { + description = "Email address for alert notifications" + type = string +} + +variable "alert_channels" { + description = "Additional notification channel IDs to attach to alert policies" + type = list(string) + default = [] +} + +variable "cluster_name" { + description = "GKE cluster name used for metric filtering (optional)" + type = string + default = "" +} \ No newline at end of file diff --git a/terraform/modules/monitoring/versions.tf b/terraform/modules/monitoring/versions.tf new file mode 100644 index 0000000000..710ec2ecf0 --- /dev/null +++ b/terraform/modules/monitoring/versions.tf @@ -0,0 +1,10 @@ +terraform { + required_version = ">= 1.0" + + required_providers { + google = { + source = "hashicorp/google" + version = ">= 5.0" + } + } +} \ No newline at end of file diff --git 
a/terraform/modules/vpc/main.tf b/terraform/modules/vpc/main.tf
new file mode 100644
index 0000000000..f592ca149e
--- /dev/null
+++ b/terraform/modules/vpc/main.tf
@@ -0,0 +1,122 @@
+# -----------------------------------------------------------------------------
+# VPC Module - Main Resources
+# Sifnode GCP Network
+# -----------------------------------------------------------------------------
+
+resource "google_compute_network" "vpc" {
+  name                    = var.vpc_name
+  project                 = var.project_id
+  auto_create_subnetworks = false
+  routing_mode            = "REGIONAL"
+}
+
+resource "google_compute_subnetwork" "subnet" {
+  name          = "${var.vpc_name}-subnet"
+  project       = var.project_id
+  region        = var.region
+  network       = google_compute_network.vpc.id
+  ip_cidr_range = var.subnet_cidr
+
+  secondary_ip_range {
+    range_name    = "pods"
+    ip_cidr_range = var.pods_cidr
+  }
+
+  secondary_ip_range {
+    range_name    = "services"
+    ip_cidr_range = var.services_cidr
+  }
+
+  private_ip_google_access = true
+}
+
+# Firewall rules for Sifnode
+resource "google_compute_firewall" "allow_internal" {
+  name    = "${var.vpc_name}-allow-internal"
+  project = var.project_id
+  network = google_compute_network.vpc.name
+
+  allow {
+    protocol = "tcp"
+    ports    = ["0-65535"]
+  }
+
+  allow {
+    protocol = "udp"
+    ports    = ["0-65535"]
+  }
+
+  allow {
+    protocol = "icmp"
+  }
+
+  source_ranges = [var.subnet_cidr]
+  description   = "Allow internal communication within the VPC"
+}
+
+resource "google_compute_firewall" "allow_sifnode_ports" {
+  name    = "${var.vpc_name}-allow-sifnode"
+  project = var.project_id
+  network = google_compute_network.vpc.name
+
+  # P2P port (Tendermint)
+  allow {
+    protocol = "tcp"
+    ports    = ["26656"]
+  }
+
+  # RPC port
+  allow {
+    protocol = "tcp"
+    ports    = ["26657"]
+  }
+
+  # REST API port (the health check probes this same port, so it is listed once)
+  allow {
+    protocol = "tcp"
+    ports    = ["1317"]
+  }
+
+  # Open to the whole internet by default, as before; narrow
+  # var.sifnode_ingress_cidrs to stop exposing RPC / REST publicly.
+  source_ranges = var.sifnode_ingress_cidrs
+  target_tags   = ["sifnode-validator"]
+  description   = "Allow Sifnode P2P, RPC, and REST API ports"
+}
+
+resource "google_compute_firewall" "allow_health_checks" {
+  name    = "${var.vpc_name}-allow-health-checks"
+  project = var.project_id
+  network = google_compute_network.vpc.name
+
+  allow {
+    protocol = "tcp"
+    ports    = ["1317", "26657"]
+  }
+
+  source_ranges = ["35.191.0.0/16", "130.211.0.0/22"]
+  target_tags   = ["sifnode-validator"]
+  description   = "Allow GCP health check probes"
+}
+
+# Cloud NAT for egress (private nodes need outbound access)
+resource "google_compute_router" "router" {
+  name    = "${var.vpc_name}-router"
+  project = var.project_id
+  region  = var.region
+  network = google_compute_network.vpc.id
+}
+
+resource "google_compute_router_nat" "nat" {
+  name                               = "${var.vpc_name}-nat"
+  project                            = var.project_id
+  region                             = var.region
+  router                             = google_compute_router.router.name
+  nat_ip_allocate_option             = "AUTO_ONLY"
+  source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_IP_RANGES"
+
+  log_config {
+    enable = true
+    filter = "ERRORS_ONLY"
+  }
+}
\ No newline at end of file
diff --git a/terraform/modules/vpc/outputs.tf b/terraform/modules/vpc/outputs.tf
new file mode 100644
index 0000000000..ebfdeb5c1a
--- /dev/null
+++ b/terraform/modules/vpc/outputs.tf
@@ -0,0 +1,39 @@
+# -----------------------------------------------------------------------------
+# VPC Module - Outputs
+# Sifnode GCP Network
+# -----------------------------------------------------------------------------
+
+output "vpc_id" {
+  description = "The self-link ID of the VPC network"
+  value       = google_compute_network.vpc.id
+}
+
+output "vpc_name" {
+  description = "The name of the VPC network"
+  value       = google_compute_network.vpc.name
+}
+
+output "subnet_id" {
+  description = "The self-link ID of the subnetwork"
+  value       = google_compute_subnetwork.subnet.id
+}
+
+output "subnet_name" {
+  description = "The name of the subnetwork"
+  value       = google_compute_subnetwork.subnet.name
+}
+
+output "pods_range_name" {
+  description = "The secondary range name for GKE pods"
+  value       = "pods"
+}
+
+output "services_range_name" {
+  description = "The secondary range name for GKE services"
+  value       = "services"
+}
+
+output "router_name" {
+  description = "The name of the Cloud Router"
+  value       = google_compute_router.router.name
+}
\ No newline at end of file
diff --git a/terraform/modules/vpc/variables.tf b/terraform/modules/vpc/variables.tf
new file mode 100644
index 0000000000..81e472f414
--- /dev/null
+++ b/terraform/modules/vpc/variables.tf
@@ -0,0 +1,44 @@
+# -----------------------------------------------------------------------------
+# VPC Module - Input Variables
+# Sifnode GCP Network
+# -----------------------------------------------------------------------------
+
+variable "project_id" {
+  description = "The GCP project ID where the VPC will be created."
+  type        = string
+}
+
+variable "region" {
+  description = "The GCP region for subnetwork creation."
+  type        = string
+}
+
+variable "vpc_name" {
+  description = "The name of the VPC network."
+  type        = string
+  default     = "sifnode-vpc"
+}
+
+variable "subnet_cidr" {
+  description = "CIDR range for the primary subnetwork."
+  type        = string
+  default     = "10.0.0.0/20"
+}
+
+variable "pods_cidr" {
+  description = "CIDR range for GKE pods (secondary IP range)."
+  type        = string
+  default     = "10.4.0.0/14"
+}
+
+variable "services_cidr" {
+  description = "CIDR range for GKE services (secondary IP range)."
+  type        = string
+  default     = "10.8.0.0/16"
+}
+
+variable "sifnode_ingress_cidrs" {
+  description = "Source CIDR ranges allowed to reach the Sifnode P2P, RPC and REST API ports. Defaults to the public internet."
+  type        = list(string)
+  default     = ["0.0.0.0/0"]
+}
\ No newline at end of file
diff --git a/terraform/modules/vpc/versions.tf b/terraform/modules/vpc/versions.tf
new file mode 100644
index 0000000000..2a07aef3b3
--- /dev/null
+++ b/terraform/modules/vpc/versions.tf
@@ -0,0 +1,9 @@
+terraform {
+  required_version = ">= 1.3.0"
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = ">= 5.0"
+    }
+  }
+}
\ No newline at end of file
diff --git a/terraform/outputs.tf b/terraform/outputs.tf
new file mode 100644
index 0000000000..bb881d7e31
--- /dev/null
+++ b/terraform/outputs.tf
@@ -0,0 +1,83 @@
+# -----------------------------------------------------------------------------
+# Sifnode GCP - Root Outputs
+# -----------------------------------------------------------------------------
+
+# VPC
+output "vpc_id" {
+  description = "VPC network self-link"
+  value       = module.vpc.vpc_id
+}
+
+output "subnet_name" {
+  description = "Subnetwork name"
+  value       = module.vpc.subnet_name
+}
+
+# GKE
+output "gke_cluster_name" {
+  description = "GKE cluster name"
+  value       = module.gke.cluster_name
+}
+
+output "gke_cluster_id" {
+  description = "GKE cluster ID"
+  value       = module.gke.cluster_id
+}
+
+output "gke_endpoint" {
+  description = "GKE API server endpoint"
+  value       = module.gke.endpoint
+}
+
+output "gke_service_account" {
+  description = "GKE node pool service account email"
+  value       = module.gke.service_account
+}
+
+# Cloud SQL
+output "cloud_sql_instance_name" {
+  description = "Cloud SQL instance name"
+  value       = module.cloud_sql.instance_name
+}
+
+output "cloud_sql_connection_name" {
+  description = "Cloud SQL connection name (project:region:instance)"
+  value       = module.cloud_sql.instance_connection_name
+}
+
+output "cloud_sql_private_ip" {
+  description = "Cloud SQL private IP address"
+  value       = module.cloud_sql.private_ip
+}
+
+output "cloud_sql_database_name" {
+  description = "Database name"
+  value       = module.cloud_sql.database_name
+}
+
+# Load Balancer
+output "load_balancer_ip" {
+  value       = module.load_balancer.load_balancer_ip
+  description = "External IP of the load balancer"
+}
+
+output "ssl_certificate_name" {
+  value       = module.load_balancer.ssl_certificate_name
+  description = "SSL certificate name (empty if no domain)"
+}
+
+# Monitoring
+output "notification_channel_id" {
+  value       = module.monitoring.notification_channel_id
+  description = "Monitoring notification channel ID"
+}
+
+output "dashboard_id" {
+  value       = module.monitoring.dashboard_id
+  description = "Monitoring dashboard ID"
+}
+
+output "alert_policy_ids" {
+  value       = module.monitoring.alert_policy_ids
+  description = "Map of alert policy IDs"
+}
\ No newline at end of file
diff --git a/terraform/schemerr.txt b/terraform/schemerr.txt
new file mode 100644
index 0000000000..db95ac1bc2
--- /dev/null
+++ b/terraform/schemerr.txt
@@ -0,0 +1,22 @@
+╷
+│ Error: Backend initialization required, please run "terraform init"
+│ 
+│ Reason: Initial configuration of the requested backend "gcs"
+│ 
+│ The "backend" is the interface that Terraform uses to store state,
+│ perform operations, etc. If this message is showing up, it means that the
+│ Terraform configuration you're using is using a custom configuration for
+│ the Terraform backend.
+│ 
+│ Changes to backend configurations require reinitialization. This allows
+│ Terraform to set up the new configuration, copy existing state, etc. Please
+│ run
+│ "terraform init" with either the "-reconfigure" or "-migrate-state" flags
+│ to
+│ use the current configuration.
+│ 
+│ If the change reason above is incorrect, please verify your configuration
+│ hasn't changed and try again. At this point, no changes to your existing
+│ configuration or state have been made.
+╵
+
diff --git a/terraform/variables.tf b/terraform/variables.tf
new file mode 100644
index 0000000000..6ac5f0f9f6
--- /dev/null
+++ b/terraform/variables.tf
@@ -0,0 +1,120 @@
+# -----------------------------------------------------------------------------
+# Sifnode GCP - Root Variables
+# -----------------------------------------------------------------------------
+
+variable "project_id" {
+  description = "GCP project ID for all resources"
+  type        = string
+}
+
+variable "region" {
+  description = "GCP region for resource deployment"
+  type        = string
+  default     = "us-central1"
+}
+
+# VPC
+variable "vpc_name" {
+  description = "Name of the VPC network"
+  type        = string
+  default     = "sifnode-vpc"
+}
+
+variable "subnet_cidr" {
+  description = "CIDR range for the primary subnetwork"
+  type        = string
+  default     = "10.0.0.0/20"
+
+  validation {
+    condition     = can(cidrhost(var.subnet_cidr, 0))
+    error_message = "The subnet_cidr value must be a valid CIDR block, for example 10.0.0.0/20."
+  }
+}
+
+# GKE
+variable "gke_cluster_name" {
+  description = "Name of the GKE cluster"
+  type        = string
+}
+
+variable "gke_node_count_min" {
+  description = "Minimum number of GKE nodes (autoscaling lower bound)"
+  type        = number
+  default     = 3
+
+  validation {
+    condition     = var.gke_node_count_min >= 0 && floor(var.gke_node_count_min) == var.gke_node_count_min
+    error_message = "The gke_node_count_min value must be a whole number greater than or equal to 0."
+  }
+}
+
+variable "gke_node_count_max" {
+  description = "Maximum number of GKE nodes (autoscaling upper bound)"
+  type        = number
+  default     = 5
+
+  validation {
+    condition     = var.gke_node_count_max >= 1 && floor(var.gke_node_count_max) == var.gke_node_count_max
+    error_message = "The gke_node_count_max value must be a whole number greater than or equal to 1."
+  }
+}
+
+variable "gke_machine_type" {
+  description = "Machine type for GKE nodes"
+  type        = string
+  default     = "n2-standard-4"
+}
+
+# Cloud SQL
+variable "sql_instance_name" {
+  description = "Name of the Cloud SQL instance"
+  type        = string
+}
+
+variable "sql_database_version" {
+  description = "PostgreSQL version for Cloud SQL"
+  type        = string
+  default     = "POSTGRES_14"
+}
+
+variable "sql_tier" {
+  description = "Machine tier for Cloud SQL"
+  type        = string
+  default     = "db-custom-2-4096"
+}
+
+variable "sql_database_name" {
+  description = "Name of the application database"
+  type        = string
+  default     = "sifnodedb"
+}
+
+variable "sql_deletion_protection" {
+  description = "Enable deletion protection on Cloud SQL"
+  type        = bool
+  default     = true
+}
+
+# Load Balancer
+variable "lb_domain" {
+  description = "Domain for managed SSL certificate (empty = HTTP only)"
+  type        = string
+  default     = ""
+}
+
+# Monitoring
+variable "notification_email" {
+  description = "Email address for alert notifications"
+  type        = string
+
+  validation {
+    condition     = can(regex("^[^@\\s]+@[^@\\s]+$", var.notification_email))
+    error_message = "The notification_email value must be a single email address of the form user@example.com."
+  }
+}
+
+variable "alert_channels" {
+  description = "Additional alert notification channel IDs"
+  type        = list(string)
+  default     = []
+}
\ No newline at end of file
diff --git a/terraform/versions.tf b/terraform/versions.tf
new file mode 100644
index 0000000000..29f8216b38
--- /dev/null
+++ b/terraform/versions.tf
@@ -0,0 +1,19 @@
+terraform {
+  required_version = ">= 1.3.0"
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = ">= 5.0"
+    }
+  }
+
+  backend "gcs" {
+    bucket = "sifnode-gcp-terraform-state"
+    prefix = "terraform/state"
+  }
+}
+
+provider "google" {
+  project = var.project_id
+  region  = var.region
+}
\ No newline at end of file