diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..8d10708 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,11 @@ +{ + "permissions": { + "allow": [ + "Bash(git add:*)", + "Bash(git commit:*)", + "Bash(git push:*)", + "Bash(curl:*)" + ], + "deny": [] + } +} \ No newline at end of file diff --git a/.github/workflows/database-dump.yml b/.github/workflows/database-dump.yml index 11d5d72..0105a72 100644 --- a/.github/workflows/database-dump.yml +++ b/.github/workflows/database-dump.yml @@ -14,10 +14,40 @@ jobs: actions: write steps: - - name: Set up PostgreSQL client + - name: Cache PostgreSQL client + id: cache-postgresql + uses: actions/cache@v3 + with: + path: | + /usr/lib/postgresql/17 + /usr/share/postgresql/17 + /usr/bin/pg_dump + /usr/bin/pg_restore + /usr/bin/psql + key: ${{ runner.os }}-postgresql-client-17-${{ hashFiles('.github/workflows/database-dump.yml') }} + + - name: Set up PostgreSQL 17 client + if: steps.cache-postgresql.outputs.cache-hit != 'true' run: | + # Add PostgreSQL official APT repository using the new method sudo apt-get update - sudo apt-get install -y postgresql-client + sudo apt-get install -y wget ca-certificates gnupg + + # Create keyrings directory if it doesn't exist + sudo mkdir -p /usr/share/keyrings + + # Download and add the signing key with proper conversion + wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | \ + gpg --dearmor | \ + sudo tee /usr/share/keyrings/postgresql-archive-keyring.gpg > /dev/null + + # Add the repository with signed-by option + echo "deb [signed-by=/usr/share/keyrings/postgresql-archive-keyring.gpg] http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | \ + sudo tee /etc/apt/sources.list.d/pgdg.list + + # Update and install PostgreSQL 17 client + sudo apt-get update + sudo apt-get install -y postgresql-client-17 - name: Create dump directory run: mkdir -p database_dumps @@ -33,75 +63,26 @@ jobs: export PGUSER=$(echo $DATABASE_URL | sed -E 's/postgres:\/\/([^:]+):.*/\1/') export PGPASSWORD=$(echo $DATABASE_URL | sed -E 's/postgres:\/\/[^:]+:([^@]+)@.*/\1/') - # Create dump excluding users table - DUMP_FILE="database_dumps/db_dump_$(date +%Y%m%d_%H%M%S).sql" + # Create dump excluding users table in custom archive format + DUMP_FILE="database_dumps/db_dump_$(date +%Y%m%d_%H%M%S).dump" - # Dump schema and data, excluding the users table + # Dump schema and data in custom format, excluding the users table pg_dump --no-owner --no-privileges \ + --format=custom \ --exclude-table=users \ --exclude-table=schema_migrations \ --exclude-table=ar_internal_metadata \ -f "$DUMP_FILE" - # Compress the dump - gzip "$DUMP_FILE" - echo "DUMP_FILE=${DUMP_FILE}.gz" >> $GITHUB_ENV - echo "Dump created: ${DUMP_FILE}.gz" + echo "DUMP_FILE=${DUMP_FILE}" >> $GITHUB_ENV + echo "Dump created: ${DUMP_FILE}" - - name: Upload dump as artifact using GitHub API - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # Get the dump file name - DUMP_FILE_NAME=$(basename "$DUMP_FILE") - - # Create a unique artifact name with timestamp - ARTIFACT_NAME="database-dump-$(date +%Y%m%d-%H%M%S)" - - # Get workflow run ID - RUN_ID="${{ github.run_id }}" - - # Create artifact upload - echo "Creating artifact upload..." - UPLOAD_RESPONSE=$(curl -L \ - -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - "https://api.github.com/repos/${{ github.repository }}/actions/runs/${RUN_ID}/artifacts" \ - -d "{\"name\":\"${ARTIFACT_NAME}\", \"retention_days\": 30}") - - # Extract upload URL and other details - UPLOAD_URL=$(echo "$UPLOAD_RESPONSE" | jq -r '.upload_url') - ARTIFACT_ID=$(echo "$UPLOAD_RESPONSE" | jq -r '.id') - - if [ "$UPLOAD_URL" = "null" ] || [ -z "$UPLOAD_URL" ]; then - echo "Failed to create artifact upload" - echo "Response: $UPLOAD_RESPONSE" - exit 1 - fi - - # Upload the file - echo "Uploading dump file..." - curl -L \ - -X PUT \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - -H "Content-Type: application/gzip" \ - --data-binary "@$DUMP_FILE" \ - "$UPLOAD_URL" - - # Finalize the artifact - echo "Finalizing artifact..." - curl -L \ - -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - "https://api.github.com/repos/${{ github.repository }}/actions/artifacts/${ARTIFACT_ID}/finalize" - - echo "Database dump uploaded as artifact: ${ARTIFACT_NAME}" + - name: Upload dump as artifact + uses: actions/upload-artifact@v4 + with: + name: database-dump-${{ github.run_number }}-${{ github.run_attempt }} + path: ${{ env.DUMP_FILE }} + retention-days: 30 - name: Clean up old artifacts env: diff --git a/README.md b/README.md index e123dae..36306c8 100644 --- a/README.md +++ b/README.md @@ -38,12 +38,12 @@ These are extracted using an LLM from the Entry's raw data. Each entry might have multiple activities. `Evidence`: - Evidence links an Activity to a Promise. They are linked using an LLM. + Evidence links an Activity to a Promise. They are linked using an LLM. ### 🛠 Setup -Ensure you have Ruby and PostgresQL installed +Ensure you have Ruby, PostgreSQL and the Github CLI installed ```bash # Install dependencies @@ -55,9 +55,41 @@ sudo service postgresql start # Setup database rails db:create -rails db:migrate -rails db:seed +rake db:fetch_and_restore # Run the server rails s ``` + +### 🚀 Developer Onboarding + +For new developers joining the project, we provide a streamlined onboarding process using production database dumps: + +#### Quick Start with Production Data + +1. **Prerequisites**: + - Install the GitHub CLI: https://cli.github.com/ + - Authenticate with: `gh auth login` + +2. **Restore from Latest Database Dump**: + ```bash + # List available database dumps + rake db:list_dumps + + # Fetch and restore the latest production database dump + # This will download the most recent weekly backup and restore it locally + rake db:fetch_and_restore + ``` + +3. **What's Included**: The database dump includes all production data except: + - User accounts (for privacy/security) + - Schema migrations metadata + - Internal Rails metadata + +4. **Post-Restore**: After restoring, the rake task automatically runs any pending migrations + +#### Database Dumps Schedule + +- Production database is automatically dumped weekly (every Monday at 2 AM UTC) +- Dumps are stored as GitHub Actions artifacts for 30 days +- Dumps use PostgreSQL's custom archive format for efficient storage and restore diff --git a/lib/tasks/database_restore.rake b/lib/tasks/database_restore.rake new file mode 100644 index 0000000..c2e6349 --- /dev/null +++ b/lib/tasks/database_restore.rake @@ -0,0 +1,216 @@ +namespace :db do + desc "Fetch latest database dump from GitHub artifacts and restore it" + task fetch_and_restore: :environment do + require "net/http" + require "json" + require "fileutils" + require "open3" + + # GitHub API configuration + repo = ENV["GITHUB_REPOSITORY"] || "BuildCanada/OutcomeTrackerAPI" + + puts "Fetching latest database dump artifact..." + + # Get list of artifacts + uri = URI("https://api.github.com/repos/#{repo}/actions/artifacts") + uri.query = URI.encode_www_form(per_page: 100) + + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = true + + request = Net::HTTP::Get.new(uri) + request["Accept"] = "application/vnd.github+json" + # No authorization needed for public repositories + request["X-GitHub-Api-Version"] = "2022-11-28" + + response = http.request(request) + + if response.code != "200" + puts "Error fetching artifacts: #{response.code} #{response.body}" + exit 1 + end + + artifacts = JSON.parse(response.body)["artifacts"] + + # Find the latest database dump artifact + dump_artifacts = artifacts.select { |a| a["name"].start_with?("database-dump-") } + + if dump_artifacts.empty? + puts "No database dump artifacts found" + exit 1 + end + + latest_artifact = dump_artifacts.max_by { |a| DateTime.parse(a["created_at"]) } + + puts "Found artifact: #{latest_artifact['name']} (created: #{latest_artifact['created_at']})" + + # Check if gh CLI is installed + unless system("which gh > /dev/null 2>&1") + puts "Error: GitHub CLI (gh) is not installed." + puts "Please install it from: https://cli.github.com/" + exit 1 + end + + # Create temp directory + temp_dir = Rails.root.join("tmp", "database_restore") + FileUtils.mkdir_p(temp_dir) + + # Download the artifact using gh CLI + puts "Downloading artifact using GitHub CLI..." + download_cmd = "gh api repos/#{repo}/actions/artifacts/#{latest_artifact['id']}/zip > #{temp_dir}/artifact.zip" + + success = system(download_cmd) + + unless success + puts "Error downloading artifact" + puts "Make sure you're authenticated with: gh auth login" + exit 1 + end + + zip_file = temp_dir.join("artifact.zip") + puts "Downloaded artifact to #{zip_file}" + + # Extract the zip file + system("unzip -o #{zip_file} -d #{temp_dir}") or raise "Failed to extract artifact" + + # Find the dump file + dump_file = Dir.glob(temp_dir.join("*.dump")).first + + if dump_file.nil? + puts "No dump file found in artifact" + exit 1 + end + + puts "Found dump file: #{dump_file}" + + # Restore the database + Rake::Task["db:restore"].invoke(dump_file) + + # Cleanup + FileUtils.rm_rf(temp_dir) + + puts "Database restore complete!" + end + + desc "Restore database from a dump file" + task :restore, [ :dump_file ] => :environment do |t, args| + dump_file = args[:dump_file] + + if dump_file.nil? || !File.exist?(dump_file) + puts "Error: Dump file not found: #{dump_file}" + exit 1 + end + + # Confirm before proceeding + unless ENV["SKIP_CONFIRMATION"] == "true" + puts "\nWARNING: This will restore the database from #{dump_file}" + puts "This will DROP and recreate all tables except 'users', 'schema_migrations', and 'ar_internal_metadata'" + puts "Are you sure? Type 'yes' to continue:" + + confirmation = STDIN.gets.chomp + unless confirmation.downcase == "yes" + puts "Aborted" + exit 0 + end + end + + # Get database configuration + db_config = Rails.configuration.database_configuration[Rails.env] + + # Build pg_restore command + pg_restore_cmd = [ "pg_restore" ] + + # Connection parameters + pg_restore_cmd << "--host=#{db_config['host']}" if db_config["host"] + pg_restore_cmd << "--port=#{db_config['port']}" if db_config["port"] + pg_restore_cmd << "--username=#{db_config['username']}" if db_config["username"] + pg_restore_cmd << "--dbname=#{db_config['database']}" + + # Restore options + pg_restore_cmd << "--clean" # Clean (drop) database objects before recreating + pg_restore_cmd << "--if-exists" # Use IF EXISTS when dropping objects + pg_restore_cmd << "--no-owner" # Don't set ownership + pg_restore_cmd << "--no-privileges" # Don't restore access privileges + pg_restore_cmd << "--verbose" # Verbose output + + # The dump already excludes users, schema_migrations, and ar_internal_metadata + # so we don't need to exclude them again + + pg_restore_cmd << dump_file + + # Set PGPASSWORD environment variable if password is provided + env = {} + env["PGPASSWORD"] = db_config["password"] if db_config["password"] + + puts "Restoring database from #{dump_file}..." + + # Execute pg_restore + stdout, stderr, status = Open3.capture3(env, *pg_restore_cmd.map(&:to_s)) + + if status.success? + puts "Database restored successfully!" + else + puts "Error restoring database:" + puts stderr + exit 1 + end + + # Run any pending migrations that might have been added since the dump + puts "Running pending migrations..." + Rake::Task["db:migrate"].invoke + end + + desc "List available database dump artifacts" + task list_dumps: :environment do + require "net/http" + require "json" + + # GitHub API configuration + repo = ENV["GITHUB_REPOSITORY"] || "BuildCanada/OutcomeTrackerAPI" + + # Get list of artifacts + uri = URI("https://api.github.com/repos/#{repo}/actions/artifacts") + uri.query = URI.encode_www_form(per_page: 100) + + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = true + + request = Net::HTTP::Get.new(uri) + request["Accept"] = "application/vnd.github+json" + # No authorization needed for public repositories + request["X-GitHub-Api-Version"] = "2022-11-28" + + response = http.request(request) + + if response.code != "200" + puts "Error fetching artifacts: #{response.code} #{response.body}" + exit 1 + end + + artifacts = JSON.parse(response.body)["artifacts"] + + # Find database dump artifacts + dump_artifacts = artifacts.select { |a| a["name"].start_with?("database-dump-") } + + if dump_artifacts.empty? + puts "No database dump artifacts found" + exit 0 + end + + puts "\nAvailable database dumps:" + puts "-" * 80 + + dump_artifacts.sort_by { |a| DateTime.parse(a["created_at"]) }.reverse.each do |artifact| + created_at = DateTime.parse(artifact["created_at"]) + size_mb = artifact["size_in_bytes"].to_f / (1024 * 1024) + expires_at = DateTime.parse(artifact["expires_at"]) + + puts "Name: #{artifact['name']}" + puts "Created: #{created_at.strftime('%Y-%m-%d %H:%M:%S UTC')}" + puts "Size: #{size_mb.round(2)} MB" + puts "Expires: #{expires_at.strftime('%Y-%m-%d %H:%M:%S UTC')}" + puts "ID: #{artifact['id']}" + puts "-" * 80 + end + end +end