-
Notifications
You must be signed in to change notification settings - Fork 0
116 lines (103 loc) · 4 KB
/
Copy pathdeploy-dataflow.yaml
File metadata and controls
116 lines (103 loc) · 4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# Manually dispatched workflow. The operator chooses the target environment,
# the Dataflow region, and the worker machine type when starting a run.
name: Deploy Dataflow Streaming

on:
  workflow_dispatch:
    inputs:
      # Selects which Cloud Run service, output table, and job prefix are used.
      environment:
        description: 'Target environment'
        type: choice
        default: staging
        options: [staging, prod]
      # Exported as REGION and passed to the pipeline's --region flag.
      region:
        description: 'GCP region for Dataflow job'
        type: choice
        default: us-central1
        options: [us-central1, us-east1, us-west1]
      # Passed to the pipeline's --worker_machine_type flag.
      worker_machine_type:
        description: 'Worker VM machine type'
        type: choice
        default: e2-standard-2
        options: [e2-standard-2, n1-standard-2, e2-standard-4, n1-standard-4]
# Workflow-level environment, visible to every step of every job.
env:
  PROJECT_ID: 'deeplearning-sahil'
  # Taken from the dispatch input so the submitted job and the monitoring
  # link both use the region the operator selected.
  REGION: '${{ inputs.region }}'
  # Pub/Sub topic handed to the pipeline as --input_topic.
  PUBSUB_TOPIC: 'projects/deeplearning-sahil/topics/iris-inference-data'
  # GCS paths handed to the pipeline as --temp_location / --staging_location.
  TEMP_LOCATION: 'gs://sb-vertex/temp'
  STAGING_LOCATION: 'gs://sb-vertex/staging'
  # Identity handed to the pipeline as --service_account_email.
  SERVICE_ACCOUNT_EMAIL: 'kfp-mlops@deeplearning-sahil.iam.gserviceaccount.com'
jobs:
  # Submits the iris streaming-inference pipeline to Dataflow for the
  # environment, region, and machine type chosen at dispatch time.
  deploy:
    runs-on: ubuntu-latest
    # Least privilege for GITHUB_TOKEN: the job only checks out the repo.
    # Google Cloud access comes from the service-account key secret instead.
    permissions:
      contents: read
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Exports CLOUD_RUN_SERVICE, OUTPUT_TABLE, and JOB_PREFIX for later
      # steps. Anything other than "prod" gets the staging values.
      - name: Set environment-specific variables
        env:
          # Inputs are passed through env rather than interpolated into the
          # script so their text is never parsed as shell code.
          TARGET_ENV: ${{ inputs.environment }}
        run: |
          if [ "$TARGET_ENV" = "prod" ]; then
            {
              echo "CLOUD_RUN_SERVICE=iris-classifier-xgboost-service"
              echo "OUTPUT_TABLE=${PROJECT_ID}:ml_dataset.iris_predictions_streaming"
              echo "JOB_PREFIX=iris-streaming-inference"
            } >> "$GITHUB_ENV"
          else
            {
              echo "CLOUD_RUN_SERVICE=iris-classifier-xgboost-service-staging"
              echo "OUTPUT_TABLE=${PROJECT_ID}:ml_dataset.iris_predictions_streaming_staging"
              echo "JOB_PREFIX=iris-streaming-inference-staging"
            } >> "$GITHUB_ENV"
          fi

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not read the version as the float 3.1.
          python-version: "3.10"

      - name: Install dependencies
        run: pip install -e .

      # NOTE(review): this uses a long-lived service-account key; consider
      # Workload Identity Federation if the project allows it.
      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }}

      - name: Set up gcloud CLI
        uses: google-github-actions/setup-gcloud@v2
        with:
          project_id: ${{ env.PROJECT_ID }}

      # Looks up the URL of the environment's Cloud Run service and exposes
      # it as the step output `url`.
      - name: Discover Cloud Run service URL
        id: service_url
        env:
          TARGET_ENV: ${{ inputs.environment }}
        run: |
          # NOTE(review): the lookup region is fixed to us-central1 and does
          # not follow the `region` input - confirm this is intentional.
          URL=$(gcloud run services describe "$CLOUD_RUN_SERVICE" \
            --region=us-central1 \
            --project="$PROJECT_ID" \
            --format='value(status.url)')
          # An empty URL would make --service_url swallow the next flag in
          # the deploy step, so fail here with a clear message instead.
          if [ -z "$URL" ]; then
            echo "::error::No URL returned for Cloud Run service $CLOUD_RUN_SERVICE"
            exit 1
          fi
          echo "url=$URL" >> "$GITHUB_OUTPUT"
          echo "Discovered service URL for $TARGET_ENV: $URL"

      # Submits the streaming pipeline with a timestamped job name.
      - name: Deploy Dataflow streaming job
        env:
          TARGET_ENV: ${{ inputs.environment }}
          WORKER_MACHINE_TYPE: ${{ inputs.worker_machine_type }}
          SERVICE_URL: ${{ steps.service_url.outputs.url }}
        run: |
          # The timestamp suffix gives each submission a distinct job name.
          JOB_NAME="${JOB_PREFIX}-$(date +%Y%m%d-%H%M%S)"
          echo "Submitting $TARGET_ENV job: $JOB_NAME"
          python src/dataflow/iris_streaming_pipeline.py \
            --input_topic "$PUBSUB_TOPIC" \
            --output_table "$OUTPUT_TABLE" \
            --project_id "$PROJECT_ID" \
            --region "$REGION" \
            --service_url "$SERVICE_URL" \
            --batch_size 50 \
            --max_batch_duration_secs 1.0 \
            --runner DataflowRunner \
            --job_name "$JOB_NAME" \
            --temp_location "$TEMP_LOCATION" \
            --staging_location "$STAGING_LOCATION" \
            --service_account_email "$SERVICE_ACCOUNT_EMAIL" \
            --use_public_ips \
            --worker_machine_type "$WORKER_MACHINE_TYPE" \
            --max_num_workers 3 \
            --autoscaling_algorithm THROUGHPUT_BASED \
            --streaming \
            --enable_streaming_engine \
            --experiments use_runner_v2 \
            --no_wait
          echo "Job submitted: $JOB_NAME"
          echo "Environment: $TARGET_ENV"
          echo "Monitor: https://console.cloud.google.com/dataflow/jobs/${REGION}?project=${PROJECT_ID}"