-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdeploy.py
More file actions
98 lines (86 loc) · 3.77 KB
/
deploy.py
File metadata and controls
98 lines (86 loc) · 3.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# deploy.py
import os
from azure.ai.ml import MLClient
from azure.ai.ml.entities import (
Model,
Environment,
ManagedOnlineEndpoint,
ManagedOnlineDeployment,
# CodeConfiguration, # Removed as MLflow model deployment doesn't need it
)
from azure.identity import DefaultAzureCredential
# ------------------------------------------------------------------
# USER SETTINGS – fill these
# ------------------------------------------------------------------
SUBSCRIPTION_ID = os.getenv("SUBSCRIPTION_ID")
RESOURCE_GROUP = "<your-resource-group>"
WORKSPACE_NAME = "<your-aml-workspace>"
COMPUTE_NAME = "<your-compute-cluster>" # for conversion job
ENDPOINT_NAME = "hf-onnx-endpoint"
DEPLOYMENT_NAME = "hf-onnx-deploy"
INSTANCE_TYPE = "Standard_DS3_v2"
# ------------------------------------------------------------------
# Fail fast with a clear message instead of passing None into MLClient,
# which would otherwise surface as a confusing auth/ARM error much later.
if not SUBSCRIPTION_ID:
    raise RuntimeError(
        "SUBSCRIPTION_ID environment variable is not set; "
        "export it before running deploy.py"
    )
# Authenticate via the default credential chain (env vars, managed identity,
# Azure CLI login, ...) and bind to the target AML workspace.
credential = DefaultAzureCredential()
ml_client = MLClient(credential, SUBSCRIPTION_ID, RESOURCE_GROUP, WORKSPACE_NAME)
# --------------------------------------------------------------
# 1. Run conversion job (only needed once)
# --------------------------------------------------------------
from azure.ai.ml import command

# Command job that runs the HF -> ONNX export script on the compute cluster,
# inside the pre-registered "hf-onnx-env" environment.
conversion_job = command(
    code="./",
    command="python convert_to_onnx.py",
    environment="hf-onnx-env@latest",
    compute=COMPUTE_NAME,
    display_name="HF → ONNX conversion",
    experiment_name="onnx-conversion",
)
print("Submitting conversion job...")
conversion_run = ml_client.jobs.create_or_update(conversion_job)
# Block until the remote job finishes, streaming its logs to the console.
ml_client.jobs.stream(conversion_run.name)
print("Conversion job finished. ONNX files are in the job output.")
# --------------------------------------------------------------
# 2. Register the ONNX model (point to job output)
# --------------------------------------------------------------
# The conversion job outputs to `azureml://jobs/<job_name>/outputs/artifacts/paths/onnx_model`
# The model is registered by convert_to_onnx.py as an MLflow model;
# here we only need to fetch the latest registered version.
MODEL_NAME = "terramind-onnx-model" # Must match the name in convert_to_onnx.py
# BUG FIX: MLClient.models.get() has no magic "latest" version string —
# version="latest" raises a not-found error at runtime. The SDK exposes
# the most recent version through the `label` parameter instead.
registered_model = ml_client.models.get(name=MODEL_NAME, label="latest")
print(f"Using MLflow Model: {registered_model.id}")
# --------------------------------------------------------------
# 3. Define environment (MLflow models use a curated environment)
# --------------------------------------------------------------
# MLflow models are deployed using a curated environment, so we don't need to define a custom one.
# We will use the built-in MLflow ONNX environment.
# env = Environment(
# name="hf-onnx-env",
# conda_file="conda.yaml",
# image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04"
# )
# --------------------------------------------------------------
# 4. Create endpoint
# --------------------------------------------------------------
# Key-based auth: clients present one of the endpoint's two keys.
endpoint = ManagedOnlineEndpoint(
    name=ENDPOINT_NAME,
    auth_mode="key",
    description="ONNX inference API for HF model",
)
# Create (or update) the endpoint and wait for the LRO to complete.
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
# --------------------------------------------------------------
# 5. Deploy
# --------------------------------------------------------------
deployment = ManagedOnlineDeployment(
    name=DEPLOYMENT_NAME,
    endpoint_name=ENDPOINT_NAME,
    model=registered_model,
    # MLflow models use a curated environment and do not require code_configuration
    instance_type=INSTANCE_TYPE,
    instance_count=1,
)
print("Deploying...")
ml_client.online_deployments.begin_create_or_update(deployment).result()
# BUG FIX: the online-endpoint operations class has no begin_start() —
# managed online endpoints are live as soon as they are created, so the
# old call crashed here. What IS required is routing traffic to the new
# deployment; without it the endpoint rejects every scoring request.
endpoint = ml_client.online_endpoints.get(ENDPOINT_NAME)
endpoint.traffic = {DEPLOYMENT_NAME: 100}  # send 100% of traffic to this deployment
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
print(f"Endpoint ready: {ENDPOINT_NAME}")