-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaction.yml
More file actions
230 lines (223 loc) · 9.04 KB
/
Copy pathaction.yml
File metadata and controls
230 lines (223 loc) · 9.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# Action identity: the name, summary, and author declared for this action.
name: datalayer-evals
description: Run Datalayer eval reports from CI using the datalayer-core Python API.
author: Datalayer

# Badge icon and color associated with the action.
branding:
  icon: bar-chart-2
  color: blue
# Inputs accepted by the action. Every value reaches the action as a string;
# the run step forwards each one as an INPUT_* environment variable.
inputs:
  # --- Mode selection -------------------------------------------------------
  mode:
    description: "Action mode: run-report, execute-runs, or prepare-spec."
    required: false
    default: "run-report"

  # --- Evalset selection (primary and optional secondary) -------------------
  evalset-id:
    description: Primary evalset ID to report.
    required: false
    default: ""
  evalset-spec-file:
    description: Path to a primary evalset spec JSON file. If set, the action creates the evalset before reporting.
    required: false
    default: ""
  secondary-evalset-id:
    description: Optional secondary evalset ID to report and compare against the primary report.
    required: false
    default: ""
  secondary-evalset-spec-file:
    description: Optional path to a secondary evalset spec JSON file.
    required: false
    default: ""

  # --- Credentials and service endpoints ------------------------------------
  api-key:
    description: Datalayer API key.
    # No default on purpose: a required input with an empty default is
    # contradictory. An omitted input still evaluates to an empty string.
    required: true
  ai-agents-url:
    description: Optional AI Agents base URL override.
    required: false
    default: ""
  billable-account-uid:
    description: Optional billable account UID context used for eval operations.
    required: false
    default: ""
  run-environment:
    description: Optional run environment label passed through to the action runtime.
    required: false
    default: "sdk"
  request-timeout-seconds:
    description: >-
      Per-request timeout (seconds) for a single agent chat call during
      execute-runs. A hung agent call is aborted after this window, the case is
      marked failed, execution continues, and cloud runtimes are always torn
      down. Defaults to 180 (3 minutes per call).
    required: false
    # Quoted so the value stays a string rather than being typed as an integer.
    default: "180"

  # --- prepare-spec mode ----------------------------------------------------
  prepared-spec-output-dir:
    description: Output directory used by mode=prepare-spec.
    required: false
    default: artifacts/specs
  prepared-spec-output-file:
    description: Optional filename override used by mode=prepare-spec.
    required: false
    default: ""

  # --- Report generation ----------------------------------------------------
  run-limit:
    description: Runs fetched per experiment.
    required: false
    default: "50"
  output-markdown:
    description: Output markdown report file path.
    required: false
    default: evals-report.md
  secondary-output-markdown:
    description: Optional secondary markdown report output path.
    required: false
    default: ""
  comparison-summary-output:
    description: Optional comparison summary markdown output path.
    required: false
    default: ""
  export-csv:
    description: Export report.csv from datalayer evals report --export.
    required: false
    # String "true"/"false", not a YAML boolean.
    default: "true"

  # --- Agent creation and run execution -------------------------------------
  iam-url:
    description: Optional IAM base URL override used by datalayer agents create.
    required: false
    default: ""
  runtimes-url:
    description: Optional Runtimes base URL override used by datalayer agents create.
    required: false
    default: ""
  agentspec-ids:
    description: Optional comma-separated agent spec ids used by execute-runs mode (or execute-runs=true).
    required: false
    default: ""
  execute-runs:
    description: >-
      When true, execute real eval runs before reporting. Requires evalset-spec-file
      and agentspec-ids. The action creates the evalset, launches one cloud runtime
      per agentspec, runs every case run-limit times, grades outputs, persists runs,
      and tears the runtimes down.
    required: false
    default: "false"
  agent-environment-name:
    description: Environment name used by execute-runs mode for cloud runtime execution.
    required: false
    default: "ai-agents-env"

  # --- Artifact upload ------------------------------------------------------
  upload-report-artifacts:
    description: Upload generated markdown/csv/log artifacts in a final step.
    required: false
    # Compared against the string 'true' by the upload step's `if:` condition.
    default: "true"
  report-artifact-name:
    description: Name used by actions/upload-artifact for report artifacts.
    required: false
    default: datalayer-evals-reports
# Outputs exposed to the calling workflow. Each one re-exports an output of
# the step with id `run`; the step output names use snake_case.
outputs:
  # --- prepare-spec mode ----------------------------------------------------
  prepared-spec-path:
    description: Path to prepared lane-specific evalset spec generated by mode=prepare-spec.
    value: ${{ steps.run.outputs.prepared_spec_path }}
  spec-path:
    description: Alias for prepared-spec-path.
    value: ${{ steps.run.outputs.spec_path }}

  # --- Primary report -------------------------------------------------------
  report-file:
    description: Path to generated markdown report.
    value: ${{ steps.run.outputs.report_file }}
  evalset-id:
    description: Evalset id used for the primary report (created when executing runs or from a spec).
    value: ${{ steps.run.outputs.evalset_id }}
  executed-evalset-id:
    description: Evalset id created when execute-runs is enabled (empty otherwise).
    value: ${{ steps.run.outputs.executed_evalset_id }}
  csv-file:
    description: Path to generated CSV report (empty when export-csv=false).
    value: ${{ steps.run.outputs.csv_file }}
  log-file:
    description: Path to captured CLI output log.
    value: ${{ steps.run.outputs.log_file }}
  timestamped-report-file:
    description: Timestamped markdown path generated by --export.
    value: ${{ steps.run.outputs.timestamped_report_file }}
  timestamped-csv-file:
    description: Timestamped CSV path generated by --export.
    value: ${{ steps.run.outputs.timestamped_csv_file }}

  # --- Secondary report -----------------------------------------------------
  secondary-report-file:
    description: Secondary markdown report file path (empty when secondary input is omitted).
    value: ${{ steps.run.outputs.secondary_report_file }}
  secondary-csv-file:
    description: Secondary CSV report file path.
    value: ${{ steps.run.outputs.secondary_csv_file }}
  secondary-log-file:
    description: Secondary CLI output log file path.
    value: ${{ steps.run.outputs.secondary_log_file }}
  secondary-timestamped-report-file:
    description: Secondary timestamped markdown path generated by --export.
    value: ${{ steps.run.outputs.secondary_timestamped_report_file }}
  secondary-timestamped-csv-file:
    description: Secondary timestamped CSV path generated by --export.
    value: ${{ steps.run.outputs.secondary_timestamped_csv_file }}

  # --- Comparison and failure counts ----------------------------------------
  comparison-summary-file:
    description: Comparison summary markdown file path.
    value: ${{ steps.run.outputs.comparison_summary_file }}
  failed-run-count:
    description: Total number of failed runs across primary and secondary reports.
    value: ${{ steps.run.outputs.failed_run_count }}
  primary-failed-run-count:
    description: Number of failed runs in the primary report.
    value: ${{ steps.run.outputs.primary_failed_run_count }}
  secondary-failed-run-count:
    description: Number of failed runs in the secondary report (0 when no secondary report).
    value: ${{ steps.run.outputs.secondary_failed_run_count }}
# Composite action: set up Python, install datalayer-core, run the bundled
# entry-point script, then optionally upload the files it reported.
runs:
  using: composite
  steps:
    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        # Quoted so the version is not parsed as a float.
        python-version: "3.12"

    - name: Install datalayer-core
      shell: bash
      run: |
        python -m pip install --upgrade pip
        python -m pip install "datalayer-core>=1.1.39"

    - name: Run Datalayer Evals Report
      # Other parts of this file refer to this step as `steps.run`.
      id: run
      shell: bash
      # Inputs are forwarded explicitly as INPUT_* variables; presumably
      # src/datalayer_evals_action.py reads them from the environment.
      env:
        INPUT_MODE: ${{ inputs.mode }}
        INPUT_EVALSET_ID: ${{ inputs.evalset-id }}
        INPUT_EVALSET_SPEC_FILE: ${{ inputs.evalset-spec-file }}
        INPUT_SECONDARY_EVALSET_ID: ${{ inputs.secondary-evalset-id }}
        INPUT_SECONDARY_EVALSET_SPEC_FILE: ${{ inputs.secondary-evalset-spec-file }}
        INPUT_API_KEY: ${{ inputs.api-key }}
        INPUT_AI_AGENTS_URL: ${{ inputs.ai-agents-url }}
        INPUT_BILLABLE_ACCOUNT_UID: ${{ inputs.billable-account-uid }}
        INPUT_RUN_ENVIRONMENT: ${{ inputs.run-environment }}
        INPUT_REQUEST_TIMEOUT_SECONDS: ${{ inputs.request-timeout-seconds }}
        INPUT_PREPARED_SPEC_OUTPUT_DIR: ${{ inputs.prepared-spec-output-dir }}
        INPUT_PREPARED_SPEC_OUTPUT_FILE: ${{ inputs.prepared-spec-output-file }}
        INPUT_RUN_LIMIT: ${{ inputs.run-limit }}
        INPUT_OUTPUT_MARKDOWN: ${{ inputs.output-markdown }}
        INPUT_SECONDARY_OUTPUT_MARKDOWN: ${{ inputs.secondary-output-markdown }}
        INPUT_COMPARISON_SUMMARY_OUTPUT: ${{ inputs.comparison-summary-output }}
        INPUT_EXPORT_CSV: ${{ inputs.export-csv }}
        INPUT_IAM_URL: ${{ inputs.iam-url }}
        INPUT_RUNTIMES_URL: ${{ inputs.runtimes-url }}
        INPUT_AGENT_SPEC_IDS: ${{ inputs.agentspec-ids }}
        INPUT_EXECUTE_RUNS: ${{ inputs.execute-runs }}
        INPUT_AGENT_ENVIRONMENT_NAME: ${{ inputs.agent-environment-name }}
      # Use the GITHUB_ACTION_PATH environment variable, quoted, instead of
      # interpolating ${{ github.action_path }} into the script text: the
      # quoted form survives spaces and backslashes in the checkout path.
      run: python "${GITHUB_ACTION_PATH}/src/datalayer_evals_action.py"

    - name: Upload Datalayer report artifacts
      # Inputs are strings, hence the comparison against 'true'.
      if: ${{ inputs.upload-report-artifacts == 'true' }}
      uses: actions/upload-artifact@v4
      with:
        name: ${{ inputs.report-artifact-name }}
        # Missing files only produce a warning rather than failing the step.
        if-no-files-found: warn
        # One path per line; outputs the run step left unset expand to
        # empty lines.
        path: |
          ${{ steps.run.outputs.report_file }}
          ${{ steps.run.outputs.csv_file }}
          ${{ steps.run.outputs.log_file }}
          ${{ steps.run.outputs.timestamped_report_file }}
          ${{ steps.run.outputs.timestamped_csv_file }}
          ${{ steps.run.outputs.secondary_report_file }}
          ${{ steps.run.outputs.secondary_csv_file }}
          ${{ steps.run.outputs.secondary_log_file }}
          ${{ steps.run.outputs.secondary_timestamped_report_file }}
          ${{ steps.run.outputs.secondary_timestamped_csv_file }}
          ${{ steps.run.outputs.comparison_summary_file }}