Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
171 changes: 1 addition & 170 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
name: test
permissions:
actions: read
contents: write
pull-requests: write # Allow writing comments on PRs
issues: write # Allow writing comments on issues
statuses: write # Allow writing statuses on PRs
discussions: write
contents: read

# Cancel in-progress runs when a new commit is pushed to the same branch/PR
concurrency:
Expand Down Expand Up @@ -86,15 +81,6 @@ jobs:
IN_DOCKER: 'True'
ANONYMIZED_TELEMETRY: 'false'
BROWSER_USE_LOGGING_LEVEL: 'DEBUG'
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
AZURE_OPENAI_KEY: ${{ secrets.AZURE_OPENAI_KEY }}
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
BROWSER_USE_API_KEY: ${{ secrets.BROWSER_USE_API_KEY }}
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
strategy:
matrix:
test_filename: ${{ fromJson(needs.find_tests.outputs.TEST_FILENAMES || '["FAILED_TO_DISCOVER_TESTS"]') }}
Expand Down Expand Up @@ -180,158 +166,3 @@ jobs:
max_attempts: 1
retry_on: error
command: pytest "tests/ci/${{ matrix.test_filename }}.py"

evaluate-tasks:
  # Runs tests/ci/evaluate_tasks.py, publishes the PASSED/TOTAL score to the
  # run summary and (on pull requests) to a PR comment, and finally fails the
  # job when not a single task passed.
  needs: setup-chromium
  runs-on: ubuntu-latest
  timeout-minutes: 8  # Allow more time for agent eval
  # Job-scoped token permissions: read the repo for checkout, write PR/issue
  # comments for the results comment. Declared here so the job does not depend
  # on broad workflow-level write permissions.
  permissions:
    contents: read
    pull-requests: write
    issues: write
  env:
    IN_DOCKER: 'true'
    BROWSER_USE_CLOUD_SYNC: 'false'
    ANONYMIZED_TELEMETRY: 'false'
    BROWSER_USE_LOGGING_LEVEL: 'DEBUG'
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
    GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
    BROWSER_USE_API_KEY: ${{ secrets.BROWSER_USE_API_KEY }}
  steps:
    - uses: actions/checkout@v4
    - uses: astral-sh/setup-uv@v6
      with:
        enable-cache: true
        activate-environment: true

    - name: Cache uv packages and venv
      uses: actions/cache@v4
      with:
        path: |
          ~/.cache/uv
          .venv
        key: ${{ runner.os }}-uv-venv-${{ hashFiles('pyproject.toml') }}
        restore-keys: |
          ${{ runner.os }}-uv-venv-

    - run: uv sync --dev --all-extras

    # The year/week stamp rotates the Chromium cache key once per week.
    - name: Get week number for cache key
      id: week
      run: echo "number=$(date +%Y-W%U)" >> "$GITHUB_OUTPUT"

    - name: Cache chromium binaries
      id: cache-chromium
      uses: actions/cache@v4
      with:
        path: |
          ~/.cache/ms-playwright
        key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }}
        restore-keys: |
          ${{ runner.os }}-${{ runner.arch }}-chromium-

    - name: Install Chromium browser if not cached
      if: steps.cache-chromium.outputs.cache-hit != 'true'
      run: uvx playwright install chromium --with-deps --no-shell

    - name: Cache browser-use extensions
      uses: actions/cache@v4
      with:
        path: |
          ~/.config/browseruse/extensions
        key: ${{ runner.os }}-browseruse-extensions-${{ hashFiles('browser_use/browser/profile.py') }}
        restore-keys: |
          ${{ runner.os }}-browseruse-extensions-

    # The evaluation script's stdout is captured in result.txt; the lines
    # starting with PASSED=, TOTAL= and DETAILED_RESULTS= are exported to the
    # job environment for the steps below.
    # NOTE(review): the exit status of the python command is not checked here;
    # confirm whether a crashing evaluation should fail this step.
    - name: Run agent tasks evaluation and capture score
      id: eval
      uses: nick-fields/retry@v3
      with:
        timeout_minutes: 4
        max_attempts: 1
        retry_on: error
        command: |
          python tests/ci/evaluate_tasks.py > result.txt
          cat result.txt
          echo "PASSED=$(grep '^PASSED=' result.txt | cut -d= -f2)" >> "$GITHUB_ENV"
          echo "TOTAL=$(grep '^TOTAL=' result.txt | cut -d= -f2)" >> "$GITHUB_ENV"
          echo "DETAILED_RESULTS=$(grep '^DETAILED_RESULTS=' result.txt | cut -d= -f2-)" >> "$GITHUB_ENV"

    - name: Print agent evaluation summary
      run: |
        echo "Agent tasks passed: $PASSED / $TOTAL"

    - name: Write agent evaluation summary to workflow overview
      run: |
        # Green only for a real, complete pass; an empty score must not look green.
        if [ -n "$TOTAL" ] && [ "$PASSED" = "$TOTAL" ]; then
          COLOR="green"
        else
          COLOR="yellow"
        fi
        echo "<h2>Agent Tasks Score: <span style='color:$COLOR;'>$PASSED/$TOTAL</span></h2>" >> "$GITHUB_STEP_SUMMARY"

    # Best-effort: commenting must never fail the job (continue-on-error), so
    # the pass-rate gate lives in its own step at the end of the job.
    - name: Comment PR with agent evaluation results
      if: github.event_name == 'pull_request'
      uses: actions/github-script@v7
      continue-on-error: true
      with:
        script: |
          const MARKER = 'Agent Task Evaluation Results';

          const passed = Number.parseInt(process.env.PASSED ?? '', 10);
          const total = Number.parseInt(process.env.TOTAL ?? '', 10);
          if (Number.isNaN(passed) || Number.isNaN(total)) {
            core.warning('No agent evaluation score was captured; skipping the PR comment.');
            return;
          }

          // DETAILED_RESULTS may be missing or malformed; degrade to an empty table.
          let detailedResults = [];
          try {
            const parsed = JSON.parse(process.env.DETAILED_RESULTS || '[]');
            if (Array.isArray(parsed)) {
              detailedResults = parsed;
            }
          } catch (error) {
            core.warning(`Could not parse DETAILED_RESULTS: ${error.message}`);
          }

          const score = `${passed}/${total}`;
          // Avoid NaN% when no tasks were evaluated.
          const percentage = total > 0 ? Math.round((passed / total) * 100) : 0;

          // Pipes and line breaks would otherwise break the markdown table.
          const cell = (value) =>
            String(value ?? '').replace(/\|/g, '\\|').replace(/\r?\n/g, ' ');

          const tableRows = detailedResults.map((result) => {
            const emoji = result.success ? '✅' : '❌';
            const status = result.success ? 'Pass' : 'Fail';
            return `| ${cell(result.task)} | ${emoji} ${status} | ${cell(result.reason)} |`;
          });

          const runUrl = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';

          // Joined from an array so the markdown never inherits YAML indentation.
          const comment = [
            `## ${MARKER}: ${score} (${percentage}%)`,
            '',
            '<details>',
            '<summary>View detailed results</summary>',
            '',
            '| Task | Result | Reason |',
            '|------|--------|--------|',
            ...tableRows,
            '',
            `Check the [evaluate-tasks job](${runUrl}) for detailed task execution logs.`,
            '</details>',
          ].join('\n');

          // Paginate: the default page holds 30 comments, so on busy PRs the
          // existing bot comment could be missed and a duplicate posted.
          const comments = await github.paginate(github.rest.issues.listComments, {
            owner: context.repo.owner,
            repo: context.repo.repo,
            issue_number: context.issue.number,
            per_page: 100,
          });

          const botComment = comments.find(
            (existing) => existing.user?.type === 'Bot' && existing.body?.includes(MARKER)
          );

          if (botComment) {
            // Update existing comment
            await github.rest.issues.updateComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              comment_id: botComment.id,
              body: comment,
            });
          } else {
            // Create new comment
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: comment,
            });
          }

    # Kept as the last step, and without continue-on-error, so a 0% pass rate
    # really fails the job after the summary and PR comment were published.
    - name: Fail the job on a 0% pass rate
      run: |
        if ! [[ "$PASSED" =~ ^[0-9]+$ && "$TOTAL" =~ ^[0-9]+$ ]]; then
          echo "::warning::Could not parse an agent evaluation score (PASSED='$PASSED', TOTAL='$TOTAL')"
          exit 0
        fi
        if [ "$TOTAL" -gt 0 ] && [ "$PASSED" -eq 0 ]; then
          echo "::error::Evaluation failed: 0% pass rate ($PASSED/$TOTAL)"
          exit 1
        fi
Loading