diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml.disabled similarity index 100% rename from .github/workflows/claude.yml rename to .github/workflows/claude.yml.disabled diff --git a/.github/workflows/cloud_evals.yml b/.github/workflows/cloud_evals.yml.disabled similarity index 100% rename from .github/workflows/cloud_evals.yml rename to .github/workflows/cloud_evals.yml.disabled diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml.disabled similarity index 100% rename from .github/workflows/docker.yml rename to .github/workflows/docker.yml.disabled diff --git a/.github/workflows/eval-on-pr.yml b/.github/workflows/eval-on-pr.yml.disabled similarity index 100% rename from .github/workflows/eval-on-pr.yml rename to .github/workflows/eval-on-pr.yml.disabled diff --git a/.github/workflows/package.yaml b/.github/workflows/package.yaml.disabled similarity index 100% rename from .github/workflows/package.yaml rename to .github/workflows/package.yaml.disabled diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml.disabled similarity index 100% rename from .github/workflows/publish.yml rename to .github/workflows/publish.yml.disabled diff --git a/.github/workflows/stale-bot.yml b/.github/workflows/stale-bot.yml.disabled similarity index 100% rename from .github/workflows/stale-bot.yml rename to .github/workflows/stale-bot.yml.disabled diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index fb4187235b..53c79186a0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,11 +1,6 @@ name: test permissions: - actions: read - contents: write - pull-requests: write # Allow writing comments on PRs - issues: write # Allow writing comments on issues - statuses: write # Allow writing statuses on PRs - discussions: write + contents: read # Cancel in-progress runs when a new commit is pushed to the same branch/PR concurrency: @@ -86,15 +81,6 @@ jobs: IN_DOCKER: 'True' ANONYMIZED_TELEMETRY: 'false' BROWSER_USE_LOGGING_LEVEL: 'DEBUG' - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} - GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} - AZURE_OPENAI_KEY: ${{ secrets.AZURE_OPENAI_KEY }} - AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} - BROWSER_USE_API_KEY: ${{ secrets.BROWSER_USE_API_KEY }} - OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} strategy: matrix: test_filename: ${{ fromJson(needs.find_tests.outputs.TEST_FILENAMES || '["FAILED_TO_DISCOVER_TESTS"]') }} @@ -180,158 +166,3 @@ jobs: max_attempts: 1 retry_on: error command: pytest "tests/ci/${{ matrix.test_filename }}.py" - - evaluate-tasks: - needs: setup-chromium - runs-on: ubuntu-latest - timeout-minutes: 8 # Allow more time for agent eval - env: - IN_DOCKER: 'true' - BROWSER_USE_CLOUD_SYNC: 'false' - ANONYMIZED_TELEMETRY: 'false' - BROWSER_USE_LOGGING_LEVEL: 'DEBUG' - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} - GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} - BROWSER_USE_API_KEY: ${{ secrets.BROWSER_USE_API_KEY }} - steps: - - uses: actions/checkout@v4 - - uses: astral-sh/setup-uv@v6 - with: - enable-cache: true - activate-environment: true - - - name: Cache uv packages and venv - uses: actions/cache@v4 - with: - path: | - ~/.cache/uv - .venv - key: ${{ runner.os }}-uv-venv-${{ hashFiles('pyproject.toml') }} - restore-keys: | - ${{ runner.os }}-uv-venv- - - - run: uv sync --dev --all-extras - - - name: Get week number for cache key - id: week - run: echo "number=$(date +%Y-W%U)" >> $GITHUB_OUTPUT - - - name: Cache chromium binaries - id: cache-chromium - uses: actions/cache@v4 - with: - path: | - ~/.cache/ms-playwright - key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }} - restore-keys: | - ${{ runner.os }}-${{ runner.arch }}-chromium- - - - name: Install Chromium browser if not cached - if: steps.cache-chromium.outputs.cache-hit != 'true' - run: uvx playwright install chromium --with-deps --no-shell - - - name: Cache browser-use extensions - uses: actions/cache@v4 - with: - path: | - ~/.config/browseruse/extensions - key: ${{ runner.os }}-browseruse-extensions-${{ hashFiles('browser_use/browser/profile.py') }} - restore-keys: | - ${{ runner.os }}-browseruse-extensions- - - - name: Run agent tasks evaluation and capture score - id: eval - uses: nick-fields/retry@v3 - with: - timeout_minutes: 4 - max_attempts: 1 - retry_on: error - command: | - python tests/ci/evaluate_tasks.py > result.txt - cat result.txt - echo "PASSED=$(grep '^PASSED=' result.txt | cut -d= -f2)" >> $GITHUB_ENV - echo "TOTAL=$(grep '^TOTAL=' result.txt | cut -d= -f2)" >> $GITHUB_ENV - echo "DETAILED_RESULTS=$(grep '^DETAILED_RESULTS=' result.txt | cut -d= -f2-)" >> $GITHUB_ENV - - - name: Print agent evaluation summary - run: | - echo "Agent tasks passed: $PASSED / $TOTAL" - - - name: Write agent evaluation summary to workflow overview - run: | - if [ "$PASSED" = "$TOTAL" ]; then - COLOR="green" - else - COLOR="yellow" - fi - echo "

Agent Tasks Score: $PASSED/$TOTAL

" >> $GITHUB_STEP_SUMMARY - - - name: Comment PR with agent evaluation results - if: github.event_name == 'pull_request' - uses: actions/github-script@v7 - continue-on-error: true - with: - script: | - const passed = parseInt(process.env.PASSED); - const total = parseInt(process.env.TOTAL); - const detailedResults = JSON.parse(process.env.DETAILED_RESULTS); - const score = `${passed}/${total}`; - const percentage = Math.round((passed / total) * 100); - - // Fail the workflow if 0% pass rate - if (percentage === 0) { - core.setFailed(`Evaluation failed: 0% pass rate (${passed}/${total})`); - } - - // Create detailed table - let tableRows = ''; - detailedResults.forEach(result => { - const emoji = result.success ? '✅' : '❌'; - const status = result.success ? 'Pass' : 'Fail'; - tableRows += `| ${result.task} | ${emoji} ${status} | ${result.reason} |\n`; - }); - - const comment = `## Agent Task Evaluation Results: ${score} (${percentage}%) - -
- View detailed results - - | Task | Result | Reason | - |------|--------|--------| - ${tableRows} - - Check the [evaluate-tasks job](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for detailed task execution logs. -
`; - - // Find existing comment to update or create new one - const { data: comments } = await github.rest.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - }); - - const botComment = comments.find(comment => - comment.user.type === 'Bot' && - comment.body.includes('Agent Task Evaluation Results') - ); - - if (botComment) { - // Update existing comment - await github.rest.issues.updateComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: botComment.id, - body: comment - }); - } else { - // Create new comment - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: comment - }); - }