diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml.disabled similarity index 100% rename from .github/workflows/claude.yml rename to .github/workflows/claude.yml.disabled diff --git a/.github/workflows/cloud_evals.yml b/.github/workflows/cloud_evals.yml.disabled similarity index 100% rename from .github/workflows/cloud_evals.yml rename to .github/workflows/cloud_evals.yml.disabled diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml.disabled similarity index 100% rename from .github/workflows/docker.yml rename to .github/workflows/docker.yml.disabled diff --git a/.github/workflows/eval-on-pr.yml b/.github/workflows/eval-on-pr.yml.disabled similarity index 100% rename from .github/workflows/eval-on-pr.yml rename to .github/workflows/eval-on-pr.yml.disabled diff --git a/.github/workflows/package.yaml b/.github/workflows/package.yaml.disabled similarity index 100% rename from .github/workflows/package.yaml rename to .github/workflows/package.yaml.disabled diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml.disabled similarity index 100% rename from .github/workflows/publish.yml rename to .github/workflows/publish.yml.disabled diff --git a/.github/workflows/stale-bot.yml b/.github/workflows/stale-bot.yml.disabled similarity index 100% rename from .github/workflows/stale-bot.yml rename to .github/workflows/stale-bot.yml.disabled diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index fb4187235b..53c79186a0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,11 +1,6 @@ name: test permissions: - actions: read - contents: write - pull-requests: write # Allow writing comments on PRs - issues: write # Allow writing comments on issues - statuses: write # Allow writing statuses on PRs - discussions: write + contents: read # Cancel in-progress runs when a new commit is pushed to the same branch/PR concurrency: @@ -86,15 +81,6 @@ jobs: IN_DOCKER: 'True' ANONYMIZED_TELEMETRY: 'false' BROWSER_USE_LOGGING_LEVEL: 'DEBUG' - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} - GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} - AZURE_OPENAI_KEY: ${{ secrets.AZURE_OPENAI_KEY }} - AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} - BROWSER_USE_API_KEY: ${{ secrets.BROWSER_USE_API_KEY }} - OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} strategy: matrix: test_filename: ${{ fromJson(needs.find_tests.outputs.TEST_FILENAMES || '["FAILED_TO_DISCOVER_TESTS"]') }} @@ -180,158 +166,3 @@ jobs: max_attempts: 1 retry_on: error command: pytest "tests/ci/${{ matrix.test_filename }}.py" - - evaluate-tasks: - needs: setup-chromium - runs-on: ubuntu-latest - timeout-minutes: 8 # Allow more time for agent eval - env: - IN_DOCKER: 'true' - BROWSER_USE_CLOUD_SYNC: 'false' - ANONYMIZED_TELEMETRY: 'false' - BROWSER_USE_LOGGING_LEVEL: 'DEBUG' - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} - GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} - BROWSER_USE_API_KEY: ${{ secrets.BROWSER_USE_API_KEY }} - steps: - - uses: actions/checkout@v4 - - uses: astral-sh/setup-uv@v6 - with: - enable-cache: true - activate-environment: true - - - name: Cache uv packages and venv - uses: actions/cache@v4 - with: - path: | - ~/.cache/uv - .venv - key: ${{ runner.os }}-uv-venv-${{ hashFiles('pyproject.toml') }} - restore-keys: | - ${{ runner.os }}-uv-venv- - - - run: uv sync --dev --all-extras - - - name: Get week number for cache key - id: week - run: echo "number=$(date +%Y-W%U)" >> $GITHUB_OUTPUT - - - name: Cache chromium binaries - id: cache-chromium - uses: actions/cache@v4 - with: - path: | - ~/.cache/ms-playwright - key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }} - restore-keys: | - ${{ runner.os }}-${{ runner.arch }}-chromium- - - - name: Install Chromium browser if not cached - if: steps.cache-chromium.outputs.cache-hit != 'true' - run: uvx playwright install chromium --with-deps --no-shell - - - name: Cache browser-use extensions - uses: actions/cache@v4 - with: - path: | - ~/.config/browseruse/extensions - key: ${{ runner.os }}-browseruse-extensions-${{ hashFiles('browser_use/browser/profile.py') }} - restore-keys: | - ${{ runner.os }}-browseruse-extensions- - - - name: Run agent tasks evaluation and capture score - id: eval - uses: nick-fields/retry@v3 - with: - timeout_minutes: 4 - max_attempts: 1 - retry_on: error - command: | - python tests/ci/evaluate_tasks.py > result.txt - cat result.txt - echo "PASSED=$(grep '^PASSED=' result.txt | cut -d= -f2)" >> $GITHUB_ENV - echo "TOTAL=$(grep '^TOTAL=' result.txt | cut -d= -f2)" >> $GITHUB_ENV - echo "DETAILED_RESULTS=$(grep '^DETAILED_RESULTS=' result.txt | cut -d= -f2-)" >> $GITHUB_ENV - - - name: Print agent evaluation summary - run: | - echo "Agent tasks passed: $PASSED / $TOTAL" - - - name: Write agent evaluation summary to workflow overview - run: | - if [ "$PASSED" = "$TOTAL" ]; then - COLOR="green" - else - COLOR="yellow" - fi - echo "