Test CUA Supporting Models #39
name: Test CUA Supporting Models

# This workflow tests all supported CUA models with API keys
# Run manually using workflow_dispatch with test_models=true
on:
  workflow_dispatch:
    inputs:
      test_models:
        description: "Test all supported models (requires API keys)"
        required: false
        default: true
        type: boolean
  schedule:
    # Runs at 3 PM UTC (8 AM PDT) daily
    - cron: "0 15 * * *"
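# For convenience, one way to trigger this workflow manually with the GitHub
# CLI (the workflow name matches the `name:` field above):
#   gh workflow run "Test CUA Supporting Models" -f test_models=true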
jobs:
  # Test all CUA models - runs on the daily schedule or when triggered manually
  test-all-models:
    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false') }}
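    # Note: fromJSON(inputs.test_models || 'false') defaults to false when the
    # input is absent (e.g. on scheduled runs), so the last clause only fires
    # when the boolean input is explicitly true.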
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        model:
          # Anthropic Claude (Sonnet / Haiku / Opus)
          - anthropic/claude-sonnet-4-5-20250929
          - anthropic/claude-haiku-4-5-20251001
          - anthropic/claude-opus-4-1-20250805
          # OpenAI CU Preview
          - openai/computer-use-preview
          # GLM-V
          - openrouter/z-ai/glm-4.5v
          # - huggingface-local/zai-org/GLM-4.5V # Requires local model setup
          # Gemini CU Preview
          - gemini-2.5-computer-use-preview-10-2025
          # InternVL
          # - huggingface-local/OpenGVLab/InternVL3_5-1B
          # - huggingface-local/OpenGVLab/InternVL3_5-2B
          # - huggingface-local/OpenGVLab/InternVL3_5-4B
          # - huggingface-local/OpenGVLab/InternVL3_5-8B
          # UI-TARS (supports full computer use; can run standalone)
          # - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
          # Note: OpenCUA, GTA, and Holo are grounding-only models.
          # They only support predict_click(), not agent.run().
          # See the composed-agent entries below for testing them.
          # Moondream (typically used in composed agents)
          # Format: moondream3+{any-llm-with-tools}
          # - moondream3+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + tools
          # - moondream3+openai/gpt-4o # GPT-4o has VLM + tools
          # OmniParser (typically used in composed agents)
          # Format: omniparser+{any-vlm-with-tools}
          - omniparser+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + tools
          # - omniparser+openai/gpt-4o # GPT-4o has VLM + tools
          # Other grounding models + VLM with tools
          # Format: {grounding-model}+{any-vlm-with-tools}
          # Grounding-only models (OpenCUA, GTA, Holo) must be used in composed
          # form since they only support predict_click(), not full agent.run();
          # see the illustrative sketch after this list.
          # - huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929
          # - huggingface-local/xlangai/OpenCUA-7B+anthropic/claude-sonnet-4-5-20250929
          # - huggingface-local/Hcompany/Holo1.5-3B+anthropic/claude-sonnet-4-5-20250929
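    # A minimal sketch of the grounding-only vs. composed distinction, assuming
    # the ComputerAgent API from the cua-agent README (not run by this workflow;
    # exact names and signatures are illustrative):
    #
    #   from agent import ComputerAgent
    #   # Grounding-only: can locate UI elements, but has no planning loop
    #   grounder = ComputerAgent(model="huggingface-local/HelloKKMe/GTA1-7B")
    #   x, y = await grounder.predict_click("the Submit button")
    #   # Composed: the grounder supplies coordinates, the VLM plans with tools
    #   agent = ComputerAgent(
    #       model="huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929"
    #   )
    #   async for _ in agent.run("open the settings page"):
    #       pass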
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up uv and Python
        uses: astral-sh/setup-uv@v4
        with:
          python-version: "3.12"

      - name: Cache system packages
        uses: actions/cache@v4
        with:
          path: /var/cache/apt
          key: ${{ runner.os }}-apt-${{ hashFiles('**/Dockerfile') }}
          restore-keys: |
            ${{ runner.os }}-apt-

      - name: Install system dependencies
        timeout-minutes: 20
        run: |
          sudo apt-get update
          sudo apt-get install -y libgl1-mesa-dri libglib2.0-0

      - name: Cache Python dependencies (uv)
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/uv
            .venv
          key: ${{ runner.os }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock', 'libs/python/**/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-uv-

      - name: Install CUA dependencies (uv)
        run: |
          # Remove any venv restored from cache to avoid an interactive prompt
          rm -rf .venv
          uv venv --python 3.12
          uv pip install -e libs/python/agent -e libs/python/computer
          uv pip install -e libs/python/core
          uv pip install "cua-agent[uitars-hf,internvl-hf,opencua-hf,moondream3,omni]"
          uv pip install pytest

      - name: Cache HuggingFace models
        # Large cache - models can be several GB each and are reused across runs
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: ${{ runner.os }}-hf-models-v1
          restore-keys: |
            ${{ runner.os }}-hf-models-
      - name: Record test start time
        run: echo "TEST_START_TIME=$(date +%s)" >> $GITHUB_ENV

      - name: Test model with agent loop
        id: test_model
        timeout-minutes: 20
        continue-on-error: true
        run: |
          cd tests/agent_loop_testing
          uv run python agent_test.py --model "${{ matrix.model }}"
        env:
          # Ensure HuggingFace uses a consistent cache location on the step
          # that actually loads models
          HF_HOME: ~/.cache/huggingface
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
      - name: Calculate test duration and prepare message
        if: always()
        run: |
          TEST_END_TIME=$(date +%s)
          # Handle the case where TEST_START_TIME was never set
          if [ -z "$TEST_START_TIME" ]; then
            TEST_START_TIME=$TEST_END_TIME
          fi
          TEST_DURATION=$((TEST_END_TIME - TEST_START_TIME))
          # Convert to minutes and seconds (avoid bash's special SECONDS variable)
          MINUTES=$((TEST_DURATION / 60))
          SECS=$((TEST_DURATION % 60))
          # Format duration
          if [ $MINUTES -gt 0 ]; then
            DURATION_STR="${MINUTES}m ${SECS}s"
          else
            DURATION_STR="${SECS}s"
          fi
          # Determine status icon based on the test step outcome
          if [ "${{ steps.test_model.outcome }}" == "success" ]; then
            STATUS_ICON="✅"
            STATUS_TEXT="PASSED"
            SLACK_COLOR="#36a64f"
          else
            STATUS_ICON="❌"
            STATUS_TEXT="FAILED"
            SLACK_COLOR="#dc3545"
          fi
          # Prepare Slack message
          echo "TESTS_CONTENT<<EOF" >> $GITHUB_ENV
          echo "*CUA Model Test Results*" >> $GITHUB_ENV
          echo "" >> $GITHUB_ENV
          echo "*Model:* ${{ matrix.model }}" >> $GITHUB_ENV
          echo "*Status:* ${STATUS_ICON} ${STATUS_TEXT}" >> $GITHUB_ENV
          echo "*Duration:* ${DURATION_STR}" >> $GITHUB_ENV
          echo "*Run:* ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          # Set color based on outcome
          echo "SLACK_COLOR=${SLACK_COLOR}" >> $GITHUB_ENV
          # Save the result to a JSON file for the summary job
          mkdir -p test_summary
          MODEL_NAME="${{ matrix.model }}"
          # Sanitize the model name for use in a filename
          SAFE_MODEL_NAME=$(echo "$MODEL_NAME" | sed 's/[^a-zA-Z0-9]/_/g')
          # Determine pass status
          if [ "${{ steps.test_model.outcome }}" == "success" ]; then
            PASSED_VAL="true"
          else
            PASSED_VAL="false"
          fi
          # Create the JSON file using printf to avoid YAML parsing issues
          printf '{\n "model": "%s",\n "status": "%s",\n "status_icon": "%s",\n "duration": "%s",\n "duration_seconds": %d,\n "passed": %s\n}' \
            "${MODEL_NAME}" "${STATUS_TEXT}" "${STATUS_ICON}" "${DURATION_STR}" "${TEST_DURATION}" "${PASSED_VAL}" \
            > "test_summary/${SAFE_MODEL_NAME}.json"
          # Expose the sanitized model name for subsequent steps (artifact naming)
          echo "SAFE_MODEL_NAME=${SAFE_MODEL_NAME}" >> $GITHUB_ENV
      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          # upload-artifact@v4 rejects slashes in artifact names, so use the
          # sanitized model name here as well
          name: test-results-${{ env.SAFE_MODEL_NAME }}
          path: |
            tests/agent_loop_testing/test_images/
            *.log
          if-no-files-found: ignore
          retention-days: 7

      - name: Upload test summary data
        if: always()
        uses: actions/upload-artifact@v4
        with:
          # Unique, slash-free artifact name per matrix entry
          name: test-summary-${{ env.SAFE_MODEL_NAME }}
          path: test_summary/
          if-no-files-found: ignore
          retention-days: 1
      - name: Set default Slack color
        if: always() && env.SLACK_COLOR == ''
        run: echo "SLACK_COLOR=#36a64f" >> $GITHUB_ENV

      # Individual model notifications disabled - only the summary is sent
      # - name: Notify Slack with test results
      #   if: always()
      #   uses: rtCamp/action-slack-notify@v2
      #   env:
      #     SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
      #     SLACK_CHANNEL: ${{ vars.SLACK_CHANNEL }}
      #     SLACK_TITLE: CUA Model Test Update
      #     SLACK_COLOR: ${{ env.SLACK_COLOR }}
      #     SLACK_MESSAGE: |
      #       ${{ env.TESTS_CONTENT }}
  # Summary job that aggregates all model test results
  test-summary:
    if: ${{ always() && (github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false')) }}
    needs: test-all-models
    runs-on: ubuntu-latest
    steps:
      - name: Install jq
        run: sudo apt-get update && sudo apt-get install -y jq

      - name: Download all test summary artifacts
        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
          pattern: test-summary-*
          merge-multiple: true
          path: all_summaries

      - name: Generate and send summary
        if: always()
        shell: bash
        run: |
          # Create the directory if it doesn't exist
          mkdir -p all_summaries
          # Aggregate all results. actions/download-artifact@v4 only fetches
          # artifacts from the current run, so no cross-run filtering is needed
          # (the matrix context is not available in this job in any case).
          PASSED_COUNT=0
          FAILED_COUNT=0
          TOTAL_DURATION=0
| SUMMARY_MESSAGE="*🚀 Model Summaries*\n\n" | |
| # Process each JSON file (find all JSON files recursively) | |
| # Save to temp file first to avoid subshell issues | |
| find all_summaries -name "*.json" -type f 2>/dev/null > /tmp/json_files.txt || true | |
| # Use associative array to deduplicate by model name | |
| declare -A processed_models | |
| while IFS= read -r json_file; do | |
| if [ -f "$json_file" ]; then | |
| MODEL=$(jq -r '.model' "$json_file") | |
| # Skip if we've already processed this model | |
| if [ "${processed_models[$MODEL]}" = "1" ]; then | |
| echo "Skipping duplicate model: $MODEL" | |
| continue | |
| fi | |
              # Mark as processed
              processed_models[$MODEL]="1"
              STATUS_ICON=$(jq -r '.status_icon' "$json_file")
              STATUS=$(jq -r '.status' "$json_file")
              DURATION=$(jq -r '.duration' "$json_file")
              DURATION_SEC=$(jq -r '.duration_seconds' "$json_file")
              PASSED=$(jq -r '.passed' "$json_file")
              # Add a clean one-line entry to the summary
              SUMMARY_MESSAGE="${SUMMARY_MESSAGE}${STATUS_ICON} ${STATUS} - \`${MODEL}\` - ${DURATION}\n"
              if [ "$PASSED" = "true" ]; then
                PASSED_COUNT=$((PASSED_COUNT + 1))
              else
                FAILED_COUNT=$((FAILED_COUNT + 1))
              fi
              TOTAL_DURATION=$((TOTAL_DURATION + DURATION_SEC))
            fi
          done < /tmp/json_files.txt
          # Check whether we found any results
          TOTAL_COUNT=$((PASSED_COUNT + FAILED_COUNT))
          if [ $TOTAL_COUNT -eq 0 ]; then
            SUMMARY_MESSAGE="${SUMMARY_MESSAGE}⚠️ No test results found (workflow may have been canceled)\n"
            SLACK_COLOR="#ffa500"
          else
            # Add summary stats
            SUMMARY_MESSAGE="${SUMMARY_MESSAGE}\n*Results:* ${PASSED_COUNT} passed, ${FAILED_COUNT} failed out of ${TOTAL_COUNT} models\n"
            # Calculate total duration
            TOTAL_MIN=$((TOTAL_DURATION / 60))
            TOTAL_SEC=$((TOTAL_DURATION % 60))
            if [ $TOTAL_MIN -gt 0 ]; then
              TOTAL_DURATION_STR="${TOTAL_MIN}m ${TOTAL_SEC}s"
            else
              TOTAL_DURATION_STR="${TOTAL_SEC}s"
            fi
            SUMMARY_MESSAGE="${SUMMARY_MESSAGE}*Total Duration:* ${TOTAL_DURATION_STR}\n"
            # Determine color based on results
            if [ $FAILED_COUNT -eq 0 ]; then
              SLACK_COLOR="#36a64f"
            elif [ $PASSED_COUNT -eq 0 ]; then
              SLACK_COLOR="#dc3545"
            else
              SLACK_COLOR="#ffa500"
            fi
          fi
          SUMMARY_MESSAGE="${SUMMARY_MESSAGE}*Run:* ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          # Export for use in the next step
          echo "SUMMARY_MESSAGE<<EOF" >> $GITHUB_ENV
          echo -e "${SUMMARY_MESSAGE}" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          echo "SLACK_COLOR=${SLACK_COLOR}" >> $GITHUB_ENV
      - name: Send summary to Slack
        if: always()
        uses: rtCamp/action-slack-notify@v2
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          SLACK_CHANNEL: ${{ vars.SLACK_CHANNEL }}
          SLACK_TITLE: CUA Models Test Summary
          SLACK_COLOR: ${{ env.SLACK_COLOR }}
          SLACK_MESSAGE: |
            ${{ env.SUMMARY_MESSAGE }}