# Run tasks
# NOTE(review): `--mode first_n --count 3` presumably limits the run to the
# first 3 benchmark tasks; confirm against scripts/run.py.
uv run scripts/run.py --agent browser-use --benchmark BrowseComp --mode first_n --count 3

# Evaluate results
# Uses the same --agent / --benchmark pair as the run step above.
uv run scripts/eval.py --agent browser-use --benchmark BrowseComp