Skip to main content
This page shows how to run various benchmarks using Agent-TARS.

Installation

npm install -g @agent-tars/cli@0.3.0

Basic Usage

uv run scripts/run.py \
  --agent Agent-TARS \
  --benchmark <benchmark_name> \
  --mode <run_mode> \
  [options]

Run LexBench-Browser

# Run first 5 tasks
uv run scripts/run.py \
  --agent Agent-TARS \
  --benchmark LexBench-Browser \
  --mode first_n --count 5

# Use no-login subset
uv run scripts/run.py \
  --agent Agent-TARS \
  --benchmark LexBench-Browser \
  --split no_login \
  --mode first_n --count 5

Run Online-Mind2Web

# Run first 5 tasks
uv run scripts/run.py \
  --agent Agent-TARS \
  --benchmark Online-Mind2Web \
  --mode first_n --count 5

# Use Hard30 subset
uv run scripts/run.py \
  --agent Agent-TARS \
  --benchmark Online-Mind2Web \
  --version 20251214 \
  --split Hard30 \
  --mode all

Run BrowseComp

uv run scripts/run.py \
  --agent Agent-TARS \
  --benchmark BrowseComp \
  --mode first_n --count 5

Common Parameters

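| Parameter | Description | Default |
| --- | --- | --- |
| `--mode` | Run mode (`all`, `first_n`, `sample_n`, `specific`) | `all` |
| `--count` | Number of tasks | `1` |
| `--split` | Data subset | All |
| `--timeout` | Timeout in seconds | From config |
| `--skip-completed` | Skip completed tasks | `False` |
| `--debug` | Debug mode | `False` |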