Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
384 changes: 384 additions & 0 deletions .github/workflows/benchmark-gpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,384 @@
name: Benchmark GPU (PR)

# Rent an RTX 5090 on Vast.ai (hourly) and run the drift-free A/B/B/A (ABBA) paired
# prover benchmark — the same method as the CPU `/bench-abba` (scripts/bench_abba.sh) —
# but with the CUDA prover path enabled (BENCH_FEATURES=jemalloc-stats,prover/cuda).
# It builds the cli at the PR head and at main, runs N interleaved pairs on the GPU,
# posts the paired-t + Wilcoxon verdict back to the PR, then ALWAYS destroys the box.
#
# Triggered by a "/bench-gpu [N]" comment on a PR (N = pair count, default 10) or via
# workflow_dispatch. Orchestration runs on a GitHub-hosted runner; all GPU work happens
# on the rented Vast box (provisioned by the template onstart).
# NOTE(review): while the TEMP(testing) overrides below are in place, the effective
# default pair count is 1, not 10, and pushes to the test branch also trigger a run.
#
# Requires repo secrets:
#   VAST_API_KEY       — https://cloud.vast.ai/manage-keys/ (read by "Install Vast CLI")
#   VAST_TEMPLATE_HASH — hash of the "NVIDIA CUDA Lambda VM 64GB" template
#                        (read by "Create instance")

on:
  # PR conversation comments; the job-level `if` narrows this to "/bench-gpu".
  issue_comment:
    types:
      - created
  # Manual runs: benchmark the selected branch against main.
  workflow_dispatch:
    inputs:
      pairs:
        description: "Number of A/B/B/A pairs"
        default: "1"  # TEMP(testing): fast runs; restore to "10" before merge
  # TEMP(testing): a push run uses the workflow definition from the pushed branch,
  # whereas issue_comment/workflow_dispatch need the file on the default branch.
  # This lets the workflow be exercised before it is merged.
  # REMOVE this push trigger before merging.
  push:
    branches:
      - gpu_benchmarks

# Token scopes for this workflow. The write scopes cover the github-script steps,
# which add a reaction and create/update comments on the PR conversation.
permissions:
  issues: write
  pull-requests: write
  contents: read

# One benchmark per PR at a time; a newer "/bench-gpu" request replaces the older run.
#
# Workflow-level concurrency is applied to EVERY run this workflow starts, before the
# job-level `if` is evaluated. Keying the group on the PR number alone would therefore
# let any unrelated comment on the PR cancel an in-flight benchmark. Only a privileged
# "/bench-gpu" comment joins the shared per-PR group; every other run (unrelated
# comments, workflow_dispatch, push) falls through to a group unique to that run.
concurrency:
  group: >-
    benchmark-gpu-${{
    (github.event_name == 'issue_comment' &&
    startsWith(github.event.comment.body, '/bench-gpu') &&
    contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association) &&
    github.event.issue.number) ||
    github.run_id
    }}
  cancel-in-progress: true

env:
  # Inputs to the Vast offer search and instance creation. The full filter (cores,
  # RAM, disk, verified/rentable, CUDA level) is assembled in "Pick a Vast offer".
  GPU_NAME: RTX_5090
  # Upper bound for the offer's dph_total filter.
  PRICE_CAP: '3'
  # Passed as --disk when the instance is created.
  VAST_IMAGE_DISK: '64'
  # Cargo features handed to bench_abba.sh: CUDA prover path + jemalloc heap stats.
  BENCH_FEATURES: 'jemalloc-stats,prover/cuda'

jobs:
  benchmark-gpu:
    # Orchestration only: the GPU work happens over SSH on the rented instance.
    runs-on: ubuntu-latest
    # Budget for the ABBA run: dual cuda build (~15-30 min) + 2*pairs proves (~77s each).
    timeout-minutes: 180
    # Run for workflow_dispatch, or for a "/bench-gpu" PR comment whose author is a
    # MEMBER/OWNER/COLLABORATOR; skip everything else.
    # TEMP(testing): the `push` clause lets branch pushes run the job pre-merge.
    # REMOVE the push clause before merging.
    if: >-
      github.event_name == 'push' ||
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'issue_comment' &&
      github.event.issue.pull_request &&
      startsWith(github.event.comment.body, '/bench-gpu') &&
      contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
    steps:
# Works out WHAT to benchmark and HOW MANY pairs to run.
# Outputs: pr_num, head_sha (comment-triggered runs), branch (dispatch/push runs), pairs.
- name: Resolve PR ref + pair count
  id: config
  env:
    GH_TOKEN: ${{ github.token }}
    EVENT_NAME: ${{ github.event_name }}
    COMMENT_BODY: ${{ github.event.comment.body }}
    PR_NUM: ${{ github.event.issue.number }}
    DISPATCH_PAIRS: ${{ github.event.inputs.pairs }}
    DISPATCH_REF: ${{ github.ref_name }}
  run: |
    if [ "$EVENT_NAME" = "issue_comment" ]; then
      # Pin the head SHA (works for fork PRs; avoids a force-push race mid-run).
      HEAD_SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
      OUT_PR_NUM="$PR_NUM"; OUT_HEAD_SHA="$HEAD_SHA"; OUT_BRANCH=""
      # "/bench-gpu 20" -> 20 pairs; otherwise default. Only the FIRST line of the
      # comment is inspected, so a multi-line comment cannot yield several numbers.
      N=$(printf '%s\n' "$COMMENT_BODY" | sed -n '1s|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p')
      PAIRS=${N:-1} # TEMP(testing): default 1; restore to 10 before merge
    else
      # workflow_dispatch / push: compare this branch vs main.
      OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF"
      PAIRS=${DISPATCH_PAIRS:-1} # TEMP(testing): default 1; restore to 10 before merge
    fi

    # Validate the pair count. The dispatch input is free text, and `[ x -lt 1 ]`
    # merely errors on non-numeric or overflowing input, which would let the raw
    # value through. Require 1-4 digits, normalise as base 10 (so "08" is 8, not a
    # bad octal), then range-check.
    # TEMP(testing): clamp floor lowered to 1 for fast runs; restore to [2,40] before merge.
    PAIRS_OK=1
    case "$PAIRS" in
      '' | *[!0-9]* | ?????*)
        PAIRS_OK=0
        ;;
      *)
        PAIRS=$((10#$PAIRS))
        if [ "$PAIRS" -lt 1 ] || [ "$PAIRS" -gt 40 ]; then
          PAIRS_OK=0
        fi
        ;;
    esac
    if [ "$PAIRS_OK" -eq 0 ]; then
      echo "::warning::pair count out of range [1,40], defaulting to 1"
      PAIRS=1
    fi

    {
      echo "pr_num=$OUT_PR_NUM"
      echo "head_sha=$OUT_HEAD_SHA"
      echo "branch=$OUT_BRANCH"
      echo "pairs=$PAIRS"
    } >> "$GITHUB_OUTPUT"
    echo "Using $PAIRS A/B/B/A pairs"

# Comment-triggered runs only: mark the request as seen and post a "started" notice.
- name: Acknowledge (react + occupancy notice)
  if: github.event_name == 'issue_comment'
  uses: actions/github-script@v7
  env:
    PAIRS: ${{ steps.config.outputs.pairs }}
  with:
    script: |
      const { owner, repo } = context.repo;
      const pairCount = process.env.PAIRS;
      const notice = `⏳ **GPU ABBA started** — renting an RTX 5090 on Vast.ai and running ${pairCount} interleaved pairs (PR vs main) on the CUDA prover path. This takes ~1 hr; results will be posted here.`;

      // React with 👀 on the triggering comment.
      await github.rest.reactions.createForIssueComment({
        owner,
        repo,
        comment_id: context.payload.comment.id,
        content: 'eyes',
      });

      // Then post the notice on the PR conversation.
      await github.rest.issues.createComment({
        owner,
        repo,
        issue_number: context.issue.number,
        body: notice,
      });

# Installs the `vastai` CLI on the runner and stores the API key for later steps.
- name: Install Vast CLI
  env:
    VAST_API_KEY: ${{ secrets.VAST_API_KEY }}
  run: |
    pip install -q -U vastai
    vastai set api-key "${VAST_API_KEY}"

# Creates a passphrase-less ed25519 keypair for this run only.
# Output: key_path (private key; the public key is the same path + ".pub").
- name: Generate ephemeral SSH key
  id: sshkey
  run: |
    SSH_DIR="$HOME/.ssh"
    KEY_FILE="$SSH_DIR/vast_bench"
    mkdir -p "$SSH_DIR"
    ssh-keygen -t ed25519 -N "" -C "gh-actions-bench-${GITHUB_RUN_ID}" -f "$KEY_FILE" >/dev/null
    printf 'key_path=%s\n' "$KEY_FILE" >> "$GITHUB_OUTPUT"

# Finds the cheapest rentable offer matching the hardware/driver/price constraints.
# Outputs: id (offer id), price (dph_total of that offer).
- name: Pick a Vast offer
  id: offer
  env:
    # Retry the same query to ride out transient scarcity (datacenter RTX 5090s
    # are a small, fast-churning pool). Total wait ~= ATTEMPTS * INTERVAL.
    OFFER_ATTEMPTS: "10"
    OFFER_INTERVAL: "30"
    # Require driver >= this major so cudarc (default cuda-version-from-build-system)
    # matches the runtime driver. Older drivers (e.g. 575) lack newer symbols like
    # cuCtxGetDevice_v2 and the GPU path falls back to CPU. Filtered client-side in jq
    # because vast can't numerically compare the driver_version string server-side.
    MIN_DRIVER: "580"
  run: |
    # cpu_ram in the search filter is GB (the returned .cpu_ram field is MB — different
    # units), so >=96 means 96 GB. >=96000 would mean 96000 GB and match nothing.
    QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
    echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"

    # Keep only offers whose driver major >= $min, then cheapest first. The threshold
    # is passed with --argjson instead of being spliced into the program text. A
    # missing or unparsable driver_version counts as major 0, i.e. filtered out,
    # rather than raising a jq error.
    SELECT='map(select(((.driver_version // "0" | tostring | split(".")[0] | (tonumber? // 0)) >= $min))) | sort_by(.dph_total)'

    OFFER_ID=""; OFFER_PRICE=""
    for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
      vastai search offers "$QUERY" --raw -o dph_total > offers.json || true
      # The run shell is `bash -e`: an unguarded jq failure (e.g. offers.json holds a
      # non-JSON error message after a failed search) would abort the step instead of
      # retrying, so treat it as "no offer this attempt".
      OFFER_ID=$(jq -r --argjson min "$MIN_DRIVER" "$SELECT | .[0].id // empty" offers.json 2>/dev/null || true)
      OFFER_PRICE=$(jq -r --argjson min "$MIN_DRIVER" "$SELECT | .[0].dph_total // empty" offers.json 2>/dev/null || true)
      if [ -n "$OFFER_ID" ]; then
        echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)"
        break
      fi
      if [ "$attempt" -lt "$OFFER_ATTEMPTS" ]; then
        echo "No matching offer (attempt $attempt/$OFFER_ATTEMPTS); retrying in ${OFFER_INTERVAL}s..."
        sleep "$OFFER_INTERVAL"
      else
        # Last attempt: no point sleeping before giving up.
        echo "No matching offer (attempt $attempt/$OFFER_ATTEMPTS)"
      fi
    done

    if [ -z "$OFFER_ID" ]; then
      echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)"
      exit 1
    fi
    echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
    echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT"

# Rents the selected offer using the provisioning template.
# Output: id (instance id); the id is also written to $RUNNER_TEMP/vast_instance_id.
- name: Create instance
  id: instance
  env:
    VAST_TEMPLATE_HASH: ${{ secrets.VAST_TEMPLATE_HASH }}
    OFFER_ID: ${{ steps.offer.outputs.id }}
  run: |
    RESPONSE=create.json
    vastai create instance "$OFFER_ID" --template_hash "$VAST_TEMPLATE_HASH" --disk "$VAST_IMAGE_DISK" --ssh --direct --raw > "$RESPONSE"
    cat "$RESPONSE"

    # The new contract id is looked up at the top level first, then under .instances.
    INSTANCE_ID=$(jq -r '.new_contract // .instances.new_contract // empty' "$RESPONSE")
    if [ -z "$INSTANCE_ID" ]; then
      echo "::error::Failed to create Vast instance"
      exit 1
    fi

    # Write the id to disk right away: the teardown step reads this file, so the
    # instance is destroyed even if a later step fails.
    echo "$INSTANCE_ID" > "$RUNNER_TEMP/vast_instance_id"
    echo "id=$INSTANCE_ID" >> "$GITHUB_OUTPUT"
    echo "Created instance $INSTANCE_ID"

# Registers the run's public key with the new instance, retrying while it settles.
- name: Attach SSH key to instance
  env:
    IID: ${{ steps.instance.outputs.id }}
    KEY: ${{ steps.sshkey.outputs.key_path }}
  run: |
    # The ephemeral pubkey is attached to THIS instance only (its authorized_keys),
    # and goes away with the instance, so there is no account-level key to clean up.
    PUBKEY="$(cat "$KEY.pub")"
    MAX_TRIES=12
    # The instance may not accept the attach immediately after create, hence the retry.
    for ((try = 1; try <= MAX_TRIES; try++)); do
      if vastai attach ssh "$IID" "$PUBKEY"; then
        echo "Attached ssh key (attempt $try)"
        exit 0
      fi
      echo "attach failed (attempt $try/$MAX_TRIES); retrying in 10s..."
      sleep 10
    done
    echo "::error::Failed to attach ssh key to instance $IID"
    exit 1

# Polls the instance until it is 'running' with a direct SSH endpoint, then waits
# for sshd to accept the run's key.
# Outputs: host (public IP), port (host port mapped to container port 22).
- name: Wait for SSH
  id: ssh
  env:
    IID: ${{ steps.instance.outputs.id }}
    # Passed through env (like the other SSH steps) instead of being interpolated
    # into the script text.
    KEY: ${{ steps.sshkey.outputs.key_path }}
  run: |
    echo "Waiting for instance $IID to reach 'running' with SSH endpoint..."
    STATUS=""; HOST=""; PORT=""
    for _ in $(seq 1 60); do # ~10 min
      vastai show instance "$IID" --raw > inst.json || true
      # The run shell is `bash -e`: if a transient API failure leaves non-JSON in
      # inst.json, an unguarded jq error would abort the step. Treat it as
      # "not ready yet" and keep polling.
      STATUS=$(jq -r '.actual_status // empty' inst.json 2>/dev/null || true)
      # We create with --direct, so SSH straight to the public IP + the host port
      # mapped to container port 22. The .ssh_host/.ssh_port proxy fields are
      # unreliable (observed off-by-one vs the real proxy port), so use the direct
      # mapping — same endpoint `vastai ssh-url` reports.
      HOST=$(jq -r '.public_ipaddr // empty' inst.json 2>/dev/null || true)
      PORT=$(jq -r '.ports["22/tcp"][0].HostPort // empty' inst.json 2>/dev/null || true)
      echo " status=$STATUS ssh=$HOST:$PORT"
      if [ "$STATUS" = "running" ] && [ -n "$HOST" ] && [ -n "$PORT" ]; then
        break
      fi
      sleep 10
    done
    if [ "$STATUS" != "running" ] || [ -z "$HOST" ] || [ -z "$PORT" ]; then
      echo "::error::Instance never became reachable (status=$STATUS host=$HOST port=$PORT)"
      exit 1
    fi
    echo "host=$HOST" >> "$GITHUB_OUTPUT"
    echo "port=$PORT" >> "$GITHUB_OUTPUT"

    # Wait for sshd to accept our key.
    for _ in $(seq 1 30); do
      if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \
        -i "$KEY" -p "$PORT" "root@$HOST" true 2>/dev/null; then
        echo "sshd reachable"
        exit 0
      fi
      sleep 10
    done
    echo "::error::sshd did not accept connections in time"
    exit 1

# Blocks until the template's onstart bootstrap has finished on the instance.
- name: Wait for onstart provisioning
  env:
    HOST: ${{ steps.ssh.outputs.host }}
    PORT: ${{ steps.ssh.outputs.port }}
    KEY: ${{ steps.sshkey.outputs.key_path }}
  run: |
    SSH_CMD="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
    # Run one command string on the box.
    on_box() { $SSH_CMD "$@"; }

    echo "Waiting for the template onstart script to finish (Rust + LLVM + sysroot + clone)..."
    # Two completion signals, checked in this order on every poll:
    #  1. the bootstrap's final stdout line "=== done ===" in /var/log/onstart.log
    #     (where Vast captures onstart output);
    #  2. failing that, the artifacts the bootstrap leaves behind.
    polls=0
    while [ "$polls" -lt 120 ]; do # ~20 min
      polls=$((polls + 1))
      if on_box 'grep -q "=== done ===" /var/log/onstart.log 2>/dev/null'; then
        echo "onstart reported done"
        exit 0
      fi
      # shellcheck disable=SC2016 # $HOME/$(...) must expand on the remote box, not the runner
      if on_box 'test -x "$HOME/.cargo/bin/cargo" \
        && test -f /opt/lambda-vm-sysroot/include/stdlib.h \
        && test -d /workspace/lambda_vm/.git \
        && "$HOME/.cargo/bin/rustup" toolchain list 2>/dev/null | grep -q nightly-2026-02-01'; then
        echo "provisioning artifacts present"
        exit 0
      fi
      sleep 10
    done
    echo "::error::onstart provisioning did not complete in time"
    exit 1

# Runs scripts/bench_abba.sh on the rented box and captures its output.
# Files written: $RUNNER_TEMP/abba_out.txt (full log) and, on success,
# $RUNNER_TEMP/abba_result.txt (result section only). The step fails when the
# remote benchmark fails.
- name: Run GPU ABBA benchmark
  id: bench
  env:
    HOST: ${{ steps.ssh.outputs.host }}
    PORT: ${{ steps.ssh.outputs.port }}
    KEY: ${{ steps.sshkey.outputs.key_path }}
    PR_NUM: ${{ steps.config.outputs.pr_num }}
    HEAD_SHA: ${{ steps.config.outputs.head_sha }}
    BRANCH: ${{ steps.config.outputs.branch }}
    PAIRS: ${{ steps.config.outputs.pairs }}
  run: |
    # The default `run` shell is `bash -e` WITHOUT pipefail, so `ssh ... | tee` would
    # report tee's exit status and a failed benchmark would look like a success.
    set -o pipefail

    # ServerAlive*: the session runs for about an hour; keepalives stop an idle
    # connection from being dropped silently during quiet build phases.
    SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30 -o ServerAliveCountMax=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"

    # Everything below is spliced into a remote shell command line, so refuse values
    # that could carry shell syntax rather than trying to quote them through two
    # shells. Branch names outside this conservative character set are rejected.
    if ! [[ "$PAIRS" =~ ^[0-9]+$ ]]; then
      echo "::error::pair count must be a non-negative integer"
      exit 1
    fi

    # Resolve the PR side (REF_A) and the fetch needed to make it resolvable on the box.
    if [ -n "$PR_NUM" ]; then
      if ! [[ "$PR_NUM" =~ ^[0-9]+$ && "$HEAD_SHA" =~ ^[0-9a-fA-F]+$ ]]; then
        echo "::error::unexpected PR number or head SHA format"
        exit 1
      fi
      FETCH="git fetch --force origin refs/pull/$PR_NUM/head"
      REF_A="$HEAD_SHA"
    else
      if ! [[ "$BRANCH" =~ ^[A-Za-z0-9._/+@=#,%-]+$ ]]; then
        echo "::error::branch name is empty or contains characters that are unsafe to pass to the remote shell"
        exit 1
      fi
      FETCH="git fetch --force origin $BRANCH"
      REF_A="origin/$BRANCH"
    fi

    # The template clones the repo at the DEFAULT branch (main), so check out the PR
    # ref first — otherwise we'd run main's bench_abba.sh (no BENCH_FEATURES => CPU
    # build). bench_abba.sh then builds the cli at REF_A and origin/main (isolated
    # worktree), runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI +
    # Wilcoxon verdict. BENCH_FEATURES routes the build through the CUDA prover path.
    # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both
    # binaries (PTX is compiled for the detected arch); never trust a cached binary.
    REMOTE="set -e; cd /workspace/lambda_vm; \
    command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
    git fetch --force origin main; $FETCH; \
    git checkout -f $REF_A; \
    REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
    scripts/bench_abba.sh $REF_A origin/main $PAIRS"

    # Capture the exit status instead of letting `-e` stop here, so a failed run
    # still gets its log tail into the run summary before the step fails.
    BENCH_STATUS=0
    $SSH "bash -lc \"$REMOTE\"" | tee "$RUNNER_TEMP/abba_out.txt" || BENCH_STATUS=$?

    if [ "$BENCH_STATUS" -ne 0 ]; then
      {
        echo "## GPU ABBA — benchmark failed (exit $BENCH_STATUS)"
        echo '```'
        tail -n 30 "$RUNNER_TEMP/abba_out.txt"
        echo '```'
      } >> "$GITHUB_STEP_SUMMARY"
      exit "$BENCH_STATUS"
    fi

    # Extract the result section for the PR comment (same marker bench-abba.yml uses).
    sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt"

    # Surface the result in the Actions run summary too (push/workflow_dispatch
    # runs have no PR to comment on).
    {
      echo "## GPU ABBA — ethrex 20 transfers (vs main)"
      echo '```'
      cat "$RUNNER_TEMP/abba_result.txt"
      echo '```'
    } >> "$GITHUB_STEP_SUMMARY"

# Posts the verdict (or a failure excerpt) on the PR, updating the previous result
# comment if one exists. Runs even when earlier steps failed.
- name: Comment ABBA result on PR
  if: always() && github.event_name == 'issue_comment'
  uses: actions/github-script@v7
  env:
    HEAD_SHA: ${{ steps.config.outputs.head_sha }}
    PAIRS: ${{ steps.config.outputs.pairs }}
    OUTCOME: ${{ steps.bench.outcome }}
    GPU_NAME: ${{ env.GPU_NAME }}
    OFFER_PRICE: ${{ steps.offer.outputs.price }}
  with:
    script: |
      const fs = require('fs');
      const { owner, repo } = context.repo;
      const tmp = process.env.RUNNER_TEMP;

      // Missing files read as '' so the failure path works when the benchmark never ran.
      const read = (p) => { try { return fs.readFileSync(p, 'utf8').trim(); } catch { return ''; } };
      const fence = (text) => '```\n' + text + '\n```\n';
      // Keep the body comfortably below GitHub's comment size limit
      // (65536 characters — TODO confirm) when the raw log is used as a fallback.
      const MAX_LOG_CHARS = 60000;
      const clip = (text) =>
        text.length > MAX_LOG_CHARS ? '[… output truncated …]\n' + text.slice(-MAX_LOG_CHARS) : text;

      const head = (process.env.HEAD_SHA || '').slice(0, 10);
      const pairs = process.env.PAIRS;
      // Replace every underscore, not just the first one.
      const gpu = (process.env.GPU_NAME || '').replace(/_/g, ' ');
      const price = process.env.OFFER_PRICE;

      let body = `## GPU Benchmark (ABBA) — \`${head}\` vs \`main\` (${pairs} pairs)\n\n`;
      body += `<sub>${gpu} · Vast.ai datacenter${price ? ` @ \$${price}/hr` : ''} · \`prover/cuda\` · drift-free A/B/B/A</sub>\n\n`;
      if (process.env.OUTCOME === 'success') {
        const res = read(`${tmp}/abba_result.txt`) || read(`${tmp}/abba_out.txt`);
        body += fence(clip(res));
        body += '\n<sub>+ = PR faster. Trust the verdict when paired-t and Wilcoxon agree.</sub>\n';
      } else {
        const tail = read(`${tmp}/abba_out.txt`).split('\n').slice(-30).join('\n');
        if (tail) {
          body += `❌ Run failed. Last log lines:\n\n` + fence(tail);
        } else {
          // The benchmark step never produced output (e.g. no offer was found or
          // provisioning failed), so point at the workflow run instead of an empty block.
          const runUrl = `${process.env.GITHUB_SERVER_URL}/${owner}/${repo}/actions/runs/${process.env.GITHUB_RUN_ID}`;
          body += `❌ Run failed before the benchmark produced any output. See the [workflow run](${runUrl}).\n`;
        }
      }

      // Walk ALL pages: an unpaginated listComments call returns only the first page,
      // so on a busy PR the previous result comment would be missed and a duplicate
      // posted instead of an update.
      const comments = await github.paginate(github.rest.issues.listComments, {
        owner,
        repo,
        issue_number: context.issue.number,
        per_page: 100,
      });
      const marker = 'GPU Benchmark (ABBA)';
      const existing = comments.find(
        (c) => c.user?.type === 'Bot' && (c.body || '').includes(marker)
      );
      if (existing) {
        await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body });
      } else {
        await github.rest.issues.createComment({
          owner,
          repo,
          issue_number: context.issue.number,
          body,
        });
      }

# --- Teardown: ALWAYS destroy the instance (cost guardrail) ---
# Reads the id recorded by "Create instance" and destroys that instance. The destroy
# call is retried a few times: the box bills until it is gone, so one transient API
# failure should not leave it running.
- name: Destroy instance
  if: always()
  run: |
    ID_FILE="$RUNNER_TEMP/vast_instance_id"
    if [ ! -f "$ID_FILE" ]; then
      echo "No instance id recorded; nothing to destroy."
      exit 0
    fi

    IID=$(cat "$ID_FILE")
    DESTROY_ATTEMPTS=5
    echo "Destroying instance $IID"
    for attempt in $(seq 1 "$DESTROY_ATTEMPTS"); do
      # --yes: skip the interactive [y/N] confirm (CI has no tty).
      if vastai destroy instance "$IID" --yes; then
        echo "Destroy request for instance $IID accepted (attempt $attempt)"
        exit 0
      fi
      if [ "$attempt" -lt "$DESTROY_ATTEMPTS" ]; then
        echo "destroy failed (attempt $attempt/$DESTROY_ATTEMPTS); retrying in 10s..."
        sleep 10
      fi
    done
    # Still best-effort: warn loudly, but do not fail the job on teardown.
    echo "::warning::destroy instance $IID failed — check the Vast console"
1 change: 1 addition & 0 deletions .github/workflows/benchmark-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ jobs:
github.event.issue.pull_request &&
startsWith(github.event.comment.body, '/bench') &&
!startsWith(github.event.comment.body, '/bench-abba') &&
!startsWith(github.event.comment.body, '/bench-gpu') &&
contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
steps:
- name: React to comment
Expand Down
Loading
Loading