From 158278c25aae372d6b7e7c835c108206e599b311 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 16:07:44 -0300
Subject: [PATCH 1/9] ci: add gpu benchmarks
---
.github/workflows/benchmark-gpu.yml | 337 ++++++++++++++++++++++++++++
.github/workflows/benchmark-pr.yml | 1 +
infra/gpu_bench.sh | 95 ++++++++
3 files changed, 433 insertions(+)
create mode 100644 .github/workflows/benchmark-gpu.yml
create mode 100755 infra/gpu_bench.sh
diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
new file mode 100644
index 000000000..abdea98a5
--- /dev/null
+++ b/.github/workflows/benchmark-gpu.yml
@@ -0,0 +1,337 @@
+name: Benchmark GPU (PR)
+
+# Rent an RTX 5090 on Vast.ai (hourly), run the SAME headline ethrex prover
+# benchmark as benchmark-pr.yml but with the CUDA path enabled, post the absolute
+# GPU numbers back to the PR, then always destroy the instance.
+#
+# Triggered by a "/bench-gpu [N]" comment on a PR (N = prove iterations, default 3),
+# or via workflow_dispatch. The orchestration runs on a GitHub-hosted runner; all the
+# GPU work happens on the rented Vast box (provisioned by the template onstart).
+#
+# Requires repo secrets:
+# VAST_API_KEY — https://cloud.vast.ai/manage-keys/
+# VAST_TEMPLATE_HASH — hash of the "NVIDIA CUDA Lambda VM 64GB" template
+
+on:
+ workflow_dispatch:
+ inputs:
+ runs:
+ description: "Number of prove iterations"
+ default: "3"
+ issue_comment:
+ types: [created]
+
+permissions:
+ contents: read
+ pull-requests: write
+
+concurrency:
+ group: benchmark-gpu-${{ github.event.issue.number || github.run_id }}
+ cancel-in-progress: true
+
+env:
+ # Vast offer search: datacenter RTX 5090, >=16 cores, >=32GB RAM, >=64GB disk,
+ # verified + rentable, Blackwell-capable driver, under the price cap ($/hr).
+ GPU_NAME: RTX_5090
+ PRICE_CAP: "3"
+ VAST_IMAGE_DISK: "64"
+
+jobs:
+ benchmark-gpu:
+ runs-on: ubuntu-latest
+ # Skip unless: workflow_dispatch, or a "/bench-gpu" comment from a privileged author.
+ if: >-
+ github.event_name == 'workflow_dispatch' ||
+ (github.event_name == 'issue_comment' &&
+ github.event.issue.pull_request &&
+ startsWith(github.event.comment.body, '/bench-gpu') &&
+ contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
+ steps:
+ - name: React to comment
+ if: github.event_name == 'issue_comment'
+ uses: actions/github-script@v7
+ with:
+ script: |
+ await github.rest.reactions.createForIssueComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ comment_id: context.payload.comment.id,
+ content: 'eyes'
+ });
+
+ - name: Resolve PR ref + run count
+ id: config
+ env:
+ GH_TOKEN: ${{ github.token }}
+ EVENT_NAME: ${{ github.event_name }}
+ COMMENT_BODY: ${{ github.event.comment.body }}
+ PR_NUM: ${{ github.event.issue.number }}
+ DISPATCH_RUNS: ${{ github.event.inputs.runs }}
+ DISPATCH_REF: ${{ github.ref_name }}
+ run: |
+ if [ "$EVENT_NAME" = "issue_comment" ]; then
+ SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+ echo "pr_num=$PR_NUM" >> "$GITHUB_OUTPUT"
+ echo "sha=$SHA" >> "$GITHUB_OUTPUT"
+ # "/bench-gpu 5" -> 5 iterations; otherwise default.
+ N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p')
+ RUNS=${N:-3}
+ else
+ echo "pr_num=" >> "$GITHUB_OUTPUT"
+ echo "sha=$DISPATCH_REF" >> "$GITHUB_OUTPUT"
+ RUNS=${DISPATCH_RUNS:-3}
+ fi
+ # Clamp to [1,10].
+ if [ "$RUNS" -lt 1 ] 2>/dev/null || [ "$RUNS" -gt 10 ] 2>/dev/null; then
+ echo "::warning::run count out of range, defaulting to 3"
+ RUNS=3
+ fi
+ echo "runs=$RUNS" >> "$GITHUB_OUTPUT"
+ echo "Using $RUNS prove iteration(s)"
+
+ - name: Install Vast CLI
+ env:
+ VAST_API_KEY: ${{ secrets.VAST_API_KEY }}
+ run: |
+ pip install --quiet --upgrade vastai
+ vastai set api-key "$VAST_API_KEY"
+
+ - name: Register ephemeral SSH key
+ id: sshkey
+ run: |
+ mkdir -p "$HOME/.ssh"
+ KEY="$HOME/.ssh/vast_bench"
+ COMMENT="gh-actions-bench-${GITHUB_RUN_ID}"
+ ssh-keygen -t ed25519 -N "" -f "$KEY" -C "$COMMENT" >/dev/null
+ vastai create ssh-key "$(cat "$KEY.pub")"
+ echo "key_comment=$COMMENT" >> "$GITHUB_OUTPUT"
+ echo "key_path=$KEY" >> "$GITHUB_OUTPUT"
+
+ - name: Pick a Vast offer
+ id: offer
+ run: |
+ # NB: cpu_ram is in MB (32 GB = 32 * 1024 = 32768); disk_space/cuda_max_good as named.
+ QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32768 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+ echo "Query: $QUERY"
+ vastai search offers "$QUERY" --raw -o dph_total > offers.json
+ OFFER_ID=$(jq -r 'sort_by(.dph_total) | .[0].id // empty' offers.json)
+ OFFER_PRICE=$(jq -r 'sort_by(.dph_total) | .[0].dph_total // empty' offers.json)
+ if [ -z "$OFFER_ID" ]; then
+ echo "::error::No datacenter RTX 5090 offer matched (>=16 cores, >=32GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
+ exit 1
+ fi
+ echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr"
+ echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
+ echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT"
+
+ - name: Create instance
+ id: instance
+ env:
+ VAST_TEMPLATE_HASH: ${{ secrets.VAST_TEMPLATE_HASH }}
+ OFFER_ID: ${{ steps.offer.outputs.id }}
+ run: |
+ vastai create instance "$OFFER_ID" \
+ --template_hash "$VAST_TEMPLATE_HASH" \
+ --disk "$VAST_IMAGE_DISK" \
+ --ssh --direct --raw > create.json
+ cat create.json
+ IID=$(jq -r '.new_contract // .instances.new_contract // empty' create.json)
+ if [ -z "$IID" ]; then
+ echo "::error::Failed to create Vast instance"
+ exit 1
+ fi
+ # Persist immediately so teardown runs even if later steps fail.
+ echo "$IID" > "$RUNNER_TEMP/vast_instance_id"
+ echo "id=$IID" >> "$GITHUB_OUTPUT"
+ echo "Created instance $IID"
+
+ - name: Wait for SSH
+ id: ssh
+ env:
+ IID: ${{ steps.instance.outputs.id }}
+ run: |
+ echo "Waiting for instance $IID to reach 'running' with SSH endpoint..."
+ HOST=""; PORT=""
+ for _ in $(seq 1 60); do # ~10 min
+ vastai show instance "$IID" --raw > inst.json || true
+ # The step shell runs with `-e`: guard every jq call so a garbled API reply means "retry", not "abort the step".
+ STATUS=$(jq -r '.actual_status // empty' inst.json 2>/dev/null || true)
+ # Created with --direct, so SSH goes straight to the public IP + the host port mapped to
+ # container port 22. The .ssh_host/.ssh_port proxy fields are unreliable (observed off-by-one
+ # vs the real proxy port); the direct mapping is the same endpoint `vastai ssh-url` reports.
+ HOST=$(jq -r '.public_ipaddr // empty' inst.json 2>/dev/null || true)
+ PORT=$(jq -r '.ports["22/tcp"][0].HostPort // empty' inst.json 2>/dev/null || true)
+ echo " status=$STATUS ssh=$HOST:$PORT"
+ if [ "$STATUS" = "running" ] && [ -n "$HOST" ] && [ -n "$PORT" ]; then
+ break
+ fi
+ sleep 10
+ done
+ if [ "$STATUS" != "running" ] || [ -z "$HOST" ] || [ -z "$PORT" ]; then
+ echo "::error::Instance never became reachable (status=$STATUS host=$HOST port=$PORT)"
+ exit 1
+ fi
+ echo "host=$HOST" >> "$GITHUB_OUTPUT"
+ echo "port=$PORT" >> "$GITHUB_OUTPUT"
+
+ # Wait for sshd to accept our key.
+ for _ in $(seq 1 30); do
+ if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \
+ -i "${{ steps.sshkey.outputs.key_path }}" -p "$PORT" "root@$HOST" true 2>/dev/null; then
+ echo "sshd reachable"; exit 0
+ fi
+ sleep 10
+ done
+ echo "::error::sshd did not accept connections in time"
+ exit 1
+
+ - name: Wait for onstart provisioning
+ env:
+ HOST: ${{ steps.ssh.outputs.host }}
+ PORT: ${{ steps.ssh.outputs.port }}
+ KEY: ${{ steps.sshkey.outputs.key_path }}
+ run: |
+ SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+ echo "Waiting for the template onstart script to finish (Rust + LLVM + sysroot + clone)..."
+ # The bootstrap's final stdout line is "=== done ===". Vast captures onstart
+ # output to /var/log/onstart.log; fall back to checking the artifacts it leaves.
+ for _ in $(seq 1 120); do # ~20 min
+ if $SSH 'grep -q "=== done ===" /var/log/onstart.log 2>/dev/null'; then
+ echo "onstart reported done"; exit 0
+ fi
+ # shellcheck disable=SC2016 # $HOME must expand on the remote box, not the runner, hence the single quotes
+ if $SSH 'test -x "$HOME/.cargo/bin/cargo" \
+ && test -f /opt/lambda-vm-sysroot/include/stdlib.h \
+ && test -d /workspace/lambda_vm/.git \
+ && "$HOME/.cargo/bin/rustup" toolchain list 2>/dev/null | grep -q nightly-2026-02-01'; then
+ echo "provisioning artifacts present"; exit 0
+ fi
+ sleep 10
+ done
+ echo "::error::onstart provisioning did not complete in time"
+ exit 1
+
+ - name: Check out PR source on the box
+ env:
+ HOST: ${{ steps.ssh.outputs.host }}
+ PORT: ${{ steps.ssh.outputs.port }}
+ KEY: ${{ steps.sshkey.outputs.key_path }}
+ PR_NUM: ${{ steps.config.outputs.pr_num }}
+ SHA: ${{ steps.config.outputs.sha }}
+ run: |
+ SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+ if [ -n "$PR_NUM" ]; then
+ # Fetch the PR head via the base repo's pull ref (works for fork PRs too).
+ $SSH "cd /workspace/lambda_vm && git fetch --force origin 'refs/pull/${PR_NUM}/head' && git checkout --force '$SHA'"
+ else
+ # workflow_dispatch: check out the requested branch.
+ $SSH "cd /workspace/lambda_vm && git fetch --force origin '$SHA' && git checkout --force FETCH_HEAD"
+ fi
+ $SSH "cd /workspace/lambda_vm && git --no-pager log -1 --oneline"
+
+ - name: Run GPU benchmark
+ id: bench
+ env:
+ HOST: ${{ steps.ssh.outputs.host }}
+ PORT: ${{ steps.ssh.outputs.port }}
+ KEY: ${{ steps.sshkey.outputs.key_path }}
+ RUNS: ${{ steps.config.outputs.runs }}
+ run: |
+ SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+ # bash -l so ~/.bashrc (cargo env) is sourced; the script also sources it itself.
+ $SSH "bash -lc 'cd /workspace/lambda_vm && bash infra/gpu_bench.sh $RUNS'" | tee bench.log
+
+ # Parse per-run metrics (same format as the CPU bench: "Proving time:" / "Peak heap:").
+ mapfile -t TIMES < <(grep -o 'Proving time: [0-9.]*' bench.log | awk '{print $3}')
+ mapfile -t HEAPS < <(grep -o 'Peak heap: [0-9]*' bench.log | awk '{print $3}')
+ if [ "${#TIMES[@]}" -eq 0 ] || [ "${#HEAPS[@]}" -eq 0 ]; then
+ echo "::error::Failed to parse any GPU metrics from the bench output"
+ exit 1
+ fi
+ MED_POS=$(( (${#TIMES[@]} + 1) / 2 ))
+ TIME_MED=$(printf '%s\n' "${TIMES[@]}" | sort -n | awk "NR==$MED_POS")
+ HEAP_MED=$(printf '%s\n' "${HEAPS[@]}" | sort -n | awk "NR==$MED_POS")
+ ALL_TIMES=$(printf '%s\n' "${TIMES[@]}" | paste -sd '/' -)
+ ALL_HEAPS=$(printf '%s\n' "${HEAPS[@]}" | paste -sd '/' -)
+ {
+ echo "time_s=$TIME_MED"
+ echo "peak_mb=$HEAP_MED"
+ echo "all_times=$ALL_TIMES"
+ echo "all_heaps=$ALL_HEAPS"
+ } >> "$GITHUB_OUTPUT"
+
+ - name: Comment on PR
+ if: github.event_name == 'issue_comment'
+ uses: actions/github-script@v7
+ env:
+ TIME_S: ${{ steps.bench.outputs.time_s }}
+ PEAK_MB: ${{ steps.bench.outputs.peak_mb }}
+ ALL_TIMES: ${{ steps.bench.outputs.all_times }}
+ ALL_HEAPS: ${{ steps.bench.outputs.all_heaps }}
+ RUNS: ${{ steps.config.outputs.runs }}
+ GPU_NAME: ${{ env.GPU_NAME }}
+ OFFER_PRICE: ${{ steps.offer.outputs.price }}
+ COMMIT_SHA: ${{ steps.config.outputs.sha }}
+ with:
+ script: |
+ const time = process.env.TIME_S;
+ const peak = process.env.PEAK_MB;
+ const runs = parseInt(process.env.RUNS || '1');
+ const allTimes = (process.env.ALL_TIMES || '').split('/').map(t => `${t}s`).join(' / ');
+ const allHeaps = (process.env.ALL_HEAPS || '').split('/').map(h => `${h} MB`).join(' / ');
+ const nLabel = runs > 1 ? ` (median of ${runs})` : '';
+ const sha = (process.env.COMMIT_SHA || '').substring(0, 8);
+
+ let body = `## Benchmark (GPU) — ethrex 20 transfers${nLabel}\n\n`;
+ body += `GPU: ${process.env.GPU_NAME.replace('_', ' ')} · Vast.ai datacenter @ \$${process.env.OFFER_PRICE}/hr · \`prover/cuda\`\n\n`;
+ body += `| Metric | GPU |\n`;
+ body += `|--------|-----|\n`;
+ body += `| **Prove time** | ${time}s |\n`;
+ body += `| **Peak heap** | ${peak} MB |\n`;
+ if (runs > 1) {
+ body += `\nRuns — time: ${allTimes} · heap: ${allHeaps}\n`;
+ }
+ body += `\nCommit: ${sha} · Runner: Vast.ai RTX 5090\n`;
+
+ const { data: comments } = await github.rest.issues.listComments({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ });
+ const marker = 'Benchmark (GPU) — ethrex';
+ const existing = comments.find(c => c.user.type === 'Bot' && c.body.includes(marker));
+ if (existing) {
+ await github.rest.issues.updateComment({
+ owner: context.repo.owner, repo: context.repo.repo,
+ comment_id: existing.id, body,
+ });
+ } else {
+ await github.rest.issues.createComment({
+ owner: context.repo.owner, repo: context.repo.repo,
+ issue_number: context.issue.number, body,
+ });
+ }
+
+ # --- Teardown: ALWAYS destroy the instance (cost guardrail) ---
+ - name: Destroy instance
+ if: always()
+ run: |
+ if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then
+ IID=$(cat "$RUNNER_TEMP/vast_instance_id")
+ echo "Destroying instance $IID"
+ vastai destroy instance "$IID" || echo "::warning::destroy instance $IID failed — check the Vast console"
+ else
+ echo "No instance id recorded; nothing to destroy."
+ fi
+
+ - name: Remove ephemeral SSH key
+ if: always()
+ env:
+ KEY_COMMENT: ${{ steps.sshkey.outputs.key_comment }}
+ run: |
+ [ -z "$KEY_COMMENT" ] && exit 0
+ vastai show ssh-keys --raw > keys.json 2>/dev/null || exit 0
+ for kid in $(jq -r --arg c "$KEY_COMMENT" '.[] | select(.public_key | contains($c)) | .id' keys.json); do
+ echo "Deleting ssh-key $kid"
+ vastai delete ssh-key "$kid" || true
+ done
diff --git a/.github/workflows/benchmark-pr.yml b/.github/workflows/benchmark-pr.yml
index ca66bf9a7..2eaebc213 100644
--- a/.github/workflows/benchmark-pr.yml
+++ b/.github/workflows/benchmark-pr.yml
@@ -55,6 +55,7 @@ jobs:
(github.event_name == 'issue_comment' &&
github.event.issue.pull_request &&
startsWith(github.event.comment.body, '/bench') &&
+ !startsWith(github.event.comment.body, '/bench-gpu') &&
contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
steps:
- name: React to comment
diff --git a/infra/gpu_bench.sh b/infra/gpu_bench.sh
new file mode 100755
index 000000000..1557e1e02
--- /dev/null
+++ b/infra/gpu_bench.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+# Run the headline ethrex prover benchmark on a GPU box, with the CUDA path enabled.
+#
+# Usage: infra/gpu_bench.sh [runs]
+# runs number of prove iterations (default 3)
+#
+# Assumes the box was provisioned by the Vast template onstart
+# (yetanotherco/scripts/bootstrap-onstart.sh): Rust 1.94.0 + nightly-2026-02-01,
+# LLVM/clang, and the rv64 sysroot at /opt/lambda-vm-sysroot are already in place;
+# CUDA/nvcc come from the base image. This script does NOT provision — it only
+# builds with `prover/cuda`, generates the bench fixture, and runs the prove loop.
+#
+# It proves the SAME workload as the CPU benchmark (.github/workflows/benchmark-pr.yml):
+# the ethrex guest ELF against a generated 20-transfer (distinct sender->recipient)
+# block. Each run prints the CLI's "Proving time:" / "Peak heap:" lines, which the
+# orchestrating workflow parses.
+
+set -euo pipefail
+
+RUNS="${1:-3}"
+
+# Headline program (keep in sync with benchmark-pr.yml ELF/INPUT).
+ELF="executor/program_artifacts/rust/ethrex.elf"
+INPUT="executor/tests/ethrex_bench_20.bin"
+TRANSFERS=20
+
+log() { printf '\n=== %s ===\n' "$*"; }
+
+# --- 0. Locate cargo + sysroot (provisioned by the template onstart) ---------
+if [ -f "$HOME/.cargo/env" ]; then
+ # shellcheck disable=SC1091
+ . "$HOME/.cargo/env"
+fi
+export PATH="$HOME/.cargo/bin:$PATH"
+export SYSROOT_DIR="${SYSROOT_DIR:-/opt/lambda-vm-sysroot}"
+
+# --- 1. Sanity-check the GPU toolchain ---------------------------------------
+log "GPU + toolchain check"
+if ! command -v nvidia-smi >/dev/null 2>&1; then
+ echo "::error::nvidia-smi not found — no GPU driver on this box" >&2
+ exit 1
+fi
+nvidia-smi --query-gpu=name,compute_cap,driver_version --format=csv,noheader || true
+
+# nvcc may live under /usr/local/cuda/bin without being on PATH.
+if ! command -v nvcc >/dev/null 2>&1; then
+ for d in /usr/local/cuda/bin /usr/local/cuda-*/bin; do
+ if [ -x "$d/nvcc" ]; then
+ export PATH="$d:$PATH"
+ export CUDA_HOME="${CUDA_HOME:-$(dirname "$d")}"
+ break
+ fi
+ done
+fi
+if ! command -v nvcc >/dev/null 2>&1; then
+ echo "::error::nvcc not found — CUDA toolkit missing (math-cuda needs it to compile kernels)" >&2
+ exit 1
+fi
+nvcc --version | tail -n 2
+
+if ! command -v cargo >/dev/null 2>&1; then
+ echo "::error::cargo not found — template onstart provisioning incomplete" >&2
+ exit 1
+fi
+if [ ! -f "$SYSROOT_DIR/include/stdlib.h" ]; then
+ echo "::error::rv64 sysroot missing at $SYSROOT_DIR — onstart provisioning incomplete" >&2
+ exit 1
+fi
+
+# --- 2. Build the ethrex guest ELF (same target as the CPU bench) ------------
+log "building ethrex guest ELF"
+make "$ELF"
+
+# --- 3. Generate the 20-transfer fixture -------------------------------------
+log "generating $INPUT ($TRANSFERS distinct transfers)"
+( cd tooling/ethrex-fixtures && cargo build --release )
+GEN=tooling/ethrex-fixtures/target/release/ethrex-fixtures
+"$GEN" "$TRANSFERS" "$INPUT" distinct
+
+# --- 4. Build the CLI with the GPU (cuda) path -------------------------------
+# jemalloc-stats gives the deterministic "Peak heap:" line; prover/cuda routes
+# the LDE (and friends) through crypto/math-cuda. math-cuda/build.rs auto-detects
+# the RTX 5090 arch (compute_120) via nvidia-smi, so no arch pin is needed.
+log "building CLI with --features jemalloc-stats,prover/cuda"
+cargo build --release -p cli --features jemalloc-stats,prover/cuda
+
+# --- 5. Prove loop -----------------------------------------------------------
+log "proving $ELF x$RUNS (GPU)"
+for i in $(seq 1 "$RUNS"); do
+ echo "--- Run $i/$RUNS ---"
+ ./target/release/cli prove "$ELF" --private-input "$INPUT" -o /tmp/proof.bin --time
+ rm -f /tmp/proof.bin
+done
+
+log "done"
From a9d848a5ea22261340e33e3c5d66660b8e4671a4 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 16:39:04 -0300
Subject: [PATCH 2/9] add retries
---
.github/workflows/benchmark-gpu.yml | 29 ++++++++++++++++++++++-------
1 file changed, 22 insertions(+), 7 deletions(-)
diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index abdea98a5..e1750ddde 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -109,18 +109,33 @@ jobs:
- name: Pick a Vast offer
id: offer
+ env:
+ # Retry the same query to ride out transient scarcity (datacenter RTX 5090s
+ # are a small, fast-churning pool). Total wait ~= ATTEMPTS * INTERVAL.
+ OFFER_ATTEMPTS: "10"
+ OFFER_INTERVAL: "30"
run: |
- # NB: cpu_ram is in MB (32 GB = 32 * 1024 = 32768); disk_space/cuda_max_good as named.
- QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32768 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+ # cpu_ram is the machine's TOTAL RAM in MB; a 1-GPU rental usually gets most of it.
+ # The ethrex_bench_20 GPU prove peaks at ~78 GB heap, so require >=96 GB
+ # (96 * 1024 = 98304) for headroom — a 32 GB box would OOM.
+ QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=98304 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
echo "Query: $QUERY"
- vastai search offers "$QUERY" --raw -o dph_total > offers.json
- OFFER_ID=$(jq -r 'sort_by(.dph_total) | .[0].id // empty' offers.json)
- OFFER_PRICE=$(jq -r 'sort_by(.dph_total) | .[0].dph_total // empty' offers.json)
+ OFFER_ID=""
+ for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
+ vastai search offers "$QUERY" --raw -o dph_total > offers.json || true # transient API failure: fall through to the retry
+ OFFER_ID=$(jq -r 'sort_by(.dph_total) | .[0].id // empty' offers.json 2>/dev/null || true) # unparsable reply => empty => retry (step shell runs with -e)
+ OFFER_PRICE=$(jq -r 'sort_by(.dph_total) | .[0].dph_total // empty' offers.json 2>/dev/null || true)
+ if [ -n "$OFFER_ID" ]; then
+ echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)"
+ break
+ fi
+ echo "No matching offer (attempt $attempt/$OFFER_ATTEMPTS); retrying in ${OFFER_INTERVAL}s..."
+ sleep "$OFFER_INTERVAL"
+ done
if [ -z "$OFFER_ID" ]; then
- echo "::error::No datacenter RTX 5090 offer matched (>=16 cores, >=32GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
+ echo "::error::No datacenter RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
exit 1
fi
- echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr"
echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT"
From 5ef0fe2b6608ff317b9a4142f0c04134a229f3e1 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 17:00:54 -0300
Subject: [PATCH 3/9] ci: use ABBA method to run the benchmark
---
.github/workflows/benchmark-gpu.yml | 211 ++++++++++++++--------------
infra/gpu_bench.sh | 95 -------------
scripts/bench_abba.sh | 9 +-
3 files changed, 116 insertions(+), 199 deletions(-)
delete mode 100755 infra/gpu_bench.sh
diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index e1750ddde..51a7ed63b 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -1,12 +1,14 @@
name: Benchmark GPU (PR)
-# Rent an RTX 5090 on Vast.ai (hourly), run the SAME headline ethrex prover
-# benchmark as benchmark-pr.yml but with the CUDA path enabled, post the absolute
-# GPU numbers back to the PR, then always destroy the instance.
+# Rent an RTX 5090 on Vast.ai (hourly) and run the drift-free A/B/B/A (ABBA) paired
+# prover benchmark — the same method as the CPU `/bench-abba` (scripts/bench_abba.sh) —
+# but with the CUDA prover path enabled (BENCH_FEATURES=jemalloc-stats,prover/cuda).
+# It builds the cli at the PR head and at main, runs N interleaved pairs on the GPU,
+# posts the paired-t + Wilcoxon verdict back to the PR, then ALWAYS destroys the box.
#
-# Triggered by a "/bench-gpu [N]" comment on a PR (N = prove iterations, default 3),
-# or via workflow_dispatch. The orchestration runs on a GitHub-hosted runner; all the
-# GPU work happens on the rented Vast box (provisioned by the template onstart).
+# Triggered by a "/bench-gpu [N]" comment on a PR (N = pair count, default 10) or via
+# workflow_dispatch. Orchestration runs on a GitHub-hosted runner; all GPU work happens
+# on the rented Vast box (provisioned by the template onstart).
#
# Requires repo secrets:
# VAST_API_KEY — https://cloud.vast.ai/manage-keys/
@@ -15,79 +17,102 @@ name: Benchmark GPU (PR)
on:
workflow_dispatch:
inputs:
- runs:
- description: "Number of prove iterations"
- default: "3"
+ pairs:
+ description: "Number of A/B/B/A pairs"
+ default: "10"
issue_comment:
types: [created]
+ # TEMP(testing): lets the workflow run from this branch before it's on the default
+ # branch (push uses the branch's own definition; issue_comment/workflow_dispatch do
+ # not). REMOVE this push trigger before merging.
+ push:
+ branches: [gpu_benchmarks]
permissions:
contents: read
pull-requests: write
+ issues: write
concurrency:
group: benchmark-gpu-${{ github.event.issue.number || github.run_id }}
cancel-in-progress: true
env:
- # Vast offer search: datacenter RTX 5090, >=16 cores, >=32GB RAM, >=64GB disk,
- # verified + rentable, Blackwell-capable driver, under the price cap ($/hr).
+ # Vast offer search: datacenter RTX 5090, >=16 cores, >=96GB RAM (the ABBA prove
+ # peaks ~78 GB), >=64GB disk, verified + rentable, Blackwell-capable driver, <= cap.
GPU_NAME: RTX_5090
PRICE_CAP: "3"
VAST_IMAGE_DISK: "64"
+ # cli features for the ABBA build — the GPU (cuda) prover path plus jemalloc heap stats.
+ BENCH_FEATURES: "jemalloc-stats,prover/cuda"
jobs:
benchmark-gpu:
runs-on: ubuntu-latest
# Skip unless: workflow_dispatch, or a "/bench-gpu" comment from a privileged author.
+ # TEMP(testing): `github.event_name == 'push'` lets branch pushes run it pre-merge.
+ # REMOVE the push clause before merging.
if: >-
+ github.event_name == 'push' ||
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'issue_comment' &&
github.event.issue.pull_request &&
startsWith(github.event.comment.body, '/bench-gpu') &&
contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
+ # ABBA on the GPU: dual cuda build (~15-30 min) + 2*pairs proves (~77s each).
+ timeout-minutes: 180
steps:
- - name: React to comment
- if: github.event_name == 'issue_comment'
- uses: actions/github-script@v7
- with:
- script: |
- await github.rest.reactions.createForIssueComment({
- owner: context.repo.owner,
- repo: context.repo.repo,
- comment_id: context.payload.comment.id,
- content: 'eyes'
- });
-
- - name: Resolve PR ref + run count
+ - name: Resolve PR ref + pair count
id: config
env:
GH_TOKEN: ${{ github.token }}
EVENT_NAME: ${{ github.event_name }}
COMMENT_BODY: ${{ github.event.comment.body }}
PR_NUM: ${{ github.event.issue.number }}
- DISPATCH_RUNS: ${{ github.event.inputs.runs }}
+ DISPATCH_PAIRS: ${{ github.event.inputs.pairs }}
DISPATCH_REF: ${{ github.ref_name }}
run: |
if [ "$EVENT_NAME" = "issue_comment" ]; then
- SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
- echo "pr_num=$PR_NUM" >> "$GITHUB_OUTPUT"
- echo "sha=$SHA" >> "$GITHUB_OUTPUT"
- # "/bench-gpu 5" -> 5 iterations; otherwise default.
+ # Pin the head SHA (works for fork PRs; avoids a force-push race mid-run).
+ HEAD_SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+ OUT_PR_NUM="$PR_NUM"; OUT_HEAD_SHA="$HEAD_SHA"; OUT_BRANCH=""
+ # "/bench-gpu 20" -> 20 pairs; otherwise default.
N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p')
- RUNS=${N:-3}
+ PAIRS=${N:-10}
else
- echo "pr_num=" >> "$GITHUB_OUTPUT"
- echo "sha=$DISPATCH_REF" >> "$GITHUB_OUTPUT"
- RUNS=${DISPATCH_RUNS:-3}
+ # workflow_dispatch / push: compare this branch vs main.
+ OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF"
+ PAIRS=${DISPATCH_PAIRS:-10}
fi
- # Clamp to [1,10].
- if [ "$RUNS" -lt 1 ] 2>/dev/null || [ "$RUNS" -gt 10 ] 2>/dev/null; then
- echo "::warning::run count out of range, defaulting to 3"
- RUNS=3
+ # Validate, then bound to [2,40] (even is ideal so AB/BA orders balance). The digit check comes first: `[ x -lt 2 ]` errors (= false) on non-numeric input, which would otherwise slip through into the remote command line.
+ if ! [[ "$PAIRS" =~ ^[0-9]{1,3}$ ]] || [ "$PAIRS" -lt 2 ] || [ "$PAIRS" -gt 40 ]; then
+ echo "::warning::pair count out of range [2,40], defaulting to 10"
+ PAIRS=10
+ fi
- echo "runs=$RUNS" >> "$GITHUB_OUTPUT"
- echo "Using $RUNS prove iteration(s)"
+ {
+ echo "pr_num=$OUT_PR_NUM"
+ echo "head_sha=$OUT_HEAD_SHA"
+ echo "branch=$OUT_BRANCH"
+ echo "pairs=$PAIRS"
+ } >> "$GITHUB_OUTPUT"
+ echo "Using $PAIRS A/B/B/A pairs"
+
+ - name: Acknowledge (react + occupancy notice)
+ if: github.event_name == 'issue_comment'
+ uses: actions/github-script@v7
+ env:
+ PAIRS: ${{ steps.config.outputs.pairs }}
+ with:
+ script: |
+ await github.rest.reactions.createForIssueComment({
+ owner: context.repo.owner, repo: context.repo.repo,
+ comment_id: context.payload.comment.id, content: 'eyes'
+ });
+ await github.rest.issues.createComment({
+ owner: context.repo.owner, repo: context.repo.repo,
+ issue_number: context.issue.number,
+ body: `⏳ **GPU ABBA started** — renting an RTX 5090 on Vast.ai and running ${process.env.PAIRS} interleaved pairs (PR vs main) on the CUDA prover path. This takes ~1 hr; results will be posted here.`
+ });
- name: Install Vast CLI
env:
@@ -226,94 +251,76 @@ jobs:
echo "::error::onstart provisioning did not complete in time"
exit 1
- - name: Check out PR source on the box
+ - name: Run GPU ABBA benchmark
+ id: bench
env:
HOST: ${{ steps.ssh.outputs.host }}
PORT: ${{ steps.ssh.outputs.port }}
KEY: ${{ steps.sshkey.outputs.key_path }}
PR_NUM: ${{ steps.config.outputs.pr_num }}
- SHA: ${{ steps.config.outputs.sha }}
+ HEAD_SHA: ${{ steps.config.outputs.head_sha }}
+ BRANCH: ${{ steps.config.outputs.branch }}
+ PAIRS: ${{ steps.config.outputs.pairs }}
run: |
SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+
+ # Resolve the PR side (REF_A) and the fetch needed to make it resolvable on the box.
if [ -n "$PR_NUM" ]; then
- # Fetch the PR head via the base repo's pull ref (works for fork PRs too).
- $SSH "cd /workspace/lambda_vm && git fetch --force origin 'refs/pull/${PR_NUM}/head' && git checkout --force '$SHA'"
+ FETCH="git fetch --force origin refs/pull/$PR_NUM/head"
+ REF_A="$HEAD_SHA"
else
- # workflow_dispatch: check out the requested branch.
- $SSH "cd /workspace/lambda_vm && git fetch --force origin '$SHA' && git checkout --force FETCH_HEAD"
+ FETCH="git fetch --force origin $BRANCH"
+ REF_A="origin/$BRANCH"
fi
- $SSH "cd /workspace/lambda_vm && git --no-pager log -1 --oneline"
- - name: Run GPU benchmark
- id: bench
- env:
- HOST: ${{ steps.ssh.outputs.host }}
- PORT: ${{ steps.ssh.outputs.port }}
- KEY: ${{ steps.sshkey.outputs.key_path }}
- RUNS: ${{ steps.config.outputs.runs }}
- run: |
- SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
- # bash -l so ~/.bashrc (cargo env) is sourced; the script also sources it itself.
- $SSH "bash -lc 'cd /workspace/lambda_vm && bash infra/gpu_bench.sh $RUNS'" | tee bench.log
+ # bench_abba.sh builds the cli at REF_A and origin/main (isolated worktree),
+ # runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI + Wilcoxon
+ # verdict. BENCH_FEATURES routes the build through the CUDA prover path.
+ REMOTE="set -e; cd /workspace/lambda_vm; \
+ command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
+ git fetch --force origin main; $FETCH; \
+ SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
+ scripts/bench_abba.sh $REF_A origin/main $PAIRS"
- # Parse per-run metrics (same format as the CPU bench: "Proving time:" / "Peak heap:").
- mapfile -t TIMES < <(grep -o 'Proving time: [0-9.]*' bench.log | awk '{print $3}')
- mapfile -t HEAPS < <(grep -o 'Peak heap: [0-9]*' bench.log | awk '{print $3}')
- if [ "${#TIMES[@]}" -eq 0 ] || [ "${#HEAPS[@]}" -eq 0 ]; then
- echo "::error::Failed to parse any GPU metrics from the bench output"
- exit 1
- fi
- MED_POS=$(( (${#TIMES[@]} + 1) / 2 ))
- TIME_MED=$(printf '%s\n' "${TIMES[@]}" | sort -n | awk "NR==$MED_POS")
- HEAP_MED=$(printf '%s\n' "${HEAPS[@]}" | sort -n | awk "NR==$MED_POS")
- ALL_TIMES=$(printf '%s\n' "${TIMES[@]}" | paste -sd '/' -)
- ALL_HEAPS=$(printf '%s\n' "${HEAPS[@]}" | paste -sd '/' -)
- {
- echo "time_s=$TIME_MED"
- echo "peak_mb=$HEAP_MED"
- echo "all_times=$ALL_TIMES"
- echo "all_heaps=$ALL_HEAPS"
- } >> "$GITHUB_OUTPUT"
+ # pipefail: report ssh's exit status, not tee's, so a failed remote run fails this step. On success, keep only the result section for the PR comment (same marker bench-abba.yml uses).
+ set -o pipefail; $SSH "bash -lc \"$REMOTE\"" | tee "$RUNNER_TEMP/abba_out.txt"
+ sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt"
- - name: Comment on PR
- if: github.event_name == 'issue_comment'
+ - name: Comment ABBA result on PR
+ if: always() && github.event_name == 'issue_comment'
uses: actions/github-script@v7
env:
- TIME_S: ${{ steps.bench.outputs.time_s }}
- PEAK_MB: ${{ steps.bench.outputs.peak_mb }}
- ALL_TIMES: ${{ steps.bench.outputs.all_times }}
- ALL_HEAPS: ${{ steps.bench.outputs.all_heaps }}
- RUNS: ${{ steps.config.outputs.runs }}
+ HEAD_SHA: ${{ steps.config.outputs.head_sha }}
+ PAIRS: ${{ steps.config.outputs.pairs }}
+ OUTCOME: ${{ steps.bench.outcome }}
GPU_NAME: ${{ env.GPU_NAME }}
OFFER_PRICE: ${{ steps.offer.outputs.price }}
- COMMIT_SHA: ${{ steps.config.outputs.sha }}
with:
script: |
- const time = process.env.TIME_S;
- const peak = process.env.PEAK_MB;
- const runs = parseInt(process.env.RUNS || '1');
- const allTimes = (process.env.ALL_TIMES || '').split('/').map(t => `${t}s`).join(' / ');
- const allHeaps = (process.env.ALL_HEAPS || '').split('/').map(h => `${h} MB`).join(' / ');
- const nLabel = runs > 1 ? ` (median of ${runs})` : '';
- const sha = (process.env.COMMIT_SHA || '').substring(0, 8);
+ const fs = require('fs');
+ const tmp = process.env.RUNNER_TEMP;
+ const read = (p) => { try { return fs.readFileSync(p, 'utf8').trim(); } catch { return ''; } };
+ const head = (process.env.HEAD_SHA || '').slice(0, 10);
+ const pairs = process.env.PAIRS;
+ const gpu = (process.env.GPU_NAME || '').replace(/_/g, ' '); // every underscore, not only the first (multi-word GPU names)
+ const price = process.env.OFFER_PRICE;
- let body = `## Benchmark (GPU) — ethrex 20 transfers${nLabel}\n\n`;
- body += `GPU: ${process.env.GPU_NAME.replace('_', ' ')} · Vast.ai datacenter @ \$${process.env.OFFER_PRICE}/hr · \`prover/cuda\`\n\n`;
- body += `| Metric | GPU |\n`;
- body += `|--------|-----|\n`;
- body += `| **Prove time** | ${time}s |\n`;
- body += `| **Peak heap** | ${peak} MB |\n`;
- if (runs > 1) {
- body += `\nRuns — time: ${allTimes} · heap: ${allHeaps}\n`;
+ let body = `## GPU Benchmark (ABBA) — \`${head}\` vs \`main\` (${pairs} pairs)\n\n`;
+ body += `${gpu} · Vast.ai datacenter${price ? ` @ \$${price}/hr` : ''} · \`prover/cuda\` · drift-free A/B/B/A\n\n`;
+ if (process.env.OUTCOME === 'success') {
+ const res = read(`${tmp}/abba_result.txt`) || read(`${tmp}/abba_out.txt`);
+ body += '```\n' + res + '\n```\n';
+ body += '\n+ = PR faster. Trust the verdict when paired-t and Wilcoxon agree.\n';
+ } else {
+ const tail = read(`${tmp}/abba_out.txt`).split('\n').slice(-30).join('\n');
+ body += `❌ Run failed. Last log lines:\n\n` + '```\n' + tail + '\n```\n';
}
- body += `\nCommit: ${sha} · Runner: Vast.ai RTX 5090\n`;
const { data: comments } = await github.rest.issues.listComments({
- owner: context.repo.owner,
- repo: context.repo.repo,
+ owner: context.repo.owner, repo: context.repo.repo,
issue_number: context.issue.number,
});
- const marker = 'Benchmark (GPU) — ethrex';
+ const marker = 'GPU Benchmark (ABBA)';
const existing = comments.find(c => c.user.type === 'Bot' && c.body.includes(marker));
if (existing) {
await github.rest.issues.updateComment({
diff --git a/infra/gpu_bench.sh b/infra/gpu_bench.sh
deleted file mode 100755
index 1557e1e02..000000000
--- a/infra/gpu_bench.sh
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/bin/bash
-# Run the headline ethrex prover benchmark on a GPU box, with the CUDA path enabled.
-#
-# Usage: infra/gpu_bench.sh [runs]
-# runs number of prove iterations (default 3)
-#
-# Assumes the box was provisioned by the Vast template onstart
-# (yetanotherco/scripts/bootstrap-onstart.sh): Rust 1.94.0 + nightly-2026-02-01,
-# LLVM/clang, and the rv64 sysroot at /opt/lambda-vm-sysroot are already in place;
-# CUDA/nvcc come from the base image. This script does NOT provision — it only
-# builds with `prover/cuda`, generates the bench fixture, and runs the prove loop.
-#
-# It proves the SAME workload as the CPU benchmark (.github/workflows/benchmark-pr.yml):
-# the ethrex guest ELF against a generated 20-transfer (distinct sender->recipient)
-# block. Each run prints the CLI's "Proving time:" / "Peak heap:" lines, which the
-# orchestrating workflow parses.
-
-set -euo pipefail
-
-RUNS="${1:-3}"
-
-# Headline program (keep in sync with benchmark-pr.yml ELF/INPUT).
-ELF="executor/program_artifacts/rust/ethrex.elf"
-INPUT="executor/tests/ethrex_bench_20.bin"
-TRANSFERS=20
-
-log() { printf '\n=== %s ===\n' "$*"; }
-
-# --- 0. Locate cargo + sysroot (provisioned by the template onstart) ---------
-if [ -f "$HOME/.cargo/env" ]; then
- # shellcheck disable=SC1091
- . "$HOME/.cargo/env"
-fi
-export PATH="$HOME/.cargo/bin:$PATH"
-export SYSROOT_DIR="${SYSROOT_DIR:-/opt/lambda-vm-sysroot}"
-
-# --- 1. Sanity-check the GPU toolchain ---------------------------------------
-log "GPU + toolchain check"
-if ! command -v nvidia-smi >/dev/null 2>&1; then
- echo "::error::nvidia-smi not found — no GPU driver on this box" >&2
- exit 1
-fi
-nvidia-smi --query-gpu=name,compute_cap,driver_version --format=csv,noheader || true
-
-# nvcc may live under /usr/local/cuda/bin without being on PATH.
-if ! command -v nvcc >/dev/null 2>&1; then
- for d in /usr/local/cuda/bin /usr/local/cuda-*/bin; do
- if [ -x "$d/nvcc" ]; then
- export PATH="$d:$PATH"
- export CUDA_HOME="${CUDA_HOME:-$(dirname "$d")}"
- break
- fi
- done
-fi
-if ! command -v nvcc >/dev/null 2>&1; then
- echo "::error::nvcc not found — CUDA toolkit missing (math-cuda needs it to compile kernels)" >&2
- exit 1
-fi
-nvcc --version | tail -n 2
-
-if ! command -v cargo >/dev/null 2>&1; then
- echo "::error::cargo not found — template onstart provisioning incomplete" >&2
- exit 1
-fi
-if [ ! -f "$SYSROOT_DIR/include/stdlib.h" ]; then
- echo "::error::rv64 sysroot missing at $SYSROOT_DIR — onstart provisioning incomplete" >&2
- exit 1
-fi
-
-# --- 2. Build the ethrex guest ELF (same target as the CPU bench) ------------
-log "building ethrex guest ELF"
-make "$ELF"
-
-# --- 3. Generate the 20-transfer fixture -------------------------------------
-log "generating $INPUT ($TRANSFERS distinct transfers)"
-( cd tooling/ethrex-fixtures && cargo build --release )
-GEN=tooling/ethrex-fixtures/target/release/ethrex-fixtures
-"$GEN" "$TRANSFERS" "$INPUT" distinct
-
-# --- 4. Build the CLI with the GPU (cuda) path -------------------------------
-# jemalloc-stats gives the deterministic "Peak heap:" line; prover/cuda routes
-# the LDE (and friends) through crypto/math-cuda. math-cuda/build.rs auto-detects
-# the RTX 5090 arch (compute_120) via nvidia-smi, so no arch pin is needed.
-log "building CLI with --features jemalloc-stats,prover/cuda"
-cargo build --release -p cli --features jemalloc-stats,prover/cuda
-
-# --- 5. Prove loop -----------------------------------------------------------
-log "proving $ELF x$RUNS (GPU)"
-for i in $(seq 1 "$RUNS"); do
- echo "--- Run $i/$RUNS ---"
- ./target/release/cli prove "$ELF" --private-input "$INPUT" -o /tmp/proof.bin --time
- rm -f /tmp/proof.bin
-done
-
-log "done"
diff --git a/scripts/bench_abba.sh b/scripts/bench_abba.sh
index 79bfddf27..57fab5e28 100755
--- a/scripts/bench_abba.sh
+++ b/scripts/bench_abba.sh
@@ -27,6 +27,8 @@
# REF_B baseline (default: origin/main)
# N_PAIRS pairs (default: 20 -> 40 runs, ~33 min on ethrex)
# Env: REBUILD=1 forces a rebuild even if cached binaries exist.
+# BENCH_FEATURES= cargo features for the cli build (default: jemalloc-stats).
+# The GPU ABBA workflow passes "jemalloc-stats,prover/cuda" to bench the GPU path.
#
# Sizing (ethrex pair-noise sd ~1.2%, 80% power): ~12 pairs for a 1% effect,
# ~18 for 0.8%, ~32 for 0.6%. Default 20 -> solid on 0.8-1%, ~60% power at 0.6%
@@ -45,6 +47,9 @@ fi
REF_A="$1"
REF_B="${2:-origin/main}"
N_PAIRS="${3:-20}"
+# cli build features. Default matches the CPU bench; the GPU ABBA workflow overrides
+# with "jemalloc-stats,prover/cuda" to exercise the CUDA prover path.
+BENCH_FEATURES="${BENCH_FEATURES:-jemalloc-stats}"
ELF_REL="executor/program_artifacts/rust/ethrex.elf"
INPUT_REL="executor/tests/ethrex_bench_20.bin"
@@ -102,9 +107,9 @@ if [ "$need_build" = "1" ]; then
echo "==> Building both prover binaries in isolated worktree $WT"
git worktree add --detach "$WT" "$SHA_B" >/dev/null
build_cli() { # $1=sha $2=out (shared target dir -> 2nd build is incremental)
- echo "==> Building cli @ ${1:0:10} -> $2"
+ echo "==> Building cli @ ${1:0:10} -> $2 (features: $BENCH_FEATURES)"
git -C "$WT" checkout --quiet "$1"
- if ! ( cd "$WT" && cargo build --release -p cli --features jemalloc-stats >"$WORK/build_$2.log" 2>&1 ); then
+ if ! ( cd "$WT" && cargo build --release -p cli --features "$BENCH_FEATURES" >"$WORK/build_$2.log" 2>&1 ); then
echo "ERROR: cargo build failed for $2 (@ ${1:0:10}). Tail of $WORK/build_$2.log:" >&2
tail -40 "$WORK/build_$2.log" >&2
exit 1
From 337bce05248c8ec4e7ad07d1cee4142c9ddc15e8 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 17:04:29 -0300
Subject: [PATCH 4/9] ci: use 64gb ram
---
.github/workflows/benchmark-gpu.yml | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index 51a7ed63b..c885cc03b 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -38,8 +38,8 @@ concurrency:
cancel-in-progress: true
env:
- # Vast offer search: datacenter RTX 5090, >=16 cores, >=96GB RAM (the ABBA prove
- # peaks ~78 GB), >=64GB disk, verified + rentable, Blackwell-capable driver, <= cap.
+ # Vast offer search: datacenter RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk,
+ # verified + rentable, Blackwell-capable driver, <= cap.
GPU_NAME: RTX_5090
PRICE_CAP: "3"
VAST_IMAGE_DISK: "64"
@@ -140,10 +140,10 @@ jobs:
OFFER_ATTEMPTS: "10"
OFFER_INTERVAL: "30"
run: |
- # cpu_ram is the machine's TOTAL RAM in MB; a 1-GPU rental usually gets most of it.
- # The ethrex_bench_20 GPU prove peaks at ~78 GB heap, so require >=96 GB
- # (96 * 1024 = 98304) for headroom — a 32 GB box would OOM.
- QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=98304 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+ # cpu_ram is the per-instance allocated RAM in MB. Require ~64 GB; use 64000
+ # (not 65536) because the "64 GB" datacenter boxes report ~64467 MB. Peak heap
+ # scales with table parallelism (~cores/3), so a 32-core/64 GB box fits the prove.
+ QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64000 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
echo "Query: $QUERY"
OFFER_ID=""
for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
@@ -158,7 +158,7 @@ jobs:
sleep "$OFFER_INTERVAL"
done
if [ -z "$OFFER_ID" ]; then
- echo "::error::No datacenter RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
+ echo "::error::No datacenter RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
exit 1
fi
echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
From 3fce3499a6de1ae0fd4d5209ae0946862dbbc501 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 17:24:17 -0300
Subject: [PATCH 5/9] ci: remove datacenter flag
---
.github/workflows/benchmark-gpu.yml | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index c885cc03b..d9d0b203d 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -38,8 +38,8 @@ concurrency:
cancel-in-progress: true
env:
- # Vast offer search: datacenter RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk,
- # verified + rentable, Blackwell-capable driver, <= cap.
+ # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk, verified +
+ # rentable, Blackwell-capable driver, <= cap.
GPU_NAME: RTX_5090
PRICE_CAP: "3"
VAST_IMAGE_DISK: "64"
@@ -140,10 +140,9 @@ jobs:
OFFER_ATTEMPTS: "10"
OFFER_INTERVAL: "30"
run: |
- # cpu_ram is the per-instance allocated RAM in MB. Require ~64 GB; use 64000
- # (not 65536) because the "64 GB" datacenter boxes report ~64467 MB. Peak heap
- # scales with table parallelism (~cores/3), so a 32-core/64 GB box fits the prove.
- QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64000 disk_space>=64 datacenter=true verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+ # cpu_ram is per-instance allocated RAM in MB; use 64000 (not 65536) because the
+ # "64 GB" boxes report ~64467 MB.
+ QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64000 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
echo "Query: $QUERY"
OFFER_ID=""
for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
@@ -158,7 +157,7 @@ jobs:
sleep "$OFFER_INTERVAL"
done
if [ -z "$OFFER_ID" ]; then
- echo "::error::No datacenter RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
+ echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
exit 1
fi
echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
From 76a137a5542dbc3ea15e4e050e6a03440731c981 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 17:29:49 -0300
Subject: [PATCH 6/9] fix: units for RAM
---
.github/workflows/benchmark-gpu.yml | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index d9d0b203d..d88f5eb92 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -38,7 +38,7 @@ concurrency:
cancel-in-progress: true
env:
- # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM, >=64GB disk, verified +
+ # Vast offer search: RTX 5090, >=16 cores, >=96GB RAM, >=64GB disk, verified +
# rentable, Blackwell-capable driver, <= cap.
GPU_NAME: RTX_5090
PRICE_CAP: "3"
@@ -140,9 +140,9 @@ jobs:
OFFER_ATTEMPTS: "10"
OFFER_INTERVAL: "30"
run: |
- # cpu_ram is per-instance allocated RAM in MB; use 64000 (not 65536) because the
- # "64 GB" boxes report ~64467 MB.
- QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64000 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+ # cpu_ram in the search filter is GB (the returned .cpu_ram field is MB — different
+ # units), so >=96 means 96 GB. >=96000 would mean 96000 GB and match nothing.
+ QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
echo "Query: $QUERY"
OFFER_ID=""
for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
@@ -157,7 +157,7 @@ jobs:
sleep "$OFFER_INTERVAL"
done
if [ -z "$OFFER_ID" ]; then
- echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=64GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
+ echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
exit 1
fi
echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
From 09cf1e5044a19ef6008a7f278b966f78e389a57d Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 18:18:46 -0300
Subject: [PATCH 7/9] fix: min driver and ssh key
---
.github/workflows/benchmark-gpu.yml | 79 ++++++++++++++++++-----------
1 file changed, 50 insertions(+), 29 deletions(-)
diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index d88f5eb92..69b290ec6 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -19,7 +19,7 @@ on:
inputs:
pairs:
description: "Number of A/B/B/A pairs"
- default: "10"
+ default: "1" # TEMP(testing): fast runs; restore to "10" before merge
issue_comment:
types: [created]
# TEMP(testing): lets the workflow run from this branch before it's on the default
@@ -78,16 +78,16 @@ jobs:
OUT_PR_NUM="$PR_NUM"; OUT_HEAD_SHA="$HEAD_SHA"; OUT_BRANCH=""
# "/bench-gpu 20" -> 20 pairs; otherwise default.
N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-gpu[[:space:]]*\([0-9]\+\).*|\1|p')
- PAIRS=${N:-10}
+ PAIRS=${N:-1} # TEMP(testing): default 1; restore to 10 before merge
else
# workflow_dispatch / push: compare this branch vs main.
OUT_PR_NUM=""; OUT_HEAD_SHA=""; OUT_BRANCH="$DISPATCH_REF"
- PAIRS=${DISPATCH_PAIRS:-10}
+ PAIRS=${DISPATCH_PAIRS:-1} # TEMP(testing): default 1; restore to 10 before merge
fi
- # Clamp to [2,40] (even is ideal so AB/BA orders balance).
- if [ "$PAIRS" -lt 2 ] 2>/dev/null || [ "$PAIRS" -gt 40 ] 2>/dev/null; then
- echo "::warning::pair count out of range [2,40], defaulting to 10"
- PAIRS=10
+ # TEMP(testing): clamp floor lowered to 1 for fast runs; restore to [2,40] before merge. Non-numeric input (free-form dispatch value) is rejected too, since PAIRS is later interpolated into the remote command.
+ if ! [[ "$PAIRS" =~ ^[0-9]{1,2}$ ]] || [ "$PAIRS" -lt 1 ] || [ "$PAIRS" -gt 40 ]; then
+ echo "::warning::pair count out of range [1,40], defaulting to 1"
+ PAIRS=1
fi
{
echo "pr_num=$OUT_PR_NUM"
@@ -121,15 +121,12 @@ jobs:
pip install --quiet --upgrade vastai
vastai set api-key "$VAST_API_KEY"
- - name: Register ephemeral SSH key
+ - name: Generate ephemeral SSH key
id: sshkey
run: |
mkdir -p "$HOME/.ssh"
KEY="$HOME/.ssh/vast_bench"
- COMMENT="gh-actions-bench-${GITHUB_RUN_ID}"
- ssh-keygen -t ed25519 -N "" -f "$KEY" -C "$COMMENT" >/dev/null
- vastai create ssh-key "$(cat "$KEY.pub")"
- echo "key_comment=$COMMENT" >> "$GITHUB_OUTPUT"
+ ssh-keygen -t ed25519 -N "" -f "$KEY" -C "gh-actions-bench-${GITHUB_RUN_ID}" >/dev/null
echo "key_path=$KEY" >> "$GITHUB_OUTPUT"
- name: Pick a Vast offer
@@ -139,16 +136,23 @@ jobs:
# are a small, fast-churning pool). Total wait ~= ATTEMPTS * INTERVAL.
OFFER_ATTEMPTS: "10"
OFFER_INTERVAL: "30"
+ # Require driver >= this major so cudarc (default cuda-version-from-build-system)
+ # matches the runtime driver. Older drivers (e.g. 575) lack newer symbols like
+ # cuCtxGetDevice_v2 and the GPU path falls back to CPU. Filtered client-side in jq
+ # because vast can't numerically compare the driver_version string server-side.
+ MIN_DRIVER: "580"
run: |
# cpu_ram in the search filter is GB (the returned .cpu_ram field is MB — different
# units), so >=96 means 96 GB. >=96000 would mean 96000 GB and match nothing.
QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
- echo "Query: $QUERY"
+ echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
+ # Keep only offers whose driver major >= MIN_DRIVER (a missing or unparsable driver_version counts as 0 instead of aborting jq), then cheapest first.
+ SELECT="map(select(((.driver_version // \"0\") | tostring | split(\".\")[0] | tonumber? // 0) >= ${MIN_DRIVER})) | sort_by(.dph_total)"
OFFER_ID=""
for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
vastai search offers "$QUERY" --raw -o dph_total > offers.json || true
- OFFER_ID=$(jq -r 'sort_by(.dph_total) | .[0].id // empty' offers.json)
- OFFER_PRICE=$(jq -r 'sort_by(.dph_total) | .[0].dph_total // empty' offers.json)
+ OFFER_ID=$(jq -r "$SELECT | .[0].id // empty" offers.json)
+ OFFER_PRICE=$(jq -r "$SELECT | .[0].dph_total // empty" offers.json)
if [ -n "$OFFER_ID" ]; then
echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)"
break
@@ -157,7 +161,7 @@ jobs:
sleep "$OFFER_INTERVAL"
done
if [ -z "$OFFER_ID" ]; then
- echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, <= \$${PRICE_CAP}/hr)"
+ echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)"
exit 1
fi
echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
@@ -184,6 +188,25 @@ jobs:
echo "id=$IID" >> "$GITHUB_OUTPUT"
echo "Created instance $IID"
+ - name: Attach SSH key to instance
+ env:
+ IID: ${{ steps.instance.outputs.id }}
+ KEY: ${{ steps.sshkey.outputs.key_path }}
+ run: |
+ # Attach the ephemeral pubkey to THIS instance only (added to its authorized_keys).
+ # It's removed when the instance is destroyed, so no account-level key to clean up.
+ # Retry: the instance may not accept the attach immediately after create.
+ PUB="$(cat "$KEY.pub")"
+ for attempt in $(seq 1 12); do
+ if vastai attach ssh "$IID" "$PUB"; then
+ echo "Attached ssh key (attempt $attempt)"; exit 0
+ fi
+ echo "attach failed (attempt $attempt/12); retrying in 10s..."
+ sleep 10
+ done
+ echo "::error::Failed to attach ssh key to instance $IID"
+ exit 1
+
- name: Wait for SSH
id: ssh
env:
@@ -285,6 +308,15 @@ jobs:
# Extract the result section for the PR comment (same marker bench-abba.yml uses).
sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt"
+ # Surface the result in the Actions run summary too (push/workflow_dispatch
+ # runs have no PR to comment on).
+ {
+ echo "## GPU ABBA — ethrex 20 transfers (vs main)"
+ echo '```'
+ cat "$RUNNER_TEMP/abba_result.txt"
+ echo '```'
+ } >> "$GITHUB_STEP_SUMMARY"
+
- name: Comment ABBA result on PR
if: always() && github.event_name == 'issue_comment'
uses: actions/github-script@v7
@@ -340,19 +372,8 @@ jobs:
if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then
IID=$(cat "$RUNNER_TEMP/vast_instance_id")
echo "Destroying instance $IID"
- vastai destroy instance "$IID" || echo "::warning::destroy instance $IID failed — check the Vast console"
+ # --yes: skip the interactive [y/N] confirm (CI has no tty).
+ vastai destroy instance "$IID" --yes || echo "::warning::destroy instance $IID failed — check the Vast console"
else
echo "No instance id recorded; nothing to destroy."
fi
-
- - name: Remove ephemeral SSH key
- if: always()
- env:
- KEY_COMMENT: ${{ steps.sshkey.outputs.key_comment }}
- run: |
- [ -z "$KEY_COMMENT" ] && exit 0
- vastai show ssh-keys --raw > keys.json 2>/dev/null || exit 0
- for kid in $(jq -r --arg c "$KEY_COMMENT" '.[] | select(.public_key | contains($c)) | .id' keys.json); do
- echo "Deleting ssh-key $kid"
- vastai delete ssh-key "$kid" || true
- done
From f645901afff23317b78a73cd760a8bba739094a8 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 19:18:59 -0300
Subject: [PATCH 8/9] fix: rebuild binaries
---
.github/workflows/benchmark-gpu.yml | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index 69b290ec6..531000938 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -298,10 +298,12 @@ jobs:
# bench_abba.sh builds the cli at REF_A and origin/main (isolated worktree),
# runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI + Wilcoxon
# verdict. BENCH_FEATURES routes the build through the CUDA prover path.
+ # REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both
+ # binaries (PTX is compiled for the detected arch); never trust a cached binary.
REMOTE="set -e; cd /workspace/lambda_vm; \
command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
git fetch --force origin main; $FETCH; \
- SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
+ REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
scripts/bench_abba.sh $REF_A origin/main $PAIRS"
$SSH "bash -lc \"$REMOTE\"" | tee "$RUNNER_TEMP/abba_out.txt"
From 963966c0376eaa10c98ad2a002ef9672b4e98649 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Fri, 26 Jun 2026 19:25:54 -0300
Subject: [PATCH 9/9] fix: use correct sh
---
.github/workflows/benchmark-gpu.yml | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/benchmark-gpu.yml b/.github/workflows/benchmark-gpu.yml
index 531000938..e81c65f64 100644
--- a/.github/workflows/benchmark-gpu.yml
+++ b/.github/workflows/benchmark-gpu.yml
@@ -295,14 +295,17 @@ jobs:
REF_A="origin/$BRANCH"
fi
- # bench_abba.sh builds the cli at REF_A and origin/main (isolated worktree),
- # runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI + Wilcoxon
- # verdict. BENCH_FEATURES routes the build through the CUDA prover path.
+ # The template clones the repo at the DEFAULT branch (main), so check out the PR
+ # ref first — otherwise we'd run main's bench_abba.sh (no BENCH_FEATURES => CPU
+ # build). bench_abba.sh then builds the cli at REF_A and origin/main (isolated
+ # worktree), runs PAIRS interleaved A/B/B/A proves, and prints the paired-t CI +
+ # Wilcoxon verdict. BENCH_FEATURES routes the build through the CUDA prover path.
# REBUILD=1: each Vast box is fresh, GPU-specific hardware — always rebuild both
# binaries (PTX is compiled for the detected arch); never trust a cached binary.
REMOTE="set -e; cd /workspace/lambda_vm; \
command -v python3 >/dev/null || { apt-get update -qq && apt-get install -y -qq python3; }; \
git fetch --force origin main; $FETCH; \
+ git checkout -f $REF_A; \
REBUILD=1 SYSROOT_DIR=/opt/lambda-vm-sysroot BENCH_FEATURES='$BENCH_FEATURES' \
scripts/bench_abba.sh $REF_A origin/main $PAIRS"