From 86547310c98a4baaeffa749d412ef2a5320b2977 Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Fri, 26 Jun 2026 18:00:39 -0300
Subject: [PATCH] feat: recursion profiling + measurement programs

Add the measurement/profiling harness for the in-VM STARK verifier:

- `empty`-proof and `deserialize-only` bench guests + `sp1/verifier`
  cross-prover comparison, all exercising the no_std verifier.
- Expand the recursion smoke test with PC-histogram, sampled-flamegraph,
  page-count, cycle-count and per-step-breakdown diagnostics, plus the
  `make test-profile-recursion` targets and the histogram-aggregation
  CI script/workflow.
- Expose read-only `Executor::memory()`, `Memory::cells()` and
  `SymbolTable::functions()` accessors and make `flamegraph::demangle`
  public so the diagnostics can resolve guest PCs to functions.
---
 .../scripts/aggregate_recursion_histogram.py  | 126 +++
 .github/workflows/profile-recursion.yml       | 175 ++++
 Makefile                                      |  12 +-
 bench_vs/build_recursion_elfs.sh              |   7 +-
 .../deserialize-only/.cargo/config.toml       |   6 +
 bench_vs/lambda/deserialize-only/Cargo.toml   |  13 +
 bench_vs/lambda/deserialize-only/src/main.rs  |  93 ++
 bench_vs/sp1/verifier/Cargo.toml              |   3 +
 bench_vs/sp1/verifier/program/Cargo.toml      |  10 +
 bench_vs/sp1/verifier/program/src/main.rs     |  34 +
 bench_vs/sp1/verifier/script/Cargo.toml       |  13 +
 bench_vs/sp1/verifier/script/build.rs         |   5 +
 bench_vs/sp1/verifier/script/src/main.rs      |  83 ++
 executor/src/elf.rs                           |   5 +
 executor/src/flamegraph.rs                    |   2 +-
 executor/src/vm/execution.rs                  |   7 +
 executor/src/vm/memory.rs                     |   7 +
 prover/src/tests/recursion_smoke_test.rs      | 929 +++++++++++++++++-
 18 files changed, 1516 insertions(+), 14 deletions(-)
 create mode 100755 .github/scripts/aggregate_recursion_histogram.py
 create mode 100644 .github/workflows/profile-recursion.yml
 create mode 100644 bench_vs/lambda/deserialize-only/.cargo/config.toml
 create mode 100644 bench_vs/lambda/deserialize-only/Cargo.toml
 create mode 100644 bench_vs/lambda/deserialize-only/src/main.rs
 create mode 100644 bench_vs/sp1/verifier/Cargo.toml
 create mode 100644 bench_vs/sp1/verifier/program/Cargo.toml
 create mode 100644 bench_vs/sp1/verifier/program/src/main.rs
 create mode 100644 bench_vs/sp1/verifier/script/Cargo.toml
 create mode 100644 bench_vs/sp1/verifier/script/build.rs
 create mode 100644 bench_vs/sp1/verifier/script/src/main.rs
diff --git a/.github/scripts/aggregate_recursion_histogram.py b/.github/scripts/aggregate_recursion_histogram.py
new file mode 100755
index 000000000..8a12dc05e
--- /dev/null
+++ b/.github/scripts/aggregate_recursion_histogram.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""Format the recursion-guest per-function profile as a Markdown PR comment.
+
+`test_recursion_pc_histogram` prints a per-function summary table (cycles folded
+over each function's PCs, computed across the *full* histogram) followed by a
+per-address detail table. We extract the per-function table — the view that
+shows where the cycles actually go — and render it as Markdown.
+
+    Top 25 functions by cycle count (aggregated over their PCs):
+    rank          cycles        %    cum %    PCs  function (file:line)
+       1         5335072   24.95%   24.95%     72  <...>::visit_seq::<...>
+
+Reads the test's captured output from argv[1]; writes the Markdown body to
+argv[2] (or stdout).
+"""
+
+import re
+import sys
+
+# A per-function summary row: rank, cycles, pct%, cum%, pcs, function.
+# Distinguished from the per-PC detail rows by the absence of a 0x<pc> column.
+FN_ROW = re.compile(
+    r"^\s*\d+\s+(\d+)\s+([\d.]+)%\s+([\d.]+)%\s+(\d+)\s+(.*\S)\s*$"
+)
+FN_TABLE_START = re.compile(r"Top \d+ functions by cycle count")
+PC_TABLE_START = re.compile(r"Top \d+ PCs by cycle count")
+TOTAL_CYCLES = re.compile(r"Total cycles\s*:\s*(\d+)")
+UNIQUE_PCS = re.compile(r"Unique PCs\s*:\s*(\d+)")
+EXEC_TIME = re.compile(r"Exec time\s*:\s*(\S+)")
+
+
+def parse(text):
+    total_cycles = unique_pcs = exec_time = None
+    rows = []
+    in_fn_table = False
+    for line in text.splitlines():
+        if total_cycles is None and (m := TOTAL_CYCLES.search(line)):
+            total_cycles = int(m.group(1))
+        if unique_pcs is None and (m := UNIQUE_PCS.search(line)):
+            unique_pcs = int(m.group(1))
+        if exec_time is None and (m := EXEC_TIME.search(line)):
+            exec_time = m.group(1)
+        if FN_TABLE_START.search(line):
+            in_fn_table = True
+            continue
+        if PC_TABLE_START.search(line):
+            in_fn_table = False
+            continue
+        if in_fn_table and (m := FN_ROW.match(line)):
+            rows.append(
+                {
+                    "cycles": int(m.group(1)),
+                    "pct": m.group(2),
+                    "cum": m.group(3),
+                    "pcs": int(m.group(4)),
+                    "fn": m.group(5),
+                }
+            )
+    return total_cycles, unique_pcs, exec_time, rows
+
+
+def short(name, width=90):
+    return name if len(name) <= width else name[: width - 1] + "…"
+
+
+def render(total_cycles, unique_pcs, exec_time, rows, title="Recursion guest profile"):
+    if not rows:
+        return (
+            f"### {title}\n\n"
+            "> ⚠️ No per-function rows found in the test output — the run may "
+            "have failed before printing the table. Check the workflow logs.\n"
+        )
+
+    body = f"### {title}\n\n"
+    if total_cycles is not None:
+        body += f"**Total cycles:** {total_cycles:,}"
+        if unique_pcs is not None:
+            body += f" · **Unique PCs:** {unique_pcs:,}"
+        if exec_time:
+            body += f" · **Exec time:** {exec_time}"
+        body += "\n\n"
+
+    body += f"#### Top {len(rows)} functions by cycles (folded over their PCs)\n\n"
+    body += "| Rank | Cycles | % | Cum % | PCs | Function |\n"
+    body += "|-----:|-------:|--:|------:|----:|----------|\n"
+    for i, r in enumerate(rows, 1):
+        body += (
+            f"| {i} | {r['cycles']:,} | {r['pct']}% | {r['cum']}% | "
+            f"{r['pcs']} | `{short(r['fn'])}` |\n"
+        )
+
+    last_cum = rows[-1]["cum"]
+    body += (
+        f"\n<sub>Each function's cycles are summed over all its program counters "
+        f"across the full histogram; the top {len(rows)} cover {last_cum}% of total "
+        f"cycles. Percentages are of total cycles.</sub>\n"
+    )
+    return body
+
+
+def main():
+    import argparse
+
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("log", help="captured test output to parse")
+    ap.add_argument("-o", "--out", help="write Markdown here instead of stdout")
+    ap.add_argument(
+        "-t",
+        "--title",
+        default="Recursion guest profile",
+        help="section heading (e.g. the test/config name)",
+    )
+    args = ap.parse_args()
+
+    with open(args.log, "r", errors="replace") as f:
+        text = f.read()
+    body = render(*parse(text), title=args.title)
+    if args.out:
+        with open(args.out, "w") as f:
+            f.write(body)
+    else:
+        sys.stdout.write(body)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/profile-recursion.yml b/.github/workflows/profile-recursion.yml
new file mode 100644
index 000000000..420cebfcb
--- /dev/null
+++ b/.github/workflows/profile-recursion.yml
@@ -0,0 +1,175 @@
+name: Profile Recursion (PR)
+
+# Runs the recursion-guest PC histogram diagnostics (single-query and
+# multi-query, in parallel via a matrix) and posts a combined per-function
+# profile as a PR comment. Triggered by a `/profile_recursion` comment from a
+# repo member, or manually via workflow_dispatch.
+
+on:
+  workflow_dispatch:
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: read
+  pull-requests: write
+
+concurrency:
+  group: profile-recursion-${{ github.event.issue.number || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  # One job per configuration; they run in parallel and each uploads a Markdown
+  # fragment artifact. The `comment` job stitches them into one PR comment.
+  profile:
+    # Skip unless: workflow_dispatch, or "/profile_recursion" comment on a PR by a member.
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'issue_comment' &&
+       github.event.issue.pull_request &&
+       startsWith(github.event.comment.body, '/profile_recursion') &&
+       contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
+    runs-on: [self-hosted, bench]
+    timeout-minutes: 90
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: single-query
+            test: single
+            title: "Single query (blowup=2, 1 query)"
+          - name: multi-query
+            test: single
+            title: "Multi query (blowup=8, 128-bit)"
+    steps:
+      - name: React to comment
+        if: github.event_name == 'issue_comment' && matrix.name == 'single-query'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            await github.rest.reactions.createForIssueComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              comment_id: context.payload.comment.id,
+              content: 'eyes'
+            });
+
+      - name: Get PR head ref
+        id: pr-ref
+        if: github.event_name == 'issue_comment'
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUM: ${{ github.event.issue.number }}
+        run: |
+          SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+          echo "sha=$SHA" >> "$GITHUB_OUTPUT"
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ steps.pr-ref.outputs.sha || github.sha }}
+
+      - name: Setup Rust Environment
+        uses: ./.github/actions/setup-rust
+
+      - name: Add cargo to PATH
+        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+
+      - name: Run recursion PC histogram (${{ matrix.name }})
+        env:
+          TEST: ${{ matrix.test }}
+        run: |
+          # Self-provision the RISC-V sysroot in a user-writable dir (the default
+          # /opt path on the bench runner is root-owned); the guest ELF build the
+          # test triggers picks this up via the Makefile's `SYSROOT_DIR ?=`.
+          export SYSROOT_DIR="$HOME/.lambda-vm-sysroot"
+          set -o pipefail
+          make test-profile-recursion-$TEST
+
+      - name: Aggregate into a per-function fragment
+        if: always()
+        env:
+          TITLE: ${{ matrix.title }}
+        run: |
+          python3 .github/scripts/aggregate_recursion_histogram.py \
+            /tmp/hist.log --title "$TITLE" --out "/tmp/fragment-${{ matrix.name }}.md"
+          cat "/tmp/fragment-${{ matrix.name }}.md" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload fragment
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: profile-fragment-${{ matrix.name }}
+          path: /tmp/fragment-${{ matrix.name }}.md
+          retention-days: 7
+
+  # Stitch the matrix fragments into a single PR comment.
+  comment:
+    needs: profile
+    if: always() && github.event_name == 'issue_comment'
+    runs-on: [self-hosted, bench]
+    steps:
+      - name: Get PR head ref
+        id: pr-ref
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUM: ${{ github.event.issue.number }}
+        run: |
+          SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+          echo "sha=$SHA" >> "$GITHUB_OUTPUT"
+
+      - name: Download fragments
+        uses: actions/download-artifact@v4
+        with:
+          path: fragments
+          pattern: profile-fragment-*
+          merge-multiple: true
+
+      - name: Assemble comment body
+        env:
+          COMMIT_SHA: ${{ steps.pr-ref.outputs.sha }}
+        run: |
+          {
+            echo "## Recursion guest profile"
+            echo
+            # Single-query first, then multi-query, then any others.
+            for frag in fragments/fragment-single-query.md \
+                        fragments/fragment-multi-query.md; do
+              [ -f "$frag" ] && { cat "$frag"; echo; }
+            done
+            echo "<sub>Commit: ${COMMIT_SHA:0:8} · Runner: self-hosted bench</sub>"
+          } > /tmp/profile_comment.md
+          cat /tmp/profile_comment.md
+
+      - name: Comment on PR
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const body = fs.readFileSync('/tmp/profile_comment.md', 'utf8');
+
+            const { data: comments } = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+            });
+            // Reuse our own marker comment so repeated /profile_recursion runs update in place.
+            const existing = comments.find(c =>
+              c.user.type === 'Bot' &&
+              c.body.includes('Recursion guest profile')
+            );
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: existing.id,
+                body,
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body,
+              });
+            }
diff --git a/Makefile b/Makefile
index 81bc03a8c..82a990644 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 .PHONY: deps deps-linux deps-macos compile-programs-asm compile-programs-rust compile-bench \
 compile-programs clean-asm clean-rust clean-bench clean-shared clean test test-asm \
-test-rust test-executor test-flamegraph flamegraph-prover \
+test-rust test-executor test-flamegraph flamegraph-prover test-profile-recursion test-profile-recursion-single test-profile-recursion-multi \
 test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration \
 bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \
 update-ethrex-fixture-checksums check-ethrex-fixture-checksums
@@ -197,6 +197,16 @@ test-rust: compile-programs-rust
 test-flamegraph:
 	cargo test -p executor --test flamegraph
 
+
+
+test-profile-recursion: test-profile-recursion-single test-profile-recursion-multi
+
+test-profile-recursion-single: compile-programs-rust
+	cargo test --package lambda-vm-prover --lib test_recursion_pc_histogram_1query -- --ignored --nocapture
+
+test-profile-recursion-multi: compile-programs-rust
+	cargo test --package lambda-vm-prover --lib test_recursion_pc_histogram_multiquery -- --ignored --nocapture
+
 # Regenerate the committed ethrex block fixtures (see tooling/ethrex-fixtures).
 # Run after bumping the ethrex rev; README checksums are refreshed automatically.
 regen-ethrex-fixtures:
diff --git a/bench_vs/build_recursion_elfs.sh b/bench_vs/build_recursion_elfs.sh
index a1fe41f03..a529b4bbe 100755
--- a/bench_vs/build_recursion_elfs.sh
+++ b/bench_vs/build_recursion_elfs.sh
@@ -18,10 +18,10 @@ build_one() {
     echo "[recursion-elfs] building $name ..."
     (
         cd "$dir"
-        # The recursion guest pulls in lambda-vm-prover and its serde stack;
-        # pin serde to 1.0.219 (pre-`serde_core` split) so
+        # Recursion/deserialize-only guests pull in lambda-vm-prover and its
+        # serde stack; pin serde to 1.0.219 (pre-`serde_core` split) so
         # `-Z build-std=core,alloc` works.
-        if [ "$name" = "recursion" ]; then
+        if [ "$name" = "recursion" ] || [ "$name" = "deserialize-only" ]; then
             cargo "+$TOOLCHAIN" update -p serde --precise 1.0.219 2>/dev/null || true
         fi
         cargo "+$TOOLCHAIN" build --release \
@@ -35,5 +35,6 @@ build_one() {
 build_one empty
 build_one fibonacci
 build_one recursion
+build_one deserialize-only
 
 echo "[recursion-elfs] done"
diff --git a/bench_vs/lambda/deserialize-only/.cargo/config.toml b/bench_vs/lambda/deserialize-only/.cargo/config.toml
new file mode 100644
index 000000000..be730c3ec
--- /dev/null
+++ b/bench_vs/lambda/deserialize-only/.cargo/config.toml
@@ -0,0 +1,6 @@
+[target.riscv64im-lambda-vm-elf]
+rustflags = [
+  "-C", "link-arg=-e",
+  "-C", "link-arg=main",
+  "-C", "passes=lower-atomic"
+]
diff --git a/bench_vs/lambda/deserialize-only/Cargo.toml b/bench_vs/lambda/deserialize-only/Cargo.toml
new file mode 100644
index 000000000..b4a4616f4
--- /dev/null
+++ b/bench_vs/lambda/deserialize-only/Cargo.toml
@@ -0,0 +1,13 @@
+[workspace]
+
+[package]
+name = "deserialize-only-bench"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+lambda-vm-prover = { path = "../../../prover", default-features = false }
+embedded-alloc = "0.6"
+riscv = { version = "0.15", features = ["critical-section-single-hart"] }
+serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] }
+postcard = { version = "1.0", default-features = false, features = ["alloc"] }
diff --git a/bench_vs/lambda/deserialize-only/src/main.rs b/bench_vs/lambda/deserialize-only/src/main.rs
new file mode 100644
index 000000000..8627776a1
--- /dev/null
+++ b/bench_vs/lambda/deserialize-only/src/main.rs
@@ -0,0 +1,93 @@
+//! Deserialize-only counterpart to the recursion guest.
+//!
+//! Reads the same private-input blob as `recursion-bench`, postcard-decodes
+//! `(VmProof, Vec<u8>, ProofOptions)`, then commits success
+//! and halts — without ever calling `verify_with_options`. The cycle delta
+//! between this guest and `recursion-bench` is the actual cost of the STARK
+//! verifier inside the VM (everything else being equal).
+
+#![no_std]
+#![no_main]
+
+extern crate alloc;
+
+use alloc::vec::Vec;
+use core::arch::asm;
+use core::panic::PanicInfo;
+
+use embedded_alloc::TlsfHeap as Heap;
+use lambda_vm_prover::{ProofOptions, VmProof};
+// Required to pull in the riscv crate's critical-section implementation.
+use riscv as _;
+
+const PRIVATE_INPUT_START: usize = 0xFF000000;
+const SYSCALL_COMMIT: u64 = 64;
+const SYSCALL_HALT: u64 = 93;
+const MAX_MEMORY_SIZE: usize = 0xC000_0000;
+
+#[global_allocator]
+static HEAP: Heap = Heap::empty();
+
+#[panic_handler]
+fn panic(_info: &PanicInfo) -> ! {
+    loop {}
+}
+
+fn init_allocator() {
+    unsafe extern "C" {
+        static _end: u8;
+    }
+    let heap_pos = (&raw const _end) as usize;
+    unsafe { HEAP.init(heap_pos, MAX_MEMORY_SIZE - heap_pos) }
+}
+
+fn read_private_input() -> &'static [u8] {
+    let len = unsafe { core::ptr::read_volatile(PRIVATE_INPUT_START as *const u32) } as usize;
+    let data = (PRIVATE_INPUT_START + 4) as *const u8;
+    unsafe { core::slice::from_raw_parts(data, len) }
+}
+
+fn commit(bytes: &[u8]) {
+    unsafe {
+        asm!(
+            "ecall",
+            in("a0") 1u64,
+            in("a1") bytes.as_ptr(),
+            in("a2") bytes.len(),
+            in("a7") SYSCALL_COMMIT,
+        );
+    }
+}
+
+fn halt() -> ! {
+    unsafe {
+        asm!(
+            "ecall",
+            in("a0") 0u64,
+            in("a7") SYSCALL_HALT,
+            options(noreturn),
+        );
+    }
+}
+
+#[unsafe(no_mangle)]
+pub fn main() -> ! {
+    init_allocator();
+
+    let blob = read_private_input();
+    let decoded: (VmProof, Vec<u8>, ProofOptions) =
+        postcard::from_bytes(blob).expect("failed to deserialize");
+
+    // Force the commit byte to depend on the actually-decoded value. Without
+    // this, LLVM at -O3 was eliding the postcard decode entirely — the only
+    // sinks for `decoded` were `black_box(&decoded)` (which only forces the
+    // *reference* to materialize, not the pointee) and `Drop`, neither of
+    // which require the decoded bytes to be real. With the commit byte tied
+    // to a deep field of the decoded value, the decode has to run.
+    let proof_options_byte = decoded.2.blowup_factor;
+    let inner_elf_byte = *decoded.1.first().unwrap_or(&0);
+    let marker = proof_options_byte ^ inner_elf_byte;
+
+    commit(&[marker]);
+    halt()
+}
diff --git a/bench_vs/sp1/verifier/Cargo.toml b/bench_vs/sp1/verifier/Cargo.toml
new file mode 100644
index 000000000..fc24039c2
--- /dev/null
+++ b/bench_vs/sp1/verifier/Cargo.toml
@@ -0,0 +1,3 @@
+[workspace]
+members = ["program", "script"]
+resolver = "2"
diff --git a/bench_vs/sp1/verifier/program/Cargo.toml b/bench_vs/sp1/verifier/program/Cargo.toml
new file mode 100644
index 000000000..7fbc9c5ce
--- /dev/null
+++ b/bench_vs/sp1/verifier/program/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "verifier-program"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+sp1-zkvm = "6.0.1"
+lambda-vm-prover = { path = "../../../../prover", default-features = false }
+serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] }
+postcard = { version = "1.0", default-features = false, features = ["alloc"] }
diff --git a/bench_vs/sp1/verifier/program/src/main.rs b/bench_vs/sp1/verifier/program/src/main.rs
new file mode 100644
index 000000000..c63bb67ca
--- /dev/null
+++ b/bench_vs/sp1/verifier/program/src/main.rs
@@ -0,0 +1,34 @@
+//! SP1 guest that runs lambda-vm's `verify_with_options` on a single proof.
+//!
+//! Input layout (postcard-encoded `Vec<u8>` written via `SP1Stdin::write_vec`):
+//!   `(VmProof, Vec<u8>, ProofOptions)`
+//! where the inner `Vec<u8>` is the inner program's ELF bytes.
+//!
+//! Output: commits `[1u8]` on successful verify; the guest panics otherwise.
+//!
+//! Caveats:
+//! - The verifier hashes through the `keccak` crate. SP1 has a Keccak
+//!   precompile but it patches `tiny-keccak`, not `keccak`. We don't patch
+//!   here, so Keccak runs as software inside the guest. Cycle counts will be
+//!   inflated by that overhead. Worth keeping in mind when interpreting the
+//!   number relative to lambda-vm's in-VM count.
+
+#![no_main]
+
+extern crate alloc;
+
+use alloc::vec::Vec;
+
+use lambda_vm_prover::{ProofOptions, VmProof};
+
+sp1_zkvm::entrypoint!(main);
+
+pub fn main() {
+    let blob = sp1_zkvm::io::read_vec();
+    let (vm_proof, inner_elf, options): (VmProof, Vec<u8>, ProofOptions) =
+        postcard::from_bytes(&blob).expect("failed to deserialize input");
+    let ok = lambda_vm_prover::verify_with_options(&vm_proof, &inner_elf, &options, None, None)
+        .expect("verify errored");
+    assert!(ok, "inner proof failed verification");
+    sp1_zkvm::io::commit_slice(&[1u8]);
+}
diff --git a/bench_vs/sp1/verifier/script/Cargo.toml b/bench_vs/sp1/verifier/script/Cargo.toml
new file mode 100644
index 000000000..3198059bd
--- /dev/null
+++ b/bench_vs/sp1/verifier/script/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "verifier-script"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+sp1-sdk = { version = "6.0.1", features = ["blocking", "profiling"] }
+lambda-vm-prover = { path = "../../../../prover" }
+stark = { path = "../../../../crypto/stark" }
+postcard = { version = "1.0", features = ["alloc"] }
+
+[build-dependencies]
+sp1-build = "6.0.1"
diff --git a/bench_vs/sp1/verifier/script/build.rs b/bench_vs/sp1/verifier/script/build.rs
new file mode 100644
index 000000000..d6cf925d6
--- /dev/null
+++ b/bench_vs/sp1/verifier/script/build.rs
@@ -0,0 +1,5 @@
+use sp1_build::build_program_with_args;
+
+fn main() {
+    build_program_with_args("../program", Default::default());
+}
diff --git a/bench_vs/sp1/verifier/script/src/main.rs b/bench_vs/sp1/verifier/script/src/main.rs
new file mode 100644
index 000000000..86e46a710
--- /dev/null
+++ b/bench_vs/sp1/verifier/script/src/main.rs
@@ -0,0 +1,83 @@
+//! Host driver: prove an inner empty program on lambda-vm, then execute the
+//! lambda-vm verifier inside SP1's executor, printing the cycle count.
+//!
+//! Set `TRACE_FILE=profiles/verifier.json` to capture a DWARF-attributed
+//! profile (1 sample = 1 cycle). The output can be opened with
+//! `samply load profiles/verifier.json`.
+
+use std::path::PathBuf;
+
+use sp1_sdk::blocking::{Prover, ProverClient};
+use sp1_sdk::{SP1Stdin, include_elf};
+
+const VERIFIER_ELF: sp1_sdk::Elf = include_elf!("verifier-program");
+
+fn workspace_root() -> PathBuf {
+    // CARGO_MANIFEST_DIR for this crate is `<root>/bench_vs/sp1/verifier/script`.
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .ancestors()
+        .nth(4)
+        .expect("workspace root")
+        .to_path_buf()
+}
+
+fn main() {
+    sp1_sdk::utils::setup_logger();
+
+    let root = workspace_root();
+    let empty_elf_path = root
+        .join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench");
+    assert!(
+        empty_elf_path.exists(),
+        "empty-bench ELF not found at {} — run `bash bench_vs/build_recursion_elfs.sh` first",
+        empty_elf_path.display(),
+    );
+    let inner_elf = std::fs::read(&empty_elf_path).expect("read empty-bench");
+
+    let options = stark::proof::options::ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+
+    println!("[sp1-verifier] proving inner (empty, blowup=2, 1 query) ...");
+    let inner_proof = lambda_vm_prover::prove_with_options_and_inputs(
+        &inner_elf,
+        &[],
+        &options,
+        &lambda_vm_prover::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    let blob = postcard::to_allocvec(&(&inner_proof, &inner_elf, &options))
+        .expect("postcard encode failed");
+    println!("[sp1-verifier] postcard blob: {} bytes", blob.len());
+
+    let client = ProverClient::from_env();
+    let mut stdin = SP1Stdin::new();
+    stdin.write_vec(blob);
+
+    println!("[sp1-verifier] executing verifier in SP1 ...");
+    let (_, report) = client
+        .execute(VERIFIER_ELF.clone(), stdin)
+        .run()
+        .expect("execute failed");
+
+    let cycles = report.total_instruction_count();
+    println!();
+    println!("============================================================");
+    println!("  SP1 EXECUTION SUMMARY — lambda-vm verifier inside SP1");
+    println!("============================================================");
+    println!("  Total cycles : {cycles}");
+    println!();
+    println!("  Compare against lambda-vm in-VM count (~40.5B for the same");
+    println!("  proof). Both VMs target riscv64im, so word width is symmetric.");
+    println!("  Main remaining asymmetry: lambda-vm's KeccakPermute precompile");
+    println!("  is patched on its guests but SP1 does not patch `keccak` (only");
+    println!("  `tiny-keccak`), so Keccak rounds run as software in SP1 here.");
+    println!();
+    println!("  If TRACE_FILE was set, the profile was written there.");
+    println!("  Render with: samply load <trace>");
+    println!("============================================================");
+}
diff --git a/executor/src/elf.rs b/executor/src/elf.rs
index bf5624988..120436efd 100644
--- a/executor/src/elf.rs
+++ b/executor/src/elf.rs
@@ -559,4 +559,9 @@ impl SymbolTable {
     pub fn len(&self) -> usize {
         self.functions.len()
     }
+
+    /// Borrow the full function list (sorted by address).
+    pub fn functions(&self) -> &[FunctionSymbol] {
+        &self.functions
+    }
 }
diff --git a/executor/src/flamegraph.rs b/executor/src/flamegraph.rs
index f9b447d19..4764d71a2 100644
--- a/executor/src/flamegraph.rs
+++ b/executor/src/flamegraph.rs
@@ -154,7 +154,7 @@ impl FlamegraphGenerator {
 /// Demangle a Rust symbol name using the official rustc-demangle crate.
 ///
 /// Uses the alternate format (`{:#}`) to omit the hash suffix for cleaner output.
-pub(crate) fn demangle(name: &str) -> String {
+pub fn demangle(name: &str) -> String {
     // Use rustc-demangle with alternate format to omit hash
     format!("{:#}", rustc_demangle(name))
 }
diff --git a/executor/src/vm/execution.rs b/executor/src/vm/execution.rs
index cde8ab6e7..81762cf19 100644
--- a/executor/src/vm/execution.rs
+++ b/executor/src/vm/execution.rs
@@ -105,6 +105,13 @@ impl Executor {
         self.get_return_values()
     }
 
+    /// Read-only access to the executor's memory. Exposed for diagnostic
+    /// tooling that needs to inspect the final memory state (e.g. counting
+    /// distinct 4 KB pages touched) after a streaming `resume()` loop.
+    pub fn memory(&self) -> &Memory {
+        &self.memory
+    }
+
     /// Run to completion and return all logs (consumes executor)
     pub fn run(mut self) -> Result<ExecutionResult, ExecutorError> {
         let mut logs = Vec::with_capacity(CHUNK_SIZE);
diff --git a/executor/src/vm/memory.rs b/executor/src/vm/memory.rs
index b258fbd9e..28d156ae2 100644
--- a/executor/src/vm/memory.rs
+++ b/executor/src/vm/memory.rs
@@ -204,6 +204,13 @@ impl Memory {
         Ok(self.public_output.clone())
     }
 
+    /// Read-only access to the underlying 4-byte cell map. Exposed for
+    /// diagnostic tooling (e.g. counting the distinct 4 KB memory pages a
+    /// program touches) — not part of the normal execution interface.
+    pub fn cells(&self) -> &U64HashMap<[u8; 4]> {
+        &self.cells
+    }
+
     /// Pre-loads private input bytes at `PRIVATE_INPUT_START_INDEX` as a
     /// 4-byte LE length prefix followed by the raw data. The guest reads these
     /// bytes directly via normal RISC-V loads (ZisK-style memory-mapped input).
diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs
index 4ff649ba4..478be3344 100644
--- a/prover/src/tests/recursion_smoke_test.rs
+++ b/prover/src/tests/recursion_smoke_test.rs
@@ -2,7 +2,7 @@
 //!
 //! Each test:
 //! 1. Proves an inner program on the host.
-//! 2. Serializes `(VmProof, inner_elf, opts)` with postcard.
+//! 2. Serializes `(VmProof, inner_elf)` with postcard.
 //! 3. Hands that as private input to the recursion guest.
 //! 4. Proves the recursion guest's execution.
 //! 5. Verifies the outer proof.
@@ -12,6 +12,7 @@
 //! Tests are `#[ignore]`d because the outer proof runs the full STARK verifier
 //! inside the VM (minutes per run, large memory footprint).
 
+use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::process::Command;
 
@@ -44,10 +45,10 @@ fn read_guest_elf(root: &std::path::Path, name: &str, bin_name: &str) -> Vec<u8>
 }
 
 /// Minimum-security FRI parameters: blowup=2, a single FRI query. Security is
-/// intentionally terrible — used by the capacity-probing test, where the goal
-/// is the smallest possible inner proof, not a sound one.
-/// (`GoldilocksCubicProofOptions::with_blowup` derives a query count from a
-/// 128-bit target, far more than we want here.)
+/// intentionally terrible — used by the capacity-probing test and every cheap
+/// diagnostic below, where the goal is the smallest possible inner proof, not
+/// a sound one. (`GoldilocksCubicProofOptions::with_blowup` derives a query
+/// count from a 128-bit target, far more than we want here.)
 const MIN_PROOF_OPTIONS: stark::proof::options::ProofOptions =
     stark::proof::options::ProofOptions {
         blowup_factor: 2,
@@ -57,10 +58,10 @@ const MIN_PROOF_OPTIONS: stark::proof::options::ProofOptions =
     };
 
 /// Prove `inner_elf` (fed `inner_input`) under `opts`, then package
-/// `(proof, elf, opts)` into the postcard blob the recursion guest consumes as
-/// its private input. `tag` prefixes the progress lines. Returns the inner
-/// proof — callers that re-verify it on the host need it — next to the encoded
-/// blob.
+/// `(proof, elf, opts)` into the postcard blob the recursion and
+/// deserialize-only guests consume as their private input. `tag` prefixes the
+/// progress lines. Returns the inner proof — callers that re-verify it on the
+/// host need it — next to the encoded blob.
 fn prove_inner_and_encode_blob(
     tag: &str,
     inner_elf: &[u8],
@@ -85,6 +86,130 @@ fn prove_inner_and_encode_blob(
     (inner_proof, blob)
 }
 
+/// Stream a guest's execution via `Executor::resume()`, calling `on_log` for
+/// every `Log` without ever buffering the full log stream (`Executor::run`
+/// would accumulate tens of millions of `Log`s and OOM even a 125 GB box).
+/// `on_log` returns `ControlFlow::Break(())` to stop the run early (e.g. once a
+/// cycle budget is hit); `Continue(())` to keep going. `on_progress(chunks,
+/// total_cycles, elapsed)` fires once per resumed chunk; callers throttle and
+/// format their own progress lines. Returns `(total_cycles, wall_time)` —
+/// `total_cycles` counts logs actually visited, so it is exact even when a run
+/// breaks mid-chunk.
+fn drive_executor(
+    executor: &mut executor::vm::execution::Executor,
+    mut on_log: impl FnMut(&executor::vm::logs::Log) -> ControlFlow<()>,
+    mut on_progress: impl FnMut(usize, u64, std::time::Duration),
+) -> (u64, std::time::Duration) {
+    let start = std::time::Instant::now();
+    let mut total_cycles: u64 = 0;
+    let mut chunks: usize = 0;
+    while let Some(logs) = executor.resume().expect("executor resume failed") {
+        let mut stop = false;
+        for log in logs {
+            total_cycles += 1;
+            if on_log(log).is_break() {
+                stop = true;
+                break;
+            }
+        }
+        chunks += 1;
+        on_progress(chunks, total_cycles, start.elapsed());
+        if stop {
+            break;
+        }
+    }
+    (total_cycles, start.elapsed())
+}
+
+/// Resolve a guest PC to its (demangled) enclosing function name using the
+/// ELF's own symbol table — the same data `executor::flamegraph` resolves
+/// against. `<unknown>` when no function symbol covers the PC (e.g. PLT stubs
+/// or a release build that dropped symbols). No file:line: the symbol table
+/// carries function ranges only, not DWARF line info.
+fn resolve_pc(symbols: &executor::elf::SymbolTable, pc: u64) -> String {
+    symbols.lookup(pc).map_or_else(
+        || "<unknown>".to_string(),
+        |s| executor::flamegraph::demangle(&s.name),
+    )
+}
+
+/// Print a PC histogram as two tables: a per-function summary (the cycles each
+/// resolved function accounts for, folded over all its PCs) followed by the
+/// top-100 per-address detail. `pc_hist` maps program counter → cycle count.
+///
+/// The per-function view is the one that matters: an inlined kernel is spread
+/// across dozens of PCs, so the raw per-address table scatters its true cost.
+fn print_pc_histogram(
+    title: &str,
+    symbols: &executor::elf::SymbolTable,
+    pc_hist: std::collections::HashMap<u64, u64>,
+    total_cycles: u64,
+    exec_time: std::time::Duration,
+) {
+    let mut entries: Vec<(u64, u64)> = pc_hist.into_iter().collect();
+    entries.sort_unstable_by_key(|(_pc, count)| std::cmp::Reverse(*count));
+
+    // Aggregate the full histogram by resolved function, resolving each PC once.
+    let mut by_function: std::collections::HashMap<String, (u64, u64)> =
+        std::collections::HashMap::new();
+    for (pc, count) in &entries {
+        let entry = by_function.entry(resolve_pc(symbols, *pc)).or_insert((0, 0));
+        entry.0 += *count; // cycles
+        entry.1 += 1; // distinct PCs folded into this function
+    }
+    let mut fn_entries: Vec<(String, (u64, u64))> = by_function.into_iter().collect();
+    fn_entries.sort_unstable_by_key(|(_name, (cycles, _pcs))| std::cmp::Reverse(*cycles));
+
+    let pct = |n: u64| 100.0 * (n as f64) / (total_cycles as f64);
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  {title}");
+    eprintln!("============================================================");
+    eprintln!("  Total cycles : {total_cycles}");
+    eprintln!("  Unique PCs   : {}", entries.len());
+    eprintln!("  Exec time    : {exec_time:?}");
+    eprintln!();
+    eprintln!("  Top 25 functions by cycle count (aggregated over their PCs):");
+    eprintln!(
+        "  {:>4}  {:>14}  {:>7}  {:>7}  {:>5}  {}",
+        "rank", "cycles", "%", "cum %", "PCs", "function"
+    );
+    let mut fn_cumulative: u64 = 0;
+    for (rank, (name, (cycles, pcs))) in fn_entries.iter().take(25).enumerate() {
+        fn_cumulative += cycles;
+        eprintln!(
+            "  {:>4}  {:>14}  {:>6.2}%  {:>6.2}%  {:>5}  {}",
+            rank + 1,
+            cycles,
+            pct(*cycles),
+            pct(fn_cumulative),
+            pcs,
+            name,
+        );
+    }
+    eprintln!();
+    eprintln!("  Top 100 PCs by cycle count (per-address detail):");
+    eprintln!(
+        "  {:>4}  {:>18}  {:>14}  {:>7}  {:>7}  {}",
+        "rank", "pc", "cycles", "%", "cum %", "function"
+    );
+    let mut cumulative: u64 = 0;
+    for (rank, (pc, count)) in entries.iter().take(100).enumerate() {
+        cumulative += count;
+        eprintln!(
+            "  {:>4}  {:#018x}  {:>14}  {:>6.2}%  {:>6.2}%  {}",
+            rank + 1,
+            pc,
+            count,
+            pct(*count),
+            pct(cumulative),
+            resolve_pc(symbols, *pc),
+        );
+    }
+    eprintln!("============================================================");
+}
+
 /// Core pipeline: prove an inner program with the given options, hand the
 /// proof+ELF+options to the recursion guest, then prove and verify the outer
 /// proof.
@@ -172,6 +297,792 @@ fn test_recursion_smoke_1query() {
     );
 }
 
+/// Diagnostic: build the inner proof and dump the recursion guest's private-input
+/// blob to `/tmp/recursion_input.bin` so the CLI's `execute --flamegraph` can
+/// consume it.
+///
+/// Usage after running this test:
+/// ```
+/// cargo run -p cli --release -- execute \
+///     bench_vs/lambda/recursion/target/riscv64im-lambda-vm-elf/release/recursion-bench \
+///     --private-input /tmp/recursion_input.bin \
+///     --flamegraph /tmp/recursion_folded.txt
+/// cat /tmp/recursion_folded.txt | inferno-flamegraph > /tmp/recursion_flamegraph.svg
+/// ```
+#[test]
+#[ignore = "diagnostic: writes recursion private input to /tmp/recursion_input.bin"]
+fn test_dump_recursion_input() {
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("dump-input", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    let path = "/tmp/recursion_input.bin";
+    std::fs::write(path, &blob).expect("write blob");
+    eprintln!("[dump-input] wrote {} bytes to {path}", blob.len());
+}
+
+/// Diagnostic: build the inner proof + recursion guest input, then **execute
+/// only** the recursion guest (no STARK proving) and report cycle counts +
+/// trace size estimates.
+///
+/// This is the cheap way to find out how many RISC-V instructions the
+/// verifier actually executes inside the guest — a much faster signal than
+/// running the full outer prove (which can OOM on a 125 GB machine).
+#[test]
+#[ignore = "diagnostic: runs the executor only, prints cycle counts"]
+fn test_recursion_cycle_count() {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench");
+
+    // Build the inner proof exactly as the smoke test does, with the
+    // absolute-minimum FRI params so the inner is as small as possible.
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("cycle-count", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    // Execute (NOT prove) the recursion guest. `drive_executor` streams chunks
+    // and never accumulates logs in memory — this avoids the Vec<Log> blow-up
+    // that OOMs even a 125 GB server (one Log is 40 B; a few billion of them is
+    // hundreds of GB).
+    eprintln!("[cycle-count] executing recursion guest (streaming counter only) ...");
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |_log| ControlFlow::Continue(()),
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(50) {
+                eprintln!("[cycle-count]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed");
+            }
+        },
+    );
+    let cycle_count = total_cycles as usize;
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  RECURSION GUEST EXECUTION SUMMARY");
+    eprintln!("============================================================");
+    eprintln!("  Cycle count           : {cycle_count}");
+    eprintln!("  Executor wall time    : {exec_time:?}");
+    eprintln!();
+    eprintln!("  Rough memory estimate for outer prove:");
+    let bytes_per_field = 8usize;
+    let approx_columns = 250usize; // CPU + MEMW + DECODE + bus columns combined
+    let main_trace_bytes = cycle_count * approx_columns * bytes_per_field;
+    let blowup = 2usize;
+    let lde_main_bytes = main_trace_bytes * blowup;
+    eprintln!(
+        "    main trace            : ~{:.2} GB ({} cycles × ~{} cols × 8 B)",
+        main_trace_bytes as f64 / 1e9,
+        cycle_count,
+        approx_columns
+    );
+    eprintln!(
+        "    main LDE (blowup={})   : ~{:.2} GB",
+        blowup,
+        lde_main_bytes as f64 / 1e9
+    );
+    eprintln!("  (aux trace adds roughly 50% more, so peak peak ≈ 2-3× LDE)");
+    eprintln!("============================================================");
+}
+
+/// Diagnostic: count the distinct 4 KB memory pages the recursion guest
+/// touches when verifying a small inner proof.
+///
+/// We suspect the outer prover's 125 GB OOM wall is dominated by per-page
+/// PAGE-table overhead. The number of PAGE tables the prover would build
+/// equals the number of distinct 4 KB pages the executor touches — code,
+/// heap, private input, and stack. This test surfaces that count without
+/// running the prover.
+///
+/// Layout (per `executor::constants` + `bench_vs/lambda/recursion/src/main.rs`):
+/// - Code/static: whatever PT_LOAD segments the recursion ELF carries.
+/// - Heap: `_end .. 0xC000_0000` (`MAX_MEMORY_SIZE`); `TlsfHeap` scatters
+///   allocations across this region.
+/// - Private input: starts at `PRIVATE_INPUT_START_INDEX = 0xFF000000`.
+/// - Stack: top of address space (down from `STACK_TOP = 0xFFFFFFFFFFFFFFF0`).
+///
+/// Interpretation (rough):
+/// - <1,000 pages: PAGE-table overhead is not the bottleneck.
+/// - 10k-100k pages: TLSF heap fragmentation; design a tighter bump allocator
+///   and re-measure.
+/// - >100k pages: postcard decode dominates; consider streaming decode.
+#[test]
+#[ignore = "diagnostic: counts distinct 4 KB memory pages touched by the recursion guest"]
+fn test_recursion_page_count() {
+    use executor::constants::PRIVATE_INPUT_START_INDEX;
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+    use std::collections::HashSet;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("page-count", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    // Precompute the recursion ELF's PT_LOAD ranges so we can bucket code/
+    // static pages separately from heap. `Elf::load` already expands BSS
+    // (memsz > filesz) into zero-valued words, so these ranges cover
+    // .text + .rodata + .data + .bss.
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let segment_ranges: Vec<(u64, u64)> = program
+        .data
+        .iter()
+        .map(|seg| (seg.base_addr, seg.base_addr + (seg.values.len() as u64 * 4)))
+        .collect();
+    eprintln!(
+        "[page-count] recursion ELF: {} PT_LOAD segment(s)",
+        segment_ranges.len(),
+    );
+    for (i, (lo, hi)) in segment_ranges.iter().enumerate() {
+        eprintln!(
+            "[page-count]   segment[{i}]: 0x{lo:016x} .. 0x{hi:016x} ({} bytes)",
+            hi - lo,
+        );
+    }
+
+    // Stream through execution — running to completion via `Executor::run`
+    // would accumulate ~67 M `Log` records (~2.7 GB) we don't need. We only
+    // care about the *final* memory state.
+    eprintln!("[page-count] executing recursion guest (streaming) ...");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |_log| ControlFlow::Continue(()),
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(50) {
+                eprintln!("[page-count]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed");
+            }
+        },
+    );
+
+    // Collect the set of distinct 4 KB pages from every cell touched during
+    // (a) program loading, (b) private-input loading, (c) execution.
+    const PAGE_MASK: u64 = !0xFFFu64;
+    let cells = executor.memory().cells();
+    let total_cells = cells.len();
+    let pages: HashSet<u64> = cells.keys().map(|&a| a & PAGE_MASK).collect();
+
+    // Bucket by region. A "code/static" page is any page that overlaps a
+    // PT_LOAD segment. Stack lives near the top of the 64-bit address
+    // space; private input lives in the [0xFF000000, ...) window above the
+    // 3 GB heap ceiling.
+    const HEAP_CEILING: u64 = 0xC000_0000;
+    const STACK_FLOOR: u64 = 0xFFFF_FFFF_0000_0000;
+
+    let mut code_pages = 0usize;
+    let mut heap_pages = 0usize;
+    let mut private_input_pages = 0usize;
+    let mut stack_pages = 0usize;
+    let mut other_pages = 0usize;
+
+    for &page in &pages {
+        let page_end = page.saturating_add(0x1000);
+        let in_code = segment_ranges
+            .iter()
+            .any(|&(lo, hi)| page < hi && lo < page_end);
+        if in_code {
+            code_pages += 1;
+        } else if page >= STACK_FLOOR {
+            stack_pages += 1;
+        } else if page >= PRIVATE_INPUT_START_INDEX {
+            private_input_pages += 1;
+        } else if page < HEAP_CEILING {
+            heap_pages += 1;
+        } else {
+            other_pages += 1;
+        }
+    }
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  RECURSION GUEST PAGE-COUNT SUMMARY");
+    eprintln!("============================================================");
+    eprintln!("  Total cycles                  : {total_cycles}");
+    eprintln!("  Executor wall time            : {exec_time:?}");
+    eprintln!("  Memory cells touched (4 B ea) : {total_cells}");
+    eprintln!("  Distinct 4 KB pages touched   : {}", pages.len());
+    eprintln!();
+    eprintln!("  Pages per region:");
+    eprintln!("    code/static (ELF segments)     : {code_pages}");
+    eprintln!("    heap (0..0xC000_0000)          : {heap_pages}");
+    eprintln!("    private input (0xFF000000..)   : {private_input_pages}");
+    eprintln!("    stack (>= 0xFFFFFFFF_00000000) : {stack_pages}");
+    if other_pages > 0 {
+        eprintln!("    other (unclassified)           : {other_pages}");
+    }
+    eprintln!();
+    eprintln!("  Interpretation (PAGE-table overhead):");
+    eprintln!("    <1k pages     → PAGE overhead is not the bottleneck.");
+    eprintln!("    10k-100k      → TLSF heap fragmentation; try a bump alloc.");
+    eprintln!("    >100k         → postcard decode dominates; stream-decode?");
+    eprintln!("============================================================");
+}
+
+/// Build a PC histogram of the recursion guest verifying an `empty`-program
+/// inner proof produced with `inner_proof_options`, and print it via
+/// [`print_pc_histogram`] under `title`.
+///
+/// `blowup_factor` and `fri_number_of_queries` are coupled (the query count is
+/// derived from blowup for a fixed security target), so each `#[test]` below is
+/// just this runner with a different `ProofOptions` — a single query at low
+/// blowup, vs. the security-derived multi-query count at a higher blowup.
+///
+/// Streams chunks of logs via `Executor::resume()` so memory stays bounded to
+/// the histogram itself. Each PC is resolved to its enclosing function via the
+/// in-house `executor::elf::SymbolTable` (reading the recursion ELF's symbol
+/// table directly — no external tool, no DWARF dependency).
+fn run_recursion_pc_histogram(title: &str, inner_proof_options: stark::proof::options::ProofOptions) {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+    use std::collections::HashMap;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("pc-hist", &empty_elf_bytes, &[], &inner_proof_options);
+
+    eprintln!("[pc-hist] executing recursion guest (building PC histogram) ...");
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    let mut pc_hist: HashMap<u64, u64> = HashMap::with_capacity(300_000);
+    let unique = std::cell::Cell::new(0usize);
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |log| {
+            *pc_hist.entry(log.current_pc).or_insert(0) += 1;
+            unique.set(pc_hist.len());
+            ControlFlow::Continue(())
+        },
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(500) {
+                eprintln!(
+                    "[pc-hist]   ... {chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}",
+                    unique.get()
+                );
+            }
+        },
+    );
+
+    // Resolve PCs to functions directly from the ELF's symbol table.
+    let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes);
+    print_pc_histogram(title, &symbols, pc_hist, total_cycles, exec_time);
+}
+
+/// Diagnostic: PC histogram of the recursion guest with a **single** FRI query
+/// at blowup=2 — the cheapest verifier run, dominated by fixed setup cost
+/// (decode, allocator, postcard) rather than per-query FRI/Merkle work.
+#[test]
+#[ignore = "diagnostic: ~8 minutes; prints PC histogram of the verifier-in-VM"]
+fn test_recursion_pc_histogram_1query() {
+    run_recursion_pc_histogram(
+        "RECURSION GUEST PC HISTOGRAM (blowup=2, 1 query)",
+        MIN_PROOF_OPTIONS,
+    );
+}
+
+/// Diagnostic: PC histogram of the recursion guest at **128-bit security**
+/// (blowup=8, FRI query count derived by the Johnson Bound Regime — tens of
+/// queries). Compared against the single-query runs, weight shifts toward the
+/// verifier's per-query FRI-layer / Merkle-opening and field arithmetic.
+#[test]
+#[ignore = "diagnostic: heavy; PC histogram of the multi-query verifier-in-VM"]
+fn test_recursion_pc_histogram_multiquery() {
+    let inner_proof_options = crate::GoldilocksCubicProofOptions::with_blowup(8)
+        .expect("blowup=8 is always valid");
+    run_recursion_pc_histogram(
+        &format!(
+            "RECURSION GUEST PC HISTOGRAM (blowup=8, {} queries, 128-bit)",
+            inner_proof_options.fri_number_of_queries
+        ),
+        inner_proof_options,
+    );
+}
+
+/// Diagnostic: build a **sampled** call-stack histogram of the recursion guest.
+///
+/// Like `test_recursion_pc_histogram` but groups by full call stack (not PC).
+/// To stay fast, only every `SAMPLE_RATE`-th log is recorded into the histogram.
+/// The call stack itself is updated on every log (skipping would corrupt it).
+///
+/// Output is written to `/tmp/recursion_folded_sampled.txt` in
+/// inferno-flamegraph "folded stacks" format. Pipe it through:
+///
+///     cat /tmp/recursion_folded_sampled.txt | inferno-flamegraph > svg.svg
+///
+/// Expect ~10-20 minutes for SAMPLE_RATE=100 on a 40B-cycle guest.
+#[test]
+#[ignore = "diagnostic: sampled flamegraph for the verifier-in-VM"]
+fn test_recursion_sampled_flamegraph() {
+    use executor::elf::Elf;
+    use executor::flamegraph::FlamegraphGenerator;
+    use executor::vm::execution::Executor;
+    use std::io::BufWriter;
+
+    /// 1 in N logs is fed to `process_logs`, which both updates the call
+    /// stack and records a sample. At 1, every cycle goes through — the call
+    /// stack stays exactly in sync with execution so frame widths are
+    /// trustworthy, but the per-cycle cost (~57µs) limits how many cycles
+    /// we can cover within a wall-clock budget.
+    ///
+    /// At SAMPLE_RATE > 1, every CALL/RETURN that lands on a skipped cycle
+    /// silently desyncs the stack, producing the "stuck-in-visit_seq" effect
+    /// we saw at 1:1000. Use values > 1 only when stack accuracy is
+    /// expendable.
+    const SAMPLE_RATE: usize = 1;
+
+    /// Stop the executor early once we've covered this many cycles.
+    /// Set to 0 to run to completion (40B+ cycles, hours at SAMPLE_RATE=1).
+    /// At SAMPLE_RATE=1, ~57µs per cycle means 5M cycles ≈ 5 min wall time.
+    const CYCLE_BUDGET: u64 = 5_000_000;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("sampled-fg", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    eprintln!("[sampled-fg] executing recursion guest (sampling 1-in-{SAMPLE_RATE}) ...",);
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes);
+    let entry_point = program.entry_point;
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    // Build our own instruction cache from the same segments `Executor::new`
+    // decodes internally. Owning it (rather than reading `executor.instructions`
+    // mid-loop) is what lets the per-log closure call `process_logs` without
+    // borrowing `executor`, which `drive_executor` holds mutably for `resume()`.
+    let instructions = executor::vm::execution::InstructionCache::new(&program.data)
+        .expect("instruction cache build failed");
+
+    // RefCell so the per-log closure (`process_logs`, &mut self) and the
+    // progress closure (`write_folded`, &self) can both reach the generator —
+    // their calls never overlap, so the runtime borrow check never trips.
+    let generator = std::cell::RefCell::new(FlamegraphGenerator::new(symbols, entry_point));
+
+    // Path is defined here (not after the loop) so the periodic checkpoint
+    // writes below can target it. The final write at the end still happens.
+    let path = "/tmp/recursion_folded_sampled.txt";
+
+    let mut i = 0usize;
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |log| {
+            // 1-in-SAMPLE_RATE logs are fed to `process_logs`. At SAMPLE_RATE==1
+            // this is the identity filter (`_ % 1 == 0`); the `#[allow]` keeps
+            // the general form so SAMPLE_RATE can be bumped without touching the
+            // body. Skipped logs lose stack accuracy — acceptable diagnostic
+            // quality at higher rates.
+            #[allow(clippy::modulo_one)]
+            let take = i % SAMPLE_RATE == 0;
+            if take {
+                generator
+                    .borrow_mut()
+                    .process_logs(std::slice::from_ref(log), &instructions)
+                    .expect("flamegraph process_logs");
+            }
+            i += 1;
+
+            // Early exit once we've covered the cycle budget. The dominant hot
+            // kernels are ~uniform across the verifier's runtime, so a partial
+            // run still surfaces them. `#[allow]` lets CYCLE_BUDGET be const-0
+            // (full run) without tripping clippy.
+            #[allow(clippy::absurd_extreme_comparisons)]
+            if CYCLE_BUDGET > 0 && i as u64 >= CYCLE_BUDGET {
+                eprintln!("[sampled-fg] hit cycle budget ({CYCLE_BUDGET} cycles), stopping early");
+                ControlFlow::Break(())
+            } else {
+                ControlFlow::Continue(())
+            }
+        },
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(500) {
+                eprintln!("[sampled-fg]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed");
+                // Checkpoint: re-write the folded file in place so a killed run
+                // still leaves a usable (if partial) flamegraph on disk.
+                let file = std::fs::File::create(path).expect("create output file");
+                let mut writer = BufWriter::new(file);
+                generator
+                    .borrow()
+                    .write_folded(&mut writer)
+                    .expect("write folded output");
+            }
+        },
+    );
+
+    let file = std::fs::File::create(path).expect("create output file");
+    let mut writer = BufWriter::new(file);
+    generator
+        .borrow()
+        .write_folded(&mut writer)
+        .expect("write folded output");
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  SAMPLED FLAMEGRAPH SUMMARY");
+    eprintln!("============================================================");
+    eprintln!("  Total cycles : {total_cycles}");
+    eprintln!("  Sample rate  : 1 in {SAMPLE_RATE}");
+    eprintln!("  Exec time    : {exec_time:?}");
+    eprintln!("  Output file  : {path}");
+    eprintln!("============================================================");
+    eprintln!();
+    eprintln!("  To render SVG (requires inferno):");
+    eprintln!("    cat {path} | inferno-flamegraph > /tmp/recursion_flamegraph_sampled.svg");
+    eprintln!("============================================================");
+}
+
+/// Diagnostic: host-side per-step timings for the verifier.
+///
+/// Runs an inner prove (empty guest, blowup=2, 1 query) and then verifies it
+/// on the host. When built with `--features stark/instruments`, the verifier
+/// prints `Time spent: ...` for each of the four steps (replay challenges,
+/// composition polynomial, FRI, DEEP openings) plus the step-1-replay it
+/// does before step 2. Lets us see the host-side split in seconds, without
+/// running anything inside the VM.
+///
+/// Usage:
+/// ```
+/// cargo test --release -p lambda-vm-prover --features stark/instruments \
+///   --lib test_host_verify_step_timings -- --ignored --nocapture
+/// ```
+#[test]
+#[ignore = "diagnostic: prints host-side verifier step timings"]
+fn test_host_verify_step_timings() {
+    let root = workspace_root();
+    let empty_path =
+        root.join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench");
+    if !empty_path.exists() {
+        build_elfs(&root);
+    }
+    let empty_elf_bytes = std::fs::read(&empty_path).expect("read empty-bench");
+
+    let inner_proof_options = MIN_PROOF_OPTIONS;
+
+    eprintln!("[host-verify] proving empty (blowup=2, fri_queries=1) ...");
+    let inner_proof = crate::prove_with_options_and_inputs(
+        &empty_elf_bytes,
+        &[],
+        &inner_proof_options,
+        &crate::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    eprintln!("[host-verify] verifying on host (with instruments) ...");
+    let ok =
+        crate::verify_with_options(&inner_proof, &empty_elf_bytes, &inner_proof_options, None, None)
+            .expect("verify errored");
+    assert!(ok, "proof must verify");
+    eprintln!("[host-verify] verified OK");
+}
+
+/// Diagnostic: cycle count for the **deserialize-only** counterpart of the
+/// recursion guest. Same input layout
+/// (`(VmProof, Vec<u8>, ProofOptions)`) and same proof, but
+/// the guest just postcard-decodes the blob and halts — it never calls
+/// `verify_with_options`.
+///
+/// The cycle delta between this and `test_recursion_cycle_count` is the
+/// actual cost of the STARK verifier inside the VM. Historically (40.5 B-cycle
+/// recursion guest) postcard decode was ~15.6 M cycles — negligible. Now that
+/// the recursion guest is ~67 M cycles, the same absolute cost would be ~23%
+/// of total; this test re-measures it.
+#[test]
+#[ignore = "diagnostic: runs the deserialize-only guest, prints cycle count"]
+fn test_deserialize_only_cycle_count() {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let deser_elf_bytes = read_guest_elf(&root, "deserialize-only", "deserialize-only-bench");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("deser-only", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    eprintln!("[deser-only] executing deserialize-only guest (streaming) ...");
+    let program = Elf::load(&deser_elf_bytes).expect("ELF load failed");
+    eprintln!(
+        "[deser-only] ELF: {} bytes, entry_point=0x{:x}",
+        deser_elf_bytes.len(),
+        program.entry_point,
+    );
+    assert_ne!(
+        program.entry_point, 0,
+        "deserialize-only ELF has entry_point=0 — build artifact is malformed"
+    );
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |_log| ControlFlow::Continue(()),
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(50) {
+                eprintln!("[deser-only]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed");
+            }
+        },
+    );
+    let cycle_count = total_cycles;
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  DESERIALIZE-ONLY GUEST EXECUTION SUMMARY");
+    eprintln!("============================================================");
+    eprintln!("  Cycle count           : {cycle_count}");
+    eprintln!("  Executor wall time    : {exec_time:?}");
+    eprintln!();
+    eprintln!("  Compare against test_recursion_cycle_count (~40.5B cycles");
+    eprintln!("  with the same proof). Delta = verifier-in-VM cost.");
+    eprintln!("============================================================");
+}
+
+/// Diagnostic: PC histogram for the **deserialize-only** guest.
+///
+/// Sibling of `test_recursion_pc_histogram`, but targeting the
+/// deserialize-only control guest so we can locate the hot kernel inside the
+/// 15.7 M-cycle postcard decode itself. Every cycle goes through the
+/// histogram (no sampling), so attribution is exact — the previous sampled
+/// flamegraph at 1:1000 had broken stack reconstruction on skipped
+/// CALL/RETURNs, which made it unreliable for a workload this small.
+///
+/// Each top PC is resolved to its enclosing function via the in-house
+/// `executor::elf::SymbolTable`, reading the guest ELF's symbol table directly
+/// (no external tool, no DWARF dependency).
+#[test]
+#[ignore = "diagnostic: ~1 min; PC histogram for the deserialize-only guest"]
+fn test_deserialize_only_pc_histogram() {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+    use std::collections::HashMap;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let deser_elf_bytes = read_guest_elf(&root, "deserialize-only", "deserialize-only-bench");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("deser-pc-hist", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    eprintln!("[deser-pc-hist] executing deserialize-only guest (building PC histogram) ...");
+    let program = Elf::load(&deser_elf_bytes).expect("ELF load failed");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    // ~50k unique PCs is plenty: the deserialize-only guest is ~74 KB of ELF
+    // (~18k 4-byte instructions); the hot inner loop is much smaller still.
+    let mut pc_hist: HashMap<u64, u64> = HashMap::with_capacity(50_000);
+    let unique = std::cell::Cell::new(0usize);
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |log| {
+            *pc_hist.entry(log.current_pc).or_insert(0) += 1;
+            unique.set(pc_hist.len());
+            ControlFlow::Continue(())
+        },
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(50) {
+                eprintln!(
+                    "[deser-pc-hist]   ... {chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}",
+                    unique.get()
+                );
+            }
+        },
+    );
+
+    // Resolve PCs to functions directly from the ELF's symbol table.
+    let symbols = executor::elf::SymbolTable::parse(&deser_elf_bytes);
+    print_pc_histogram(
+        "DESERIALIZE-ONLY GUEST PC HISTOGRAM",
+        &symbols,
+        pc_hist,
+        total_cycles,
+        exec_time,
+    );
+}
+
+/// Diagnostic: bucket the recursion guest's cycles by which verifier step
+/// is currently executing.
+///
+/// The verifier's hot path is `verify_rounds_2_to_4`, which calls four
+/// sub-routines in a fixed order:
+///   1. `replay_rounds_after_round_1`               (recover challenges)
+///   2. `step_2_verify_claimed_composition_polynomial`
+///   3. `step_3_verify_fri`
+///   4. `step_4_verify_trace_and_composition_openings`
+///
+/// We resolve each sub-routine's entry PC from the recursion ELF's symbol
+/// table, then run a monotonic state machine over the execution stream:
+/// the active bucket only advances 0 → 1 → 2 → 3 → 4 (never backwards),
+/// so cycles inside a step's callees stay attributed to that step.
+///
+/// Bucket 0 ("setup") captures everything before step 1 is entered — the
+/// allocator init, postcard decode, and `VmAirs::new` (which contains the
+/// expensive preprocessed-commitment FFTs).
+///
+/// Streams chunks via `Executor::resume()` so memory stays bounded.
+#[test]
+#[ignore = "diagnostic: ~13 min; buckets the 40B cycles by verifier step"]
+fn test_recursion_step_breakdown() {
+    use executor::elf::{Elf, SymbolTable};
+    use executor::vm::execution::Executor;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("step-bkd", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    // Build a per-step "advance bucket to N" lookup. The verifier's step
+    // functions get inlined by LLVM in release mode, so we can't rely on
+    // matching their entry PCs directly. Instead we anchor on closures the
+    // compiler emits *inside* each step's body — iterator combinators like
+    // `.fold(|...|)` keep the step's method name as a substring in their
+    // mangled symbol. Any PC that resolves to a symbol containing step N's
+    // keyword advances the bucket to N (monotonically).
+    //
+    // If step N has no matching symbol at all (e.g. step 4 is fully inlined
+    // with no closure children of its own), its cycles get attributed to the
+    // previous bucket. We report that explicitly in the summary.
+    let symbols = SymbolTable::parse(&recursion_elf_bytes);
+    assert!(
+        !symbols.is_empty(),
+        "recursion ELF has no symbol table — was it stripped?"
+    );
+
+    let step_keywords = [
+        "replay_rounds_after_round_1",
+        "step_2_verify_claimed_composition_polynomial",
+        "step_3_verify_fri",
+        "step_4_verify_trace_and_composition_openings",
+    ];
+    let step_found: [bool; 4] = std::array::from_fn(|i| {
+        symbols
+            .functions()
+            .iter()
+            .any(|f| f.name.contains(step_keywords[i]))
+    });
+    for (i, found) in step_found.iter().enumerate() {
+        let n_matches = symbols
+            .functions()
+            .iter()
+            .filter(|f| f.name.contains(step_keywords[i]))
+            .count();
+        eprintln!(
+            "[step-bkd] step {}: keyword={:?} -> {} symbol(s) {}",
+            i + 1,
+            step_keywords[i],
+            n_matches,
+            if *found {
+                ""
+            } else {
+                "(fully inlined; will merge into the previous bucket)"
+            }
+        );
+    }
+
+    // Monotonic state machine: 0=setup, 1..=4=inside step N (or its callees /
+    // inlined-step-N-cycles attributed here because step N+1 is missing).
+    // `bucket` lives in a Cell so the per-log closure can advance it while the
+    // progress closure reads it for its live readout.
+    let bucket = std::cell::Cell::new(0u8);
+    let mut buckets = [0u64; 5];
+
+    eprintln!("[step-bkd] executing recursion guest (streaming) ...");
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    // Cache the last symbol-table hit so we only do a binary search on
+    // function transitions, not on every cycle. Functions are typically
+    // long-running (>>1 instruction), so this cache hits ~all of the time.
+    let mut last_range: Option<(u64, u64)> = None;
+    let mut last_advance: u8 = 0;
+
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |log| {
+            let pc = log.current_pc;
+            let in_cached = matches!(last_range, Some((s, e)) if pc >= s && pc < e);
+            if !in_cached {
+                // Slow path: refresh the cache from the symbol table.
+                if let Some(sym) = symbols.lookup(pc) {
+                    // SymbolTable accepts size=0 symbols as "any address >="; for
+                    // those we'd need the next symbol's start for a real upper
+                    // bound. Cheapest workaround: set a tiny range so we re-resolve
+                    // soon enough that wrong attribution is bounded.
+                    let end = sym.address + sym.size.max(1);
+                    last_range = Some((sym.address, end));
+                    last_advance = 0;
+                    for (i, kw) in step_keywords.iter().enumerate() {
+                        if sym.name.contains(kw) {
+                            last_advance = (i + 1) as u8;
+                        }
+                    }
+                } else {
+                    last_range = None;
+                    last_advance = 0;
+                }
+            }
+            if bucket.get() < last_advance {
+                bucket.set(last_advance);
+            }
+            buckets[bucket.get() as usize] += 1;
+            ControlFlow::Continue(())
+        },
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(500) {
+                eprintln!(
+                    "[step-bkd]   ... {chunks} chunks, {cycles} cycles, bucket={}, {elapsed:?}",
+                    bucket.get()
+                );
+            }
+        },
+    );
+
+    let labels = [
+        "0. setup (alloc + postcard decode + VmAirs::new + pre-step-1)",
+        "1. step 1: replay_rounds_after_round_1",
+        "2. step 2: verify_claimed_composition_polynomial",
+        "3. step 3: verify_fri",
+        "4. step 4: verify_trace_and_composition_openings (+ wrap-up)",
+    ];
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  RECURSION GUEST PER-STEP CYCLE BREAKDOWN");
+    eprintln!("============================================================");
+    eprintln!("  Total cycles : {total_cycles}");
+    eprintln!("  Exec time    : {exec_time:?}");
+    eprintln!();
+    eprintln!("  {:<60}  {:>14}  {:>7}", "bucket", "cycles", "%");
+    for (label, cycles) in labels.iter().zip(buckets.iter()) {
+        let pct = if total_cycles > 0 {
+            100.0 * (*cycles as f64) / (total_cycles as f64)
+        } else {
+            0.0
+        };
+        eprintln!("  {:<60}  {:>14}  {:>6.2}%", label, cycles, pct);
+    }
+    eprintln!("============================================================");
+}
+
 /// Inner program: fibonacci(10).
 #[test]
 #[ignore = "slow: runs the full STARK verifier inside the VM"]