From 86547310c98a4baaeffa749d412ef2a5320b2977 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Fri, 26 Jun 2026 18:00:39 -0300 Subject: [PATCH] feat: recursion profiling + measurement programs Add the measurement/profiling harness for the in-VM STARK verifier: - `empty`-proof and `deserialize-only` bench guests + `sp1/verifier` cross-prover comparison, all exercising the no_std verifier. - Expand the recursion smoke test with PC-histogram, sampled-flamegraph, page-count, cycle-count and per-step-breakdown diagnostics, plus the `make test-profile-recursion` targets and the histogram-aggregation CI script/workflow. - Expose read-only `Executor::memory()`, `Memory::cells()` and `SymbolTable::functions()` accessors and make `flamegraph::demangle` public so the diagnostics can resolve guest PCs to functions. --- .../scripts/aggregate_recursion_histogram.py | 126 +++ .github/workflows/profile-recursion.yml | 175 ++++ Makefile | 12 +- bench_vs/build_recursion_elfs.sh | 7 +- .../deserialize-only/.cargo/config.toml | 6 + bench_vs/lambda/deserialize-only/Cargo.toml | 13 + bench_vs/lambda/deserialize-only/src/main.rs | 93 ++ bench_vs/sp1/verifier/Cargo.toml | 3 + bench_vs/sp1/verifier/program/Cargo.toml | 10 + bench_vs/sp1/verifier/program/src/main.rs | 34 + bench_vs/sp1/verifier/script/Cargo.toml | 13 + bench_vs/sp1/verifier/script/build.rs | 5 + bench_vs/sp1/verifier/script/src/main.rs | 83 ++ executor/src/elf.rs | 5 + executor/src/flamegraph.rs | 2 +- executor/src/vm/execution.rs | 7 + executor/src/vm/memory.rs | 7 + prover/src/tests/recursion_smoke_test.rs | 929 +++++++++++++++++- 18 files changed, 1516 insertions(+), 14 deletions(-) create mode 100755 .github/scripts/aggregate_recursion_histogram.py create mode 100644 .github/workflows/profile-recursion.yml create mode 100644 bench_vs/lambda/deserialize-only/.cargo/config.toml create mode 100644 bench_vs/lambda/deserialize-only/Cargo.toml create mode 100644 bench_vs/lambda/deserialize-only/src/main.rs create mode 100644 bench_vs/sp1/verifier/Cargo.toml create mode 100644 bench_vs/sp1/verifier/program/Cargo.toml create mode 100644 bench_vs/sp1/verifier/program/src/main.rs create mode 100644 bench_vs/sp1/verifier/script/Cargo.toml create mode 100644 bench_vs/sp1/verifier/script/build.rs create mode 100644 bench_vs/sp1/verifier/script/src/main.rs diff --git a/.github/scripts/aggregate_recursion_histogram.py b/.github/scripts/aggregate_recursion_histogram.py new file mode 100755 index 000000000..8a12dc05e --- /dev/null +++ b/.github/scripts/aggregate_recursion_histogram.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""Format the recursion-guest per-function profile as a Markdown PR comment. + +`test_recursion_pc_histogram` prints a per-function summary table (cycles folded +over each function's PCs, computed across the *full* histogram) followed by a +per-address detail table. We extract the per-function table — the view that +shows where the cycles actually go — and render it as Markdown. + + Top 25 functions by cycle count (aggregated over their PCs): + rank cycles % cum % PCs function (file:line) + 1 5335072 24.95% 24.95% 72 <...>::visit_seq::<...> + +Reads the test's captured output from argv[1]; writes the Markdown body to +argv[2] (or stdout). +""" + +import re +import sys + +# A per-function summary row: rank, cycles, pct%, cum%, pcs, function. +# Distinguished from the per-PC detail rows by the absence of a 0x column. +FN_ROW = re.compile( + r"^\s*\d+\s+(\d+)\s+([\d.]+)%\s+([\d.]+)%\s+(\d+)\s+(.*\S)\s*$" +) +FN_TABLE_START = re.compile(r"Top \d+ functions by cycle count") +PC_TABLE_START = re.compile(r"Top \d+ PCs by cycle count") +TOTAL_CYCLES = re.compile(r"Total cycles\s*:\s*(\d+)") +UNIQUE_PCS = re.compile(r"Unique PCs\s*:\s*(\d+)") +EXEC_TIME = re.compile(r"Exec time\s*:\s*(\S+)") + + +def parse(text): + total_cycles = unique_pcs = exec_time = None + rows = [] + in_fn_table = False + for line in text.splitlines(): + if total_cycles is None and (m := TOTAL_CYCLES.search(line)): + total_cycles = int(m.group(1)) + if unique_pcs is None and (m := UNIQUE_PCS.search(line)): + unique_pcs = int(m.group(1)) + if exec_time is None and (m := EXEC_TIME.search(line)): + exec_time = m.group(1) + if FN_TABLE_START.search(line): + in_fn_table = True + continue + if PC_TABLE_START.search(line): + in_fn_table = False + continue + if in_fn_table and (m := FN_ROW.match(line)): + rows.append( + { + "cycles": int(m.group(1)), + "pct": m.group(2), + "cum": m.group(3), + "pcs": int(m.group(4)), + "fn": m.group(5), + } + ) + return total_cycles, unique_pcs, exec_time, rows + + +def short(name, width=90): + return name if len(name) <= width else name[: width - 1] + "…" + + +def render(total_cycles, unique_pcs, exec_time, rows, title="Recursion guest profile"): + if not rows: + return ( + f"### {title}\n\n" + "> ⚠️ No per-function rows found in the test output — the run may " + "have failed before printing the table. Check the workflow logs.\n" + ) + + body = f"### {title}\n\n" + if total_cycles is not None: + body += f"**Total cycles:** {total_cycles:,}" + if unique_pcs is not None: + body += f" · **Unique PCs:** {unique_pcs:,}" + if exec_time: + body += f" · **Exec time:** {exec_time}" + body += "\n\n" + + body += f"#### Top {len(rows)} functions by cycles (folded over their PCs)\n\n" + body += "| Rank | Cycles | % | Cum % | PCs | Function |\n" + body += "|-----:|-------:|--:|------:|----:|----------|\n" + for i, r in enumerate(rows, 1): + body += ( + f"| {i} | {r['cycles']:,} | {r['pct']}% | {r['cum']}% | " + f"{r['pcs']} | `{short(r['fn'])}` |\n" + ) + + last_cum = rows[-1]["cum"] + body += ( + f"\nEach function's cycles are summed over all its program counters " + f"across the full histogram; the top {len(rows)} cover {last_cum}% of total " + f"cycles. Percentages are of total cycles.\n" + ) + return body + + +def main(): + import argparse + + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("log", help="captured test output to parse") + ap.add_argument("-o", "--out", help="write Markdown here instead of stdout") + ap.add_argument( + "-t", + "--title", + default="Recursion guest profile", + help="section heading (e.g. the test/config name)", + ) + args = ap.parse_args() + + with open(args.log, "r", errors="replace") as f: + text = f.read() + body = render(*parse(text), title=args.title) + if args.out: + with open(args.out, "w") as f: + f.write(body) + else: + sys.stdout.write(body) + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/profile-recursion.yml b/.github/workflows/profile-recursion.yml new file mode 100644 index 000000000..420cebfcb --- /dev/null +++ b/.github/workflows/profile-recursion.yml @@ -0,0 +1,175 @@ +name: Profile Recursion (PR) + +# Runs the recursion-guest PC histogram diagnostics (single-query and +# multi-query, in parallel via a matrix) and posts a combined per-function +# profile as a PR comment. Triggered by a `/profile_recursion` comment from a +# repo member, or manually via workflow_dispatch. + +on: + workflow_dispatch: + issue_comment: + types: [created] + +permissions: + contents: read + pull-requests: write + +concurrency: + group: profile-recursion-${{ github.event.issue.number || github.run_id }} + cancel-in-progress: true + +jobs: + # One job per configuration; they run in parallel and each uploads a Markdown + # fragment artifact. The `comment` job stitches them into one PR comment. + profile: + # Skip unless: workflow_dispatch, or "/profile_recursion" comment on a PR by a member. + if: >- + github.event_name == 'workflow_dispatch' || + (github.event_name == 'issue_comment' && + github.event.issue.pull_request && + startsWith(github.event.comment.body, '/profile_recursion') && + contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association)) + runs-on: [self-hosted, bench] + timeout-minutes: 90 + strategy: + fail-fast: false + matrix: + include: + - name: single-query + test: single + title: "Single query (blowup=2, 1 query)" + - name: multi-query + test: single + title: "Multi query (blowup=8, 128-bit)" + steps: + - name: React to comment + if: github.event_name == 'issue_comment' && matrix.name == 'single-query' + uses: actions/github-script@v7 + with: + script: | + await github.rest.reactions.createForIssueComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: context.payload.comment.id, + content: 'eyes' + }); + + - name: Get PR head ref + id: pr-ref + if: github.event_name == 'issue_comment' + env: + GH_TOKEN: ${{ github.token }} + PR_NUM: ${{ github.event.issue.number }} + run: | + SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid) + echo "sha=$SHA" >> "$GITHUB_OUTPUT" + + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ steps.pr-ref.outputs.sha || github.sha }} + + - name: Setup Rust Environment + uses: ./.github/actions/setup-rust + + - name: Add cargo to PATH + run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" + + - name: Run recursion PC histogram (${{ matrix.name }}) + env: + TEST: ${{ matrix.test }} + run: | + # Self-provision the RISC-V sysroot in a user-writable dir (the default + # /opt path on the bench runner is root-owned); the guest ELF build the + # test triggers picks this up via the Makefile's `SYSROOT_DIR ?=`. + export SYSROOT_DIR="$HOME/.lambda-vm-sysroot" + set -o pipefail + make test-profile-recursion-$TEST + + - name: Aggregate into a per-function fragment + if: always() + env: + TITLE: ${{ matrix.title }} + run: | + python3 .github/scripts/aggregate_recursion_histogram.py \ + /tmp/hist.log --title "$TITLE" --out "/tmp/fragment-${{ matrix.name }}.md" + cat "/tmp/fragment-${{ matrix.name }}.md" >> "$GITHUB_STEP_SUMMARY" + + - name: Upload fragment + if: always() + uses: actions/upload-artifact@v4 + with: + name: profile-fragment-${{ matrix.name }} + path: /tmp/fragment-${{ matrix.name }}.md + retention-days: 7 + + # Stitch the matrix fragments into a single PR comment. + comment: + needs: profile + if: always() && github.event_name == 'issue_comment' + runs-on: [self-hosted, bench] + steps: + - name: Get PR head ref + id: pr-ref + env: + GH_TOKEN: ${{ github.token }} + PR_NUM: ${{ github.event.issue.number }} + run: | + SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid) + echo "sha=$SHA" >> "$GITHUB_OUTPUT" + + - name: Download fragments + uses: actions/download-artifact@v4 + with: + path: fragments + pattern: profile-fragment-* + merge-multiple: true + + - name: Assemble comment body + env: + COMMIT_SHA: ${{ steps.pr-ref.outputs.sha }} + run: | + { + echo "## Recursion guest profile" + echo + # Single-query first, then multi-query, then any others. + for frag in fragments/fragment-single-query.md \ + fragments/fragment-multi-query.md; do + [ -f "$frag" ] && { cat "$frag"; echo; } + done + echo "Commit: ${COMMIT_SHA:0:8} · Runner: self-hosted bench" + } > /tmp/profile_comment.md + cat /tmp/profile_comment.md + + - name: Comment on PR + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const body = fs.readFileSync('/tmp/profile_comment.md', 'utf8'); + + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + // Reuse our own marker comment so repeated /profile_recursion runs update in place. + const existing = comments.find(c => + c.user.type === 'Bot' && + c.body.includes('Recursion guest profile') + ); + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body, + }); + } diff --git a/Makefile b/Makefile index 81bc03a8c..82a990644 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ .PHONY: deps deps-linux deps-macos compile-programs-asm compile-programs-rust compile-bench \ compile-programs clean-asm clean-rust clean-bench clean-shared clean test test-asm \ -test-rust test-executor test-flamegraph flamegraph-prover \ +test-rust test-executor test-flamegraph flamegraph-prover test-profile-recursion test-profile-recursion-single test-profile-recursion-multi \ test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration \ bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \ update-ethrex-fixture-checksums check-ethrex-fixture-checksums @@ -197,6 +197,16 @@ test-rust: compile-programs-rust test-flamegraph: cargo test -p executor --test flamegraph + + +test-profile-recursion: test-profile-recursion-single test-profile-recursion-multi + +test-profile-recursion-single: compile-programs-rust + cargo test --package lambda-vm-prover --lib test_recursion_pc_histogram_1query -- --ignored --nocapture + +test-profile-recursion-multi: compile-programs-rust + cargo test --package lambda-vm-prover --lib test_recursion_pc_histogram_multiquery -- --ignored --nocapture + # Regenerate the committed ethrex block fixtures (see tooling/ethrex-fixtures). # Run after bumping the ethrex rev; README checksums are refreshed automatically. regen-ethrex-fixtures: diff --git a/bench_vs/build_recursion_elfs.sh b/bench_vs/build_recursion_elfs.sh index a1fe41f03..a529b4bbe 100755 --- a/bench_vs/build_recursion_elfs.sh +++ b/bench_vs/build_recursion_elfs.sh @@ -18,10 +18,10 @@ build_one() { echo "[recursion-elfs] building $name ..." ( cd "$dir" - # The recursion guest pulls in lambda-vm-prover and its serde stack; - # pin serde to 1.0.219 (pre-`serde_core` split) so + # Recursion/deserialize-only guests pull in lambda-vm-prover and its + # serde stack; pin serde to 1.0.219 (pre-`serde_core` split) so # `-Z build-std=core,alloc` works. - if [ "$name" = "recursion" ]; then + if [ "$name" = "recursion" ] || [ "$name" = "deserialize-only" ]; then cargo "+$TOOLCHAIN" update -p serde --precise 1.0.219 2>/dev/null || true fi cargo "+$TOOLCHAIN" build --release \ @@ -35,5 +35,6 @@ build_one() { build_one empty build_one fibonacci build_one recursion +build_one deserialize-only echo "[recursion-elfs] done" diff --git a/bench_vs/lambda/deserialize-only/.cargo/config.toml b/bench_vs/lambda/deserialize-only/.cargo/config.toml new file mode 100644 index 000000000..be730c3ec --- /dev/null +++ b/bench_vs/lambda/deserialize-only/.cargo/config.toml @@ -0,0 +1,6 @@ +[target.riscv64im-lambda-vm-elf] +rustflags = [ + "-C", "link-arg=-e", + "-C", "link-arg=main", + "-C", "passes=lower-atomic" +] diff --git a/bench_vs/lambda/deserialize-only/Cargo.toml b/bench_vs/lambda/deserialize-only/Cargo.toml new file mode 100644 index 000000000..b4a4616f4 --- /dev/null +++ b/bench_vs/lambda/deserialize-only/Cargo.toml @@ -0,0 +1,13 @@ +[workspace] + +[package] +name = "deserialize-only-bench" +version = "0.1.0" +edition = "2024" + +[dependencies] +lambda-vm-prover = { path = "../../../prover", default-features = false } +embedded-alloc = "0.6" +riscv = { version = "0.15", features = ["critical-section-single-hart"] } +serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] } +postcard = { version = "1.0", default-features = false, features = ["alloc"] } diff --git a/bench_vs/lambda/deserialize-only/src/main.rs b/bench_vs/lambda/deserialize-only/src/main.rs new file mode 100644 index 000000000..8627776a1 --- /dev/null +++ b/bench_vs/lambda/deserialize-only/src/main.rs @@ -0,0 +1,93 @@ +//! Deserialize-only counterpart to the recursion guest. +//! +//! Reads the same private-input blob as `recursion-bench`, postcard-decodes +//! `(VmProof, Vec, ProofOptions)`, then commits success +//! and halts — without ever calling `verify_with_options`. The cycle delta +//! between this guest and `recursion-bench` is the actual cost of the STARK +//! verifier inside the VM (everything else being equal). + +#![no_std] +#![no_main] + +extern crate alloc; + +use alloc::vec::Vec; +use core::arch::asm; +use core::panic::PanicInfo; + +use embedded_alloc::TlsfHeap as Heap; +use lambda_vm_prover::{ProofOptions, VmProof}; +// Required to pull in the riscv crate's critical-section implementation. +use riscv as _; + +const PRIVATE_INPUT_START: usize = 0xFF000000; +const SYSCALL_COMMIT: u64 = 64; +const SYSCALL_HALT: u64 = 93; +const MAX_MEMORY_SIZE: usize = 0xC000_0000; + +#[global_allocator] +static HEAP: Heap = Heap::empty(); + +#[panic_handler] +fn panic(_info: &PanicInfo) -> ! { + loop {} +} + +fn init_allocator() { + unsafe extern "C" { + static _end: u8; + } + let heap_pos = (&raw const _end) as usize; + unsafe { HEAP.init(heap_pos, MAX_MEMORY_SIZE - heap_pos) } +} + +fn read_private_input() -> &'static [u8] { + let len = unsafe { core::ptr::read_volatile(PRIVATE_INPUT_START as *const u32) } as usize; + let data = (PRIVATE_INPUT_START + 4) as *const u8; + unsafe { core::slice::from_raw_parts(data, len) } +} + +fn commit(bytes: &[u8]) { + unsafe { + asm!( + "ecall", + in("a0") 1u64, + in("a1") bytes.as_ptr(), + in("a2") bytes.len(), + in("a7") SYSCALL_COMMIT, + ); + } +} + +fn halt() -> ! { + unsafe { + asm!( + "ecall", + in("a0") 0u64, + in("a7") SYSCALL_HALT, + options(noreturn), + ); + } +} + +#[unsafe(no_mangle)] +pub fn main() -> ! { + init_allocator(); + + let blob = read_private_input(); + let decoded: (VmProof, Vec, ProofOptions) = + postcard::from_bytes(blob).expect("failed to deserialize"); + + // Force the commit byte to depend on the actually-decoded value. Without + // this, LLVM at -O3 was eliding the postcard decode entirely — the only + // sinks for `decoded` were `black_box(&decoded)` (which only forces the + // *reference* to materialize, not the pointee) and `Drop`, neither of + // which require the decoded bytes to be real. With the commit byte tied + // to a deep field of the decoded value, the decode has to run. + let proof_options_byte = decoded.2.blowup_factor; + let inner_elf_byte = *decoded.1.first().unwrap_or(&0); + let marker = proof_options_byte ^ inner_elf_byte; + + commit(&[marker]); + halt() +} diff --git a/bench_vs/sp1/verifier/Cargo.toml b/bench_vs/sp1/verifier/Cargo.toml new file mode 100644 index 000000000..fc24039c2 --- /dev/null +++ b/bench_vs/sp1/verifier/Cargo.toml @@ -0,0 +1,3 @@ +[workspace] +members = ["program", "script"] +resolver = "2" diff --git a/bench_vs/sp1/verifier/program/Cargo.toml b/bench_vs/sp1/verifier/program/Cargo.toml new file mode 100644 index 000000000..7fbc9c5ce --- /dev/null +++ b/bench_vs/sp1/verifier/program/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "verifier-program" +version = "0.1.0" +edition = "2024" + +[dependencies] +sp1-zkvm = "6.0.1" +lambda-vm-prover = { path = "../../../../prover", default-features = false } +serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] } +postcard = { version = "1.0", default-features = false, features = ["alloc"] } diff --git a/bench_vs/sp1/verifier/program/src/main.rs b/bench_vs/sp1/verifier/program/src/main.rs new file mode 100644 index 000000000..c63bb67ca --- /dev/null +++ b/bench_vs/sp1/verifier/program/src/main.rs @@ -0,0 +1,34 @@ +//! SP1 guest that runs lambda-vm's `verify_with_options` on a single proof. +//! +//! Input layout (postcard-encoded `Vec` written via `SP1Stdin::write_vec`): +//! `(VmProof, Vec, ProofOptions)` +//! where the inner `Vec` is the inner program's ELF bytes. +//! +//! Output: commits `[1u8]` on successful verify; the guest panics otherwise. +//! +//! Caveats: +//! - The verifier hashes through the `keccak` crate. SP1 has a Keccak +//! precompile but it patches `tiny-keccak`, not `keccak`. We don't patch +//! here, so Keccak runs as software inside the guest. Cycle counts will be +//! inflated by that overhead. Worth keeping in mind when interpreting the +//! number relative to lambda-vm's in-VM count. + +#![no_main] + +extern crate alloc; + +use alloc::vec::Vec; + +use lambda_vm_prover::{ProofOptions, VmProof}; + +sp1_zkvm::entrypoint!(main); + +pub fn main() { + let blob = sp1_zkvm::io::read_vec(); + let (vm_proof, inner_elf, options): (VmProof, Vec, ProofOptions) = + postcard::from_bytes(&blob).expect("failed to deserialize input"); + let ok = lambda_vm_prover::verify_with_options(&vm_proof, &inner_elf, &options, None, None) + .expect("verify errored"); + assert!(ok, "inner proof failed verification"); + sp1_zkvm::io::commit_slice(&[1u8]); +} diff --git a/bench_vs/sp1/verifier/script/Cargo.toml b/bench_vs/sp1/verifier/script/Cargo.toml new file mode 100644 index 000000000..3198059bd --- /dev/null +++ b/bench_vs/sp1/verifier/script/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "verifier-script" +version = "0.1.0" +edition = "2024" + +[dependencies] +sp1-sdk = { version = "6.0.1", features = ["blocking", "profiling"] } +lambda-vm-prover = { path = "../../../../prover" } +stark = { path = "../../../../crypto/stark" } +postcard = { version = "1.0", features = ["alloc"] } + +[build-dependencies] +sp1-build = "6.0.1" diff --git a/bench_vs/sp1/verifier/script/build.rs b/bench_vs/sp1/verifier/script/build.rs new file mode 100644 index 000000000..d6cf925d6 --- /dev/null +++ b/bench_vs/sp1/verifier/script/build.rs @@ -0,0 +1,5 @@ +use sp1_build::build_program_with_args; + +fn main() { + build_program_with_args("../program", Default::default()); +} diff --git a/bench_vs/sp1/verifier/script/src/main.rs b/bench_vs/sp1/verifier/script/src/main.rs new file mode 100644 index 000000000..86e46a710 --- /dev/null +++ b/bench_vs/sp1/verifier/script/src/main.rs @@ -0,0 +1,83 @@ +//! Host driver: prove an inner empty program on lambda-vm, then execute the +//! lambda-vm verifier inside SP1's executor, printing the cycle count. +//! +//! Set `TRACE_FILE=profiles/verifier.json` to capture a DWARF-attributed +//! profile (1 sample = 1 cycle). The output can be opened with +//! `samply load profiles/verifier.json`. + +use std::path::PathBuf; + +use sp1_sdk::blocking::{Prover, ProverClient}; +use sp1_sdk::{SP1Stdin, include_elf}; + +const VERIFIER_ELF: sp1_sdk::Elf = include_elf!("verifier-program"); + +fn workspace_root() -> PathBuf { + // CARGO_MANIFEST_DIR for this crate is `/bench_vs/sp1/verifier/script`. + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .ancestors() + .nth(4) + .expect("workspace root") + .to_path_buf() +} + +fn main() { + sp1_sdk::utils::setup_logger(); + + let root = workspace_root(); + let empty_elf_path = root + .join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench"); + assert!( + empty_elf_path.exists(), + "empty-bench ELF not found at {} — run `bash bench_vs/build_recursion_elfs.sh` first", + empty_elf_path.display(), + ); + let inner_elf = std::fs::read(&empty_elf_path).expect("read empty-bench"); + + let options = stark::proof::options::ProofOptions { + blowup_factor: 2, + fri_number_of_queries: 1, + coset_offset: 3, + grinding_factor: 1, + }; + + println!("[sp1-verifier] proving inner (empty, blowup=2, 1 query) ..."); + let inner_proof = lambda_vm_prover::prove_with_options_and_inputs( + &inner_elf, + &[], + &options, + &lambda_vm_prover::MaxRowsConfig::default(), + ) + .expect("inner prove should succeed"); + + let blob = postcard::to_allocvec(&(&inner_proof, &inner_elf, &options)) + .expect("postcard encode failed"); + println!("[sp1-verifier] postcard blob: {} bytes", blob.len()); + + let client = ProverClient::from_env(); + let mut stdin = SP1Stdin::new(); + stdin.write_vec(blob); + + println!("[sp1-verifier] executing verifier in SP1 ..."); + let (_, report) = client + .execute(VERIFIER_ELF.clone(), stdin) + .run() + .expect("execute failed"); + + let cycles = report.total_instruction_count(); + println!(); + println!("============================================================"); + println!(" SP1 EXECUTION SUMMARY — lambda-vm verifier inside SP1"); + println!("============================================================"); + println!(" Total cycles : {cycles}"); + println!(); + println!(" Compare against lambda-vm in-VM count (~40.5B for the same"); + println!(" proof). Both VMs target riscv64im, so word width is symmetric."); + println!(" Main remaining asymmetry: lambda-vm's KeccakPermute precompile"); + println!(" is patched on its guests but SP1 does not patch `keccak` (only"); + println!(" `tiny-keccak`), so Keccak rounds run as software in SP1 here."); + println!(); + println!(" If TRACE_FILE was set, the profile was written there."); + println!(" Render with: samply load "); + println!("============================================================"); +} diff --git a/executor/src/elf.rs b/executor/src/elf.rs index bf5624988..120436efd 100644 --- a/executor/src/elf.rs +++ b/executor/src/elf.rs @@ -559,4 +559,9 @@ impl SymbolTable { pub fn len(&self) -> usize { self.functions.len() } + + /// Borrow the full function list (sorted by address). + pub fn functions(&self) -> &[FunctionSymbol] { + &self.functions + } } diff --git a/executor/src/flamegraph.rs b/executor/src/flamegraph.rs index f9b447d19..4764d71a2 100644 --- a/executor/src/flamegraph.rs +++ b/executor/src/flamegraph.rs @@ -154,7 +154,7 @@ impl FlamegraphGenerator { /// Demangle a Rust symbol name using the official rustc-demangle crate. /// /// Uses the alternate format (`{:#}`) to omit the hash suffix for cleaner output. -pub(crate) fn demangle(name: &str) -> String { +pub fn demangle(name: &str) -> String { // Use rustc-demangle with alternate format to omit hash format!("{:#}", rustc_demangle(name)) } diff --git a/executor/src/vm/execution.rs b/executor/src/vm/execution.rs index cde8ab6e7..81762cf19 100644 --- a/executor/src/vm/execution.rs +++ b/executor/src/vm/execution.rs @@ -105,6 +105,13 @@ impl Executor { self.get_return_values() } + /// Read-only access to the executor's memory. Exposed for diagnostic + /// tooling that needs to inspect the final memory state (e.g. counting + /// distinct 4 KB pages touched) after a streaming `resume()` loop. + pub fn memory(&self) -> &Memory { + &self.memory + } + /// Run to completion and return all logs (consumes executor) pub fn run(mut self) -> Result { let mut logs = Vec::with_capacity(CHUNK_SIZE); diff --git a/executor/src/vm/memory.rs b/executor/src/vm/memory.rs index b258fbd9e..28d156ae2 100644 --- a/executor/src/vm/memory.rs +++ b/executor/src/vm/memory.rs @@ -204,6 +204,13 @@ impl Memory { Ok(self.public_output.clone()) } + /// Read-only access to the underlying 4-byte cell map. Exposed for + /// diagnostic tooling (e.g. counting the distinct 4 KB memory pages a + /// program touches) — not part of the normal execution interface. + pub fn cells(&self) -> &U64HashMap<[u8; 4]> { + &self.cells + } + /// Pre-loads private input bytes at `PRIVATE_INPUT_START_INDEX` as a /// 4-byte LE length prefix followed by the raw data. The guest reads these /// bytes directly via normal RISC-V loads (ZisK-style memory-mapped input). diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs index 4ff649ba4..478be3344 100644 --- a/prover/src/tests/recursion_smoke_test.rs +++ b/prover/src/tests/recursion_smoke_test.rs @@ -2,7 +2,7 @@ //! //! Each test: //! 1. Proves an inner program on the host. -//! 2. Serializes `(VmProof, inner_elf, opts)` with postcard. +//! 2. Serializes `(VmProof, inner_elf)` with postcard. //! 3. Hands that as private input to the recursion guest. //! 4. Proves the recursion guest's execution. //! 5. Verifies the outer proof. @@ -12,6 +12,7 @@ //! Tests are `#[ignore]`d because the outer proof runs the full STARK verifier //! inside the VM (minutes per run, large memory footprint). +use std::ops::ControlFlow; use std::path::PathBuf; use std::process::Command; @@ -44,10 +45,10 @@ fn read_guest_elf(root: &std::path::Path, name: &str, bin_name: &str) -> Vec } /// Minimum-security FRI parameters: blowup=2, a single FRI query. Security is -/// intentionally terrible — used by the capacity-probing test, where the goal -/// is the smallest possible inner proof, not a sound one. -/// (`GoldilocksCubicProofOptions::with_blowup` derives a query count from a -/// 128-bit target, far more than we want here.) +/// intentionally terrible — used by the capacity-probing test and every cheap +/// diagnostic below, where the goal is the smallest possible inner proof, not +/// a sound one. (`GoldilocksCubicProofOptions::with_blowup` derives a query +/// count from a 128-bit target, far more than we want here.) const MIN_PROOF_OPTIONS: stark::proof::options::ProofOptions = stark::proof::options::ProofOptions { blowup_factor: 2, @@ -57,10 +58,10 @@ const MIN_PROOF_OPTIONS: stark::proof::options::ProofOptions = }; /// Prove `inner_elf` (fed `inner_input`) under `opts`, then package -/// `(proof, elf, opts)` into the postcard blob the recursion guest consumes as -/// its private input. `tag` prefixes the progress lines. Returns the inner -/// proof — callers that re-verify it on the host need it — next to the encoded -/// blob. +/// `(proof, elf, opts)` into the postcard blob the recursion and +/// deserialize-only guests consume as their private input. `tag` prefixes the +/// progress lines. Returns the inner proof — callers that re-verify it on the +/// host need it — next to the encoded blob. fn prove_inner_and_encode_blob( tag: &str, inner_elf: &[u8], @@ -85,6 +86,130 @@ fn prove_inner_and_encode_blob( (inner_proof, blob) } +/// Stream a guest's execution via `Executor::resume()`, calling `on_log` for +/// every `Log` without ever buffering the full log stream (`Executor::run` +/// would accumulate tens of millions of `Log`s and OOM even a 125 GB box). +/// `on_log` returns `ControlFlow::Break(())` to stop the run early (e.g. once a +/// cycle budget is hit); `Continue(())` to keep going. `on_progress(chunks, +/// total_cycles, elapsed)` fires once per resumed chunk; callers throttle and +/// format their own progress lines. Returns `(total_cycles, wall_time)` — +/// `total_cycles` counts logs actually visited, so it is exact even when a run +/// breaks mid-chunk. +fn drive_executor( + executor: &mut executor::vm::execution::Executor, + mut on_log: impl FnMut(&executor::vm::logs::Log) -> ControlFlow<()>, + mut on_progress: impl FnMut(usize, u64, std::time::Duration), +) -> (u64, std::time::Duration) { + let start = std::time::Instant::now(); + let mut total_cycles: u64 = 0; + let mut chunks: usize = 0; + while let Some(logs) = executor.resume().expect("executor resume failed") { + let mut stop = false; + for log in logs { + total_cycles += 1; + if on_log(log).is_break() { + stop = true; + break; + } + } + chunks += 1; + on_progress(chunks, total_cycles, start.elapsed()); + if stop { + break; + } + } + (total_cycles, start.elapsed()) +} + +/// Resolve a guest PC to its (demangled) enclosing function name using the +/// ELF's own symbol table — the same data `executor::flamegraph` resolves +/// against. `` when no function symbol covers the PC (e.g. PLT stubs +/// or a release build that dropped symbols). No file:line: the symbol table +/// carries function ranges only, not DWARF line info. +fn resolve_pc(symbols: &executor::elf::SymbolTable, pc: u64) -> String { + symbols.lookup(pc).map_or_else( + || "".to_string(), + |s| executor::flamegraph::demangle(&s.name), + ) +} + +/// Print a PC histogram as two tables: a per-function summary (the cycles each +/// resolved function accounts for, folded over all its PCs) followed by the +/// top-100 per-address detail. `pc_hist` maps program counter → cycle count. +/// +/// The per-function view is the one that matters: an inlined kernel is spread +/// across dozens of PCs, so the raw per-address table scatters its true cost. +fn print_pc_histogram( + title: &str, + symbols: &executor::elf::SymbolTable, + pc_hist: std::collections::HashMap, + total_cycles: u64, + exec_time: std::time::Duration, +) { + let mut entries: Vec<(u64, u64)> = pc_hist.into_iter().collect(); + entries.sort_unstable_by_key(|(_pc, count)| std::cmp::Reverse(*count)); + + // Aggregate the full histogram by resolved function, resolving each PC once. + let mut by_function: std::collections::HashMap = + std::collections::HashMap::new(); + for (pc, count) in &entries { + let entry = by_function.entry(resolve_pc(symbols, *pc)).or_insert((0, 0)); + entry.0 += *count; // cycles + entry.1 += 1; // distinct PCs folded into this function + } + let mut fn_entries: Vec<(String, (u64, u64))> = by_function.into_iter().collect(); + fn_entries.sort_unstable_by_key(|(_name, (cycles, _pcs))| std::cmp::Reverse(*cycles)); + + let pct = |n: u64| 100.0 * (n as f64) / (total_cycles as f64); + + eprintln!(); + eprintln!("============================================================"); + eprintln!(" {title}"); + eprintln!("============================================================"); + eprintln!(" Total cycles : {total_cycles}"); + eprintln!(" Unique PCs : {}", entries.len()); + eprintln!(" Exec time : {exec_time:?}"); + eprintln!(); + eprintln!(" Top 25 functions by cycle count (aggregated over their PCs):"); + eprintln!( + " {:>4} {:>14} {:>7} {:>7} {:>5} {}", + "rank", "cycles", "%", "cum %", "PCs", "function" + ); + let mut fn_cumulative: u64 = 0; + for (rank, (name, (cycles, pcs))) in fn_entries.iter().take(25).enumerate() { + fn_cumulative += cycles; + eprintln!( + " {:>4} {:>14} {:>6.2}% {:>6.2}% {:>5} {}", + rank + 1, + cycles, + pct(*cycles), + pct(fn_cumulative), + pcs, + name, + ); + } + eprintln!(); + eprintln!(" Top 100 PCs by cycle count (per-address detail):"); + eprintln!( + " {:>4} {:>18} {:>14} {:>7} {:>7} {}", + "rank", "pc", "cycles", "%", "cum %", "function" + ); + let mut cumulative: u64 = 0; + for (rank, (pc, count)) in entries.iter().take(100).enumerate() { + cumulative += count; + eprintln!( + " {:>4} {:#018x} {:>14} {:>6.2}% {:>6.2}% {}", + rank + 1, + pc, + count, + pct(*count), + pct(cumulative), + resolve_pc(symbols, *pc), + ); + } + eprintln!("============================================================"); +} + /// Core pipeline: prove an inner program with the given options, hand the /// proof+ELF+options to the recursion guest, then prove and verify the outer /// proof. @@ -172,6 +297,792 @@ fn test_recursion_smoke_1query() { ); } +/// Diagnostic: build the inner proof and dump the recursion guest's private-input +/// blob to `/tmp/recursion_input.bin` so the CLI's `execute --flamegraph` can +/// consume it. +/// +/// Usage after running this test: +/// ``` +/// cargo run -p cli --release -- execute \ +/// bench_vs/lambda/recursion/target/riscv64im-lambda-vm-elf/release/recursion-bench \ +/// --private-input /tmp/recursion_input.bin \ +/// --flamegraph /tmp/recursion_folded.txt +/// cat /tmp/recursion_folded.txt | inferno-flamegraph > /tmp/recursion_flamegraph.svg +/// ``` +#[test] +#[ignore = "diagnostic: writes recursion private input to /tmp/recursion_input.bin"] +fn test_dump_recursion_input() { + let root = workspace_root(); + build_elfs(&root); + let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("dump-input", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + let path = "/tmp/recursion_input.bin"; + std::fs::write(path, &blob).expect("write blob"); + eprintln!("[dump-input] wrote {} bytes to {path}", blob.len()); +} + +/// Diagnostic: build the inner proof + recursion guest input, then **execute +/// only** the recursion guest (no STARK proving) and report cycle counts + +/// trace size estimates. +/// +/// This is the cheap way to find out how many RISC-V instructions the +/// verifier actually executes inside the guest — a much faster signal than +/// running the full outer prove (which can OOM on a 125 GB machine). +#[test] +#[ignore = "diagnostic: runs the executor only, prints cycle counts"] +fn test_recursion_cycle_count() { + use executor::elf::Elf; + use executor::vm::execution::Executor; + + let root = workspace_root(); + build_elfs(&root); + let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench"); + let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench"); + + // Build the inner proof exactly as the smoke test does, with the + // absolute-minimum FRI params so the inner is as small as possible. + let (_inner_proof, blob) = + prove_inner_and_encode_blob("cycle-count", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + // Execute (NOT prove) the recursion guest. `drive_executor` streams chunks + // and never accumulates logs in memory — this avoids the Vec blow-up + // that OOMs even a 125 GB server (one Log is 40 B; a few billion of them is + // hundreds of GB). + eprintln!("[cycle-count] executing recursion guest (streaming counter only) ..."); + let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |_log| ControlFlow::Continue(()), + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(50) { + eprintln!("[cycle-count] ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"); + } + }, + ); + let cycle_count = total_cycles as usize; + + eprintln!(); + eprintln!("============================================================"); + eprintln!(" RECURSION GUEST EXECUTION SUMMARY"); + eprintln!("============================================================"); + eprintln!(" Cycle count : {cycle_count}"); + eprintln!(" Executor wall time : {exec_time:?}"); + eprintln!(); + eprintln!(" Rough memory estimate for outer prove:"); + let bytes_per_field = 8usize; + let approx_columns = 250usize; // CPU + MEMW + DECODE + bus columns combined + let main_trace_bytes = cycle_count * approx_columns * bytes_per_field; + let blowup = 2usize; + let lde_main_bytes = main_trace_bytes * blowup; + eprintln!( + " main trace : ~{:.2} GB ({} cycles × ~{} cols × 8 B)", + main_trace_bytes as f64 / 1e9, + cycle_count, + approx_columns + ); + eprintln!( + " main LDE (blowup={}) : ~{:.2} GB", + blowup, + lde_main_bytes as f64 / 1e9 + ); + eprintln!(" (aux trace adds roughly 50% more, so peak peak ≈ 2-3× LDE)"); + eprintln!("============================================================"); +} + +/// Diagnostic: count the distinct 4 KB memory pages the recursion guest +/// touches when verifying a small inner proof. +/// +/// We suspect the outer prover's 125 GB OOM wall is dominated by per-page +/// PAGE-table overhead. The number of PAGE tables the prover would build +/// equals the number of distinct 4 KB pages the executor touches — code, +/// heap, private input, and stack. This test surfaces that count without +/// running the prover. +/// +/// Layout (per `executor::constants` + `bench_vs/lambda/recursion/src/main.rs`): +/// - Code/static: whatever PT_LOAD segments the recursion ELF carries. +/// - Heap: `_end .. 0xC000_0000` (`MAX_MEMORY_SIZE`); `TlsfHeap` scatters +/// allocations across this region. +/// - Private input: starts at `PRIVATE_INPUT_START_INDEX = 0xFF000000`. +/// - Stack: top of address space (down from `STACK_TOP = 0xFFFFFFFFFFFFFFF0`). +/// +/// Interpretation (rough): +/// - <1,000 pages: PAGE-table overhead is not the bottleneck. +/// - 10k-100k pages: TLSF heap fragmentation; design a tighter bump allocator +/// and re-measure. +/// - >100k pages: postcard decode dominates; consider streaming decode. +#[test] +#[ignore = "diagnostic: counts distinct 4 KB memory pages touched by the recursion guest"] +fn test_recursion_page_count() { + use executor::constants::PRIVATE_INPUT_START_INDEX; + use executor::elf::Elf; + use executor::vm::execution::Executor; + use std::collections::HashSet; + + let root = workspace_root(); + build_elfs(&root); + let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench"); + let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("page-count", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + // Precompute the recursion ELF's PT_LOAD ranges so we can bucket code/ + // static pages separately from heap. `Elf::load` already expands BSS + // (memsz > filesz) into zero-valued words, so these ranges cover + // .text + .rodata + .data + .bss. + let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); + let segment_ranges: Vec<(u64, u64)> = program + .data + .iter() + .map(|seg| (seg.base_addr, seg.base_addr + (seg.values.len() as u64 * 4))) + .collect(); + eprintln!( + "[page-count] recursion ELF: {} PT_LOAD segment(s)", + segment_ranges.len(), + ); + for (i, (lo, hi)) in segment_ranges.iter().enumerate() { + eprintln!( + "[page-count] segment[{i}]: 0x{lo:016x} .. 0x{hi:016x} ({} bytes)", + hi - lo, + ); + } + + // Stream through execution — running to completion via `Executor::run` + // would accumulate ~67 M `Log` records (~2.7 GB) we don't need. We only + // care about the *final* memory state. + eprintln!("[page-count] executing recursion guest (streaming) ..."); + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |_log| ControlFlow::Continue(()), + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(50) { + eprintln!("[page-count] ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"); + } + }, + ); + + // Collect the set of distinct 4 KB pages from every cell touched during + // (a) program loading, (b) private-input loading, (c) execution. + const PAGE_MASK: u64 = !0xFFFu64; + let cells = executor.memory().cells(); + let total_cells = cells.len(); + let pages: HashSet = cells.keys().map(|&a| a & PAGE_MASK).collect(); + + // Bucket by region. A "code/static" page is any page that overlaps a + // PT_LOAD segment. Stack lives near the top of the 64-bit address + // space; private input lives in the [0xFF000000, ...) window above the + // 3 GB heap ceiling. + const HEAP_CEILING: u64 = 0xC000_0000; + const STACK_FLOOR: u64 = 0xFFFF_FFFF_0000_0000; + + let mut code_pages = 0usize; + let mut heap_pages = 0usize; + let mut private_input_pages = 0usize; + let mut stack_pages = 0usize; + let mut other_pages = 0usize; + + for &page in &pages { + let page_end = page.saturating_add(0x1000); + let in_code = segment_ranges + .iter() + .any(|&(lo, hi)| page < hi && lo < page_end); + if in_code { + code_pages += 1; + } else if page >= STACK_FLOOR { + stack_pages += 1; + } else if page >= PRIVATE_INPUT_START_INDEX { + private_input_pages += 1; + } else if page < HEAP_CEILING { + heap_pages += 1; + } else { + other_pages += 1; + } + } + + eprintln!(); + eprintln!("============================================================"); + eprintln!(" RECURSION GUEST PAGE-COUNT SUMMARY"); + eprintln!("============================================================"); + eprintln!(" Total cycles : {total_cycles}"); + eprintln!(" Executor wall time : {exec_time:?}"); + eprintln!(" Memory cells touched (4 B ea) : {total_cells}"); + eprintln!(" Distinct 4 KB pages touched : {}", pages.len()); + eprintln!(); + eprintln!(" Pages per region:"); + eprintln!(" code/static (ELF segments) : {code_pages}"); + eprintln!(" heap (0..0xC000_0000) : {heap_pages}"); + eprintln!(" private input (0xFF000000..) : {private_input_pages}"); + eprintln!(" stack (>= 0xFFFFFFFF_00000000) : {stack_pages}"); + if other_pages > 0 { + eprintln!(" other (unclassified) : {other_pages}"); + } + eprintln!(); + eprintln!(" Interpretation (PAGE-table overhead):"); + eprintln!(" <1k pages → PAGE overhead is not the bottleneck."); + eprintln!(" 10k-100k → TLSF heap fragmentation; try a bump alloc."); + eprintln!(" >100k → postcard decode dominates; stream-decode?"); + eprintln!("============================================================"); +} + +/// Build a PC histogram of the recursion guest verifying an `empty`-program +/// inner proof produced with `inner_proof_options`, and print it via +/// [`print_pc_histogram`] under `title`. +/// +/// `blowup_factor` and `fri_number_of_queries` are coupled (the query count is +/// derived from blowup for a fixed security target), so each `#[test]` below is +/// just this runner with a different `ProofOptions` — a single query at low +/// blowup, vs. the security-derived multi-query count at a higher blowup. +/// +/// Streams chunks of logs via `Executor::resume()` so memory stays bounded to +/// the histogram itself. Each PC is resolved to its enclosing function via the +/// in-house `executor::elf::SymbolTable` (reading the recursion ELF's symbol +/// table directly — no external tool, no DWARF dependency). +fn run_recursion_pc_histogram(title: &str, inner_proof_options: stark::proof::options::ProofOptions) { + use executor::elf::Elf; + use executor::vm::execution::Executor; + use std::collections::HashMap; + + let root = workspace_root(); + build_elfs(&root); + let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench"); + let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("pc-hist", &empty_elf_bytes, &[], &inner_proof_options); + + eprintln!("[pc-hist] executing recursion guest (building PC histogram) ..."); + let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + + let mut pc_hist: HashMap = HashMap::with_capacity(300_000); + let unique = std::cell::Cell::new(0usize); + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |log| { + *pc_hist.entry(log.current_pc).or_insert(0) += 1; + unique.set(pc_hist.len()); + ControlFlow::Continue(()) + }, + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(500) { + eprintln!( + "[pc-hist] ... {chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}", + unique.get() + ); + } + }, + ); + + // Resolve PCs to functions directly from the ELF's symbol table. + let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes); + print_pc_histogram(title, &symbols, pc_hist, total_cycles, exec_time); +} + +/// Diagnostic: PC histogram of the recursion guest with a **single** FRI query +/// at blowup=2 — the cheapest verifier run, dominated by fixed setup cost +/// (decode, allocator, postcard) rather than per-query FRI/Merkle work. +#[test] +#[ignore = "diagnostic: ~8 minutes; prints PC histogram of the verifier-in-VM"] +fn test_recursion_pc_histogram_1query() { + run_recursion_pc_histogram( + "RECURSION GUEST PC HISTOGRAM (blowup=2, 1 query)", + MIN_PROOF_OPTIONS, + ); +} + +/// Diagnostic: PC histogram of the recursion guest at **128-bit security** +/// (blowup=8, FRI query count derived by the Johnson Bound Regime — tens of +/// queries). Compared against the single-query runs, weight shifts toward the +/// verifier's per-query FRI-layer / Merkle-opening and field arithmetic. +#[test] +#[ignore = "diagnostic: heavy; PC histogram of the multi-query verifier-in-VM"] +fn test_recursion_pc_histogram_multiquery() { + let inner_proof_options = crate::GoldilocksCubicProofOptions::with_blowup(8) + .expect("blowup=8 is always valid"); + run_recursion_pc_histogram( + &format!( + "RECURSION GUEST PC HISTOGRAM (blowup=8, {} queries, 128-bit)", + inner_proof_options.fri_number_of_queries + ), + inner_proof_options, + ); +} + +/// Diagnostic: build a **sampled** call-stack histogram of the recursion guest. +/// +/// Like `test_recursion_pc_histogram` but groups by full call stack (not PC). +/// To stay fast, only every `SAMPLE_RATE`-th log is recorded into the histogram. +/// The call stack itself is updated on every log (skipping would corrupt it). +/// +/// Output is written to `/tmp/recursion_folded_sampled.txt` in +/// inferno-flamegraph "folded stacks" format. Pipe it through: +/// +/// cat /tmp/recursion_folded_sampled.txt | inferno-flamegraph > svg.svg +/// +/// Expect ~10-20 minutes for SAMPLE_RATE=100 on a 40B-cycle guest. +#[test] +#[ignore = "diagnostic: sampled flamegraph for the verifier-in-VM"] +fn test_recursion_sampled_flamegraph() { + use executor::elf::Elf; + use executor::flamegraph::FlamegraphGenerator; + use executor::vm::execution::Executor; + use std::io::BufWriter; + + /// 1 in N logs is fed to `process_logs`, which both updates the call + /// stack and records a sample. At 1, every cycle goes through — the call + /// stack stays exactly in sync with execution so frame widths are + /// trustworthy, but the per-cycle cost (~57µs) limits how many cycles + /// we can cover within a wall-clock budget. + /// + /// At SAMPLE_RATE > 1, every CALL/RETURN that lands on a skipped cycle + /// silently desyncs the stack, producing the "stuck-in-visit_seq" effect + /// we saw at 1:1000. Use values > 1 only when stack accuracy is + /// expendable. + const SAMPLE_RATE: usize = 1; + + /// Stop the executor early once we've covered this many cycles. + /// Set to 0 to run to completion (40B+ cycles, hours at SAMPLE_RATE=1). + /// At SAMPLE_RATE=1, ~57µs per cycle means 5M cycles ≈ 5 min wall time. + const CYCLE_BUDGET: u64 = 5_000_000; + + let root = workspace_root(); + build_elfs(&root); + let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench"); + let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("sampled-fg", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + eprintln!("[sampled-fg] executing recursion guest (sampling 1-in-{SAMPLE_RATE}) ...",); + let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); + let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes); + let entry_point = program.entry_point; + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + + // Build our own instruction cache from the same segments `Executor::new` + // decodes internally. Owning it (rather than reading `executor.instructions` + // mid-loop) is what lets the per-log closure call `process_logs` without + // borrowing `executor`, which `drive_executor` holds mutably for `resume()`. + let instructions = executor::vm::execution::InstructionCache::new(&program.data) + .expect("instruction cache build failed"); + + // RefCell so the per-log closure (`process_logs`, &mut self) and the + // progress closure (`write_folded`, &self) can both reach the generator — + // their calls never overlap, so the runtime borrow check never trips. + let generator = std::cell::RefCell::new(FlamegraphGenerator::new(symbols, entry_point)); + + // Path is defined here (not after the loop) so the periodic checkpoint + // writes below can target it. The final write at the end still happens. + let path = "/tmp/recursion_folded_sampled.txt"; + + let mut i = 0usize; + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |log| { + // 1-in-SAMPLE_RATE logs are fed to `process_logs`. At SAMPLE_RATE==1 + // this is the identity filter (`_ % 1 == 0`); the `#[allow]` keeps + // the general form so SAMPLE_RATE can be bumped without touching the + // body. Skipped logs lose stack accuracy — acceptable diagnostic + // quality at higher rates. + #[allow(clippy::modulo_one)] + let take = i % SAMPLE_RATE == 0; + if take { + generator + .borrow_mut() + .process_logs(std::slice::from_ref(log), &instructions) + .expect("flamegraph process_logs"); + } + i += 1; + + // Early exit once we've covered the cycle budget. The dominant hot + // kernels are ~uniform across the verifier's runtime, so a partial + // run still surfaces them. `#[allow]` lets CYCLE_BUDGET be const-0 + // (full run) without tripping clippy. + #[allow(clippy::absurd_extreme_comparisons)] + if CYCLE_BUDGET > 0 && i as u64 >= CYCLE_BUDGET { + eprintln!("[sampled-fg] hit cycle budget ({CYCLE_BUDGET} cycles), stopping early"); + ControlFlow::Break(()) + } else { + ControlFlow::Continue(()) + } + }, + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(500) { + eprintln!("[sampled-fg] ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"); + // Checkpoint: re-write the folded file in place so a killed run + // still leaves a usable (if partial) flamegraph on disk. + let file = std::fs::File::create(path).expect("create output file"); + let mut writer = BufWriter::new(file); + generator + .borrow() + .write_folded(&mut writer) + .expect("write folded output"); + } + }, + ); + + let file = std::fs::File::create(path).expect("create output file"); + let mut writer = BufWriter::new(file); + generator + .borrow() + .write_folded(&mut writer) + .expect("write folded output"); + + eprintln!(); + eprintln!("============================================================"); + eprintln!(" SAMPLED FLAMEGRAPH SUMMARY"); + eprintln!("============================================================"); + eprintln!(" Total cycles : {total_cycles}"); + eprintln!(" Sample rate : 1 in {SAMPLE_RATE}"); + eprintln!(" Exec time : {exec_time:?}"); + eprintln!(" Output file : {path}"); + eprintln!("============================================================"); + eprintln!(); + eprintln!(" To render SVG (requires inferno):"); + eprintln!(" cat {path} | inferno-flamegraph > /tmp/recursion_flamegraph_sampled.svg"); + eprintln!("============================================================"); +} + +/// Diagnostic: host-side per-step timings for the verifier. +/// +/// Runs an inner prove (empty guest, blowup=2, 1 query) and then verifies it +/// on the host. When built with `--features stark/instruments`, the verifier +/// prints `Time spent: ...` for each of the four steps (replay challenges, +/// composition polynomial, FRI, DEEP openings) plus the step-1-replay it +/// does before step 2. Lets us see the host-side split in seconds, without +/// running anything inside the VM. +/// +/// Usage: +/// ``` +/// cargo test --release -p lambda-vm-prover --features stark/instruments \ +/// --lib test_host_verify_step_timings -- --ignored --nocapture +/// ``` +#[test] +#[ignore = "diagnostic: prints host-side verifier step timings"] +fn test_host_verify_step_timings() { + let root = workspace_root(); + let empty_path = + root.join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench"); + if !empty_path.exists() { + build_elfs(&root); + } + let empty_elf_bytes = std::fs::read(&empty_path).expect("read empty-bench"); + + let inner_proof_options = MIN_PROOF_OPTIONS; + + eprintln!("[host-verify] proving empty (blowup=2, fri_queries=1) ..."); + let inner_proof = crate::prove_with_options_and_inputs( + &empty_elf_bytes, + &[], + &inner_proof_options, + &crate::MaxRowsConfig::default(), + ) + .expect("inner prove should succeed"); + + eprintln!("[host-verify] verifying on host (with instruments) ..."); + let ok = + crate::verify_with_options(&inner_proof, &empty_elf_bytes, &inner_proof_options, None, None) + .expect("verify errored"); + assert!(ok, "proof must verify"); + eprintln!("[host-verify] verified OK"); +} + +/// Diagnostic: cycle count for the **deserialize-only** counterpart of the +/// recursion guest. Same input layout +/// (`(VmProof, Vec, ProofOptions)`) and same proof, but +/// the guest just postcard-decodes the blob and halts — it never calls +/// `verify_with_options`. +/// +/// The cycle delta between this and `test_recursion_cycle_count` is the +/// actual cost of the STARK verifier inside the VM. Historically (40.5 B-cycle +/// recursion guest) postcard decode was ~15.6 M cycles — negligible. Now that +/// the recursion guest is ~67 M cycles, the same absolute cost would be ~23% +/// of total; this test re-measures it. +#[test] +#[ignore = "diagnostic: runs the deserialize-only guest, prints cycle count"] +fn test_deserialize_only_cycle_count() { + use executor::elf::Elf; + use executor::vm::execution::Executor; + + let root = workspace_root(); + build_elfs(&root); + let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench"); + let deser_elf_bytes = read_guest_elf(&root, "deserialize-only", "deserialize-only-bench"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("deser-only", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + eprintln!("[deser-only] executing deserialize-only guest (streaming) ..."); + let program = Elf::load(&deser_elf_bytes).expect("ELF load failed"); + eprintln!( + "[deser-only] ELF: {} bytes, entry_point=0x{:x}", + deser_elf_bytes.len(), + program.entry_point, + ); + assert_ne!( + program.entry_point, 0, + "deserialize-only ELF has entry_point=0 — build artifact is malformed" + ); + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |_log| ControlFlow::Continue(()), + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(50) { + eprintln!("[deser-only] ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"); + } + }, + ); + let cycle_count = total_cycles; + + eprintln!(); + eprintln!("============================================================"); + eprintln!(" DESERIALIZE-ONLY GUEST EXECUTION SUMMARY"); + eprintln!("============================================================"); + eprintln!(" Cycle count : {cycle_count}"); + eprintln!(" Executor wall time : {exec_time:?}"); + eprintln!(); + eprintln!(" Compare against test_recursion_cycle_count (~40.5B cycles"); + eprintln!(" with the same proof). Delta = verifier-in-VM cost."); + eprintln!("============================================================"); +} + +/// Diagnostic: PC histogram for the **deserialize-only** guest. +/// +/// Sibling of `test_recursion_pc_histogram`, but targeting the +/// deserialize-only control guest so we can locate the hot kernel inside the +/// 15.7 M-cycle postcard decode itself. Every cycle goes through the +/// histogram (no sampling), so attribution is exact — the previous sampled +/// flamegraph at 1:1000 had broken stack reconstruction on skipped +/// CALL/RETURNs, which made it unreliable for a workload this small. +/// +/// Each top PC is resolved to its enclosing function via the in-house +/// `executor::elf::SymbolTable`, reading the guest ELF's symbol table directly +/// (no external tool, no DWARF dependency). +#[test] +#[ignore = "diagnostic: ~1 min; PC histogram for the deserialize-only guest"] +fn test_deserialize_only_pc_histogram() { + use executor::elf::Elf; + use executor::vm::execution::Executor; + use std::collections::HashMap; + + let root = workspace_root(); + build_elfs(&root); + let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench"); + let deser_elf_bytes = read_guest_elf(&root, "deserialize-only", "deserialize-only-bench"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("deser-pc-hist", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + eprintln!("[deser-pc-hist] executing deserialize-only guest (building PC histogram) ..."); + let program = Elf::load(&deser_elf_bytes).expect("ELF load failed"); + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + + // ~50k unique PCs is plenty: the deserialize-only guest is ~74 KB of ELF + // (~18k 4-byte instructions); the hot inner loop is much smaller still. + let mut pc_hist: HashMap = HashMap::with_capacity(50_000); + let unique = std::cell::Cell::new(0usize); + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |log| { + *pc_hist.entry(log.current_pc).or_insert(0) += 1; + unique.set(pc_hist.len()); + ControlFlow::Continue(()) + }, + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(50) { + eprintln!( + "[deser-pc-hist] ... {chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}", + unique.get() + ); + } + }, + ); + + // Resolve PCs to functions directly from the ELF's symbol table. + let symbols = executor::elf::SymbolTable::parse(&deser_elf_bytes); + print_pc_histogram( + "DESERIALIZE-ONLY GUEST PC HISTOGRAM", + &symbols, + pc_hist, + total_cycles, + exec_time, + ); +} + +/// Diagnostic: bucket the recursion guest's cycles by which verifier step +/// is currently executing. +/// +/// The verifier's hot path is `verify_rounds_2_to_4`, which calls four +/// sub-routines in a fixed order: +/// 1. `replay_rounds_after_round_1` (recover challenges) +/// 2. `step_2_verify_claimed_composition_polynomial` +/// 3. `step_3_verify_fri` +/// 4. `step_4_verify_trace_and_composition_openings` +/// +/// We resolve each sub-routine's entry PC from the recursion ELF's symbol +/// table, then run a monotonic state machine over the execution stream: +/// the active bucket only advances 0 → 1 → 2 → 3 → 4 (never backwards), +/// so cycles inside a step's callees stay attributed to that step. +/// +/// Bucket 0 ("setup") captures everything before step 1 is entered — the +/// allocator init, postcard decode, and `VmAirs::new` (which contains the +/// expensive preprocessed-commitment FFTs). +/// +/// Streams chunks via `Executor::resume()` so memory stays bounded. +#[test] +#[ignore = "diagnostic: ~13 min; buckets the 40B cycles by verifier step"] +fn test_recursion_step_breakdown() { + use executor::elf::{Elf, SymbolTable}; + use executor::vm::execution::Executor; + + let root = workspace_root(); + build_elfs(&root); + let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench"); + let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("step-bkd", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + // Build a per-step "advance bucket to N" lookup. The verifier's step + // functions get inlined by LLVM in release mode, so we can't rely on + // matching their entry PCs directly. Instead we anchor on closures the + // compiler emits *inside* each step's body — iterator combinators like + // `.fold(|...|)` keep the step's method name as a substring in their + // mangled symbol. Any PC that resolves to a symbol containing step N's + // keyword advances the bucket to N (monotonically). + // + // If step N has no matching symbol at all (e.g. step 4 is fully inlined + // with no closure children of its own), its cycles get attributed to the + // previous bucket. We report that explicitly in the summary. + let symbols = SymbolTable::parse(&recursion_elf_bytes); + assert!( + !symbols.is_empty(), + "recursion ELF has no symbol table — was it stripped?" + ); + + let step_keywords = [ + "replay_rounds_after_round_1", + "step_2_verify_claimed_composition_polynomial", + "step_3_verify_fri", + "step_4_verify_trace_and_composition_openings", + ]; + let step_found: [bool; 4] = std::array::from_fn(|i| { + symbols + .functions() + .iter() + .any(|f| f.name.contains(step_keywords[i])) + }); + for (i, found) in step_found.iter().enumerate() { + let n_matches = symbols + .functions() + .iter() + .filter(|f| f.name.contains(step_keywords[i])) + .count(); + eprintln!( + "[step-bkd] step {}: keyword={:?} -> {} symbol(s) {}", + i + 1, + step_keywords[i], + n_matches, + if *found { + "" + } else { + "(fully inlined; will merge into the previous bucket)" + } + ); + } + + // Monotonic state machine: 0=setup, 1..=4=inside step N (or its callees / + // inlined-step-N-cycles attributed here because step N+1 is missing). + // `bucket` lives in a Cell so the per-log closure can advance it while the + // progress closure reads it for its live readout. + let bucket = std::cell::Cell::new(0u8); + let mut buckets = [0u64; 5]; + + eprintln!("[step-bkd] executing recursion guest (streaming) ..."); + let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + + // Cache the last symbol-table hit so we only do a binary search on + // function transitions, not on every cycle. Functions are typically + // long-running (>>1 instruction), so this cache hits ~all of the time. + let mut last_range: Option<(u64, u64)> = None; + let mut last_advance: u8 = 0; + + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |log| { + let pc = log.current_pc; + let in_cached = matches!(last_range, Some((s, e)) if pc >= s && pc < e); + if !in_cached { + // Slow path: refresh the cache from the symbol table. + if let Some(sym) = symbols.lookup(pc) { + // SymbolTable accepts size=0 symbols as "any address >="; for + // those we'd need the next symbol's start for a real upper + // bound. Cheapest workaround: set a tiny range so we re-resolve + // soon enough that wrong attribution is bounded. + let end = sym.address + sym.size.max(1); + last_range = Some((sym.address, end)); + last_advance = 0; + for (i, kw) in step_keywords.iter().enumerate() { + if sym.name.contains(kw) { + last_advance = (i + 1) as u8; + } + } + } else { + last_range = None; + last_advance = 0; + } + } + if bucket.get() < last_advance { + bucket.set(last_advance); + } + buckets[bucket.get() as usize] += 1; + ControlFlow::Continue(()) + }, + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(500) { + eprintln!( + "[step-bkd] ... {chunks} chunks, {cycles} cycles, bucket={}, {elapsed:?}", + bucket.get() + ); + } + }, + ); + + let labels = [ + "0. setup (alloc + postcard decode + VmAirs::new + pre-step-1)", + "1. step 1: replay_rounds_after_round_1", + "2. step 2: verify_claimed_composition_polynomial", + "3. step 3: verify_fri", + "4. step 4: verify_trace_and_composition_openings (+ wrap-up)", + ]; + + eprintln!(); + eprintln!("============================================================"); + eprintln!(" RECURSION GUEST PER-STEP CYCLE BREAKDOWN"); + eprintln!("============================================================"); + eprintln!(" Total cycles : {total_cycles}"); + eprintln!(" Exec time : {exec_time:?}"); + eprintln!(); + eprintln!(" {:<60} {:>14} {:>7}", "bucket", "cycles", "%"); + for (label, cycles) in labels.iter().zip(buckets.iter()) { + let pct = if total_cycles > 0 { + 100.0 * (*cycles as f64) / (total_cycles as f64) + } else { + 0.0 + }; + eprintln!(" {:<60} {:>14} {:>6.2}%", label, cycles, pct); + } + eprintln!("============================================================"); +} + /// Inner program: fibonacci(10). #[test] #[ignore = "slow: runs the full STARK verifier inside the VM"]