From c759182da9998284c802e369bb894a23fb1bba4b Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Wed, 17 Jun 2026 15:56:22 +0000 Subject: [PATCH 1/2] [MINOR][CI] Add hard guard that dumps stacks on stalled test forks Some Java test forks intermittently stall in a way that surefire's own timeouts never catch, so the job runs until the GitHub Actions cap and is cancelled with no output to diagnose, and the stall does not reproduce locally. Add an outer guard in the docker test entrypoint that watches the test log for a stall (no new line for a window kept just above the per-fork surefire timeout) and enforces an absolute runtime ceiling below the job cap. On either trigger it dumps thread stacks from every JVM in the test process tree via SIGQUIT (relayed into the job log) plus a jstack file backup, then force-kills the tree so the job fails fast with stacks instead of being cancelled empty-handed. Limits are overridable via SYSDS_TEST_STALL_LIMIT and SYSDS_TEST_MAX_RUNTIME. Also set surefire runOrder to alphabetical so a hang reproduces at a stable class boundary, making the responsible class identifiable from the dumps. --- docker/entrypoint.sh | 85 ++++++++++++++++++++++++++++++++++++++++++-- pom.xml | 2 ++ 2 files changed, 84 insertions(+), 3 deletions(-) diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 53dfabb96e6..f80913d6f6b 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -51,15 +51,94 @@ if [ "$compile_transient_failure" = true ]; then else echo "No transient Maven repository error detected; no retry needed." fi -mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \ +# Outer guard: catch test-fork hangs that surefire's own timeouts miss, dump +# stacks for diagnosis, and kill the run before the job cap (kept just above the +# 600s per-fork timeout; MAX_RUNTIME is the absolute ceiling under the cap). 
+STALL_LIMIT="${SYSDS_TEST_STALL_LIMIT:-660}" +MAX_RUNTIME="${SYSDS_TEST_MAX_RUNTIME:-1600}" +dump_dir="/github/workspace/target/thread-dumps" +mkdir -p "$dump_dir" +jstack_bin="${JAVA_HOME:+$JAVA_HOME/bin/}jstack" + +# Emit the pid of a process and all of its descendants. +proc_tree() { + local pid=$1 child + for child in $(pgrep -P "$pid" 2>/dev/null); do proc_tree "$child"; done + echo "$pid" +} + +# SIGQUIT every JVM in the test tree (stacks relayed into $log) plus a jstack file. +dump_thread_stacks() { + local reason="$1" root="$2" ts pid comm cmd + ts=$(date +%Y%m%d-%H%M%S) + echo "================ HARD-GUARD THREAD DUMP: $reason ($ts) ================" + for pid in $(proc_tree "$root"); do + [ -r "/proc/$pid/comm" ] || continue + comm=$(cat "/proc/$pid/comm" 2>/dev/null) + case "$comm" in + java|java.bin) ;; + *) continue ;; + esac + cmd=$(tr '\0' ' ' < "/proc/$pid/cmdline" 2>/dev/null | cut -c1-160) + echo "---- SIGQUIT dump: pid=$pid comm=$comm cmd=$cmd ----" + kill -3 "$pid" 2>/dev/null + timeout 30 "$jstack_bin" -l "$pid" > "$dump_dir/jstack_${pid}_${ts}.txt" 2>&1 || true + done + # Let the JVMs flush their dumps into the relayed output stream. + sleep 12 + echo "================ END HARD-GUARD THREAD DUMP ($reason) ================" +} + +# Background the run so the guard can watch it; $1 stays unquoted to keep the extra -D flags it carries. +( mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \ | stdbuf -oL grep -Ev "already exists in destination.|Using incubator" \ - | tee $log + | tee $log ) & +runner=$! 
+ +guard_tripped=false +start=$(date +%s) +prev_lines=-1 +idle=0 +interval=15 +while kill -0 "$runner" 2>/dev/null; do + sleep "$interval" + now=$(date +%s) + runtime=$((now - start)) + lines=$(wc -l < "$log" 2>/dev/null || echo 0) + if [ "$lines" -eq "$prev_lines" ]; then + idle=$((idle + interval)) + else + idle=0 + prev_lines=$lines + fi + + reason="" + if [ "$idle" -ge "$STALL_LIMIT" ]; then + reason="no test output for ${idle}s (stall limit ${STALL_LIMIT}s)" + elif [ "$runtime" -ge "$MAX_RUNTIME" ]; then + reason="exceeded absolute runtime ${runtime}s (max ${MAX_RUNTIME}s)" + fi + + if [ -n "$reason" ]; then + guard_tripped=true + { + echo "" + echo "##[error] HARD GUARD TRIPPED: $reason" + echo "Last test classes seen before the stall:" + grep -E "Running org.apache" "$log" | tail -5 + } | tee -a "$log" + dump_thread_stacks "$reason" "$runner" 2>&1 | tee -a "$log" + for pid in $(proc_tree "$runner"); do kill -9 "$pid" 2>/dev/null; done + break + fi +done +wait "$runner" 2>/dev/null grep_args="SUCCESS" grepvals="$( tail -n 100 $log | grep $grep_args)" -if [[ $grepvals == *"SUCCESS"* ]]; then +if [ "$guard_tripped" = false ] && [[ $grepvals == *"SUCCESS"* ]]; then # Merge Federated test runs. # if merged jacoco exist temporarily rename to not overwrite. 
[ -f target/jacoco.exec ] && mv target/jacoco.exec target/jacoco_main.exec diff --git a/pom.xml b/pom.xml index 5762dc2289e..42a00c469ca 100644 --- a/pom.xml +++ b/pom.xml @@ -411,6 +411,8 @@ ${test-forkCount} false + + alphabetical ${test-forkedProcessTimeout} From ea945c2abf2676380a242eb5391447835d632cc2 Mon Sep 17 00:00:00 2001 From: Jannik Lindemann Date: Mon, 22 Jun 2026 17:07:02 +0200 Subject: [PATCH 2/2] Add More Detailed Logs Regarding Stalling Forks --- .github/workflows/javaTests.yml | 24 ++++++++++++++++++------ docker/entrypoint.sh | 17 +++++++++++++---- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/.github/workflows/javaTests.yml b/.github/workflows/javaTests.yml index 0d4c71e946b..db8088d7c23 100644 --- a/.github/workflows/javaTests.yml +++ b/.github/workflows/javaTests.yml @@ -98,12 +98,6 @@ jobs: - name: Checkout Repository uses: actions/checkout@v6 - - name: ${{ matrix.tests }} - uses: ./.github/action/ - id: test - with: - test-to-run: ${{ matrix.tests }} - - name: Clean Github Artifact Name of Asterisks run: | ARTIFACT_NAME="transient_jacoco" @@ -111,6 +105,24 @@ jobs: ARTIFACT_NAME=${ARTIFACT_NAME//\*/x} # replace * with x echo "ARTIFACT_NAME=$ARTIFACT_NAME" >> $GITHUB_ENV + - name: ${{ matrix.tests }} + uses: ./.github/action/ + id: test + with: + test-to-run: ${{ matrix.tests }} + + - name: Save Java Test Diagnostics as Artifact + if: always() + uses: actions/upload-artifact@v7 + with: + name: diagnostics-${{ env.ARTIFACT_NAME }} + path: | + target/sysdstest.log + target/thread-dumps/** + target/surefire-reports/** + if-no-files-found: ignore + retention-days: 7 + - name: Save Java Test Coverage as Artifact uses: actions/upload-artifact@v7 with: diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index f80913d6f6b..71811b7f5d0 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -32,7 +32,9 @@ export MAVEN_OPTS="-Xmx512m" # error), unlike genuine compilation or test failures which fail fast. 
transient_mvn_error="Could not transfer artifact" -log="/tmp/sysdstest.log" +target_dir="/github/workspace/target" +mkdir -p "$target_dir" +log="$target_dir/sysdstest.log" compile_log="$(mktemp)" # test-compile downloads all dependencies; retry once on a transient repo # error so the test run below can resolve them from the local cache. @@ -56,7 +58,7 @@ fi # 600s per-fork timeout; MAX_RUNTIME is the absolute ceiling under the cap). STALL_LIMIT="${SYSDS_TEST_STALL_LIMIT:-660}" MAX_RUNTIME="${SYSDS_TEST_MAX_RUNTIME:-1600}" -dump_dir="/github/workspace/target/thread-dumps" +dump_dir="$target_dir/thread-dumps" mkdir -p "$dump_dir" jstack_bin="${JAVA_HOME:+$JAVA_HOME/bin/}jstack" @@ -69,7 +71,7 @@ proc_tree() { # SIGQUIT every JVM in the test tree (stacks relayed into $log) plus a jstack file. dump_thread_stacks() { - local reason="$1" root="$2" ts pid comm cmd + local reason="$1" root="$2" ts pid comm cmd jstack_file ts=$(date +%Y%m%d-%H%M%S) echo "================ HARD-GUARD THREAD DUMP: $reason ($ts) ================" for pid in $(proc_tree "$root"); do @@ -82,7 +84,14 @@ dump_thread_stacks() { cmd=$(tr '\0' ' ' < "/proc/$pid/cmdline" 2>/dev/null | cut -c1-160) echo "---- SIGQUIT dump: pid=$pid comm=$comm cmd=$cmd ----" kill -3 "$pid" 2>/dev/null - timeout 30 "$jstack_bin" -l "$pid" > "$dump_dir/jstack_${pid}_${ts}.txt" 2>&1 || true + jstack_file="$dump_dir/jstack_${pid}_${ts}.txt" + if timeout 30 "$jstack_bin" -l "$pid" > "$jstack_file" 2>&1; then + echo "---- jstack dump: pid=$pid file=$jstack_file ----" + else + echo "---- jstack dump failed or timed out: pid=$pid file=$jstack_file ----" + fi + cat "$jstack_file" || true + echo "---- end jstack dump: pid=$pid ----" done # Let the JVMs flush their dumps into the relayed output stream. sleep 12