From e8a8eb95b9e79c2e134079286ad0128a4f2eacb3 Mon Sep 17 00:00:00 2001 From: Artur Shiriev Date: Tue, 16 Jun 2026 12:58:40 +0300 Subject: [PATCH 1/9] feat(circuit-breaker): add time-bucketed _RollingWindow recorder Co-Authored-By: Claude Opus 4.8 (1M context) --- .../middleware/resilience/circuit_breaker.py | 52 ++++++++++++++ tests/test_rolling_window.py | 70 +++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 tests/test_rolling_window.py diff --git a/src/httpware/middleware/resilience/circuit_breaker.py b/src/httpware/middleware/resilience/circuit_breaker.py index 795a001..2ddd40a 100644 --- a/src/httpware/middleware/resilience/circuit_breaker.py +++ b/src/httpware/middleware/resilience/circuit_breaker.py @@ -50,6 +50,8 @@ _DEFAULT_FAILURE_STATUS_CODES = frozenset(range(500, 600)) +_BUCKET_COUNT = 10 + _ROLE_CLOSED = "closed" _ROLE_PROBE = "probe" @@ -62,6 +64,56 @@ class _CircuitState(enum.Enum): HALF_OPEN = "half_open" +class _RollingWindow: + """Time-bucketed success/failure counters over a rolling window. + + `window_seconds` is split into `_BUCKET_COUNT` buckets. Each bucket holds + [successes, failures] tagged with the integer time-slot it represents; a + bucket whose slot is stale is reset on write, and `totals` filters to the + live slot range so data older than the window never counts. Every method is + synchronous and reads `now` from its caller (so the breaker's critical + section owns the clock read). 
+ """ + + def __init__(self, window_seconds: float) -> None: + self._bucket_width = window_seconds / _BUCKET_COUNT + self._slot = [-1] * _BUCKET_COUNT + self._success = [0] * _BUCKET_COUNT + self._failure = [0] * _BUCKET_COUNT + + def _current_slot(self, now: float) -> int: + return int(now // self._bucket_width) + + def record(self, now: float, *, failed: bool) -> None: + slot = self._current_slot(now) + index = slot % _BUCKET_COUNT + if self._slot[index] != slot: # bucket reused for a new slot — evict + self._slot[index] = slot + self._success[index] = 0 + self._failure[index] = 0 + if failed: + self._failure[index] += 1 + else: + self._success[index] += 1 + + def totals(self, now: float) -> tuple[int, int]: + """Return (total, failures) across buckets still inside the window at `now`.""" + slot = self._current_slot(now) + oldest = slot - _BUCKET_COUNT + 1 + total = 0 + failures = 0 + for i in range(_BUCKET_COUNT): + if oldest <= self._slot[i] <= slot: + total += self._success[i] + self._failure[i] + failures += self._failure[i] + return total, failures + + def clear(self) -> None: + self._slot = [-1] * _BUCKET_COUNT + self._success = [0] * _BUCKET_COUNT + self._failure = [0] * _BUCKET_COUNT + + class _CircuitBreakerState: """Lock-free circuit-breaker state machine shared by the sync + async wrappers. 
diff --git a/tests/test_rolling_window.py b/tests/test_rolling_window.py new file mode 100644 index 0000000..b407cc7 --- /dev/null +++ b/tests/test_rolling_window.py @@ -0,0 +1,70 @@ +"""Unit tests for the time-bucketed _RollingWindow used by rate-mode CircuitBreaker.""" + +from hypothesis import given, settings +from hypothesis import strategies as st + +from httpware.middleware.resilience.circuit_breaker import _BUCKET_COUNT, _RollingWindow + + +def test_counts_within_window() -> None: + w = _RollingWindow(window_seconds=10.0) + w.record(0.0, failed=True) + w.record(1.0, failed=True) + w.record(2.0, failed=False) + total, failures = w.totals(2.0) + assert (total, failures) == (3, 2) + + +def test_empty_window_is_zero() -> None: + w = _RollingWindow(window_seconds=10.0) + assert w.totals(0.0) == (0, 0) + + +def test_stale_buckets_evicted_by_time() -> None: + w = _RollingWindow(window_seconds=10.0) + w.record(0.0, failed=True) + w.record(0.5, failed=True) + # advance a full window past those records + w.record(11.0, failed=False) + total, failures = w.totals(11.0) + assert (total, failures) == (1, 0) + + +def test_totals_excludes_stale_without_new_write() -> None: + w = _RollingWindow(window_seconds=10.0) + w.record(0.0, failed=True) + # no write after the window elapses — totals() alone must drop the stale bucket + assert w.totals(20.0) == (0, 0) + + +def test_clear_resets_everything() -> None: + w = _RollingWindow(window_seconds=10.0) + w.record(0.0, failed=True) + w.record(1.0, failed=False) + w.clear() + assert w.totals(1.0) == (0, 0) + + +@given( + events=st.lists( + st.tuples(st.floats(min_value=0.0, max_value=1000.0), st.booleans()), + min_size=1, + max_size=200, + ), +) +@settings(max_examples=100, deadline=None) +def test_totals_match_live_events(events: list[tuple[float, bool]]) -> None: + """totals() at the final time asserts the live-window totals exactly.""" + window_seconds = 10.0 + w = _RollingWindow(window_seconds=window_seconds) + ordered = 
sorted(events, key=lambda e: e[0]) + for now, failed in ordered: + w.record(now, failed=failed) + final = ordered[-1][0] + bucket_width = window_seconds / _BUCKET_COUNT + live_cutoff_slot = int(final // bucket_width) - _BUCKET_COUNT + 1 + expected_live = [(t, f) for (t, f) in ordered if int(t // bucket_width) >= live_cutoff_slot] + total, failures = w.totals(final) + assert total == len(expected_live) + assert failures == sum(1 for _, f in expected_live if f) + assert 0 <= failures <= total From 9d60fd107a127f258d7fd54dc632f7294592f8fc Mon Sep 17 00:00:00 2001 From: Artur Shiriev Date: Tue, 16 Jun 2026 13:07:18 +0300 Subject: [PATCH 2/9] feat(circuit-breaker): thread rate-mode config + validation Co-Authored-By: Claude Opus 4.8 (1M context) --- .../middleware/resilience/circuit_breaker.py | 35 +++++++++++++++++-- tests/test_circuit_breaker.py | 32 ++++++++++++++++- 2 files changed, 63 insertions(+), 4 deletions(-) diff --git a/src/httpware/middleware/resilience/circuit_breaker.py b/src/httpware/middleware/resilience/circuit_breaker.py index 2ddd40a..9d4fb61 100644 --- a/src/httpware/middleware/resilience/circuit_breaker.py +++ b/src/httpware/middleware/resilience/circuit_breaker.py @@ -42,6 +42,9 @@ _FAILURE_THRESHOLD_INVALID = "failure_threshold must be >= 1" _RESET_TIMEOUT_INVALID = "reset_timeout must be >= 0" _SUCCESS_THRESHOLD_INVALID = "success_threshold must be >= 1" +_FAILURE_RATE_THRESHOLD_INVALID = "failure_rate_threshold must be in (0, 1]" +_WINDOW_SECONDS_INVALID = "window_seconds must be > 0" +_MINIMUM_CALLS_INVALID = "minimum_calls must be >= 1" _CROSS_LOOP_MSG = ( "AsyncCircuitBreaker is bound to a single event loop. First seen on {first!r}; " "current request is on {current!r}. Use one AsyncCircuitBreaker per loop; " @@ -122,13 +125,16 @@ class _CircuitBreakerState: inside a transition); the sync wrapper wraps each call in a threading.Lock. 
""" - def __init__( + def __init__( # noqa: PLR0913 — breaker state has many orthogonal knobs; a dataclass would be worse self, *, failure_threshold: int, reset_timeout: float, success_threshold: int, failure_status_codes: Collection[int] | None, + failure_rate_threshold: float | None, + window_seconds: float, + minimum_calls: int, now: Callable[[], float], ) -> None: if failure_threshold < 1: @@ -137,6 +143,12 @@ def __init__( raise ValueError(_RESET_TIMEOUT_INVALID) if success_threshold < 1: raise ValueError(_SUCCESS_THRESHOLD_INVALID) + if failure_rate_threshold is not None and not (0.0 < failure_rate_threshold <= 1.0): + raise ValueError(_FAILURE_RATE_THRESHOLD_INVALID) + if window_seconds <= 0: + raise ValueError(_WINDOW_SECONDS_INVALID) + if minimum_calls < 1: + raise ValueError(_MINIMUM_CALLS_INVALID) self._failure_threshold = failure_threshold self._reset_timeout = reset_timeout self._success_threshold = success_threshold @@ -145,6 +157,11 @@ def __init__( self._failure_status_codes = ( frozenset(failure_status_codes) if failure_status_codes is not None else _DEFAULT_FAILURE_STATUS_CODES ) + self._failure_rate_threshold = failure_rate_threshold + self._minimum_calls = minimum_calls + self._rate_mode = failure_rate_threshold is not None + self._window = _RollingWindow(window_seconds) if self._rate_mode else None + self._window_seconds = window_seconds self._now = now self._state = _CircuitState.CLOSED self._consecutive_failures = 0 @@ -249,13 +266,16 @@ def _emit( class AsyncCircuitBreaker: """Async classic circuit breaker middleware. 
See the module docstring for the contract.""" - def __init__( + def __init__( # noqa: PLR0913 — breaker has many orthogonal knobs; a dataclass would be worse self, *, failure_threshold: int = 5, reset_timeout: float = 30.0, success_threshold: int = 1, failure_status_codes: Collection[int] | None = None, + failure_rate_threshold: float | None = None, + window_seconds: float = 30.0, + minimum_calls: int = 20, _now: Callable[[], float] = time.monotonic, ) -> None: self._state = _CircuitBreakerState( @@ -263,6 +283,9 @@ def __init__( reset_timeout=reset_timeout, success_threshold=success_threshold, failure_status_codes=failure_status_codes, + failure_rate_threshold=failure_rate_threshold, + window_seconds=window_seconds, + minimum_calls=minimum_calls, now=_now, ) self._loop: asyncio.AbstractEventLoop | None = None @@ -313,13 +336,16 @@ class CircuitBreaker: (one shared circuit); a sync instance cannot be shared with an AsyncClient. """ - def __init__( + def __init__( # noqa: PLR0913 — breaker has many orthogonal knobs; a dataclass would be worse self, *, failure_threshold: int = 5, reset_timeout: float = 30.0, success_threshold: int = 1, failure_status_codes: Collection[int] | None = None, + failure_rate_threshold: float | None = None, + window_seconds: float = 30.0, + minimum_calls: int = 20, _now: Callable[[], float] = time.monotonic, ) -> None: self._state = _CircuitBreakerState( @@ -327,6 +353,9 @@ def __init__( reset_timeout=reset_timeout, success_threshold=success_threshold, failure_status_codes=failure_status_codes, + failure_rate_threshold=failure_rate_threshold, + window_seconds=window_seconds, + minimum_calls=minimum_calls, now=_now, ) self._lock = threading.Lock() diff --git a/tests/test_circuit_breaker.py b/tests/test_circuit_breaker.py index 5d8a47e..23f78dc 100644 --- a/tests/test_circuit_breaker.py +++ b/tests/test_circuit_breaker.py @@ -7,6 +7,7 @@ import asyncio import logging +import re from collections.abc import Callable from http import HTTPStatus 
@@ -23,7 +24,12 @@ ServiceUnavailableError, TimeoutError, # noqa: A004 — intentional: httpware.TimeoutError shadows the builtin ) -from httpware.middleware.resilience.circuit_breaker import AsyncCircuitBreaker +from httpware.middleware.resilience.circuit_breaker import ( + _FAILURE_RATE_THRESHOLD_INVALID, + _MINIMUM_CALLS_INVALID, + _WINDOW_SECONDS_INVALID, + AsyncCircuitBreaker, +) class _Clock: @@ -503,3 +509,27 @@ async def _run_once() -> None: asyncio.run(_run_once()) # binds to loop L1 with pytest.raises(RuntimeError, match="bound to a single event loop"): asyncio.run(_run_once()) + + +# ── rate-mode config validation ── + + +@pytest.mark.parametrize("bad", [0.0, -0.1, 1.5]) +def test_rate_threshold_out_of_range_raises(bad: float) -> None: + with pytest.raises(ValueError, match=re.escape(_FAILURE_RATE_THRESHOLD_INVALID)): + AsyncCircuitBreaker(failure_rate_threshold=bad) + + +def test_non_positive_window_seconds_raises() -> None: + with pytest.raises(ValueError, match=re.escape(_WINDOW_SECONDS_INVALID)): + AsyncCircuitBreaker(failure_rate_threshold=0.5, window_seconds=0.0) + + +def test_minimum_calls_below_one_raises() -> None: + with pytest.raises(ValueError, match=re.escape(_MINIMUM_CALLS_INVALID)): + AsyncCircuitBreaker(failure_rate_threshold=0.5, minimum_calls=0) + + +def test_classic_mode_is_default_when_rate_threshold_none() -> None: + breaker = AsyncCircuitBreaker() # no failure_rate_threshold + assert breaker._state._rate_mode is False # noqa: SLF001 — white-box assertion for internal mode flag From c5fcb7a7dbe1637ebd098f62e77d7d790966bbfb Mon Sep 17 00:00:00 2001 From: Artur Shiriev Date: Tue, 16 Jun 2026 13:13:58 +0300 Subject: [PATCH 3/9] feat(circuit-breaker): rate-over-window trip mode Co-Authored-By: Claude Opus 4.8 (1M context) --- .../middleware/resilience/circuit_breaker.py | 46 +++++++++-- tests/test_circuit_breaker.py | 78 +++++++++++++++++++ tests/test_circuit_breaker_sync.py | 78 +++++++++++++++++++ 3 files changed, 194 insertions(+), 8 
deletions(-) diff --git a/src/httpware/middleware/resilience/circuit_breaker.py b/src/httpware/middleware/resilience/circuit_breaker.py index 9d4fb61..0a36af5 100644 --- a/src/httpware/middleware/resilience/circuit_breaker.py +++ b/src/httpware/middleware/resilience/circuit_breaker.py @@ -209,22 +209,30 @@ def on_success(self, role: str, request: httpx2.Request) -> None: if role == _ROLE_PROBE: self._probe_in_flight = False if self._state is _CircuitState.CLOSED: - self._consecutive_failures = 0 + if self._rate_mode: + self._record_outcome(request, failed=False) + else: + self._consecutive_failures = 0 elif self._state is _CircuitState.HALF_OPEN: self._consecutive_successes += 1 if self._consecutive_successes >= self._success_threshold: self._state = _CircuitState.CLOSED self._consecutive_failures = 0 self._consecutive_successes = 0 + if self._rate_mode: + self._window.clear() # ty: ignore[unresolved-attribute] self._emit(request, "circuit.closed", logging.INFO, "circuit closed — service recovered", {}) def on_failure(self, role: str, request: httpx2.Request) -> None: if role == _ROLE_PROBE: self._probe_in_flight = False if self._state is _CircuitState.CLOSED: - self._consecutive_failures += 1 - if self._consecutive_failures >= self._failure_threshold: - self._open(request, failures=self._consecutive_failures) + if self._rate_mode: + self._record_outcome(request, failed=True) + else: + self._consecutive_failures += 1 + if self._consecutive_failures >= self._failure_threshold: + self._open(request, failures=self._consecutive_failures) elif self._state is _CircuitState.HALF_OPEN: self._open(request, failures=1) # 1 = the single probe failure that re-opened the circuit @@ -233,19 +241,41 @@ def release_probe(self, role: str) -> None: if role == _ROLE_PROBE: self._probe_in_flight = False - def _open(self, request: httpx2.Request, *, failures: int) -> None: + def _enter_open(self, request: httpx2.Request, message: str, attributes: dict[str, typing.Any]) -> None: 
self._state = _CircuitState.OPEN self._opened_at = self._now() self._consecutive_failures = 0 self._consecutive_successes = 0 - self._emit( + self._emit(request, "circuit.opened", logging.WARNING, message, attributes) + + def _open(self, request: httpx2.Request, *, failures: int) -> None: + self._enter_open( request, - "circuit.opened", - logging.WARNING, "circuit opened — failure threshold reached", {"failure_threshold": self._failure_threshold, "failures": failures}, ) + def _open_rate(self, request: httpx2.Request, *, total: int, failures: int) -> None: + self._enter_open( + request, + "circuit opened — failure rate threshold reached", + { + "failure_rate": failures / total, + "failure_rate_threshold": self._failure_rate_threshold, + "window_seconds": self._window_seconds, + "observed_calls": total, + }, + ) + + def _record_outcome(self, request: httpx2.Request, *, failed: bool) -> None: + # Only reached in rate mode, where _window and _failure_rate_threshold are non-None. + now = self._now() + self._window.record(now, failed=failed) # ty: ignore[unresolved-attribute] + total, failures = self._window.totals(now) # ty: ignore[unresolved-attribute] + threshold = self._failure_rate_threshold + if threshold is not None and total >= self._minimum_calls and failures / total >= threshold: + self._open_rate(request, total=total, failures=failures) + def _emit( self, request: httpx2.Request, diff --git a/tests/test_circuit_breaker.py b/tests/test_circuit_breaker.py index 23f78dc..207d4a4 100644 --- a/tests/test_circuit_breaker.py +++ b/tests/test_circuit_breaker.py @@ -6,6 +6,7 @@ """ import asyncio +import contextlib import logging import re from collections.abc import Callable @@ -533,3 +534,80 @@ def test_minimum_calls_below_one_raises() -> None: def test_classic_mode_is_default_when_rate_threshold_none() -> None: breaker = AsyncCircuitBreaker() # no failure_rate_threshold assert breaker._state._rate_mode is False # noqa: SLF001 — white-box assertion for internal mode 
flag + + +# ── rate-mode trip behavior ── + + +async def test_rate_mode_trips_on_partial_failure() -> None: + """Alternating 50% failures trip rate mode (classic never would).""" + clock = _Clock() + breaker = AsyncCircuitBreaker(failure_rate_threshold=0.5, window_seconds=100.0, minimum_calls=10, _now=clock) + handler = _StatusSequence([500, 200, 500, 200, 500, 200, 500, 200, 500, 200]) + client = _client(handler, breaker=breaker) + for _ in range(10): + with contextlib.suppress(InternalServerError): + await client.get("https://example.test/x") + with pytest.raises(CircuitOpenError): + await client.get("https://example.test/x") + + +async def test_rate_mode_does_not_trip_below_minimum_calls() -> None: + clock = _Clock() + breaker = AsyncCircuitBreaker(failure_rate_threshold=0.5, window_seconds=100.0, minimum_calls=10, _now=clock) + handler = _StatusSequence([500, 500, 500]) # 3 failures, below floor of 10 + client = _client(handler, breaker=breaker) + for _ in range(3): + with pytest.raises(InternalServerError): + await client.get("https://example.test/x") + handler_ok = _StatusSequence([200]) + client_ok = _client(handler_ok, breaker=breaker) + assert (await client_ok.get("https://example.test/x")).status_code == HTTPStatus.OK + + +async def test_rate_mode_evicts_old_failures() -> None: + clock = _Clock() + breaker = AsyncCircuitBreaker(failure_rate_threshold=0.5, window_seconds=10.0, minimum_calls=4, _now=clock) + fail = _client(_StatusSequence([500, 500, 500, 500, 500, 500, 500, 500]), breaker=breaker) + for _ in range(3): + with pytest.raises(InternalServerError): + await fail.get("https://example.test/x") + clock.advance(20.0) # push them fully out of the 10s window + with pytest.raises(InternalServerError): + await fail.get("https://example.test/x") + ok = _client(_StatusSequence([200]), breaker=breaker) + assert (await ok.get("https://example.test/x")).status_code == HTTPStatus.OK + + +async def test_rate_mode_clears_window_on_close() -> None: + """Closing 
from HALF_OPEN in rate mode clears the window — recovery starts fresh. + + Discriminating: without the clear, the pre-open failures would still be inside the + window after recovery and a single post-close failure would re-cross the rate + threshold immediately. With the clear, the post-close failure is below minimum_calls + again, so the circuit stays CLOSED. + """ + clock = _Clock() + breaker = AsyncCircuitBreaker( + failure_rate_threshold=0.5, + window_seconds=100.0, + minimum_calls=2, + reset_timeout=5.0, + success_threshold=1, + _now=clock, + ) + open_client = _client(_StatusSequence([500, 500]), breaker=breaker) + for _ in range(2): + with pytest.raises(InternalServerError): + await open_client.get("https://example.test/x") + with pytest.raises(CircuitOpenError): # 2/2 failures >= 0.5 -> OPEN + await open_client.get("https://example.test/x") + clock.advance(5.0) + probe_client = _client(_StatusSequence([200]), breaker=breaker) + await probe_client.get("https://example.test/x") # probe 200 -> CLOSED, window cleared + # One fresh failure: total=1 < minimum_calls=2, so the circuit stays CLOSED. 
+ fail_client = _client(_StatusSequence([500]), breaker=breaker) + with pytest.raises(InternalServerError): + await fail_client.get("https://example.test/x") + ok_client = _client(_StatusSequence([200]), breaker=breaker) + assert (await ok_client.get("https://example.test/x")).status_code == HTTPStatus.OK diff --git a/tests/test_circuit_breaker_sync.py b/tests/test_circuit_breaker_sync.py index 8f23144..b80664c 100644 --- a/tests/test_circuit_breaker_sync.py +++ b/tests/test_circuit_breaker_sync.py @@ -1,5 +1,6 @@ """Tests for the sync CircuitBreaker middleware (mirror of AsyncCircuitBreaker).""" +import contextlib import logging import threading from collections.abc import Callable @@ -445,3 +446,80 @@ def _probe() -> None: assert not thread.is_alive() assert rejected[0].retry_after is None + + +# ── rate-mode trip behavior (sync mirror) ── + + +def test_rate_mode_trips_on_partial_failure() -> None: + """Alternating 50% failures trip rate mode (classic never would).""" + clock = _Clock() + breaker = CircuitBreaker(failure_rate_threshold=0.5, window_seconds=100.0, minimum_calls=10, _now=clock) + handler = _StatusSequence([500, 200, 500, 200, 500, 200, 500, 200, 500, 200]) + client = _client(handler, breaker=breaker) + for _ in range(10): + with contextlib.suppress(InternalServerError): + client.get("https://example.test/x") + with pytest.raises(CircuitOpenError): + client.get("https://example.test/x") + + +def test_rate_mode_does_not_trip_below_minimum_calls() -> None: + clock = _Clock() + breaker = CircuitBreaker(failure_rate_threshold=0.5, window_seconds=100.0, minimum_calls=10, _now=clock) + handler = _StatusSequence([500, 500, 500]) # 3 failures, below floor of 10 + client = _client(handler, breaker=breaker) + for _ in range(3): + with pytest.raises(InternalServerError): + client.get("https://example.test/x") + handler_ok = _StatusSequence([200]) + client_ok = _client(handler_ok, breaker=breaker) + assert client_ok.get("https://example.test/x").status_code == 
HTTPStatus.OK + + +def test_rate_mode_evicts_old_failures() -> None: + clock = _Clock() + breaker = CircuitBreaker(failure_rate_threshold=0.5, window_seconds=10.0, minimum_calls=4, _now=clock) + fail = _client(_StatusSequence([500, 500, 500, 500, 500, 500, 500, 500]), breaker=breaker) + for _ in range(3): + with pytest.raises(InternalServerError): + fail.get("https://example.test/x") + clock.advance(20.0) # push them fully out of the 10s window + with pytest.raises(InternalServerError): + fail.get("https://example.test/x") + ok = _client(_StatusSequence([200]), breaker=breaker) + assert ok.get("https://example.test/x").status_code == HTTPStatus.OK + + +def test_rate_mode_clears_window_on_close() -> None: + """Closing from HALF_OPEN in rate mode clears the window — recovery starts fresh. + + Discriminating: without the clear, the pre-open failures would still be inside the + window after recovery and a single post-close failure would re-cross the rate + threshold immediately. With the clear, the post-close failure is below minimum_calls + again, so the circuit stays CLOSED. + """ + clock = _Clock() + breaker = CircuitBreaker( + failure_rate_threshold=0.5, + window_seconds=100.0, + minimum_calls=2, + reset_timeout=5.0, + success_threshold=1, + _now=clock, + ) + open_client = _client(_StatusSequence([500, 500]), breaker=breaker) + for _ in range(2): + with pytest.raises(InternalServerError): + open_client.get("https://example.test/x") + with pytest.raises(CircuitOpenError): # 2/2 failures >= 0.5 -> OPEN + open_client.get("https://example.test/x") + clock.advance(5.0) + probe_client = _client(_StatusSequence([200]), breaker=breaker) + probe_client.get("https://example.test/x") # probe 200 -> CLOSED, window cleared + # One fresh failure: total=1 < minimum_calls=2, so the circuit stays CLOSED. 
+ fail_client = _client(_StatusSequence([500]), breaker=breaker) + with pytest.raises(InternalServerError): + fail_client.get("https://example.test/x") + ok_client = _client(_StatusSequence([200]), breaker=breaker) + assert ok_client.get("https://example.test/x").status_code == HTTPStatus.OK From f15895935e7118365fc12ad6d33f697e460eb197 Mon Sep 17 00:00:00 2001 From: Artur Shiriev Date: Tue, 16 Jun 2026 13:24:00 +0300 Subject: [PATCH 4/9] test(circuit-breaker): assert rate-mode circuit.opened attributes Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/test_circuit_breaker.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_circuit_breaker.py b/tests/test_circuit_breaker.py index 207d4a4..0ac0796 100644 --- a/tests/test_circuit_breaker.py +++ b/tests/test_circuit_breaker.py @@ -611,3 +611,24 @@ async def test_rate_mode_clears_window_on_close() -> None: await fail_client.get("https://example.test/x") ok_client = _client(_StatusSequence([200]), breaker=breaker) assert (await ok_client.get("https://example.test/x")).status_code == HTTPStatus.OK + + +async def test_rate_mode_open_event_carries_rate_attributes(caplog: pytest.LogCaptureFixture) -> None: + """circuit.opened in rate mode carries rate attributes, not the classic ones.""" + clock = _Clock() + breaker = AsyncCircuitBreaker(failure_rate_threshold=0.5, window_seconds=100.0, minimum_calls=4, _now=clock) + # 2 failures then 2 successes → total 4 (meets minimum_calls), rate 2/4 = 0.5 → opens + client = _client(_StatusSequence([500, 500, 200, 200]), breaker=breaker) + with caplog.at_level(logging.WARNING, logger="httpware.circuit_breaker"): + for _ in range(2): + with pytest.raises(InternalServerError): + await client.get("https://example.test/x") + for _ in range(2): + await client.get("https://example.test/x") + opened = [r for r in caplog.records if r.event == "circuit.opened"] # ty: ignore[unresolved-attribute] + assert opened, "expected a circuit.opened record" + rec = 
opened[-1] + assert rec.failure_rate_threshold == 0.5 # noqa: PLR2004 # ty: ignore[unresolved-attribute] + assert rec.observed_calls >= 4 # noqa: PLR2004 # ty: ignore[unresolved-attribute] + assert hasattr(rec, "failure_rate") + assert not hasattr(rec, "failure_threshold") # classic attribute absent in rate mode From 6138df04c56e72253347e7bfe9d0a0bf98344cda Mon Sep 17 00:00:00 2001 From: Artur Shiriev Date: Tue, 16 Jun 2026 13:28:16 +0300 Subject: [PATCH 5/9] docs(circuit-breaker): document rate mode; 0.13.0 release notes Co-Authored-By: Claude Opus 4.8 (1M context) --- architecture/resilience.md | 2 + docs/resilience.md | 19 ++++++++++ planning/releases/0.13.0.md | 76 +++++++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 planning/releases/0.13.0.md diff --git a/architecture/resilience.md b/architecture/resilience.md index ce145a6..26f3f2c 100644 --- a/architecture/resilience.md +++ b/architecture/resilience.md @@ -14,6 +14,8 @@ `AsyncCircuitBreaker` and sync `CircuitBreaker` are a classic consecutive-failure circuit breaker: the circuit opens after `failure_threshold` consecutive counted failures, fast-fails while OPEN, admits one probe after `reset_timeout` (HALF_OPEN), and closes again after `success_threshold` consecutive probe successes; a probe failure re-opens it. A *counted failure* is a `NetworkError`, an httpware `TimeoutError`, or a `StatusError` whose `status_code` is in the effective failure set (default: all 5xx, 500–599); 4xx including 429 count as successes, and any other exception type propagates unchanged without affecting circuit state. When the breaker refuses a request — OPEN, or HALF_OPEN with the single probe slot already taken — it raises `CircuitOpenError` and never forwards to `next`; the error's `retry_after` carries the seconds until the next probe will be admitted, or `None` when a concurrent probe is already in flight. 
A breaker instance is sharable across clients (one shared circuit); a sync instance cannot be shared with an async one. +The classic consecutive-failure mode is the default and unchanged. An opt-in time-based failure-rate mode is available: set `failure_rate_threshold` (a float in `(0, 1]`) to switch. In rate mode the circuit opens when the observed failure rate over a rolling `window_seconds` window (default `30.0` s) meets or exceeds the threshold, but only once `minimum_calls` outcomes have been observed in that window (default `20`). The `failure_threshold` parameter is ignored in rate mode — the trip condition is purely rate-based. Half-open recovery (`reset_timeout`, `success_threshold`, the single-probe admission) is identical to classic mode. The event names (`circuit.opened`, `circuit.rejected`, `circuit.half_open`, `circuit.closed`) are the same in both modes; in rate mode the `circuit.opened` event carries extra attributes — `failure_rate`, `failure_rate_threshold`, `window_seconds`, `observed_calls` — and its message is `"circuit opened — failure rate threshold reached"`. + `AsyncTimeout` is an async-only middleware that bounds the total wall-clock for the whole inner pipeline (most importantly across an `AsyncRetry` loop, whose attempts and backoff sleeps `httpx2` cannot bound). It is not a per-call timeout — `httpx2`'s connect/read/write/pool timeouts are the right tool for a single outbound call, and `AsyncTimeout` does not duplicate them. It rejects a non-finite or non-positive `timeout` at construction, and on expiry raises httpware `TimeoutError`. There is no sync `Timeout`: a sync total-deadline cannot interrupt a blocking call mid-flight, and `httpx2` already covers sync per-call timeouts. Sync callers configure `httpx2`'s timeouts directly. The recommended (documented, not enforced) composition order is `AsyncTimeout → AsyncCircuitBreaker → AsyncBulkhead → AsyncRetry → terminal`. 
With the breaker outside retry, an open circuit short-circuits the entire retry loop and the breaker counts one outcome per fully-exhausted retry sequence rather than per attempt. diff --git a/docs/resilience.md b/docs/resilience.md index e6603a1..ff00b2a 100644 --- a/docs/resilience.md +++ b/docs/resilience.md @@ -191,6 +191,25 @@ Emitted on logger `httpware.circuit_breaker`: | `circuit.half_open` | Reset timeout elapsed; circuit transitions OPEN → HALF_OPEN | | `circuit.closed` | Success threshold reached; circuit transitions HALF_OPEN → CLOSED | +### Time-based failure-rate mode + +By default the circuit breaker trips on `failure_threshold` *consecutive* counted failures. This can miss partial degradation: a downstream returning errors on roughly half of all requests may never form a consecutive streak long enough to trip — the circuit can stay closed while the error rate sits at 50%. + +For that pattern, switch to rate mode by passing `failure_rate_threshold`: + +```python +from httpware.middleware.resilience import AsyncCircuitBreaker + + +breaker = AsyncCircuitBreaker( + failure_rate_threshold=0.5, # open at ≥50% failures + window_seconds=30.0, # over a rolling 30s window + minimum_calls=20, # but only once 20+ calls are observed +) +``` + +When `failure_rate_threshold` is set, the breaker watches the rolling `window_seconds` window (default `30.0` s) and opens once the failure rate meets or exceeds the threshold — provided at least `minimum_calls` (default `20`) outcomes have been observed in that window. Classic mode is the default; `failure_threshold` is ignored in rate mode. Half-open recovery works identically in both modes. The sync `CircuitBreaker` constructor accepts the same parameters. + ### Sharing Pass the same instance to multiple clients to enforce one shared circuit across them. A `CircuitBreaker` (sync) cannot be shared with an `AsyncCircuitBreaker` — they use different concurrency primitives. 
diff --git a/planning/releases/0.13.0.md b/planning/releases/0.13.0.md new file mode 100644 index 0000000..8bcbbb6 --- /dev/null +++ b/planning/releases/0.13.0.md @@ -0,0 +1,76 @@ +# httpware 0.13.0 — time-based failure-rate trip mode for the circuit breaker + +**Minor release. Additive only — no breaking changes.** + +This release adds an opt-in time-based failure-rate trip mode to both +`AsyncCircuitBreaker` and `CircuitBreaker`. Classic consecutive-failure behavior +is the default and is unchanged. + +## New behavior + +The classic circuit breaker trips after `failure_threshold` consecutive counted +failures — a simple and effective policy for hard outages. It can miss partial +degradation, though: a downstream returning errors on half of all requests may +never form a long enough consecutive streak to trip the circuit. + +Rate mode addresses this. Pass `failure_rate_threshold` to switch: + +```python +from httpware import AsyncClient +from httpware.middleware.resilience import AsyncCircuitBreaker + + +breaker = AsyncCircuitBreaker( + failure_rate_threshold=0.5, # open at ≥50% failures + window_seconds=30.0, # over a rolling 30s window + minimum_calls=20, # but only once 20+ calls are observed +) + +async with AsyncClient( + base_url="https://api.example.com", + middleware=[breaker], +) as client: + response = await client.get("/users/1") +``` + +The circuit opens when the observed failure rate over the rolling `window_seconds` +window meets or exceeds `failure_rate_threshold` — but only once `minimum_calls` +outcomes have been recorded in that window. The `minimum_calls` guard prevents a +single early failure from immediately tripping the circuit before a meaningful +sample has accumulated. + +## New constructor parameters + +Both `AsyncCircuitBreaker` and `CircuitBreaker` accept three new keyword +arguments: + +| Parameter | Default | Effect | +|---|---|---| +| `failure_rate_threshold` | `None` | Float in `(0, 1]`. When set, switches the breaker to rate mode. 
`None` keeps classic consecutive-failure mode. `≤0` or `>1` raises `ValueError`. | +| `window_seconds` | `30.0` | Rolling window width for rate mode. Ignored in classic mode. `≤0` raises `ValueError`. | +| `minimum_calls` | `20` | Minimum outcomes in the window before the rate is evaluated. Ignored in classic mode. `<1` raises `ValueError`. | + +In rate mode, `failure_threshold` is ignored — the trip condition is purely +rate-based. All other parameters (`reset_timeout`, `success_threshold`, +`failure_status_codes`) apply in both modes. + +## Observability + +Event names are identical in both modes: `circuit.opened`, `circuit.rejected`, +`circuit.half_open`, `circuit.closed`. In rate mode the `circuit.opened` event +carries rate attributes instead of the classic `failure_threshold` / `failures` +pair — `failure_rate`, `failure_rate_threshold`, `window_seconds`, `observed_calls` — +and its message is `"circuit opened — failure rate threshold reached"`. + +## What is NOT in this release + +The following remain deferred and are not part of 0.13.0: + +- Count-based sliding windows (the current implementation is time-based only) +- Slow-call detection (a separate trip axis based on latency percentiles) +- Manual circuit control (`force_open`, `force_closed`) +- State introspection properties on the breaker instance + +## Shipped via + +PR #XX — time-based failure-rate trip mode for the circuit breaker. From bab3655ad382111fe604a04e52cf9cd769661cbc Mon Sep 17 00:00:00 2001 From: Artur Shiriev Date: Tue, 16 Jun 2026 13:32:20 +0300 Subject: [PATCH 6/9] docs(planning): add the circuit-breaker-rate-mode change bundle Design + plan for the opt-in time-based failure-rate trip mode, and the Active Index entry. Bundle stays active/draft until merge. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- planning/README.md | 2 +- .../design.md | 180 +++++ .../plan.md | 696 ++++++++++++++++++ 3 files changed, 877 insertions(+), 1 deletion(-) create mode 100644 planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/design.md create mode 100644 planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/plan.md diff --git a/planning/README.md b/planning/README.md index 16bda74..faaaf0e 100644 --- a/planning/README.md +++ b/planning/README.md @@ -70,7 +70,7 @@ carry **no** frontmatter — living prose, dated by git. ### Active -_None._ +- **[circuit-breaker-rate-mode](changes/active/2026-06-16.02-circuit-breaker-rate-mode/design.md)** (2026-06-16) — Add an opt-in time-based failure-rate trip mode to the circuit breaker (classic stays default). Closes deferred item "CircuitBreaker v2 (a)". Targets 0.13.0. ### Archived (shipped) diff --git a/planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/design.md b/planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/design.md new file mode 100644 index 0000000..b8f223c --- /dev/null +++ b/planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/design.md @@ -0,0 +1,180 @@ +--- +status: draft +date: 2026-06-16 +slug: circuit-breaker-rate-mode +supersedes: null +superseded_by: null +pr: null +outcome: null +--- + +# Design: CircuitBreaker v2 — time-based failure-rate trip mode + +## Summary + +Add an additive, opt-in **time-based failure-rate** trip mode to +`AsyncCircuitBreaker` / `CircuitBreaker`. The classic consecutive-failure model +stays the default and is byte-unchanged; nothing trips differently unless the +caller sets `failure_rate_threshold`. Rate mode opens the circuit when the +failure rate over a rolling time window meets the threshold, once a minimum call +volume is observed. Ships as 0.13.0. + +## Motivation + +The 0.10.0 breaker ships only the classic model: open after N *consecutive* +counted failures. 
That cannot catch *partial* degradation — a steady 50% error +rate that alternates success/fail never reaches a consecutive streak, so the +breaker never trips while half the traffic is failing. This was deferred to v2 +in the 0.10.0 spec, with the config deliberately shaped so a rate mode is purely +additive (see [`deferred.md`](../../deferred.md) → "CircuitBreaker v2"). + +The verified comparison in `deferred.md` (2026-06-13) shows rate-over-window is +the mainstream model for service-level breakers: Hystrix (time-bucketed), +Polly v8 (time-based only), and Envoy/Istio outlier detection (time intervals) +are all time-based; Resilience4j defaults to count-based but offers both. We +choose **time-based** because the mental model matches the HTTP domain ("trip if +>50% of calls failed in the last 30s"), it degrades sanely under variable +traffic (a count-based window can hold hour-old outcomes when traffic is low), +and it is consistent with the existing wall-clock `reset_timeout`. + +## Non-goals + +- **Count-based windows.** Deferred; the config leaves room to add a window-type + selector later if anyone asks. +- **Slow-call rate axis.** Resilience4j-only; redundant with `AsyncTimeout`. +- **Manual control / read-only `state` introspection** (deferred item b). Stays + parked as YAGNI; independent design axis. +- **Rate-based half-open recovery.** Half-open stays identical to v1 in both + modes (consecutive `success_threshold` probe successes) — simpler, and the + trip mode is the only behavioral change. + +## Design + +### 1. 
Opt-in config shape + +`failure_rate_threshold` is the mode switch on both wrappers' `__init__`: + +```python +AsyncCircuitBreaker( + failure_rate_threshold=0.5, # None (default) = classic; set = rate mode + window_seconds=30.0, # rolling window duration (default 30.0) + minimum_calls=20, # floor before the rate is evaluated (default 20) + # unchanged, shared by both modes: + reset_timeout=30.0, + success_threshold=1, + failure_status_codes=None, +) +``` + +- **Shared across modes:** `reset_timeout`, `success_threshold` (half-open + recovery), `failure_status_codes` (the counted-failure set — 429/4xx remain + successes). +- **Classic-only:** `failure_threshold`. In rate mode it is **silently ignored** + (documented). The two thresholds don't conflict — the mode is selected solely + by whether `failure_rate_threshold` is `None` — so no raise-on-both guard is + added. +- **Validation** (in `_CircuitBreakerState.__init__`, alongside the existing + checks): when `failure_rate_threshold is not None`, require + `0.0 < failure_rate_threshold <= 1.0`; require `window_seconds > 0`; require + `minimum_calls >= 1`. New message constants follow the existing + `_FAILURE_THRESHOLD_INVALID` pattern. + +### 2. Time-based rolling-bucket window + +A new internal `_RollingWindow` (or inline state on `_CircuitBreakerState`): +`window_seconds` divided into a fixed **10 buckets** (`_BUCKET_COUNT`), each a +`[successes, failures]` pair tagged with the time-slot it represents. Bucket +width = `window_seconds / 10`. + +Recording an outcome (synchronous, no await): +1. `slot = floor(self._now() / bucket_width)`. +2. `index = slot % _BUCKET_COUNT`. If the bucket at `index` carries a stale slot + tag (`!= slot`), reset it to `[0, 0]` and retag — this evicts data older than + one full window in O(`_BUCKET_COUNT`), independent of call volume. +3. Increment the bucket's success or failure count. 
+ +Rate computation sums `(successes, failures)` across buckets whose slot tag is +within the last `_BUCKET_COUNT` slots (live), giving `total` and `failures`; +`rate = failures / total` when `total > 0`. Eviction-on-read drops buckets that +fell out of the window since the last write. + +All bucket reads/writes happen inside the same synchronous critical section the +breaker already uses (async: lock-free under one event loop; sync: +`threading.Lock`), and `_now()` is read inside that section. + +### 3. State-machine integration — mode changes only the CLOSED trip test + +The trip mode affects exactly one decision: when to open from CLOSED. Everything +else is shared. + +- **CLOSED, rate mode:** `on_success` and `on_failure` record the outcome into + the window (a counted failure increments failures; a success increments + successes). After recording, if `total >= minimum_calls` **and** + `rate >= failure_rate_threshold`, open the circuit. The classic consecutive + counters are not used in rate mode. +- **CLOSED, classic mode:** unchanged — consecutive-failure counter, open at + `failure_threshold`. +- **OPEN → HALF_OPEN → CLOSED:** identical for both modes — lazy probe after + `reset_timeout`, `success_threshold` consecutive probe successes close it, one + probe failure re-opens. On transition to CLOSED, the window is cleared (all + buckets reset) so recovery starts from a clean slate. +- **`release_probe` and non-counted exceptions** never touch the window — + consistent with today (programming errors can't trip the breaker). + +This logic lives entirely in the shared `_CircuitBreakerState`, so +`AsyncCircuitBreaker` and `CircuitBreaker` reach parity with no per-wrapper code +(the wrappers' `__init__` just forward the three new params). + +### 4. Observability + +Event names are unchanged (`circuit.opened`, `circuit.rejected`, +`circuit.half_open`, `circuit.closed`) — the stable observability surface is +preserved. 
In rate mode, `circuit.opened` carries rate attributes instead of the +classic ones: `failure_rate`, `failure_rate_threshold`, `window_seconds`, +`observed_calls` (the `total`). Classic mode keeps emitting `failure_threshold` + +`failures`. `circuit.rejected`/`half_open`/`closed` are unchanged. + +## Testing + +Deterministic tests with a pinned `_now` callable (the existing constructor +already accepts `_now`), sync + async mirrors: + +- **Trips at threshold:** with `minimum_calls` met and `rate >= + failure_rate_threshold`, the circuit opens; an alternating 50% pattern that + never trips the classic breaker DOES trip rate mode. +- **Volume floor:** below `minimum_calls`, a 100%-failure burst does NOT open. +- **Time eviction:** failures recorded, then `_now` advanced past + `window_seconds`, then fresh successes — old failures age out and the rate + reflects only the live window. +- **Classic unchanged:** existing breaker tests stay green (no behavior drift + when `failure_rate_threshold is None`). +- **Half-open in rate mode:** open → probe after `reset_timeout` → + `success_threshold` successes close → window cleared (a subsequent single + failure doesn't immediately re-trip). +- **Validation:** out-of-range `failure_rate_threshold`, non-positive + `window_seconds`, `minimum_calls < 1` raise `ValueError`. +- **Hypothesis prop** (`test_circuit_breaker_props.py` companion) for the + rolling-window recorder: arbitrary interleavings of outcomes and time advances + never miscount the live-window totals or evict live data. + +`just test` green; `just lint` clean. + +## Risk + +- **Window-eviction correctness (medium × high).** Off-by-one in slot tagging or + the modulo ring could count stale data or drop live data. Mitigated by the + Hypothesis prop on the recorder and explicit time-advance tests; the standard + slot-tag-and-retag pattern is well understood. 
+- **Concurrency (low × high).** Recording stays a synchronous mutation, so the + async lock-free atomicity invariant and the sync `threading.Lock` both still + hold — no new await points. Eviction reads `_now()` inside the critical + section. This matches the `deferred.md` concurrency note. +- **Config confusion (low × low).** `failure_threshold` being ignored in rate + mode could surprise; mitigated by docstring + `architecture/resilience.md` + wording. + +## Out of scope + +Count-based windows; slow-call axis; manual control + `state`; rate-based +half-open; any change to classic-mode behavior, `AsyncTimeout`, or the +composition-order recommendation. diff --git a/planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/plan.md b/planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/plan.md new file mode 100644 index 0000000..e39eab6 --- /dev/null +++ b/planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/plan.md @@ -0,0 +1,696 @@ +--- +status: draft +date: 2026-06-16 +slug: circuit-breaker-rate-mode +spec: circuit-breaker-rate-mode +pr: null +--- + +# circuit-breaker-rate-mode — implementation plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use +> superpowers:subagent-driven-development (recommended) or +> superpowers:executing-plans to implement this plan task-by-task. Steps +> use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add an opt-in time-based failure-rate trip mode to +`AsyncCircuitBreaker` / `CircuitBreaker`; classic consecutive-failure stays the +default and unchanged. + +**Architecture:** A new `_RollingWindow` (time-bucketed success/failure counters) +lives beside the existing lock-free `_CircuitBreakerState`. Three new +constructor params (`failure_rate_threshold`, `window_seconds`, `minimum_calls`) +select and configure rate mode. The mode changes only the CLOSED→OPEN decision; +half-open recovery, event names, and concurrency model are unchanged. 
All logic +sits in `_CircuitBreakerState`, so both wrappers reach parity for free. + +**Tech Stack:** Python 3.11+, `httpx2`, `pytest` (asyncio auto mode), Hypothesis +for the window-recorder property test, `time.monotonic` (injected as `_now` in +tests). + +**Spec:** [`design.md`](./design.md) + +**Branch:** `feat/circuit-breaker-rate-mode` + +**Commit strategy:** Per-task commits. + +--- + +### Task 1: `_RollingWindow` time-bucketed recorder + +A standalone, fully-tested data structure before any breaker wiring. This is the +riskiest piece (slot math / eviction), so it gets unit tests + a Hypothesis prop +in isolation. + +**Files:** +- Modify: `src/httpware/middleware/resilience/circuit_breaker.py` +- Test: `tests/test_rolling_window.py` (create) + +- [ ] **Step 1: Write the failing unit tests** + + Create `tests/test_rolling_window.py`: + + ```python + """Unit tests for the time-bucketed _RollingWindow used by rate-mode CircuitBreaker.""" + + from httpware.middleware.resilience.circuit_breaker import _RollingWindow + + + def test_counts_within_window() -> None: + w = _RollingWindow(window_seconds=10.0) + w.record(0.0, failed=True) + w.record(1.0, failed=True) + w.record(2.0, failed=False) + total, failures = w.totals(2.0) + assert (total, failures) == (3, 2) + + + def test_empty_window_is_zero() -> None: + w = _RollingWindow(window_seconds=10.0) + assert w.totals(0.0) == (0, 0) + + + def test_stale_buckets_evicted_by_time() -> None: + w = _RollingWindow(window_seconds=10.0) + w.record(0.0, failed=True) + w.record(0.5, failed=True) + # advance a full window past those records + w.record(11.0, failed=False) + total, failures = w.totals(11.0) + assert (total, failures) == (1, 0) + + + def test_totals_excludes_stale_without_new_write() -> None: + w = _RollingWindow(window_seconds=10.0) + w.record(0.0, failed=True) + # no write after the window elapses — totals() alone must drop the stale bucket + assert w.totals(20.0) == (0, 0) + + + def 
test_clear_resets_everything() -> None: + w = _RollingWindow(window_seconds=10.0) + w.record(0.0, failed=True) + w.record(1.0, failed=False) + w.clear() + assert w.totals(1.0) == (0, 0) + ``` + +- [ ] **Step 2: Run to verify failure** + + Run: `just test tests/test_rolling_window.py` + Expected: FAIL — `ImportError: cannot import name '_RollingWindow'`. + +- [ ] **Step 3: Implement `_RollingWindow` + the bucket constant** + + In `src/httpware/middleware/resilience/circuit_breaker.py`, add the constant + near the other module constants (after `_DEFAULT_FAILURE_STATUS_CODES`): + + ```python + _BUCKET_COUNT = 10 + ``` + + Add the class above `class _CircuitBreakerState:`: + + ```python + class _RollingWindow: + """Time-bucketed success/failure counters over a rolling window. + + `window_seconds` is split into `_BUCKET_COUNT` buckets. Each bucket holds + [successes, failures] tagged with the integer time-slot it represents; a + bucket whose slot is stale is reset on write, and `totals` filters to the + live slot range so data older than the window never counts. Every method is + synchronous and reads `now` from its caller (so the breaker's critical + section owns the clock read). 
+ """ + + def __init__(self, window_seconds: float) -> None: + self._bucket_width = window_seconds / _BUCKET_COUNT + self._slot = [-1] * _BUCKET_COUNT + self._success = [0] * _BUCKET_COUNT + self._failure = [0] * _BUCKET_COUNT + + def _current_slot(self, now: float) -> int: + return int(now // self._bucket_width) + + def record(self, now: float, *, failed: bool) -> None: + slot = self._current_slot(now) + index = slot % _BUCKET_COUNT + if self._slot[index] != slot: # bucket reused for a new slot — evict + self._slot[index] = slot + self._success[index] = 0 + self._failure[index] = 0 + if failed: + self._failure[index] += 1 + else: + self._success[index] += 1 + + def totals(self, now: float) -> tuple[int, int]: + """Return (total, failures) across buckets still inside the window at `now`.""" + slot = self._current_slot(now) + oldest = slot - _BUCKET_COUNT + 1 + total = 0 + failures = 0 + for i in range(_BUCKET_COUNT): + if oldest <= self._slot[i] <= slot: + total += self._success[i] + self._failure[i] + failures += self._failure[i] + return total, failures + + def clear(self) -> None: + self._slot = [-1] * _BUCKET_COUNT + self._success = [0] * _BUCKET_COUNT + self._failure = [0] * _BUCKET_COUNT + ``` + +- [ ] **Step 4: Run to verify pass** + + Run: `just test tests/test_rolling_window.py` + Expected: PASS (5 tests). 
+ +- [ ] **Step 5: Add a Hypothesis property test** + + Append to `tests/test_rolling_window.py`: + + ```python + from hypothesis import given, settings + from hypothesis import strategies as st + + + @given( + events=st.lists( + st.tuples(st.floats(min_value=0.0, max_value=1000.0), st.booleans()), + min_size=1, + max_size=200, + ), + ) + @settings(max_examples=100, deadline=None) + def test_totals_match_live_events(events: list[tuple[float, bool]]) -> None: + """totals() at the final time counts exactly the events inside the live window — nothing stale, nothing dropped.""" + window_seconds = 10.0 + w = _RollingWindow(window_seconds=window_seconds) + ordered = sorted(events, key=lambda e: e[0]) + for now, failed in ordered: + w.record(now, failed=failed) + final = ordered[-1][0] + bucket_width = window_seconds / 10 + live_cutoff_slot = int(final // bucket_width) - 10 + 1 + expected_live = [(t, f) for (t, f) in ordered if int(t // bucket_width) >= live_cutoff_slot] + total, failures = w.totals(final) + assert total == len(expected_live) + assert failures == sum(1 for _, f in expected_live if f) + assert 0 <= failures <= total + ``` + +- [ ] **Step 6: Run the props + full suite + lint** + + Run: `just test tests/test_rolling_window.py && just lint` + Expected: PASS; lint clean. + +- [ ] **Step 7: Commit** + + ```bash + git add src/httpware/middleware/resilience/circuit_breaker.py tests/test_rolling_window.py + git commit -m "feat(circuit-breaker): add time-bucketed _RollingWindow recorder + + Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 2: Rate-mode config params + validation + +Thread the three new params through `_CircuitBreakerState` and both wrappers, +with validation. No trip-behavior change yet — rate mode is configured but the +CLOSED decision still uses the classic counter (wired in Task 3). 
+ +**Files:** +- Modify: `src/httpware/middleware/resilience/circuit_breaker.py` +- Test: `tests/test_circuit_breaker.py` (add validation tests) + +- [ ] **Step 1: Write failing validation tests** + + Append to `tests/test_circuit_breaker.py`: + + ```python + import re + + import pytest as _pytest + + from httpware.middleware.resilience.circuit_breaker import ( + _FAILURE_RATE_THRESHOLD_INVALID, + _MINIMUM_CALLS_INVALID, + _WINDOW_SECONDS_INVALID, + ) + + + @_pytest.mark.parametrize("bad", [0.0, -0.1, 1.5]) + def test_rate_threshold_out_of_range_raises(bad: float) -> None: + # `match` is a regex; the message contains "(0, 1]", so it must be escaped + with _pytest.raises(ValueError, match=re.escape(_FAILURE_RATE_THRESHOLD_INVALID)): + AsyncCircuitBreaker(failure_rate_threshold=bad) + + + def test_non_positive_window_seconds_raises() -> None: + with _pytest.raises(ValueError, match=re.escape(_WINDOW_SECONDS_INVALID)): + AsyncCircuitBreaker(failure_rate_threshold=0.5, window_seconds=0.0) + + + def test_minimum_calls_below_one_raises() -> None: + with _pytest.raises(ValueError, match=re.escape(_MINIMUM_CALLS_INVALID)): + AsyncCircuitBreaker(failure_rate_threshold=0.5, minimum_calls=0) + + + def test_classic_mode_is_default_when_rate_threshold_none() -> None: + breaker = AsyncCircuitBreaker() # no failure_rate_threshold + assert breaker._state._rate_mode is False + ``` + +- [ ] **Step 2: Run to verify failure** + + Run: `just test tests/test_circuit_breaker.py -k "rate_threshold or window_seconds or minimum_calls or classic_mode_is_default"` + Expected: FAIL — import error for the new message constants / unexpected kwargs. 
+ +- [ ] **Step 3: Add message constants** + + In `circuit_breaker.py`, after the existing `_SUCCESS_THRESHOLD_INVALID`: + + ```python + _FAILURE_RATE_THRESHOLD_INVALID = "failure_rate_threshold must be in (0, 1]" + _WINDOW_SECONDS_INVALID = "window_seconds must be > 0" + _MINIMUM_CALLS_INVALID = "minimum_calls must be >= 1" + ``` + +- [ ] **Step 4: Extend `_CircuitBreakerState.__init__`** + + Add the three params (after `failure_status_codes`, before `now`) and validate + + store them. Set `_rate_mode` and build the window only in rate mode: + + ```python + def __init__( + self, + *, + failure_threshold: int, + reset_timeout: float, + success_threshold: int, + failure_status_codes: Collection[int] | None, + failure_rate_threshold: float | None, + window_seconds: float, + minimum_calls: int, + now: Callable[[], float], + ) -> None: + if failure_threshold < 1: + raise ValueError(_FAILURE_THRESHOLD_INVALID) + if reset_timeout < 0: + raise ValueError(_RESET_TIMEOUT_INVALID) + if success_threshold < 1: + raise ValueError(_SUCCESS_THRESHOLD_INVALID) + if failure_rate_threshold is not None and not (0.0 < failure_rate_threshold <= 1.0): + raise ValueError(_FAILURE_RATE_THRESHOLD_INVALID) + if window_seconds <= 0: + raise ValueError(_WINDOW_SECONDS_INVALID) + if minimum_calls < 1: + raise ValueError(_MINIMUM_CALLS_INVALID) + self._failure_threshold = failure_threshold + self._reset_timeout = reset_timeout + self._success_threshold = success_threshold + self._failure_status_codes = ( + frozenset(failure_status_codes) if failure_status_codes is not None else _DEFAULT_FAILURE_STATUS_CODES + ) + self._failure_rate_threshold = failure_rate_threshold + self._minimum_calls = minimum_calls + self._rate_mode = failure_rate_threshold is not None + self._window = _RollingWindow(window_seconds) if self._rate_mode else None + self._window_seconds = window_seconds + self._now = now + self._state = _CircuitState.CLOSED + self._consecutive_failures = 0 + self._consecutive_successes = 0 + 
self._opened_at = 0.0 + self._probe_in_flight = False + ``` + +- [ ] **Step 5: Thread the params through both wrappers** + + In BOTH `AsyncCircuitBreaker.__init__` and `CircuitBreaker.__init__`, add the + three params to the signature (after `failure_status_codes`, before `_now`) + and forward them to `_CircuitBreakerState(...)`: + + ```python + failure_rate_threshold: float | None = None, + window_seconds: float = 30.0, + minimum_calls: int = 20, + ``` + + and in the `_CircuitBreakerState(...)` call add: + + ```python + failure_rate_threshold=failure_rate_threshold, + window_seconds=window_seconds, + minimum_calls=minimum_calls, + ``` + +- [ ] **Step 6: Run validation tests + full suite** + + Run: `just test tests/test_circuit_breaker.py` + Expected: PASS (new validation tests + all existing breaker tests unchanged). + Then `just test` (full suite) — expect green. + +- [ ] **Step 7: Commit** + + ```bash + git add src/httpware/middleware/resilience/circuit_breaker.py tests/test_circuit_breaker.py + git commit -m "feat(circuit-breaker): thread rate-mode config + validation + + Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 3: Rate-mode trip integration (CLOSED decision + window clear on close) + +Wire rate mode into the state machine: record outcomes into the window while +CLOSED and open on rate; clear the window when the circuit closes. Async + sync +behavior tests. + +**Files:** +- Modify: `src/httpware/middleware/resilience/circuit_breaker.py` +- Test: `tests/test_circuit_breaker.py`, `tests/test_circuit_breaker_sync.py` + +- [ ] **Step 1: Write failing behavior tests (async)** + + Append to `tests/test_circuit_breaker.py`. 
These reuse the file's existing + `_Clock`, `_StatusSequence`, `_client`, and error imports: + + ```python + async def test_rate_mode_trips_on_partial_failure() -> None: + """Alternating 50% failures trip rate mode (classic never would).""" + clock = _Clock() + breaker = AsyncCircuitBreaker( + failure_rate_threshold=0.5, window_seconds=100.0, minimum_calls=10, _now=clock + ) + # alternate 500 / 200 for 10 calls → 5 failures / 10 = 0.5 + handler = _StatusSequence([500, 200, 500, 200, 500, 200, 500, 200, 500, 200]) + client = _client(handler, breaker=breaker) + for _ in range(10): + try: + await client.get("https://example.test/x") + except InternalServerError: + pass + # next call is rejected — circuit opened on the rate + with pytest.raises(CircuitOpenError): + await client.get("https://example.test/x") + + + async def test_rate_mode_does_not_trip_below_minimum_calls() -> None: + clock = _Clock() + breaker = AsyncCircuitBreaker( + failure_rate_threshold=0.5, window_seconds=100.0, minimum_calls=10, _now=clock + ) + handler = _StatusSequence([500, 500, 500]) # 3 failures, below floor of 10 + client = _client(handler, breaker=breaker) + for _ in range(3): + with pytest.raises(InternalServerError): + await client.get("https://example.test/x") + # still closed — under the volume floor + handler_ok = _StatusSequence([200]) + client_ok = _client(handler_ok, breaker=breaker) + assert (await client_ok.get("https://example.test/x")).status_code == HTTPStatus.OK + + + async def test_rate_mode_evicts_old_failures() -> None: + clock = _Clock() + breaker = AsyncCircuitBreaker( + failure_rate_threshold=0.5, window_seconds=10.0, minimum_calls=4, _now=clock + ) + fail = _client(_StatusSequence([500, 500, 500, 500, 500, 500, 500, 500]), breaker=breaker) + # 3 failures early in the window + for _ in range(3): + with pytest.raises(InternalServerError): + await fail.get("https://example.test/x") + clock.advance(20.0) # push them fully out of the 10s window + # one fresh failure: live 
window now has 1 failure / 1 total, but total < minimum_calls + with pytest.raises(InternalServerError): + await fail.get("https://example.test/x") + ok = _client(_StatusSequence([200]), breaker=breaker) + assert (await ok.get("https://example.test/x")).status_code == HTTPStatus.OK + ``` + +- [ ] **Step 2: Run to verify failure** + + Run: `just test tests/test_circuit_breaker.py -k "rate_mode_trips or rate_mode_does_not_trip or rate_mode_evicts"` + Expected: FAIL — the breaker does not yet open on rate (no CircuitOpenError raised). + +- [ ] **Step 3: Add the rate-record helper + open-on-rate transition** + + In `_CircuitBreakerState`, refactor `_open` to share an `_enter_open` core and + add `_record_outcome` + `_open_rate`. Replace the existing `_open` method with: + + ```python + def _enter_open(self, request: httpx2.Request, attributes: dict[str, typing.Any]) -> None: + self._state = _CircuitState.OPEN + self._opened_at = self._now() + self._consecutive_failures = 0 + self._consecutive_successes = 0 + self._emit(request, "circuit.opened", logging.WARNING, "circuit opened — failure threshold reached", attributes) + + def _open(self, request: httpx2.Request, *, failures: int) -> None: + self._enter_open(request, {"failure_threshold": self._failure_threshold, "failures": failures}) + + def _open_rate(self, request: httpx2.Request, *, total: int, failures: int) -> None: + self._enter_open( + request, + { + "failure_rate": failures / total, + "failure_rate_threshold": self._failure_rate_threshold, + "window_seconds": self._window_seconds, + "observed_calls": total, + }, + ) + + def _record_outcome(self, request: httpx2.Request, *, failed: bool) -> None: + now = self._now() + self._window.record(now, failed=failed) # _window is non-None in rate mode + total, failures = self._window.totals(now) + if total >= self._minimum_calls and failures / total >= self._failure_rate_threshold: + self._open_rate(request, total=total, failures=failures) + ``` + + NOTE: `self._window` 
is `_RollingWindow | None`; it is only accessed inside + `_record_outcome`, which only runs in rate mode (guarded by `_rate_mode` at the + call sites in Step 4). Add `# ty: ignore[possibly-unbound-attribute]` on the + `self._window.record(...)` / `self._window.totals(...)` lines ONLY if `ty` + flags the `| None`; otherwise leave them. (Run `just lint` to find out.) + +- [ ] **Step 4: Route CLOSED outcomes through the window in rate mode** + + Update `on_success` and `on_failure` so the CLOSED branch chooses by mode, and + clear the window when the circuit closes: + + ```python + def on_success(self, role: str, request: httpx2.Request) -> None: + if role == _ROLE_PROBE: + self._probe_in_flight = False + if self._state is _CircuitState.CLOSED: + if self._rate_mode: + self._record_outcome(request, failed=False) + else: + self._consecutive_failures = 0 + elif self._state is _CircuitState.HALF_OPEN: + self._consecutive_successes += 1 + if self._consecutive_successes >= self._success_threshold: + self._state = _CircuitState.CLOSED + self._consecutive_failures = 0 + self._consecutive_successes = 0 + if self._rate_mode: + self._window.clear() # fresh slate on recovery + self._emit(request, "circuit.closed", logging.INFO, "circuit closed — service recovered", {}) + + def on_failure(self, role: str, request: httpx2.Request) -> None: + if role == _ROLE_PROBE: + self._probe_in_flight = False + if self._state is _CircuitState.CLOSED: + if self._rate_mode: + self._record_outcome(request, failed=True) + else: + self._consecutive_failures += 1 + if self._consecutive_failures >= self._failure_threshold: + self._open(request, failures=self._consecutive_failures) + elif self._state is _CircuitState.HALF_OPEN: + self._open(request, failures=1) # 1 = the single probe failure that re-opened the circuit + ``` + +- [ ] **Step 5: Run async behavior tests** + + Run: `just test tests/test_circuit_breaker.py` + Expected: PASS (new rate tests + all classic tests unchanged). 
+ +- [ ] **Step 6: Add + run sync mirror tests** + + Read `tests/test_circuit_breaker_sync.py` to match its `_Clock`/client helpers, + then append sync mirrors of the three Step-1 tests (no `async`/`await`, + `CircuitBreaker` + sync `Client`). Run: + `just test tests/test_circuit_breaker_sync.py` + Expected: PASS. (Rate logic lives in the shared state, so the sync wrapper + needs no extra code — only the param threading from Task 2 Step 5.) + +- [ ] **Step 7: Full suite + lint** + + Run: `just test && just lint` + Expected: all green, lint clean. + +- [ ] **Step 8: Commit** + + ```bash + git add src/httpware/middleware/resilience/circuit_breaker.py tests/test_circuit_breaker.py tests/test_circuit_breaker_sync.py + git commit -m "feat(circuit-breaker): rate-over-window trip mode + + Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 4: Rate-mode observability assertion + +Lock in the `circuit.opened` rate attributes with a test (the implementation +already emits them via `_open_rate` from Task 3 — this task proves it). + +`_emit_event` (see `tests/test_observability.py`) exposes the event name on the +log record as `record.event` and each attribute as a direct record attribute +(e.g. `record.failure_rate_threshold`), so we assert via `caplog.records`. `ty` +flags these dynamic attributes — suppress with `# ty: ignore[unresolved-attribute]` +exactly as `test_observability.py` does. 
+ +**Files:** +- Test: `tests/test_circuit_breaker.py` (add — it already has `_Clock`, + `_StatusSequence`, `_client`, `logging`, and the error imports) + +- [ ] **Step 1: Write the failing test** + + Append to `tests/test_circuit_breaker.py`: + + ```python + async def test_rate_mode_open_event_carries_rate_attributes(caplog: pytest.LogCaptureFixture) -> None: + """circuit.opened in rate mode carries rate attributes, not the classic ones.""" + clock = _Clock() + breaker = AsyncCircuitBreaker( + failure_rate_threshold=0.5, window_seconds=100.0, minimum_calls=4, _now=clock + ) + # 2 failures then 2 successes → total 4 (meets minimum_calls), rate 2/4 = 0.5 → opens + client = _client(_StatusSequence([500, 500, 200, 200]), breaker=breaker) + with caplog.at_level(logging.WARNING, logger="httpware.circuit_breaker"): + for _ in range(2): + with pytest.raises(InternalServerError): + await client.get("https://example.test/x") + for _ in range(2): + await client.get("https://example.test/x") + opened = [r for r in caplog.records if r.event == "circuit.opened"] # ty: ignore[unresolved-attribute] + assert opened, "expected a circuit.opened record" + rec = opened[-1] + assert rec.failure_rate_threshold == 0.5 # ty: ignore[unresolved-attribute] + assert rec.observed_calls >= 4 # ty: ignore[unresolved-attribute] + assert hasattr(rec, "failure_rate") + assert not hasattr(rec, "failure_threshold") # classic attribute absent in rate mode + ``` + + NOTE: `_StatusSequence` returns 200 once its list is exhausted, so a single + shared `client` serves all four calls; the breaker instance carries the state. + +- [ ] **Step 2: Run to verify it passes (impl already emits these)** + + Run: `just test tests/test_circuit_breaker.py::test_rate_mode_open_event_carries_rate_attributes` + Expected: PASS — `_open_rate` (Task 3) already emits these attributes. 
If it + FAILS on attribute access, fix the test to match the real record surface + (compare against `tests/test_observability.py`), not the implementation. + +- [ ] **Step 3: Full suite + lint** + + Run: `just test && just lint` + Expected: green, clean. + +- [ ] **Step 4: Commit** + + ```bash + git add tests/test_circuit_breaker.py + git commit -m "test(circuit-breaker): assert rate-mode circuit.opened attributes + + Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +### Task 5: Docs + release notes (0.13.0) + +Document rate mode and cut the release notes. Version is **tag-driven** — do NOT +edit `pyproject.toml` (the static `version` field stays at the placeholder `"0"`; +`publish.yml` runs `uv version` from the `0.13.0` tag at release). + +**Files:** +- Modify: `architecture/resilience.md`, `docs/resilience.md` +- Create: `planning/releases/0.13.0.md` + +- [ ] **Step 1: Update architecture/resilience.md** + + Read the `## CircuitBreaker + AsyncTimeout` section and add a paragraph: the + opt-in time-based rate mode — set `failure_rate_threshold` (a float in `(0, 1]`) to switch from + classic consecutive-failure to "open when the failure rate over a rolling + `window_seconds` (default 30s) meets or exceeds the threshold, once `minimum_calls` + (default 20) outcomes are observed". Note: classic stays the default; + `failure_threshold` is ignored in rate mode; half-open recovery and event names + are identical; `circuit.opened` carries rate attributes in rate mode. No + frontmatter (living prose). 
+ +- [ ] **Step 2: Update docs/resilience.md** + + Read the circuit-breaker section of `docs/resilience.md` and add a short + user-facing subsection with a code example: + + ```python + from httpware import AsyncClient + from httpware.middleware.resilience.circuit_breaker import AsyncCircuitBreaker + + breaker = AsyncCircuitBreaker( + failure_rate_threshold=0.5, # open at ≥50% failures + window_seconds=30.0, # over a rolling 30s window + minimum_calls=20, # but only once 20+ calls are observed + ) + ``` + + Explain when to prefer rate mode (partial/intermittent degradation a + consecutive-failure breaker misses) and that classic is the default. Match the + page's existing voice and fence style. + +- [ ] **Step 3: Write the release notes** + + Read `planning/releases/0.12.0.md` for voice/structure. Create + `planning/releases/0.13.0.md`: minor, additive-only; the opt-in time-based + failure-rate trip mode on `AsyncCircuitBreaker` / `CircuitBreaker` + (`failure_rate_threshold` + `window_seconds` + `minimum_calls`); classic stays + default and unchanged; same event names with rate attributes on + `circuit.opened`; no head/options-style scope creep; explicitly note count-based + windows / slow-call axis / manual control remain deferred. Include a usage code + block. Leave a `## Shipped via` line referencing the PR (number filled at PR + time). + +- [ ] **Step 4: Verify docs build + full gate** + + Run: `uvx --with-requirements docs/requirements.txt mkdocs build --strict` + Expected: clean; then `rm -rf site`. + Run: `just test && just lint` — green, clean. 
+ +- [ ] **Step 5: Commit** + + ```bash + git add architecture/resilience.md docs/resilience.md planning/releases/0.13.0.md + git commit -m "docs(circuit-breaker): document rate mode; 0.13.0 release notes + + Co-Authored-By: Claude Opus 4.8 (1M context) " + ``` + +--- + +## Ship bookkeeping (after merge) + +Per the planning convention: set this bundle's `design.md` + `plan.md` +frontmatter to `status: shipped` with the PR number, move +`changes/active/2026-06-16.02-circuit-breaker-rate-mode/` to `changes/archive/`, +flip its Index line from Active to Archived, and remove the now-closed +"CircuitBreaker v2" item from `planning/deferred.md` (or trim it to just the +still-deferred parts: count-based windows, manual control + state, slow-call +axis). Release 0.13.0 by creating the `0.13.0` GitHub release (tag-driven publish). From 89fe1cd36cdf6a850c0e8a55365f676c87447cfc Mon Sep 17 00:00:00 2001 From: Artur Shiriev Date: Tue, 16 Jun 2026 13:36:28 +0300 Subject: [PATCH 7/9] docs(circuit-breaker): document rate mode in the module docstring Co-Authored-By: Claude Opus 4.8 (1M context) --- src/httpware/middleware/resilience/circuit_breaker.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/httpware/middleware/resilience/circuit_breaker.py b/src/httpware/middleware/resilience/circuit_breaker.py index 0a36af5..3c530ca 100644 --- a/src/httpware/middleware/resilience/circuit_breaker.py +++ b/src/httpware/middleware/resilience/circuit_breaker.py @@ -1,4 +1,4 @@ -"""CircuitBreaker + AsyncCircuitBreaker — classic consecutive-failure circuit breaker. +"""CircuitBreaker + AsyncCircuitBreaker — consecutive-failure and failure-rate circuit breakers. See planning/specs/2026-06-13-circuit-breaker-and-timeout-design.md for the contract. @@ -17,6 +17,15 @@ HALF_OPEN — admit exactly one probe at a time; success_threshold consecutive probe successes close the circuit; one probe failure re-opens it. 
+Trip modes: + Classic (default) — opens when consecutive counted-failures reach failure_threshold. + Active whenever failure_rate_threshold is unset; failure_threshold tunes the trip point. + Rate (opt-in) — opens when the failure rate over a rolling window_seconds window + meets or exceeds failure_rate_threshold, provided at least minimum_calls + outcomes have been observed in that window. Set failure_rate_threshold to + activate; failure_threshold is ignored in this mode. + Half-open recovery and event names are identical across both modes. + The lock-free _CircuitBreakerState holds the transition logic, shared by both wrappers. AsyncCircuitBreaker relies on asyncio atomicity (no await inside a transition) plus a single-event-loop guard; CircuitBreaker (sync) serializes transitions with a From 0cbc523ccd9ca6ba14acd8659cbfce50100550be Mon Sep 17 00:00:00 2001 From: Artur Shiriev Date: Tue, 16 Jun 2026 13:45:08 +0300 Subject: [PATCH 8/9] chore(planning): archive the circuit-breaker-rate-mode bundle (#69) Ship bookkeeping for PR #69: fill the 0.13.0 release-notes PR number, mark the bundle shipped (pr: 69), move it from changes/active/ to changes/archive/, flip its Index line to Archived, and trim the deferred "CircuitBreaker v2" item to the still-open axes (count-based window, manual control + state). Co-Authored-By: Claude Opus 4.8 (1M context) --- planning/README.md | 3 ++- .../design.md | 6 ++--- .../plan.md | 4 ++-- planning/deferred.md | 24 ++++--------------- planning/releases/0.13.0.md | 2 +- 5 files changed, 13 insertions(+), 26 deletions(-) rename planning/changes/{active => archive}/2026-06-16.02-circuit-breaker-rate-mode/design.md (96%) rename planning/changes/{active => archive}/2026-06-16.02-circuit-breaker-rate-mode/plan.md (99%) diff --git a/planning/README.md b/planning/README.md index faaaf0e..c7dd471 100644 --- a/planning/README.md +++ b/planning/README.md @@ -70,10 +70,11 @@ carry **no** frontmatter — living prose, dated by git. 
### Active -- **[circuit-breaker-rate-mode](changes/active/2026-06-16.02-circuit-breaker-rate-mode/design.md)** (2026-06-16) — Add an opt-in time-based failure-rate trip mode to the circuit breaker (classic stays default). Closes deferred item "CircuitBreaker v2 (a)". Targets 0.13.0. +_None._ ### Archived (shipped) +- **[circuit-breaker-rate-mode](changes/archive/2026-06-16.02-circuit-breaker-rate-mode/design.md)** (#69, 2026-06-16) — Added an opt-in time-based failure-rate trip mode to the circuit breaker (classic stays default). Shipped 0.13.0; closed deferred item "CircuitBreaker v2 (a)". - **[per-verb-with-response](changes/archive/2026-06-16.01-per-verb-with-response/design.md)** (#68, 2026-06-16) — Added `get_with_response` … `request_with_response` siblings (required `response_model`, returns `(Response, T)`) to both clients. Shipped 0.12.0; closed the deferred "Per-verb-with-response siblings" item. - **[custom-decoder-guide](changes/archive/2026-06-15.01-custom-decoder-guide/change.md)** (#67, 2026-06-15) — Docs: a "write your own `ResponseDecoder`" guide for Seam B, mirroring `docs/middleware.md`. Closed deferred item G6. - **[audit-doc-fixes](changes/archive/2026-06-14.06-audit-doc-fixes/change.md)** (#66, 2026-06-14) — Closed the [deep-audit](audits/2026-06-14-deep-audit.md) doc-accuracy findings: `Client.stream()` docs, terminal-call attribution, the four auto-raise sites, the pydantic upper bound, and root import paths. 
diff --git a/planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/design.md b/planning/changes/archive/2026-06-16.02-circuit-breaker-rate-mode/design.md similarity index 96% rename from planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/design.md rename to planning/changes/archive/2026-06-16.02-circuit-breaker-rate-mode/design.md index b8f223c..ea96e62 100644 --- a/planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/design.md +++ b/planning/changes/archive/2026-06-16.02-circuit-breaker-rate-mode/design.md @@ -1,11 +1,11 @@ --- -status: draft +status: shipped date: 2026-06-16 slug: circuit-breaker-rate-mode supersedes: null superseded_by: null -pr: null -outcome: null +pr: 69 +outcome: Shipped 0.13.0 — opt-in time-based failure-rate trip mode (failure_rate_threshold + window_seconds + minimum_calls) on both breakers; classic stays default. Closed the "CircuitBreaker v2 (a)" deferred item; count-based windows, slow-call axis, and manual control + state remain deferred. 
--- # Design: CircuitBreaker v2 — time-based failure-rate trip mode diff --git a/planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/plan.md b/planning/changes/archive/2026-06-16.02-circuit-breaker-rate-mode/plan.md similarity index 99% rename from planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/plan.md rename to planning/changes/archive/2026-06-16.02-circuit-breaker-rate-mode/plan.md index e39eab6..c0b10f5 100644 --- a/planning/changes/active/2026-06-16.02-circuit-breaker-rate-mode/plan.md +++ b/planning/changes/archive/2026-06-16.02-circuit-breaker-rate-mode/plan.md @@ -1,9 +1,9 @@ --- -status: draft +status: shipped date: 2026-06-16 slug: circuit-breaker-rate-mode spec: circuit-breaker-rate-mode -pr: null +pr: 69 --- # circuit-breaker-rate-mode — implementation plan diff --git a/planning/deferred.md b/planning/deferred.md index d752b1a..bdc869e 100644 --- a/planning/deferred.md +++ b/planning/deferred.md @@ -8,27 +8,13 @@ As of 0.7.0, all planned epics (3, 4, 5, 6) are closed — see the [change Index ### Resilience -- **CircuitBreaker v2 — rolling-window / failure-rate mode** (`src/httpware/middleware/resilience/circuit_breaker.py`) — the 0.10.0 breaker ships only the *classic consecutive-failure* model (open after N counted failures in a row; any success resets the streak). That can't catch *partial* degradation (e.g. a steady 50% error rate that alternates success/fail never trips). Deferred to v2 in the 0.10.0 spec; the config was shaped so a rate mode is purely additive (a new opt-in `failure_rate_threshold` + window + `minimum_calls`, with classic remaining the default). Demand-gated: build when someone needs rate-based tripping. +- **CircuitBreaker v2 — remaining axes** (`src/httpware/middleware/resilience/circuit_breaker.py`) — 0.13.0 shipped axis **(a)**, the opt-in **time-based** failure-rate trip mode (`failure_rate_threshold` + `window_seconds` + `minimum_calls`; classic stays default). 
Still open, each independent and demand-gated: - **Comparison with the reference implementations** (verified against current docs, 2026-06-13): + - **Count-based window variant** — a `window_type="count"` selector (ring buffer of the last N outcomes) alongside the shipped time-based window. Resilience4j offers both; we chose time-based first as the better HTTP-service fit. Additive: a new window-type knob, time-based remaining the default. Build if someone needs volume-relative (not time-relative) windows. + - **(b) Manual control + read-only `state`** — `force_open`/`force_closed` and a `state` introspection property (Resilience4j's registry, Polly's `StateProvider`/`ManualControl`). Parked as YAGNI in the 0.10.0 audit (decision 4: events-only control surface). Independent of the trip mode. + - **(c) Slow-call-rate dimension** — *don't*: Resilience4j-only, and redundant with `AsyncTimeout`. Recorded here only so a future reader doesn't re-propose it. - | Axis | httpware v1 (shipped) | Resilience4j | Polly v8 | - |---|---|---|---| - | Trip model | consecutive count (`failure_threshold=5`) | failure **rate** over sliding window (`failureRateThreshold=50%`) | failure **rate** over time window (`FailureRatio=0.1`) | - | Window | none (one counter) | count-based (default, size 100) *or* time-based | time-based only (`SamplingDuration=30s`) | - | Min-volume floor | n/a | `minimumNumberOfCalls=100` | `MinimumThroughput=100` | - | Consecutive-count mode | only mode | non-default | **removed in v8** (was v7 default) | - | Half-open recovery | one probe, `success_threshold` consecutive successes (default 1) | permits N calls (default 10), closes on rate over them | one trial call, success→close (≈ httpware default) | - | OPEN→HALF_OPEN | lazy (next request) | lazy, or optional timer (`automaticTransition…`) | lazy (next request after `BreakDuration=5s`) | - | Failure classification | HTTP-native: `failure_status_codes` (5xx), **429/4xx = success** | generic exception 
predicate (`recordExceptions`/`ignoreExceptions`) | generic predicate (`ShouldHandle`, default all except cancellation) | - | Slow-call trip axis | none (latency is `AsyncTimeout`'s job) | yes — `slowCallRateThreshold` (100%) / `slowCallDurationThreshold` (60s) | none | - | Control surface | events-only (no `state`/`reset`/`isolate` — audit decision 4) | registry: state + metrics + manual transitions | `StateProvider` (read) + `ManualControl` (Isolate/Close) | - - **Takeaways for scoping v2:** (1) Polly v8 *deleted* consecutive-count; Resilience4j doesn't default to it — so httpware v1's only mode is the one both treat as legacy/non-default. Adding rate mode while *keeping* classic is a small edge neither offers. (2) "Polly-v8-equivalent" = just the rate-over-window mode. "Resilience4j-equivalent" additionally implies count-vs-time window choice and (separately) manual control + state introspection. (3) httpware's HTTP-native classification (429-as-success out of the box) is already *ahead* of both generic-predicate libraries — don't regress it. (4) Skip the slow-call axis (Resilience4j-only; redundant with `AsyncTimeout`). - - Three separable additions, rough priority: **(a) rate-over-window trip mode** (the core ask; additive opt-in), **(b) manual control + read-only `state`** (independent; both libraries have it, httpware parked it as YAGNI), **(c) slow-call-rate dimension** (don't — covered by `AsyncTimeout`). - - **Concurrency note:** the window recorder (ring buffer for count-based; time-bucketed counters for time-based) is more state than v1's single counter, but recording an outcome stays a synchronous mutation, so the async lock-free atomicity invariant and the sync `threading.Lock` both still hold. Time-based eviction must read `_now()` inside the same synchronous critical section. 
+ **Don't regress:** httpware's HTTP-native failure classification (429/4xx = success out of the box) is already ahead of the generic-predicate breakers — preserve it in any v2 work. ### Documentation diff --git a/planning/releases/0.13.0.md b/planning/releases/0.13.0.md index 8bcbbb6..c7b1f8d 100644 --- a/planning/releases/0.13.0.md +++ b/planning/releases/0.13.0.md @@ -73,4 +73,4 @@ The following remain deferred and are not part of 0.13.0: ## Shipped via -PR #XX — time-based failure-rate trip mode for the circuit breaker. +PR #69 — time-based failure-rate trip mode for the circuit breaker. From 33457cccf9292493e4284da60e6cf0cab640322b Mon Sep 17 00:00:00 2001 From: Artur Shiriev Date: Tue, 16 Jun 2026 13:49:03 +0300 Subject: [PATCH 9/9] docs(circuit-breaker): clarify mode-switch precedence and cross-mode validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address PR #69 review: note that failure_rate_threshold is the sole mode switch (both-set → rate wins, not an error), and that window_seconds / minimum_calls are validated in both modes even when inert in classic. Co-Authored-By: Claude Opus 4.8 (1M context) --- architecture/resilience.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/architecture/resilience.md b/architecture/resilience.md index 26f3f2c..7e91288 100644 --- a/architecture/resilience.md +++ b/architecture/resilience.md @@ -14,7 +14,7 @@ `AsyncCircuitBreaker` and sync `CircuitBreaker` are a classic consecutive-failure circuit breaker: the circuit opens after `failure_threshold` consecutive counted failures, fast-fails while OPEN, admits one probe after `reset_timeout` (HALF_OPEN), and closes again after `success_threshold` consecutive probe successes; a probe failure re-opens it. 
A *counted failure* is a `NetworkError`, an httpware `TimeoutError`, or a `StatusError` whose `status_code` is in the effective failure set (default: all 5xx, 500–599); 4xx including 429 count as successes, and any other exception type propagates unchanged without affecting circuit state. When the breaker refuses a request — OPEN, or HALF_OPEN with the single probe slot already taken — it raises `CircuitOpenError` and never forwards to `next`; the error's `retry_after` carries the seconds until the next probe will be admitted, or `None` when a concurrent probe is already in flight. A breaker instance is sharable across clients (one shared circuit); a sync instance cannot be shared with an async one. -The classic consecutive-failure mode is the default and unchanged. An opt-in time-based failure-rate mode is available: set `failure_rate_threshold` (a float in `(0, 1]`) to switch. In rate mode the circuit opens when the observed failure rate over a rolling `window_seconds` window (default `30.0` s) meets or exceeds the threshold, but only once `minimum_calls` outcomes have been observed in that window (default `20`). The `failure_threshold` parameter is ignored in rate mode — the trip condition is purely rate-based. Half-open recovery (`reset_timeout`, `success_threshold`, the single-probe admission) is identical to classic mode. The event names (`circuit.opened`, `circuit.rejected`, `circuit.half_open`, `circuit.closed`) are the same in both modes; in rate mode the `circuit.opened` event carries extra attributes — `failure_rate`, `failure_rate_threshold`, `window_seconds`, `observed_calls` — and its message is `"circuit opened — failure rate threshold reached"`. +The classic consecutive-failure mode is the default and unchanged. An opt-in time-based failure-rate mode is available: set `failure_rate_threshold` (a float in `(0, 1]`) to switch. 
In rate mode the circuit opens when the observed failure rate over a rolling `window_seconds` window (default `30.0` s) meets or exceeds the threshold, but only once `minimum_calls` outcomes have been observed in that window (default `20`). The presence of `failure_rate_threshold` is the sole mode switch: when it is set, the breaker is in rate mode and `failure_threshold` is ignored (setting both is not an error — rate mode wins). `window_seconds` and `minimum_calls` are validated at construction in both modes even though they are inert in classic mode, so an invalid value is rejected eagerly regardless of mode. Half-open recovery (`reset_timeout`, `success_threshold`, the single-probe admission) is identical to classic mode. The event names (`circuit.opened`, `circuit.rejected`, `circuit.half_open`, `circuit.closed`) are the same in both modes; in rate mode the `circuit.opened` event carries extra attributes — `failure_rate`, `failure_rate_threshold`, `window_seconds`, `observed_calls` — and its message is `"circuit opened — failure rate threshold reached"`. `AsyncTimeout` is an async-only middleware that bounds the total wall-clock for the whole inner pipeline (most importantly across an `AsyncRetry` loop, whose attempts and backoff sleeps `httpx2` cannot bound). It is not a per-call timeout — `httpx2`'s connect/read/write/pool timeouts are the right tool for a single outbound call, and `AsyncTimeout` does not duplicate them. It rejects a non-finite or non-positive `timeout` at construction, and on expiry raises httpware `TimeoutError`. There is no sync `Timeout`: a sync total-deadline cannot interrupt a blocking call mid-flight, and `httpx2` already covers sync per-call timeouts. Sync callers configure `httpx2`'s timeouts directly.