From 9e1d0b40fcf274c01de7bd6e67ed7834a117e6c0 Mon Sep 17 00:00:00 2001 From: Vineeth Sai Date: Sat, 20 Jun 2026 17:58:12 -0700 Subject: [PATCH] Fix from_format crash on multi-character literal blocks Formatter.parse() runs re.escape() on the format string first, so a literal block like [de] arrives as \[de\]. The token regex only suppressed the characters immediately adjacent to the brackets, so token letters in the middle of a multi-character literal (for example the d and e in [del]) were still tokenized, raising AttributeError or a "redefinition of group name" re.error. Match the whole escaped literal block as a single token in _FROM_FORMAT_RE and unwrap it in _replace_tokens, keeping the inner text as a literal. Fixes #971 --- src/pendulum/formatting/formatter.py | 7 ++++++- tests/datetime/test_from_format.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/pendulum/formatting/formatter.py b/src/pendulum/formatting/formatter.py index b09977d0..062b293b 100644 --- a/src/pendulum/formatting/formatter.py +++ b/src/pendulum/formatting/formatter.py @@ -66,7 +66,7 @@ class Formatter: _FORMAT_RE: re.Pattern[str] = re.compile(_TOKENS) - _FROM_FORMAT_RE: re.Pattern[str] = re.compile(r"(? str: + if token.startswith("\\[") and token.endswith("\\]"): + # parse() runs re.escape() on the format first, so a literal block such + # as "[de]" arrives here as "\\[de\\]". Drop the (escaped) brackets and + # keep the already-escaped inner text as a literal. + return token[2:-2] if token.startswith("[") and token.endswith("]"): return token[1:-1] elif token.startswith("\\"): diff --git a/tests/datetime/test_from_format.py b/tests/datetime/test_from_format.py index ecf59632..19a920eb 100644 --- a/tests/datetime/test_from_format.py +++ b/tests/datetime/test_from_format.py @@ -57,6 +57,22 @@ def test_from_format_with_escaped_elements_valid_tokens(): assert d.timezone_name == "UTC" +def test_from_format_with_multi_character_escaped_elements(): + # GH #971: literal blocks longer than one character that contain token letters + # (e.g. "the" holds the h/e tokens, "del" holds the d/e tokens) were + # mis-tokenized and raised instead of being treated as literal text. + d = pendulum.from_format("the year 2023", "[the year] YYYY") + assert_datetime(d, 2023, 1, 1, 0, 0, 0) + d = pendulum.from_format( + "21 de noviembre del 2023", "DD [de] MMMM [del] YYYY", locale="es" + ) + assert_datetime(d, 2023, 11, 21, 0, 0, 0) + d = pendulum.from_format( + "21 de noviembre de 2023", "DD [de] MMMM [de] YYYY", locale="es" + ) + assert_datetime(d, 2023, 11, 21, 0, 0, 0) + + def test_from_format_with_millis(): d = pendulum.from_format("1975-05-21 22:32:11.123456", "YYYY-MM-DD HH:mm:ss.SSSSSS") assert_datetime(d, 1975, 5, 21, 22, 32, 11, 123456)