diff --git a/src/pendulum/formatting/formatter.py b/src/pendulum/formatting/formatter.py
index b09977d0..062b293b 100644
--- a/src/pendulum/formatting/formatter.py
+++ b/src/pendulum/formatting/formatter.py
@@ -66,7 +66,7 @@ class Formatter:
     _FORMAT_RE: re.Pattern[str] = re.compile(_TOKENS)
-    _FROM_FORMAT_RE: re.Pattern[str] = re.compile(r"(? str:
+        if token.startswith("\\[") and token.endswith("\\]"):
+            # parse() runs re.escape() on the format first, so a literal block such
+            # as "[de]" arrives here as "\\[de\\]". Drop the (escaped) brackets and
+            # keep the already-escaped inner text as a literal.
+            return token[2:-2]
         if token.startswith("[") and token.endswith("]"):
             return token[1:-1]
         elif token.startswith("\\"):
diff --git a/tests/datetime/test_from_format.py b/tests/datetime/test_from_format.py
index ecf59632..19a920eb 100644
--- a/tests/datetime/test_from_format.py
+++ b/tests/datetime/test_from_format.py
@@ -57,6 +57,22 @@ def test_from_format_with_escaped_elements_valid_tokens():
     assert d.timezone_name == "UTC"


+def test_from_format_with_multi_character_escaped_elements():
+    # GH #971: literal blocks longer than one character that contain token letters
+    # (e.g. "the" holds the h/e tokens, "del" holds the d/e tokens) were
+    # mis-tokenized and raised instead of being treated as literal text.
+    d = pendulum.from_format("the year 2023", "[the year] YYYY")
+    assert_datetime(d, 2023, 1, 1, 0, 0, 0)
+    d = pendulum.from_format(
+        "21 de noviembre del 2023", "DD [de] MMMM [del] YYYY", locale="es"
+    )
+    assert_datetime(d, 2023, 11, 21, 0, 0, 0)
+    d = pendulum.from_format(
+        "21 de noviembre de 2023", "DD [de] MMMM [de] YYYY", locale="es"
+    )
+    assert_datetime(d, 2023, 11, 21, 0, 0, 0)
+
+
 def test_from_format_with_millis():
     d = pendulum.from_format("1975-05-21 22:32:11.123456", "YYYY-MM-DD HH:mm:ss.SSSSSS")
     assert_datetime(d, 1975, 5, 21, 22, 32, 11, 123456)