diff --git a/Doc/whatsnew/3.16.rst b/Doc/whatsnew/3.16.rst index 1a73a79a58b78b1..ed96a328be5f6d5 100644 --- a/Doc/whatsnew/3.16.rst +++ b/Doc/whatsnew/3.16.rst @@ -423,6 +423,12 @@ re (Contributed by Serhiy Storchaka in :gh:`152033` and Pieter Eendebak in :gh:`152056`.) +* A greedy repeat is now compiled as possessive when the repeated characters + and the characters that can follow it are provably disjoint (for example + ``\d+\.`` is compiled as if it were ``\d++\.``), which makes failing and + heavily backtracking matches severalfold faster. + (Contributed by Serhiy Storchaka in :gh:`153044`.) + module_name ----------- diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index 58a24964c3b374f..476163540a27ce7 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -221,7 +221,7 @@ def _code(p, flags): flags = p.state.flags | flags # run the optimizer passes over the parsed pattern - optimize(p) + optimize(p, flags) code = [] diff --git a/Lib/re/_optimizer.py b/Lib/re/_optimizer.py index 6a0bb5a2973eaec..6bae0c6233b037b 100644 --- a/Lib/re/_optimizer.py +++ b/Lib/re/_optimizer.py @@ -17,6 +17,7 @@ import _sre from . import _parser +from ._casefix import _EXTRA_CASES from ._constants import * _CHARSET_ALL = [(NEGATE, None)] @@ -428,6 +429,330 @@ def _compile_info(code, pattern, flags): _compile_charset(charset, flags, code) code[skip] = len(code) - skip + +# --- Auto-possessification pass --------------------------------------------- + +_REPEAT_CODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}) +_POSSESSIFY_UNITS = frozenset({LITERAL, NOT_LITERAL, ANY, ANY_ALL, IN, CATEGORY}) + +# \d, \w, \s and the line break category as unions of disjoint "atoms": +# d=digit, l=word non-digit, b=line break, s=space non-line-break, o=other. +# digit<=word, linebreak<=space and word disjoint from space hold for both +# ASCII and Unicode, so disjoint atom sets mean really disjoint categories. +_CAT_UNIVERSE = frozenset('dlbso') +_CAT_ATOMS = { + CATEGORY_DIGIT: frozenset('d'), + CATEGORY_WORD: frozenset('dl'), + CATEGORY_SPACE: frozenset('bs'), + CATEGORY_LINEBREAK: frozenset('b'), +} +_CAT_ATOMS.update({ + CATEGORY_NOT_DIGIT: _CAT_UNIVERSE - _CAT_ATOMS[CATEGORY_DIGIT], + CATEGORY_NOT_WORD: _CAT_UNIVERSE - _CAT_ATOMS[CATEGORY_WORD], + CATEGORY_NOT_SPACE: _CAT_UNIVERSE - _CAT_ATOMS[CATEGORY_SPACE], + CATEGORY_NOT_LINEBREAK: _CAT_UNIVERSE - _CAT_ATOMS[CATEGORY_LINEBREAK], +}) + +_ASCII_SPACE = frozenset(b' \t\n\r\f\v') +_ASCII_WORD = frozenset(b'_') | frozenset( + range(0x30, 0x3a)) | frozenset(range(0x41, 0x5b)) | frozenset(range(0x61, 0x7b)) +_PROBE_LIMIT = 64 # cap on the size of a finite atom used as a witness set +_FOLLOW_LIMIT = 64 # cap on a follower set: an empty branch alternative + # re-appends the continuation, exponential for (|)(|)... +_DEPTH_LIMIT = 100 # cap on the follower-scan recursion: one level per group + +def _tolower(c, flags): + if flags & SRE_FLAG_UNICODE: + return _sre.unicode_tolower(c) + return _sre.ascii_tolower(c) + +def _fold_set(c, flags): + # Code points witnessing what LITERAL c matches: tolower() of any match + # lies here and each element is itself a match (simple tolower plus + # _EXTRA_CASES, like the IGNORECASE matcher). + if not (flags & SRE_FLAG_IGNORECASE) or flags & SRE_FLAG_LOCALE: + return (c,) + lo = _tolower(c, flags) + if flags & SRE_FLAG_UNICODE: + extra = _EXTRA_CASES.get(lo) + if extra: + return (lo, *extra) + return (lo,) + +def _lit_matches(d, c, flags): + # Whether LITERAL d matches input code point c. + if not (flags & SRE_FLAG_IGNORECASE) or flags & SRE_FLAG_LOCALE: + return c == d + return _tolower(c, flags) in _fold_set(d, flags) + +# Categories whose membership is invariant under case folding (verified over +# the full range); the others cannot be decided under IGNORECASE, where a +# charset member is matched against the lowercased character. +_FOLD_CLOSED = frozenset({ + CATEGORY_DIGIT, CATEGORY_WORD, CATEGORY_SPACE, CATEGORY_LINEBREAK, + CATEGORY_NUMERIC, CATEGORY_PRINTABLE, CATEGORY_N, CATEGORY_LM, + CATEGORY_NL, CATEGORY_NO, CATEGORY_CF, CATEGORY_Z, CATEGORY_ZS, + CATEGORY_C, CATEGORY_CN, CATEGORY_XID_CONTINUE, CATEGORY_ASSIGNED, + CATEGORY_BLANK, CATEGORY_GRAPH, CATEGORY_PRINT, CATEGORY_CASED, +}) +_FOLD_CLOSED |= frozenset(CH_NEGATE[cat] for cat in _FOLD_CLOSED) + +def _cat_matches(cat, c, flags): + # Whether category cat matches code point c, decided by the engine's own + # predicate; None if it depends on the runtime locale or on case folding. + if flags & SRE_FLAG_LOCALE: + return None + if flags & SRE_FLAG_IGNORECASE and cat not in _FOLD_CLOSED: + return None + if flags & SRE_FLAG_UNICODE: + cat = CH_UNICODE[cat] + return _sre.category_matches(cat, c) + +def _member_matches(op, av, c, flags): + # Whether a charset member (op, av) matches code point c. None if unknown. + if op is LITERAL: + return _lit_matches(av, c, flags) + if op is RANGE: + lo, hi = av + if lo <= c <= hi: + return True + if not (flags & SRE_FLAG_IGNORECASE) or flags & SRE_FLAG_LOCALE: + return False + if lo <= _tolower(c, flags) <= hi or any(lo <= x <= hi + for x in _fold_set(c, flags)): + return True + return None # case folding into the range can't be ruled out cheaply + if op is CATEGORY: + return _cat_matches(av, c, flags) + return None + +def _atom_matches(op, av, c, flags): + # Whether the one-character atom (op, av) matches code point c. + # Returns None when it cannot be decided (callers treat that as "maybe"). + if op is LITERAL: + return _lit_matches(av, c, flags) + if op is NOT_LITERAL: + return not _lit_matches(av, c, flags) + if op is CATEGORY: + return _cat_matches(av, c, flags) + if op is ANY: + return True if flags & SRE_FLAG_DOTALL else c != 0x0a + if op is ANY_ALL: + return True + if op is IN: + # Evaluate the charset the way the engine's charset() walk does: + # NEGATE toggles the polarity, a member hit returns the current + # polarity, and the end returns the complement of the final one + # (this also covers difference-fused charsets, see _fuse_difference). + ok = True + results = set() + for iop, iav in av: + if iop is NEGATE: + ok = not ok + continue + r = _member_matches(iop, iav, c, flags) + if r: + results.add(ok) + break + if r is None: + results.add(ok) # may or may not hit this member + else: + results.add(not ok) + if len(results) == 1: + return results.pop() + return None + return None + +def _finite_set(op, av, flags): + # The set of code points the atom matches, if finite and small; else None. + if op is LITERAL: + return set(_fold_set(av, flags)) + if op is IN: + if av and av[0] == (NEGATE, None): + return None + out = set() + for iop, iav in av: + if iop is LITERAL: + out.update(_fold_set(iav, flags)) + elif iop is RANGE: + if iav[1] - iav[0] >= _PROBE_LIMIT: + return None + for x in range(iav[0], iav[1] + 1): + out.update(_fold_set(x, flags)) + elif iop is CATEGORY: + if flags & SRE_FLAG_LOCALE or flags & SRE_FLAG_UNICODE: + return None # Unicode/locale categories are not small + if iav is CATEGORY_DIGIT: + out.update(range(0x30, 0x3a)) + elif iav is CATEGORY_WORD: + out.update(_ASCII_WORD) + elif iav is CATEGORY_SPACE: + out.update(_ASCII_SPACE) + elif iav is CATEGORY_LINEBREAK: + out.add(0x0a) + else: + return None # a negated ASCII category is not small + else: + return None + if len(out) > _PROBE_LIMIT: + return None + return out + return None + +def _cat_atom_set(op, av): + # The dlbso atom set the atom matches, if it is a bare category or a + # charset of categories (the first member claiming an atom decides it + # with the current NEGATE polarity, the end claims the rest). + if op is CATEGORY: + return _CAT_ATOMS.get(av) + if op is not IN: + return None + ok = True + decided = set() + matched = set() + for iop, iav in av: + if iop is NEGATE: + ok = not ok + continue + if iop is not CATEGORY: + if not ok: + # a non-category member of a fail segment only narrows the + # set; ignoring it over-approximates, which stays sound + continue + return None + atoms = _CAT_ATOMS.get(iav) + if atoms is None: + return None + if ok: + matched |= atoms - decided + decided |= atoms + if not ok: + matched |= _CAT_UNIVERSE - decided + return matched + +def _as_single_category(op, av): + # The category code if the atom is a bare category or a single-category + # class, else None. + if op is CATEGORY: + return av + if op is IN and len(av) == 1 and av[0][0] is CATEGORY: + return av[0][1] + return None + +def _disjoint(atom, other, flags): + # True only if atom and other provably cannot match a common character. + if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_IGNORECASE: + # case folding is decided by the runtime locale; prove nothing + return False + ca = _as_single_category(*atom) + if ca is not None: + cb = _as_single_category(*other) + # A category and its complement are disjoint whatever they mean -- + # but only within one flag context (unicode \w and ascii \W + # overlap), which holds because the walk never compares atoms + # across a flag-scoping boundary. + if cb is not None and cb == CH_NEGATE[ca]: + return True + if not (flags & SRE_FLAG_LOCALE): + a1 = _cat_atom_set(*atom) + if a1 is not None: + a2 = _cat_atom_set(*other) + if a2 is not None and a1.isdisjoint(a2): + return True + fa = _finite_set(*atom, flags) + fb = _finite_set(*other, flags) + if fa is not None and fb is not None: + return fa.isdisjoint(fb) + if fa is not None: + return not any(_atom_matches(*other, c, flags) is not False for c in fa) + if fb is not None: + return not any(_atom_matches(*atom, c, flags) is not False for c in fb) + return False + +def _leading_atom(data): + # The leading atom of a rigid body -- a concatenation of single-character + # atoms with no internal choice. A repeat of it gives back only whole + # iterations, so its leading atom is all the follower must avoid. + lead = None + for op, av in data: + if op is SUBPATTERN and not av[1] and not av[2]: + a = _leading_atom(av[3].data) + elif op is ATOMIC_GROUP: + a = _leading_atom(av.data) + elif op in _POSSESSIFY_UNITS: + a = (op, av) + else: + return None + if a is None: + return None + if lead is None: + lead = a + return lead + +def _first_consumers(seq, i, flags, cont, depth=0): + # Atoms for every character that could be consumed at position i of seq; + # cont is the same for what follows seq. None if it can't be analyzed. + if depth >= _DEPTH_LIMIT: + return None + depth += 1 + acc = [] + n = len(seq) + while i < n: + op, av = seq[i] + if op in _POSSESSIFY_UNITS: + acc.append((op, av)) + return acc + if op is SUBPATTERN: + if av[1] or av[2]: + return None # flag-scoping group: atoms can't carry their flags + after = _first_consumers(seq, i + 1, flags, cont, depth) + if after is None: + return None + inner = _first_consumers(av[3].data, 0, flags, after, depth) + return None if inner is None else acc + inner + if op is ATOMIC_GROUP: + after = _first_consumers(seq, i + 1, flags, cont, depth) + if after is None: + return None + inner = _first_consumers(av.data, 0, flags, after, depth) + return None if inner is None else acc + inner + if op is BRANCH: + after = _first_consumers(seq, i + 1, flags, cont, depth) + if after is None: + return None + for alt in av[1]: + a = _first_consumers(alt.data, 0, flags, after, depth) + if a is None or len(acc) + len(a) > _FOLLOW_LIMIT: + return None + acc += a + return acc + if op in _REPEAT_CODES: + mn, mx, p = av + sub = _first_consumers(p.data, 0, flags, None, depth) + if sub is None or len(acc) + len(sub) > _FOLLOW_LIMIT: + return None + acc += sub + if mn == 0: + i += 1 + continue + return acc + if op is AT and av is AT_END_STRING: + # \z matches only at the very end; backtracking the repeat moves + # earlier and can never satisfy it, so nothing need be disjoint. + return acc + if op is AT and av is AT_END: + # $ is like \z but also matches before a '\n'. Only MULTILINE + # exposes an interior one to backtracking, and then only if the + # repeat can match '\n'. + if flags & SRE_FLAG_MULTILINE: + return acc + [(LITERAL, 0x0a)] + return acc + return None # assertion, anchor, group reference, ... -> give up + if cont is None or len(acc) + len(cont) > _FOLLOW_LIMIT: + return None + return acc + cont + + # Difference-fusion peephole: rewrite [A--B]-style A(?a)', 'aaa', 0, (0, 3)), + ]) + def test_auto_possessification(self, pattern, string, flags, expected): + m = re.compile(pattern, flags).search(string) + self.assertEqual(m.span() if m else None, expected) + + def test_auto_possessification_keeps_captures(self): + # captures are preserved when a repeat is made possessive + self.assertEqual(re.search(r'(a+)b', 'aaab').group(1), 'aaa') + self.assertEqual(re.search(r'(ab)+c', 'ababc').span(), (0, 5)) + def test_atomic_grouping(self): """Test Atomic Grouping Test non-capturing groups of the form (?>...), which does @@ -3044,6 +3074,88 @@ def test_possesive_repeat(self): ''') +@cpython_only +class OptimizerTests(unittest.TestCase): + # Auto-possessification (see Lib/re/_optimizer.py). + + def is_possessive(self, pattern, flags=0): + with captured_stdout() as out: + re.compile(pattern, flags | re.DEBUG) + return 'POSSESSIVE_REPEAT' in out.getvalue() + + @subTests('pattern,flags', [ + (r'a+b', 0), (r'\d+\.', 0), (r'[a-z]+[0-9]', 0), (r'\w+\s', 0), + (r'(a)+b', 0), (r'(\d)+x', 0), # capturing groups + (r'(ab)+c', 0), (r'(\d\d\d)+x', 0), + (r'([ab]c)+d', 0), (r'((ab)c)+d', 0), # rigid bodies + (r'a+\z', 0), (r'a+$', 0), (r'\d+$', 0), (r'a+$', re.M), # anchors + (r'(?>ab)+c', 0), (r'a+(?>bc)d', 0), # atomic groups + (r'(a+)b', 0), (r'(a+|c)d', 0), # across a group + (r'a+', 0), # end of the pattern + # a fused set-operation charset that excludes the follower + (r'[a-z--b]+b', 0), (r'[\w--\d]+\d', 0), (r'[\s--\n]+\S', 0), + (r'[\w--0-5]+\s', 0), (r'[^\d]+\d', 0), + # \p{...} engine categories, decided per character + (r'\p{Lu}+x', 0), (r'\p{Alpha}+!', 0), (r'x+\p{Lu}', 0), + (r'\p{XID_Start}+-', 0), (r'\p{Cf}+A', re.I), # Cf is fold-closed + (r'\p{Cased}+!', 0), (r'\p{Case_Ignorable}+A', 0), + # a category and its complement are disjoint whatever they mean + (r'\p{Lu}+\P{Lu}', 0), (r'\P{Cased}+\p{Cased}', 0), + (r'\p{Lu}+\P{Lu}', re.I), (r'\d+\D', 0), + (r'\w+\W', 0), (r'(?a:\w+\W)', 0), + ]) + def test_possessified(self, pattern, flags): + self.assertTrue(self.is_possessive(pattern, flags)) + + def test_many_sequential_groups(self): + # The follower scan recurses once per following group; a very long + # sequence of groups must not turn that into a crash, and repeats + # outside the chain are still optimized. + self.assertTrue(self.is_possessive('(a)' * 2000 + 'b+c')) + + def test_follower_set_capped(self): + # Each empty branch alternative appends the continuation again, so + # an uncapped follower set grows exponentially (found by OSS-Fuzz). + re.compile('(|)' * 100 + 'x') + # Each growth point gives up at the limit and no earlier (the y? + # followers overlap each other, so only x+ can possessify): + # a chain of optional followers, + self.assertTrue(self.is_possessive('x+' + 'y?' * 64 + 'y')) + self.assertFalse(self.is_possessive('x+' + 'y?' * 65 + 'y')) + # branch alternatives (the first characters must be distinct -- the + # parser factors a common prefix out of the alternation -- and there + # are not enough unreserved ASCII alphanumerics for 65 of them), + alts = ['%cz' % (0x100 + i) for i in range(65)] + self.assertTrue(self.is_possessive('x+(?:%s)' % '|'.join(alts[:64]))) + self.assertFalse(self.is_possessive('x+(?:%s)' % '|'.join(alts))) + # and resuming what follows the group. + alts[0] = 'yz' + self.assertTrue(self.is_possessive( + '(x+' + 'y?' * 32 + ')(?:%s)' % '|'.join(alts[:32]))) + self.assertFalse(self.is_possessive( + '(x+' + 'y?' * 32 + ')(?:%s)' % '|'.join(alts[:33]))) + + @subTests('pattern,flags', [ + (r'a+a', 0), (r'.+x', 0), # overlapping + (r'(ab)+a', 0), (r'(a|ab)+c', 0), (r'(ab?)+c', 0), + (r'[a\n]+$', re.M), (r'\s+$', re.M), # $ before a newline + (r'a+\b', 0), (r'a+\B', 0), # word boundary + (r'a+(?=a)', 0), (r'a+(?!b)', 0), # lookaround + (r'(?i:a)+A', 0), (r'a+(?i:A)', 0), # scoped flags + # not complement pairs: unicode \w and ascii \W overlap (e.g. é) + (r'(?a:\w+)\W', 0), (r'\w+(?a:\W)', 0), + (b'a+B', re.L | re.I), # runtime locale folding + (r'a+?b', 0), # lazy + (r'[a-z--b]+c', 0), (r'[\w--\d]+\w', 0), # follower in the set + (r'\p{Lu}+A', 0), (r'\p{L}+\d', 0), # not provably disjoint + (r'\p{Lu}+\P{L}', 0), # disjoint, but not a complement pair + # Lu is not fold-closed: under IGNORECASE it depends on the context + (r'\p{Lu}+x', re.I), + ]) + def test_not_possessified(self, pattern, flags): + self.assertFalse(self.is_possessive(pattern, flags)) + + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): self.assertEqual(repr(re.compile(pattern)), expected) diff --git a/Misc/NEWS.d/next/Library/2026-07-04-21-00-00.gh-issue-153044.PoSs3s.rst b/Misc/NEWS.d/next/Library/2026-07-04-21-00-00.gh-issue-153044.PoSs3s.rst new file mode 100644 index 000000000000000..d09a54658a27444 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-07-04-21-00-00.gh-issue-153044.PoSs3s.rst @@ -0,0 +1,5 @@ +Regular expressions now compile a greedy repeat as possessive when the +repeated characters and the characters that can follow it are provably +disjoint (for example ``\d+\.`` as if it were ``\d++\.``). This cannot +change what a pattern matches, and makes failing and heavily backtracking +matches severalfold faster. diff --git a/Modules/_sre/clinic/sre.c.h b/Modules/_sre/clinic/sre.c.h index b49bf4e058b69b6..4c7b708b5a4f5ae 100644 --- a/Modules/_sre/clinic/sre.c.h +++ b/Modules/_sre/clinic/sre.c.h @@ -7,7 +7,7 @@ preserve # include "pycore_runtime.h" // _Py_ID() #endif #include "pycore_abstract.h" // _PyNumber_Index() -#include "pycore_modsupport.h" // _PyArg_UnpackKeywords() +#include "pycore_modsupport.h" // _PyArg_CheckPositional() PyDoc_STRVAR(_sre_getcodesize__doc__, "getcodesize($module, /)\n" @@ -164,6 +164,60 @@ _sre_unicode_tolower(PyObject *module, PyObject *arg) return return_value; } +PyDoc_STRVAR(_sre_category_matches__doc__, +"category_matches($module, category, character, /)\n" +"--\n" +"\n" +"Whether the character matches the resolved category code."); + +#define _SRE_CATEGORY_MATCHES_METHODDEF \ + {"category_matches", _PyCFunction_CAST(_sre_category_matches), METH_FASTCALL, _sre_category_matches__doc__}, + +static int +_sre_category_matches_impl(PyObject *module, unsigned int category, + int character); + +static PyObject * +_sre_category_matches(PyObject *module, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + unsigned int category; + int character; + int _return_value; + + if (!_PyArg_CheckPositional("category_matches", nargs, 2, 2)) { + goto exit; + } + { + Py_ssize_t _bytes = PyLong_AsNativeBytes(args[0], &category, sizeof(unsigned int), + Py_ASNATIVEBYTES_NATIVE_ENDIAN | + Py_ASNATIVEBYTES_ALLOW_INDEX | + Py_ASNATIVEBYTES_UNSIGNED_BUFFER); + if (_bytes < 0) { + goto exit; + } + if ((size_t)_bytes > sizeof(unsigned int)) { + if (PyErr_WarnEx(PyExc_DeprecationWarning, + "integer value out of range", 1) < 0) + { + goto exit; + } + } + } + character = PyLong_AsInt(args[1]); + if (character == -1 && PyErr_Occurred()) { + goto exit; + } + _return_value = _sre_category_matches_impl(module, category, character); + if ((_return_value == -1) && PyErr_Occurred()) { + goto exit; + } + return_value = PyBool_FromLong((long)_return_value); + +exit: + return return_value; +} + PyDoc_STRVAR(_sre_SRE_Pattern_prefixmatch__doc__, "prefixmatch($self, /, string, pos=0, endpos=sys.maxsize)\n" "--\n" @@ -1568,4 +1622,4 @@ _sre_SRE_Scanner_search(PyObject *self, PyTypeObject *cls, PyObject *const *args #ifndef _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF #define _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF #endif /* !defined(_SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF) */ -/*[clinic end generated code: output=0c867efb64e020aa input=a9049054013a1b77]*/ +/*[clinic end generated code: output=dd44960c30198f1a input=a9049054013a1b77]*/ diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index 7d03b909226f244..4f14ddd7aadd1c8 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -629,6 +629,24 @@ _sre_unicode_tolower_impl(PyObject *module, int character) return sre_lower_unicode(character); } +/*[clinic input] +_sre.category_matches -> bool + + category: unsigned_int(bitwise=True) + character: int + / + +Whether the character matches the resolved category code. +[clinic start generated code]*/ + +static int +_sre_category_matches_impl(PyObject *module, unsigned int category, + int character) +/*[clinic end generated code: output=fab7e15a7c0467bc input=5ad3c5d79206b936]*/ +{ + return sre_category(category, (unsigned int)character) != 0; +} + LOCAL(void) state_reset(SRE_STATE* state) { @@ -3563,6 +3581,7 @@ static PyMethodDef _functions[] = { _SRE_UNICODE_ISCASED_METHODDEF _SRE_ASCII_TOLOWER_METHODDEF _SRE_UNICODE_TOLOWER_METHODDEF + _SRE_CATEGORY_MATCHES_METHODDEF {NULL, NULL} };