diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index 617dc96f479926c..1598eb21b42e413 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -696,6 +696,9 @@ character ``'$'``.
      The supported values are the groups ``L``, ``N``, ``Z`` and ``C`` and
      the values ``Lu``, ``Lt``, ``Lm``, ``Nd``, ``Nl``, ``No``, ``Zs``,
      ``Zl``, ``Zp``, ``Cc``, ``Cf``, ``Cs``, ``Co`` and ``Cn``.
+   * The enumerated properties ``Bidi_Class`` (``bc``),
+     ``East_Asian_Width`` (``ea``), ``Grapheme_Cluster_Break`` (``gcb``)
+     and ``Indic_Conjunct_Break`` (``incb``), for example ``\p{bc=AL}``.
    * The binary properties ``XID_Start``, ``XID_Continue``, ``Alphabetic``,
      ``Lowercase``, ``Uppercase``, ``Numeric``, ``Printable``, ``Cased`` and
      ``Case_Ignorable``.  A binary property may also be spelled
diff --git a/Include/internal/pycore_unicodedata_re.h b/Include/internal/pycore_unicodedata_re.h
new file mode 100644
index 000000000000000..1c55199f68838ff
--- /dev/null
+++ b/Include/internal/pycore_unicodedata_re.h
@@ -0,0 +1,38 @@
+/* unicodedata property-index interface for the re module's \p{...} matcher.
+
+   The capsule exposes the value index that unicodedata stores for each
+   character property (the same index returned by unicodedata._ucd_re_info()),
+   so the SRE engine can match \p{...} in C without a per-character Python call
+   into unicodedata.  See Lib/re/_properties.py and Modules/_sre/sre.c.  */
+
+#ifndef Py_INTERNAL_UNICODEDATA_RE_H
+#define Py_INTERNAL_UNICODEDATA_RE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#define PyUnicodeData_RE_CAPSULE_NAME "unicodedata._ucd_re_CAPI"
+
+/* Property selectors.  Private to unicodedata and _sre, so the numbering only
+   has to agree within one build; each must fit the 8-bit operand field.  */
+enum {
+    _Py_UCD_RE_BC = 0,      /* Bidi_Class */
+    _Py_UCD_RE_EA,          /* East_Asian_Width */
+    _Py_UCD_RE_GCB,         /* Grapheme_Cluster_Break */
+    _Py_UCD_RE_INCB,        /* Indic_Conjunct_Break */
+};
+
+typedef struct {
+    /* Return the value index of property prop for character ch, matching the
+       indices in unicodedata._ucd_re_info(); -1 for an unknown property. */
+    int (*property)(int prop, Py_UCS4 ch);
+} _PyUnicode_RE_CAPI;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_UNICODEDATA_RE_H */
diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py
index 58a24964c3b374f..4937e471afa9265 100644
--- a/Lib/re/_compiler.py
+++ b/Lib/re/_compiler.py
@@ -288,6 +288,10 @@ def print_2(*args):
                 arg = str(CHCODES[arg])
                 assert arg[:9] == 'CATEGORY_'
                 print_(op, arg[9:])
+            elif op is CATEGORY_UCD:
+                arg = code[i]
+                i += 1
+                print_(op, '%#x' % arg)
             elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
                 skip = code[i]
                 print_(op, skip, to=i+skip)
diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py
index 6e99dae53501512..d72aab4e429cfeb 100644
--- a/Lib/re/_constants.py
+++ b/Lib/re/_constants.py
@@ -13,7 +13,7 @@
 
 # update when constants are added or removed
 
-MAGIC = 20260622
+MAGIC = 20260626
 
 from _sre import MAXREPEAT, MAXGROUPS  # noqa: F401
 
@@ -120,6 +120,10 @@ def _makecodes(*names):
     'NOT_LITERAL_UNI_IGNORE',
     'RANGE_UNI_IGNORE',
 
+    # Matches a character by a unicodedata property, via the capsule (see
+    # _properties.py and sre.c).  The operand packs negate, property and value.
+    'CATEGORY_UCD',
+
     # The following opcodes are only occurred in the parser output,
     # but not in the compiled code.
     'MIN_REPEAT', 'MAX_REPEAT',
diff --git a/Lib/re/_optimizer.py b/Lib/re/_optimizer.py
index 6a0bb5a2973eaec..c30f3b56b938dc4 100644
--- a/Lib/re/_optimizer.py
+++ b/Lib/re/_optimizer.py
@@ -51,6 +51,9 @@ def _compile_charset(charset, flags, code):
                 emit(CH_UNICODE[av])
             else:
                 emit(av)
+        elif op is CATEGORY_UCD:
+            # av already packs negate, property and value (flag-independent).
+            emit(av)
         else:
             raise PatternError(f"internal: unsupported set operator {op!r}")
     emit(FAILURE)
diff --git a/Lib/re/_properties.py b/Lib/re/_properties.py
index 6310aa7fa88f955..836bb8c91372eef 100644
--- a/Lib/re/_properties.py
+++ b/Lib/re/_properties.py
@@ -26,7 +26,7 @@
 #
 
 from ._constants import (
-    IN, CATEGORY, NEGATE, RANGE, LITERAL,
+    IN, CATEGORY, CATEGORY_UCD, NEGATE, RANGE, LITERAL,
     CATEGORY_DIGIT, CATEGORY_NOT_DIGIT,
     CATEGORY_SPACE, CATEGORY_NOT_SPACE,
     CATEGORY_WORD, CATEGORY_NOT_WORD,
@@ -159,6 +159,16 @@
     (0x3008, 0x3020), (0x3030, 0x3030), (0xFD3E, 0xFD3F), (0xFE45, 0xFE46),
 ]
 
+# Enumerated properties matched through the unicodedata capsule: normalised
+# property-name alias -> table key (as in unicodedata._ucd_re_info());
+# General_Category ("gc") has its own key set below.
+_ENUM_PROPERTIES = {
+    "bc": "bc", "bidiclass": "bc",
+    "ea": "ea", "eastasianwidth": "ea",
+    "gcb": "gcb", "graphemeclusterbreak": "gcb",
+    "incb": "incb", "indicconjunctbreak": "incb",
+}
+
 # Normalised property names that introduce a General_Category value.  A bare
 # \p{Lu} is shorthand for \p{gc=Lu} (UTS #18 1.2.4, "Property Syntax").
 _GC_KEYS = frozenset({"gc", "generalcategory"})
@@ -231,6 +241,44 @@ def _from_ranges(ranges, negate):
     return (IN, items)
 
+
+# Properties matched in C through the unicodedata capsule (see
+# Modules/_sre/sre.c).  Lazily loaded: {prop_key: (prop_id, vmap)} where vmap
+# maps a normalised value name to the index the capsule returns.
+_ucd_info = None
+
+
+def _ucd_property_info():
+    global _ucd_info
+    if _ucd_info is None:
+        import unicodedata
+        info = {}
+        for key, (prop_id, names) in unicodedata._ucd_re_info().items():
+            vmap = None
+            if names is not None:
+                vmap = {}
+                for index, name in enumerate(names):
+                    vmap.setdefault(_normalize(name), index)
+            info[key] = (prop_id, vmap)
+        _ucd_info = info
+    return _ucd_info
+
+
+def _ucd_item(prop_id, index, negate):
+    # A single CATEGORY_UCD charset item.  The operand packs negate (bit 24),
+    # property (bits 16..23) and value (bits 0..15); see sre_category_ucd.
+    return (CATEGORY_UCD, (negate << 24) | (prop_id << 16) | index)
+
+
+def _ucd_enum(table_key, value, negate):
+    # A capsule-backed enumerated property value, or None when the property
+    # has no value-name table (vmap is None) or the value name is unknown.
+    prop_id, vmap = _ucd_property_info()[table_key]
+    index = None if vmap is None else vmap.get(value)
+    if index is None:
+        return None
+    return (IN, [_ucd_item(prop_id, index, negate)])
+
+
 def _general_category(value, negate):
     # Resolve a General_Category value to a subpattern using an engine category
     # or a fixed range set; unsupported values return None.
@@ -249,6 +297,15 @@ def _truth(value):
     return None
 
+
+def _binary_property(key, negate):
+    # Resolve a binary property to an engine category or a fixed range set;
+    # unknown names return None.
+    cat = _CATEGORY_PROPERTIES.get(key)
+    if cat is not None:
+        return (IN, [(CATEGORY, cat[1] if negate else cat[0])])
+    return _from_ranges(_analytic_ranges().get(key), negate)
+
+
 def parse_property(name, negate):
     """Parse the text inside \\p{...} / \\P{...}.
 
@@ -259,21 +316,18 @@ def parse_property(name, negate):
         key = _normalize(prop)
         if key in _GC_KEYS:
             return _general_category(_normalize(value), negate)
+        table = _ENUM_PROPERTIES.get(key)
+        if table is not None:
+            return _ucd_enum(table, _normalize(value), negate)
         # A binary property spelled name=yes or name=no.
truth = _truth(value) if truth is None: return None negate ^= not truth - cat = _CATEGORY_PROPERTIES.get(key) - if cat is not None: - return (IN, [(CATEGORY, cat[1] if negate else cat[0])]) - return _from_ranges(_analytic_ranges().get(key), negate) + return _binary_property(key, negate) key = _normalize(name) - cat = _CATEGORY_PROPERTIES.get(key) - if cat is not None: - return (IN, [(CATEGORY, cat[1] if negate else cat[0])]) - ranges = _analytic_ranges().get(key) - if ranges is not None: - return _from_ranges(ranges, negate) + sub = _binary_property(key, negate) + if sub is not None: + return sub return _general_category(key, negate) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index af6e4612dcfaef5..fd8ce22153f8200 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -986,6 +986,15 @@ def test_property_escapes(self): self.assertTrue(re.fullmatch(r'\p{XID_Start=No}+', '123 ')) self.assertTrue(re.fullmatch(r'\P{XID_Start}+', '123 ')) + # Enumerated properties matched through the unicodedata capsule. + self.assertTrue(re.fullmatch(r'\p{Bidi_Class=L}+', 'abc')) + self.assertTrue(re.fullmatch(r'\p{bc=AL}+', 'ا')) # Arabic + self.assertTrue(re.fullmatch(r'\p{East_Asian_Width=W}+', '日本')) + self.assertTrue(re.fullmatch(r'\p{ea=Na}+', 'AB')) + self.assertTrue(re.fullmatch(r'\p{GCB=Extend}', '̀')) # comb. grave + self.assertTrue(re.fullmatch(r'\p{Indic_Conjunct_Break=Consonant}', + 'क')) # devanagari ka + # Binary properties from str predicates. 
self.assertTrue(re.fullmatch(r'\p{Alphabetic}+', 'fo\xf6Д日')) self.assertTrue(re.fullmatch(r'\p{Lowercase}+', 'abc')) diff --git a/Misc/NEWS.d/next/Library/2026-06-22-12-00-00.gh-issue-95555.Pr0p18.rst b/Misc/NEWS.d/next/Library/2026-06-22-12-00-00.gh-issue-95555.Pr0p18.rst index fa792cae5ec0761..ae68bda1eef5ad8 100644 --- a/Misc/NEWS.d/next/Library/2026-06-22-12-00-00.gh-issue-95555.Pr0p18.rst +++ b/Misc/NEWS.d/next/Library/2026-06-22-12-00-00.gh-issue-95555.Pr0p18.rst @@ -1,4 +1,6 @@ Regular expressions now support Unicode property escapes ``\p{...}`` and -``\P{...}`` for properties that the engine can resolve without the unicodedata -database: many ``General_Category`` values, a number of binary properties, the -POSIX compatibility classes, and properties derivable from the code point. +``\P{...}`` for properties that can be examined from Python, including the +``General_Category``, ``Bidi_Class``, ``East_Asian_Width``, +``Grapheme_Cluster_Break`` and ``Indic_Conjunct_Break`` properties, a number +of binary properties, the POSIX compatibility classes, and properties +derivable from the code point. diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index 7d03b909226f244..872036808eae983 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -47,6 +47,7 @@ static const char copyright[] = #include "pycore_tuple.h" // _PyTuple_FromPairSteal #include "pycore_unicodeobject.h" // _PyUnicode_Copy #include "pycore_unicodectype.h" // _PyUnicode_IsXidStart() +#include "pycore_unicodedata_re.h" // _PyUnicode_RE_CAPI #include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() #include "sre.h" // SRE_CODE @@ -224,6 +225,29 @@ static unsigned int sre_upper_unicode(unsigned int ch) return (unsigned int) Py_UNICODE_TOUPPER(ch); } +/* Match a character against a unicodedata property via the capsule (see + pycore_unicodedata_re.h). The operand packs the negate flag (bit 24), + the property selector (bits 16..23) and the value index (bits 0..15). 
+   Compiling \p{...} imports unicodedata, so the capsule import normally
+   succeeds (on failure nothing matches); the pointer is cached statically. */
+static int
+sre_category_ucd(SRE_CODE packed, Py_UCS4 ch)
+{
+    static _PyUnicode_RE_CAPI *capi = NULL;
+    if (capi == NULL) {
+        capi = (_PyUnicode_RE_CAPI *)PyCapsule_Import(
+            PyUnicodeData_RE_CAPSULE_NAME, 0);
+        if (capi == NULL) {
+            PyErr_Clear();
+            return 0;
+        }
+    }
+    int found = capi->property((packed >> 16) & 0xFF, ch);
+    int negate = (packed >> 24) & 1;
+    /* found < 0 is an unknown selector: never a match, even when negated. */
+    return found >= 0 && ((found == (int)(packed & 0xFFFF)) ^ negate);
+}
+
 LOCAL(int)
 sre_category(SRE_CODE category, unsigned int ch)
 {
@@ -2121,6 +2145,13 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)
             }
             break;
 
+        case SRE_OP_CATEGORY_UCD:
+            /* <CATEGORY_UCD> <packed> */
+            /* Any operand is memory-safe: an unknown property simply does
+               not match (see sre_category_ucd). */
+            GET_ARG;
+            break;
+
         default:
             FAIL;
 
diff --git a/Modules/_sre/sre_constants.h b/Modules/_sre/sre_constants.h
index d5ecefb37647555..f2ba067cbd07393 100644
--- a/Modules/_sre/sre_constants.h
+++ b/Modules/_sre/sre_constants.h
@@ -11,7 +11,7 @@
  * See the sre.c file for information on usage and redistribution.
*/ -#define SRE_MAGIC 20260622 +#define SRE_MAGIC 20260626 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -55,6 +55,7 @@ #define SRE_OP_LITERAL_UNI_IGNORE 40 #define SRE_OP_NOT_LITERAL_UNI_IGNORE 41 #define SRE_OP_RANGE_UNI_IGNORE 42 +#define SRE_OP_CATEGORY_UCD 43 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_STRING 2 diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index 6e6ae46f05a50f0..b58a5912d8f8209 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -115,6 +115,13 @@ SRE(charset)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch) set++; break; + case SRE_OP_CATEGORY_UCD: + /* */ + if (sre_category_ucd(set[0], (Py_UCS4) ch)) + return ok; + set++; + break; + case SRE_OP_CHARSET: /* */ if (ch < 256 && @@ -731,6 +738,17 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) ptr++; DISPATCH; + TARGET(SRE_OP_CATEGORY_UCD): + /* match at given unicodedata property */ + /* */ + TRACE(("|%p|%p|CATEGORY_UCD %d\n", pattern, + ptr, *pattern)); + if (ptr >= end || !sre_category_ucd(pattern[0], (Py_UCS4) ptr[0])) + RETURN_FAILURE; + pattern++; + ptr++; + DISPATCH; + TARGET(SRE_OP_ANY): /* match anything (except a newline) */ /* */ diff --git a/Modules/_sre/sre_targets.h b/Modules/_sre/sre_targets.h index 62761a0000d8368..642d2f824238be6 100644 --- a/Modules/_sre/sre_targets.h +++ b/Modules/_sre/sre_targets.h @@ -11,7 +11,7 @@ * See the sre.c file for information on usage and redistribution. 
*/ -static void *sre_targets[43] = { +static void *sre_targets[44] = { &&TARGET_SRE_OP_FAILURE, &&TARGET_SRE_OP_SUCCESS, &&TARGET_SRE_OP_ANY, @@ -55,4 +55,5 @@ static void *sre_targets[43] = { &&TARGET_SRE_OP_LITERAL_UNI_IGNORE, &&TARGET_SRE_OP_NOT_LITERAL_UNI_IGNORE, &&TARGET_SRE_OP_RANGE_UNI_IGNORE, + &&TARGET_SRE_OP_CATEGORY_UCD, }; diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 5f0e7ab6ec220d6..94b4e35e1a251c3 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -19,6 +19,7 @@ #include "Python.h" #include "pycore_object.h" // _PyObject_VisitType() #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI +#include "pycore_unicodedata_re.h" // _PyUnicode_RE_CAPI #include "pycore_unicodectype.h" // _PyUnicode_IsXidStart() #include @@ -1627,6 +1628,113 @@ capi_getcode(const char* name, int namelen, Py_UCS4* code, return _check_alias_and_seq(code, with_named_seq); } +/* -------------------------------------------------------------------- */ +/* re \p{...} property-index interface (see pycore_unicodedata_re.h) */ + +static int +ucd_re_property(int prop, Py_UCS4 ch) +{ + const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch); + switch (prop) { + case _Py_UCD_RE_BC: return record->bidirectional; + case _Py_UCD_RE_EA: return record->east_asian_width; + case _Py_UCD_RE_GCB: return record->grapheme_cluster_break; + case _Py_UCD_RE_INCB: return record->incb; + } + return -1; +} + +static PyObject * +unicodedata_create_re_capi(void) +{ + static _PyUnicode_RE_CAPI capi = { + .property = ucd_re_property, + }; + return PyCapsule_New(&capi, PyUnicodeData_RE_CAPSULE_NAME, NULL); +} + +/* Build a tuple of value-name strings from a names array, stopping at the end + of the array or at a trailing NULL entry (some arrays are NULL-terminated). 
*/ +static PyObject * +ucd_re_names(const char * const *names, Py_ssize_t maxlen) +{ + Py_ssize_t count = 0; + while (count < maxlen && names[count] != NULL) { + count++; + } + PyObject *tuple = PyTuple_New(count); + if (tuple == NULL) { + return NULL; + } + for (Py_ssize_t i = 0; i < count; i++) { + PyObject *s = PyUnicode_FromString(names[i]); + if (s == NULL) { + Py_DECREF(tuple); + return NULL; + } + PyTuple_SET_ITEM(tuple, i, s); + } + return tuple; +} + +/* Add one property entry {key: (prop, names)} to the info dict. Steals + names; a NULL names (a failed constructor) is propagated as an error. */ +static int +ucd_re_add(PyObject *info, const char *key, int prop, PyObject *names) +{ + if (names == NULL) { + return -1; + } + PyObject *value = Py_BuildValue("(iN)", prop, names); + if (value == NULL) { + return -1; + } + int rc = PyDict_SetItemString(info, key, value); + Py_DECREF(value); + return rc; +} + +/* Private helper for re._properties: returns + {prop_key: (prop_id, value_names_tuple_or_None)}; the names tuple maps + each value index to its name. 
*/ +static PyObject * +unicodedata_ucd_re_info(PyObject *module, PyObject *Py_UNUSED(ignored)) +{ + PyObject *info = PyDict_New(); + if (info == NULL) { + return NULL; + } + static const struct { + const char *key; + int prop; + const char * const *names; + Py_ssize_t maxlen; + } enumerated[] = { + {"bc", _Py_UCD_RE_BC, _PyUnicode_BidirectionalNames, + Py_ARRAY_LENGTH(_PyUnicode_BidirectionalNames)}, + {"ea", _Py_UCD_RE_EA, _PyUnicode_EastAsianWidthNames, + Py_ARRAY_LENGTH(_PyUnicode_EastAsianWidthNames)}, + {"gcb", _Py_UCD_RE_GCB, _PyUnicode_GraphemeBreakNames, + Py_ARRAY_LENGTH(_PyUnicode_GraphemeBreakNames)}, + {"incb", _Py_UCD_RE_INCB, _PyUnicode_IndicConjunctBreakNames, + Py_ARRAY_LENGTH(_PyUnicode_IndicConjunctBreakNames)}, + }; + for (size_t i = 0; i < Py_ARRAY_LENGTH(enumerated); i++) { + if (ucd_re_add(info, enumerated[i].key, enumerated[i].prop, + ucd_re_names(enumerated[i].names, + enumerated[i].maxlen)) < 0) { + goto error; + } + } + + return info; +error: + Py_DECREF(info); + return NULL; +} + +/* -------------------------------------------------------------------- */ + static PyObject * unicodedata_create_capi(void) { @@ -2236,6 +2344,10 @@ unicodedata_extended_pictographic_impl(PyObject *module, int chr) // an UCD instance. static PyMethodDef unicodedata_functions[] = { // Module only functions. 
+ {"_ucd_re_info", unicodedata_ucd_re_info, METH_NOARGS, + PyDoc_STR("_ucd_re_info()\n--\n\n" + "Private helper for re: property selectors and value names " + "for the _ucd_re_CAPI capsule.")}, UNICODEDATA_BLOCK_METHODDEF UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF @@ -2368,6 +2480,9 @@ unicodedata_exec(PyObject *module) if (PyModule_Add(module, "_ucnhash_CAPI", unicodedata_create_capi()) < 0) { return -1; } + if (PyModule_Add(module, "_ucd_re_CAPI", unicodedata_create_re_capi()) < 0) { + return -1; + } return 0; } diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index 6e18593ad698570..1e797e0b39aebca 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -254,6 +254,7 @@ Modules/_lsprof.c - callback_table - Modules/_pickle.c - READ_WHOLE_LINE - Modules/_sqlite/module.c - error_codes - Modules/_sre/sre.c pattern_repr flag_names - +Modules/_sre/sre.c sre_category_ucd capi - # XXX I'm pretty sure this is actually constant: Modules/_sre/sre_targets.h - sre_targets - Modules/_sre.c pattern_repr flag_names - @@ -328,6 +329,8 @@ Modules/pyexpat.c - handler_info - Modules/termios.c - termios_constants - Modules/timemodule.c init_timezone YEAR - Modules/unicodedata.c unicodedata_create_capi capi - +Modules/unicodedata.c unicodedata_create_re_capi capi - +Modules/unicodedata.c unicodedata_ucd_re_info enumerated - Objects/bytearrayobject.c - _PyByteArray_empty_string - Objects/complexobject.c - c_1 - Objects/exceptions.c - static_exceptions -