python · serhiy-storchaka · Jul 4, 2026
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
@@ -696,6 +696,9 @@ character ``'$'``.
      The supported values are the groups ``L``, ``N``, ``Z`` and ``C`` and the
      values ``Lu``, ``Lt``, ``Lm``, ``Nd``, ``Nl``, ``No``, ``Zs``, ``Zl``,
      ``Zp``, ``Cc``, ``Cf``, ``Cs``, ``Co`` and ``Cn``.
+   * The enumerated properties ``Bidi_Class`` (``bc``),
+     ``East_Asian_Width`` (``ea``), ``Grapheme_Cluster_Break`` (``gcb``)
+     and ``Indic_Conjunct_Break`` (``incb``), for example ``\p{bc=AL}``.
    * The binary properties ``XID_Start``, ``XID_Continue``, ``Alphabetic``,
      ``Lowercase``, ``Uppercase``, ``Numeric``, ``Printable``, ``Cased`` and
      ``Case_Ignorable``.  A binary property may also be spelled

diff --git a/Include/internal/pycore_unicodedata_re.h b/Include/internal/pycore_unicodedata_re.h
@@ -0,0 +1,38 @@
+/* unicodedata property-index interface for the re module's \p{...} matcher.
+
+   The capsule exposes the value index that unicodedata stores for each
+   character property (the same index returned by unicodedata._ucd_re_info()),
+   so the SRE engine can match \p{...} in C without a per-character Python call
+   into unicodedata.  See Lib/re/_properties.py and Modules/_sre/sre.c. */
+
+#ifndef Py_INTERNAL_UNICODEDATA_RE_H
+#define Py_INTERNAL_UNICODEDATA_RE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#define PyUnicodeData_RE_CAPSULE_NAME "unicodedata._ucd_re_CAPI"
+
+/* Property selectors.  Private to unicodedata and _sre; the numbering only
+   needs to be consistent within a single build. */
+enum {                        /* a selector occupies 8 operand bits (sre.c) */
+    _Py_UCD_RE_BC = 0,        /* Bidi_Class */
+    _Py_UCD_RE_EA,            /* East_Asian_Width */
+    _Py_UCD_RE_GCB,           /* Grapheme_Cluster_Break */
+    _Py_UCD_RE_INCB,          /* Indic_Conjunct_Break */
+};
+
+typedef struct {
+    /* Return the value index of property prop for character ch, matching the
+       indices in unicodedata._ucd_re_info(); -1 for an unknown property. */
+    int (*property)(int prop, Py_UCS4 ch);
+} _PyUnicode_RE_CAPI;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_UNICODEDATA_RE_H */
diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py
@@ -288,6 +288,10 @@ def print_2(*args):
                 arg = str(CHCODES[arg])
                 assert arg[:9] == 'CATEGORY_'
                 print_(op, arg[9:])
+            elif op is CATEGORY_UCD:
+                arg = code[i]
+                i += 1
+                print_(op, '%#x' % arg)
             elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
                 skip = code[i]
                 print_(op, skip, to=i+skip)

diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py
@@ -13,7 +13,7 @@
 
 # update when constants are added or removed
 
-MAGIC = 20260622
+MAGIC = 20260626
 
 from _sre import MAXREPEAT, MAXGROUPS  # noqa: F401
 
@@ -120,6 +120,10 @@ def _makecodes(*names):
     'NOT_LITERAL_UNI_IGNORE',
     'RANGE_UNI_IGNORE',
 
+    # Matches a character by a unicodedata property, via the capsule (see
+    # _properties.py and sre.c).  The operand packs negate, property and value.
+    'CATEGORY_UCD',
+
     # The following opcodes are only occurred in the parser output,
     # but not in the compiled code.
     'MIN_REPEAT', 'MAX_REPEAT',

diff --git a/Lib/re/_optimizer.py b/Lib/re/_optimizer.py
@@ -51,6 +51,9 @@ def _compile_charset(charset, flags, code):
                 emit(CH_UNICODE[av])
             else:
                 emit(av)
+        elif op is CATEGORY_UCD:
+            # av already packs negate, property and value (flag-independent).
+            emit(av)
         else:
             raise PatternError(f"internal: unsupported set operator {op!r}")
     emit(FAILURE)

diff --git a/Lib/re/_properties.py b/Lib/re/_properties.py
@@ -26,7 +26,7 @@
 #
 
 from ._constants import (
-    IN, CATEGORY, NEGATE, RANGE, LITERAL,
+    IN, CATEGORY, CATEGORY_UCD, NEGATE, RANGE, LITERAL,
     CATEGORY_DIGIT, CATEGORY_NOT_DIGIT,
     CATEGORY_SPACE, CATEGORY_NOT_SPACE,
     CATEGORY_WORD, CATEGORY_NOT_WORD,
@@ -159,6 +159,16 @@
     (0x3008, 0x3020), (0x3030, 0x3030), (0xFD3E, 0xFD3F), (0xFE45, 0xFE46),
 ]
 
+# Enumerated properties matched through the unicodedata capsule: normalised
+# property-name alias -> table key (as in unicodedata._ucd_re_info());
+# General_Category ("gc") has its own key set below.
+_ENUM_PROPERTIES = {
+    "bc": "bc",     "bidiclass": "bc",
+    "ea": "ea",     "eastasianwidth": "ea",
+    "gcb": "gcb",   "graphemeclusterbreak": "gcb",
+    "incb": "incb", "indicconjunctbreak": "incb",
+}
+
 # Normalised property names that introduce a General_Category value.  A bare
 # \p{Lu} is shorthand for \p{gc=Lu} (UTS #18 1.2.4, "Property Syntax").
 _GC_KEYS = frozenset({"gc", "generalcategory"})
@@ -231,6 +241,44 @@ def _from_ranges(ranges, negate):
     return (IN, items)
 
 
+# Properties matched in C through the unicodedata capsule (see
+# Modules/_sre/sre.c).  Lazily loaded: {prop_key: (prop_id, vmap)} where vmap
+# maps a normalised value name to the index the capsule returns.
+_ucd_info = None
+
+
+def _ucd_property_info():
+    global _ucd_info  # module-level cache, built on the first call
+    if _ucd_info is None:
+        import unicodedata  # deferred: only needed once a property is used
+        info = {}
+        for key, (prop_id, names) in unicodedata._ucd_re_info().items():
+            vmap = None  # stays None when no value names are listed
+            if names is not None:
+                vmap = {}
+                for index, name in enumerate(names):
+                    vmap.setdefault(_normalize(name), index)  # first wins
+            info[key] = (prop_id, vmap)
+        _ucd_info = info  # stored only after the table is fully built
+    return _ucd_info
+
+
+def _ucd_item(prop_id, index, negate):
+    # A single CATEGORY_UCD charset item.  The operand packs negate (bit 24),
+    # prop_id (bits 16-23) and index (bits 0-15); see sre_category_ucd, sre.c.
+    return (CATEGORY_UCD, (negate << 24) | (prop_id << 16) | index)
+
+
+def _ucd_enum(table_key, value, negate):
+    # An (IN, [item]) subpattern for a capsule-backed enumerated property;
+    # value must already be normalised.  None if the value is unknown.
+    prop_id, vmap = _ucd_property_info()[table_key]
+    index = vmap.get(value)
+    if index is None:
+        return None
+    return (IN, [_ucd_item(prop_id, index, negate)])
+
+
 def _general_category(value, negate):
     # Resolve a General_Category value to a subpattern using an engine category
     # or a fixed range set; unsupported values return None.
@@ -249,6 +297,15 @@ def _truth(value):
     return None
 
 
+def _binary_property(key, negate):
+    # Resolve a binary property (a yes/no value is already folded into
+    # negate) to an engine category or fixed range set; None if unknown.
+    cat = _CATEGORY_PROPERTIES.get(key)
+    if cat is not None:
+        return (IN, [(CATEGORY, cat[1] if negate else cat[0])])
+    return _from_ranges(_analytic_ranges().get(key), negate)
+
+
 def parse_property(name, negate):
     """Parse the text inside \\p{...} / \\P{...}.
 
@@ -259,21 +316,18 @@ def parse_property(name, negate):
         key = _normalize(prop)
         if key in _GC_KEYS:
             return _general_category(_normalize(value), negate)
+        table = _ENUM_PROPERTIES.get(key)
+        if table is not None:
+            return _ucd_enum(table, _normalize(value), negate)
         # A binary property spelled name=yes or name=no.
         truth = _truth(value)
         if truth is None:
             return None
         negate ^= not truth
-        cat = _CATEGORY_PROPERTIES.get(key)
-        if cat is not None:
-            return (IN, [(CATEGORY, cat[1] if negate else cat[0])])
-        return _from_ranges(_analytic_ranges().get(key), negate)
+        return _binary_property(key, negate)
 
     key = _normalize(name)
-    cat = _CATEGORY_PROPERTIES.get(key)
-    if cat is not None:
-        return (IN, [(CATEGORY, cat[1] if negate else cat[0])])
-    ranges = _analytic_ranges().get(key)
-    if ranges is not None:
-        return _from_ranges(ranges, negate)
+    sub = _binary_property(key, negate)
+    if sub is not None:
+        return sub
     return _general_category(key, negate)
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
@@ -986,6 +986,15 @@ def test_property_escapes(self):
         self.assertTrue(re.fullmatch(r'\p{XID_Start=No}+', '123 '))
         self.assertTrue(re.fullmatch(r'\P{XID_Start}+', '123 '))
 
+        # Enumerated properties matched through the unicodedata capsule.
+        self.assertTrue(re.fullmatch(r'\p{Bidi_Class=L}+', 'abc'))
+        self.assertTrue(re.fullmatch(r'\p{bc=AL}+', 'ا'))           # Arabic
+        self.assertTrue(re.fullmatch(r'\p{East_Asian_Width=W}+', '日本'))
+        self.assertTrue(re.fullmatch(r'\p{ea=Na}+', 'AB'))
+        self.assertTrue(re.fullmatch(r'\p{GCB=Extend}', '̀'))  # comb. grave
+        self.assertTrue(re.fullmatch(r'\p{Indic_Conjunct_Break=Consonant}',
+                                     'क'))                # devanagari ka
+
         # Binary properties from str predicates.
         self.assertTrue(re.fullmatch(r'\p{Alphabetic}+', 'fo\xf6Д日'))
         self.assertTrue(re.fullmatch(r'\p{Lowercase}+', 'abc'))

diff --git a/Misc/NEWS.d/next/Library/2026-06-22-12-00-00.gh-issue-95555.Pr0p18.rst b/Misc/NEWS.d/next/Library/2026-06-22-12-00-00.gh-issue-95555.Pr0p18.rst
@@ -1,4 +1,6 @@
 Regular expressions now support Unicode property escapes ``\p{...}`` and
-``\P{...}`` for properties that the engine can resolve without the unicodedata
-database: many ``General_Category`` values, a number of binary properties, the
-POSIX compatibility classes, and properties derivable from the code point.
+``\P{...}`` for properties that can be examined from Python, including the
+``General_Category``, ``Bidi_Class``, ``East_Asian_Width``,
+``Grapheme_Cluster_Break`` and ``Indic_Conjunct_Break`` properties, a number
+of binary properties, the POSIX compatibility classes, and properties
+derivable from the code point.
diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c
@@ -47,6 +47,7 @@ static const char copyright[] =
 #include "pycore_tuple.h"            // _PyTuple_FromPairSteal
 #include "pycore_unicodeobject.h"    // _PyUnicode_Copy
 #include "pycore_unicodectype.h"     // _PyUnicode_IsXidStart()
+#include "pycore_unicodedata_re.h"   // _PyUnicode_RE_CAPI
 #include "pycore_weakref.h"          // FT_CLEAR_WEAKREFS()
 
 #include "sre.h"                     // SRE_CODE
@@ -224,6 +225,29 @@ static unsigned int sre_upper_unicode(unsigned int ch)
     return (unsigned int) Py_UNICODE_TOUPPER(ch);
 }
 
+/* Match a character against a unicodedata property via the capsule (see
+   pycore_unicodedata_re.h).  The operand packs the negate flag (bit 24),
+   the property selector (bits 16..23) and the value index (bits 0..15).
+   Compiling \p{...} imports unicodedata, so the capsule import here always
+   succeeds; the pointer is cached statically. */
+static int
+sre_category_ucd(SRE_CODE packed, Py_UCS4 ch)
+{
+    static _PyUnicode_RE_CAPI *capi = NULL;
+    if (capi == NULL) {
+        capi = (_PyUnicode_RE_CAPI *)PyCapsule_Import(
+                                        PyUnicodeData_RE_CAPSULE_NAME, 0);
+        if (capi == NULL) {
+            PyErr_Clear();
+            return 0;              /* defensive: no match, even if negated */
+        }
+    }
+    int negate = (packed >> 24) & 1;
+    int prop = (packed >> 16) & 0xFF;
+    int value = (int)(packed & 0xFFFF);   /* 0..0xFFFF: never equals -1 */
+    return (capi->property(prop, ch) == value) ^ negate;
+}
+
 LOCAL(int)
 sre_category(SRE_CODE category, unsigned int ch)
 {
@@ -2121,6 +2145,13 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)
             }
             break;
 
+        case SRE_OP_CATEGORY_UCD:
+            /* <CATEGORY_UCD> <packed> */
+            /* Any operand is memory-safe: an unknown property never matches,
+               or always matches if negated (see sre_category_ucd). */
+            GET_ARG;
+            break;
+
         default:
             FAIL;
 

diff --git a/Modules/_sre/sre_constants.h b/Modules/_sre/sre_constants.h
@@ -11,7 +11,7 @@
  * See the sre.c file for information on usage and redistribution.
  */
 
-#define SRE_MAGIC 20260622
+#define SRE_MAGIC 20260626
 #define SRE_OP_FAILURE 0
 #define SRE_OP_SUCCESS 1
 #define SRE_OP_ANY 2
@@ -55,6 +55,7 @@
 #define SRE_OP_LITERAL_UNI_IGNORE 40
 #define SRE_OP_NOT_LITERAL_UNI_IGNORE 41
 #define SRE_OP_RANGE_UNI_IGNORE 42
+#define SRE_OP_CATEGORY_UCD 43
 #define SRE_AT_BEGINNING 0
 #define SRE_AT_BEGINNING_LINE 1
 #define SRE_AT_BEGINNING_STRING 2

diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h
@@ -115,6 +115,13 @@ SRE(charset)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch)
             set++;
             break;
 
+        case SRE_OP_CATEGORY_UCD:
+            /* <CATEGORY_UCD> <packed> */
+            if (sre_category_ucd(set[0], (Py_UCS4) ch))
+                return ok;
+            set++;
+            break;
+
         case SRE_OP_CHARSET:
             /* <CHARSET> <bitmap> */
             if (ch < 256 &&
@@ -731,6 +738,17 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
             ptr++;
             DISPATCH;
 
+        TARGET(SRE_OP_CATEGORY_UCD):
+            /* match at given unicodedata property */
+            /* <CATEGORY_UCD> <packed> */
+            TRACE(("|%p|%p|CATEGORY_UCD %d\n", pattern,
+                   ptr, *pattern));
+            if (ptr >= end || !sre_category_ucd(pattern[0], (Py_UCS4) ptr[0]))
+                RETURN_FAILURE;
+            pattern++;
+            ptr++;
+            DISPATCH;
+
         TARGET(SRE_OP_ANY):
             /* match anything (except a newline) */
             /* <ANY> */

diff --git a/Modules/_sre/sre_targets.h b/Modules/_sre/sre_targets.h
@@ -11,7 +11,7 @@
  * See the sre.c file for information on usage and redistribution.
  */
 
-static void *sre_targets[43] = {
+static void *sre_targets[44] = {
     &&TARGET_SRE_OP_FAILURE,
     &&TARGET_SRE_OP_SUCCESS,
     &&TARGET_SRE_OP_ANY,
@@ -55,4 +55,5 @@ static void *sre_targets[43] = {
     &&TARGET_SRE_OP_LITERAL_UNI_IGNORE,
     &&TARGET_SRE_OP_NOT_LITERAL_UNI_IGNORE,
     &&TARGET_SRE_OP_RANGE_UNI_IGNORE,
+    &&TARGET_SRE_OP_CATEGORY_UCD,
 };