Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Doc/library/re.rst
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,9 @@ character ``'$'``.
The supported values are the groups ``L``, ``N``, ``Z`` and ``C`` and the
values ``Lu``, ``Lt``, ``Lm``, ``Nd``, ``Nl``, ``No``, ``Zs``, ``Zl``,
``Zp``, ``Cc``, ``Cf``, ``Cs``, ``Co`` and ``Cn``.
* The enumerated properties ``Bidi_Class`` (``bc``),
``East_Asian_Width`` (``ea``), ``Grapheme_Cluster_Break`` (``gcb``)
and ``Indic_Conjunct_Break`` (``incb``), for example ``\p{bc=AL}``.
* The binary properties ``XID_Start``, ``XID_Continue``, ``Alphabetic``,
``Lowercase``, ``Uppercase``, ``Numeric``, ``Printable``, ``Cased`` and
``Case_Ignorable``. A binary property may also be spelled
Expand Down
38 changes: 38 additions & 0 deletions Include/internal/pycore_unicodedata_re.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/* unicodedata property-index interface for the re module's \p{...} matcher.

The capsule exposes the value index that unicodedata stores for each
character property (the same index returned by unicodedata._ucd_re_info()),
so the SRE engine can match \p{...} in C without a per-character Python call
into unicodedata. See Lib/re/_properties.py and Modules/_sre/sre.c. */

#ifndef Py_INTERNAL_UNICODEDATA_RE_H
#define Py_INTERNAL_UNICODEDATA_RE_H
#ifdef __cplusplus
extern "C" {
#endif

#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif

#define PyUnicodeData_RE_CAPSULE_NAME "unicodedata._ucd_re_CAPI"

/* Property selectors: the `prop` argument of _PyUnicode_RE_CAPI.property.
Private to unicodedata and _sre; the numbering only needs to be consistent
within a single build.
sre_category_ucd() in Modules/_sre/sre.c extracts the selector from an 8-bit
field (bits 16..23) of the CATEGORY_UCD operand, so every selector must stay
below 256.
NOTE(review): presumably these are the property ids that
unicodedata._ucd_re_info() reports to Lib/re/_properties.py -- confirm against
unicodedata.c. */
enum {
_Py_UCD_RE_BC = 0, /* Bidi_Class */
_Py_UCD_RE_EA, /* East_Asian_Width */
_Py_UCD_RE_GCB, /* Grapheme_Cluster_Break */
_Py_UCD_RE_INCB, /* Indic_Conjunct_Break */
};

/* Function table behind the PyUnicodeData_RE_CAPSULE_NAME capsule; _sre
fetches it with PyCapsule_Import() (see sre_category_ucd in
Modules/_sre/sre.c). */
typedef struct {
/* Return the value index of property prop for character ch, matching the
indices in unicodedata._ucd_re_info(); -1 for an unknown property.
The caller in _sre compares the result against a value index read from the
low 16 bits of the pattern operand, so usable indices lie in 0..0xFFFF. */
int (*property)(int prop, Py_UCS4 ch);
} _PyUnicode_RE_CAPI;

#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_UNICODEDATA_RE_H */
4 changes: 4 additions & 0 deletions Lib/re/_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,10 @@ def print_2(*args):
arg = str(CHCODES[arg])
assert arg[:9] == 'CATEGORY_'
print_(op, arg[9:])
elif op is CATEGORY_UCD:
arg = code[i]
i += 1
print_(op, '%#x' % arg)
elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
skip = code[i]
print_(op, skip, to=i+skip)
Expand Down
6 changes: 5 additions & 1 deletion Lib/re/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

# update when constants are added or removed

MAGIC = 20260622
MAGIC = 20260626

from _sre import MAXREPEAT, MAXGROUPS # noqa: F401

Expand Down Expand Up @@ -120,6 +120,10 @@ def _makecodes(*names):
'NOT_LITERAL_UNI_IGNORE',
'RANGE_UNI_IGNORE',

# Matches a character by a unicodedata property, via the capsule (see
# _properties.py and sre.c). The operand packs negate, property and value.
'CATEGORY_UCD',

# The following opcodes only occur in the parser output,
# but not in the compiled code.
'MIN_REPEAT', 'MAX_REPEAT',
Expand Down
3 changes: 3 additions & 0 deletions Lib/re/_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ def _compile_charset(charset, flags, code):
emit(CH_UNICODE[av])
else:
emit(av)
elif op is CATEGORY_UCD:
# av already packs negate, property and value (flag-independent).
emit(av)
else:
raise PatternError(f"internal: unsupported set operator {op!r}")
emit(FAILURE)
Expand Down
76 changes: 65 additions & 11 deletions Lib/re/_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#

from ._constants import (
IN, CATEGORY, NEGATE, RANGE, LITERAL,
IN, CATEGORY, CATEGORY_UCD, NEGATE, RANGE, LITERAL,
CATEGORY_DIGIT, CATEGORY_NOT_DIGIT,
CATEGORY_SPACE, CATEGORY_NOT_SPACE,
CATEGORY_WORD, CATEGORY_NOT_WORD,
Expand Down Expand Up @@ -159,6 +159,16 @@
(0x3008, 0x3020), (0x3030, 0x3030), (0xFD3E, 0xFD3F), (0xFE45, 0xFE46),
]

# Enumerated properties matched through the unicodedata capsule: normalised
# property-name alias -> table key (as in unicodedata._ucd_re_info());
# General_Category ("gc") has its own key set below.
_ENUM_PROPERTIES = {
"bc": "bc", "bidiclass": "bc",
"ea": "ea", "eastasianwidth": "ea",
"gcb": "gcb", "graphemeclusterbreak": "gcb",
"incb": "incb", "indicconjunctbreak": "incb",
}

# Normalised property names that introduce a General_Category value. A bare
# \p{Lu} is shorthand for \p{gc=Lu} (UTS #18 1.2.4, "Property Syntax").
_GC_KEYS = frozenset({"gc", "generalcategory"})
Expand Down Expand Up @@ -231,6 +241,44 @@ def _from_ranges(ranges, negate):
return (IN, items)


# Properties matched in C through the unicodedata capsule (see
# Modules/_sre/sre.c). Lazily loaded: {prop_key: (prop_id, vmap)} where vmap
# maps a normalised value name to the index the capsule returns.
_ucd_info = None


def _ucd_property_info():
    # Return the capsule property table, building and caching it in the
    # module-level _ucd_info on first use.  The table maps each key reported
    # by unicodedata._ucd_re_info() to (prop_id, vmap); vmap maps a normalised
    # value name to its position in the reported names, or is None when the
    # property reports no names.
    global _ucd_info
    if _ucd_info is not None:
        return _ucd_info

    # Deferred import: the table is loaded lazily, on first request.
    import unicodedata

    table = {}
    raw = unicodedata._ucd_re_info()
    for prop_key, (prop_id, value_names) in raw.items():
        if value_names is None:
            table[prop_key] = (prop_id, None)
            continue
        by_name = {}
        for position, value_name in enumerate(value_names):
            normalised = _normalize(value_name)
            # Names that normalise alike keep the first position seen.
            if normalised not in by_name:
                by_name[normalised] = position
        table[prop_key] = (prop_id, by_name)

    # Cache only the completed table.
    _ucd_info = table
    return table


def _ucd_item(prop_id, index, negate):
    # Build one CATEGORY_UCD charset item.  The operand is a single integer
    # laid out the way sre_category_ucd (sre.c) unpacks it: value index in the
    # low bits, property selector from bit 16, negate flag at bit 24.
    operand = index
    operand |= prop_id << 16
    operand |= negate << 24
    return (CATEGORY_UCD, operand)


def _ucd_enum(table_key, value, negate):
    # Resolve a capsule-backed enumerated property value to a subpattern.
    #
    # table_key: a key of the table returned by _ucd_property_info().
    # value:     the normalised value name to look up.
    # negate:    true for the negated form of the escape.
    #
    # Returns (IN, [item]) holding a single CATEGORY_UCD item, or None if the
    # value is unknown for this property.
    prop_id, vmap = _ucd_property_info()[table_key]
    # _ucd_property_info() stores None instead of a mapping for a property
    # that reports no value names; no name can be known for such a property,
    # so report "unknown value" rather than failing on None.get().
    if vmap is None:
        return None
    index = vmap.get(value)
    if index is None:
        return None
    return (IN, [_ucd_item(prop_id, index, negate)])


def _general_category(value, negate):
# Resolve a General_Category value to a subpattern using an engine category
# or a fixed range set; unsupported values return None.
Expand All @@ -249,6 +297,15 @@ def _truth(value):
return None


def _binary_property(key, negate):
    # Resolve a property written without a value.  Engine categories are
    # tried first; anything else is looked up in the analytic range tables.
    # Unknown names return None.
    pair = _CATEGORY_PROPERTIES.get(key)
    if pair is None:
        # Not an engine category: hand over whatever the range lookup
        # yields (possibly None) and let _from_ranges decide.
        return _from_ranges(_analytic_ranges().get(key), negate)
    # pair holds (category, negated category).
    if negate:
        category = pair[1]
    else:
        category = pair[0]
    return (IN, [(CATEGORY, category)])


def parse_property(name, negate):
"""Parse the text inside \\p{...} / \\P{...}.

Expand All @@ -259,21 +316,18 @@ def parse_property(name, negate):
key = _normalize(prop)
if key in _GC_KEYS:
return _general_category(_normalize(value), negate)
table = _ENUM_PROPERTIES.get(key)
if table is not None:
return _ucd_enum(table, _normalize(value), negate)
# A binary property spelled name=yes or name=no.
truth = _truth(value)
if truth is None:
return None
negate ^= not truth
cat = _CATEGORY_PROPERTIES.get(key)
if cat is not None:
return (IN, [(CATEGORY, cat[1] if negate else cat[0])])
return _from_ranges(_analytic_ranges().get(key), negate)
return _binary_property(key, negate)

key = _normalize(name)
cat = _CATEGORY_PROPERTIES.get(key)
if cat is not None:
return (IN, [(CATEGORY, cat[1] if negate else cat[0])])
ranges = _analytic_ranges().get(key)
if ranges is not None:
return _from_ranges(ranges, negate)
sub = _binary_property(key, negate)
if sub is not None:
return sub
return _general_category(key, negate)
9 changes: 9 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -986,6 +986,15 @@ def test_property_escapes(self):
self.assertTrue(re.fullmatch(r'\p{XID_Start=No}+', '123 '))
self.assertTrue(re.fullmatch(r'\P{XID_Start}+', '123 '))

# Enumerated properties matched through the unicodedata capsule.
self.assertTrue(re.fullmatch(r'\p{Bidi_Class=L}+', 'abc'))
self.assertTrue(re.fullmatch(r'\p{bc=AL}+', 'ا')) # Arabic
self.assertTrue(re.fullmatch(r'\p{East_Asian_Width=W}+', '日本'))
self.assertTrue(re.fullmatch(r'\p{ea=Na}+', 'AB'))
self.assertTrue(re.fullmatch(r'\p{GCB=Extend}', '̀')) # comb. grave
self.assertTrue(re.fullmatch(r'\p{Indic_Conjunct_Break=Consonant}',
'क')) # devanagari ka

# Binary properties from str predicates.
self.assertTrue(re.fullmatch(r'\p{Alphabetic}+', 'fo\xf6Д日'))
self.assertTrue(re.fullmatch(r'\p{Lowercase}+', 'abc'))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
Regular expressions now support Unicode property escapes ``\p{...}`` and
``\P{...}`` for properties that the engine can resolve without the unicodedata
database: many ``General_Category`` values, a number of binary properties, the
POSIX compatibility classes, and properties derivable from the code point.
``\P{...}`` for properties that can be examined from Python, including the
``General_Category``, ``Bidi_Class``, ``East_Asian_Width``,
``Grapheme_Cluster_Break`` and ``Indic_Conjunct_Break`` properties, a number
of binary properties, the POSIX compatibility classes, and properties
derivable from the code point.
31 changes: 31 additions & 0 deletions Modules/_sre/sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ static const char copyright[] =
#include "pycore_tuple.h" // _PyTuple_FromPairSteal
#include "pycore_unicodeobject.h" // _PyUnicode_Copy
#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart()
#include "pycore_unicodedata_re.h" // _PyUnicode_RE_CAPI
#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS()

#include "sre.h" // SRE_CODE
Expand Down Expand Up @@ -224,6 +225,29 @@ static unsigned int sre_upper_unicode(unsigned int ch)
return (unsigned int) Py_UNICODE_TOUPPER(ch);
}

/* Match a character against a unicodedata property via the capsule (see
   pycore_unicodedata_re.h).  The operand packs the negate flag (bit 24),
   the property selector (bits 16..23) and the value index (bits 0..15).
   Compiling \p{...} imports unicodedata, so the capsule import here is
   expected to succeed; the pointer is cached statically.

   Returns 1 if ch matches and 0 otherwise.  If the capsule cannot be
   imported, or the capsule reports the property as unknown (negative
   index), the result is 0 whatever the negate flag says, so a failed lookup
   never turns into a match. */
static int
sre_category_ucd(SRE_CODE packed, Py_UCS4 ch)
{
    static _PyUnicode_RE_CAPI *capi = NULL;
    if (capi == NULL) {
        capi = (_PyUnicode_RE_CAPI *)PyCapsule_Import(
            PyUnicodeData_RE_CAPSULE_NAME, 0);
        if (capi == NULL) {
            /* The matcher has no error channel here: drop the exception
               and treat the character as not matching. */
            PyErr_Clear();
            return 0;
        }
    }
    int negate = (packed >> 24) & 1;
    int prop = (packed >> 16) & 0xFF;
    int value = (int)(packed & 0xFFFF);
    int index = capi->property(prop, ch);
    if (index < 0) {
        /* Unknown property.  Without this check the negated form would
           evaluate (-1 == value) ^ 1 and match every character. */
        return 0;
    }
    return (index == value) ^ negate;
}

LOCAL(int)
sre_category(SRE_CODE category, unsigned int ch)
{
Expand Down Expand Up @@ -2121,6 +2145,13 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)
}
break;

case SRE_OP_CATEGORY_UCD:
/* <CATEGORY_UCD> <packed> */
/* Any operand is memory-safe: an unknown property simply does
not match (see sre_category_ucd). */
GET_ARG;
break;

default:
FAIL;

Expand Down
3 changes: 2 additions & 1 deletion Modules/_sre/sre_constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* See the sre.c file for information on usage and redistribution.
*/

#define SRE_MAGIC 20260622
#define SRE_MAGIC 20260626
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
Expand Down Expand Up @@ -55,6 +55,7 @@
#define SRE_OP_LITERAL_UNI_IGNORE 40
#define SRE_OP_NOT_LITERAL_UNI_IGNORE 41
#define SRE_OP_RANGE_UNI_IGNORE 42
#define SRE_OP_CATEGORY_UCD 43
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2
Expand Down
18 changes: 18 additions & 0 deletions Modules/_sre/sre_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,13 @@ SRE(charset)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch)
set++;
break;

case SRE_OP_CATEGORY_UCD:
/* <CATEGORY_UCD> <packed> */
if (sre_category_ucd(set[0], (Py_UCS4) ch))
return ok;
set++;
break;

case SRE_OP_CHARSET:
/* <CHARSET> <bitmap> */
if (ch < 256 &&
Expand Down Expand Up @@ -731,6 +738,17 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
ptr++;
DISPATCH;

TARGET(SRE_OP_CATEGORY_UCD):
/* match at given unicodedata property */
/* <CATEGORY_UCD> <packed> */
TRACE(("|%p|%p|CATEGORY_UCD %d\n", pattern,
ptr, *pattern));
if (ptr >= end || !sre_category_ucd(pattern[0], (Py_UCS4) ptr[0]))
RETURN_FAILURE;
pattern++;
ptr++;
DISPATCH;

TARGET(SRE_OP_ANY):
/* match anything (except a newline) */
/* <ANY> */
Expand Down
3 changes: 2 additions & 1 deletion Modules/_sre/sre_targets.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* See the sre.c file for information on usage and redistribution.
*/

static void *sre_targets[43] = {
static void *sre_targets[44] = {
&&TARGET_SRE_OP_FAILURE,
&&TARGET_SRE_OP_SUCCESS,
&&TARGET_SRE_OP_ANY,
Expand Down Expand Up @@ -55,4 +55,5 @@ static void *sre_targets[43] = {
&&TARGET_SRE_OP_LITERAL_UNI_IGNORE,
&&TARGET_SRE_OP_NOT_LITERAL_UNI_IGNORE,
&&TARGET_SRE_OP_RANGE_UNI_IGNORE,
&&TARGET_SRE_OP_CATEGORY_UCD,
};
Loading
Loading