Skip to content

Commit 35aabc8

Browse files
committed
Route codepoint helpers through the capsule API
Per follow-up review on #21522, the codepoint classifiers belong with the rest of the librt.strings public surface rather than in a shared inline header, since they implement Python-visible librt.strings functions (not mypyc-internal codegen helpers like the other _extra_ops files). Move them out of codepoint_extra_ops.h and into librt_strings.c as proper non-static functions, exposed to mypyc-compiled callers via the capsule API the same way every other LibRTStrings_* function works. This keeps the librt module files independent of mypyc-internal _extra_ops headers, matching the pattern used by BytesWriter_internal etc. The cost is one indirect call per use vs. the previous static inline functions; still substantially faster than the Python method dispatch the primitives are replacing. - librt_strings.h: bump LIBRT_STRINGS_API_VERSION 4->5, LIBRT_STRINGS_API_LEN 14->19. - librt_strings_api.h: add 5 macro entries for LibRTStrings_API[14..18]. - librt_strings.c: define the 5 helpers; register them in the capsule table; drop `#include "codepoint_extra_ops.h"`. - mypyc/ir/deps.py: delete CODEPOINT_EXTRA_OPS. - mypyc/primitives/librt_strings_ops.py: drop the CODEPOINT_EXTRA_OPS dep from the five codepoint primitives. - Delete codepoint_extra_ops.{c,h}.
1 parent 472aab0 commit 35aabc8

7 files changed

Lines changed: 64 additions & 82 deletions

File tree

mypyc/ir/deps.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,5 +116,4 @@ def get_header(self) -> str:
116116
STRING_WRITER_EXTRA_OPS: Final = SourceDep("stringwriter_extra_ops.c")
117117
BYTEARRAY_EXTRA_OPS: Final = SourceDep("bytearray_extra_ops.c")
118118
STR_EXTRA_OPS: Final = SourceDep("str_extra_ops.c")
119-
CODEPOINT_EXTRA_OPS: Final = SourceDep("codepoint_extra_ops.c")
120119
VECS_EXTRA_OPS: Final = SourceDep("vecs_extra_ops.c")

mypyc/lib-rt/codepoint_extra_ops.c

Lines changed: 0 additions & 6 deletions
This file was deleted.

mypyc/lib-rt/codepoint_extra_ops.h

Lines changed: 0 additions & 51 deletions
This file was deleted.

mypyc/lib-rt/strings/librt_strings.c

Lines changed: 49 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include <Python.h>
55
#include <stdint.h>
66
#include "CPy.h"
7-
#include "codepoint_extra_ops.h"
87
#include "librt_strings.h"
98

109
#define CPY_BOOL_ERROR 2
@@ -1154,15 +1153,50 @@ read_f64_be(PyObject *module, PyObject *const *args, size_t nargs) {
11541153
return PyFloat_FromDouble(CPyBytes_ReadF64BEUnsafe(data + index));
11551154
}
11561155

1157-
// Codepoint classification helpers exposed to interpreted callers.
1158-
// The C-side names are prefixed `cp_` to avoid colliding with libc's
1159-
// <ctype.h> isspace / isdigit / etc. Compiled callers go through the
1160-
// LibRTStrings_* static inlines in codepoint_extra_ops.h instead.
1161-
//
1162-
// All wrappers parse a single int argument as i32 (codepoint) and
1163-
// dispatch to the corresponding LibRTStrings_* function. The parse
1164-
// step accepts any int but rejects values outside the i32 range with
1165-
// OverflowError, matching the input domain of the compiled fast path.
1156+
// Codepoint classification helpers. Inputs are signed i32 for compatibility
1157+
// with mypyc's int32_rprimitive; negative values are non-codepoints and
1158+
// return false. Mypyc-compiled callers reach these through the librt.strings
1159+
// capsule API (see librt_strings_api.h); interpreted callers go through the
1160+
// `cp_*` Python wrappers below.
1161+
1162+
bool LibRTStrings_IsSpace(int32_t c) {
1163+
return c >= 0 && Py_UNICODE_ISSPACE((Py_UCS4)c);
1164+
}
1165+
1166+
bool LibRTStrings_IsDigit(int32_t c) {
1167+
return c >= 0 && Py_UNICODE_ISDIGIT((Py_UCS4)c);
1168+
}
1169+
1170+
bool LibRTStrings_IsAlnum(int32_t c) {
1171+
return c >= 0 && Py_UNICODE_ISALNUM((Py_UCS4)c);
1172+
}
1173+
1174+
bool LibRTStrings_IsAlpha(int32_t c) {
1175+
return c >= 0 && Py_UNICODE_ISALPHA((Py_UCS4)c);
1176+
}
1177+
1178+
// True if c could start a valid identifier (XID_Start, per PEP 3131).
1179+
// ASCII fast path covers `[A-Za-z_]`; non-ASCII delegates to CPython's
1180+
// PyUnicode_IsIdentifier on a 1-character string. Aborts via
1181+
// CPyError_OutOfMemory on allocation failure to keep this ERR_NEVER.
1182+
bool LibRTStrings_IsIdentifier(int32_t c) {
1183+
if (c < 0) return false;
1184+
if (c < 128) {
1185+
return (c >= 'a' && c <= 'z')
1186+
|| (c >= 'A' && c <= 'Z')
1187+
|| c == '_';
1188+
}
1189+
PyObject *s = PyUnicode_FromOrdinal((int)c);
1190+
if (s == NULL) {
1191+
CPyError_OutOfMemory();
1192+
}
1193+
int r = PyUnicode_IsIdentifier(s);
1194+
Py_DECREF(s);
1195+
return r == 1;
1196+
}
1197+
1198+
// Python-level wrappers (`cp_*`) for interpreted callers. The C-side names
1199+
// are prefixed `cp_` to avoid colliding with libc's <ctype.h> isspace etc.
11661200

11671201
// Parse a Python int as i32 codepoint. Returns 0 on success and writes
11681202
// the value to *out; returns -1 on error with a Python exception set.
@@ -1317,6 +1351,11 @@ librt_strings_module_exec(PyObject *m)
13171351
(void *)StringWriter_type_internal,
13181352
(void *)StringWriter_write_internal,
13191353
(void *)grow_string_buffer,
1354+
(void *)LibRTStrings_IsSpace,
1355+
(void *)LibRTStrings_IsDigit,
1356+
(void *)LibRTStrings_IsAlnum,
1357+
(void *)LibRTStrings_IsAlpha,
1358+
(void *)LibRTStrings_IsIdentifier,
13201359
};
13211360
PyObject *c_api_object = PyCapsule_New((void *)librt_strings_api, "librt.strings._C_API", NULL);
13221361
if (PyModule_Add(m, "_C_API", c_api_object) < 0) {

mypyc/lib-rt/strings/librt_strings.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@
1313
// API version -- more recent versions must maintain backward compatibility, i.e.
1414
// we can add new features but not remove or change existing features (unless
1515
// ABI version is changed, but see the comment above).
16-
#define LIBRT_STRINGS_API_VERSION 4
16+
#define LIBRT_STRINGS_API_VERSION 5
1717

1818
// Number of functions in the capsule API. If you add a new function, also increase
1919
// LIBRT_STRINGS_API_VERSION.
20-
#define LIBRT_STRINGS_API_LEN 14
20+
#define LIBRT_STRINGS_API_LEN 19
2121

2222
typedef struct {
2323
PyObject_HEAD

mypyc/lib-rt/strings/librt_strings_api.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import_librt_strings(void);
66

77
#include <Python.h>
88
#include <stdbool.h>
9+
#include <stdint.h>
910
#include "librt_strings.h"
1011

1112
extern void *LibRTStrings_API[LIBRT_STRINGS_API_LEN];
@@ -24,6 +25,11 @@ extern void *LibRTStrings_API[LIBRT_STRINGS_API_LEN];
2425
#define LibRTStrings_StringWriter_type_internal (*(PyTypeObject* (*)(void)) LibRTStrings_API[11])
2526
#define LibRTStrings_StringWriter_write_internal (*(char (*)(PyObject *source, PyObject *value)) LibRTStrings_API[12])
2627
#define LibRTStrings_grow_string_buffer (*(bool (*)(StringWriterObject *obj, Py_ssize_t n)) LibRTStrings_API[13])
28+
#define LibRTStrings_IsSpace (*(bool (*)(int32_t c)) LibRTStrings_API[14])
29+
#define LibRTStrings_IsDigit (*(bool (*)(int32_t c)) LibRTStrings_API[15])
30+
#define LibRTStrings_IsAlnum (*(bool (*)(int32_t c)) LibRTStrings_API[16])
31+
#define LibRTStrings_IsAlpha (*(bool (*)(int32_t c)) LibRTStrings_API[17])
32+
#define LibRTStrings_IsIdentifier (*(bool (*)(int32_t c)) LibRTStrings_API[18])
2733

2834

2935
static inline bool CPyBytesWriter_Check(PyObject *obj) {

mypyc/primitives/librt_strings_ops.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,4 @@
1-
from mypyc.ir.deps import (
2-
BYTES_WRITER_EXTRA_OPS,
3-
CODEPOINT_EXTRA_OPS,
4-
LIBRT_STRINGS,
5-
STRING_WRITER_EXTRA_OPS,
6-
)
1+
from mypyc.ir.deps import BYTES_WRITER_EXTRA_OPS, LIBRT_STRINGS, STRING_WRITER_EXTRA_OPS
72
from mypyc.ir.ops import ERR_MAGIC, ERR_MAGIC_OVERLAPPING, ERR_NEVER
83
from mypyc.ir.rtypes import (
94
bool_rprimitive,
@@ -402,7 +397,7 @@
402397
return_type=bool_rprimitive,
403398
c_function_name="LibRTStrings_IsSpace",
404399
error_kind=ERR_NEVER,
405-
dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS],
400+
dependencies=[LIBRT_STRINGS],
406401
)
407402

408403
function_op(
@@ -411,7 +406,7 @@
411406
return_type=bool_rprimitive,
412407
c_function_name="LibRTStrings_IsDigit",
413408
error_kind=ERR_NEVER,
414-
dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS],
409+
dependencies=[LIBRT_STRINGS],
415410
)
416411

417412
function_op(
@@ -420,7 +415,7 @@
420415
return_type=bool_rprimitive,
421416
c_function_name="LibRTStrings_IsAlnum",
422417
error_kind=ERR_NEVER,
423-
dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS],
418+
dependencies=[LIBRT_STRINGS],
424419
)
425420

426421
function_op(
@@ -429,17 +424,17 @@
429424
return_type=bool_rprimitive,
430425
c_function_name="LibRTStrings_IsAlpha",
431426
error_kind=ERR_NEVER,
432-
dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS],
427+
dependencies=[LIBRT_STRINGS],
433428
)
434429

435430
# isidentifier checks XID_Start semantics for a single codepoint, matching
436431
# str.isidentifier() on a 1-character string. The non-ASCII path allocates
437-
# but swallows OOM (returning False), keeping the function ERR_NEVER.
432+
# and aborts via CPyError_OutOfMemory on failure, so this stays ERR_NEVER.
438433
function_op(
439434
name="librt.strings.isidentifier",
440435
arg_types=[int32_rprimitive],
441436
return_type=bool_rprimitive,
442437
c_function_name="LibRTStrings_IsIdentifier",
443438
error_kind=ERR_NEVER,
444-
dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS],
439+
dependencies=[LIBRT_STRINGS],
445440
)

0 commit comments

Comments
 (0)