Skip to content

Commit 7a2c6f4

Browse files
authored
Improve libunicode and libregexp headers (#288)
- move all `lre_xxx` functions to libunicode
- use flags table `lre_ctype_bits` instead of bitmaps
- simplify `lre_is_space`, `lre_js_is_ident_first` and `lre_js_is_ident_next`
- simplify `simple_next_token`, handle UTF-8 correctly
- simplify `is_let`, remove dead code
1 parent 1402478 commit 7a2c6f4

File tree

6 files changed

+243
-134
lines changed

6 files changed

+243
-134
lines changed

libregexp.c

+2-27
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
#include "cutils.h"
3232
#include "libregexp.h"
33+
#include "libunicode.h"
3334

3435
/*
3536
TODO:
@@ -141,32 +142,6 @@ static const uint16_t char_range_s[] = {
141142
0xFEFF, 0xFEFF + 1,
142143
};
143144

144-
BOOL lre_is_space(int c)
145-
{
146-
int i, n, low, high;
147-
n = (countof(char_range_s) - 1) / 2;
148-
for(i = 0; i < n; i++) {
149-
low = char_range_s[2 * i + 1];
150-
if (c < low)
151-
return FALSE;
152-
high = char_range_s[2 * i + 2];
153-
if (c < high)
154-
return TRUE;
155-
}
156-
return FALSE;
157-
}
158-
159-
/* Bitmap of the ASCII characters accepted as first character of an
   identifier: $ A-Z _ a-z.  Each word covers 32 consecutive codes;
   character c is bit (c & 31) of word (c >> 5). */
uint32_t const lre_id_start_table_ascii[4] = {
    UINT32_C(0),                                  /* 0x00-0x1F: none */
    UINT32_C(1) << 4,                             /* 0x20-0x3F: $ */
    UINT32_C(0x07FFFFFE) | UINT32_C(0x80000000),  /* 0x40-0x5F: A-Z _ */
    UINT32_C(0x07FFFFFE),                         /* 0x60-0x7F: a-z */
};
163-
164-
/* Bitmap of the ASCII characters accepted after the first character
   of an identifier: $ 0-9 A-Z _ a-z.  Each word covers 32 consecutive
   codes; character c is bit (c & 31) of word (c >> 5). */
uint32_t const lre_id_continue_table_ascii[4] = {
    UINT32_C(0),                                  /* 0x00-0x1F: none */
    UINT32_C(0x03FF0000) | (UINT32_C(1) << 4),    /* 0x20-0x3F: 0-9 $ */
    UINT32_C(0x07FFFFFE) | UINT32_C(0x80000000),  /* 0x40-0x5F: A-Z _ */
    UINT32_C(0x07FFFFFE),                         /* 0x60-0x7F: a-z */
};
168-
169-
170145
static const uint16_t char_range_w[] = {
171146
4,
172147
0x0030, 0x0039 + 1,
@@ -186,7 +161,7 @@ typedef enum {
186161
CHAR_RANGE_W,
187162
} CharRangeEnum;
188163

189-
static const uint16_t *char_range_table[] = {
164+
static const uint16_t * const char_range_table[] = {
190165
char_range_d,
191166
char_range_s,
192167
char_range_w,

libregexp.h

+3-40
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,7 @@
2525
#define LIBREGEXP_H
2626

2727
#include <stddef.h>
28-
29-
#include "libunicode.h"
30-
31-
#define LRE_BOOL int /* for documentation purposes */
28+
#include <stdint.h>
3229

3330
#define LRE_FLAG_GLOBAL (1 << 0)
3431
#define LRE_FLAG_IGNORECASE (1 << 1)
@@ -50,43 +47,9 @@ int lre_exec(uint8_t **capture,
5047
int cbuf_type, void *opaque);
5148

5249
int lre_parse_escape(const uint8_t **pp, int allow_utf16);
53-
LRE_BOOL lre_is_space(int c);
5450

55-
/* must be provided by the user */
56-
LRE_BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size);
51+
/* must be provided by the user, return non zero if overflow */
52+
int lre_check_stack_overflow(void *opaque, size_t alloca_size);
5753
void *lre_realloc(void *opaque, void *ptr, size_t size);
5854

59-
/* JS identifier test */
60-
extern uint32_t const lre_id_start_table_ascii[4];
61-
extern uint32_t const lre_id_continue_table_ascii[4];
62-
63-
static inline int lre_js_is_ident_first(int c)
64-
{
65-
if ((uint32_t)c < 128) {
66-
return (lre_id_start_table_ascii[c >> 5] >> (c & 31)) & 1;
67-
} else {
68-
#ifdef CONFIG_ALL_UNICODE
69-
return lre_is_id_start(c);
70-
#else
71-
return !lre_is_space(c);
72-
#endif
73-
}
74-
}
75-
76-
static inline int lre_js_is_ident_next(int c)
77-
{
78-
if ((uint32_t)c < 128) {
79-
return (lre_id_continue_table_ascii[c >> 5] >> (c & 31)) & 1;
80-
} else {
81-
/* ZWNJ and ZWJ are accepted in identifiers */
82-
#ifdef CONFIG_ALL_UNICODE
83-
return lre_is_id_continue(c) || c == 0x200C || c == 0x200D;
84-
#else
85-
return !lre_is_space(c) || c == 0x200C || c == 0x200D;
86-
#endif
87-
}
88-
}
89-
90-
#undef LRE_BOOL
91-
9255
#endif /* LIBREGEXP_H */

libunicode.c

+94
Original file line numberDiff line numberDiff line change
@@ -1814,3 +1814,97 @@ int unicode_prop(CharRange *cr, const char *prop_name)
18141814
}
18151815

18161816
#endif /* CONFIG_ALL_UNICODE */
1817+
1818+
/*---- lre codepoint categorizing functions ----*/
1819+
1820+
#define S UNICODE_C_SPACE
1821+
#define D UNICODE_C_DIGIT
1822+
#define X UNICODE_C_XDIGIT
1823+
#define U UNICODE_C_UPPER
1824+
#define L UNICODE_C_LOWER
1825+
#define _ UNICODE_C_UNDER
1826+
#define d UNICODE_C_DOLLAR
1827+
1828+
uint8_t const lre_ctype_bits[256] = {
1829+
0, 0, 0, 0, 0, 0, 0, 0,
1830+
0, S, S, S, S, S, 0, 0,
1831+
0, 0, 0, 0, 0, 0, 0, 0,
1832+
0, 0, 0, 0, 0, 0, 0, 0,
1833+
1834+
S, 0, 0, 0, d, 0, 0, 0,
1835+
0, 0, 0, 0, 0, 0, 0, 0,
1836+
X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D,
1837+
X|D, X|D, 0, 0, 0, 0, 0, 0,
1838+
1839+
0, X|U, X|U, X|U, X|U, X|U, X|U, U,
1840+
U, U, U, U, U, U, U, U,
1841+
U, U, U, U, U, U, U, U,
1842+
U, U, U, 0, 0, 0, 0, _,
1843+
1844+
0, X|L, X|L, X|L, X|L, X|L, X|L, L,
1845+
L, L, L, L, L, L, L, L,
1846+
L, L, L, L, L, L, L, L,
1847+
L, L, L, 0, 0, 0, 0, 0,
1848+
1849+
0, 0, 0, 0, 0, 0, 0, 0,
1850+
0, 0, 0, 0, 0, 0, 0, 0,
1851+
0, 0, 0, 0, 0, 0, 0, 0,
1852+
0, 0, 0, 0, 0, 0, 0, 0,
1853+
1854+
S, 0, 0, 0, 0, 0, 0, 0,
1855+
0, 0, 0, 0, 0, 0, 0, 0,
1856+
0, 0, 0, 0, 0, 0, 0, 0,
1857+
0, 0, 0, 0, 0, 0, 0, 0,
1858+
1859+
0, 0, 0, 0, 0, 0, 0, 0,
1860+
0, 0, 0, 0, 0, 0, 0, 0,
1861+
0, 0, 0, 0, 0, 0, 0, 0,
1862+
0, 0, 0, 0, 0, 0, 0, 0,
1863+
1864+
0, 0, 0, 0, 0, 0, 0, 0,
1865+
0, 0, 0, 0, 0, 0, 0, 0,
1866+
0, 0, 0, 0, 0, 0, 0, 0,
1867+
0, 0, 0, 0, 0, 0, 0, 0,
1868+
};
1869+
1870+
#undef S
1871+
#undef D
1872+
#undef X
1873+
#undef U
1874+
#undef L
1875+
#undef _
1876+
#undef d
1877+
1878+
/* Sorted table of the white space code point ranges: the Unicode Zs,
   Zl and Zp characters, plus the TAB..CR controls and U+FEFF, which
   belong to other general categories.  Element 0 is the number of
   ranges; each range is stored as a (first, last + 1) pair. */
static const uint16_t char_range_s[] = {
    10,
    0x0009, 0x000D + 1,
    0x0020, 0x0020 + 1,
    0x00A0, 0x00A0 + 1,
    0x1680, 0x1680 + 1,
    0x2000, 0x200A + 1,
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
    0x2028, 0x2029 + 1,
    0x202F, 0x202F + 1,
    0x205F, 0x205F + 1,
    0x3000, 0x3000 + 1,
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
    0xFEFF, 0xFEFF + 1,
};

/* Index in char_range_s of the first range above the ASCII set
   (U+00A0): 1 count element + 2 ASCII ranges of 2 elements each. */
enum { CHAR_RANGE_S_FIRST_NON_ASCII = 5 };

/* Return non-zero if `c` is a white space code point at or above
   U+00A0.  The two ASCII ranges of the table are skipped, so ASCII
   white space always yields 0 here. */
int lre_is_space_non_ascii(uint32_t c)
{
    size_t i, n;

    n = sizeof(char_range_s) / sizeof(char_range_s[0]);
    for (i = CHAR_RANGE_S_FIRST_NON_ASCII; i + 1 < n; i += 2) {
        uint32_t low = char_range_s[i];
        uint32_t high = char_range_s[i + 1];

        /* the ranges are sorted: once c is below the start of a
           range it cannot belong to any later one */
        if (c < low)
            return 0;
        if (c < high)
            return 1;
    }
    return 0;
}

libunicode.h

+79-24
Original file line numberDiff line numberDiff line change
@@ -24,27 +24,13 @@
2424
#ifndef LIBUNICODE_H
2525
#define LIBUNICODE_H
2626

27-
#include <inttypes.h>
28-
29-
#define LRE_BOOL int /* for documentation purposes */
27+
#include <stdint.h>
3028

3129
/* define it to include all the unicode tables (40KB larger) */
3230
#define CONFIG_ALL_UNICODE
3331

3432
#define LRE_CC_RES_LEN_MAX 3
3533

36-
typedef enum {
37-
UNICODE_NFC,
38-
UNICODE_NFD,
39-
UNICODE_NFKC,
40-
UNICODE_NFKD,
41-
} UnicodeNormalizationEnum;
42-
43-
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
44-
int lre_canonicalize(uint32_t c, LRE_BOOL is_unicode);
45-
LRE_BOOL lre_is_cased(uint32_t c);
46-
LRE_BOOL lre_is_case_ignorable(uint32_t c);
47-
4834
/* char ranges */
4935

5036
typedef struct {
@@ -102,26 +88,95 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
10288

10389
int cr_invert(CharRange *cr);
10490

105-
int cr_regexp_canonicalize(CharRange *cr, LRE_BOOL is_unicode);
106-
107-
#ifdef CONFIG_ALL_UNICODE
91+
int cr_regexp_canonicalize(CharRange *cr, int is_unicode);
10892

109-
LRE_BOOL lre_is_id_start(uint32_t c);
110-
LRE_BOOL lre_is_id_continue(uint32_t c);
93+
typedef enum {
94+
UNICODE_NFC,
95+
UNICODE_NFD,
96+
UNICODE_NFKC,
97+
UNICODE_NFKD,
98+
} UnicodeNormalizationEnum;
11199

112100
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
113101
UnicodeNormalizationEnum n_type,
114102
void *opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));
115103

116104
/* Unicode character range functions */
117105

118-
int unicode_script(CharRange *cr,
119-
const char *script_name, LRE_BOOL is_ext);
106+
int unicode_script(CharRange *cr, const char *script_name, int is_ext);
120107
int unicode_general_category(CharRange *cr, const char *gc_name);
121108
int unicode_prop(CharRange *cr, const char *prop_name);
122109

123-
#endif /* CONFIG_ALL_UNICODE */
110+
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
111+
int lre_canonicalize(uint32_t c, int is_unicode);
112+
113+
/* Code point type categories: one bit per class, so that several
   classes can be combined in a single lre_ctype_bits entry */
enum {
    UNICODE_C_SPACE  = 0x01,
    UNICODE_C_DIGIT  = 0x02,
    UNICODE_C_UPPER  = 0x04,
    UNICODE_C_LOWER  = 0x08,
    UNICODE_C_UNDER  = 0x10,
    UNICODE_C_DOLLAR = 0x20,
    UNICODE_C_XDIGIT = 0x40,
};

/* category flags of the code points 0..255 */
extern uint8_t const lre_ctype_bits[256];
124+
125+
/* zero or non-zero return value */
126+
int lre_is_cased(uint32_t c);
127+
int lre_is_case_ignorable(uint32_t c);
128+
int lre_is_id_start(uint32_t c);
129+
int lre_is_id_continue(uint32_t c);
130+
131+
static inline int lre_is_space_byte(uint8_t c) {
132+
return lre_ctype_bits[c] & UNICODE_C_SPACE;
133+
}
134+
135+
static inline int lre_is_id_start_byte(uint8_t c) {
136+
return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
137+
UNICODE_C_UNDER | UNICODE_C_DOLLAR);
138+
}
124139

125-
#undef LRE_BOOL
140+
static inline int lre_is_id_continue_byte(uint8_t c) {
141+
return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER |
142+
UNICODE_C_UNDER | UNICODE_C_DOLLAR |
143+
UNICODE_C_DIGIT);
144+
}
145+
146+
int lre_is_space_non_ascii(uint32_t c);
147+
148+
/* Non-zero when code point `c` is white space: code points below 256
   are answered by the flags table, all others by the range search. */
static inline int lre_is_space(uint32_t c) {
    return (c < 256) ? lre_is_space_byte(c) : lre_is_space_non_ascii(c);
}
154+
155+
/* Non-zero when code point `c` may be the first character of a JS
   identifier. */
static inline int lre_js_is_ident_first(uint32_t c) {
    if (c >= 128) {
#ifdef CONFIG_ALL_UNICODE
        return lre_is_id_start(c);
#else
        /* without the Unicode tables, every non-ASCII code point that
           is not white space is accepted */
        return !lre_is_space_non_ascii(c);
#endif
    }
    /* ASCII: answered by the flags table */
    return lre_is_id_start_byte(c);
}
166+
167+
/* Non-zero when code point `c` may appear after the first character
   of a JS identifier. */
static inline int lre_js_is_ident_next(uint32_t c) {
    if (c < 128) {
        /* ASCII: answered by the flags table */
        return lre_is_id_continue_byte(c);
    } else {
        /* ZWNJ and ZWJ are accepted in identifiers */
        if (c >= 0x200C && c <= 0x200D)
            return 1; /* plain int: this header does not define TRUE */
#ifdef CONFIG_ALL_UNICODE
        return lre_is_id_continue(c);
#else
        /* without the Unicode tables, every non-ASCII code point that
           is not white space is accepted */
        return !lre_is_space_non_ascii(c);
#endif
    }
}
126181

127182
#endif /* LIBUNICODE_H */

0 commit comments

Comments
 (0)