Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * regc_pg_locale.c
4 : : * ctype functions adapted to work on pg_wchar (a/k/a chr),
5 : : * and functions to cache the results of wholesale ctype probing.
6 : : *
7 : : * This file is #included by regcomp.c; it's not meant to compile standalone.
8 : : *
9 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
10 : : * Portions Copyright (c) 1994, Regents of the University of California
11 : : *
12 : : * IDENTIFICATION
13 : : * src/backend/regex/regc_pg_locale.c
14 : : *
15 : : *-------------------------------------------------------------------------
16 : : */
17 : :
18 : : #include "catalog/pg_collation.h"
19 : : #include "common/unicode_case.h"
20 : : #include "common/unicode_category.h"
21 : : #include "utils/pg_locale.h"
22 : :
23 : : static pg_locale_t pg_regex_locale;
24 : :
25 : : static struct pg_locale_struct dummy_c_locale = {
26 : : .collate_is_c = true,
27 : : .ctype_is_c = true,
28 : : };
29 : :
/*
 * Hard-wired character properties for C locale
 *
 * Each entry of pg_char_properties[] is a bitmask of the PG_ISxxx flags
 * below, indexed by 7-bit ASCII code.
 */
#define PG_ISDIGIT	0x01
#define PG_ISALPHA	0x02
#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)
#define PG_ISUPPER	0x04
#define PG_ISLOWER	0x08
#define PG_ISGRAPH	0x10
#define PG_ISPRINT	0x20
#define PG_ISPUNCT	0x40
#define PG_ISSPACE	0x80

/* Shorthand for the recurring flag combinations; local to the table */
#define PG_CP_PUNCT	(PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT)
#define PG_CP_DIGIT	(PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_UPPER	(PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_LOWER	(PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT)

static const unsigned char pg_char_properties[128] = {
	/* 0x00-0x08: NUL through ^H carry no properties */
	0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x09-0x0D: ^I through ^M (tab, LF, VT, FF, CR) are whitespace */
	PG_ISSPACE, PG_ISSPACE, PG_ISSPACE, PG_ISSPACE, PG_ISSPACE,
	/* 0x0E-0x1F: ^N through ^_ carry no properties */
	0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20: space is printable whitespace, but not graphic */
	PG_ISPRINT | PG_ISSPACE,
	/* 0x21-0x2F: fifteen punctuation marks, exclamation mark to slash */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x30-0x39: digits 0 through 9 */
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,
	/* 0x3A-0x40: seven punctuation marks, colon to at-sign */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x41-0x5A: upper-case letters A through Z */
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER,
	/* 0x5B-0x60: six punctuation marks, left bracket to backquote */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT,
	/* 0x61-0x7A: lower-case letters a through z */
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER,
	/* 0x7B-0x7E: four punctuation marks, left brace to tilde */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x7F: DEL carries no properties */
	0
};

#undef PG_CP_PUNCT
#undef PG_CP_DIGIT
#undef PG_CP_UPPER
#undef PG_CP_LOWER
173 : :
174 : :
175 : : /*
176 : : * pg_set_regex_collation: set collation for these functions to obey
177 : : *
178 : : * This is called when beginning compilation or execution of a regexp.
179 : : * Since there's no need for reentrancy of regexp operations, it's okay
180 : : * to store the results in static variables.
181 : : */
182 : : void
5263 tgl@sss.pgh.pa.us 183 :CBC 4049215 : pg_set_regex_collation(Oid collation)
184 : : {
366 jdavis@postgresql.or 185 : 4049215 : pg_locale_t locale = 0;
186 : :
1325 peter@eisentraut.org 187 [ - + ]: 4049215 : if (!OidIsValid(collation))
188 : : {
189 : : /*
190 : : * This typically means that the parser could not resolve a conflict
191 : : * of implicit collations, so report it that way.
192 : : */
1325 peter@eisentraut.org 193 [ # # ]:UBC 0 : ereport(ERROR,
194 : : (errcode(ERRCODE_INDETERMINATE_COLLATION),
195 : : errmsg("could not determine which collation to use for regular expression"),
196 : : errhint("Use the COLLATE clause to set the collation explicitly.")));
197 : : }
198 : :
365 jdavis@postgresql.or 199 [ + + ]:CBC 4049215 : if (collation == C_COLLATION_OID)
200 : : {
201 : : /*
202 : : * Some callers expect regexes to work for C_COLLATION_OID before
203 : : * catalog access is available, so we can't call
204 : : * pg_newlocale_from_collation().
205 : : */
67 jdavis@postgresql.or 206 :GNC 82470 : locale = &dummy_c_locale;
207 : : }
208 : : else
209 : : {
366 jdavis@postgresql.or 210 :CBC 3966745 : locale = pg_newlocale_from_collation(collation);
211 : :
359 212 [ + + ]: 3966745 : if (!locale->deterministic)
2360 peter@eisentraut.org 213 [ + - ]: 12 : ereport(ERROR,
214 : : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
215 : : errmsg("nondeterministic collations are not supported for regular expressions")));
216 : :
365 jdavis@postgresql.or 217 [ + + ]: 3966733 : if (locale->ctype_is_c)
218 : : {
219 : : /*
220 : : * C/POSIX collations use this path regardless of database
221 : : * encoding
222 : : */
67 jdavis@postgresql.or 223 :GNC 142 : locale = &dummy_c_locale;
224 : : }
225 : : }
226 : :
366 jdavis@postgresql.or 227 :CBC 4049203 : pg_regex_locale = locale;
5263 tgl@sss.pgh.pa.us 228 : 4049203 : }
229 : :
230 : : static int
231 : 93455 : pg_wc_isdigit(pg_wchar c)
232 : : {
67 jdavis@postgresql.or 233 [ + + ]:GNC 93455 : if (pg_regex_locale->ctype_is_c)
234 [ + - ]: 2130 : return (c <= (pg_wchar) 127 &&
235 [ + + ]: 1065 : (pg_char_properties[c] & PG_ISDIGIT));
236 : : else
237 : 92390 : return pg_regex_locale->ctype->wc_isdigit(c, pg_regex_locale);
238 : : }
239 : :
240 : : static int
5263 tgl@sss.pgh.pa.us 241 :CBC 14859 : pg_wc_isalpha(pg_wchar c)
242 : : {
67 jdavis@postgresql.or 243 [ + + ]:GNC 14859 : if (pg_regex_locale->ctype_is_c)
244 [ + - ]: 768 : return (c <= (pg_wchar) 127 &&
245 [ + + ]: 384 : (pg_char_properties[c] & PG_ISALPHA));
246 : : else
247 : 14475 : return pg_regex_locale->ctype->wc_isalpha(c, pg_regex_locale);
248 : : }
249 : :
250 : : static int
5263 tgl@sss.pgh.pa.us 251 :CBC 47493 : pg_wc_isalnum(pg_wchar c)
252 : : {
67 jdavis@postgresql.or 253 [ + + ]:GNC 47493 : if (pg_regex_locale->ctype_is_c)
254 [ + - ]: 762 : return (c <= (pg_wchar) 127 &&
255 [ + + ]: 381 : (pg_char_properties[c] & PG_ISALNUM));
256 : : else
257 : 47112 : return pg_regex_locale->ctype->wc_isalnum(c, pg_regex_locale);
258 : : }
259 : :
260 : : static int
1654 tgl@sss.pgh.pa.us 261 :CBC 18817 : pg_wc_isword(pg_wchar c)
262 : : {
263 : : /* We define word characters as alnum class plus underscore */
264 [ + + ]: 18817 : if (c == CHR('_'))
265 : 12 : return 1;
266 : 18805 : return pg_wc_isalnum(c);
267 : : }
268 : :
269 : : static int
5263 270 : 20488 : pg_wc_isupper(pg_wchar c)
271 : : {
67 jdavis@postgresql.or 272 [ - + ]:GNC 20488 : if (pg_regex_locale->ctype_is_c)
67 jdavis@postgresql.or 273 [ # # ]:UNC 0 : return (c <= (pg_wchar) 127 &&
274 [ # # ]: 0 : (pg_char_properties[c] & PG_ISUPPER));
275 : : else
67 jdavis@postgresql.or 276 :GNC 20488 : return pg_regex_locale->ctype->wc_isupper(c, pg_regex_locale);
277 : : }
278 : :
279 : : static int
5263 tgl@sss.pgh.pa.us 280 :CBC 8195 : pg_wc_islower(pg_wchar c)
281 : : {
67 jdavis@postgresql.or 282 [ - + ]:GNC 8195 : if (pg_regex_locale->ctype_is_c)
67 jdavis@postgresql.or 283 [ # # ]:UNC 0 : return (c <= (pg_wchar) 127 &&
284 [ # # ]: 0 : (pg_char_properties[c] & PG_ISLOWER));
285 : : else
67 jdavis@postgresql.or 286 :GNC 8195 : return pg_regex_locale->ctype->wc_islower(c, pg_regex_locale);
287 : : }
288 : :
289 : : static int
5263 tgl@sss.pgh.pa.us 290 :CBC 8195 : pg_wc_isgraph(pg_wchar c)
291 : : {
67 jdavis@postgresql.or 292 [ - + ]:GNC 8195 : if (pg_regex_locale->ctype_is_c)
67 jdavis@postgresql.or 293 [ # # ]:UNC 0 : return (c <= (pg_wchar) 127 &&
294 [ # # ]: 0 : (pg_char_properties[c] & PG_ISGRAPH));
295 : : else
67 jdavis@postgresql.or 296 :GNC 8195 : return pg_regex_locale->ctype->wc_isgraph(c, pg_regex_locale);
297 : : }
298 : :
299 : : static int
5263 tgl@sss.pgh.pa.us 300 :CBC 8195 : pg_wc_isprint(pg_wchar c)
301 : : {
67 jdavis@postgresql.or 302 [ - + ]:GNC 8195 : if (pg_regex_locale->ctype_is_c)
67 jdavis@postgresql.or 303 [ # # ]:UNC 0 : return (c <= (pg_wchar) 127 &&
304 [ # # ]: 0 : (pg_char_properties[c] & PG_ISPRINT));
305 : : else
67 jdavis@postgresql.or 306 :GNC 8195 : return pg_regex_locale->ctype->wc_isprint(c, pg_regex_locale);
307 : : }
308 : :
309 : : static int
5263 tgl@sss.pgh.pa.us 310 :CBC 20483 : pg_wc_ispunct(pg_wchar c)
311 : : {
67 jdavis@postgresql.or 312 [ - + ]:GNC 20483 : if (pg_regex_locale->ctype_is_c)
67 jdavis@postgresql.or 313 [ # # ]:UNC 0 : return (c <= (pg_wchar) 127 &&
314 [ # # ]: 0 : (pg_char_properties[c] & PG_ISPUNCT));
315 : : else
67 jdavis@postgresql.or 316 :GNC 20483 : return pg_regex_locale->ctype->wc_ispunct(c, pg_regex_locale);
317 : : }
318 : :
319 : : static int
5263 tgl@sss.pgh.pa.us 320 :CBC 38193 : pg_wc_isspace(pg_wchar c)
321 : : {
67 jdavis@postgresql.or 322 [ - + ]:GNC 38193 : if (pg_regex_locale->ctype_is_c)
67 jdavis@postgresql.or 323 [ # # ]:UNC 0 : return (c <= (pg_wchar) 127 &&
324 [ # # ]: 0 : (pg_char_properties[c] & PG_ISSPACE));
325 : : else
67 jdavis@postgresql.or 326 :GNC 38193 : return pg_regex_locale->ctype->wc_isspace(c, pg_regex_locale);
327 : : }
328 : :
329 : : static pg_wchar
5263 tgl@sss.pgh.pa.us 330 :CBC 5351 : pg_wc_toupper(pg_wchar c)
331 : : {
67 jdavis@postgresql.or 332 [ + + ]:GNC 5351 : if (pg_regex_locale->ctype_is_c)
333 : : {
334 [ + - ]: 489 : if (c <= (pg_wchar) 127)
335 : 489 : return pg_ascii_toupper((unsigned char) c);
67 jdavis@postgresql.or 336 :UNC 0 : return c;
337 : : }
338 : : else
67 jdavis@postgresql.or 339 :GNC 4862 : return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale);
340 : : }
341 : :
342 : : static pg_wchar
5263 tgl@sss.pgh.pa.us 343 :CBC 5353 : pg_wc_tolower(pg_wchar c)
344 : : {
67 jdavis@postgresql.or 345 [ + + ]:GNC 5353 : if (pg_regex_locale->ctype_is_c)
346 : : {
347 [ + - ]: 489 : if (c <= (pg_wchar) 127)
348 : 489 : return pg_ascii_tolower((unsigned char) c);
67 jdavis@postgresql.or 349 :UNC 0 : return c;
350 : : }
351 : : else
67 jdavis@postgresql.or 352 :GNC 4864 : return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale);
353 : : }
354 : :
355 : :
356 : : /*
357 : : * These functions cache the results of probing libc's ctype behavior for
358 : : * all character codes of interest in a given encoding/collation. The
359 : : * result is provided as a "struct cvec", but notice that the representation
360 : : * is a touch different from a cvec created by regc_cvec.c: we allocate the
361 : : * chrs[] and ranges[] arrays separately from the struct so that we can
362 : : * realloc them larger at need. This is okay since the cvecs made here
363 : : * should never be freed by freecvec().
364 : : *
365 : : * We use malloc not palloc since we mustn't lose control on out-of-memory;
366 : : * the main regex code expects us to return a failure indication instead.
367 : : */
368 : :
369 : : typedef int (*pg_wc_probefunc) (pg_wchar c);
370 : :
371 : : typedef struct pg_ctype_cache
372 : : {
373 : : pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */
374 : : pg_locale_t locale; /* locale this entry is for */
375 : : struct cvec cv; /* cache entry contents */
376 : : struct pg_ctype_cache *next; /* chain link */
377 : : } pg_ctype_cache;
378 : :
379 : : static pg_ctype_cache *pg_ctype_cache_list = NULL;
380 : :
381 : : /*
382 : : * Add a chr or range to pcc->cv; return false if run out of memory
383 : : */
384 : : static bool
4948 tgl@sss.pgh.pa.us 385 :CBC 5897 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
386 : : {
387 : : chr *newchrs;
388 : :
389 [ + + ]: 5897 : if (nchrs > 1)
390 : : {
391 [ - + ]: 1860 : if (pcc->cv.nranges >= pcc->cv.rangespace)
392 : : {
4948 tgl@sss.pgh.pa.us 393 :UBC 0 : pcc->cv.rangespace *= 2;
394 : 0 : newchrs = (chr *) realloc(pcc->cv.ranges,
395 : 0 : pcc->cv.rangespace * sizeof(chr) * 2);
396 [ # # ]: 0 : if (newchrs == NULL)
397 : 0 : return false;
398 : 0 : pcc->cv.ranges = newchrs;
399 : : }
4948 tgl@sss.pgh.pa.us 400 :CBC 1860 : pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
401 : 1860 : pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
402 : 1860 : pcc->cv.nranges++;
403 : : }
404 : : else
405 : : {
406 [ - + ]: 4037 : assert(nchrs == 1);
407 [ + + ]: 4037 : if (pcc->cv.nchrs >= pcc->cv.chrspace)
408 : : {
409 : 14 : pcc->cv.chrspace *= 2;
410 : 14 : newchrs = (chr *) realloc(pcc->cv.chrs,
411 : 14 : pcc->cv.chrspace * sizeof(chr));
412 [ - + ]: 14 : if (newchrs == NULL)
4948 tgl@sss.pgh.pa.us 413 :UBC 0 : return false;
4948 tgl@sss.pgh.pa.us 414 :CBC 14 : pcc->cv.chrs = newchrs;
415 : : }
416 : 4037 : pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
417 : : }
418 : 5897 : return true;
419 : : }
420 : :
421 : : /*
422 : : * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
423 : : * chrs satisfying the probe function. The active collation is the one
424 : : * previously set by pg_set_regex_collation. Return NULL if out of memory.
425 : : *
426 : : * Note that the result must not be freed or modified by caller.
427 : : */
428 : : static struct cvec *
3288 429 : 439 : pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
430 : : {
431 : : pg_ctype_cache *pcc;
432 : : pg_wchar max_chr;
433 : : pg_wchar cur_chr;
434 : : int nmatches;
435 : : chr *newchrs;
436 : :
437 : : /*
438 : : * Do we already have the answer cached?
439 : : */
4948 440 [ + + ]: 1020 : for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
441 : : {
442 [ + + ]: 881 : if (pcc->probefunc == probefunc &&
275 peter@eisentraut.org 443 [ + + ]: 336 : pcc->locale == pg_regex_locale)
4948 tgl@sss.pgh.pa.us 444 : 300 : return &pcc->cv;
445 : : }
446 : :
447 : : /*
448 : : * Nope, so initialize some workspace ...
449 : : */
450 : 139 : pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
451 [ - + ]: 139 : if (pcc == NULL)
4948 tgl@sss.pgh.pa.us 452 :UBC 0 : return NULL;
4948 tgl@sss.pgh.pa.us 453 :CBC 139 : pcc->probefunc = probefunc;
275 peter@eisentraut.org 454 : 139 : pcc->locale = pg_regex_locale;
4948 tgl@sss.pgh.pa.us 455 : 139 : pcc->cv.nchrs = 0;
456 : 139 : pcc->cv.chrspace = 128;
457 : 139 : pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
458 : 139 : pcc->cv.nranges = 0;
459 : 139 : pcc->cv.rangespace = 64;
460 : 139 : pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
461 [ + - - + ]: 139 : if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
4948 tgl@sss.pgh.pa.us 462 :UBC 0 : goto out_of_memory;
3288 tgl@sss.pgh.pa.us 463 :CBC 139 : pcc->cv.cclasscode = cclasscode;
464 : :
465 : : /*
466 : : * Decide how many character codes we ought to look through. In general
467 : : * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
468 : : * runtime using the "high colormap" mechanism. However, in C locale
469 : : * there's no need to go further than 127, and if we only have a 1-byte
470 : : * <ctype.h> API there's no need to go further than that can handle.
471 : : *
472 : : * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
473 : : * output cvec as not having any locale-dependent behavior, since there
474 : : * will be no need to do any run-time locale checks. (The #if's here
475 : : * would always be true for production values of MAX_SIMPLE_CHR, but it's
476 : : * useful to allow it to be small for testing purposes.)
477 : : */
67 jdavis@postgresql.or 478 [ + + ]:GNC 139 : if (pg_regex_locale->ctype_is_c)
479 : : {
480 : : #if MAX_SIMPLE_CHR >= 127
481 : 14 : max_chr = (pg_wchar) 127;
482 : 14 : pcc->cv.cclasscode = -1;
483 : : #else
484 : : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
485 : : #endif
486 : : }
487 : : else
488 : : {
489 [ - + ]: 125 : if (pg_regex_locale->ctype->max_chr != 0 &&
67 jdavis@postgresql.or 490 [ # # ]:UNC 0 : pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR)
491 : : {
492 : 0 : max_chr = pg_regex_locale->ctype->max_chr;
3288 tgl@sss.pgh.pa.us 493 :UBC 0 : pcc->cv.cclasscode = -1;
494 : : }
495 : : else
3089 peter_e@gmx.net 496 :CBC 125 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
497 : : }
498 : :
499 : : /*
500 : : * And scan 'em ...
501 : : */
4948 tgl@sss.pgh.pa.us 502 : 139 : nmatches = 0; /* number of consecutive matches */
503 : :
504 [ + + ]: 257931 : for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
505 : : {
506 [ + + ]: 257792 : if ((*probefunc) (cur_chr))
507 : 71040 : nmatches++;
508 [ + + ]: 186752 : else if (nmatches > 0)
509 : : {
510 [ - + ]: 5885 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
4948 tgl@sss.pgh.pa.us 511 :UBC 0 : goto out_of_memory;
4948 tgl@sss.pgh.pa.us 512 :CBC 5885 : nmatches = 0;
513 : : }
514 : : }
515 : :
516 [ + + ]: 139 : if (nmatches > 0)
517 [ - + ]: 12 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
4948 tgl@sss.pgh.pa.us 518 :UBC 0 : goto out_of_memory;
519 : :
520 : : /*
521 : : * We might have allocated more memory than needed, if so free it
522 : : */
4948 tgl@sss.pgh.pa.us 523 [ + + ]:CBC 139 : if (pcc->cv.nchrs == 0)
524 : : {
525 : 56 : free(pcc->cv.chrs);
526 : 56 : pcc->cv.chrs = NULL;
527 : 56 : pcc->cv.chrspace = 0;
528 : : }
529 [ + - ]: 83 : else if (pcc->cv.nchrs < pcc->cv.chrspace)
530 : : {
531 : 83 : newchrs = (chr *) realloc(pcc->cv.chrs,
532 : 83 : pcc->cv.nchrs * sizeof(chr));
533 [ - + ]: 83 : if (newchrs == NULL)
4948 tgl@sss.pgh.pa.us 534 :UBC 0 : goto out_of_memory;
4948 tgl@sss.pgh.pa.us 535 :CBC 83 : pcc->cv.chrs = newchrs;
536 : 83 : pcc->cv.chrspace = pcc->cv.nchrs;
537 : : }
538 [ - + ]: 139 : if (pcc->cv.nranges == 0)
539 : : {
4948 tgl@sss.pgh.pa.us 540 :UBC 0 : free(pcc->cv.ranges);
541 : 0 : pcc->cv.ranges = NULL;
542 : 0 : pcc->cv.rangespace = 0;
543 : : }
4948 tgl@sss.pgh.pa.us 544 [ + - ]:CBC 139 : else if (pcc->cv.nranges < pcc->cv.rangespace)
545 : : {
546 : 139 : newchrs = (chr *) realloc(pcc->cv.ranges,
547 : 139 : pcc->cv.nranges * sizeof(chr) * 2);
548 [ - + ]: 139 : if (newchrs == NULL)
4948 tgl@sss.pgh.pa.us 549 :UBC 0 : goto out_of_memory;
4948 tgl@sss.pgh.pa.us 550 :CBC 139 : pcc->cv.ranges = newchrs;
551 : 139 : pcc->cv.rangespace = pcc->cv.nranges;
552 : : }
553 : :
554 : : /*
555 : : * Success, link it into cache chain
556 : : */
557 : 139 : pcc->next = pg_ctype_cache_list;
558 : 139 : pg_ctype_cache_list = pcc;
559 : :
560 : 139 : return &pcc->cv;
561 : :
562 : : /*
563 : : * Failure, clean up
564 : : */
4948 tgl@sss.pgh.pa.us 565 :UBC 0 : out_of_memory:
1178 peter@eisentraut.org 566 : 0 : free(pcc->cv.chrs);
567 : 0 : free(pcc->cv.ranges);
4948 tgl@sss.pgh.pa.us 568 : 0 : free(pcc);
569 : :
570 : 0 : return NULL;
571 : : }
|