Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * ts_locale.c
4 : : * locale compatibility layer for tsearch
5 : : *
6 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : : *
8 : : *
9 : : * IDENTIFICATION
10 : : * src/backend/tsearch/ts_locale.c
11 : : *
12 : : *-------------------------------------------------------------------------
13 : : */
14 : : #include "postgres.h"
15 : :
16 : : #include "common/string.h"
17 : : #include "storage/fd.h"
18 : : #include "tsearch/ts_locale.h"
19 : :
20 : : static void tsearch_readline_callback(void *arg);
21 : :
22 : :
23 : : /*
24 : : * The reason these functions use a 3-wchar_t output buffer, not 2 as you
25 : : * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
26 : : * getting from char2wchar() is UTF16 not UTF32. A single input character
27 : : * may therefore produce a surrogate pair rather than just one wchar_t;
28 : : * we also need room for a trailing null. When we do get a surrogate pair,
29 : : * we pass just the first code to iswdigit() etc, so that these functions will
30 : : * always return false for characters outside the Basic Multilingual Plane.
31 : : */
32 : : #define WC_BUF_LEN 3
33 : :
34 : : int
6511 tgl@sss.pgh.pa.us 35 :CBC 5142 : t_isalpha(const char *ptr)
36 : : {
37 : 5142 : int clen = pg_mblen(ptr);
38 : : wchar_t character[WC_BUF_LEN];
59 jdavis@postgresql.or 39 :GNC 5142 : locale_t mylocale = 0; /* TODO */
40 : :
904 jdavis@postgresql.or 41 [ - + - - ]:CBC 5142 : if (clen == 1 || database_ctype_is_c)
6591 tgl@sss.pgh.pa.us 42 : 5142 : return isalpha(TOUCHAR(ptr));
43 : :
2499 tgl@sss.pgh.pa.us 44 :UBC 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
45 : :
6511 46 : 0 : return iswalpha((wint_t) character[0]);
47 : : }
48 : :
49 : : int
1066 tgl@sss.pgh.pa.us 50 :CBC 1395983 : t_isalnum(const char *ptr)
51 : : {
52 : 1395983 : int clen = pg_mblen(ptr);
53 : : wchar_t character[WC_BUF_LEN];
59 jdavis@postgresql.or 54 :GNC 1395983 : locale_t mylocale = 0; /* TODO */
55 : :
904 jdavis@postgresql.or 56 [ - + - - ]:CBC 1395983 : if (clen == 1 || database_ctype_is_c)
1066 tgl@sss.pgh.pa.us 57 : 1395983 : return isalnum(TOUCHAR(ptr));
58 : :
1066 tgl@sss.pgh.pa.us 59 :UBC 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
60 : :
61 : 0 : return iswalnum((wint_t) character[0]);
62 : : }
63 : :
64 : :
65 : : /*
66 : : * Set up to read a file using tsearch_readline(). This facility is
67 : : * better than just reading the file directly because it provides error
68 : : * context pointing to the specific line where a problem is detected.
69 : : *
70 : : * Expected usage is:
71 : : *
72 : : * tsearch_readline_state trst;
73 : : *
74 : : * if (!tsearch_readline_begin(&trst, filename))
75 : : * ereport(ERROR,
76 : : * (errcode(ERRCODE_CONFIG_FILE_ERROR),
77 : : * errmsg("could not open stop-word file \"%s\": %m",
78 : : * filename)));
79 : : * while ((line = tsearch_readline(&trst)) != NULL)
80 : : * process line;
81 : : * tsearch_readline_end(&trst);
82 : : *
83 : : * Note that the caller supplies the ereport() for file open failure;
84 : : * this is so that a custom message can be provided. The filename string
85 : : * passed to tsearch_readline_begin() must remain valid through
86 : : * tsearch_readline_end().
87 : : */
88 : : bool
6289 tgl@sss.pgh.pa.us 89 :CBC 278 : tsearch_readline_begin(tsearch_readline_state *stp,
90 : : const char *filename)
91 : : {
92 [ - + ]: 278 : if ((stp->fp = AllocateFile(filename, "r")) == NULL)
6289 tgl@sss.pgh.pa.us 93 :UBC 0 : return false;
6289 tgl@sss.pgh.pa.us 94 :CBC 278 : stp->filename = filename;
95 : 278 : stp->lineno = 0;
1809 96 : 278 : initStringInfo(&stp->buf);
6289 97 : 278 : stp->curline = NULL;
98 : : /* Setup error traceback support for ereport() */
99 : 278 : stp->cb.callback = tsearch_readline_callback;
282 peter@eisentraut.org 100 : 278 : stp->cb.arg = stp;
6289 tgl@sss.pgh.pa.us 101 : 278 : stp->cb.previous = error_context_stack;
102 : 278 : error_context_stack = &stp->cb;
103 : 278 : return true;
104 : : }
105 : :
106 : : /*
107 : : * Read the next line from a tsearch data file (expected to be in UTF-8), and
108 : : * convert it to database encoding if needed. The returned string is palloc'd.
109 : : * NULL return means EOF.
110 : : */
111 : : char *
112 : 12733 : tsearch_readline(tsearch_readline_state *stp)
113 : : {
114 : : char *recoded;
115 : :
116 : : /* Advance line number to use in error reports */
117 : 12733 : stp->lineno++;
118 : :
119 : : /* Clear curline, it's no longer relevant */
1809 120 [ + + ]: 12733 : if (stp->curline)
121 : : {
122 [ - + ]: 12455 : if (stp->curline != stp->buf.data)
1809 tgl@sss.pgh.pa.us 123 :UBC 0 : pfree(stp->curline);
1809 tgl@sss.pgh.pa.us 124 :CBC 12455 : stp->curline = NULL;
125 : : }
126 : :
127 : : /* Collect next line, if there is one */
128 [ + + ]: 12733 : if (!pg_get_line_buf(stp->fp, &stp->buf))
129 : 235 : return NULL;
130 : :
131 : : /* Validate the input as UTF-8, then convert to DB encoding if needed */
132 : 12498 : recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
133 : :
134 : : /* Save the correctly-encoded string for possible error reports */
135 : 12498 : stp->curline = recoded; /* might be equal to buf.data */
136 : :
137 : : /*
138 : : * We always return a freshly pstrdup'd string. This is clearly necessary
139 : : * if pg_any_to_server() returned buf.data, and we need a second copy even
140 : : * if encoding conversion did occur. The caller is entitled to pfree the
141 : : * returned string at any time, which would leave curline pointing to
142 : : * recycled storage, causing problems if an error occurs after that point.
143 : : * (It's preferable to return the result of pstrdup instead of the output
144 : : * of pg_any_to_server, because the conversion result tends to be
145 : : * over-allocated. Since callers might save the result string directly
146 : : * into a long-lived dictionary structure, we don't want it to be a larger
147 : : * palloc chunk than necessary. We'll reclaim the conversion result on
148 : : * the next call.)
149 : : */
150 : 12498 : return pstrdup(recoded);
151 : : }
152 : :
153 : : /*
154 : : * Close down after reading a file with tsearch_readline()
155 : : */
156 : : void
6289 157 : 278 : tsearch_readline_end(tsearch_readline_state *stp)
158 : : {
159 : : /* Suppress use of curline in any error reported below */
1809 160 [ + + ]: 278 : if (stp->curline)
161 : : {
162 [ - + ]: 43 : if (stp->curline != stp->buf.data)
1809 tgl@sss.pgh.pa.us 163 :UBC 0 : pfree(stp->curline);
1809 tgl@sss.pgh.pa.us 164 :CBC 43 : stp->curline = NULL;
165 : : }
166 : :
167 : : /* Release other resources */
168 : 278 : pfree(stp->buf.data);
6289 169 : 278 : FreeFile(stp->fp);
170 : :
171 : : /* Pop the error context stack */
172 : 278 : error_context_stack = stp->cb.previous;
173 : 278 : }
174 : :
175 : : /*
176 : : * Error context callback for errors occurring while reading a tsearch
177 : : * configuration file.
178 : : */
179 : : static void
6289 tgl@sss.pgh.pa.us 180 :UBC 0 : tsearch_readline_callback(void *arg)
181 : : {
182 : 0 : tsearch_readline_state *stp = (tsearch_readline_state *) arg;
183 : :
184 : : /*
185 : : * We can't include the text of the config line for errors that occur
186 : : * during tsearch_readline() itself. The major cause of such errors is
187 : : * encoding violations, and we daren't try to print error messages
188 : : * containing badly-encoded data.
189 : : */
190 [ # # ]: 0 : if (stp->curline)
191 : 0 : errcontext("line %d of configuration file \"%s\": \"%s\"",
192 : : stp->lineno,
193 : : stp->filename,
194 : : stp->curline);
195 : : else
196 : 0 : errcontext("line %d of configuration file \"%s\"",
197 : : stp->lineno,
198 : : stp->filename);
199 : 0 : }
|