Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * trgm_regexp.c
4 : : * Regular expression matching using trigrams.
5 : : *
6 : : * The general idea of trigram index support for a regular expression (regex)
7 : : * search is to transform the regex into a logical expression on trigrams.
8 : : * For example:
9 : : *
10 : : * (ab|cd)efg => ((abe & bef) | (cde & def)) & efg
11 : : *
12 : : * If a string matches the regex, then it must match the logical expression on
13 : : * trigrams. The opposite is not necessarily true, however: a string that
14 : : * matches the logical expression might not match the original regex. Such
15 : : * false positives are removed via recheck, by running the regular regex match
16 : : * operator on the retrieved heap tuple.
17 : : *
18 : : * Since the trigram expression involves both AND and OR operators, we can't
19 : : * expect the core index machinery to evaluate it completely. Instead, the
20 : : * result of regex analysis is a list of trigrams to be sought in the index,
21 : : * plus a simplified graph that is used by trigramsMatchGraph() to determine
22 : : * whether a particular indexed value matches the expression.
23 : : *
24 : : * Converting a regex to a trigram expression is based on analysis of an
25 : : * automaton corresponding to the regex. The algorithm consists of four
26 : : * stages:
27 : : *
28 : : * 1) Compile the regexp to NFA form. This is handled by the PostgreSQL
29 : : * regexp library, which provides accessors for its opaque regex_t struct
30 : : * to expose the NFA state graph and the "colors" (sets of equivalent
31 : : * characters) used as state transition labels.
32 : : *
33 : : * 2) Transform the original NFA into an expanded graph, where arcs
34 : : * are labeled with trigrams that must be present in order to move from
35 : : * one state to another via the arcs. The trigrams used in this stage
36 : : * consist of colors, not characters, as in the original NFA.
37 : : *
38 : : * 3) Expand the color trigrams into regular trigrams consisting of
39 : : * characters. If too many distinct trigrams are produced, trigrams are
40 : : * eliminated and the graph is simplified until it's simple enough.
41 : : *
42 : : * 4) Finally, the resulting graph is packed into a TrgmPackedGraph struct,
43 : : * and returned to the caller.
44 : : *
45 : : * 1) Compile the regexp to NFA form
46 : : * ---------------------------------
47 : : * The automaton returned by the regexp compiler is a graph where vertices
48 : : * are "states" and arcs are labeled with colors. Each color represents
49 : : * a set of characters, so that all characters assigned to the same color
50 : : * are interchangeable, so far as matching the regexp is concerned. There
51 : : * are two special states: "initial" and "final". A state can have multiple
52 : : * outgoing arcs labeled with the same color, which makes the automaton
53 : : * non-deterministic, because it can be in many states simultaneously.
54 : : *
55 : : * Note that this NFA is already lossy compared to the original regexp,
56 : : * since it ignores some regex features such as lookahead constraints and
57 : : * backref matching. This is OK for our purposes since it's still the case
58 : : * that only strings matching the NFA can possibly satisfy the regexp.
59 : : *
60 : : * 2) Transform the original NFA into an expanded graph
61 : : * ----------------------------------------------------
62 : : * In the 2nd stage, the automaton is transformed into a graph based on the
63 : : * original NFA. Each state in the expanded graph represents a state from
64 : : * the original NFA, plus a prefix identifying the last two characters
65 : : * (colors, to be precise) seen before entering the state. There can be
66 : : * multiple states in the expanded graph for each state in the original NFA,
67 : : * depending on what characters can precede it. A prefix position can be
68 : : * "unknown" if it's uncertain what the preceding character was, or "blank"
69 : : * if the character was a non-word character (we don't need to distinguish
70 : : * which non-word character it was, so just think of all of them as blanks).
71 : : *
72 : : * For convenience in description, call an expanded-state identifier
73 : : * (two prefix colors plus a state number from the original NFA) an
74 : : * "enter key".
75 : : *
76 : : * Each arc of the expanded graph is labeled with a trigram that must be
77 : : * present in the string to match. We can construct this from an out-arc of
78 : : * the underlying NFA state by combining the expanded state's prefix with the
79 : : * color label of the underlying out-arc, if neither prefix position is
80 : : * "unknown". But note that some of the colors in the trigram might be
81 : : * "blank". This is OK since we want to generate word-boundary trigrams as
82 : : * the regular trigram machinery would, if we know that some word characters
83 : : * must be adjacent to a word boundary in all strings matching the NFA.
84 : : *
85 : : * The expanded graph can also have fewer states than the original NFA,
86 : : * because we don't bother to make a separate state entry unless the state
87 : : * is reachable by a valid arc. When an enter key is reachable from a state
88 : : * of the expanded graph, but we do not know a complete trigram associated
89 : : * with that transition, we cannot make a valid arc; instead we insert the
90 : : * enter key into the enterKeys list of the source state. This effectively
91 : : * means that the two expanded states are not reliably distinguishable based
92 : : * on examining trigrams.
93 : : *
94 : : * So the expanded graph resembles the original NFA, but the arcs are
95 : : * labeled with trigrams instead of individual characters, and there may be
96 : : * more or fewer states. It is a lossy representation of the original NFA:
97 : : * any string that matches the original regexp must match the expanded graph,
98 : : * but the reverse is not true.
99 : : *
100 : : * We build the expanded graph through a breadth-first traversal of states
101 : : * reachable from the initial state. At each reachable state, we identify the
102 : : * states reachable from it without traversing a predictable trigram, and add
103 : : * those states' enter keys to the current state. Then we generate all
104 : : * out-arcs leading out of this collection of states that have predictable
105 : : * trigrams, adding their target states to the queue of states to examine.
106 : : *
107 : : * When building the graph, if the number of states or arcs exceed pre-defined
108 : : * limits, we give up and simply mark any states not yet processed as final
109 : : * states. Roughly speaking, that means that we make use of some portion from
110 : : * the beginning of the regexp. Also, any colors that have too many member
111 : : * characters are treated as "unknown", so that we can't derive trigrams
112 : : * from them.
113 : : *
114 : : * 3) Expand the color trigrams into regular trigrams
115 : : * --------------------------------------------------
116 : : * The trigrams in the expanded graph are "color trigrams", consisting
117 : : * of three consecutive colors that must be present in the string. But for
118 : : * search, we need regular trigrams consisting of characters. In the 3rd
119 : : * stage, the color trigrams are expanded into regular trigrams. Since each
120 : : * color can represent many characters, the total number of regular trigrams
121 : : * after expansion could be very large. Because searching the index for
122 : : * thousands of trigrams would be slow, and would likely produce so many
123 : : * false positives that we would have to traverse a large fraction of the
124 : : * index, the graph is simplified further in a lossy fashion by removing
125 : : * color trigrams. When a color trigram is removed, the states connected by
126 : : * any arcs labeled with that trigram are merged.
127 : : *
128 : : * Trigrams do not all have equivalent value for searching: some of them are
129 : : * more frequent and some of them are less frequent. Ideally, we would like
130 : : * to know the distribution of trigrams, but we don't. But because of padding
131 : : * we know for sure that the empty character is more frequent than others,
132 : : * so we can penalize trigrams according to presence of whitespace. The
133 : : * penalty assigned to each color trigram is the number of simple trigrams
134 : : * it would produce, times the penalties[] multiplier associated with its
135 : : * whitespace content. (The penalties[] constants were calculated by analysis
136 : : * of some real-life text.) We eliminate color trigrams starting with the
137 : : * highest-penalty one, until we get to a total penalty of no more than
138 : : * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
139 : : * lead to merging the initial and final states, so we may not be able to
140 : : * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
141 : : * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
142 : : *
143 : : * 4) Pack the graph into a compact representation
144 : : * -----------------------------------------------
145 : : * The 2nd and 3rd stages might have eliminated or merged many of the states
146 : : * and trigrams created earlier, so in this final stage, the graph is
147 : : * compacted and packed into a simpler struct that contains only the
148 : : * information needed to evaluate it.
149 : : *
150 : : * ALGORITHM EXAMPLE:
151 : : *
152 : : * Consider the example regex "ab[cd]". This regex is transformed into the
153 : : * following NFA (for simplicity we show colors as their single members):
154 : : *
155 : : * 4#
156 : : * c/
157 : : * a b /
158 : : * 1* --- 2 ---- 3
159 : : * \
160 : : * d\
161 : : * 5#
162 : : *
163 : : * We use * to mark initial state and # to mark final state. It's not depicted,
164 : : * but states 1, 4, 5 have self-referencing arcs for all possible characters,
165 : : * because this pattern can match to any part of a string.
166 : : *
167 : : * As the result of stage 2 we will have the following graph:
168 : : *
169 : : * abc abd
170 : : * 2# <---- 1* ----> 3#
171 : : *
172 : : * The process for generating this graph is:
173 : : * 1) Create state 1 with enter key (UNKNOWN, UNKNOWN, 1).
174 : : * 2) Add key (UNKNOWN, "a", 2) to state 1.
175 : : * 3) Add key ("a", "b", 3) to state 1.
176 : : * 4) Create new state 2 with enter key ("b", "c", 4). Add an arc
177 : : * from state 1 to state 2 with label trigram "abc".
178 : : * 5) Mark state 2 final because state 4 of source NFA is marked as final.
179 : : * 6) Create new state 3 with enter key ("b", "d", 5). Add an arc
180 : : * from state 1 to state 3 with label trigram "abd".
181 : : * 7) Mark state 3 final because state 5 of source NFA is marked as final.
182 : : *
183 : : *
184 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
185 : : * Portions Copyright (c) 1994, Regents of the University of California
186 : : *
187 : : * IDENTIFICATION
188 : : * contrib/pg_trgm/trgm_regexp.c
189 : : *
190 : : *-------------------------------------------------------------------------
191 : : */
192 : : #include "postgres.h"
193 : :
194 : : #include "catalog/pg_collation_d.h"
195 : : #include "regex/regexport.h"
196 : : #include "trgm.h"
197 : : #include "tsearch/ts_locale.h"
198 : : #include "utils/formatting.h"
199 : : #include "utils/hsearch.h"
200 : : #include "utils/memutils.h"
201 : : #include "varatt.h"
202 : :
203 : : /*
204 : : * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
205 : : * for exploring and debugging the algorithm implementation.
206 : : * This produces three graph files in /tmp, in Graphviz .gv format.
207 : : * Some progress information is also printed to postmaster stderr.
208 : : */
209 : : /* #define TRGM_REGEXP_DEBUG */
210 : :
211 : : /*
212 : : * These parameters are used to limit the amount of work done.
213 : : * Otherwise regex processing could be too slow and memory-consuming.
214 : : *
215 : : * MAX_EXPANDED_STATES - How many states we allow in expanded graph
216 : : * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
217 : : * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
218 : : * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
219 : : * COLOR_COUNT_LIMIT - Maximum number of characters per color
220 : : */
221 : : #define MAX_EXPANDED_STATES 128
222 : : #define MAX_EXPANDED_ARCS 1024
223 : : #define MAX_TRGM_COUNT 256
224 : : #define WISH_TRGM_PENALTY 16
225 : : #define COLOR_COUNT_LIMIT 256
226 : :
227 : : /*
228 : : * Penalty multipliers for trigram counts depending on whitespace contents.
229 : : * Numbers based on analysis of real-life texts.
230 : : */
231 : : static const float4 penalties[8] = {
232 : : 1.0f, /* "aaa" */
233 : : 3.5f, /* "aa " */
234 : : 0.0f, /* "a a" (impossible) */
235 : : 0.0f, /* "a " (impossible) */
236 : : 4.2f, /* " aa" */
237 : : 2.1f, /* " a " */
238 : : 25.0f, /* " a" */
239 : : 0.0f /* " " (impossible) */
240 : : };
241 : :
242 : : /* Struct representing a single pg_wchar, converted back to multibyte form */
243 : : typedef struct
244 : : {
245 : : char bytes[MAX_MULTIBYTE_CHAR_LEN];
246 : : } trgm_mb_char;
247 : :
248 : : /*
249 : : * Attributes of NFA colors:
250 : : *
251 : : * expandable - we know the character expansion of this color
252 : : * containsNonWord - color contains non-word characters
253 : : * (which will not be extracted into trigrams)
254 : : * wordCharsCount - count of word characters in color
255 : : * wordChars - array of this color's word characters
256 : : * (which can be extracted into trigrams)
257 : : *
258 : : * When expandable is false, the other attributes don't matter; we just
259 : : * assume this color represents unknown character(s).
260 : : */
261 : : typedef struct
262 : : {
263 : : bool expandable;
264 : : bool containsNonWord;
265 : : int wordCharsCount;
266 : : trgm_mb_char *wordChars;
267 : : } TrgmColorInfo;
268 : :
269 : : /*
270 : : * A "prefix" is information about the colors of the last two characters read
271 : : * before reaching a specific NFA state. These colors can have special values
272 : : * COLOR_UNKNOWN and COLOR_BLANK. COLOR_UNKNOWN means that we have no
273 : : * information, for example because we read some character of an unexpandable
274 : : * color. COLOR_BLANK means that we read a non-word character.
275 : : *
276 : : * We call a prefix ambiguous if at least one of its colors is unknown. It's
277 : : * fully ambiguous if both are unknown, partially ambiguous if only the first
278 : : * is unknown. (The case of first color known, second unknown is not valid.)
279 : : *
280 : : * Wholly- or partly-blank prefixes are mostly handled the same as regular
281 : : * color prefixes. This allows us to generate appropriate partly-blank
282 : : * trigrams when the NFA requires word character(s) to appear adjacent to
283 : : * non-word character(s).
284 : : */
285 : : typedef int TrgmColor;
286 : :
287 : : /* We assume that colors returned by the regexp engine cannot be these: */
288 : : #define COLOR_UNKNOWN (-3)
289 : : #define COLOR_BLANK (-4)
290 : :
291 : : typedef struct
292 : : {
293 : : TrgmColor colors[2];
294 : : } TrgmPrefix;
295 : :
296 : : /*
297 : : * Color-trigram data type. Note that some elements of the trigram can be
298 : : * COLOR_BLANK, but we don't allow COLOR_UNKNOWN.
299 : : */
300 : : typedef struct
301 : : {
302 : : TrgmColor colors[3];
303 : : } ColorTrgm;
304 : :
305 : : /*
306 : : * Key identifying a state of our expanded graph: color prefix, and number
307 : : * of the corresponding state in the underlying regex NFA. The color prefix
308 : : * shows how we reached the regex state (to the extent that we know it).
309 : : */
310 : : typedef struct
311 : : {
312 : : TrgmPrefix prefix;
313 : : int nstate;
314 : : } TrgmStateKey;
315 : :
316 : : /*
317 : : * One state of the expanded graph.
318 : : *
319 : : * stateKey - ID of this state
320 : : * arcs - outgoing arcs of this state (List of TrgmArc)
321 : : * enterKeys - enter keys reachable from this state without reading any
322 : : * predictable trigram (List of TrgmStateKey)
323 : : * flags - flag bits
324 : : * snumber - number of this state (initially assigned as -1, -2, etc,
325 : : * for debugging purposes only; then at the packaging stage,
326 : : * surviving states are renumbered with positive numbers)
327 : : * parent - parent state, if this state has been merged into another
328 : : * tentFlags - flags this state would acquire via planned merges
329 : : * tentParent - planned parent state, if considering a merge
330 : : */
331 : : #define TSTATE_INIT 0x01 /* flag indicating this state is initial */
332 : : #define TSTATE_FIN 0x02 /* flag indicating this state is final */
333 : :
334 : : typedef struct TrgmState
335 : : {
336 : : TrgmStateKey stateKey; /* hashtable key: must be first field */
337 : : List *arcs;
338 : : List *enterKeys;
339 : : int flags;
340 : : int snumber;
341 : : struct TrgmState *parent;
342 : : int tentFlags;
343 : : struct TrgmState *tentParent;
344 : : } TrgmState;
345 : :
346 : : /*
347 : : * One arc in the expanded graph.
348 : : */
349 : : typedef struct
350 : : {
351 : : ColorTrgm ctrgm; /* trigram needed to traverse arc */
352 : : TrgmState *target; /* next state */
353 : : } TrgmArc;
354 : :
355 : : /*
356 : : * Information about arc of specific color trigram (used in stage 3)
357 : : *
358 : : * Contains pointers to the source and target states.
359 : : */
360 : : typedef struct
361 : : {
362 : : TrgmState *source;
363 : : TrgmState *target;
364 : : } TrgmArcInfo;
365 : :
366 : : /*
367 : : * Information about color trigram (used in stage 3)
368 : : *
369 : : * ctrgm - trigram itself
370 : : * cnumber - number of this trigram (used in the packaging stage)
371 : : * count - number of simple trigrams created from this color trigram
372 : : * expanded - indicates this color trigram is expanded into simple trigrams
373 : : * arcs - list of all arcs labeled with this color trigram.
374 : : */
375 : : typedef struct
376 : : {
377 : : ColorTrgm ctrgm;
378 : : int cnumber;
379 : : int count;
380 : : float4 penalty;
381 : : bool expanded;
382 : : List *arcs;
383 : : } ColorTrgmInfo;
384 : :
385 : : /*
386 : : * Data structure representing all the data we need during regex processing.
387 : : *
388 : : * regex - compiled regex
389 : : * colorInfo - extracted information about regex's colors
390 : : * ncolors - number of colors in colorInfo[]
391 : : * states - hashtable of TrgmStates (states of expanded graph)
392 : : * initState - pointer to initial state of expanded graph
393 : : * queue - queue of to-be-processed TrgmStates
394 : : * keysQueue - queue of to-be-processed TrgmStateKeys
395 : : * arcsCount - total number of arcs of expanded graph (for resource
396 : : * limiting)
397 : : * overflowed - we have exceeded resource limit for transformation
398 : : * colorTrgms - array of all color trigrams present in graph
399 : : * colorTrgmsCount - count of those color trigrams
400 : : * totalTrgmCount - total count of extracted simple trigrams
401 : : */
402 : : typedef struct
403 : : {
404 : : /* Source regexp, and color information extracted from it (stage 1) */
405 : : regex_t *regex;
406 : : TrgmColorInfo *colorInfo;
407 : : int ncolors;
408 : :
409 : : /* Expanded graph (stage 2) */
410 : : HTAB *states;
411 : : TrgmState *initState;
412 : : int nstates;
413 : :
414 : : /* Workspace for stage 2 */
415 : : List *queue;
416 : : List *keysQueue;
417 : : int arcsCount;
418 : : bool overflowed;
419 : :
420 : : /* Information about distinct color trigrams in the graph (stage 3) */
421 : : ColorTrgmInfo *colorTrgms;
422 : : int colorTrgmsCount;
423 : : int totalTrgmCount;
424 : : } TrgmNFA;
425 : :
426 : : /*
427 : : * Final, compact representation of expanded graph.
428 : : */
429 : : typedef struct
430 : : {
431 : : int targetState; /* index of target state (zero-based) */
432 : : int colorTrgm; /* index of color trigram for transition */
433 : : } TrgmPackedArc;
434 : :
435 : : typedef struct
436 : : {
437 : : int arcsCount; /* number of out-arcs for this state */
438 : : TrgmPackedArc *arcs; /* array of arcsCount packed arcs */
439 : : } TrgmPackedState;
440 : :
441 : : /* "typedef struct TrgmPackedGraph TrgmPackedGraph" appears in trgm.h */
442 : : struct TrgmPackedGraph
443 : : {
444 : : /*
445 : : * colorTrigramsCount and colorTrigramGroups contain information about how
446 : : * trigrams are grouped into color trigrams. "colorTrigramsCount" is the
447 : : * count of color trigrams and "colorTrigramGroups" contains number of
448 : : * simple trigrams for each color trigram. The array of simple trigrams
449 : : * (stored separately from this struct) is ordered so that the simple
450 : : * trigrams for each color trigram are consecutive, and they're in order
451 : : * by color trigram number.
452 : : */
453 : : int colorTrigramsCount;
454 : : int *colorTrigramGroups; /* array of size colorTrigramsCount */
455 : :
456 : : /*
457 : : * The states of the simplified NFA. State number 0 is always initial
458 : : * state and state number 1 is always final state.
459 : : */
460 : : int statesCount;
461 : : TrgmPackedState *states; /* array of size statesCount */
462 : :
463 : : /* Temporary work space for trigramsMatchGraph() */
464 : : bool *colorTrigramsActive; /* array of size colorTrigramsCount */
465 : : bool *statesActive; /* array of size statesCount */
466 : : int *statesQueue; /* array of size statesCount */
467 : : };
468 : :
469 : : /*
470 : : * Temporary structure for representing an arc during packaging.
471 : : */
472 : : typedef struct
473 : : {
474 : : int sourceState;
475 : : int targetState;
476 : : int colorTrgm;
477 : : } TrgmPackArcInfo;
478 : :
479 : :
480 : : /* prototypes for private functions */
481 : : static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
482 : : MemoryContext rcontext);
483 : : static void RE_compile(regex_t *regex, text *text_re,
484 : : int cflags, Oid collation);
485 : : static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
486 : : static int convertPgWchar(pg_wchar c, trgm_mb_char *result);
487 : : static void transformGraph(TrgmNFA *trgmNFA);
488 : : static void processState(TrgmNFA *trgmNFA, TrgmState *state);
489 : : static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
490 : : static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key);
491 : : static void addArcs(TrgmNFA *trgmNFA, TrgmState *state);
492 : : static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
493 : : TrgmColor co, TrgmStateKey *destKey);
494 : : static bool validArcLabel(TrgmStateKey *key, TrgmColor co);
495 : : static TrgmState *getState(TrgmNFA *trgmNFA, TrgmStateKey *key);
496 : : static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2);
497 : : static bool selectColorTrigrams(TrgmNFA *trgmNFA);
498 : : static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext);
499 : : static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
500 : : static void mergeStates(TrgmState *state1, TrgmState *state2);
501 : : static int colorTrgmInfoCmp(const void *p1, const void *p2);
502 : : static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2);
503 : : static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext);
504 : : static int packArcInfoCmp(const void *a1, const void *a2);
505 : :
506 : : #ifdef TRGM_REGEXP_DEBUG
507 : : static void printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors);
508 : : static void printTrgmNFA(TrgmNFA *trgmNFA);
509 : : static void printTrgmColor(StringInfo buf, TrgmColor co);
510 : : static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams);
511 : : #endif
512 : :
513 : :
514 : : /*
515 : : * Main entry point to process a regular expression.
516 : : *
517 : : * Returns an array of trigrams required by the regular expression, or NULL if
518 : : * the regular expression was too complex to analyze. In addition, a packed
519 : : * graph representation of the regex is returned into *graph. The results
520 : : * must be allocated in rcontext (which might or might not be the current
521 : : * context).
522 : : */
523 : : TRGM *
4722 tgl@sss.pgh.pa.us 524 :CBC 65 : createTrgmNFA(text *text_re, Oid collation,
525 : : TrgmPackedGraph **graph, MemoryContext rcontext)
526 : : {
527 : : TRGM *trg;
528 : : regex_t regex;
529 : : MemoryContext tmpcontext;
530 : : MemoryContext oldcontext;
531 : :
532 : : /*
533 : : * This processing generates a great deal of cruft, which we'd like to
534 : : * clean up before returning (since this function may be called in a
535 : : * query-lifespan memory context). Make a temp context we can work in so
536 : : * that cleanup is easy.
537 : : */
4723 538 : 65 : tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
539 : : "createTrgmNFA temporary context",
540 : : ALLOCSET_DEFAULT_SIZES);
541 : 65 : oldcontext = MemoryContextSwitchTo(tmpcontext);
542 : :
543 : : /*
544 : : * Stage 1: Compile the regexp into a NFA, using the regexp library.
545 : : */
546 : : #ifdef IGNORECASE
1679 547 : 65 : RE_compile(®ex, text_re,
548 : : REG_ADVANCED | REG_NOSUB | REG_ICASE, collation);
549 : : #else
550 : : RE_compile(®ex, text_re,
551 : : REG_ADVANCED | REG_NOSUB, collation);
552 : : #endif
553 : :
1072 tmunro@postgresql.or 554 : 65 : trg = createTrgmNFAInternal(®ex, graph, rcontext);
555 : :
556 : : /* Clean up all the cruft we created (including regex) */
4723 tgl@sss.pgh.pa.us 557 : 65 : MemoryContextSwitchTo(oldcontext);
558 : 65 : MemoryContextDelete(tmpcontext);
559 : :
560 : 65 : return trg;
561 : : }
562 : :
563 : : /*
564 : : * Body of createTrgmNFA, exclusive of regex compilation/freeing.
565 : : */
566 : : static TRGM *
567 : 65 : createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
568 : : MemoryContext rcontext)
569 : : {
570 : : TRGM *trg;
571 : : TrgmNFA trgmNFA;
572 : :
573 : 65 : trgmNFA.regex = regex;
574 : :
575 : : /* Collect color information from the regex */
576 : 65 : getColorInfo(regex, &trgmNFA);
577 : :
578 : : #ifdef TRGM_REGEXP_DEBUG
579 : : printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors);
580 : : #endif
581 : :
582 : : /*
583 : : * Stage 2: Create an expanded graph from the source NFA.
584 : : */
585 : 65 : transformGraph(&trgmNFA);
586 : :
587 : : #ifdef TRGM_REGEXP_DEBUG
588 : : printTrgmNFA(&trgmNFA);
589 : : #endif
590 : :
591 : : /*
592 : : * Fail if we were unable to make a nontrivial graph, ie it is possible to
593 : : * get from the initial state to the final state without reading any
594 : : * predictable trigram.
595 : : */
3308 596 [ + + ]: 65 : if (trgmNFA.initState->flags & TSTATE_FIN)
4723 597 : 9 : return NULL;
598 : :
599 : : /*
600 : : * Stage 3: Select color trigrams to expand. Fail if too many trigrams.
601 : : */
602 [ + + ]: 56 : if (!selectColorTrigrams(&trgmNFA))
603 : 3 : return NULL;
604 : :
605 : : /*
606 : : * Stage 4: Expand color trigrams and pack graph into final
607 : : * representation.
608 : : */
609 : 53 : trg = expandColorTrigrams(&trgmNFA, rcontext);
610 : :
611 : 53 : *graph = packGraph(&trgmNFA, rcontext);
612 : :
613 : : #ifdef TRGM_REGEXP_DEBUG
614 : : printTrgmPackedGraph(*graph, trg);
615 : : #endif
616 : :
617 : 53 : return trg;
618 : : }
619 : :
620 : : /*
621 : : * Main entry point for evaluating a graph during index scanning.
622 : : *
623 : : * The check[] array is indexed by trigram number (in the array of simple
624 : : * trigrams returned by createTrgmNFA), and holds true for those trigrams
625 : : * that are present in the index entry being checked.
626 : : */
627 : : bool
628 : 3557 : trigramsMatchGraph(TrgmPackedGraph *graph, bool *check)
629 : : {
630 : : int i,
631 : : j,
632 : : k,
633 : : queueIn,
634 : : queueOut;
635 : :
636 : : /*
637 : : * Reset temporary working areas.
638 : : */
639 : 3557 : memset(graph->colorTrigramsActive, 0,
640 : 3557 : sizeof(bool) * graph->colorTrigramsCount);
641 : 3557 : memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount);
642 : :
643 : : /*
644 : : * Check which color trigrams were matched. A match for any simple
645 : : * trigram associated with a color trigram counts as a match of the color
646 : : * trigram.
647 : : */
648 : 3557 : j = 0;
649 [ + + ]: 11039 : for (i = 0; i < graph->colorTrigramsCount; i++)
650 : : {
651 : 7482 : int cnt = graph->colorTrigramGroups[i];
652 : :
653 [ + + ]: 166735 : for (k = j; k < j + cnt; k++)
654 : : {
655 [ + + ]: 163105 : if (check[k])
656 : : {
657 : : /*
658 : : * Found one matched trigram in the group. Can skip the rest
659 : : * of them and go to the next group.
660 : : */
661 : 3852 : graph->colorTrigramsActive[i] = true;
662 : 3852 : break;
663 : : }
664 : : }
665 : 7482 : j = j + cnt;
666 : : }
667 : :
668 : : /*
669 : : * Initialize the statesQueue to hold just the initial state. Note:
670 : : * statesQueue has room for statesCount entries, which is certainly enough
671 : : * since no state will be put in the queue more than once. The
672 : : * statesActive array marks which states have been queued.
673 : : */
674 : 3557 : graph->statesActive[0] = true;
675 : 3557 : graph->statesQueue[0] = 0;
676 : 3557 : queueIn = 0;
677 : 3557 : queueOut = 1;
678 : :
679 : : /* Process queued states as long as there are any. */
680 [ + + ]: 7657 : while (queueIn < queueOut)
681 : : {
682 : 7521 : int stateno = graph->statesQueue[queueIn++];
683 : 7521 : TrgmPackedState *state = &graph->states[stateno];
684 : 7521 : int cnt = state->arcsCount;
685 : :
686 : : /* Loop over state's out-arcs */
687 [ + + ]: 15156 : for (i = 0; i < cnt; i++)
688 : : {
689 : 11056 : TrgmPackedArc *arc = &state->arcs[i];
690 : :
691 : : /*
692 : : * If corresponding color trigram is present then activate the
693 : : * corresponding state. We're done if that's the final state,
694 : : * otherwise queue the state if it's not been queued already.
695 : : */
696 [ + + ]: 11056 : if (graph->colorTrigramsActive[arc->colorTrgm])
697 : : {
698 : 7724 : int nextstate = arc->targetState;
699 : :
700 [ + + ]: 7724 : if (nextstate == 1)
701 : 3421 : return true; /* success: final state is reachable */
702 : :
703 [ + + ]: 4303 : if (!graph->statesActive[nextstate])
704 : : {
705 : 4246 : graph->statesActive[nextstate] = true;
706 : 4246 : graph->statesQueue[queueOut++] = nextstate;
707 : : }
708 : : }
709 : : }
710 : : }
711 : :
712 : : /* Queue is empty, so match fails. */
713 : 136 : return false;
714 : : }
715 : :
716 : : /*
717 : : * Compile regex string into struct at *regex.
718 : : * NB: pg_regfree must be applied to regex if this completes successfully.
719 : : */
720 : : static void
721 : 65 : RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
722 : : {
723 [ - + - - : 65 : int text_re_len = VARSIZE_ANY_EXHDR(text_re);
- - - - -
+ ]
724 [ - + ]: 65 : char *text_re_val = VARDATA_ANY(text_re);
725 : : pg_wchar *pattern;
726 : : int pattern_len;
727 : : int regcomp_result;
728 : : char errMsg[100];
729 : :
730 : : /* Convert pattern string to wide characters */
731 : 65 : pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
732 : 65 : pattern_len = pg_mb2wchar_with_len(text_re_val,
733 : : pattern,
734 : : text_re_len);
735 : :
736 : : /* Compile regex */
737 : 65 : regcomp_result = pg_regcomp(regex,
738 : : pattern,
739 : : pattern_len,
740 : : cflags,
741 : : collation);
742 : :
743 : 65 : pfree(pattern);
744 : :
745 [ - + ]: 65 : if (regcomp_result != REG_OKAY)
746 : : {
747 : : /* re didn't compile (no need for pg_regfree, if so) */
4723 tgl@sss.pgh.pa.us 748 :UBC 0 : pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg));
749 [ # # ]: 0 : ereport(ERROR,
750 : : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
751 : : errmsg("invalid regular expression: %s", errMsg)));
752 : : }
4723 tgl@sss.pgh.pa.us 753 :CBC 65 : }
754 : :
755 : :
756 : : /*---------------------
757 : : * Subroutines for pre-processing the color map (stage 1).
758 : : *---------------------
759 : : */
760 : :
761 : : /*
762 : : * Fill TrgmColorInfo structure for each color using regex export functions.
763 : : */
764 : : static void
765 : 65 : getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
766 : : {
767 : 65 : int colorsCount = pg_reg_getnumcolors(regex);
768 : : int i;
769 : :
770 : 65 : trgmNFA->ncolors = colorsCount;
771 : 65 : trgmNFA->colorInfo = (TrgmColorInfo *)
772 : 65 : palloc0(colorsCount * sizeof(TrgmColorInfo));
773 : :
774 : : /*
775 : : * Loop over colors, filling TrgmColorInfo about each. Note we include
776 : : * WHITE (0) even though we know it'll be reported as non-expandable.
777 : : */
778 [ + + ]: 596 : for (i = 0; i < colorsCount; i++)
779 : : {
780 : 531 : TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i];
781 : 531 : int charsCount = pg_reg_getnumcharacters(regex, i);
782 : : pg_wchar *chars;
783 : : int j;
784 : :
785 [ + + - + ]: 531 : if (charsCount < 0 || charsCount > COLOR_COUNT_LIMIT)
786 : : {
787 : : /* Non expandable, or too large to work with */
788 : 325 : colorInfo->expandable = false;
789 : 325 : continue;
790 : : }
791 : :
792 : 206 : colorInfo->expandable = true;
793 : 206 : colorInfo->containsNonWord = false;
100 michael@paquier.xyz 794 :GNC 206 : colorInfo->wordChars = palloc_array(trgm_mb_char, charsCount);
4723 tgl@sss.pgh.pa.us 795 :CBC 206 : colorInfo->wordCharsCount = 0;
796 : :
797 : : /* Extract all the chars in this color */
100 michael@paquier.xyz 798 :GNC 206 : chars = palloc_array(pg_wchar, charsCount);
4723 tgl@sss.pgh.pa.us 799 :CBC 206 : pg_reg_getcharacters(regex, i, chars, charsCount);
800 : :
801 : : /*
802 : : * Convert characters back to multibyte form, and save only those that
803 : : * are word characters. Set "containsNonWord" if any non-word
804 : : * character. (Note: it'd probably be nicer to keep the chars in
805 : : * pg_wchar format for now, but ISWORDCHR wants to see multibyte.)
806 : : */
807 [ + + ]: 991 : for (j = 0; j < charsCount; j++)
808 : : {
809 : : trgm_mb_char c;
67 tmunro@postgresql.or 810 : 785 : int clen = convertPgWchar(chars[j], &c);
811 : :
812 [ + + ]: 785 : if (!clen)
4723 tgl@sss.pgh.pa.us 813 : 365 : continue; /* ok to ignore it altogether */
67 tmunro@postgresql.or 814 [ + + ]: 420 : if (ISWORDCHR(c.bytes, clen))
4723 tgl@sss.pgh.pa.us 815 : 395 : colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
816 : : else
817 : 25 : colorInfo->containsNonWord = true;
818 : : }
819 : :
820 : 206 : pfree(chars);
821 : : }
822 : 65 : }
823 : :
824 : : /*
825 : : * Convert pg_wchar to multibyte format.
826 : : * Returns 0 if the character should be ignored completely, else returns its
827 : : * byte length.
828 : : */
829 : : static int
830 : 785 : convertPgWchar(pg_wchar c, trgm_mb_char *result)
831 : : {
832 : : /* "s" has enough space for a multibyte character and a trailing NUL */
833 : : char s[MAX_MULTIBYTE_CHAR_LEN + 1];
834 : : int clen;
835 : :
836 : : /*
837 : : * We can ignore the NUL character, since it can never appear in a PG text
838 : : * string. This avoids the need for various special cases when
839 : : * reconstructing trigrams.
840 : : */
841 [ - + ]: 785 : if (c == 0)
67 tmunro@postgresql.or 842 :UBC 0 : return 0;
843 : :
844 : : /* Do the conversion, making sure the result is NUL-terminated */
4723 tgl@sss.pgh.pa.us 845 :CBC 785 : memset(s, 0, sizeof(s));
67 tmunro@postgresql.or 846 : 785 : clen = pg_wchar2mb_with_len(&c, s, 1);
847 : :
848 : : /*
849 : : * In IGNORECASE mode, we can ignore uppercase characters. We assume that
850 : : * the regex engine generated both uppercase and lowercase equivalents
851 : : * within each color, since we used the REG_ICASE option; so there's no
852 : : * need to process the uppercase version.
853 : : *
854 : : * XXX this code is dependent on the assumption that str_tolower() works
855 : : * the same as the regex engine's internal case folding machinery. Might
856 : : * be wiser to expose pg_wc_tolower and test whether c ==
857 : : * pg_wc_tolower(c). On the other hand, the trigrams in the index were
858 : : * created using str_tolower(), so we're probably screwed if there's any
859 : : * incompatibility anyway.
860 : : */
861 : : #ifdef IGNORECASE
862 : : {
863 : 785 : char *lowerCased = str_tolower(s, clen, DEFAULT_COLLATION_OID);
864 : :
4723 tgl@sss.pgh.pa.us 865 [ + + ]: 785 : if (strcmp(lowerCased, s) != 0)
866 : : {
867 : 365 : pfree(lowerCased);
67 tmunro@postgresql.or 868 : 365 : return 0;
869 : : }
4723 tgl@sss.pgh.pa.us 870 : 420 : pfree(lowerCased);
871 : : }
872 : : #endif
873 : :
874 : : /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
4068 875 : 420 : memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
67 tmunro@postgresql.or 876 : 420 : return clen;
877 : : }
878 : :
879 : :
880 : : /*---------------------
881 : : * Subroutines for expanding original NFA graph into a trigram graph (stage 2).
882 : : *---------------------
883 : : */
884 : :
885 : : /*
886 : : * Transform the graph, given a regex and extracted color information.
887 : : *
888 : : * We create and process a queue of expanded-graph states until all the states
889 : : * are processed.
890 : : *
891 : : * This algorithm may be stopped due to resource limitation. In this case we
892 : : * force every unprocessed branch to immediately finish with matching (this
893 : : * can give us false positives but no false negatives) by marking all
894 : : * unprocessed states as final.
895 : : */
896 : : static void
4723 tgl@sss.pgh.pa.us 897 : 65 : transformGraph(TrgmNFA *trgmNFA)
898 : : {
899 : : HASHCTL hashCtl;
900 : : TrgmStateKey initkey;
901 : : TrgmState *initstate;
902 : : ListCell *lc;
903 : :
904 : : /* Initialize this stage's workspace in trgmNFA struct */
905 : 65 : trgmNFA->queue = NIL;
906 : 65 : trgmNFA->keysQueue = NIL;
907 : 65 : trgmNFA->arcsCount = 0;
908 : 65 : trgmNFA->overflowed = false;
909 : :
910 : : /* Create hashtable for states */
911 : 65 : hashCtl.keysize = sizeof(TrgmStateKey);
912 : 65 : hashCtl.entrysize = sizeof(TrgmState);
913 : 65 : hashCtl.hcxt = CurrentMemoryContext;
914 : 65 : trgmNFA->states = hash_create("Trigram NFA",
915 : : 1024,
916 : : &hashCtl,
917 : : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
3257 918 : 65 : trgmNFA->nstates = 0;
919 : :
920 : : /* Create initial state: ambiguous prefix, NFA's initial state */
4723 921 [ - + - - : 65 : MemSet(&initkey, 0, sizeof(initkey));
- - - - -
- ]
922 : 65 : initkey.prefix.colors[0] = COLOR_UNKNOWN;
923 : 65 : initkey.prefix.colors[1] = COLOR_UNKNOWN;
924 : 65 : initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
925 : :
926 : 65 : initstate = getState(trgmNFA, &initkey);
3308 927 : 65 : initstate->flags |= TSTATE_INIT;
4723 928 : 65 : trgmNFA->initState = initstate;
929 : :
930 : : /*
931 : : * Recursively build the expanded graph by processing queue of states
932 : : * (breadth-first search). getState already put initstate in the queue.
933 : : * Note that getState will append new states to the queue within the loop,
934 : : * too; this works as long as we don't do repeat fetches using the "lc"
935 : : * pointer.
936 : : */
1595 937 [ + - + + : 729 : foreach(lc, trgmNFA->queue)
+ + ]
938 : : {
939 : 664 : TrgmState *state = (TrgmState *) lfirst(lc);
940 : :
941 : : /*
942 : : * If we overflowed then just mark state as final. Otherwise do
943 : : * actual processing.
944 : : */
4723 945 [ + + ]: 664 : if (trgmNFA->overflowed)
3308 946 : 9 : state->flags |= TSTATE_FIN;
947 : : else
4723 948 : 655 : processState(trgmNFA, state);
949 : :
950 : : /* Did we overflow? */
951 [ + - + + ]: 1328 : if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS ||
952 : 664 : hash_get_num_entries(trgmNFA->states) > MAX_EXPANDED_STATES)
953 : 12 : trgmNFA->overflowed = true;
954 : : }
955 : 65 : }
956 : :
957 : : /*
958 : : * Process one state: add enter keys and then add outgoing arcs.
959 : : */
960 : : static void
961 : 655 : processState(TrgmNFA *trgmNFA, TrgmState *state)
962 : : {
963 : : ListCell *lc;
964 : :
965 : : /* keysQueue should be NIL already, but make sure */
966 : 655 : trgmNFA->keysQueue = NIL;
967 : :
968 : : /*
969 : : * Add state's own key, and then process all keys added to keysQueue until
970 : : * queue is finished. But we can quit if the state gets marked final.
971 : : */
972 : 655 : addKey(trgmNFA, state, &state->stateKey);
1595 973 [ + + + + : 1266 : foreach(lc, trgmNFA->keysQueue)
+ + ]
974 : : {
975 : 692 : TrgmStateKey *key = (TrgmStateKey *) lfirst(lc);
976 : :
977 [ + + ]: 692 : if (state->flags & TSTATE_FIN)
978 : 81 : break;
4723 979 : 611 : addKey(trgmNFA, state, key);
980 : : }
981 : :
982 : : /* Release keysQueue to clean up for next cycle */
1595 983 : 655 : list_free(trgmNFA->keysQueue);
984 : 655 : trgmNFA->keysQueue = NIL;
985 : :
986 : : /*
987 : : * Add outgoing arcs only if state isn't final (we have no interest in
988 : : * outgoing arcs if we already match)
989 : : */
3308 990 [ + + ]: 655 : if (!(state->flags & TSTATE_FIN))
4723 991 : 571 : addArcs(trgmNFA, state);
992 : 655 : }
993 : :
994 : : /*
995 : : * Add the given enter key into the state's enterKeys list, and determine
996 : : * whether this should result in any further enter keys being added.
997 : : * If so, add those keys to keysQueue so that processState will handle them.
998 : : *
999 : : * If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
1000 : : * This situation means that we can reach the final state from this expanded
1001 : : * state without reading any predictable trigram, so we must consider this
1002 : : * state as an accepting one.
1003 : : *
1004 : : * The given key could be a duplicate of one already in enterKeys, or be
1005 : : * redundant with some enterKeys. So we check that before doing anything.
1006 : : *
1007 : : * Note that we don't generate any actual arcs here. addArcs will do that
1008 : : * later, after we have identified all the enter keys for this state.
1009 : : */
1010 : : static void
1011 : 1266 : addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
1012 : : {
1013 : : regex_arc_t *arcs;
1014 : : TrgmStateKey destKey;
1015 : : ListCell *cell;
1016 : : int i,
1017 : : arcsCount;
1018 : :
1019 : : /*
1020 : : * Ensure any pad bytes in destKey are zero, since it may get used as a
1021 : : * hashtable key by getState.
1022 : : */
1023 [ - + - - : 1266 : MemSet(&destKey, 0, sizeof(destKey));
- - - - -
- ]
1024 : :
1025 : : /*
1026 : : * Compare key to each existing enter key of the state to check for
1027 : : * redundancy. We can drop either old key(s) or the new key if we find
1028 : : * redundancy.
1029 : : */
2435 1030 [ + + + + : 1979 : foreach(cell, state->enterKeys)
+ + ]
1031 : : {
4723 1032 : 1016 : TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
1033 : :
1034 [ + + ]: 1016 : if (existingKey->nstate == key->nstate)
1035 : : {
1036 [ + + ]: 312 : if (prefixContains(&existingKey->prefix, &key->prefix))
1037 : : {
1038 : : /* This old key already covers the new key. Nothing to do */
1039 : 303 : return;
1040 : : }
1041 [ + + ]: 9 : if (prefixContains(&key->prefix, &existingKey->prefix))
1042 : : {
1043 : : /*
1044 : : * The new key covers this old key. Remove the old key, it's
1045 : : * no longer needed once we add this key to the list.
1046 : : */
2435 1047 : 6 : state->enterKeys = foreach_delete_current(state->enterKeys,
1048 : : cell);
1049 : : }
1050 : : }
1051 : : }
1052 : :
1053 : : /* No redundancy, so add this key to the state's list */
4723 1054 : 963 : state->enterKeys = lappend(state->enterKeys, key);
1055 : :
1056 : : /* If state is now known final, mark it and we're done */
1057 [ + + ]: 963 : if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
1058 : : {
3308 1059 : 84 : state->flags |= TSTATE_FIN;
4723 1060 : 84 : return;
1061 : : }
1062 : :
1063 : : /*
1064 : : * Loop through all outgoing arcs of the corresponding state in the
1065 : : * original NFA.
1066 : : */
1067 : 879 : arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
100 michael@paquier.xyz 1068 :GNC 879 : arcs = palloc_array(regex_arc_t, arcsCount);
4723 tgl@sss.pgh.pa.us 1069 :CBC 879 : pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1070 : :
1071 [ + + ]: 2384 : for (i = 0; i < arcsCount; i++)
1072 : : {
1073 : 1505 : regex_arc_t *arc = &arcs[i];
1074 : :
1075 [ + + ]: 1505 : if (pg_reg_colorisbegin(trgmNFA->regex, arc->co))
1076 : : {
1077 : : /*
1078 : : * Start of line/string (^). Trigram extraction treats start of
1079 : : * line same as start of word: double space prefix is added.
1080 : : * Hence, make an enter key showing we can reach the arc
1081 : : * destination with all-blank prefix.
1082 : : */
1083 : 246 : destKey.prefix.colors[0] = COLOR_BLANK;
1084 : 246 : destKey.prefix.colors[1] = COLOR_BLANK;
1085 : 246 : destKey.nstate = arc->to;
1086 : :
1087 : : /* Add enter key to this state */
1088 : 246 : addKeyToQueue(trgmNFA, &destKey);
1089 : : }
1090 [ + + ]: 1259 : else if (pg_reg_colorisend(trgmNFA->regex, arc->co))
1091 : : {
1092 : : /*
1093 : : * End of line/string ($). We must consider this arc as a
1094 : : * transition that doesn't read anything. The reason for adding
1095 : : * this enter key to the state is that if the arc leads to the
1096 : : * NFA's final state, we must mark this expanded state as final.
1097 : : */
1098 : 162 : destKey.prefix.colors[0] = COLOR_UNKNOWN;
1099 : 162 : destKey.prefix.colors[1] = COLOR_UNKNOWN;
1100 : 162 : destKey.nstate = arc->to;
1101 : :
1102 : : /* Add enter key to this state */
1103 : 162 : addKeyToQueue(trgmNFA, &destKey);
1104 : : }
1849 1105 [ + + ]: 1097 : else if (arc->co >= 0)
1106 : : {
1107 : : /* Regular color (including WHITE) */
4723 1108 : 893 : TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
1109 : :
1110 [ + - ]: 893 : if (colorInfo->expandable)
1111 : : {
1112 [ + + ]: 893 : if (colorInfo->containsNonWord &&
1113 [ + + ]: 55 : !validArcLabel(key, COLOR_BLANK))
1114 : : {
1115 : : /*
1116 : : * We can reach the arc destination after reading a
1117 : : * non-word character, but the prefix is not something
1118 : : * that addArc will accept with COLOR_BLANK, so no trigram
1119 : : * arc can get made for this transition. We must make an
1120 : : * enter key to show that the arc destination is
1121 : : * reachable. Set it up with an all-blank prefix, since
1122 : : * that corresponds to what the trigram extraction code
1123 : : * will do at a word starting boundary.
1124 : : */
1125 : 27 : destKey.prefix.colors[0] = COLOR_BLANK;
1126 : 27 : destKey.prefix.colors[1] = COLOR_BLANK;
1127 : 27 : destKey.nstate = arc->to;
1128 : 27 : addKeyToQueue(trgmNFA, &destKey);
1129 : : }
1130 : :
1131 [ + + ]: 893 : if (colorInfo->wordCharsCount > 0 &&
1132 [ + + ]: 838 : !validArcLabel(key, arc->co))
1133 : : {
1134 : : /*
1135 : : * We can reach the arc destination after reading a word
1136 : : * character, but the prefix is not something that addArc
1137 : : * will accept, so no trigram arc can get made for this
1138 : : * transition. We must make an enter key to show that the
1139 : : * arc destination is reachable. The prefix for the enter
1140 : : * key should reflect the info we have for this arc.
1141 : : */
1142 : 131 : destKey.prefix.colors[0] = key->prefix.colors[1];
1143 : 131 : destKey.prefix.colors[1] = arc->co;
1144 : 131 : destKey.nstate = arc->to;
1145 : 131 : addKeyToQueue(trgmNFA, &destKey);
1146 : : }
1147 : : }
1148 : : else
1149 : : {
1150 : : /*
1151 : : * Unexpandable color. Add enter key with ambiguous prefix,
1152 : : * showing we can reach the destination from this state, but
1153 : : * the preceding colors will be uncertain. (We do not set the
1154 : : * first prefix color to key->prefix.colors[1], because a
1155 : : * prefix of known followed by unknown is invalid.)
1156 : : */
4723 tgl@sss.pgh.pa.us 1157 :UBC 0 : destKey.prefix.colors[0] = COLOR_UNKNOWN;
1158 : 0 : destKey.prefix.colors[1] = COLOR_UNKNOWN;
1159 : 0 : destKey.nstate = arc->to;
1160 : 0 : addKeyToQueue(trgmNFA, &destKey);
1161 : : }
1162 : : }
1163 : : else
1164 : : {
1165 : : /* RAINBOW: treat as unexpandable color */
1849 tgl@sss.pgh.pa.us 1166 :CBC 204 : destKey.prefix.colors[0] = COLOR_UNKNOWN;
1167 : 204 : destKey.prefix.colors[1] = COLOR_UNKNOWN;
1168 : 204 : destKey.nstate = arc->to;
1169 : 204 : addKeyToQueue(trgmNFA, &destKey);
1170 : : }
1171 : : }
1172 : :
4723 1173 : 879 : pfree(arcs);
1174 : : }
1175 : :
1176 : : /*
1177 : : * Add copy of given key to keysQueue for later processing.
1178 : : */
1179 : : static void
1180 : 770 : addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key)
1181 : : {
100 michael@paquier.xyz 1182 :GNC 770 : TrgmStateKey *keyCopy = palloc_object(TrgmStateKey);
1183 : :
4723 tgl@sss.pgh.pa.us 1184 :CBC 770 : memcpy(keyCopy, key, sizeof(TrgmStateKey));
1185 : 770 : trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy);
1186 : 770 : }
1187 : :
1188 : : /*
1189 : : * Add outgoing arcs from given state, whose enter keys are all now known.
1190 : : */
1191 : : static void
1192 : 571 : addArcs(TrgmNFA *trgmNFA, TrgmState *state)
1193 : : {
1194 : : TrgmStateKey destKey;
1195 : : ListCell *cell;
1196 : : regex_arc_t *arcs;
1197 : : int arcsCount,
1198 : : i;
1199 : :
1200 : : /*
1201 : : * Ensure any pad bytes in destKey are zero, since it may get used as a
1202 : : * hashtable key by getState.
1203 : : */
1204 [ - + - - : 571 : MemSet(&destKey, 0, sizeof(destKey));
- - - - -
- ]
1205 : :
1206 : : /*
1207 : : * Iterate over enter keys associated with this expanded-graph state. This
1208 : : * includes both the state's own stateKey, and any enter keys we added to
1209 : : * it during addKey (which represent expanded-graph states that are not
1210 : : * distinguishable from this one by means of trigrams). For each such
1211 : : * enter key, examine all the out-arcs of the key's underlying NFA state,
1212 : : * and try to make a trigram arc leading to where the out-arc leads.
1213 : : * (addArc will deal with whether the arc is valid or not.)
1214 : : */
1215 [ + - + + : 1336 : foreach(cell, state->enterKeys)
+ + ]
1216 : : {
1217 : 765 : TrgmStateKey *key = (TrgmStateKey *) lfirst(cell);
1218 : :
1219 : 765 : arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
100 michael@paquier.xyz 1220 :GNC 765 : arcs = palloc_array(regex_arc_t, arcsCount);
4723 tgl@sss.pgh.pa.us 1221 :CBC 765 : pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1222 : :
1223 [ + + ]: 1946 : for (i = 0; i < arcsCount; i++)
1224 : : {
1225 : 1181 : regex_arc_t *arc = &arcs[i];
1226 : : TrgmColorInfo *colorInfo;
1227 : :
1228 : : /*
1229 : : * Ignore non-expandable colors; addKey already handled the case.
1230 : : *
1231 : : * We need no special check for WHITE or begin/end pseudocolors
1232 : : * here. We don't need to do any processing for them, and they
1233 : : * will be marked non-expandable since the regex engine will have
1234 : : * reported them that way. We do have to watch out for RAINBOW,
1235 : : * which has a negative color number.
1236 : : */
1848 1237 [ + + ]: 1181 : if (arc->co < 0)
1238 : 102 : continue;
1239 [ - + ]: 1079 : Assert(arc->co < trgmNFA->ncolors);
1240 : :
1241 : 1079 : colorInfo = &trgmNFA->colorInfo[arc->co];
4723 1242 [ + + ]: 1079 : if (!colorInfo->expandable)
1243 : 210 : continue;
1244 : :
1245 [ + + ]: 869 : if (colorInfo->containsNonWord)
1246 : : {
1247 : : /*
1248 : : * Color includes non-word character(s).
1249 : : *
1250 : : * Generate an arc, treating this transition as occurring on
1251 : : * BLANK. This allows word-ending trigrams to be manufactured
1252 : : * if possible.
1253 : : */
1254 : 55 : destKey.prefix.colors[0] = key->prefix.colors[1];
1255 : 55 : destKey.prefix.colors[1] = COLOR_BLANK;
1256 : 55 : destKey.nstate = arc->to;
1257 : :
1258 : 55 : addArc(trgmNFA, state, key, COLOR_BLANK, &destKey);
1259 : : }
1260 : :
1261 [ + + ]: 869 : if (colorInfo->wordCharsCount > 0)
1262 : : {
1263 : : /*
1264 : : * Color includes word character(s).
1265 : : *
1266 : : * Generate an arc. Color is pushed into prefix of target
1267 : : * state.
1268 : : */
1269 : 814 : destKey.prefix.colors[0] = key->prefix.colors[1];
1270 : 814 : destKey.prefix.colors[1] = arc->co;
1271 : 814 : destKey.nstate = arc->to;
1272 : :
1273 : 814 : addArc(trgmNFA, state, key, arc->co, &destKey);
1274 : : }
1275 : : }
1276 : :
1277 : 765 : pfree(arcs);
1278 : : }
1279 : 571 : }
1280 : :
1281 : : /*
1282 : : * Generate an out-arc of the expanded graph, if it's valid and not redundant.
1283 : : *
1284 : : * state: expanded-graph state we want to add an out-arc to
1285 : : * key: provides prefix colors (key->nstate is not used)
1286 : : * co: transition color
1287 : : * destKey: identifier for destination state of expanded graph
1288 : : */
1289 : : static void
1290 : 869 : addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
1291 : : TrgmColor co, TrgmStateKey *destKey)
1292 : : {
1293 : : TrgmArc *arc;
1294 : : ListCell *cell;
1295 : :
1296 : : /* Do nothing if this wouldn't be a valid arc label trigram */
1297 [ + + ]: 869 : if (!validArcLabel(key, co))
1298 : 137 : return;
1299 : :
1300 : : /*
1301 : : * Check if we are going to reach key which is covered by a key which is
1302 : : * already listed in this state. If so arc is useless: the NFA can bypass
1303 : : * it through a path that doesn't require any predictable trigram, so
1304 : : * whether the arc's trigram is present or not doesn't really matter.
1305 : : */
1306 [ + - + + : 1767 : foreach(cell, state->enterKeys)
+ + ]
1307 : : {
1308 : 1041 : TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
1309 : :
1310 [ + + + + ]: 1066 : if (existingKey->nstate == destKey->nstate &&
1311 : 25 : prefixContains(&existingKey->prefix, &destKey->prefix))
1312 : 6 : return;
1313 : : }
1314 : :
1315 : : /* Checks were successful, add new arc */
100 michael@paquier.xyz 1316 :GNC 726 : arc = palloc_object(TrgmArc);
4723 tgl@sss.pgh.pa.us 1317 :CBC 726 : arc->target = getState(trgmNFA, destKey);
1318 : 726 : arc->ctrgm.colors[0] = key->prefix.colors[0];
1319 : 726 : arc->ctrgm.colors[1] = key->prefix.colors[1];
1320 : 726 : arc->ctrgm.colors[2] = co;
1321 : :
1322 : 726 : state->arcs = lappend(state->arcs, arc);
1323 : 726 : trgmNFA->arcsCount++;
1324 : : }
1325 : :
1326 : : /*
1327 : : * Can we make a valid trigram arc label from the given prefix and arc color?
1328 : : *
1329 : : * This is split out so that tests in addKey and addArc will stay in sync.
1330 : : */
1331 : : static bool
1332 : 1762 : validArcLabel(TrgmStateKey *key, TrgmColor co)
1333 : : {
1334 : : /*
1335 : : * We have to know full trigram in order to add outgoing arc. So we can't
1336 : : * do it if prefix is ambiguous.
1337 : : */
1338 [ + + ]: 1762 : if (key->prefix.colors[0] == COLOR_UNKNOWN)
1339 : 233 : return false;
1340 : :
1341 : : /* If key->prefix.colors[0] isn't unknown, its second color isn't either */
1342 [ - + ]: 1529 : Assert(key->prefix.colors[1] != COLOR_UNKNOWN);
1343 : : /* And we should not be called with an unknown arc color anytime */
1344 [ - + ]: 1529 : Assert(co != COLOR_UNKNOWN);
1345 : :
1346 : : /*
1347 : : * We don't bother with making arcs representing three non-word
1348 : : * characters, since that's useless for trigram extraction.
1349 : : */
1350 [ + + ]: 1529 : if (key->prefix.colors[0] == COLOR_BLANK &&
1351 [ + + + + ]: 164 : key->prefix.colors[1] == COLOR_BLANK &&
1352 : : co == COLOR_BLANK)
1353 : 12 : return false;
1354 : :
1355 : : /*
1356 : : * We also reject nonblank-blank-anything. The nonblank-blank-nonblank
1357 : : * case doesn't correspond to any trigram the trigram extraction code
1358 : : * would make. The nonblank-blank-blank case is also not possible with
1359 : : * RPADDING = 1. (Note that in many cases we'd fail to generate such a
1360 : : * trigram even if it were valid, for example processing "foo bar" will
1361 : : * not result in considering the trigram "o ". So if you want to support
1362 : : * RPADDING = 2, there's more to do than just twiddle this test.)
1363 : : */
1364 [ + + ]: 1517 : if (key->prefix.colors[0] != COLOR_BLANK &&
1365 [ + + ]: 1365 : key->prefix.colors[1] == COLOR_BLANK)
1366 : 50 : return false;
1367 : :
1368 : : /*
1369 : : * Other combinations involving blank are valid, in particular we assume
1370 : : * blank-blank-nonblank is valid, which presumes that LPADDING is 2.
1371 : : *
1372 : : * Note: Using again the example "foo bar", we will not consider the
1373 : : * trigram " b", though this trigram would be found by the trigram
1374 : : * extraction code. Since we will find " ba", it doesn't seem worth
1375 : : * trying to hack the algorithm to generate the additional trigram.
1376 : : */
1377 : :
1378 : : /* arc label is valid */
1379 : 1467 : return true;
1380 : : }
1381 : :
1382 : : /*
1383 : : * Get state of expanded graph for given state key,
1384 : : * and queue the state for processing if it didn't already exist.
1385 : : */
1386 : : static TrgmState *
1387 : 791 : getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
1388 : : {
1389 : : TrgmState *state;
1390 : : bool found;
1391 : :
1392 : 791 : state = (TrgmState *) hash_search(trgmNFA->states, key, HASH_ENTER,
1393 : : &found);
1394 [ + + ]: 791 : if (!found)
1395 : : {
1396 : : /* New state: initialize and queue it */
1397 : 664 : state->arcs = NIL;
1398 : 664 : state->enterKeys = NIL;
3308 1399 : 664 : state->flags = 0;
1400 : : /* states are initially given negative numbers */
3257 1401 : 664 : state->snumber = -(++trgmNFA->nstates);
4723 1402 : 664 : state->parent = NULL;
3257 1403 : 664 : state->tentFlags = 0;
3308 1404 : 664 : state->tentParent = NULL;
1405 : :
4723 1406 : 664 : trgmNFA->queue = lappend(trgmNFA->queue, state);
1407 : : }
1408 : 791 : return state;
1409 : : }
1410 : :
1411 : : /*
1412 : : * Check if prefix1 "contains" prefix2.
1413 : : *
1414 : : * "contains" means that any exact prefix (with no ambiguity) that satisfies
1415 : : * prefix2 also satisfies prefix1.
1416 : : */
1417 : : static bool
1418 : 346 : prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2)
1419 : : {
1420 [ + + ]: 346 : if (prefix1->colors[1] == COLOR_UNKNOWN)
1421 : : {
1422 : : /* Fully ambiguous prefix contains everything */
1423 : 306 : return true;
1424 : : }
1425 [ + + ]: 40 : else if (prefix1->colors[0] == COLOR_UNKNOWN)
1426 : : {
1427 : : /*
1428 : : * Prefix with only first unknown color contains every prefix with
1429 : : * same second color.
1430 : : */
1431 [ + + ]: 12 : if (prefix1->colors[1] == prefix2->colors[1])
1432 : 3 : return true;
1433 : : else
1434 : 9 : return false;
1435 : : }
1436 : : else
1437 : : {
1438 : : /* Exact prefix contains only the exact same prefix */
1439 [ + + ]: 28 : if (prefix1->colors[0] == prefix2->colors[0] &&
1440 [ + + ]: 13 : prefix1->colors[1] == prefix2->colors[1])
1441 : 6 : return true;
1442 : : else
1443 : 22 : return false;
1444 : : }
1445 : : }
1446 : :
1447 : :
1448 : : /*---------------------
1449 : : * Subroutines for expanding color trigrams into regular trigrams (stage 3).
1450 : : *---------------------
1451 : : */
1452 : :
1453 : : /*
1454 : : * Get vector of all color trigrams in graph and select which of them
1455 : : * to expand into simple trigrams.
1456 : : *
1457 : : * Returns true if OK, false if exhausted resource limits.
1458 : : */
1459 : : static bool
1460 : 56 : selectColorTrigrams(TrgmNFA *trgmNFA)
1461 : : {
1462 : : HASH_SEQ_STATUS scan_status;
1463 : 56 : int arcsCount = trgmNFA->arcsCount,
1464 : : i;
1465 : : TrgmState *state;
1466 : : ColorTrgmInfo *colorTrgms;
1467 : : int64 totalTrgmCount;
1468 : : float4 totalTrgmPenalty;
1469 : : int cnumber;
1470 : :
1471 : : /* Collect color trigrams from all arcs */
100 michael@paquier.xyz 1472 :GNC 56 : colorTrgms = palloc0_array(ColorTrgmInfo, arcsCount);
4723 tgl@sss.pgh.pa.us 1473 :CBC 56 : trgmNFA->colorTrgms = colorTrgms;
1474 : :
1475 : 56 : i = 0;
1476 : 56 : hash_seq_init(&scan_status, trgmNFA->states);
1477 [ + + ]: 711 : while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1478 : : {
1479 : : ListCell *cell;
1480 : :
1481 [ + + + + : 1381 : foreach(cell, state->arcs)
+ + ]
1482 : : {
1483 : 726 : TrgmArc *arc = (TrgmArc *) lfirst(cell);
100 michael@paquier.xyz 1484 :GNC 726 : TrgmArcInfo *arcInfo = palloc_object(TrgmArcInfo);
3257 tgl@sss.pgh.pa.us 1485 :CBC 726 : ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1486 : :
4723 1487 : 726 : arcInfo->source = state;
1488 : 726 : arcInfo->target = arc->target;
3257 1489 : 726 : trgmInfo->ctrgm = arc->ctrgm;
1490 : 726 : trgmInfo->cnumber = -1;
1491 : : /* count and penalty will be set below */
1492 : 726 : trgmInfo->expanded = true;
1493 : 726 : trgmInfo->arcs = list_make1(arcInfo);
4723 1494 : 726 : i++;
1495 : : }
1496 : : }
1497 [ - + ]: 56 : Assert(i == arcsCount);
1498 : :
1499 : : /* Remove duplicates, merging their arcs lists */
1500 [ + + ]: 56 : if (arcsCount >= 2)
1501 : : {
1502 : : ColorTrgmInfo *p1,
1503 : : *p2;
1504 : :
1505 : : /* Sort trigrams to ease duplicate detection */
1506 : 34 : qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp);
1507 : :
1508 : : /* p1 is probe point, p2 is last known non-duplicate. */
1509 : 34 : p2 = colorTrgms;
1510 [ + + ]: 706 : for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++)
1511 : : {
1512 [ + + ]: 672 : if (colorTrgmInfoCmp(p1, p2) > 0)
1513 : : {
1514 : 224 : p2++;
1515 : 224 : *p2 = *p1;
1516 : : }
1517 : : else
1518 : : {
1519 : 448 : p2->arcs = list_concat(p2->arcs, p1->arcs);
1520 : : }
1521 : : }
1522 : 34 : trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1;
1523 : : }
1524 : : else
1525 : : {
1526 : 22 : trgmNFA->colorTrgmsCount = arcsCount;
1527 : : }
1528 : :
1529 : : /*
1530 : : * Count number of simple trigrams generated by each color trigram, and
1531 : : * also compute a penalty value, which is the number of simple trigrams
1532 : : * times a multiplier that depends on its whitespace content.
1533 : : *
1534 : : * Note: per-color-trigram counts cannot overflow an int so long as
1535 : : * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
1536 : : * 1290. However, the grand total totalTrgmCount might conceivably
1537 : : * overflow an int, so we use int64 for that within this routine. Also,
1538 : : * penalties are calculated in float4 arithmetic to avoid any overflow
1539 : : * worries.
1540 : : */
1541 : 56 : totalTrgmCount = 0;
4362 1542 : 56 : totalTrgmPenalty = 0.0f;
4723 1543 [ + + ]: 334 : for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1544 : : {
1545 : 278 : ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1546 : : int j,
4362 1547 : 278 : count = 1,
1548 : 278 : typeIndex = 0;
1549 : :
4723 1550 [ + + ]: 1112 : for (j = 0; j < 3; j++)
1551 : : {
1552 : 834 : TrgmColor c = trgmInfo->ctrgm.colors[j];
1553 : :
4362 1554 : 834 : typeIndex *= 2;
1555 [ + + ]: 834 : if (c == COLOR_BLANK)
1556 : 113 : typeIndex++;
1557 : : else
4723 1558 : 721 : count *= trgmNFA->colorInfo[c].wordCharsCount;
1559 : : }
1560 : 278 : trgmInfo->count = count;
1561 : 278 : totalTrgmCount += count;
4362 1562 : 278 : trgmInfo->penalty = penalties[typeIndex] * (float4) count;
1563 : 278 : totalTrgmPenalty += trgmInfo->penalty;
1564 : : }
1565 : :
1566 : : /* Sort color trigrams in descending order of their penalties */
4723 1567 : 56 : qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1568 : : colorTrgmInfoPenaltyCmp);
1569 : :
1570 : : /*
1571 : : * Remove color trigrams from the graph so long as total penalty of color
1572 : : * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
1573 : : * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
1574 : : * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
1575 : : * penalty, since those are the most promising for reducing the total
1576 : : * penalty. When removing a color trigram we have to merge states
1577 : : * connected by arcs labeled with that trigram. It's necessary to not
1578 : : * merge initial and final states, because our graph becomes useless if
1579 : : * that happens; so we cannot always remove the trigram we'd prefer to.
1580 : : */
4362 1581 [ + + ]: 204 : for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1582 : : {
4723 1583 : 179 : ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1584 : 179 : bool canRemove = true;
1585 : : ListCell *cell;
1586 : :
1587 : : /* Done if we've reached the target */
4362 1588 [ + + ]: 179 : if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
1589 : 31 : break;
1590 : :
1591 : : #ifdef TRGM_REGEXP_DEBUG
1592 : : fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
1593 : : trgmInfo->ctrgm.colors[0],
1594 : : trgmInfo->ctrgm.colors[1],
1595 : : trgmInfo->ctrgm.colors[2],
1596 : : trgmInfo->penalty,
1597 : : list_length(trgmInfo->arcs));
1598 : : #endif
1599 : :
1600 : : /*
1601 : : * Does any arc of this color trigram connect initial and final
1602 : : * states? If so we can't remove it.
1603 : : */
4723 1604 [ + - + + : 306 : foreach(cell, trgmInfo->arcs)
+ + ]
1605 : : {
1606 : 191 : TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1607 : 191 : TrgmState *source = arcInfo->source,
1608 : 191 : *target = arcInfo->target;
1609 : : int source_flags,
1610 : : target_flags;
1611 : :
1612 : : #ifdef TRGM_REGEXP_DEBUG
1613 : : fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
1614 : : -target->snumber, target->flags,
1615 : : -source->snumber, source->flags);
1616 : : #endif
1617 : :
1618 : : /* examine parent states, if any merging has already happened */
1619 [ + + ]: 341 : while (source->parent)
1620 : 150 : source = source->parent;
1621 [ + + ]: 407 : while (target->parent)
1622 : 216 : target = target->parent;
1623 : :
1624 : : #ifdef TRGM_REGEXP_DEBUG
1625 : : fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
1626 : : -target->snumber, target->flags,
1627 : : -source->snumber, source->flags);
1628 : : #endif
1629 : :
1630 : : /* we must also consider merges we are planning right now */
3257 1631 : 191 : source_flags = source->flags | source->tentFlags;
3308 1632 [ + + ]: 195 : while (source->tentParent)
1633 : : {
1634 : 4 : source = source->tentParent;
3257 1635 : 4 : source_flags |= source->flags | source->tentFlags;
1636 : : }
1637 : 191 : target_flags = target->flags | target->tentFlags;
3308 1638 [ + + ]: 206 : while (target->tentParent)
1639 : : {
1640 : 15 : target = target->tentParent;
3257 1641 : 15 : target_flags |= target->flags | target->tentFlags;
1642 : : }
1643 : :
1644 : : #ifdef TRGM_REGEXP_DEBUG
1645 : : fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
1646 : : -target->snumber, target_flags,
1647 : : -source->snumber, source_flags);
1648 : : #endif
1649 : :
1650 : : /* would fully-merged state have both INIT and FIN set? */
3308 1651 [ + + ]: 191 : if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
1652 : : (TSTATE_INIT | TSTATE_FIN))
1653 : : {
4723 1654 : 33 : canRemove = false;
1655 : 33 : break;
1656 : : }
1657 : :
1658 : : /* ok so far, so remember planned merge */
3308 1659 [ + + ]: 158 : if (source != target)
1660 : : {
1661 : : #ifdef TRGM_REGEXP_DEBUG
1662 : : fprintf(stderr, " ... tentatively merging s%d into s%d\n",
1663 : : -target->snumber, -source->snumber);
1664 : : #endif
1665 : 113 : target->tentParent = source;
3257 1666 : 113 : source->tentFlags |= target_flags;
1667 : : }
1668 : : }
1669 : :
1670 : : /*
1671 : : * We must reset all the tentFlags/tentParent fields before
1672 : : * continuing. tentFlags could only have become set in states that
1673 : : * are the source or parent or tentative parent of one of the current
1674 : : * arcs; likewise tentParent could only have become set in states that
1675 : : * are the target or parent or tentative parent of one of the current
1676 : : * arcs. There might be some overlap between those sets, but if we
1677 : : * clear tentFlags in target states as well as source states, we
1678 : : * should be okay even if we visit a state as target before visiting
1679 : : * it as a source.
1680 : : */
3308 1681 [ + - + + : 348 : foreach(cell, trgmInfo->arcs)
+ + ]
1682 : : {
1683 : 200 : TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
3257 1684 : 200 : TrgmState *source = arcInfo->source,
1685 : 200 : *target = arcInfo->target;
1686 : : TrgmState *ttarget;
1687 : :
1688 : : /* no need to touch previously-merged states */
1689 [ + + ]: 350 : while (source->parent)
1690 : 150 : source = source->parent;
3308 1691 [ + + ]: 440 : while (target->parent)
1692 : 240 : target = target->parent;
1693 : :
3257 1694 [ + + ]: 406 : while (source)
1695 : : {
1696 : 206 : source->tentFlags = 0;
1697 : 206 : source = source->tentParent;
1698 : : }
1699 : :
3308 1700 [ + + ]: 313 : while ((ttarget = target->tentParent) != NULL)
1701 : : {
1702 : 113 : target->tentParent = NULL;
3257 1703 : 113 : target->tentFlags = 0; /* in case it was also a source */
3308 1704 : 113 : target = ttarget;
1705 : : }
1706 : : }
1707 : :
1708 : : /* Now, move on if we can't drop this trigram */
4723 1709 [ + + ]: 148 : if (!canRemove)
1710 : : {
1711 : : #ifdef TRGM_REGEXP_DEBUG
1712 : : fprintf(stderr, " ... not ok to merge\n");
1713 : : #endif
1714 : 33 : continue;
1715 : : }
1716 : :
1717 : : /* OK, merge states linked by each arc labeled by the trigram */
1718 [ + - + + : 263 : foreach(cell, trgmInfo->arcs)
+ + ]
1719 : : {
1720 : 148 : TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1721 : 148 : TrgmState *source = arcInfo->source,
1722 : 148 : *target = arcInfo->target;
1723 : :
1724 [ + + ]: 292 : while (source->parent)
1725 : 144 : source = source->parent;
1726 [ + + ]: 346 : while (target->parent)
1727 : 198 : target = target->parent;
1728 [ + + ]: 148 : if (source != target)
1729 : : {
1730 : : #ifdef TRGM_REGEXP_DEBUG
1731 : : fprintf(stderr, "merging s%d into s%d\n",
1732 : : -target->snumber, -source->snumber);
1733 : : #endif
1734 : 103 : mergeStates(source, target);
1735 : : /* Assert we didn't merge initial and final states */
3308 1736 [ - + ]: 103 : Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
1737 : : (TSTATE_INIT | TSTATE_FIN));
1738 : : }
1739 : : }
1740 : :
1741 : : /* Mark trigram unexpanded, and update totals */
4723 1742 : 115 : trgmInfo->expanded = false;
1743 : 115 : totalTrgmCount -= trgmInfo->count;
4362 1744 : 115 : totalTrgmPenalty -= trgmInfo->penalty;
1745 : : }
1746 : :
1747 : : /* Did we succeed in fitting into MAX_TRGM_COUNT? */
4723 1748 [ + + ]: 56 : if (totalTrgmCount > MAX_TRGM_COUNT)
1749 : 3 : return false;
1750 : :
1751 : 53 : trgmNFA->totalTrgmCount = (int) totalTrgmCount;
1752 : :
1753 : : /*
1754 : : * Sort color trigrams by colors (will be useful for bsearch in packGraph)
1755 : : * and enumerate the color trigrams that are expanded.
1756 : : */
3257 1757 : 53 : cnumber = 0;
4723 1758 : 53 : qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1759 : : colorTrgmInfoCmp);
1760 [ + + ]: 328 : for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1761 : : {
1762 [ + + ]: 275 : if (colorTrgms[i].expanded)
1763 : : {
3257 1764 : 160 : colorTrgms[i].cnumber = cnumber;
1765 : 160 : cnumber++;
1766 : : }
1767 : : }
1768 : :
4723 1769 : 53 : return true;
1770 : : }
1771 : :
1772 : : /*
1773 : : * Expand selected color trigrams into regular trigrams.
1774 : : *
1775 : : * Returns the TRGM array to be passed to the index machinery.
1776 : : * The array must be allocated in rcontext.
1777 : : */
1778 : : static TRGM *
1779 : 53 : expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext)
1780 : : {
1781 : : TRGM *trg;
1782 : : trgm *p;
1783 : : int i;
1784 : : TrgmColorInfo blankColor;
1785 : : trgm_mb_char blankChar;
1786 : :
1787 : : /* Set up "blank" color structure containing a single zero character */
1788 : 53 : memset(blankChar.bytes, 0, sizeof(blankChar.bytes));
1789 : 53 : blankColor.wordCharsCount = 1;
1790 : 53 : blankColor.wordChars = &blankChar;
1791 : :
1792 : : /* Construct the trgm array */
1793 : : trg = (TRGM *)
1794 : 53 : MemoryContextAllocZero(rcontext,
1795 : : TRGMHDRSIZE +
1796 : 53 : trgmNFA->totalTrgmCount * sizeof(trgm));
1797 : 53 : trg->flag = ARRKEY;
1798 : 53 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, trgmNFA->totalTrgmCount));
1799 : 53 : p = GETARR(trg);
1800 [ + + ]: 328 : for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1801 : : {
1802 : 275 : ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i];
1803 : : TrgmColorInfo *c[3];
1804 : : trgm_mb_char s[3];
1805 : : int j,
1806 : : i1,
1807 : : i2,
1808 : : i3;
1809 : :
1810 : : /* Ignore any unexpanded trigrams ... */
1811 [ + + ]: 275 : if (!colorTrgm->expanded)
1812 : 115 : continue;
1813 : :
1814 : : /* Get colors, substituting the dummy struct for COLOR_BLANK */
1815 [ + + ]: 640 : for (j = 0; j < 3; j++)
1816 : : {
1817 [ + + ]: 480 : if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK)
1818 : 413 : c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]];
1819 : : else
1820 : 67 : c[j] = &blankColor;
1821 : : }
1822 : :
1823 : : /* Iterate over all possible combinations of colors' characters */
1824 [ + + ]: 377 : for (i1 = 0; i1 < c[0]->wordCharsCount; i1++)
1825 : : {
1826 : 217 : s[0] = c[0]->wordChars[i1];
1827 [ + + ]: 728 : for (i2 = 0; i2 < c[1]->wordCharsCount; i2++)
1828 : : {
1829 : 511 : s[1] = c[1]->wordChars[i2];
1830 [ + + ]: 1970 : for (i3 = 0; i3 < c[2]->wordCharsCount; i3++)
1831 : : {
1832 : 1459 : s[2] = c[2]->wordChars[i3];
1833 : 1459 : fillTrgm(p, s);
1834 : 1459 : p++;
1835 : : }
1836 : : }
1837 : : }
1838 : : }
1839 : :
1840 : 53 : return trg;
1841 : : }
1842 : :
1843 : : /*
1844 : : * Convert trigram into trgm datatype.
1845 : : */
1846 : : static void
1847 : 1459 : fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
1848 : : {
1849 : : char str[3 * MAX_MULTIBYTE_CHAR_LEN],
1850 : : *p;
1851 : : int i,
1852 : : j;
1853 : :
1854 : : /* Write multibyte string into "str" (we don't need null termination) */
1855 : 1459 : p = str;
1856 : :
1857 [ + + ]: 5836 : for (i = 0; i < 3; i++)
1858 : : {
1859 [ + + ]: 4377 : if (s[i].bytes[0] != 0)
1860 : : {
1861 [ + - + + ]: 8232 : for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++)
1862 : 4116 : *p++ = s[i].bytes[j];
1863 : : }
1864 : : else
1865 : : {
1866 : : /* Emit a space in place of COLOR_BLANK */
1867 : 261 : *p++ = ' ';
1868 : : }
1869 : : }
1870 : :
1871 : : /* Convert "str" to a standard trigram (possibly hashing it) */
1872 : 1459 : compact_trigram(ptrgm, str, p - str);
1873 : 1459 : }
1874 : :
1875 : : /*
1876 : : * Merge two states of graph.
1877 : : */
1878 : : static void
1879 : 103 : mergeStates(TrgmState *state1, TrgmState *state2)
1880 : : {
1881 [ - + ]: 103 : Assert(state1 != state2);
1882 [ - + ]: 103 : Assert(!state1->parent);
1883 [ - + ]: 103 : Assert(!state2->parent);
1884 : :
1885 : : /* state1 absorbs state2's flags */
3308 1886 : 103 : state1->flags |= state2->flags;
1887 : :
1888 : : /* state2, and indirectly all its children, become children of state1 */
4723 1889 : 103 : state2->parent = state1;
1890 : 103 : }
1891 : :
1892 : : /*
1893 : : * Compare function for sorting of color trigrams by their colors.
1894 : : */
1895 : : static int
1896 : 5089 : colorTrgmInfoCmp(const void *p1, const void *p2)
1897 : : {
1898 : 5089 : const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1;
1899 : 5089 : const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2;
1900 : :
1901 : 5089 : return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm));
1902 : : }
1903 : :
1904 : : /*
1905 : : * Compare function for sorting color trigrams in descending order of
1906 : : * their penalty fields.
1907 : : */
1908 : : static int
4362 1909 : 423 : colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
1910 : : {
1911 : 423 : float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
1912 : 423 : float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
1913 : :
1914 [ + + ]: 423 : if (penalty1 < penalty2)
4723 1915 : 127 : return 1;
4362 1916 [ + + ]: 296 : else if (penalty1 == penalty2)
4723 1917 : 162 : return 0;
1918 : : else
1919 : 134 : return -1;
1920 : : }
1921 : :
1922 : :
1923 : : /*---------------------
1924 : : * Subroutines for packing the graph into final representation (stage 4).
1925 : : *---------------------
1926 : : */
1927 : :
1928 : : /*
1929 : : * Pack expanded graph into final representation.
1930 : : *
1931 : : * The result data must be allocated in rcontext.
1932 : : */
1933 : : static TrgmPackedGraph *
1934 : 53 : packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
1935 : : {
3257 1936 : 53 : int snumber = 2,
1937 : : arcIndex,
1938 : : arcsCount;
1939 : : HASH_SEQ_STATUS scan_status;
1940 : : TrgmState *state;
1941 : : TrgmPackArcInfo *arcs;
1942 : : TrgmPackedArc *packedArcs;
1943 : : TrgmPackedGraph *result;
1944 : : int i,
1945 : : j;
1946 : :
1947 : : /* Enumerate surviving states, giving init and fin reserved numbers */
4723 1948 : 53 : hash_seq_init(&scan_status, trgmNFA->states);
1949 [ + + ]: 755 : while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1950 : : {
1951 [ + + ]: 971 : while (state->parent)
1952 : 322 : state = state->parent;
1953 : :
3257 1954 [ + + ]: 649 : if (state->snumber < 0)
1955 : : {
3308 1956 [ + + ]: 546 : if (state->flags & TSTATE_INIT)
3257 1957 : 53 : state->snumber = 0;
3308 1958 [ + + ]: 493 : else if (state->flags & TSTATE_FIN)
3257 1959 : 57 : state->snumber = 1;
1960 : : else
1961 : : {
1962 : 436 : state->snumber = snumber;
1963 : 436 : snumber++;
1964 : : }
1965 : : }
1966 : : }
1967 : :
1968 : : /* Collect array of all arcs */
100 michael@paquier.xyz 1969 :GNC 53 : arcs = palloc_array(TrgmPackArcInfo, trgmNFA->arcsCount);
4723 tgl@sss.pgh.pa.us 1970 :CBC 53 : arcIndex = 0;
1971 : 53 : hash_seq_init(&scan_status, trgmNFA->states);
1972 [ + + ]: 702 : while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1973 : : {
1974 : 649 : TrgmState *source = state;
1975 : : ListCell *cell;
1976 : :
1977 [ + + ]: 971 : while (source->parent)
1978 : 322 : source = source->parent;
1979 : :
1980 [ + + + + : 1372 : foreach(cell, state->arcs)
+ + ]
1981 : : {
1982 : 723 : TrgmArc *arc = (TrgmArc *) lfirst(cell);
1983 : 723 : TrgmState *target = arc->target;
1984 : :
1985 [ + + ]: 1352 : while (target->parent)
1986 : 629 : target = target->parent;
1987 : :
3257 1988 [ + + ]: 723 : if (source->snumber != target->snumber)
1989 : : {
1990 : : ColorTrgmInfo *ctrgm;
1991 : :
4723 1992 : 566 : ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm,
1993 : 566 : trgmNFA->colorTrgms,
1994 : 566 : trgmNFA->colorTrgmsCount,
1995 : : sizeof(ColorTrgmInfo),
1996 : : colorTrgmInfoCmp);
1997 [ - + ]: 566 : Assert(ctrgm != NULL);
1998 [ - + ]: 566 : Assert(ctrgm->expanded);
1999 : :
3257 2000 : 566 : arcs[arcIndex].sourceState = source->snumber;
2001 : 566 : arcs[arcIndex].targetState = target->snumber;
2002 : 566 : arcs[arcIndex].colorTrgm = ctrgm->cnumber;
4723 2003 : 566 : arcIndex++;
2004 : : }
2005 : : }
2006 : : }
2007 : :
2008 : : /* Sort arcs to ease duplicate detection */
2009 : 53 : qsort(arcs, arcIndex, sizeof(TrgmPackArcInfo), packArcInfoCmp);
2010 : :
2011 : : /* We could have duplicates because states were merged. Remove them. */
1100 2012 [ + + ]: 53 : if (arcIndex > 1)
2013 : : {
2014 : : /* p1 is probe point, p2 is last known non-duplicate. */
2015 : : TrgmPackArcInfo *p1,
2016 : : *p2;
2017 : :
2018 : 31 : p2 = arcs;
2019 [ + + ]: 546 : for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++)
2020 : : {
2021 [ + + ]: 515 : if (packArcInfoCmp(p1, p2) > 0)
2022 : : {
2023 : 509 : p2++;
2024 : 509 : *p2 = *p1;
2025 : : }
2026 : : }
2027 : 31 : arcsCount = (p2 - arcs) + 1;
2028 : : }
2029 : : else
2030 : 22 : arcsCount = arcIndex;
2031 : :
2032 : : /* Create packed representation */
2033 : : result = (TrgmPackedGraph *)
4723 2034 : 53 : MemoryContextAlloc(rcontext, sizeof(TrgmPackedGraph));
2035 : :
2036 : : /* Pack color trigrams information */
2037 : 53 : result->colorTrigramsCount = 0;
2038 [ + + ]: 328 : for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2039 : : {
2040 [ + + ]: 275 : if (trgmNFA->colorTrgms[i].expanded)
2041 : 160 : result->colorTrigramsCount++;
2042 : : }
2043 : 53 : result->colorTrigramGroups = (int *)
2044 : 53 : MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount);
2045 : 53 : j = 0;
2046 [ + + ]: 328 : for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2047 : : {
2048 [ + + ]: 275 : if (trgmNFA->colorTrgms[i].expanded)
2049 : : {
2050 : 160 : result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count;
2051 : 160 : j++;
2052 : : }
2053 : : }
2054 : :
2055 : : /* Pack states and arcs information */
3257 2056 : 53 : result->statesCount = snumber;
4723 2057 : 53 : result->states = (TrgmPackedState *)
3257 2058 : 53 : MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
2059 : : packedArcs = (TrgmPackedArc *)
4723 2060 : 53 : MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
2061 : 53 : j = 0;
3257 2062 [ + + ]: 595 : for (i = 0; i < snumber; i++)
2063 : : {
4723 2064 : 542 : int cnt = 0;
2065 : :
2066 : 542 : result->states[i].arcs = &packedArcs[j];
2067 [ + + + + ]: 1102 : while (j < arcsCount && arcs[j].sourceState == i)
2068 : : {
2069 : 560 : packedArcs[j].targetState = arcs[j].targetState;
2070 : 560 : packedArcs[j].colorTrgm = arcs[j].colorTrgm;
2071 : 560 : cnt++;
2072 : 560 : j++;
2073 : : }
2074 : 542 : result->states[i].arcsCount = cnt;
2075 : : }
2076 : :
2077 : : /* Allocate working memory for trigramsMatchGraph() */
2078 : 53 : result->colorTrigramsActive = (bool *)
2079 : 53 : MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount);
2080 : 53 : result->statesActive = (bool *)
2081 : 53 : MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount);
2082 : 53 : result->statesQueue = (int *)
2083 : 53 : MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount);
2084 : :
2085 : 53 : return result;
2086 : : }
2087 : :
2088 : : /*
2089 : : * Comparison function for sorting TrgmPackArcInfos.
2090 : : *
2091 : : * Compares arcs in following order: sourceState, colorTrgm, targetState.
2092 : : */
2093 : : static int
2094 : 4546 : packArcInfoCmp(const void *a1, const void *a2)
2095 : : {
2096 : 4546 : const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1;
2097 : 4546 : const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2;
2098 : :
2099 [ + + ]: 4546 : if (p1->sourceState < p2->sourceState)
2100 : 2179 : return -1;
2101 [ + + ]: 2367 : if (p1->sourceState > p2->sourceState)
2102 : 2050 : return 1;
2103 [ + + ]: 317 : if (p1->colorTrgm < p2->colorTrgm)
2104 : 198 : return -1;
2105 [ + + ]: 119 : if (p1->colorTrgm > p2->colorTrgm)
2106 : 107 : return 1;
2107 [ - + ]: 12 : if (p1->targetState < p2->targetState)
4723 tgl@sss.pgh.pa.us 2108 :UBC 0 : return -1;
4723 tgl@sss.pgh.pa.us 2109 [ - + ]:CBC 12 : if (p1->targetState > p2->targetState)
4723 tgl@sss.pgh.pa.us 2110 :UBC 0 : return 1;
4723 tgl@sss.pgh.pa.us 2111 :CBC 12 : return 0;
2112 : : }
2113 : :
2114 : :
2115 : : /*---------------------
2116 : : * Debugging functions
2117 : : *
2118 : : * These are designed to emit GraphViz files.
2119 : : *---------------------
2120 : : */
2121 : :
2122 : : #ifdef TRGM_REGEXP_DEBUG
2123 : :
2124 : : /*
2125 : : * Print initial NFA, in regexp library's representation
2126 : : */
2127 : : static void
2128 : : printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors)
2129 : : {
2130 : : StringInfoData buf;
2131 : : int nstates = pg_reg_getnumstates(regex);
2132 : :
2133 : : initStringInfo(&buf);
2134 : :
2135 : : appendStringInfoString(&buf, "\ndigraph sourceNFA {\n");
2136 : :
2137 : : for (int state = 0; state < nstates; state++)
2138 : : {
2139 : : regex_arc_t *arcs;
2140 : : int arcsCount;
2141 : :
2142 : : appendStringInfo(&buf, "s%d", state);
2143 : : if (pg_reg_getfinalstate(regex) == state)
2144 : : appendStringInfoString(&buf, " [shape = doublecircle]");
2145 : : appendStringInfoString(&buf, ";\n");
2146 : :
2147 : : arcsCount = pg_reg_getnumoutarcs(regex, state);
2148 : : arcs = palloc_array(regex_arc_t, arcsCount);
2149 : : pg_reg_getoutarcs(regex, state, arcs, arcsCount);
2150 : :
2151 : : for (int i = 0; i < arcsCount; i++)
2152 : : {
2153 : : appendStringInfo(&buf, " s%d -> s%d [label = \"%d\"];\n",
2154 : : state, arcs[i].to, arcs[i].co);
2155 : : }
2156 : :
2157 : : pfree(arcs);
2158 : : }
2159 : :
2160 : : appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2161 : : appendStringInfo(&buf, " initial -> s%d;\n",
2162 : : pg_reg_getinitialstate(regex));
2163 : :
2164 : : /* Print colors */
2165 : : appendStringInfoString(&buf, " { rank = sink;\n");
2166 : : appendStringInfoString(&buf, " Colors [shape = none, margin=0, label=<\n");
2167 : :
2168 : : for (int i = 0; i < ncolors; i++)
2169 : : {
2170 : : TrgmColorInfo *color = &colors[i];
2171 : :
2172 : : appendStringInfo(&buf, "<br/>Color %d: ", i);
2173 : : if (color->expandable)
2174 : : {
2175 : : for (int j = 0; j < color->wordCharsCount; j++)
2176 : : {
2177 : : char s[MAX_MULTIBYTE_CHAR_LEN + 1];
2178 : :
2179 : : memcpy(s, color->wordChars[j].bytes, MAX_MULTIBYTE_CHAR_LEN);
2180 : : s[MAX_MULTIBYTE_CHAR_LEN] = '\0';
2181 : : appendStringInfoString(&buf, s);
2182 : : }
2183 : : }
2184 : : else
2185 : : appendStringInfoString(&buf, "not expandable");
2186 : : appendStringInfoChar(&buf, '\n');
2187 : : }
2188 : :
2189 : : appendStringInfoString(&buf, " >];\n");
2190 : : appendStringInfoString(&buf, " }\n");
2191 : : appendStringInfoString(&buf, "}\n");
2192 : :
2193 : : {
2194 : : /* dot -Tpng -o /tmp/source.png < /tmp/source.gv */
2195 : : FILE *fp = fopen("/tmp/source.gv", "w");
2196 : :
2197 : : fprintf(fp, "%s", buf.data);
2198 : : fclose(fp);
2199 : : }
2200 : :
2201 : : pfree(buf.data);
2202 : : }
2203 : :
2204 : : /*
2205 : : * Print expanded graph.
2206 : : */
2207 : : static void
2208 : : printTrgmNFA(TrgmNFA *trgmNFA)
2209 : : {
2210 : : StringInfoData buf;
2211 : : HASH_SEQ_STATUS scan_status;
2212 : : TrgmState *state;
2213 : : TrgmState *initstate = NULL;
2214 : :
2215 : : initStringInfo(&buf);
2216 : :
2217 : : appendStringInfoString(&buf, "\ndigraph transformedNFA {\n");
2218 : :
2219 : : hash_seq_init(&scan_status, trgmNFA->states);
2220 : : while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
2221 : : {
2222 : : ListCell *cell;
2223 : :
2224 : : appendStringInfo(&buf, "s%d", -state->snumber);
2225 : : if (state->flags & TSTATE_FIN)
2226 : : appendStringInfoString(&buf, " [shape = doublecircle]");
2227 : : if (state->flags & TSTATE_INIT)
2228 : : initstate = state;
2229 : : appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate);
2230 : : appendStringInfoString(&buf, ";\n");
2231 : :
2232 : : foreach(cell, state->arcs)
2233 : : {
2234 : : TrgmArc *arc = (TrgmArc *) lfirst(cell);
2235 : :
2236 : : appendStringInfo(&buf, " s%d -> s%d [label = \"",
2237 : : -state->snumber, -arc->target->snumber);
2238 : : printTrgmColor(&buf, arc->ctrgm.colors[0]);
2239 : : appendStringInfoChar(&buf, ' ');
2240 : : printTrgmColor(&buf, arc->ctrgm.colors[1]);
2241 : : appendStringInfoChar(&buf, ' ');
2242 : : printTrgmColor(&buf, arc->ctrgm.colors[2]);
2243 : : appendStringInfoString(&buf, "\"];\n");
2244 : : }
2245 : : }
2246 : :
2247 : : if (initstate)
2248 : : {
2249 : : appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2250 : : appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber);
2251 : : }
2252 : :
2253 : : appendStringInfoString(&buf, "}\n");
2254 : :
2255 : : {
2256 : : /* dot -Tpng -o /tmp/transformed.png < /tmp/transformed.gv */
2257 : : FILE *fp = fopen("/tmp/transformed.gv", "w");
2258 : :
2259 : : fprintf(fp, "%s", buf.data);
2260 : : fclose(fp);
2261 : : }
2262 : :
2263 : : pfree(buf.data);
2264 : : }
2265 : :
2266 : : /*
2267 : : * Print a TrgmColor readably.
2268 : : */
2269 : : static void
2270 : : printTrgmColor(StringInfo buf, TrgmColor co)
2271 : : {
2272 : : if (co == COLOR_UNKNOWN)
2273 : : appendStringInfoChar(buf, 'u');
2274 : : else if (co == COLOR_BLANK)
2275 : : appendStringInfoChar(buf, 'b');
2276 : : else
2277 : : appendStringInfo(buf, "%d", (int) co);
2278 : : }
2279 : :
2280 : : /*
2281 : : * Print final packed representation of trigram-based expanded graph.
2282 : : */
2283 : : static void
2284 : : printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams)
2285 : : {
2286 : : StringInfoData buf;
2287 : : trgm *p;
2288 : : int i;
2289 : :
2290 : : initStringInfo(&buf);
2291 : :
2292 : : appendStringInfoString(&buf, "\ndigraph packedGraph {\n");
2293 : :
2294 : : for (i = 0; i < packedGraph->statesCount; i++)
2295 : : {
2296 : : TrgmPackedState *state = &packedGraph->states[i];
2297 : : int j;
2298 : :
2299 : : appendStringInfo(&buf, " s%d", i);
2300 : : if (i == 1)
2301 : : appendStringInfoString(&buf, " [shape = doublecircle]");
2302 : :
2303 : : appendStringInfo(&buf, " [label = <s%d>];\n", i);
2304 : :
2305 : : for (j = 0; j < state->arcsCount; j++)
2306 : : {
2307 : : TrgmPackedArc *arc = &state->arcs[j];
2308 : :
2309 : : appendStringInfo(&buf, " s%d -> s%d [label = \"trigram %d\"];\n",
2310 : : i, arc->targetState, arc->colorTrgm);
2311 : : }
2312 : : }
2313 : :
2314 : : appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2315 : : appendStringInfo(&buf, " initial -> s%d;\n", 0);
2316 : :
2317 : : /* Print trigrams */
2318 : : appendStringInfoString(&buf, " { rank = sink;\n");
2319 : : appendStringInfoString(&buf, " Trigrams [shape = none, margin=0, label=<\n");
2320 : :
2321 : : p = GETARR(trigrams);
2322 : : for (i = 0; i < packedGraph->colorTrigramsCount; i++)
2323 : : {
2324 : : int count = packedGraph->colorTrigramGroups[i];
2325 : : int j;
2326 : :
2327 : : appendStringInfo(&buf, "<br/>Trigram %d: ", i);
2328 : :
2329 : : for (j = 0; j < count; j++)
2330 : : {
2331 : : if (j > 0)
2332 : : appendStringInfoString(&buf, ", ");
2333 : :
2334 : : /*
2335 : : * XXX This representation is nice only for all-ASCII trigrams.
2336 : : */
2337 : : appendStringInfo(&buf, "\"%c%c%c\"", (*p)[0], (*p)[1], (*p)[2]);
2338 : : p++;
2339 : : }
2340 : : }
2341 : :
2342 : : appendStringInfoString(&buf, " >];\n");
2343 : : appendStringInfoString(&buf, " }\n");
2344 : : appendStringInfoString(&buf, "}\n");
2345 : :
2346 : : {
2347 : : /* dot -Tpng -o /tmp/packed.png < /tmp/packed.gv */
2348 : : FILE *fp = fopen("/tmp/packed.gv", "w");
2349 : :
2350 : : fprintf(fp, "%s", buf.data);
2351 : : fclose(fp);
2352 : : }
2353 : :
2354 : : pfree(buf.data);
2355 : : }
2356 : :
2357 : : #endif /* TRGM_REGEXP_DEBUG */
|