LCOV - 380a8b2ea024c33a35e7abc8628e7c4f52f9f9f9 vs db5ed03217b9c238703df8b4b286115d6e940488

LCOV - differential code coverage report

Current view:	top level - src/common - unicode_norm.c (source / functions)		Coverage	Total	Hit	UNC	UBC	GBC	GNC	CBC	DUB	DCB
Current:	380a8b2ea024c33a35e7abc8628e7c4f52f9f9f9 vs db5ed03217b9c238703df8b4b286115d6e940488	Lines:	77.5 %	204	158	1	45		20	138	1	20
Current Date:	2026-05-29 21:51:00 -0400	Functions:	100.0 %	11	11				9	2		8
Baseline:	lcov-20260530-034037-baseline	Branches:	72.8 %	151	110	1	40	1	3	106	1	3
Baseline Date:	2026-05-29 14:39:03 -0700	Line coverage date bins:
Legend:	Lines: hit not hit Branches: + taken - not taken # not executed	(7,30] days:	33.3 %	3	1		2		1
		(30,360] days:	95.0 %	20	19	1			19
		(360..) days:	76.2 %	181	138		43			138
		Function coverage date bins:
		(30,360] days:	100.0 %	8	8				8
		(360..) days:	100.0 %	3	3				1	2
		Branch coverage date bins:
		(7,30] days:	50.0 %	2	1	1			1
		(30,360] days:	100.0 %	2	2				2
		(360..) days:	72.8 %	147	107		40	1		106

 Age         Owner                    Branch data    TLA  Line data    Source code

                                  1                 :                : /*-------------------------------------------------------------------------
                                  2                 :                :  * unicode_norm.c
                                  3                 :                :  *      Normalize a Unicode string
                                  4                 :                :  *
                                  5                 :                :  * This implements Unicode normalization, per the documentation at
                                  6                 :                :  * https://www.unicode.org/reports/tr15/.
                                  7                 :                :  *
                                  8                 :                :  * Portions Copyright (c) 2017-2026, PostgreSQL Global Development Group
                                  9                 :                :  *
                                 10                 :                :  * IDENTIFICATION
                                 11                 :                :  *    src/common/unicode_norm.c
                                 12                 :                :  *
                                 13                 :                :  *-------------------------------------------------------------------------
                                 14                 :                :  */
                                 15                 :                : #ifndef FRONTEND
                                 16                 :                : #include "postgres.h"
                                 17                 :                : #else
                                 18                 :                : #include "postgres_fe.h"
                                 19                 :                : #endif
                                 20                 :                : 
                                 21                 :                : #include "common/unicode_norm.h"
                                 22                 :                : #ifndef FRONTEND
                                 23                 :                : #include "common/unicode_norm_hashfunc.h"
                                 24                 :                : #include "common/unicode_normprops_table.h"
                                 25                 :                : #include "port/pg_bswap.h"
                                 26                 :                : #include "utils/memutils.h"
                                 27                 :                : #else
                                 28                 :                : #include "common/unicode_norm_table.h"
                                 29                 :                : #endif
                                 30                 :                : 
                                 31                 :                : #ifndef FRONTEND
                                 32                 :                : #define ALLOC(size) palloc(size)
                                 33                 :                : #define FREE(size) pfree(size)
                                 34                 :                : #else
                                 35                 :                : #define ALLOC(size) malloc(size)
                                 36                 :                : #define FREE(size) free(size)
                                 37                 :                : #endif
                                 38                 :                : 
                                 39                 :                : /* Constants for calculations with Hangul characters */
                                 40                 :                : #define SBASE       0xAC00      /* U+AC00 */
                                 41                 :                : #define LBASE       0x1100      /* U+1100 */
                                 42                 :                : #define VBASE       0x1161      /* U+1161 */
                                 43                 :                : #define TBASE       0x11A7      /* U+11A7 */
                                 44                 :                : #define LCOUNT      19
                                 45                 :                : #define VCOUNT      21
                                 46                 :                : #define TCOUNT      28
                                 47                 :                : #define NCOUNT      VCOUNT * TCOUNT
                                 48                 :                : #define SCOUNT      LCOUNT * NCOUNT
                                 49                 :                : 
                                 50                 :                : #ifdef FRONTEND
                                 51                 :                : /* comparison routine for bsearch() of decomposition lookup table. */
                                 52                 :                : static int
 2044 michael@paquier.xyz        53                 :CBC       15401 : conv_compare(const void *p1, const void *p2)
                                 54                 :                : {
                                 55                 :                :     uint32      v1,
                                 56                 :                :                 v2;
                                 57                 :                : 
                                 58                 :          15401 :     v1 = *(const uint32 *) p1;
                                 59                 :          15401 :     v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
                                 60   [ +  +  +  + ]:          15401 :     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
                                 61                 :                : }
                                 62                 :                : 
                                 63                 :                : #endif
                                 64                 :                : 
                                 65                 :                : /*
                                 66                 :                :  * get_code_entry
                                 67                 :                :  *
                                 68                 :                :  * Get the entry corresponding to code in the decomposition lookup table.
                                 69                 :                :  * The backend version of this code uses a perfect hash function for the
                                 70                 :                :  * lookup, while the frontend version uses a binary search.
                                 71                 :                :  */
                                 72                 :                : static const pg_unicode_decomposition *
  213 jdavis@postgresql.or       73                 :GNC       14371 : get_code_entry(char32_t code)
                                 74                 :                : {
                                 75                 :                : #ifndef FRONTEND
                                 76                 :                :     int         h;
                                 77                 :                :     uint32      hashkey;
 2045 michael@paquier.xyz        78                 :CBC       13186 :     pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
                                 79                 :                : 
                                 80                 :                :     /*
                                 81                 :                :      * Compute the hash function. The hash key is the codepoint with the bytes
                                 82                 :                :      * in network order.
                                 83                 :                :      */
                                 84                 :          13186 :     hashkey = pg_hton32(code);
                                 85                 :          13186 :     h = decompinfo.hash(&hashkey);
                                 86                 :                : 
                                 87                 :                :     /* An out-of-range result implies no match */
                                 88   [ +  -  +  + ]:          13186 :     if (h < 0 || h >= decompinfo.num_decomps)
                                 89                 :           3457 :         return NULL;
                                 90                 :                : 
                                 91                 :                :     /*
                                 92                 :                :      * Since it's a perfect hash, we need only match to the specific codepoint
                                 93                 :                :      * it identifies.
                                 94                 :                :      */
                                 95         [ +  + ]:           9729 :     if (code != decompinfo.decomps[h].codepoint)
                                 96                 :           9023 :         return NULL;
                                 97                 :                : 
                                 98                 :                :     /* Success! */
                                 99                 :            706 :     return &decompinfo.decomps[h];
                                100                 :                : #else
 3340 heikki.linnakangas@i      101                 :           1185 :     return bsearch(&(code),
                                102                 :                :                    UnicodeDecompMain,
                                103                 :                :                    lengthof(UnicodeDecompMain),
                                104                 :                :                    sizeof(pg_unicode_decomposition),
                                105                 :                :                    conv_compare);
                                106                 :                : #endif
                                107                 :                : }
                                108                 :                : 
                                109                 :                : /*
                                110                 :                :  * Get the combining class of the given codepoint.
                                111                 :                :  */
                                112                 :                : static uint8
  213 jdavis@postgresql.or      113                 :GNC        8089 : get_canonical_class(char32_t code)
                                114                 :                : {
 1998 michael@paquier.xyz       115                 :CBC        8089 :     const pg_unicode_decomposition *entry = get_code_entry(code);
                                116                 :                : 
                                117                 :                :     /*
                                118                 :                :      * If no entries are found, the character used is either a Hangul
                                119                 :                :      * character or a character with a class of 0 and no decompositions.
                                120                 :                :      */
                                121         [ +  + ]:           8089 :     if (!entry)
                                122                 :           7749 :         return 0;
                                123                 :                :     else
                                124                 :            340 :         return entry->comb_class;
                                125                 :                : }
                                126                 :                : 
                                127                 :                : /*
                                128                 :                :  * Given a decomposition entry looked up earlier, get the decomposed
                                129                 :                :  * characters.
                                130                 :                :  *
                                131                 :                :  * Note: the returned pointer can point to statically allocated buffer, and
                                132                 :                :  * is only valid until next call to this function!
                                133                 :                :  */
                                134                 :                : static const char32_t *
 2045                           135                 :            138 : get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
                                136                 :                : {
                                137                 :                :     static char32_t x;
                                138                 :                : 
 3340 heikki.linnakangas@i      139         [ +  + ]:            138 :     if (DECOMPOSITION_IS_INLINE(entry))
                                140                 :                :     {
                                141         [ -  + ]:             42 :         Assert(DECOMPOSITION_SIZE(entry) == 1);
  213 jdavis@postgresql.or      142                 :GNC          42 :         x = (char32_t) entry->dec_index;
 3340 heikki.linnakangas@i      143                 :CBC          42 :         *dec_size = 1;
                                144                 :             42 :         return &x;
                                145                 :                :     }
                                146                 :                :     else
                                147                 :                :     {
                                148                 :             96 :         *dec_size = DECOMPOSITION_SIZE(entry);
                                149                 :             96 :         return &UnicodeDecomp_codepoints[entry->dec_index];
                                150                 :                :     }
                                151                 :                : }
                                152                 :                : 
                                153                 :                : /*
                                154                 :                :  * Calculate how many characters a given character will decompose to.
                                155                 :                :  *
                                156                 :                :  * This needs to recurse, if the character decomposes into characters that
                                157                 :                :  * are, in turn, decomposable.
                                158                 :                :  */
                                159                 :                : static int
  213 jdavis@postgresql.or      160                 :GNC        3141 : get_decomposed_size(char32_t code, bool compat)
                                161                 :                : {
                                162                 :                :     const pg_unicode_decomposition *entry;
 3340 heikki.linnakangas@i      163                 :CBC        3141 :     int         size = 0;
                                164                 :                :     int         i;
                                165                 :                :     const uint32 *decomp;
                                166                 :                :     int         dec_size;
                                167                 :                : 
                                168                 :                :     /*
                                169                 :                :      * Fast path for Hangul characters not stored in tables to save memory as
                                170                 :                :      * decomposition is algorithmic. See
                                171                 :                :      * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
                                172                 :                :      * on the matter.
                                173                 :                :      */
                                174   [ -  +  -  - ]:           3141 :     if (code >= SBASE && code < SBASE + SCOUNT)
                                175                 :                :     {
                                176                 :                :         uint32      tindex,
                                177                 :                :                     sindex;
                                178                 :                : 
 3340 heikki.linnakangas@i      179                 :UBC           0 :         sindex = code - SBASE;
                                180                 :              0 :         tindex = sindex % TCOUNT;
                                181                 :                : 
                                182         [ #  # ]:              0 :         if (tindex != 0)
                                183                 :              0 :             return 3;
                                184                 :              0 :         return 2;
                                185                 :                :     }
                                186                 :                : 
 3340 heikki.linnakangas@i      187                 :CBC        3141 :     entry = get_code_entry(code);
                                188                 :                : 
                                189                 :                :     /*
                                190                 :                :      * Just count current code if no other decompositions.  A NULL entry is
                                191                 :                :      * equivalent to a character with class 0 and no decompositions.
                                192                 :                :      */
 2258 peter@eisentraut.org      193   [ +  +  +  + ]:           3141 :     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
                                194   [ +  +  +  + ]:             96 :         (!compat && DECOMPOSITION_IS_COMPAT(entry)))
 3340 heikki.linnakangas@i      195                 :           3072 :         return 1;
                                196                 :                : 
                                197                 :                :     /*
                                198                 :                :      * If this entry has other decomposition codes look at them as well. First
                                199                 :                :      * get its decomposition in the list of tables available.
                                200                 :                :      */
                                201                 :             69 :     decomp = get_code_decomposition(entry, &dec_size);
                                202         [ +  + ]:            186 :     for (i = 0; i < dec_size; i++)
                                203                 :                :     {
                                204                 :            117 :         uint32      lcode = decomp[i];
                                205                 :                : 
 2258 peter@eisentraut.org      206                 :            117 :         size += get_decomposed_size(lcode, compat);
                                207                 :                :     }
                                208                 :                : 
 3340 heikki.linnakangas@i      209                 :             69 :     return size;
                                210                 :                : }
                                211                 :                : 
                                212                 :                : /*
                                213                 :                :  * Recompose a set of characters. For hangul characters, the calculation
                                214                 :                :  * is algorithmic. For others, an inverse lookup at the decomposition
                                215                 :                :  * table is necessary. Returns true if a recomposition can be done, and
                                216                 :                :  * false otherwise.
                                217                 :                :  */
                                218                 :                : static bool
                                219                 :           2525 : recompose_code(uint32 start, uint32 code, uint32 *result)
                                220                 :                : {
                                221                 :                :     /*
                                222                 :                :      * Handle Hangul characters algorithmically, per the Unicode spec.
                                223                 :                :      *
                                224                 :                :      * Check if two current characters are L and V.
                                225                 :                :      */
                                226   [ +  +  -  +  :           2525 :     if (start >= LBASE && start < LBASE + LCOUNT &&
                                              -  - ]
 3340 heikki.linnakangas@i      227         [ #  # ]:UBC           0 :         code >= VBASE && code < VBASE + VCOUNT)
                                228                 :                :     {
                                229                 :                :         /* make syllable of form LV */
                                230                 :              0 :         uint32      lindex = start - LBASE;
                                231                 :              0 :         uint32      vindex = code - VBASE;
                                232                 :                : 
                                233                 :              0 :         *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
                                234                 :              0 :         return true;
                                235                 :                :     }
                                236                 :                :     /* Check if two current characters are LV and T */
 3340 heikki.linnakangas@i      237   [ -  +  -  - ]:CBC        2525 :     else if (start >= SBASE && start < (SBASE + SCOUNT) &&
 3340 heikki.linnakangas@i      238   [ #  #  #  # ]:UBC           0 :              ((start - SBASE) % TCOUNT) == 0 &&
                                239         [ #  # ]:              0 :              code >= TBASE && code < (TBASE + TCOUNT))
                                240                 :                :     {
                                241                 :                :         /* make syllable of form LVT */
                                242                 :              0 :         uint32      tindex = code - TBASE;
                                243                 :                : 
                                244                 :              0 :         *result = start + tindex;
                                245                 :              0 :         return true;
                                246                 :                :     }
                                247                 :                :     else
                                248                 :                :     {
                                249                 :                :         const pg_unicode_decomposition *entry;
                                250                 :                : 
                                251                 :                :         /*
                                252                 :                :          * Do an inverse lookup of the decomposition tables to see if anything
                                253                 :                :          * matches. The comparison just needs to be a perfect match on the
                                254                 :                :          * sub-table of size two, because the start character has already been
                                255                 :                :          * recomposed partially.  This lookup uses a perfect hash function for
                                256                 :                :          * the backend code.
                                257                 :                :          */
                                258                 :                : #ifndef FRONTEND
                                259                 :                : 
                                260                 :                :         int         h,
                                261                 :                :                     inv_lookup_index;
                                262                 :                :         uint64      hashkey;
 2045 michael@paquier.xyz       263                 :CBC        2316 :         pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
                                264                 :                : 
                                265                 :                :         /*
                                266                 :                :          * Compute the hash function. The hash key is formed by concatenating
                                267                 :                :          * bytes of the two codepoints in network order. See also
                                268                 :                :          * src/common/unicode/generate-unicode_norm_table.pl.
                                269                 :                :          */
                                270                 :           2316 :         hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
                                271                 :           2316 :         h = recompinfo.hash(&hashkey);
                                272                 :                : 
                                273                 :                :         /* An out-of-range result implies no match */
                                274   [ +  +  +  + ]:           2316 :         if (h < 0 || h >= recompinfo.num_recomps)
                                275                 :           1924 :             return false;
                                276                 :                : 
                                277                 :            424 :         inv_lookup_index = recompinfo.inverse_lookup[h];
                                278                 :            424 :         entry = &UnicodeDecompMain[inv_lookup_index];
                                279                 :                : 
                                280         [ +  + ]:            424 :         if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
                                281         [ +  + ]:             36 :             code == UnicodeDecomp_codepoints[entry->dec_index + 1])
                                282                 :                :         {
                                283                 :             32 :             *result = entry->codepoint;
                                284                 :             32 :             return true;
                                285                 :                :         }
                                286                 :                : 
                                287                 :                : #else
                                288                 :                : 
                                289                 :                :         int         i;
                                290                 :                : 
 3340 heikki.linnakangas@i      291         [ +  + ]:        1437711 :         for (i = 0; i < lengthof(UnicodeDecompMain); i++)
                                292                 :                :         {
 2045 michael@paquier.xyz       293                 :        1437502 :             entry = &UnicodeDecompMain[i];
                                294                 :                : 
 3340 heikki.linnakangas@i      295         [ +  + ]:        1437502 :             if (DECOMPOSITION_SIZE(entry) != 2)
                                296                 :        1083456 :                 continue;
                                297                 :                : 
                                298         [ +  + ]:         354046 :             if (DECOMPOSITION_NO_COMPOSE(entry))
                                299                 :         153197 :                 continue;
                                300                 :                : 
                                301         [ +  + ]:         200849 :             if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
                                302         [ -  + ]:           1766 :                 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
                                303                 :                :             {
 3340 heikki.linnakangas@i      304                 :UBC           0 :                 *result = entry->codepoint;
                                305                 :              0 :                 return true;
                                306                 :                :             }
                                307                 :                :         }
                                308                 :                : #endif                          /* !FRONTEND */
                                309                 :                :     }
                                310                 :                : 
 3340 heikki.linnakangas@i      311                 :CBC         601 :     return false;
                                312                 :                : }
                                313                 :                : 
                                314                 :                : /*
                                315                 :                :  * Recursively decompose 'code', appending the resulting code points to
                                316                 :                :  * the array *result starting at index *current.  *current is advanced
                                317                 :                :  * past every code point written, so the caller knows where to continue
                                318                 :                :  * filling in the array.  When 'compat' is false, entries flagged as
                                319                 :                :  * compatibility decompositions are copied through unchanged.
                                320                 :                :  */
                                321                 :                : static void
  213 jdavis@postgresql.or      322                 :GNC        3141 : decompose_code(char32_t code, bool compat, char32_t **result, int *current)
                                323                 :                : {
                                324                 :                :     const pg_unicode_decomposition *entry;
                                325                 :                :     int         i;
                                326                 :                :     const uint32 *decomp;
                                327                 :                :     int         dec_size;
                                328                 :                : 
                                329                 :                :     /*
                                330                 :                :      * Fast path for Hangul syllables (SBASE .. SBASE + SCOUNT - 1).  These are
                                331                 :                :      * not stored in the tables, to save memory, as their decomposition is
                                332                 :                :      * algorithmic. See https://www.unicode.org/reports/tr15/tr15-18.html,
                                333                 :                :      * annex 10 for details on the matter.
                                334                 :                :      */
 3340 heikki.linnakangas@i      335   [ -  +  -  - ]:CBC        3141 :     if (code >= SBASE && code < SBASE + SCOUNT)
                                336                 :                :     {
                                337                 :                :         uint32      l,
                                338                 :                :                     v,
                                339                 :                :                     tindex,
                                340                 :                :                     sindex;
  213 jdavis@postgresql.or      341                 :UNC           0 :         char32_t   *res = *result;
                                342                 :                : 
 3340 heikki.linnakangas@i      343                 :UBC           0 :         sindex = code - SBASE;
                                344                 :              0 :         l = LBASE + sindex / (VCOUNT * TCOUNT);
                                345                 :              0 :         v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
                                346                 :              0 :         tindex = sindex % TCOUNT;
                                347                 :                :         /* l and v are always emitted; a third code point only if tindex != 0 */
                                348                 :              0 :         res[*current] = l;
                                349                 :              0 :         (*current)++;
                                350                 :              0 :         res[*current] = v;
                                351                 :              0 :         (*current)++;
                                352                 :                : 
                                353         [ #  # ]:              0 :         if (tindex != 0)
                                354                 :                :         {
                                355                 :              0 :             res[*current] = TBASE + tindex;
                                356                 :              0 :             (*current)++;
                                357                 :                :         }
                                358                 :                : 
 3340 heikki.linnakangas@i      359                 :CBC        3072 :         return;
                                360                 :                :     }
                                361                 :                : 
                                362                 :           3141 :     entry = get_code_entry(code);
                                363                 :                : 
                                364                 :                :     /*
                                365                 :                :      * Emit the code point itself if there is nothing to recurse into: no
                                366                 :                :      * table entry (a NULL entry is equivalent to a character with class 0
                                367                 :                :      * and no decompositions), an empty decomposition, or a compatibility
                                368                 :                :      * decomposition when only canonical decomposition was requested.
                                369                 :                :      */
 2258 peter@eisentraut.org      370   [ +  +  +  + ]:           3141 :     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
                                371   [ +  +  +  + ]:             96 :         (!compat && DECOMPOSITION_IS_COMPAT(entry)))
                                372                 :                :     {
  213 jdavis@postgresql.or      373                 :GNC        3072 :         char32_t   *res = *result;
                                374                 :                : 
 3340 heikki.linnakangas@i      375                 :CBC        3072 :         res[*current] = code;
                                376                 :           3072 :         (*current)++;
                                377                 :           3072 :         return;
                                378                 :                :     }
                                379                 :                : 
                                380                 :                :     /*
                                381                 :                :      * Otherwise recurse into each code point of this entry's decomposition.
                                382                 :                :      */
                                383                 :             69 :     decomp = get_code_decomposition(entry, &dec_size);
                                384         [ +  + ]:            186 :     for (i = 0; i < dec_size; i++)
                                385                 :                :     {
  213 jdavis@postgresql.or      386                 :GNC         117 :         char32_t    lcode = (char32_t) decomp[i];
                                387                 :                : 
                                388                 :                :         /* The recursive call emits lcode as-is if it has no decomposition */
 2258 peter@eisentraut.org      389                 :CBC         117 :         decompose_code(lcode, compat, result, current);
                                390                 :                :     }
                                391                 :                : }
                                392                 :                : 
                                393                 :                : /*
                                394                 :                :  * unicode_normalize - Normalize a Unicode string to the specified form.
                                395                 :                :  *
                                396                 :                :  * The input is a 0-terminated array of codepoints.
                                397                 :                :  *
                                398                 :                :  * In frontend, returns a 0-terminated array of codepoints allocated with
                                399                 :                :  * malloc, or NULL on out-of-memory or if the result would be too large. In
                                400                 :                :  * backend, the string is palloc'd instead and OOM is reported with ereport().
                                401                 :                :  */
                                402                 :                : char32_t *
  213 jdavis@postgresql.or      403                 :GNC         370 : unicode_normalize(UnicodeNormalizationForm form, const char32_t *input)
                                404                 :                : {
 2258 peter@eisentraut.org      405   [ +  +  +  + ]:CBC         370 :     bool        compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
                                406   [ +  +  +  + ]:            370 :     bool        recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
                                407                 :                :     char32_t   *decomp_chars;
                                408                 :                :     char32_t   *recomp_chars;
                                409                 :                :     int         decomp_size,
                                410                 :                :                 current_size;
                                411                 :                :     int         count;
                                412                 :                :     const char32_t *p;
                                413                 :                : 
                                414                 :                :     /* variables for recomposition */
                                415                 :                :     int         last_class;
                                416                 :                :     int         starter_pos;
                                417                 :                :     int         target_pos;
                                418                 :                :     uint32      starter_ch;
                                419                 :                : 
                                420                 :                :     /* First, do character decomposition */
                                421                 :                : 
                                422                 :                :     /*
                                423                 :                :      * Calculate how many characters long the decomposed version will be.
                                424                 :                :      *
                                425                 :                :      * Some characters decompose to quite a few code points, so that the
                                426                 :                :      * decomposed version's size could overrun MaxAllocSize, and even 32-bit
                                427                 :                :      * size_t, even though the input string presumably fits in that.  In
                                428                 :                :      * frontend we want to just return NULL in that case, so monitor the sum
                                429                 :                :      * and exit early once we'd need more than MaxAllocSize bytes.
                                430                 :                :      */
 3340 heikki.linnakangas@i      431                 :            370 :     decomp_size = 0;
                                432         [ +  + ]:           3394 :     for (p = input; *p; p++)
                                433                 :                :     {
 2258 peter@eisentraut.org      434                 :           3024 :         decomp_size += get_decomposed_size(*p, compat);
   19 tgl@sss.pgh.pa.us         435         [ -  + ]:GNC        3024 :         if (unlikely(decomp_size > MaxAllocSize / sizeof(char32_t)))
                                436                 :                :         {
                                437                 :                : #ifndef FRONTEND
                                438                 :                :             /* Exit loop and let palloc() throw error below */
   19 tgl@sss.pgh.pa.us         439                 :UBC           0 :             break;
                                440                 :                : #else
                                441                 :                :             /* Just return NULL with no explicit error */
                                442                 :              0 :             return NULL;
                                443                 :                : #endif
                                444                 :                :         }
                                445                 :                :     }
                                446                 :                : 
  213 jdavis@postgresql.or      447                 :GNC         370 :     decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
 3340 heikki.linnakangas@i      448         [ -  + ]:CBC         370 :     if (decomp_chars == NULL)
 3340 heikki.linnakangas@i      449                 :UBC           0 :         return NULL;
                                450                 :                : 
                                451                 :                :     /*
                                452                 :                :      * Now fill in each entry recursively. This needs a second pass on the
                                453                 :                :      * decomposition table.
                                454                 :                :      */
 3340 heikki.linnakangas@i      455                 :CBC         370 :     current_size = 0;
                                456         [ +  + ]:           3394 :     for (p = input; *p; p++)
 2258 peter@eisentraut.org      457                 :           3024 :         decompose_code(*p, compat, &decomp_chars, &current_size);
 3340 heikki.linnakangas@i      458                 :            370 :     decomp_chars[decomp_size] = '\0';
                                459         [ -  + ]:            370 :     Assert(decomp_size == current_size);
                                460                 :                : 
                                461                 :                :     /* An empty string needs no reordering or recomposition */
 1661 michael@paquier.xyz       462         [ +  + ]:            370 :     if (decomp_size == 0)
                                463                 :             13 :         return decomp_chars;
                                464                 :                : 
                                465                 :                :     /*
                                466                 :                :      * Now apply canonical ordering, exchanging adjacent out-of-order pairs.
                                467                 :                :      */
 3340 heikki.linnakangas@i      468         [ +  + ]:           3072 :     for (count = 1; count < decomp_size; count++)
                                469                 :                :     {
  213 jdavis@postgresql.or      470                 :GNC        2715 :         char32_t    prev = decomp_chars[count - 1];
                                471                 :           2715 :         char32_t    next = decomp_chars[count];
                                472                 :                :         char32_t    tmp;
 1998 michael@paquier.xyz       473                 :CBC        2715 :         const uint8 prevClass = get_canonical_class(prev);
                                474                 :           2715 :         const uint8 nextClass = get_canonical_class(next);
                                475                 :                : 
                                476                 :                :         /*
                                477                 :                :          * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
                                478                 :                :          * annex 4, a sequence of two adjacent characters in a string is an
                                479                 :                :          * exchangeable pair if the combining class (from the Unicode
                                480                 :                :          * Character Database) for the first character is greater than the
                                481                 :                :          * combining class for the second, and the second is not a starter.  A
                                482                 :                :          * character is a starter if its combining class is 0.
                                483                 :                :          */
                                484   [ +  +  +  - ]:           2715 :         if (prevClass == 0 || nextClass == 0)
 3340 heikki.linnakangas@i      485                 :           2715 :             continue;
                                486                 :                : 
 1998 michael@paquier.xyz       487         [ #  # ]:UBC           0 :         if (prevClass <= nextClass)
 3340 heikki.linnakangas@i      488                 :              0 :             continue;
                                489                 :                : 
                                490                 :                :         /* the pair is out of order: swap it */
                                491                 :              0 :         tmp = decomp_chars[count - 1];
                                492                 :              0 :         decomp_chars[count - 1] = decomp_chars[count];
                                493                 :              0 :         decomp_chars[count] = tmp;
                                494                 :                : 
                                495                 :                :         /* step back so the moved character is rechecked against its new predecessor */
                                496         [ #  # ]:              0 :         if (count > 1)
                                497                 :              0 :             count -= 2;
                                498                 :                :     }
                                499                 :                : 
 2258 peter@eisentraut.org      500         [ +  + ]:CBC         357 :     if (!recompose)
                                501                 :             58 :         return decomp_chars;
                                502                 :                : 
                                503                 :                :     /*
                                504                 :                :      * The last phase of NFC and NFKC is the recomposition of the reordered
                                505                 :                :      * Unicode string using combining classes. The recomposed string cannot be
                                506                 :                :      * longer than the decomposed one, so make the allocation of the output
                                507                 :                :      * string based on that assumption.
                                508                 :                :      */
  213 jdavis@postgresql.or      509                 :GNC         299 :     recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
 3340 heikki.linnakangas@i      510         [ -  + ]:CBC         299 :     if (!recomp_chars)
                                511                 :                :     {
 3340 heikki.linnakangas@i      512                 :UBC           0 :         FREE(decomp_chars);
                                513                 :              0 :         return NULL;
                                514                 :                :     }
                                515                 :                : 
 3340 heikki.linnakangas@i      516                 :CBC         299 :     last_class = -1;            /* this eliminates a special check */
                                517                 :            299 :     starter_pos = 0;
                                518                 :            299 :     target_pos = 1;
                                519                 :            299 :     starter_ch = recomp_chars[0] = decomp_chars[0];
                                520                 :                : 
                                521         [ +  + ]:           2824 :     for (count = 1; count < decomp_size; count++)
                                522                 :                :     {
  213 jdavis@postgresql.or      523                 :GNC        2525 :         char32_t    ch = decomp_chars[count];
 1998 michael@paquier.xyz       524                 :CBC        2525 :         int         ch_class = get_canonical_class(ch);
                                525                 :                :         char32_t    composite;
                                526                 :                :         /* last_class: class of the last char kept since the starter, -1 if none */
 3340 heikki.linnakangas@i      527   [ +  -  +  + ]:           5050 :         if (last_class < ch_class &&
                                528                 :           2525 :             recompose_code(starter_ch, ch, &composite))
                                529                 :                :         {
                                530                 :             32 :             recomp_chars[starter_pos] = composite;
                                531                 :             32 :             starter_ch = composite;
                                532                 :                :         }
                                533         [ +  - ]:           2493 :         else if (ch_class == 0)
                                534                 :                :         {
                                535                 :           2493 :             starter_pos = target_pos;
                                536                 :           2493 :             starter_ch = ch;
                                537                 :           2493 :             last_class = -1;
                                538                 :           2493 :             recomp_chars[target_pos++] = ch;
                                539                 :                :         }
                                540                 :                :         else
                                541                 :                :         {
 3340 heikki.linnakangas@i      542                 :UBC           0 :             last_class = ch_class;
                                543                 :              0 :             recomp_chars[target_pos++] = ch;
                                544                 :                :         }
                                545                 :                :     }
  213 jdavis@postgresql.or      546                 :GNC         299 :     recomp_chars[target_pos] = (char32_t) '\0';
                                547                 :                : 
 3340 heikki.linnakangas@i      548                 :CBC         299 :     FREE(decomp_chars);
                                549                 :                : 
                                550                 :            299 :     return recomp_chars;
                                551                 :                : }
                                552                 :                : 
                                553                 :                : /*
                                554                 :                :  * Normalization "quick check" algorithm; see
                                555                 :                :  * <https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
                                556                 :                :  */
                                557                 :                : 
                                558                 :                : /* We only need this in the backend. */
                                559                 :                : #ifndef FRONTEND
                                560                 :                : /* Find ch's entry in the given quick-check hash table; NULL if it has none */
                                561                 :                : static const pg_unicode_normprops *
  213 jdavis@postgresql.or      562                 :GNC         134 : qc_hash_lookup(char32_t ch, const pg_unicode_norminfo *norminfo)
                                563                 :                : {
                                564                 :                :     int         h;
                                565                 :                :     uint32      hashkey;
                                566                 :                : 
                                567                 :                :     /*
                                568                 :                :      * Compute the hash function. The hash key is the codepoint with the bytes
                                569                 :                :      * in network order.
                                570                 :                :      */
 2056 michael@paquier.xyz       571                 :CBC         134 :     hashkey = pg_hton32(ch);
 2057                           572                 :            134 :     h = norminfo->hash(&hashkey);
                                573                 :                : 
                                574                 :                :     /* An out-of-range result implies no match */
                                575   [ +  -  +  + ]:            134 :     if (h < 0 || h >= norminfo->num_normprops)
                                576                 :             92 :         return NULL;
                                577                 :                : 
                                578                 :                :     /*
                                579                 :                :      * Since it's a perfect hash, we need only match to the specific codepoint
                                580                 :                :      * it identifies.
                                581                 :                :      */
                                582         [ +  + ]:             42 :     if (ch != norminfo->normprops[h].codepoint)
                                583                 :             18 :         return NULL;
                                584                 :                : 
                                585                 :                :     /* Success! */
                                586                 :             24 :     return &norminfo->normprops[h];
                                587                 :                : }
                                588                 :                : 
                                589                 :                : /*
                                590                 :                :  * Look up the quick check property of ch for the form NFC or NFKC.
                                591                 :                :  */
                                592                 :                : static UnicodeNormalizationQC
  213 jdavis@postgresql.or      593                 :GNC         134 : qc_is_allowed(UnicodeNormalizationForm form, char32_t ch)
                                594                 :                : {
 2057 michael@paquier.xyz       595                 :CBC         134 :     const pg_unicode_normprops *found = NULL;
                                596                 :                : 
 2256 peter@eisentraut.org      597      [ +  +  - ]:            134 :     switch (form)
                                598                 :                :     {
                                599                 :             86 :         case UNICODE_NFC:
 2057 michael@paquier.xyz       600                 :             86 :             found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
 2256 peter@eisentraut.org      601                 :             86 :             break;
                                602                 :             48 :         case UNICODE_NFKC:
 2057 michael@paquier.xyz       603                 :             48 :             found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
 2256 peter@eisentraut.org      604                 :             48 :             break;
 2256 peter@eisentraut.org      605                 :UBC           0 :         default:
                                606                 :              0 :             Assert(false);      /* only the two tables above are consulted here */
                                607                 :                :             break;
                                608                 :                :     }
                                609                 :                :     /* A code point with no table entry is allowed: report YES */
 2256 peter@eisentraut.org      610         [ +  + ]:CBC         134 :     if (found)
                                611                 :             24 :         return found->quickcheck;
                                612                 :                :     else
                                613                 :            110 :         return UNICODE_NORM_QC_YES;
                                614                 :                : }
                                615                 :                : /* Quick check of input against form: returns YES, NO, or MAYBE (undecided) */
                                616                 :                : UnicodeNormalizationQC
  213 jdavis@postgresql.or      617                 :GNC          90 : unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input)
                                618                 :                : {
 2256 peter@eisentraut.org      619                 :CBC          90 :     uint8       lastCanonicalClass = 0;
                                620                 :             90 :     UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
                                621                 :                : 
                                622                 :                :     /*
                                623                 :                :      * For the "D" forms, we don't run the quickcheck.  We don't include the
                                624                 :                :      * lookup tables for those because they are huge, checking for these
                                625                 :                :      * particular forms is less common, and running the slow path is faster
                                626                 :                :      * for the "D" forms than the "C" forms because you don't need to
                                627                 :                :      * recompose, which is slow.
                                628                 :                :      */
                                629   [ +  +  +  + ]:             90 :     if (form == UNICODE_NFD || form == UNICODE_NFKD)
                                630                 :             40 :         return UNICODE_NORM_QC_MAYBE;
                                631                 :                : 
  213 jdavis@postgresql.or      632         [ +  + ]:GNC         176 :     for (const char32_t *p = input; *p; p++)
                                633                 :                :     {
                                634                 :            134 :         char32_t    ch = *p;
                                635                 :                :         uint8       canonicalClass;
                                636                 :                :         UnicodeNormalizationQC check;
                                637                 :                :         /* A nonzero class lower than its predecessor's breaks canonical order */
 2256 peter@eisentraut.org      638                 :CBC         134 :         canonicalClass = get_canonical_class(ch);
                                639   [ +  +  -  + ]:            134 :         if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
 2256 peter@eisentraut.org      640                 :UBC           0 :             return UNICODE_NORM_QC_NO;
                                641                 :                :         /* NO is final; MAYBE is remembered but the scan continues */
 2256 peter@eisentraut.org      642                 :CBC         134 :         check = qc_is_allowed(form, ch);
                                643         [ +  + ]:            134 :         if (check == UNICODE_NORM_QC_NO)
                                644                 :              8 :             return UNICODE_NORM_QC_NO;
                                645         [ +  + ]:            126 :         else if (check == UNICODE_NORM_QC_MAYBE)
                                646                 :             16 :             result = UNICODE_NORM_QC_MAYBE;
                                647                 :                : 
                                648                 :            126 :         lastCanonicalClass = canonicalClass;
                                649                 :                :     }
                                650                 :             42 :     return result;
                                651                 :                : }
                                652                 :                : 
                                653                 :                : #endif                          /* !FRONTEND */

Generated by: LCOV version 2.5.0-beta