Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * multirangetypes_selfuncs.c
4 : : * Functions for selectivity estimation of multirange operators
5 : : *
6 : : * Estimates are based on histograms of lower and upper bounds, and the
7 : : * fraction of empty multiranges.
8 : : *
9 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
10 : : * Portions Copyright (c) 1994, Regents of the University of California
11 : : *
12 : : *
13 : : * IDENTIFICATION
14 : : * src/backend/utils/adt/multirangetypes_selfuncs.c
15 : : *
16 : : *-------------------------------------------------------------------------
17 : : */
18 : : #include "postgres.h"
19 : :
20 : : #include <math.h>
21 : :
22 : : #include "access/htup_details.h"
23 : : #include "catalog/pg_operator.h"
24 : : #include "catalog/pg_statistic.h"
25 : : #include "utils/float.h"
26 : : #include "utils/fmgrprotos.h"
27 : : #include "utils/lsyscache.h"
28 : : #include "utils/multirangetypes.h"
29 : : #include "utils/rangetypes.h"
30 : : #include "utils/selfuncs.h"
31 : : #include "utils/typcache.h"
32 : :
33 : : static double calc_multirangesel(TypeCacheEntry *typcache,
34 : : VariableStatData *vardata,
35 : : const MultirangeType *constval, Oid operator);
36 : : static double default_multirange_selectivity(Oid operator);
37 : : static double calc_hist_selectivity(TypeCacheEntry *typcache,
38 : : VariableStatData *vardata,
39 : : const MultirangeType *constval,
40 : : Oid operator);
41 : : static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
42 : : const RangeBound *constbound,
43 : : const RangeBound *hist,
44 : : int hist_nvalues, bool equal);
45 : : static int rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value,
46 : : const RangeBound *hist, int hist_length, bool equal);
47 : : static float8 get_position(TypeCacheEntry *typcache, const RangeBound *value,
48 : : const RangeBound *hist1, const RangeBound *hist2);
49 : : static float8 get_len_position(double value, double hist1, double hist2);
50 : : static float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1,
51 : : const RangeBound *bound2);
52 : : static int length_hist_bsearch(Datum *length_hist_values,
53 : : int length_hist_nvalues, double value,
54 : : bool equal);
55 : : static double calc_length_hist_frac(Datum *length_hist_values,
56 : : int length_hist_nvalues, double length1,
57 : : double length2, bool equal);
58 : : static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
59 : : const RangeBound *lower,
60 : : RangeBound *upper,
61 : : const RangeBound *hist_lower,
62 : : int hist_nvalues,
63 : : Datum *length_hist_values,
64 : : int length_hist_nvalues);
65 : : static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
66 : : const RangeBound *lower,
67 : : const RangeBound *upper,
68 : : const RangeBound *hist_lower,
69 : : int hist_nvalues,
70 : : Datum *length_hist_values,
71 : : int length_hist_nvalues);
72 : :
73 : : /*
74 : : * Returns a default selectivity estimate for given operator, when we don't
75 : : * have statistics or cannot use them for some reason.
76 : : */
77 : : static double
1721 akorotkov@postgresql 78 :CBC 473 : default_multirange_selectivity(Oid operator)
79 : : {
80 [ + + + + : 473 : switch (operator)
- ]
81 : : {
82 : 210 : case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
83 : : case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
84 : : case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
85 : 210 : return 0.01;
86 : :
1712 87 : 170 : case OID_RANGE_CONTAINS_MULTIRANGE_OP:
88 : : case OID_RANGE_MULTIRANGE_CONTAINED_OP:
89 : : case OID_MULTIRANGE_CONTAINS_RANGE_OP:
90 : : case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
91 : : case OID_MULTIRANGE_RANGE_CONTAINED_OP:
92 : : case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
1721 93 : 170 : return 0.005;
94 : :
95 : 33 : case OID_MULTIRANGE_CONTAINS_ELEM_OP:
96 : : case OID_MULTIRANGE_ELEM_CONTAINED_OP:
97 : :
98 : : /*
99 : : * "multirange @> elem" is more or less identical to a scalar
100 : : * inequality "A >= b AND A <= c".
101 : : */
102 : 33 : return DEFAULT_MULTIRANGE_INEQ_SEL;
103 : :
104 : 60 : case OID_MULTIRANGE_LESS_OP:
105 : : case OID_MULTIRANGE_LESS_EQUAL_OP:
106 : : case OID_MULTIRANGE_GREATER_OP:
107 : : case OID_MULTIRANGE_GREATER_EQUAL_OP:
108 : : case OID_MULTIRANGE_LEFT_RANGE_OP:
109 : : case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
110 : : case OID_RANGE_LEFT_MULTIRANGE_OP:
111 : : case OID_MULTIRANGE_RIGHT_RANGE_OP:
112 : : case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
113 : : case OID_RANGE_RIGHT_MULTIRANGE_OP:
114 : : case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
115 : : case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
116 : : case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
117 : : case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
118 : : case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
119 : : case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
120 : : /* these are similar to regular scalar inequalities */
121 : 60 : return DEFAULT_INEQ_SEL;
122 : :
1721 akorotkov@postgresql 123 :UBC 0 : default:
124 : :
125 : : /*
126 : : * all multirange operators should be handled above, but just in
127 : : * case
128 : : */
129 : 0 : return 0.01;
130 : : }
131 : : }
132 : :
133 : : /*
134 : : * multirangesel -- restriction selectivity for multirange operators
135 : : */
136 : : Datum
1721 akorotkov@postgresql 137 :CBC 710 : multirangesel(PG_FUNCTION_ARGS)
138 : : {
139 : 710 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
140 : 710 : Oid operator = PG_GETARG_OID(1);
141 : 710 : List *args = (List *) PG_GETARG_POINTER(2);
142 : 710 : int varRelid = PG_GETARG_INT32(3);
143 : : VariableStatData vardata;
144 : : Node *other;
145 : : bool varonleft;
146 : : Selectivity selec;
147 : 710 : TypeCacheEntry *typcache = NULL;
148 : 710 : MultirangeType *constmultirange = NULL;
149 : 710 : RangeType *constrange = NULL;
150 : :
151 : : /*
152 : : * If expression is not (variable op something) or (something op
153 : : * variable), then punt and return a default estimate.
154 : : */
155 [ - + ]: 710 : if (!get_restriction_variable(root, args, varRelid,
156 : : &vardata, &other, &varonleft))
1721 akorotkov@postgresql 157 :UBC 0 : PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
158 : :
159 : : /*
160 : : * Can't do anything useful if the something is not a constant, either.
161 : : */
1721 akorotkov@postgresql 162 [ + + ]:CBC 710 : if (!IsA(other, Const))
163 : : {
164 [ - + ]: 51 : ReleaseVariableStats(vardata);
165 : 51 : PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
166 : : }
167 : :
168 : : /*
169 : : * All the multirange operators are strict, so we can cope with a NULL
170 : : * constant right away.
171 : : */
172 [ - + ]: 659 : if (((Const *) other)->constisnull)
173 : : {
1721 akorotkov@postgresql 174 [ # # ]:UBC 0 : ReleaseVariableStats(vardata);
175 : 0 : PG_RETURN_FLOAT8(0.0);
176 : : }
177 : :
178 : : /*
179 : : * If var is on the right, commute the operator, so that we can assume the
180 : : * var is on the left in what follows.
181 : : */
1721 akorotkov@postgresql 182 [ + + ]:CBC 659 : if (!varonleft)
183 : : {
184 : : /* we have other Op var, commute to make var Op other */
185 : 248 : operator = get_commutator(operator);
186 [ - + ]: 248 : if (!operator)
187 : : {
188 : : /* Use default selectivity (should we raise an error instead?) */
1721 akorotkov@postgresql 189 [ # # ]:UBC 0 : ReleaseVariableStats(vardata);
190 : 0 : PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
191 : : }
192 : : }
193 : :
194 : : /*
195 : : * OK, there's a Var and a Const we're dealing with here. We need the
196 : : * Const to be of same multirange type as the column, else we can't do
197 : : * anything useful. (Such cases will likely fail at runtime, but here we'd
198 : : * rather just return a default estimate.)
199 : : *
200 : : * If the operator is "multirange @> element", the constant should be of
201 : : * the element type of the multirange column. Convert it to a multirange
202 : : * that includes only that single point, so that we don't need special
203 : : * handling for that in what follows.
204 : : */
1721 akorotkov@postgresql 205 [ + + ]:CBC 659 : if (operator == OID_MULTIRANGE_CONTAINS_ELEM_OP)
206 : : {
207 : 48 : typcache = multirange_get_typcache(fcinfo, vardata.vartype);
208 : :
209 [ + - ]: 48 : if (((Const *) other)->consttype == typcache->rngtype->rngelemtype->type_id)
210 : : {
211 : : RangeBound lower,
212 : : upper;
213 : :
214 : 48 : lower.inclusive = true;
215 : 48 : lower.val = ((Const *) other)->constvalue;
216 : 48 : lower.infinite = false;
217 : 48 : lower.lower = true;
218 : 48 : upper.inclusive = true;
219 : 48 : upper.val = ((Const *) other)->constvalue;
220 : 48 : upper.infinite = false;
221 : 48 : upper.lower = false;
996 tgl@sss.pgh.pa.us 222 : 48 : constrange = range_serialize(typcache->rngtype, &lower, &upper,
223 : : false, NULL);
1721 akorotkov@postgresql 224 : 48 : constmultirange = make_multirange(typcache->type_id, typcache->rngtype,
225 : : 1, &constrange);
226 : : }
227 : : }
1712 228 [ + + + + ]: 611 : else if (operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
229 [ + + ]: 516 : operator == OID_MULTIRANGE_CONTAINS_RANGE_OP ||
1721 230 [ + + ]: 498 : operator == OID_MULTIRANGE_OVERLAPS_RANGE_OP ||
231 [ + + ]: 486 : operator == OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP ||
232 [ + + ]: 474 : operator == OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP ||
233 [ + + ]: 462 : operator == OID_MULTIRANGE_LEFT_RANGE_OP ||
234 : : operator == OID_MULTIRANGE_RIGHT_RANGE_OP)
235 : : {
236 : : /*
237 : : * Promote a range in "multirange OP range" just like we do an element
238 : : * in "multirange OP element".
239 : : */
240 : 161 : typcache = multirange_get_typcache(fcinfo, vardata.vartype);
241 [ + - ]: 161 : if (((Const *) other)->consttype == typcache->rngtype->type_id)
242 : : {
243 : 161 : constrange = DatumGetRangeTypeP(((Const *) other)->constvalue);
244 : 161 : constmultirange = make_multirange(typcache->type_id, typcache->rngtype,
245 : : 1, &constrange);
246 : : }
247 : : }
248 [ + + + + ]: 450 : else if (operator == OID_RANGE_OVERLAPS_MULTIRANGE_OP ||
249 [ + + ]: 432 : operator == OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP ||
250 [ + + ]: 423 : operator == OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP ||
251 [ + + ]: 414 : operator == OID_RANGE_LEFT_MULTIRANGE_OP ||
252 [ + + ]: 405 : operator == OID_RANGE_RIGHT_MULTIRANGE_OP ||
1712 253 [ + - ]: 387 : operator == OID_RANGE_CONTAINS_MULTIRANGE_OP ||
1721 254 [ + + ]: 387 : operator == OID_MULTIRANGE_ELEM_CONTAINED_OP ||
255 : : operator == OID_MULTIRANGE_RANGE_CONTAINED_OP)
256 : : {
257 : : /*
258 : : * Here, the Var is the elem/range, not the multirange. For now we
259 : : * just punt and return the default estimate. In future we could
260 : : * disassemble the multirange constant to do something more
261 : : * intelligent.
262 : : */
263 : : }
264 [ + - ]: 378 : else if (((Const *) other)->consttype == vardata.vartype)
265 : : {
266 : : /* Both sides are the same multirange type */
267 : 378 : typcache = multirange_get_typcache(fcinfo, vardata.vartype);
268 : :
269 : 378 : constmultirange = DatumGetMultirangeTypeP(((Const *) other)->constvalue);
270 : : }
271 : :
272 : : /*
273 : : * If we got a valid constant on one side of the operator, proceed to
274 : : * estimate using statistics. Otherwise punt and return a default constant
275 : : * estimate. Note that calc_multirangesel need not handle
276 : : * OID_MULTIRANGE_*_CONTAINED_OP.
277 : : */
278 [ + + ]: 659 : if (constmultirange)
279 : 587 : selec = calc_multirangesel(typcache, &vardata, constmultirange, operator);
280 : : else
281 : 72 : selec = default_multirange_selectivity(operator);
282 : :
283 [ + + ]: 659 : ReleaseVariableStats(vardata);
284 : :
285 [ - + - + ]: 659 : CLAMP_PROBABILITY(selec);
286 : :
287 : 659 : PG_RETURN_FLOAT8((float8) selec);
288 : : }
289 : :
290 : : static double
291 : 587 : calc_multirangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
292 : : const MultirangeType *constval, Oid operator)
293 : : {
294 : : double hist_selec;
295 : : double selec;
296 : : float4 empty_frac,
297 : : null_frac;
298 : :
299 : : /*
300 : : * First look up the fraction of NULLs and empty multiranges from
301 : : * pg_statistic.
302 : : */
303 [ + + ]: 587 : if (HeapTupleIsValid(vardata->statsTuple))
304 : : {
305 : : Form_pg_statistic stats;
306 : : AttStatsSlot sslot;
307 : :
308 : 225 : stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
309 : 225 : null_frac = stats->stanullfrac;
310 : :
311 : : /* Try to get fraction of empty multiranges */
312 [ + - ]: 225 : if (get_attstatsslot(&sslot, vardata->statsTuple,
313 : : STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
314 : : InvalidOid,
315 : : ATTSTATSSLOT_NUMBERS))
316 : : {
317 [ - + ]: 225 : if (sslot.nnumbers != 1)
1721 akorotkov@postgresql 318 [ # # ]:UBC 0 : elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
1721 akorotkov@postgresql 319 :CBC 225 : empty_frac = sslot.numbers[0];
320 : 225 : free_attstatsslot(&sslot);
321 : : }
322 : : else
323 : : {
324 : : /* No empty fraction statistic. Assume no empty ranges. */
1721 akorotkov@postgresql 325 :UBC 0 : empty_frac = 0.0;
326 : : }
327 : : }
328 : : else
329 : : {
330 : : /*
331 : : * No stats are available. Follow through the calculations below
332 : : * anyway, assuming no NULLs and no empty multiranges. This still
333 : : * allows us to give a better-than-nothing estimate based on whether
334 : : * the constant is an empty multirange or not.
335 : : */
1721 akorotkov@postgresql 336 :CBC 362 : null_frac = 0.0;
337 : 362 : empty_frac = 0.0;
338 : : }
339 : :
340 [ + + ]: 587 : if (MultirangeIsEmpty(constval))
341 : : {
342 : : /*
343 : : * An empty multirange matches all multiranges, all empty multiranges,
344 : : * or nothing, depending on the operator
345 : : */
346 [ + + + + : 111 : switch (operator)
- ]
347 : : {
348 : : /* these return false if either argument is empty */
349 : 63 : case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
350 : : case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
351 : : case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
352 : : case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
353 : : case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
354 : : case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
355 : : case OID_MULTIRANGE_LEFT_RANGE_OP:
356 : : case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
357 : : case OID_MULTIRANGE_RIGHT_RANGE_OP:
358 : : case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
359 : : /* nothing is less than an empty multirange */
360 : : case OID_MULTIRANGE_LESS_OP:
361 : 63 : selec = 0.0;
362 : 63 : break;
363 : :
364 : : /*
365 : : * only empty multiranges can be contained by an empty
366 : : * multirange
367 : : */
1530 368 : 15 : case OID_RANGE_MULTIRANGE_CONTAINED_OP:
369 : : case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
370 : : /* only empty ranges are <= an empty multirange */
371 : : case OID_MULTIRANGE_LESS_EQUAL_OP:
1721 372 : 15 : selec = empty_frac;
373 : 15 : break;
374 : :
375 : : /* everything contains an empty multirange */
376 : 30 : case OID_MULTIRANGE_CONTAINS_RANGE_OP:
377 : : case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
378 : : /* everything is >= an empty multirange */
379 : : case OID_MULTIRANGE_GREATER_EQUAL_OP:
380 : 30 : selec = 1.0;
381 : 30 : break;
382 : :
383 : : /* all non-empty multiranges are > an empty multirange */
384 : 3 : case OID_MULTIRANGE_GREATER_OP:
385 : 3 : selec = 1.0 - empty_frac;
386 : 3 : break;
387 : :
388 : : /* an element cannot be empty */
1721 akorotkov@postgresql 389 :UBC 0 : case OID_MULTIRANGE_CONTAINS_ELEM_OP:
390 : :
391 : : /* filtered out by multirangesel() */
392 : : case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
393 : : case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
394 : : case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
395 : : case OID_RANGE_LEFT_MULTIRANGE_OP:
396 : : case OID_RANGE_RIGHT_MULTIRANGE_OP:
397 : : case OID_RANGE_CONTAINS_MULTIRANGE_OP:
398 : : case OID_MULTIRANGE_ELEM_CONTAINED_OP:
399 : : case OID_MULTIRANGE_RANGE_CONTAINED_OP:
400 : :
401 : : default:
402 [ # # ]: 0 : elog(ERROR, "unexpected operator %u", operator);
403 : : selec = 0.0; /* keep compiler quiet */
404 : : break;
405 : : }
406 : : }
407 : : else
408 : : {
409 : : /*
410 : : * Calculate selectivity using bound histograms. If that fails for
411 : : * some reason, e.g no histogram in pg_statistic, use the default
412 : : * constant estimate for the fraction of non-empty values. This is
413 : : * still somewhat better than just returning the default estimate,
414 : : * because this still takes into account the fraction of empty and
415 : : * NULL tuples, if we had statistics for them.
416 : : */
1721 akorotkov@postgresql 417 :CBC 476 : hist_selec = calc_hist_selectivity(typcache, vardata, constval,
418 : : operator);
419 [ + + ]: 476 : if (hist_selec < 0.0)
420 : 350 : hist_selec = default_multirange_selectivity(operator);
421 : :
422 : : /*
423 : : * Now merge the results for the empty multiranges and histogram
424 : : * calculations, realizing that the histogram covers only the
425 : : * non-null, non-empty values.
426 : : */
1530 427 [ + + + + ]: 476 : if (operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
428 : : operator == OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP)
429 : : {
430 : : /* empty is contained by anything non-empty */
1721 431 : 12 : selec = (1.0 - empty_frac) * hist_selec + empty_frac;
432 : : }
433 : : else
434 : : {
435 : : /* with any other operator, empty Op non-empty matches nothing */
436 : 464 : selec = (1.0 - empty_frac) * hist_selec;
437 : : }
438 : : }
439 : :
440 : : /* all multirange operators are strict */
441 : 587 : selec *= (1.0 - null_frac);
442 : :
443 : : /* result should be in range, but make sure... */
444 [ - + - + ]: 587 : CLAMP_PROBABILITY(selec);
445 : :
446 : 587 : return selec;
447 : : }
448 : :
449 : : /*
450 : : * Calculate multirange operator selectivity using histograms of multirange bounds.
451 : : *
452 : : * This estimate is for the portion of values that are not empty and not
453 : : * NULL.
454 : : */
455 : : static double
456 : 476 : calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
457 : : const MultirangeType *constval, Oid operator)
458 : : {
459 : 476 : TypeCacheEntry *rng_typcache = typcache->rngtype;
460 : : AttStatsSlot hslot;
461 : : AttStatsSlot lslot;
462 : : int nhist;
463 : : RangeBound *hist_lower;
464 : : RangeBound *hist_upper;
465 : : int i;
466 : : RangeBound const_lower;
467 : : RangeBound const_upper;
468 : : RangeBound tmp;
469 : : double hist_selec;
470 : :
471 : : /* Can't use the histogram with insecure multirange support functions */
472 [ - + ]: 476 : if (!statistic_proc_security_check(vardata,
473 : : rng_typcache->rng_cmp_proc_finfo.fn_oid))
1721 akorotkov@postgresql 474 :UBC 0 : return -1;
1721 akorotkov@postgresql 475 [ + - ]:CBC 476 : if (OidIsValid(rng_typcache->rng_subdiff_finfo.fn_oid) &&
476 [ + + ]: 476 : !statistic_proc_security_check(vardata,
477 : : rng_typcache->rng_subdiff_finfo.fn_oid))
478 : 119 : return -1;
479 : :
480 : : /* Try to get histogram of ranges */
481 [ + + - + ]: 483 : if (!(HeapTupleIsValid(vardata->statsTuple) &&
482 : 126 : get_attstatsslot(&hslot, vardata->statsTuple,
483 : : STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
484 : : ATTSTATSSLOT_VALUES)))
485 : 231 : return -1.0;
486 : :
487 : : /* check that it's a histogram, not just a dummy entry */
488 [ - + ]: 126 : if (hslot.nvalues < 2)
489 : : {
1721 akorotkov@postgresql 490 :UBC 0 : free_attstatsslot(&hslot);
491 : 0 : return -1.0;
492 : : }
493 : :
494 : : /*
495 : : * Convert histogram of ranges into histograms of its lower and upper
496 : : * bounds.
497 : : */
1721 akorotkov@postgresql 498 :CBC 126 : nhist = hslot.nvalues;
499 : 126 : hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
500 : 126 : hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
501 [ + + ]: 9468 : for (i = 0; i < nhist; i++)
502 : : {
503 : : bool empty;
504 : :
505 : 9342 : range_deserialize(rng_typcache, DatumGetRangeTypeP(hslot.values[i]),
506 : 9342 : &hist_lower[i], &hist_upper[i], &empty);
507 : : /* The histogram should not contain any empty ranges */
508 [ - + ]: 9342 : if (empty)
1721 akorotkov@postgresql 509 [ # # ]:UBC 0 : elog(ERROR, "bounds histogram contains an empty range");
510 : : }
511 : :
512 : : /* @> and @< also need a histogram of range lengths */
1721 akorotkov@postgresql 513 [ + + + + ]:CBC 126 : if (operator == OID_MULTIRANGE_CONTAINS_RANGE_OP ||
514 [ + - ]: 102 : operator == OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP ||
515 [ + + ]: 102 : operator == OID_MULTIRANGE_RANGE_CONTAINED_OP ||
516 : : operator == OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP)
517 : : {
518 [ + - - + ]: 60 : if (!(HeapTupleIsValid(vardata->statsTuple) &&
519 : 30 : get_attstatsslot(&lslot, vardata->statsTuple,
520 : : STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
521 : : InvalidOid,
522 : : ATTSTATSSLOT_VALUES)))
523 : : {
1721 akorotkov@postgresql 524 :UBC 0 : free_attstatsslot(&hslot);
525 : 0 : return -1.0;
526 : : }
527 : :
528 : : /* check that it's a histogram, not just a dummy entry */
1721 akorotkov@postgresql 529 [ - + ]:CBC 30 : if (lslot.nvalues < 2)
530 : : {
1721 akorotkov@postgresql 531 :UBC 0 : free_attstatsslot(&lslot);
532 : 0 : free_attstatsslot(&hslot);
533 : 0 : return -1.0;
534 : : }
535 : : }
536 : : else
1721 akorotkov@postgresql 537 :CBC 96 : memset(&lslot, 0, sizeof(lslot));
538 : :
539 : : /* Extract the bounds of the constant value. */
540 [ - + ]: 126 : Assert(constval->rangeCount > 0);
541 : 126 : multirange_get_bounds(rng_typcache, constval, 0,
542 : : &const_lower, &tmp);
543 : 126 : multirange_get_bounds(rng_typcache, constval, constval->rangeCount - 1,
544 : : &tmp, &const_upper);
545 : :
546 : : /*
547 : : * Calculate selectivity comparing the lower or upper bound of the
548 : : * constant with the histogram of lower or upper bounds.
549 : : */
550 [ - - - - : 126 : switch (operator)
+ + + + +
+ + - ]
551 : : {
1721 akorotkov@postgresql 552 :UBC 0 : case OID_MULTIRANGE_LESS_OP:
553 : :
554 : : /*
555 : : * The regular b-tree comparison operators (<, <=, >, >=) compare
556 : : * the lower bounds first, and the upper bounds for values with
557 : : * equal lower bounds. Estimate that by comparing the lower bounds
558 : : * only. This gives a fairly accurate estimate assuming there
559 : : * aren't many rows with a lower bound equal to the constant's
560 : : * lower bound.
561 : : */
562 : : hist_selec =
563 : 0 : calc_hist_selectivity_scalar(rng_typcache, &const_lower,
564 : : hist_lower, nhist, false);
565 : 0 : break;
566 : :
567 : 0 : case OID_MULTIRANGE_LESS_EQUAL_OP:
568 : : hist_selec =
569 : 0 : calc_hist_selectivity_scalar(rng_typcache, &const_lower,
570 : : hist_lower, nhist, true);
571 : 0 : break;
572 : :
573 : 0 : case OID_MULTIRANGE_GREATER_OP:
574 : 0 : hist_selec =
575 : 0 : 1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
576 : : hist_lower, nhist, false);
577 : 0 : break;
578 : :
579 : 0 : case OID_MULTIRANGE_GREATER_EQUAL_OP:
580 : 0 : hist_selec =
581 : 0 : 1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
582 : : hist_lower, nhist, true);
583 : 0 : break;
584 : :
1721 akorotkov@postgresql 585 :CBC 12 : case OID_MULTIRANGE_LEFT_RANGE_OP:
586 : : case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
587 : : /* var << const when upper(var) < lower(const) */
588 : : hist_selec =
589 : 12 : calc_hist_selectivity_scalar(rng_typcache, &const_lower,
590 : : hist_upper, nhist, false);
591 : 12 : break;
592 : :
593 : 12 : case OID_MULTIRANGE_RIGHT_RANGE_OP:
594 : : case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
595 : : /* var >> const when lower(var) > upper(const) */
596 : 12 : hist_selec =
597 : 12 : 1 - calc_hist_selectivity_scalar(rng_typcache, &const_upper,
598 : : hist_lower, nhist, true);
599 : 12 : break;
600 : :
601 : 12 : case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
602 : : case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
603 : : /* compare lower bounds */
604 : 12 : hist_selec =
605 : 12 : 1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
606 : : hist_lower, nhist, false);
607 : 12 : break;
608 : :
609 : 12 : case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
610 : : case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
611 : : /* compare upper bounds */
612 : : hist_selec =
613 : 12 : calc_hist_selectivity_scalar(rng_typcache, &const_upper,
614 : : hist_upper, nhist, true);
615 : 12 : break;
616 : :
617 : 42 : case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
618 : : case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
619 : : case OID_MULTIRANGE_CONTAINS_ELEM_OP:
620 : :
621 : : /*
622 : : * A && B <=> NOT (A << B OR A >> B).
623 : : *
624 : : * Since A << B and A >> B are mutually exclusive events we can
625 : : * sum their probabilities to find probability of (A << B OR A >>
626 : : * B).
627 : : *
628 : : * "multirange @> elem" is equivalent to "multirange &&
629 : : * {[elem,elem]}". The caller already constructed the singular
630 : : * range from the element constant, so just treat it the same as
631 : : * &&.
632 : : */
633 : : hist_selec =
634 : 42 : calc_hist_selectivity_scalar(rng_typcache,
635 : : &const_lower, hist_upper,
636 : : nhist, false);
637 : 42 : hist_selec +=
638 : 42 : (1.0 - calc_hist_selectivity_scalar(rng_typcache,
639 : : &const_upper, hist_lower,
640 : : nhist, true));
641 : 42 : hist_selec = 1.0 - hist_selec;
642 : 42 : break;
643 : :
644 : 24 : case OID_MULTIRANGE_CONTAINS_RANGE_OP:
645 : : case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
646 : : hist_selec =
647 : 24 : calc_hist_selectivity_contains(rng_typcache, &const_lower,
648 : : &const_upper, hist_lower, nhist,
649 : : lslot.values, lslot.nvalues);
650 : 24 : break;
651 : :
652 : 12 : case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
653 : : case OID_RANGE_MULTIRANGE_CONTAINED_OP:
654 [ - + ]: 12 : if (const_lower.infinite)
655 : : {
656 : : /*
657 : : * Lower bound no longer matters. Just estimate the fraction
658 : : * with an upper bound <= const upper bound
659 : : */
660 : : hist_selec =
1721 akorotkov@postgresql 661 :UBC 0 : calc_hist_selectivity_scalar(rng_typcache, &const_upper,
662 : : hist_upper, nhist, true);
663 : : }
1721 akorotkov@postgresql 664 [ - + ]:CBC 12 : else if (const_upper.infinite)
665 : : {
1721 akorotkov@postgresql 666 :UBC 0 : hist_selec =
667 : 0 : 1.0 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
668 : : hist_lower, nhist, false);
669 : : }
670 : : else
671 : : {
672 : : hist_selec =
1721 akorotkov@postgresql 673 :CBC 12 : calc_hist_selectivity_contained(rng_typcache, &const_lower,
674 : : &const_upper, hist_lower, nhist,
675 : : lslot.values, lslot.nvalues);
676 : : }
677 : 12 : break;
678 : :
679 : : /* filtered out by multirangesel() */
1530 akorotkov@postgresql 680 :UBC 0 : case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
681 : : case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
682 : : case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
683 : : case OID_RANGE_LEFT_MULTIRANGE_OP:
684 : : case OID_RANGE_RIGHT_MULTIRANGE_OP:
685 : : case OID_RANGE_CONTAINS_MULTIRANGE_OP:
686 : : case OID_MULTIRANGE_ELEM_CONTAINED_OP:
687 : : case OID_MULTIRANGE_RANGE_CONTAINED_OP:
688 : :
689 : : default:
1721 690 [ # # ]: 0 : elog(ERROR, "unknown multirange operator %u", operator);
691 : : hist_selec = -1.0; /* keep compiler quiet */
692 : : break;
693 : : }
694 : :
1721 akorotkov@postgresql 695 :CBC 126 : free_attstatsslot(&lslot);
696 : 126 : free_attstatsslot(&hslot);
697 : :
698 : 126 : return hist_selec;
699 : : }
700 : :
701 : :
702 : : /*
703 : : * Look up the fraction of values less than (or equal, if 'equal' argument
704 : : * is true) a given const in a histogram of range bounds.
705 : : */
706 : : static double
707 : 132 : calc_hist_selectivity_scalar(TypeCacheEntry *typcache, const RangeBound *constbound,
708 : : const RangeBound *hist, int hist_nvalues, bool equal)
709 : : {
710 : : Selectivity selec;
711 : : int index;
712 : :
713 : : /*
714 : : * Find the histogram bin the given constant falls into. Estimate
715 : : * selectivity as the number of preceding whole bins.
716 : : */
717 : 132 : index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
718 [ + + ]: 132 : selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
719 : :
720 : : /* Adjust using linear interpolation within the bin */
721 [ + + + + ]: 132 : if (index >= 0 && index < hist_nvalues - 1)
722 : 180 : selec += get_position(typcache, constbound, &hist[index],
723 : 90 : &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
724 : :
725 : 132 : return selec;
726 : : }
727 : :
728 : : /*
729 : : * Binary search on an array of range bounds. Returns greatest index of range
730 : : * bound in array which is less(less or equal) than given range bound. If all
731 : : * range bounds in array are greater or equal(greater) than given range bound,
732 : : * return -1. When "equal" flag is set conditions in brackets are used.
733 : : *
734 : : * This function is used in scalar operator selectivity estimation. Another
735 : : * goal of this function is to find a histogram bin where to stop
736 : : * interpolation of portion of bounds which are less than or equal to given bound.
737 : : */
738 : : static int
739 : 168 : rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist,
740 : : int hist_length, bool equal)
741 : : {
742 : 168 : int lower = -1,
743 : 168 : upper = hist_length - 1,
744 : : cmp,
745 : : middle;
746 : :
747 [ + + ]: 1068 : while (lower < upper)
748 : : {
749 : 900 : middle = (lower + upper + 1) / 2;
750 : 900 : cmp = range_cmp_bounds(typcache, &hist[middle], value);
751 : :
752 [ + + + + : 900 : if (cmp < 0 || (equal && cmp == 0))
- + ]
753 : 372 : lower = middle;
754 : : else
755 : 528 : upper = middle - 1;
756 : : }
757 : 168 : return lower;
758 : : }
759 : :
760 : :
761 : : /*
762 : : * Binary search on length histogram. Returns greatest index of range length in
763 : : * histogram which is less than (less than or equal) the given length value. If
764 : : * all lengths in the histogram are greater than (greater than or equal) the
765 : : * given length, returns -1.
766 : : */
767 : : static int
768 : 180 : length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
769 : : double value, bool equal)
770 : : {
771 : 180 : int lower = -1,
772 : 180 : upper = length_hist_nvalues - 1,
773 : : middle;
774 : :
775 [ + + ]: 936 : while (lower < upper)
776 : : {
777 : : double middleval;
778 : :
779 : 756 : middle = (lower + upper + 1) / 2;
780 : :
781 : 756 : middleval = DatumGetFloat8(length_hist_values[middle]);
782 [ + + + + : 756 : if (middleval < value || (equal && middleval <= value))
- + ]
783 : 372 : lower = middle;
784 : : else
785 : 384 : upper = middle - 1;
786 : : }
787 : 180 : return lower;
788 : : }
789 : :
790 : : /*
791 : : * Get relative position of value in histogram bin in [0,1] range.
792 : : */
793 : : static float8
794 : 138 : get_position(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist1,
795 : : const RangeBound *hist2)
796 : : {
797 : 138 : bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
798 : : float8 position;
799 : :
800 [ + + + + ]: 138 : if (!hist1->infinite && !hist2->infinite)
801 : : {
802 : : float8 bin_width;
803 : :
804 : : /*
805 : : * Both bounds are finite. Assuming the subtype's comparison function
806 : : * works sanely, the value must be finite, too, because it lies
807 : : * somewhere between the bounds. If it doesn't, arbitrarily return
808 : : * 0.5.
809 : : */
810 [ - + ]: 102 : if (value->infinite)
1721 akorotkov@postgresql 811 :UBC 0 : return 0.5;
812 : :
813 : : /* Can't interpolate without subdiff function */
1721 akorotkov@postgresql 814 [ - + ]:CBC 102 : if (!has_subdiff)
1721 akorotkov@postgresql 815 :UBC 0 : return 0.5;
816 : :
817 : : /* Calculate relative position using subdiff function. */
1721 akorotkov@postgresql 818 :CBC 102 : bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
819 : : typcache->rng_collation,
820 : 102 : hist2->val,
821 : 102 : hist1->val));
822 [ + - - + ]: 102 : if (isnan(bin_width) || bin_width <= 0.0)
1721 akorotkov@postgresql 823 :UBC 0 : return 0.5; /* punt for NaN or zero-width bin */
824 : :
1721 akorotkov@postgresql 825 :CBC 102 : position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
826 : : typcache->rng_collation,
827 : 102 : value->val,
828 : 102 : hist1->val))
829 : : / bin_width;
830 : :
831 [ - + ]: 102 : if (isnan(position))
1721 akorotkov@postgresql 832 :UBC 0 : return 0.5; /* punt for NaN from subdiff, Inf/Inf, etc */
833 : :
834 : : /* Relative position must be in [0,1] range */
1721 akorotkov@postgresql 835 [ + - ]:CBC 102 : position = Max(position, 0.0);
836 [ + + ]: 102 : position = Min(position, 1.0);
837 : 102 : return position;
838 : : }
839 [ + + + - ]: 36 : else if (hist1->infinite && !hist2->infinite)
840 : : {
841 : : /*
842 : : * Lower bin boundary is -infinite, upper is finite. If the value is
843 : : * -infinite, return 0.0 to indicate it's equal to the lower bound.
844 : : * Otherwise return 1.0 to indicate it's infinitely far from the lower
845 : : * bound.
846 : : */
847 [ - + - - ]: 30 : return ((value->infinite && value->lower) ? 0.0 : 1.0);
848 : : }
849 [ + - + - ]: 6 : else if (!hist1->infinite && hist2->infinite)
850 : : {
851 : : /* same as above, but in reverse */
852 [ - + - - ]: 6 : return ((value->infinite && !value->lower) ? 1.0 : 0.0);
853 : : }
854 : : else
855 : : {
856 : : /*
857 : : * If both bin boundaries are infinite, they should be equal to each
858 : : * other, and the value should also be infinite and equal to both
859 : : * bounds. (But don't Assert that, to avoid crashing if a user creates
860 : : * a datatype with a broken comparison function).
861 : : *
862 : : * Assume the value to lie in the middle of the infinite bounds.
863 : : */
1721 akorotkov@postgresql 864 :UBC 0 : return 0.5;
865 : : }
866 : : }
867 : :
868 : :
869 : : /*
870 : : * Get relative position of value in a length histogram bin in [0,1] range.
871 : : */
872 : : static double
1721 akorotkov@postgresql 873 :CBC 180 : get_len_position(double value, double hist1, double hist2)
874 : : {
875 [ + - + + ]: 180 : if (!isinf(hist1) && !isinf(hist2))
876 : : {
877 : : /*
878 : : * Both bounds are finite. The value should be finite too, because it
879 : : * lies somewhere between the bounds. If it doesn't, just return
880 : : * something.
881 : : */
882 [ - + ]: 30 : if (isinf(value))
1721 akorotkov@postgresql 883 :UBC 0 : return 0.5;
884 : :
1721 akorotkov@postgresql 885 :CBC 30 : return 1.0 - (hist2 - value) / (hist2 - hist1);
886 : : }
887 [ - + - - ]: 150 : else if (isinf(hist1) && !isinf(hist2))
888 : : {
889 : : /*
890 : : * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
891 : : * indicate the value is infinitely far from the lower bound.
892 : : */
1721 akorotkov@postgresql 893 :UBC 0 : return 1.0;
894 : : }
1721 akorotkov@postgresql 895 [ - + - - ]:CBC 150 : else if (isinf(hist1) && isinf(hist2))
896 : : {
897 : : /* same as above, but in reverse */
1721 akorotkov@postgresql 898 :UBC 0 : return 0.0;
899 : : }
900 : : else
901 : : {
902 : : /*
903 : : * If both bin boundaries are infinite, they should be equal to each
904 : : * other, and the value should also be infinite and equal to both
905 : : * bounds. (But don't Assert that, to avoid crashing unnecessarily if
906 : : * the caller messes up)
907 : : *
908 : : * Assume the value to lie in the middle of the infinite bounds.
909 : : */
1721 akorotkov@postgresql 910 :CBC 150 : return 0.5;
911 : : }
912 : : }
913 : :
914 : : /*
915 : : * Measure distance between two range bounds.
916 : : */
917 : : static float8
918 : 204 : get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBound *bound2)
919 : : {
920 : 204 : bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
921 : :
922 [ + + + - ]: 204 : if (!bound1->infinite && !bound2->infinite)
923 : : {
924 : : /*
925 : : * Neither bound is infinite, use subdiff function or return default
926 : : * value of 1.0 if no subdiff is available.
927 : : */
928 [ + - ]: 120 : if (has_subdiff)
929 : : {
930 : : float8 res;
931 : :
932 : 120 : res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
933 : : typcache->rng_collation,
934 : 120 : bound2->val,
935 : 120 : bound1->val));
936 : : /* Reject possible NaN result, also negative result */
937 [ + - - + ]: 120 : if (isnan(res) || res < 0.0)
1721 akorotkov@postgresql 938 :UBC 0 : return 1.0;
939 : : else
1721 akorotkov@postgresql 940 :CBC 120 : return res;
941 : : }
942 : : else
1721 akorotkov@postgresql 943 :UBC 0 : return 1.0;
944 : : }
1721 akorotkov@postgresql 945 [ + - - + ]:CBC 84 : else if (bound1->infinite && bound2->infinite)
946 : : {
947 : : /* Both bounds are infinite */
1721 akorotkov@postgresql 948 [ # # ]:UBC 0 : if (bound1->lower == bound2->lower)
949 : 0 : return 0.0;
950 : : else
951 : 0 : return get_float8_infinity();
952 : : }
953 : : else
954 : : {
955 : : /* One bound is infinite, the other is not */
1721 akorotkov@postgresql 956 :CBC 84 : return get_float8_infinity();
957 : : }
958 : : }
959 : :
960 : : /*
961 : : * Calculate the average of function P(x), in the interval [length1, length2],
962 : : * where P(x) is the fraction of tuples with length < x (or length <= x if
963 : : * 'equal' is true).
964 : : */
965 : : static double
966 : 180 : calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
967 : : double length1, double length2, bool equal)
968 : : {
969 : : double frac;
970 : : double A,
971 : : B,
972 : : PA,
973 : : PB;
974 : : double pos;
975 : : int i;
976 : : double area;
977 : :
978 [ - + ]: 180 : Assert(length2 >= length1);
979 : :
980 [ - + ]: 180 : if (length2 < 0.0)
1721 akorotkov@postgresql 981 :UBC 0 : return 0.0; /* shouldn't happen, but doesn't hurt to check */
982 : :
983 : : /* All lengths in the table are <= infinite. */
1721 akorotkov@postgresql 984 [ + + - + ]:CBC 180 : if (isinf(length2) && equal)
1721 akorotkov@postgresql 985 :UBC 0 : return 1.0;
986 : :
987 : : /*----------
988 : : * The average of a function between A and B can be calculated by the
989 : : * formula:
990 : : *
991 : : * B
992 : : * 1 /
993 : : * ------- | P(x)dx
994 : : * B - A /
995 : : * A
996 : : *
997 : : * The geometrical interpretation of the integral is the area under the
998 : : * graph of P(x). P(x) is defined by the length histogram. We calculate
999 : : * the area in a piecewise fashion, iterating through the length histogram
1000 : : * bins. Each bin is a trapezoid:
1001 : : *
1002 : : * P(x2)
1003 : : * /|
1004 : : * / |
1005 : : * P(x1)/ |
1006 : : * | |
1007 : : * | |
1008 : : * ---+---+--
1009 : : * x1 x2
1010 : : *
1011 : : * where x1 and x2 are the boundaries of the current histogram, and P(x1)
1012 : : * and P(x1) are the cumulative fraction of tuples at the boundaries.
1013 : : *
1014 : : * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
1015 : : *
1016 : : * The first bin contains the lower bound passed by the caller, so we
1017 : : * use linear interpolation between the previous and next histogram bin
1018 : : * boundary to calculate P(x1). Likewise for the last bin: we use linear
1019 : : * interpolation to calculate P(x2). For the bins in between, x1 and x2
1020 : : * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
1021 : : * P(x1) = (bin index) / (number of bins)
1022 : : * P(x2) = (bin index + 1 / (number of bins)
1023 : : */
1024 : :
1025 : : /* First bin, the one that contains lower bound */
1721 akorotkov@postgresql 1026 :CBC 180 : i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
1027 [ + + ]: 180 : if (i >= length_hist_nvalues - 1)
1028 : 24 : return 1.0;
1029 : :
1030 [ + + ]: 156 : if (i < 0)
1031 : : {
1032 : 42 : i = 0;
1033 : 42 : pos = 0.0;
1034 : : }
1035 : : else
1036 : : {
1037 : : /* interpolate length1's position in the bin */
1038 : 114 : pos = get_len_position(length1,
1039 : 114 : DatumGetFloat8(length_hist_values[i]),
1040 : 114 : DatumGetFloat8(length_hist_values[i + 1]));
1041 : : }
1042 : 156 : PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
1043 : 156 : B = length1;
1044 : :
1045 : : /*
1046 : : * In the degenerate case that length1 == length2, simply return
1047 : : * P(length1). This is not merely an optimization: if length1 == length2,
1048 : : * we'd divide by zero later on.
1049 : : */
1050 [ + + ]: 156 : if (length2 == length1)
1051 : 72 : return PB;
1052 : :
1053 : : /*
1054 : : * Loop through all the bins, until we hit the last bin, the one that
1055 : : * contains the upper bound. (if lower and upper bounds are in the same
1056 : : * bin, this falls out immediately)
1057 : : */
1058 : 84 : area = 0.0;
1059 [ + - ]: 1584 : for (; i < length_hist_nvalues - 1; i++)
1060 : : {
1061 : 1584 : double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
1062 : :
1063 : : /* check if we've reached the last bin */
1064 [ + + + + : 1584 : if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
- + ]
1065 : : break;
1066 : :
1067 : : /* the upper bound of previous bin is the lower bound of this bin */
1068 : 1500 : A = B;
1069 : 1500 : PA = PB;
1070 : :
1071 : 1500 : B = bin_upper;
1072 : 1500 : PB = (double) i / (double) (length_hist_nvalues - 1);
1073 : :
1074 : : /*
1075 : : * Add the area of this trapezoid to the total. The point of the
1076 : : * if-check is to avoid NaN, in the corner case that PA == PB == 0,
1077 : : * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
1078 : : * 0) is zero, regardless of the width (B - A).
1079 : : */
1080 [ + + + + ]: 1500 : if (PA > 0 || PB > 0)
1081 : 1476 : area += 0.5 * (PB + PA) * (B - A);
1082 : : }
1083 : :
1084 : : /* Last bin */
1085 : 84 : A = B;
1086 : 84 : PA = PB;
1087 : :
1088 : 84 : B = length2; /* last bin ends at the query upper bound */
1089 [ - + ]: 84 : if (i >= length_hist_nvalues - 1)
1721 akorotkov@postgresql 1090 :UBC 0 : pos = 0.0;
1091 : : else
1092 : : {
1721 akorotkov@postgresql 1093 [ + + ]:CBC 84 : if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
1094 : 18 : pos = 0.0;
1095 : : else
1096 : 66 : pos = get_len_position(length2,
1097 : 66 : DatumGetFloat8(length_hist_values[i]),
1098 : 66 : DatumGetFloat8(length_hist_values[i + 1]));
1099 : : }
1100 : 84 : PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
1101 : :
1102 [ + + - + ]: 84 : if (PA > 0 || PB > 0)
1103 : 66 : area += 0.5 * (PB + PA) * (B - A);
1104 : :
1105 : : /*
1106 : : * Ok, we have calculated the area, ie. the integral. Divide by width to
1107 : : * get the requested average.
1108 : : *
1109 : : * Avoid NaN arising from infinite / infinite. This happens at least if
1110 : : * length2 is infinite. It's not clear what the correct value would be in
1111 : : * that case, so 0.5 seems as good as any value.
1112 : : */
1113 [ + + + - ]: 84 : if (isinf(area) && isinf(length2))
1114 : 24 : frac = 0.5;
1115 : : else
1116 : 60 : frac = area / (length2 - length1);
1117 : :
1118 : 84 : return frac;
1119 : : }
1120 : :
1121 : : /*
1122 : : * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
1123 : : * of multiranges that fall within the constant lower and upper bounds. This uses
1124 : : * the histograms of range lower bounds and range lengths, on the assumption
1125 : : * that the range lengths are independent of the lower bounds.
1126 : : *
1127 : : * The caller has already checked that constant lower and upper bounds are
1128 : : * finite.
1129 : : */
1130 : : static double
1131 : 12 : calc_hist_selectivity_contained(TypeCacheEntry *typcache,
1132 : : const RangeBound *lower, RangeBound *upper,
1133 : : const RangeBound *hist_lower, int hist_nvalues,
1134 : : Datum *length_hist_values, int length_hist_nvalues)
1135 : : {
1136 : : int i,
1137 : : upper_index;
1138 : : float8 prev_dist;
1139 : : double bin_width;
1140 : : double upper_bin_width;
1141 : : double sum_frac;
1142 : :
1143 : : /*
1144 : : * Begin by finding the bin containing the upper bound, in the lower bound
1145 : : * histogram. Any range with a lower bound > constant upper bound can't
1146 : : * match, ie. there are no matches in bins greater than upper_index.
1147 : : */
1148 : 12 : upper->inclusive = !upper->inclusive;
1149 : 12 : upper->lower = true;
1150 : 12 : upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1151 : : false);
1152 : :
1153 : : /*
1154 : : * If the upper bound value is below the histogram's lower limit, there
1155 : : * are no matches.
1156 : : */
1157 [ - + ]: 12 : if (upper_index < 0)
1721 akorotkov@postgresql 1158 :UBC 0 : return 0.0;
1159 : :
1160 : : /*
1161 : : * If the upper bound value is at or beyond the histogram's upper limit,
1162 : : * start our loop at the last actual bin, as though the upper bound were
1163 : : * within that bin; get_position will clamp its result to 1.0 anyway.
1164 : : * (This corresponds to assuming that the data population above the
1165 : : * histogram's upper limit is empty, exactly like what we just assumed for
1166 : : * the lower limit.)
1167 : : */
1721 akorotkov@postgresql 1168 :CBC 12 : upper_index = Min(upper_index, hist_nvalues - 2);
1169 : :
1170 : : /*
1171 : : * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1172 : : * upper_index + 1) bin which is greater than upper bound of query range
1173 : : * using linear interpolation of subdiff function.
1174 : : */
1175 : 12 : upper_bin_width = get_position(typcache, upper,
1176 : 12 : &hist_lower[upper_index],
1177 : 12 : &hist_lower[upper_index + 1]);
1178 : :
1179 : : /*
1180 : : * In the loop, dist and prev_dist are the distance of the "current" bin's
1181 : : * lower and upper bounds from the constant upper bound.
1182 : : *
1183 : : * bin_width represents the width of the current bin. Normally it is 1.0,
1184 : : * meaning a full width bin, but can be less in the corner cases: start
1185 : : * and end of the loop. We start with bin_width = upper_bin_width, because
1186 : : * we begin at the bin containing the upper bound.
1187 : : */
1188 : 12 : prev_dist = 0.0;
1189 : 12 : bin_width = upper_bin_width;
1190 : :
1191 : 12 : sum_frac = 0.0;
1192 [ + - ]: 60 : for (i = upper_index; i >= 0; i--)
1193 : : {
1194 : : double dist;
1195 : : double length_hist_frac;
1196 : 60 : bool final_bin = false;
1197 : :
1198 : : /*
1199 : : * dist -- distance from upper bound of query range to lower bound of
1200 : : * the current bin in the lower bound histogram. Or to the lower bound
1201 : : * of the constant range, if this is the final bin, containing the
1202 : : * constant lower bound.
1203 : : */
1204 [ + + ]: 60 : if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1205 : : {
1206 : 12 : dist = get_distance(typcache, lower, upper);
1207 : :
1208 : : /*
1209 : : * Subtract from bin_width the portion of this bin that we want to
1210 : : * ignore.
1211 : : */
1212 : 24 : bin_width -= get_position(typcache, lower, &hist_lower[i],
1213 : 12 : &hist_lower[i + 1]);
1214 [ - + ]: 12 : if (bin_width < 0.0)
1721 akorotkov@postgresql 1215 :UBC 0 : bin_width = 0.0;
1721 akorotkov@postgresql 1216 :CBC 12 : final_bin = true;
1217 : : }
1218 : : else
1219 : 48 : dist = get_distance(typcache, &hist_lower[i], upper);
1220 : :
1221 : : /*
1222 : : * Estimate the fraction of tuples in this bin that are narrow enough
1223 : : * to not exceed the distance to the upper bound of the query range.
1224 : : */
1225 : 60 : length_hist_frac = calc_length_hist_frac(length_hist_values,
1226 : : length_hist_nvalues,
1227 : : prev_dist, dist, true);
1228 : :
1229 : : /*
1230 : : * Add the fraction of tuples in this bin, with a suitable length, to
1231 : : * the total.
1232 : : */
1233 : 60 : sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1234 : :
1235 [ + + ]: 60 : if (final_bin)
1236 : 12 : break;
1237 : :
1238 : 48 : bin_width = 1.0;
1239 : 48 : prev_dist = dist;
1240 : : }
1241 : :
1242 : 12 : return sum_frac;
1243 : : }
1244 : :
1245 : : /*
1246 : : * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1247 : : * of multiranges that contain the constant lower and upper bounds. This uses
1248 : : * the histograms of range lower bounds and range lengths, on the assumption
1249 : : * that the range lengths are independent of the lower bounds.
1250 : : */
1251 : : static double
1252 : 24 : calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1253 : : const RangeBound *lower, const RangeBound *upper,
1254 : : const RangeBound *hist_lower, int hist_nvalues,
1255 : : Datum *length_hist_values, int length_hist_nvalues)
1256 : : {
1257 : : int i,
1258 : : lower_index;
1259 : : double bin_width,
1260 : : lower_bin_width;
1261 : : double sum_frac;
1262 : : float8 prev_dist;
1263 : :
1264 : : /* Find the bin containing the lower bound of query range. */
1265 : 24 : lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1266 : : true);
1267 : :
1268 : : /*
1269 : : * If the lower bound value is below the histogram's lower limit, there
1270 : : * are no matches.
1271 : : */
1272 [ - + ]: 24 : if (lower_index < 0)
1721 akorotkov@postgresql 1273 :UBC 0 : return 0.0;
1274 : :
1275 : : /*
1276 : : * If the lower bound value is at or beyond the histogram's upper limit,
1277 : : * start our loop at the last actual bin, as though the upper bound were
1278 : : * within that bin; get_position will clamp its result to 1.0 anyway.
1279 : : * (This corresponds to assuming that the data population above the
1280 : : * histogram's upper limit is empty, exactly like what we just assumed for
1281 : : * the lower limit.)
1282 : : */
1721 akorotkov@postgresql 1283 :CBC 24 : lower_index = Min(lower_index, hist_nvalues - 2);
1284 : :
1285 : : /*
1286 : : * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1287 : : * lower_index + 1) bin which is greater than lower bound of query range
1288 : : * using linear interpolation of subdiff function.
1289 : : */
1290 : 24 : lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1291 : 24 : &hist_lower[lower_index + 1]);
1292 : :
1293 : : /*
1294 : : * Loop through all the lower bound bins, smaller than the query lower
1295 : : * bound. In the loop, dist and prev_dist are the distance of the
1296 : : * "current" bin's lower and upper bounds from the constant upper bound.
1297 : : * We begin from query lower bound, and walk backwards, so the first bin's
1298 : : * upper bound is the query lower bound, and its distance to the query
1299 : : * upper bound is the length of the query range.
1300 : : *
1301 : : * bin_width represents the width of the current bin. Normally it is 1.0,
1302 : : * meaning a full width bin, except for the first bin, which is only
1303 : : * counted up to the constant lower bound.
1304 : : */
1305 : 24 : prev_dist = get_distance(typcache, lower, upper);
1306 : 24 : sum_frac = 0.0;
1307 : 24 : bin_width = lower_bin_width;
1308 [ + + ]: 144 : for (i = lower_index; i >= 0; i--)
1309 : : {
1310 : : float8 dist;
1311 : : double length_hist_frac;
1312 : :
1313 : : /*
1314 : : * dist -- distance from upper bound of query range to current value
1315 : : * of lower bound histogram or lower bound of query range (if we've
1316 : : * reach it).
1317 : : */
1318 : 120 : dist = get_distance(typcache, &hist_lower[i], upper);
1319 : :
1320 : : /*
1321 : : * Get average fraction of length histogram which covers intervals
1322 : : * longer than (or equal to) distance to upper bound of query range.
1323 : : */
1324 : 120 : length_hist_frac =
1325 : 120 : 1.0 - calc_length_hist_frac(length_hist_values,
1326 : : length_hist_nvalues,
1327 : : prev_dist, dist, false);
1328 : :
1329 : 120 : sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1330 : :
1331 : 120 : bin_width = 1.0;
1332 : 120 : prev_dist = dist;
1333 : : }
1334 : :
1335 : 24 : return sum_frac;
1336 : : }
|