Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * rangetypes_selfuncs.c
4 : : * Functions for selectivity estimation of range operators
5 : : *
6 : : * Estimates are based on histograms of lower and upper bounds, and the
7 : : * fraction of empty ranges.
8 : : *
9 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
10 : : * Portions Copyright (c) 1994, Regents of the University of California
11 : : *
12 : : *
13 : : * IDENTIFICATION
14 : : * src/backend/utils/adt/rangetypes_selfuncs.c
15 : : *
16 : : *-------------------------------------------------------------------------
17 : : */
18 : : #include "postgres.h"
19 : :
20 : : #include <math.h>
21 : :
22 : : #include "access/htup_details.h"
23 : : #include "catalog/pg_operator.h"
24 : : #include "catalog/pg_statistic.h"
25 : : #include "utils/float.h"
26 : : #include "utils/fmgrprotos.h"
27 : : #include "utils/lsyscache.h"
28 : : #include "utils/rangetypes.h"
29 : : #include "utils/selfuncs.h"
30 : : #include "utils/typcache.h"
31 : :
32 : : static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
33 : : const RangeType *constval, Oid operator);
34 : : static double default_range_selectivity(Oid operator);
35 : : static double calc_hist_selectivity(TypeCacheEntry *typcache,
36 : : VariableStatData *vardata, const RangeType *constval,
37 : : Oid operator);
38 : : static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
39 : : const RangeBound *constbound,
40 : : const RangeBound *hist, int hist_nvalues,
41 : : bool equal);
42 : : static int rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value,
43 : : const RangeBound *hist, int hist_length, bool equal);
44 : : static float8 get_position(TypeCacheEntry *typcache, const RangeBound *value,
45 : : const RangeBound *hist1, const RangeBound *hist2);
46 : : static float8 get_len_position(double value, double hist1, double hist2);
47 : : static float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1,
48 : : const RangeBound *bound2);
49 : : static int length_hist_bsearch(Datum *length_hist_values,
50 : : int length_hist_nvalues, double value, bool equal);
51 : : static double calc_length_hist_frac(Datum *length_hist_values,
52 : : int length_hist_nvalues, double length1, double length2, bool equal);
53 : : static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
54 : : const RangeBound *lower, RangeBound *upper,
55 : : const RangeBound *hist_lower, int hist_nvalues,
56 : : Datum *length_hist_values, int length_hist_nvalues);
57 : : static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
58 : : const RangeBound *lower, const RangeBound *upper,
59 : : const RangeBound *hist_lower, int hist_nvalues,
60 : : Datum *length_hist_values, int length_hist_nvalues);
61 : :
62 : : /*
63 : : * Returns a default selectivity estimate for given operator, when we don't
64 : : * have statistics or cannot use them for some reason.
65 : : */
66 : : static double
4810 heikki.linnakangas@i 67 :CBC 564 : default_range_selectivity(Oid operator)
68 : : {
69 [ + + + + : 564 : switch (operator)
- ]
70 : : {
71 : 297 : case OID_RANGE_OVERLAP_OP:
72 : 297 : return 0.01;
73 : :
74 : 54 : case OID_RANGE_CONTAINS_OP:
75 : : case OID_RANGE_CONTAINED_OP:
76 : 54 : return 0.005;
77 : :
78 : 63 : case OID_RANGE_CONTAINS_ELEM_OP:
79 : : case OID_RANGE_ELEM_CONTAINED_OP:
80 : :
81 : : /*
82 : : * "range @> elem" is more or less identical to a scalar
83 : : * inequality "A >= b AND A <= c".
84 : : */
85 : 63 : return DEFAULT_RANGE_INEQ_SEL;
86 : :
87 : 150 : case OID_RANGE_LESS_OP:
88 : : case OID_RANGE_LESS_EQUAL_OP:
89 : : case OID_RANGE_GREATER_OP:
90 : : case OID_RANGE_GREATER_EQUAL_OP:
91 : : case OID_RANGE_LEFT_OP:
92 : : case OID_RANGE_RIGHT_OP:
93 : : case OID_RANGE_OVERLAPS_LEFT_OP:
94 : : case OID_RANGE_OVERLAPS_RIGHT_OP:
95 : : /* these are similar to regular scalar inequalities */
96 : 150 : return DEFAULT_INEQ_SEL;
97 : :
4810 heikki.linnakangas@i 98 :UBC 0 : default:
99 : : /* all range operators should be handled above, but just in case */
100 : 0 : return 0.01;
101 : : }
102 : : }
103 : :
104 : : /*
105 : : * rangesel -- restriction selectivity for range operators
106 : : */
107 : : Datum
4810 heikki.linnakangas@i 108 :CBC 729 : rangesel(PG_FUNCTION_ARGS)
109 : : {
110 : 729 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
111 : 729 : Oid operator = PG_GETARG_OID(1);
112 : 729 : List *args = (List *) PG_GETARG_POINTER(2);
113 : 729 : int varRelid = PG_GETARG_INT32(3);
114 : : VariableStatData vardata;
115 : : Node *other;
116 : : bool varonleft;
117 : : Selectivity selec;
3924 tgl@sss.pgh.pa.us 118 : 729 : TypeCacheEntry *typcache = NULL;
4810 heikki.linnakangas@i 119 : 729 : RangeType *constrange = NULL;
120 : :
121 : : /*
122 : : * If expression is not (variable op something) or (something op
123 : : * variable), then punt and return a default estimate.
124 : : */
125 [ - + ]: 729 : if (!get_restriction_variable(root, args, varRelid,
126 : : &vardata, &other, &varonleft))
4810 heikki.linnakangas@i 127 :UBC 0 : PG_RETURN_FLOAT8(default_range_selectivity(operator));
128 : :
129 : : /*
130 : : * Can't do anything useful if the something is not a constant, either.
131 : : */
4810 heikki.linnakangas@i 132 [ + + ]:CBC 729 : if (!IsA(other, Const))
133 : : {
134 [ - + ]: 27 : ReleaseVariableStats(vardata);
135 : 27 : PG_RETURN_FLOAT8(default_range_selectivity(operator));
136 : : }
137 : :
138 : : /*
139 : : * All the range operators are strict, so we can cope with a NULL constant
140 : : * right away.
141 : : */
142 [ - + ]: 702 : if (((Const *) other)->constisnull)
143 : : {
4810 heikki.linnakangas@i 144 [ # # ]:UBC 0 : ReleaseVariableStats(vardata);
145 : 0 : PG_RETURN_FLOAT8(0.0);
146 : : }
147 : :
148 : : /*
149 : : * If var is on the right, commute the operator, so that we can assume the
150 : : * var is on the left in what follows.
151 : : */
4810 heikki.linnakangas@i 152 [ + + ]:CBC 702 : if (!varonleft)
153 : : {
154 : : /* we have other Op var, commute to make var Op other */
155 : 175 : operator = get_commutator(operator);
156 [ - + ]: 175 : if (!operator)
157 : : {
158 : : /* Use default selectivity (should we raise an error instead?) */
4810 heikki.linnakangas@i 159 [ # # ]:UBC 0 : ReleaseVariableStats(vardata);
160 : 0 : PG_RETURN_FLOAT8(default_range_selectivity(operator));
161 : : }
162 : : }
163 : :
164 : : /*
165 : : * OK, there's a Var and a Const we're dealing with here. We need the
166 : : * Const to be of same range type as the column, else we can't do anything
167 : : * useful. (Such cases will likely fail at runtime, but here we'd rather
168 : : * just return a default estimate.)
169 : : *
170 : : * If the operator is "range @> element", the constant should be of the
171 : : * element type of the range column. Convert it to a range that includes
172 : : * only that single point, so that we don't need special handling for that
173 : : * in what follows.
174 : : */
4810 heikki.linnakangas@i 175 [ + + ]:CBC 702 : if (operator == OID_RANGE_CONTAINS_ELEM_OP)
176 : : {
4604 177 : 72 : typcache = range_get_typcache(fcinfo, vardata.vartype);
178 : :
4810 179 [ + - ]: 72 : if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
180 : : {
181 : : RangeBound lower,
182 : : upper;
183 : :
184 : 72 : lower.inclusive = true;
185 : 72 : lower.val = ((Const *) other)->constvalue;
186 : 72 : lower.infinite = false;
187 : 72 : lower.lower = true;
188 : 72 : upper.inclusive = true;
189 : 72 : upper.val = ((Const *) other)->constvalue;
190 : 72 : upper.infinite = false;
191 : 72 : upper.lower = false;
1048 tgl@sss.pgh.pa.us 192 : 72 : constrange = range_serialize(typcache, &lower, &upper, false, NULL);
193 : : }
194 : : }
3924 195 [ + - ]: 630 : else if (operator == OID_RANGE_ELEM_CONTAINED_OP)
196 : : {
197 : : /*
198 : : * Here, the Var is the elem, not the range. In typical cases
199 : : * elem_contained_by_range_support will have simplified this case, so
200 : : * that we won't get here. If we do get here we'll fall back on a
201 : : * default estimate.
202 : : */
203 : : }
204 [ + - ]: 630 : else if (((Const *) other)->consttype == vardata.vartype)
205 : : {
206 : : /* Both sides are the same range type */
207 : 630 : typcache = range_get_typcache(fcinfo, vardata.vartype);
208 : :
2962 209 : 630 : constrange = DatumGetRangeTypeP(((Const *) other)->constvalue);
210 : : }
211 : :
212 : : /*
213 : : * If we got a valid constant on one side of the operator, proceed to
214 : : * estimate using statistics. Otherwise punt and return a default constant
215 : : * estimate. Note that calc_rangesel need not handle
216 : : * OID_RANGE_ELEM_CONTAINED_OP.
217 : : */
4810 heikki.linnakangas@i 218 [ + - ]: 702 : if (constrange)
219 : 702 : selec = calc_rangesel(typcache, &vardata, constrange, operator);
220 : : else
4810 heikki.linnakangas@i 221 :UBC 0 : selec = default_range_selectivity(operator);
222 : :
4810 heikki.linnakangas@i 223 [ + + ]:CBC 702 : ReleaseVariableStats(vardata);
224 : :
225 [ - + - + ]: 702 : CLAMP_PROBABILITY(selec);
226 : :
227 : 702 : PG_RETURN_FLOAT8((float8) selec);
228 : : }
229 : :
230 : : static double
231 : 702 : calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
232 : : const RangeType *constval, Oid operator)
233 : : {
234 : : double hist_selec;
235 : : double selec;
236 : : float4 empty_frac,
237 : : null_frac;
238 : :
239 : : /*
240 : : * First look up the fraction of NULLs and empty ranges from pg_statistic.
241 : : */
242 [ + + ]: 702 : if (HeapTupleIsValid(vardata->statsTuple))
243 : : {
244 : : Form_pg_statistic stats;
245 : : AttStatsSlot sslot;
246 : :
247 : 81 : stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
248 : 81 : null_frac = stats->stanullfrac;
249 : :
250 : : /* Try to get fraction of empty ranges */
3090 tgl@sss.pgh.pa.us 251 [ + - ]: 81 : if (get_attstatsslot(&sslot, vardata->statsTuple,
252 : : STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
253 : : InvalidOid,
254 : : ATTSTATSSLOT_NUMBERS))
255 : : {
256 [ - + ]: 81 : if (sslot.nnumbers != 1)
3051 tgl@sss.pgh.pa.us 257 [ # # ]:UBC 0 : elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
3090 tgl@sss.pgh.pa.us 258 :CBC 81 : empty_frac = sslot.numbers[0];
259 : 81 : free_attstatsslot(&sslot);
260 : : }
261 : : else
262 : : {
263 : : /* No empty fraction statistic. Assume no empty ranges. */
4810 heikki.linnakangas@i 264 :UBC 0 : empty_frac = 0.0;
265 : : }
266 : : }
267 : : else
268 : : {
269 : : /*
270 : : * No stats are available. Follow through the calculations below
271 : : * anyway, assuming no NULLs and no empty ranges. This still allows us
272 : : * to give a better-than-nothing estimate based on whether the
273 : : * constant is an empty range or not.
274 : : */
4810 heikki.linnakangas@i 275 :CBC 621 : null_frac = 0.0;
276 : 621 : empty_frac = 0.0;
277 : : }
278 : :
279 [ + + ]: 702 : if (RangeIsEmpty(constval))
280 : : {
281 : : /*
282 : : * An empty range matches all ranges, all empty ranges, or nothing,
283 : : * depending on the operator
284 : : */
285 [ + + + + : 93 : switch (operator)
- ]
286 : : {
287 : : /* these return false if either argument is empty */
288 : 6 : case OID_RANGE_OVERLAP_OP:
289 : : case OID_RANGE_OVERLAPS_LEFT_OP:
290 : : case OID_RANGE_OVERLAPS_RIGHT_OP:
291 : : case OID_RANGE_LEFT_OP:
292 : : case OID_RANGE_RIGHT_OP:
293 : : /* nothing is less than an empty range */
294 : : case OID_RANGE_LESS_OP:
295 : 6 : selec = 0.0;
296 : 6 : break;
297 : :
298 : : /* only empty ranges can be contained by an empty range */
299 : 27 : case OID_RANGE_CONTAINED_OP:
300 : : /* only empty ranges are <= an empty range */
301 : : case OID_RANGE_LESS_EQUAL_OP:
302 : 27 : selec = empty_frac;
303 : 27 : break;
304 : :
305 : : /* everything contains an empty range */
3924 tgl@sss.pgh.pa.us 306 : 45 : case OID_RANGE_CONTAINS_OP:
307 : : /* everything is >= an empty range */
308 : : case OID_RANGE_GREATER_EQUAL_OP:
4810 heikki.linnakangas@i 309 : 45 : selec = 1.0;
310 : 45 : break;
311 : :
312 : : /* all non-empty ranges are > an empty range */
3924 tgl@sss.pgh.pa.us 313 : 15 : case OID_RANGE_GREATER_OP:
314 : 15 : selec = 1.0 - empty_frac;
315 : 15 : break;
316 : :
317 : : /* an element cannot be empty */
4810 heikki.linnakangas@i 318 :UBC 0 : case OID_RANGE_CONTAINS_ELEM_OP:
319 : : default:
320 [ # # ]: 0 : elog(ERROR, "unexpected operator %u", operator);
321 : : selec = 0.0; /* keep compiler quiet */
322 : : break;
323 : : }
324 : : }
325 : : else
326 : : {
327 : : /*
328 : : * Calculate selectivity using bound histograms. If that fails for
329 : : * some reason, e.g no histogram in pg_statistic, use the default
330 : : * constant estimate for the fraction of non-empty values. This is
331 : : * still somewhat better than just returning the default estimate,
332 : : * because this still takes into account the fraction of empty and
333 : : * NULL tuples, if we had statistics for them.
334 : : */
4810 heikki.linnakangas@i 335 :CBC 609 : hist_selec = calc_hist_selectivity(typcache, vardata, constval,
336 : : operator);
337 [ + + ]: 609 : if (hist_selec < 0.0)
338 : 537 : hist_selec = default_range_selectivity(operator);
339 : :
340 : : /*
341 : : * Now merge the results for the empty ranges and histogram
342 : : * calculations, realizing that the histogram covers only the
343 : : * non-null, non-empty values.
344 : : */
345 [ + + ]: 609 : if (operator == OID_RANGE_CONTAINED_OP)
346 : : {
347 : : /* empty is contained by anything non-empty */
348 : 36 : selec = (1.0 - empty_frac) * hist_selec + empty_frac;
349 : : }
350 : : else
351 : : {
352 : : /* with any other operator, empty Op non-empty matches nothing */
353 : 573 : selec = (1.0 - empty_frac) * hist_selec;
354 : : }
355 : : }
356 : :
357 : : /* all range operators are strict */
358 : 702 : selec *= (1.0 - null_frac);
359 : :
360 : : /* result should be in range, but make sure... */
361 [ - + - + ]: 702 : CLAMP_PROBABILITY(selec);
362 : :
363 : 702 : return selec;
364 : : }
365 : :
366 : : /*
367 : : * Calculate range operator selectivity using histograms of range bounds.
368 : : *
369 : : * This estimate is for the portion of values that are not empty and not
370 : : * NULL.
371 : : */
372 : : static double
373 : 609 : calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
374 : : const RangeType *constval, Oid operator)
375 : : {
376 : : AttStatsSlot hslot;
377 : : AttStatsSlot lslot;
378 : : int nhist;
379 : : RangeBound *hist_lower;
380 : : RangeBound *hist_upper;
381 : : int i;
382 : : RangeBound const_lower;
383 : : RangeBound const_upper;
384 : : bool empty;
385 : : double hist_selec;
386 : :
387 : : /* Can't use the histogram with insecure range support functions */
3098 peter_e@gmx.net 388 [ - + ]: 609 : if (!statistic_proc_security_check(vardata,
389 : : typcache->rng_cmp_proc_finfo.fn_oid))
3098 peter_e@gmx.net 390 :UBC 0 : return -1;
3098 peter_e@gmx.net 391 [ + + ]:CBC 609 : if (OidIsValid(typcache->rng_subdiff_finfo.fn_oid) &&
392 [ - + ]: 603 : !statistic_proc_security_check(vardata,
393 : : typcache->rng_subdiff_finfo.fn_oid))
3098 peter_e@gmx.net 394 :UBC 0 : return -1;
395 : :
396 : : /* Try to get histogram of ranges */
4810 heikki.linnakangas@i 397 [ + + - + ]:CBC 681 : if (!(HeapTupleIsValid(vardata->statsTuple) &&
3090 tgl@sss.pgh.pa.us 398 : 72 : get_attstatsslot(&hslot, vardata->statsTuple,
399 : : STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
400 : : ATTSTATSSLOT_VALUES)))
4810 heikki.linnakangas@i 401 : 537 : return -1.0;
402 : :
403 : : /* check that it's a histogram, not just a dummy entry */
2116 tgl@sss.pgh.pa.us 404 [ - + ]: 72 : if (hslot.nvalues < 2)
405 : : {
2116 tgl@sss.pgh.pa.us 406 :UBC 0 : free_attstatsslot(&hslot);
407 : 0 : return -1.0;
408 : : }
409 : :
410 : : /*
411 : : * Convert histogram of ranges into histograms of its lower and upper
412 : : * bounds.
413 : : */
3090 tgl@sss.pgh.pa.us 414 :CBC 72 : nhist = hslot.nvalues;
4810 heikki.linnakangas@i 415 : 72 : hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
416 : 72 : hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
417 [ + + ]: 7344 : for (i = 0; i < nhist; i++)
418 : : {
2962 tgl@sss.pgh.pa.us 419 : 7272 : range_deserialize(typcache, DatumGetRangeTypeP(hslot.values[i]),
4810 heikki.linnakangas@i 420 : 7272 : &hist_lower[i], &hist_upper[i], &empty);
421 : : /* The histogram should not contain any empty ranges */
422 [ - + ]: 7272 : if (empty)
4810 heikki.linnakangas@i 423 [ # # ]:UBC 0 : elog(ERROR, "bounds histogram contains an empty range");
424 : : }
425 : :
426 : : /* @> and @< also need a histogram of range lengths */
4611 heikki.linnakangas@i 427 [ + + + + ]:CBC 72 : if (operator == OID_RANGE_CONTAINS_OP ||
428 : : operator == OID_RANGE_CONTAINED_OP)
429 : : {
430 [ + - - + ]: 36 : if (!(HeapTupleIsValid(vardata->statsTuple) &&
3090 tgl@sss.pgh.pa.us 431 : 18 : get_attstatsslot(&lslot, vardata->statsTuple,
432 : : STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
433 : : InvalidOid,
434 : : ATTSTATSSLOT_VALUES)))
435 : : {
3090 tgl@sss.pgh.pa.us 436 :UBC 0 : free_attstatsslot(&hslot);
4611 heikki.linnakangas@i 437 : 0 : return -1.0;
438 : : }
439 : :
440 : : /* check that it's a histogram, not just a dummy entry */
3090 tgl@sss.pgh.pa.us 441 [ - + ]:CBC 18 : if (lslot.nvalues < 2)
442 : : {
3090 tgl@sss.pgh.pa.us 443 :UBC 0 : free_attstatsslot(&lslot);
444 : 0 : free_attstatsslot(&hslot);
4611 heikki.linnakangas@i 445 : 0 : return -1.0;
446 : : }
447 : : }
448 : : else
3090 tgl@sss.pgh.pa.us 449 :CBC 54 : memset(&lslot, 0, sizeof(lslot));
450 : :
451 : : /* Extract the bounds of the constant value. */
4810 heikki.linnakangas@i 452 : 72 : range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
4535 bruce@momjian.us 453 [ - + ]: 72 : Assert(!empty);
454 : :
455 : : /*
456 : : * Calculate selectivity comparing the lower or upper bound of the
457 : : * constant with the histogram of lower or upper bounds.
458 : : */
4810 heikki.linnakangas@i 459 [ - - - - : 72 : switch (operator)
+ + + + +
+ + - ]
460 : : {
4810 heikki.linnakangas@i 461 :UBC 0 : case OID_RANGE_LESS_OP:
462 : :
463 : : /*
464 : : * The regular b-tree comparison operators (<, <=, >, >=) compare
465 : : * the lower bounds first, and the upper bounds for values with
466 : : * equal lower bounds. Estimate that by comparing the lower bounds
467 : : * only. This gives a fairly accurate estimate assuming there
468 : : * aren't many rows with a lower bound equal to the constant's
469 : : * lower bound.
470 : : */
471 : : hist_selec =
472 : 0 : calc_hist_selectivity_scalar(typcache, &const_lower,
473 : : hist_lower, nhist, false);
474 : 0 : break;
475 : :
476 : 0 : case OID_RANGE_LESS_EQUAL_OP:
477 : : hist_selec =
478 : 0 : calc_hist_selectivity_scalar(typcache, &const_lower,
479 : : hist_lower, nhist, true);
480 : 0 : break;
481 : :
482 : 0 : case OID_RANGE_GREATER_OP:
483 : 0 : hist_selec =
484 : 0 : 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
485 : : hist_lower, nhist, false);
486 : 0 : break;
487 : :
488 : 0 : case OID_RANGE_GREATER_EQUAL_OP:
489 : 0 : hist_selec =
490 : 0 : 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
491 : : hist_lower, nhist, true);
492 : 0 : break;
493 : :
4810 heikki.linnakangas@i 494 :CBC 9 : case OID_RANGE_LEFT_OP:
495 : : /* var << const when upper(var) < lower(const) */
496 : : hist_selec =
497 : 9 : calc_hist_selectivity_scalar(typcache, &const_lower,
498 : : hist_upper, nhist, false);
499 : 9 : break;
500 : :
501 : 9 : case OID_RANGE_RIGHT_OP:
502 : : /* var >> const when lower(var) > upper(const) */
503 : 9 : hist_selec =
504 : 9 : 1 - calc_hist_selectivity_scalar(typcache, &const_upper,
505 : : hist_lower, nhist, true);
506 : 9 : break;
507 : :
508 : 9 : case OID_RANGE_OVERLAPS_RIGHT_OP:
509 : : /* compare lower bounds */
510 : 9 : hist_selec =
511 : 9 : 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
512 : : hist_lower, nhist, false);
513 : 9 : break;
514 : :
515 : 9 : case OID_RANGE_OVERLAPS_LEFT_OP:
516 : : /* compare upper bounds */
517 : : hist_selec =
518 : 9 : calc_hist_selectivity_scalar(typcache, &const_upper,
519 : : hist_upper, nhist, true);
520 : 9 : break;
521 : :
522 : 18 : case OID_RANGE_OVERLAP_OP:
523 : : case OID_RANGE_CONTAINS_ELEM_OP:
524 : :
525 : : /*
526 : : * A && B <=> NOT (A << B OR A >> B).
527 : : *
528 : : * Since A << B and A >> B are mutually exclusive events we can
529 : : * sum their probabilities to find probability of (A << B OR A >>
530 : : * B).
531 : : *
532 : : * "range @> elem" is equivalent to "range && [elem,elem]". The
533 : : * caller already constructed the singular range from the element
534 : : * constant, so just treat it the same as &&.
535 : : */
536 : : hist_selec =
537 : 18 : calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
538 : : nhist, false);
539 : 18 : hist_selec +=
540 : 18 : (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
541 : : nhist, true));
542 : 18 : hist_selec = 1.0 - hist_selec;
543 : 18 : break;
544 : :
545 : 9 : case OID_RANGE_CONTAINS_OP:
546 : : hist_selec =
4611 547 : 9 : calc_hist_selectivity_contains(typcache, &const_lower,
548 : : &const_upper, hist_lower, nhist,
549 : : lslot.values, lslot.nvalues);
550 : 9 : break;
551 : :
4810 552 : 9 : case OID_RANGE_CONTAINED_OP:
4611 553 [ - + ]: 9 : if (const_lower.infinite)
554 : : {
555 : : /*
556 : : * Lower bound no longer matters. Just estimate the fraction
557 : : * with an upper bound <= const upper bound
558 : : */
559 : : hist_selec =
4611 heikki.linnakangas@i 560 :UBC 0 : calc_hist_selectivity_scalar(typcache, &const_upper,
561 : : hist_upper, nhist, true);
562 : : }
4611 heikki.linnakangas@i 563 [ - + ]:CBC 9 : else if (const_upper.infinite)
564 : : {
4611 heikki.linnakangas@i 565 :UBC 0 : hist_selec =
566 : 0 : 1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
567 : : hist_lower, nhist, false);
568 : : }
569 : : else
570 : : {
571 : : hist_selec =
4611 heikki.linnakangas@i 572 :CBC 9 : calc_hist_selectivity_contained(typcache, &const_lower,
573 : : &const_upper, hist_lower, nhist,
574 : : lslot.values, lslot.nvalues);
575 : : }
4810 576 : 9 : break;
577 : :
4810 heikki.linnakangas@i 578 :UBC 0 : default:
579 [ # # ]: 0 : elog(ERROR, "unknown range operator %u", operator);
580 : : hist_selec = -1.0; /* keep compiler quiet */
581 : : break;
582 : : }
583 : :
3090 tgl@sss.pgh.pa.us 584 :CBC 72 : free_attstatsslot(&lslot);
585 : 72 : free_attstatsslot(&hslot);
586 : :
4810 heikki.linnakangas@i 587 : 72 : return hist_selec;
588 : : }
589 : :
590 : :
591 : : /*
592 : : * Look up the fraction of values less than (or equal, if 'equal' argument
593 : : * is true) a given const in a histogram of range bounds.
594 : : */
595 : : static double
2189 peter@eisentraut.org 596 : 72 : calc_hist_selectivity_scalar(TypeCacheEntry *typcache, const RangeBound *constbound,
597 : : const RangeBound *hist, int hist_nvalues, bool equal)
598 : : {
599 : : Selectivity selec;
600 : : int index;
601 : :
602 : : /*
603 : : * Find the histogram bin the given constant falls into. Estimate
604 : : * selectivity as the number of preceding whole bins.
605 : : */
4810 heikki.linnakangas@i 606 : 72 : index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
607 [ + + ]: 72 : selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
608 : :
609 : : /* Adjust using linear interpolation within the bin */
610 [ + + + - ]: 72 : if (index >= 0 && index < hist_nvalues - 1)
611 : 108 : selec += get_position(typcache, constbound, &hist[index],
3051 tgl@sss.pgh.pa.us 612 : 54 : &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
613 : :
4810 heikki.linnakangas@i 614 : 72 : return selec;
615 : : }
616 : :
617 : : /*
618 : : * Binary search on an array of range bounds. Returns greatest index of range
619 : : * bound in array which is less(less or equal) than given range bound. If all
620 : : * range bounds in array are greater or equal(greater) than given range bound,
621 : : * return -1. When "equal" flag is set conditions in brackets are used.
622 : : *
623 : : * This function is used in scalar operator selectivity estimation. Another
624 : : * goal of this function is to find a histogram bin where to stop
625 : : * interpolation of portion of bounds which are less than or equal to given bound.
626 : : */
627 : : static int
2189 peter@eisentraut.org 628 : 90 : rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist,
629 : : int hist_length, bool equal)
630 : : {
4810 heikki.linnakangas@i 631 : 90 : int lower = -1,
632 : 90 : upper = hist_length - 1,
633 : : cmp,
634 : : middle;
635 : :
636 [ + + ]: 684 : while (lower < upper)
637 : : {
638 : 594 : middle = (lower + upper + 1) / 2;
639 : 594 : cmp = range_cmp_bounds(typcache, &hist[middle], value);
640 : :
641 [ + + + + : 594 : if (cmp < 0 || (equal && cmp == 0))
- + ]
642 : 171 : lower = middle;
643 : : else
644 : 423 : upper = middle - 1;
645 : : }
646 : 90 : return lower;
647 : : }
648 : :
649 : :
650 : : /*
651 : : * Binary search on length histogram. Returns greatest index of range length in
652 : : * histogram which is less than (less than or equal) the given length value. If
653 : : * all lengths in the histogram are greater than (greater than or equal) the
654 : : * given length, returns -1.
655 : : */
656 : : static int
4611 657 : 54 : length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
658 : : double value, bool equal)
659 : : {
660 : 54 : int lower = -1,
661 : 54 : upper = length_hist_nvalues - 1,
662 : : middle;
663 : :
664 [ + + ]: 414 : while (lower < upper)
665 : : {
666 : : double middleval;
667 : :
668 : 360 : middle = (lower + upper + 1) / 2;
669 : :
670 : 360 : middleval = DatumGetFloat8(length_hist_values[middle]);
671 [ + + + + : 360 : if (middleval < value || (equal && middleval <= value))
- + ]
672 : 135 : lower = middle;
673 : : else
674 : 225 : upper = middle - 1;
675 : : }
676 : 54 : return lower;
677 : : }
678 : :
679 : : /*
680 : : * Get relative position of value in histogram bin in [0,1] range.
681 : : */
682 : : static float8
2189 peter@eisentraut.org 683 : 81 : get_position(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist1,
684 : : const RangeBound *hist2)
685 : : {
4810 heikki.linnakangas@i 686 : 81 : bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
687 : : float8 position;
688 : :
689 [ + - + - ]: 81 : if (!hist1->infinite && !hist2->infinite)
690 : : {
691 : : float8 bin_width;
692 : :
693 : : /*
694 : : * Both bounds are finite. Assuming the subtype's comparison function
695 : : * works sanely, the value must be finite, too, because it lies
696 : : * somewhere between the bounds. If it doesn't, arbitrarily return
697 : : * 0.5.
698 : : */
699 [ - + ]: 81 : if (value->infinite)
4810 heikki.linnakangas@i 700 :UBC 0 : return 0.5;
701 : :
702 : : /* Can't interpolate without subdiff function */
4810 heikki.linnakangas@i 703 [ - + ]:CBC 81 : if (!has_subdiff)
4810 heikki.linnakangas@i 704 :UBC 0 : return 0.5;
705 : :
706 : : /* Calculate relative position using subdiff function. */
2116 tgl@sss.pgh.pa.us 707 :CBC 81 : bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
708 : : typcache->rng_collation,
4810 heikki.linnakangas@i 709 : 81 : hist2->val,
710 : 81 : hist1->val));
2116 tgl@sss.pgh.pa.us 711 [ + - - + ]: 81 : if (isnan(bin_width) || bin_width <= 0.0)
2116 tgl@sss.pgh.pa.us 712 :UBC 0 : return 0.5; /* punt for NaN or zero-width bin */
713 : :
2116 tgl@sss.pgh.pa.us 714 :CBC 81 : position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
715 : : typcache->rng_collation,
4810 heikki.linnakangas@i 716 : 81 : value->val,
717 : 81 : hist1->val))
718 : : / bin_width;
719 : :
2116 tgl@sss.pgh.pa.us 720 [ - + ]: 81 : if (isnan(position))
2116 tgl@sss.pgh.pa.us 721 :UBC 0 : return 0.5; /* punt for NaN from subdiff, Inf/Inf, etc */
722 : :
723 : : /* Relative position must be in [0,1] range */
4810 heikki.linnakangas@i 724 [ + - ]:CBC 81 : position = Max(position, 0.0);
725 [ + - ]: 81 : position = Min(position, 1.0);
726 : 81 : return position;
727 : : }
4810 heikki.linnakangas@i 728 [ # # # # ]:UBC 0 : else if (hist1->infinite && !hist2->infinite)
729 : : {
730 : : /*
731 : : * Lower bin boundary is -infinite, upper is finite. If the value is
732 : : * -infinite, return 0.0 to indicate it's equal to the lower bound.
733 : : * Otherwise return 1.0 to indicate it's infinitely far from the lower
734 : : * bound.
735 : : */
736 [ # # # # ]: 0 : return ((value->infinite && value->lower) ? 0.0 : 1.0);
737 : : }
738 [ # # # # ]: 0 : else if (!hist1->infinite && hist2->infinite)
739 : : {
740 : : /* same as above, but in reverse */
741 [ # # # # ]: 0 : return ((value->infinite && !value->lower) ? 1.0 : 0.0);
742 : : }
743 : : else
744 : : {
745 : : /*
746 : : * If both bin boundaries are infinite, they should be equal to each
747 : : * other, and the value should also be infinite and equal to both
748 : : * bounds. (But don't Assert that, to avoid crashing if a user creates
749 : : * a datatype with a broken comparison function).
750 : : *
751 : : * Assume the value to lie in the middle of the infinite bounds.
752 : : */
753 : 0 : return 0.5;
754 : : }
755 : : }
756 : :
757 : :
758 : : /*
759 : : * Get relative position of value in a length histogram bin in [0,1] range.
760 : : */
761 : : static double
4611 heikki.linnakangas@i 762 :CBC 81 : get_len_position(double value, double hist1, double hist2)
763 : : {
2579 tgl@sss.pgh.pa.us 764 [ + - + + ]: 81 : if (!isinf(hist1) && !isinf(hist2))
765 : : {
766 : : /*
767 : : * Both bounds are finite. The value should be finite too, because it
768 : : * lies somewhere between the bounds. If it doesn't, just return
769 : : * something.
770 : : */
771 [ - + ]: 63 : if (isinf(value))
4611 heikki.linnakangas@i 772 :UBC 0 : return 0.5;
773 : :
4611 heikki.linnakangas@i 774 :CBC 63 : return 1.0 - (hist2 - value) / (hist2 - hist1);
775 : : }
2579 tgl@sss.pgh.pa.us 776 [ - + - - ]: 18 : else if (isinf(hist1) && !isinf(hist2))
777 : : {
778 : : /*
779 : : * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
780 : : * indicate the value is infinitely far from the lower bound.
781 : : */
4611 heikki.linnakangas@i 782 :UBC 0 : return 1.0;
783 : : }
2579 tgl@sss.pgh.pa.us 784 [ - + - - ]:CBC 18 : else if (isinf(hist1) && isinf(hist2))
785 : : {
786 : : /* same as above, but in reverse */
4611 heikki.linnakangas@i 787 :UBC 0 : return 0.0;
788 : : }
789 : : else
790 : : {
791 : : /*
792 : : * If both bin boundaries are infinite, they should be equal to each
793 : : * other, and the value should also be infinite and equal to both
794 : : * bounds. (But don't Assert that, to avoid crashing unnecessarily if
795 : : * the caller messes up)
796 : : *
797 : : * Assume the value to lie in the middle of the infinite bounds.
798 : : */
4611 heikki.linnakangas@i 799 :CBC 18 : return 0.5;
800 : : }
801 : : }
802 : :
803 : : /*
804 : : * Measure distance between two range bounds.
805 : : */
806 : : static float8
2189 peter@eisentraut.org 807 : 63 : get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBound *bound2)
808 : : {
4535 bruce@momjian.us 809 : 63 : bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
810 : :
4611 heikki.linnakangas@i 811 [ + + + - ]: 63 : if (!bound1->infinite && !bound2->infinite)
812 : : {
813 : : /*
814 : : * Neither bound is infinite, use subdiff function or return default
815 : : * value of 1.0 if no subdiff is available.
816 : : */
817 [ + - ]: 45 : if (has_subdiff)
818 : : {
819 : : float8 res;
820 : :
2116 tgl@sss.pgh.pa.us 821 : 45 : res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
822 : : typcache->rng_collation,
823 : 45 : bound2->val,
824 : 45 : bound1->val));
825 : : /* Reject possible NaN result, also negative result */
826 [ + - - + ]: 45 : if (isnan(res) || res < 0.0)
2116 tgl@sss.pgh.pa.us 827 :UBC 0 : return 1.0;
828 : : else
2116 tgl@sss.pgh.pa.us 829 :CBC 45 : return res;
830 : : }
831 : : else
4611 heikki.linnakangas@i 832 :UBC 0 : return 1.0;
833 : : }
4611 heikki.linnakangas@i 834 [ + - - + ]:CBC 18 : else if (bound1->infinite && bound2->infinite)
835 : : {
836 : : /* Both bounds are infinite */
4611 heikki.linnakangas@i 837 [ # # ]:UBC 0 : if (bound1->lower == bound2->lower)
838 : 0 : return 0.0;
839 : : else
840 : 0 : return get_float8_infinity();
841 : : }
842 : : else
843 : : {
844 : : /* One bound is infinite, the other is not */
4611 heikki.linnakangas@i 845 :CBC 18 : return get_float8_infinity();
846 : : }
847 : : }
848 : :
849 : : /*
850 : : * Calculate the average of function P(x), in the interval [length1, length2],
851 : : * where P(x) is the fraction of tuples with length < x (or length <= x if
852 : : * 'equal' is true).
853 : : */
854 : : static double
855 : 54 : calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
856 : : double length1, double length2, bool equal)
857 : : {
858 : : double frac;
859 : : double A,
860 : : B,
861 : : PA,
862 : : PB;
863 : : double pos;
864 : : int i;
865 : : double area;
866 : :
867 [ - + ]: 54 : Assert(length2 >= length1);
868 : :
869 [ - + ]: 54 : if (length2 < 0.0)
4535 bruce@momjian.us 870 :UBC 0 : return 0.0; /* shouldn't happen, but doesn't hurt to check */
871 : :
872 : : /* All lengths in the table are <= infinite. */
2579 tgl@sss.pgh.pa.us 873 [ + + - + ]:CBC 54 : if (isinf(length2) && equal)
4611 heikki.linnakangas@i 874 :UBC 0 : return 1.0;
875 : :
876 : : /*----------
877 : : * The average of a function between A and B can be calculated by the
878 : : * formula:
879 : : *
880 : : * B
881 : : * 1 /
882 : : * ------- | P(x)dx
883 : : * B - A /
884 : : * A
885 : : *
886 : : * The geometrical interpretation of the integral is the area under the
887 : : * graph of P(x). P(x) is defined by the length histogram. We calculate
888 : : * the area in a piecewise fashion, iterating through the length histogram
889 : : * bins. Each bin is a trapezoid:
890 : : *
891 : : * P(x2)
892 : : * /|
893 : : * / |
894 : : * P(x1)/ |
895 : : * | |
896 : : * | |
897 : : * ---+---+--
898 : : * x1 x2
899 : : *
900 : : * where x1 and x2 are the boundaries of the current histogram, and P(x1)
901 : : * and P(x1) are the cumulative fraction of tuples at the boundaries.
902 : : *
903 : : * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
904 : : *
905 : : * The first bin contains the lower bound passed by the caller, so we
906 : : * use linear interpolation between the previous and next histogram bin
907 : : * boundary to calculate P(x1). Likewise for the last bin: we use linear
908 : : * interpolation to calculate P(x2). For the bins in between, x1 and x2
909 : : * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
910 : : * P(x1) = (bin index) / (number of bins)
911 : : * P(x2) = (bin index + 1 / (number of bins)
912 : : */
913 : :
914 : : /* First bin, the one that contains lower bound */
4611 heikki.linnakangas@i 915 :CBC 54 : i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
916 [ - + ]: 54 : if (i >= length_hist_nvalues - 1)
4611 heikki.linnakangas@i 917 :UBC 0 : return 1.0;
918 : :
4611 heikki.linnakangas@i 919 [ + + ]:CBC 54 : if (i < 0)
920 : : {
921 : 18 : i = 0;
922 : 18 : pos = 0.0;
923 : : }
924 : : else
925 : : {
926 : : /* interpolate length1's position in the bin */
927 : 36 : pos = get_len_position(length1,
928 : 36 : DatumGetFloat8(length_hist_values[i]),
929 : 36 : DatumGetFloat8(length_hist_values[i + 1]));
930 : : }
931 : 54 : PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
932 : 54 : B = length1;
933 : :
934 : : /*
935 : : * In the degenerate case that length1 == length2, simply return
936 : : * P(length1). This is not merely an optimization: if length1 == length2,
937 : : * we'd divide by zero later on.
938 : : */
939 [ + + ]: 54 : if (length2 == length1)
940 : 9 : return PB;
941 : :
942 : : /*
943 : : * Loop through all the bins, until we hit the last bin, the one that
944 : : * contains the upper bound. (if lower and upper bounds are in the same
945 : : * bin, this falls out immediately)
946 : : */
947 : 45 : area = 0.0;
948 [ + - ]: 1593 : for (; i < length_hist_nvalues - 1; i++)
949 : : {
4535 bruce@momjian.us 950 : 1593 : double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
951 : :
952 : : /* check if we've reached the last bin */
4611 heikki.linnakangas@i 953 [ + + + + : 1593 : if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
- + ]
954 : : break;
955 : :
956 : : /* the upper bound of previous bin is the lower bound of this bin */
4535 bruce@momjian.us 957 : 1548 : A = B;
958 : 1548 : PA = PB;
959 : :
4611 heikki.linnakangas@i 960 : 1548 : B = bin_upper;
961 : 1548 : PB = (double) i / (double) (length_hist_nvalues - 1);
962 : :
963 : : /*
964 : : * Add the area of this trapezoid to the total. The point of the
965 : : * if-check is to avoid NaN, in the corner case that PA == PB == 0,
966 : : * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
967 : : * 0) is zero, regardless of the width (B - A).
968 : : */
969 [ + + + + ]: 1548 : if (PA > 0 || PB > 0)
970 : 1530 : area += 0.5 * (PB + PA) * (B - A);
971 : : }
972 : :
973 : : /* Last bin */
4535 bruce@momjian.us 974 : 45 : A = B;
975 : 45 : PA = PB;
976 : :
977 : 45 : B = length2; /* last bin ends at the query upper bound */
4611 heikki.linnakangas@i 978 [ - + ]: 45 : if (i >= length_hist_nvalues - 1)
4611 heikki.linnakangas@i 979 :UBC 0 : pos = 0.0;
980 : : else
981 : : {
4611 heikki.linnakangas@i 982 [ - + ]:CBC 45 : if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
4611 heikki.linnakangas@i 983 :UBC 0 : pos = 0.0;
984 : : else
4611 heikki.linnakangas@i 985 :CBC 45 : pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1]));
986 : : }
987 : 45 : PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
988 : :
989 [ - + - - ]: 45 : if (PA > 0 || PB > 0)
990 : 45 : area += 0.5 * (PB + PA) * (B - A);
991 : :
992 : : /*
993 : : * Ok, we have calculated the area, ie. the integral. Divide by width to
994 : : * get the requested average.
995 : : *
996 : : * Avoid NaN arising from infinite / infinite. This happens at least if
997 : : * length2 is infinite. It's not clear what the correct value would be in
998 : : * that case, so 0.5 seems as good as any value.
999 : : */
2579 tgl@sss.pgh.pa.us 1000 [ + + + - ]: 45 : if (isinf(area) && isinf(length2))
4611 heikki.linnakangas@i 1001 : 9 : frac = 0.5;
1002 : : else
1003 : 36 : frac = area / (length2 - length1);
1004 : :
1005 : 45 : return frac;
1006 : : }
1007 : :
1008 : : /*
1009 : : * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
1010 : : * of ranges that fall within the constant lower and upper bounds. This uses
1011 : : * the histograms of range lower bounds and range lengths, on the assumption
1012 : : * that the range lengths are independent of the lower bounds.
1013 : : *
1014 : : * The caller has already checked that constant lower and upper bounds are
1015 : : * finite.
1016 : : */
1017 : : static double
1018 : 9 : calc_hist_selectivity_contained(TypeCacheEntry *typcache,
1019 : : const RangeBound *lower, RangeBound *upper,
1020 : : const RangeBound *hist_lower, int hist_nvalues,
1021 : : Datum *length_hist_values, int length_hist_nvalues)
1022 : : {
1023 : : int i,
1024 : : upper_index;
1025 : : float8 prev_dist;
1026 : : double bin_width;
1027 : : double upper_bin_width;
1028 : : double sum_frac;
1029 : :
1030 : : /*
1031 : : * Begin by finding the bin containing the upper bound, in the lower bound
1032 : : * histogram. Any range with a lower bound > constant upper bound can't
1033 : : * match, ie. there are no matches in bins greater than upper_index.
1034 : : */
1035 : 9 : upper->inclusive = !upper->inclusive;
1036 : 9 : upper->lower = true;
1037 : 9 : upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1038 : : false);
1039 : :
1040 : : /*
1041 : : * If the upper bound value is below the histogram's lower limit, there
1042 : : * are no matches.
1043 : : */
2116 tgl@sss.pgh.pa.us 1044 [ - + ]: 9 : if (upper_index < 0)
2116 tgl@sss.pgh.pa.us 1045 :UBC 0 : return 0.0;
1046 : :
1047 : : /*
1048 : : * If the upper bound value is at or beyond the histogram's upper limit,
1049 : : * start our loop at the last actual bin, as though the upper bound were
1050 : : * within that bin; get_position will clamp its result to 1.0 anyway.
1051 : : * (This corresponds to assuming that the data population above the
1052 : : * histogram's upper limit is empty, exactly like what we just assumed for
1053 : : * the lower limit.)
1054 : : */
2116 tgl@sss.pgh.pa.us 1055 :CBC 9 : upper_index = Min(upper_index, hist_nvalues - 2);
1056 : :
1057 : : /*
1058 : : * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1059 : : * upper_index + 1) bin which is greater than upper bound of query range
1060 : : * using linear interpolation of subdiff function.
1061 : : */
1062 : 9 : upper_bin_width = get_position(typcache, upper,
1063 : 9 : &hist_lower[upper_index],
1064 : 9 : &hist_lower[upper_index + 1]);
1065 : :
1066 : : /*
1067 : : * In the loop, dist and prev_dist are the distance of the "current" bin's
1068 : : * lower and upper bounds from the constant upper bound.
1069 : : *
1070 : : * bin_width represents the width of the current bin. Normally it is 1.0,
1071 : : * meaning a full width bin, but can be less in the corner cases: start
1072 : : * and end of the loop. We start with bin_width = upper_bin_width, because
1073 : : * we begin at the bin containing the upper bound.
1074 : : */
4611 heikki.linnakangas@i 1075 : 9 : prev_dist = 0.0;
1076 : 9 : bin_width = upper_bin_width;
1077 : :
1078 : 9 : sum_frac = 0.0;
1079 [ + - ]: 27 : for (i = upper_index; i >= 0; i--)
1080 : : {
1081 : : double dist;
1082 : : double length_hist_frac;
1083 : 27 : bool final_bin = false;
1084 : :
1085 : : /*
1086 : : * dist -- distance from upper bound of query range to lower bound of
1087 : : * the current bin in the lower bound histogram. Or to the lower bound
1088 : : * of the constant range, if this is the final bin, containing the
1089 : : * constant lower bound.
1090 : : */
1091 [ + + ]: 27 : if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1092 : : {
1093 : 9 : dist = get_distance(typcache, lower, upper);
1094 : :
1095 : : /*
1096 : : * Subtract from bin_width the portion of this bin that we want to
1097 : : * ignore.
1098 : : */
1099 : 18 : bin_width -= get_position(typcache, lower, &hist_lower[i],
1100 : 9 : &hist_lower[i + 1]);
1101 [ - + ]: 9 : if (bin_width < 0.0)
4611 heikki.linnakangas@i 1102 :UBC 0 : bin_width = 0.0;
4611 heikki.linnakangas@i 1103 :CBC 9 : final_bin = true;
1104 : : }
1105 : : else
1106 : 18 : dist = get_distance(typcache, &hist_lower[i], upper);
1107 : :
1108 : : /*
1109 : : * Estimate the fraction of tuples in this bin that are narrow enough
1110 : : * to not exceed the distance to the upper bound of the query range.
1111 : : */
1112 : 27 : length_hist_frac = calc_length_hist_frac(length_hist_values,
1113 : : length_hist_nvalues,
1114 : : prev_dist, dist, true);
1115 : :
1116 : : /*
1117 : : * Add the fraction of tuples in this bin, with a suitable length, to
1118 : : * the total.
1119 : : */
1120 : 27 : sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1121 : :
1122 [ + + ]: 27 : if (final_bin)
1123 : 9 : break;
1124 : :
1125 : 18 : bin_width = 1.0;
1126 : 18 : prev_dist = dist;
1127 : : }
1128 : :
1129 : 9 : return sum_frac;
1130 : : }
1131 : :
1132 : : /*
1133 : : * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1134 : : * of ranges that contain the constant lower and upper bounds. This uses
1135 : : * the histograms of range lower bounds and range lengths, on the assumption
1136 : : * that the range lengths are independent of the lower bounds.
1137 : : */
1138 : : static double
1139 : 9 : calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1140 : : const RangeBound *lower, const RangeBound *upper,
1141 : : const RangeBound *hist_lower, int hist_nvalues,
1142 : : Datum *length_hist_values, int length_hist_nvalues)
1143 : : {
1144 : : int i,
1145 : : lower_index;
1146 : : double bin_width,
1147 : : lower_bin_width;
1148 : : double sum_frac;
1149 : : float8 prev_dist;
1150 : :
1151 : : /* Find the bin containing the lower bound of query range. */
1152 : 9 : lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1153 : : true);
1154 : :
1155 : : /*
1156 : : * If the lower bound value is below the histogram's lower limit, there
1157 : : * are no matches.
1158 : : */
2116 tgl@sss.pgh.pa.us 1159 [ - + ]: 9 : if (lower_index < 0)
2116 tgl@sss.pgh.pa.us 1160 :UBC 0 : return 0.0;
1161 : :
1162 : : /*
1163 : : * If the lower bound value is at or beyond the histogram's upper limit,
1164 : : * start our loop at the last actual bin, as though the upper bound were
1165 : : * within that bin; get_position will clamp its result to 1.0 anyway.
1166 : : * (This corresponds to assuming that the data population above the
1167 : : * histogram's upper limit is empty, exactly like what we just assumed for
1168 : : * the lower limit.)
1169 : : */
2116 tgl@sss.pgh.pa.us 1170 :CBC 9 : lower_index = Min(lower_index, hist_nvalues - 2);
1171 : :
1172 : : /*
1173 : : * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1174 : : * lower_index + 1) bin which is greater than lower bound of query range
1175 : : * using linear interpolation of subdiff function.
1176 : : */
1177 : 9 : lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1178 : 9 : &hist_lower[lower_index + 1]);
1179 : :
1180 : : /*
1181 : : * Loop through all the lower bound bins, smaller than the query lower
1182 : : * bound. In the loop, dist and prev_dist are the distance of the
1183 : : * "current" bin's lower and upper bounds from the constant upper bound.
1184 : : * We begin from query lower bound, and walk backwards, so the first bin's
1185 : : * upper bound is the query lower bound, and its distance to the query
1186 : : * upper bound is the length of the query range.
1187 : : *
1188 : : * bin_width represents the width of the current bin. Normally it is 1.0,
1189 : : * meaning a full width bin, except for the first bin, which is only
1190 : : * counted up to the constant lower bound.
1191 : : */
4611 heikki.linnakangas@i 1192 : 9 : prev_dist = get_distance(typcache, lower, upper);
1193 : 9 : sum_frac = 0.0;
1194 : 9 : bin_width = lower_bin_width;
1195 [ + + ]: 36 : for (i = lower_index; i >= 0; i--)
1196 : : {
1197 : : float8 dist;
1198 : : double length_hist_frac;
1199 : :
1200 : : /*
1201 : : * dist -- distance from upper bound of query range to current value
1202 : : * of lower bound histogram or lower bound of query range (if we've
1203 : : * reach it).
1204 : : */
1205 : 27 : dist = get_distance(typcache, &hist_lower[i], upper);
1206 : :
1207 : : /*
1208 : : * Get average fraction of length histogram which covers intervals
1209 : : * longer than (or equal to) distance to upper bound of query range.
1210 : : */
1211 : 27 : length_hist_frac =
1212 : 27 : 1.0 - calc_length_hist_frac(length_hist_values,
1213 : : length_hist_nvalues,
1214 : : prev_dist, dist, false);
1215 : :
1216 : 27 : sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1217 : :
1218 : 27 : bin_width = 1.0;
1219 : 27 : prev_dist = dist;
1220 : : }
1221 : :
1222 : 9 : return sum_frac;
1223 : : }
|