Numeric histogram: Simplify bin calculation

This commit is contained in:
Raul Marin 2019-03-04 15:52:27 +01:00 committed by Raul Marin
parent 6241b23d4f
commit 730076469e

View File

@ -4,41 +4,37 @@ const BaseHistogram = require('./base-histogram');
const debug = require('debug')('windshaft:dataview:numeric-histogram');
const utils = require('../../../utils/query-utils');
/** Query to get min and max values from the query */
/** Query to get min, max, count and (if necessary) bin number of the query */
const irqQueryTpl = ctx => `
__cdb_filtered_source AS (
SELECT *
FROM (${ctx.query}) __cdb_filtered_source_query
WHERE ${utils.handleFloatColumn(ctx)} IS NOT NULL
),
__cdb_basics AS (
SELECT
*,
CASE
WHEN __cdb_total_rows = 0 OR __cdb_iqr = 0 THEN 1
ELSE GREATEST(
LEAST(
${ctx.minBins},
__cdb_total_rows::int),
LEAST(
${ctx.maxBins},
((__cdb_max_val - __cdb_min_val) / (2 * __cdb_iqr * power(__cdb_total_rows, 1/3)))::int)
)
END AS __cdb_bins_number
FROM
(
SELECT
max(${ctx.column}) AS __cdb_max_val,
min(${ctx.column}) AS __cdb_min_val,
count(1) AS __cdb_total_rows,
${ctx.irq ? ctx.irq : `0`} AS __cdb_iqr
FROM __cdb_filtered_source
)
`;
/* Query to calculate the number of bins (needs irqQueryTpl before it).
* It uses the Freedman-Diaconis rule to calculate the width of the bins */
const binsQueryTpl = ctx => `
__cdb_bins AS (
SELECT
CASE WHEN __cdb_total_rows = 0 OR __cdb_iqr = 0
THEN 1
ELSE GREATEST(
LEAST(${ctx.minBins}, CAST(__cdb_total_rows AS INT)),
LEAST(
CAST(((__cdb_max_val - __cdb_min_val) / (2 * __cdb_iqr * power(__cdb_total_rows, 1/3))) AS INT),
${ctx.maxBins}
)
)
END AS __cdb_bins_number
FROM __cdb_basics, __cdb_filtered_source
LIMIT 1
)
FROM
(
SELECT *
FROM (${ctx.query}) __cdb_filtered_source_query
WHERE ${utils.handleFloatColumn(ctx)} IS NOT NULL
) __cdb_filtered_source
) __cdb_basics_2
)
`;
const BIN_MIN_NUMBER = 6;
@ -108,11 +104,11 @@ module.exports = class NumericHistogram extends BaseHistogram {
}
if (ctx.bins <= 0) {
ctx.bins = `__cdb_bins.__cdb_bins_number`;
ctx.irq = `percentile_disc(0.75) within group (order by ${ctx.column}) - percentile_disc(0.25) within group (order by ${ctx.column})`;
extra_groupby += `, __cdb_bins.__cdb_bins_number`;
extra_tables += `, __cdb_bins`;
extra_queries = `WITH ${irqQueryTpl(ctx)}, ${binsQueryTpl(ctx)}`;
ctx.bins = `__cdb_basics.__cdb_bins_number`;
ctx.irq = `percentile_disc(0.75) within group (order by ${ctx.column})
- percentile_disc(0.25) within group (order by ${ctx.column})`;
extra_groupby += `, __cdb_basics.__cdb_bins_number`;
extra_queries = `WITH ${irqQueryTpl(ctx)}`;
}
return `