'use strict';

const BaseHistogram = require('./base-histogram');
const debug = require('debug')('windshaft:dataview:numeric-histogram');
const utils = require('../../../utils/query-utils');

/**
 * Builds the `__cdb_basics` common table expression (without the leading
 * `WITH` keyword). The CTE exposes the max value, min value and row count of
 * the histogram column, the interquartile range, and a `__cdb_bins_number`
 * derived from them.
 *
 * Rows for which `utils.handleFloatColumn(ctx)` is NULL are excluded from
 * the aggregates.
 *
 * `__cdb_bins_number` is 1 when there are no rows or the interquartile range
 * is 0; otherwise it is
 * GREATEST(LEAST(minBins, row count), LEAST(maxBins, computed estimate)).
 *
 * NOTE(review): the estimate uses `power(__cdb_total_rows, 1/3)`. Both
 * operands of `1/3` are integer literals, so in PostgreSQL this is presumably
 * an integer division yielding 0 (making `power(...)` always 1). The
 * expression is left untouched here because changing it would alter the
 * number of bins returned to existing callers -- confirm the intent before
 * touching it.
 *
 * @param {Object} ctx
 * @param {string} ctx.column - SQL expression of the column to aggregate
 * @param {string} ctx.query - Subquery the rows are read from
 * @param {number} ctx.minBins - Interpolated into LEAST(minBins, row count)
 * @param {number} ctx.maxBins - Interpolated into LEAST(maxBins, estimate)
 * @param {string} [ctx.irq] - SQL expression for the interquartile range;
 *                             when falsy a literal `0` is used instead
 * @returns {string} The SQL text of the CTE
 */
const irqQueryTpl = ctx => `
    __cdb_basics AS (
        SELECT
            *,
            CASE
                WHEN __cdb_total_rows = 0 OR __cdb_iqr = 0 THEN 1
                ELSE GREATEST(
                    LEAST(
                        ${ctx.minBins},
                        __cdb_total_rows::int),
                    LEAST(
                        ${ctx.maxBins},
                        ((__cdb_max_val - __cdb_min_val) / (2 * __cdb_iqr * power(__cdb_total_rows, 1/3)))::int)
                    )
            END AS __cdb_bins_number
        FROM
        (
            SELECT
                max(${ctx.column}) AS __cdb_max_val,
                min(${ctx.column}) AS __cdb_min_val,
                count(1) AS __cdb_total_rows,
                ${ctx.irq || '0'} AS __cdb_iqr
            FROM
            (
                SELECT *
                FROM (${ctx.query}) __cdb_filtered_source_query
                WHERE ${utils.handleFloatColumn(ctx)} IS NOT NULL
            ) __cdb_filtered_source
        ) __cdb_basics_2
    )
`;

// Bounds handed to the histogram query context as `minBins` / `maxBins`.
// They only matter when the number of bins is computed in SQL rather than
// supplied by the caller.
const BIN_MIN_NUMBER = 6;
const BIN_MAX_NUMBER = 48;

/**
|
|
|
|
Numeric histogram:
|
|
|
|
{
|
|
|
|
type: 'histogram',
|
|
|
|
options: {
|
|
|
|
column: 'name', // column data type: numeric
|
|
|
|
bins: 10 // OPTIONAL
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*/
|
2017-09-11 19:54:46 +08:00
|
|
|
module.exports = class NumericHistogram extends BaseHistogram {
|
2017-09-08 16:29:54 +08:00
|
|
|
constructor (query, options, queries) {
|
2017-09-11 17:30:09 +08:00
|
|
|
super(query, options, queries);
|
2017-09-08 16:29:54 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
_buildQuery (psql, override, callback) {
|
2017-09-08 18:15:21 +08:00
|
|
|
const histogramSql = this._buildQueryTpl({
|
2017-11-29 20:07:59 +08:00
|
|
|
column: this._columnType === 'date' ? utils.columnCastTpl({ column: this.column }) : this.column,
|
2017-09-12 19:05:46 +08:00
|
|
|
isFloatColumn: this._columnType === 'float',
|
|
|
|
query: this.query,
|
|
|
|
start: this._getBinStart(override),
|
|
|
|
end: this._getBinEnd(override),
|
2017-11-29 00:51:28 +08:00
|
|
|
bins: this._getBinsCount(override),
|
2017-09-12 19:05:46 +08:00
|
|
|
minBins: BIN_MIN_NUMBER,
|
2017-11-29 00:51:28 +08:00
|
|
|
maxBins: BIN_MAX_NUMBER
|
2017-09-08 16:29:54 +08:00
|
|
|
});
|
|
|
|
|
2017-09-12 00:44:14 +08:00
|
|
|
debug(histogramSql);
|
|
|
|
|
2017-09-08 16:29:54 +08:00
|
|
|
return callback(null, histogramSql);
|
|
|
|
}
|
|
|
|
|
2017-11-29 00:51:28 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* ctx: Object with the following values
|
|
|
|
* ctx.column -- Column for the histogram
|
|
|
|
* ctx.isFloatColumn - Whether the column is float or not
|
|
|
|
* ctx.query -- Subquery to extract data
|
|
|
|
* ctx.start -- Start value for the bins. [>= end to force calculation]
|
|
|
|
* ctx.end -- End value for the bins.
|
|
|
|
* ctx.bins -- Numbers of bins to generate [<0 to force calculation]
|
|
|
|
* ctx.minBins - If !full min bins to calculate [Optional]
|
|
|
|
* ctx.maxBins - If !full max bins to calculate [Optional]
|
|
|
|
*/
|
2017-09-08 18:15:21 +08:00
|
|
|
_buildQueryTpl (ctx) {
|
2017-11-29 00:51:28 +08:00
|
|
|
var extra_tables = ``;
|
|
|
|
var extra_queries = ``;
|
|
|
|
var extra_groupby = ``;
|
2018-07-05 18:39:26 +08:00
|
|
|
var extra_filter = ``;
|
2017-11-29 00:51:28 +08:00
|
|
|
|
2018-07-05 23:21:35 +08:00
|
|
|
if (ctx.start < ctx.end) {
|
|
|
|
extra_filter = `
|
|
|
|
WHERE __ctx_query.${ctx.column} >= ${ctx.start}
|
|
|
|
AND __ctx_query.${ctx.column} <= ${ctx.end}
|
|
|
|
`;
|
|
|
|
} else {
|
2017-11-29 00:51:28 +08:00
|
|
|
ctx.end = `__cdb_basics.__cdb_max_val`;
|
|
|
|
ctx.start = `__cdb_basics.__cdb_min_val`;
|
|
|
|
extra_groupby = `, __cdb_basics.__cdb_max_val, __cdb_basics.__cdb_min_val`;
|
|
|
|
extra_tables = `, __cdb_basics`;
|
|
|
|
extra_queries = `WITH ${irqQueryTpl(ctx)}`;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ctx.bins <= 0) {
|
2019-03-04 22:52:27 +08:00
|
|
|
ctx.bins = `__cdb_basics.__cdb_bins_number`;
|
|
|
|
ctx.irq = `percentile_disc(0.75) within group (order by ${ctx.column})
|
|
|
|
- percentile_disc(0.25) within group (order by ${ctx.column})`;
|
|
|
|
extra_groupby += `, __cdb_basics.__cdb_bins_number`;
|
2019-03-04 23:53:55 +08:00
|
|
|
extra_tables = `, __cdb_basics`;
|
2019-03-04 22:52:27 +08:00
|
|
|
extra_queries = `WITH ${irqQueryTpl(ctx)}`;
|
2017-11-29 00:51:28 +08:00
|
|
|
}
|
|
|
|
|
2017-09-08 16:29:54 +08:00
|
|
|
return `
|
2017-11-29 00:51:28 +08:00
|
|
|
${extra_queries}
|
|
|
|
SELECT
|
|
|
|
(${ctx.end} - ${ctx.start}) / ${ctx.bins}::float AS bin_width,
|
|
|
|
${ctx.bins} as bins_number,
|
2017-11-29 20:07:59 +08:00
|
|
|
${utils.countNULLs(ctx)} AS nulls_count,
|
|
|
|
${utils.countInfinites(ctx)} AS infinities_count,
|
|
|
|
${utils.countNaNs(ctx)} AS nans_count,
|
|
|
|
min(${utils.handleFloatColumn(ctx)}) AS min,
|
|
|
|
max(${utils.handleFloatColumn(ctx)}) AS max,
|
|
|
|
avg(${utils.handleFloatColumn(ctx)}) AS avg,
|
|
|
|
sum(CASE WHEN (${utils.handleFloatColumn(ctx)} is not NULL) THEN 1 ELSE 0 END) as freq,
|
2017-11-29 00:51:28 +08:00
|
|
|
CASE WHEN ${ctx.start} = ${ctx.end}
|
|
|
|
THEN 0
|
2017-11-29 20:07:59 +08:00
|
|
|
ELSE GREATEST(1, LEAST(
|
|
|
|
${ctx.bins},
|
|
|
|
WIDTH_BUCKET(${utils.handleFloatColumn(ctx)}, ${ctx.start}, ${ctx.end}, ${ctx.bins}))) - 1
|
2017-11-29 00:51:28 +08:00
|
|
|
END AS bin
|
|
|
|
FROM
|
|
|
|
(
|
2018-07-05 18:39:26 +08:00
|
|
|
SELECT * FROM (${ctx.query}) __ctx_query${extra_tables} ${extra_filter}
|
2017-11-29 00:51:28 +08:00
|
|
|
) __cdb_filtered_source_query${extra_tables}
|
2017-12-23 00:12:57 +08:00
|
|
|
GROUP BY 10${extra_groupby}
|
|
|
|
ORDER BY 10;`;
|
2017-09-08 16:29:54 +08:00
|
|
|
}
|
|
|
|
|
2017-09-12 16:14:55 +08:00
|
|
|
_hasOverridenBins (override) {
|
2017-09-08 16:29:54 +08:00
|
|
|
return override && override.hasOwnProperty('bins');
|
|
|
|
}
|
|
|
|
|
2017-09-11 23:19:02 +08:00
|
|
|
_getSummary (result, override) {
|
|
|
|
const firstRow = result.rows[0] || {};
|
2017-09-08 16:29:54 +08:00
|
|
|
|
2017-11-29 00:51:28 +08:00
|
|
|
var total_nulls = 0;
|
|
|
|
var total_infinities = 0;
|
|
|
|
var total_nans = 0;
|
|
|
|
var total_avg = 0;
|
|
|
|
var total_count = 0;
|
|
|
|
|
|
|
|
result.rows.forEach(function(row) {
|
|
|
|
total_nulls += row.nulls_count;
|
|
|
|
total_infinities += row.infinities_count;
|
|
|
|
total_nans += row.nans_count;
|
|
|
|
total_avg += row.avg * row.freq;
|
|
|
|
total_count += row.freq;
|
|
|
|
});
|
|
|
|
if (total_count !== 0) {
|
|
|
|
total_avg /= total_count;
|
|
|
|
}
|
|
|
|
|
2017-09-08 16:29:54 +08:00
|
|
|
return {
|
2017-09-11 23:19:02 +08:00
|
|
|
bin_width: firstRow.bin_width,
|
|
|
|
bins_count: firstRow.bins_number,
|
|
|
|
bins_start: this._populateBinStart(firstRow, override),
|
2017-11-29 00:51:28 +08:00
|
|
|
nulls: total_nulls,
|
|
|
|
infinities: total_infinities,
|
|
|
|
nans: total_nans,
|
|
|
|
avg: total_avg
|
2017-09-08 16:29:54 +08:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2017-09-11 23:19:02 +08:00
|
|
|
_getBuckets (result) {
|
2017-09-12 01:24:01 +08:00
|
|
|
return result.rows.map(({ bin, min, max, avg, freq }) => ({ bin, min, max, avg, freq }));
|
2017-09-08 16:29:54 +08:00
|
|
|
}
|
|
|
|
|
2017-09-11 23:19:02 +08:00
|
|
|
_populateBinStart (firstRow, override = {}) {
|
2017-09-08 23:43:10 +08:00
|
|
|
let binStart;
|
2017-09-08 16:29:54 +08:00
|
|
|
|
|
|
|
if (override.hasOwnProperty('start')) {
|
2017-09-08 18:21:22 +08:00
|
|
|
binStart = this._getBinStart(override);
|
2017-09-08 16:29:54 +08:00
|
|
|
} else {
|
|
|
|
binStart = firstRow.min;
|
|
|
|
}
|
|
|
|
|
|
|
|
return binStart;
|
|
|
|
}
|
|
|
|
|
|
|
|
};
|