2018-10-23 23:45:42 +08:00
|
|
|
'use strict';
|
|
|
|
|
2016-05-24 00:11:10 +08:00
|
|
|
var _ = require('underscore');
|
2016-05-31 21:30:38 +08:00
|
|
|
var BaseOverviewsDataview = require('./base');
|
2016-05-23 23:06:52 +08:00
|
|
|
var BaseDataview = require('../histogram');
|
2017-06-16 18:57:46 +08:00
|
|
|
var debug = require('debug')('windshaft:dataview:histogram:overview');
|
2016-05-23 23:06:52 +08:00
|
|
|
|
2016-05-24 00:11:10 +08:00
|
|
|
var dot = require('dot');
|
|
|
|
// Disable doT's `strip` option so the SQL templates below keep the whitespace
// and line breaks they are written with (see doT's templateSettings docs).
dot.templateSettings.strip = false;
|
|
|
|
|
|
|
|
// Bounds handed to binsQueryTpl as `_minBins` / `_maxBins` when the number of
// bins is computed in SQL instead of being supplied by the caller.
var BIN_MIN_NUMBER = 6;
var BIN_MAX_NUMBER = 48;
|
|
|
|
|
2017-06-15 01:00:37 +08:00
|
|
|
/**
 * CTE `filtered_source`: every row of the source query whose column is not
 * NULL. When `it._isFloatColumn` is set, rows holding +infinity, -infinity or
 * NaN are excluded as well.
 *
 * Template params: `_query`, `_column`, `_isFloatColumn`.
 */
var filteredQueryTpl = dot.template([
    'filtered_source AS (',
    ' SELECT *',
    ' FROM ({{=it._query}}) _cdb_filtered_source',
    ' WHERE',
    ' {{=it._column}} IS NOT NULL',
    // The conditional section opened here is closed on the NaN line below.
    ' {{?it._isFloatColumn}}AND',
    ' {{=it._column}} != \'infinity\'::float',
    ' AND',
    ' {{=it._column}} != \'-infinity\'::float',
    ' AND',
    ' {{=it._column}} != \'NaN\'::float{{?}}',
    ')'
].join(' \n'));
|
|
|
|
|
2016-05-24 00:11:10 +08:00
|
|
|
/**
 * CTE `basics`: max, min, weighted average and total row count of the column
 * over `filtered_source`. The average and the total are weighted by the
 * `_feature_count` column (presumably the number of original features each
 * overview row stands for -- confirm against the overview tables).
 *
 * Template params: `_column`.
 */
var basicsQueryTpl = dot.template([
    'basics AS (',
    ' SELECT',
    ' max({{=it._column}}) AS max_val, min({{=it._column}}) AS min_val,',
    ' sum({{=it._column}}*_feature_count)/sum(_feature_count) AS avg_val, sum(_feature_count) AS total_rows',
    ' FROM filtered_source',
    ')'
].join(' \n'));
|
|
|
|
|
|
|
|
/**
 * CTE `basics` for the overridden case: `max_val` / `min_val` come from the
 * caller-supplied `_end` / `_start` instead of the data, while the weighted
 * average and total row count are still computed over `filtered_source`.
 *
 * Template params: `_column`, `_start`, `_end`.
 *
 * NOTE(review): `_start` and `_end` are interpolated verbatim into the SQL;
 * presumably they are validated as numbers by the caller -- confirm.
 */
var overrideBasicsQueryTpl = dot.template([
    'basics AS (',
    ' SELECT',
    ' max({{=it._end}}) AS max_val, min({{=it._start}}) AS min_val,',
    ' sum({{=it._column}}*_feature_count)/sum(_feature_count) AS avg_val, sum(_feature_count) AS total_rows',
    ' FROM filtered_source',
    ')'
].join('\n'));
|
|
|
|
|
|
|
|
/**
 * CTE `iqrange`: splits `filtered_source` into four tiles ordered by the
 * column (`ntile(4)`), takes the maximum value of tiles 1 and 3, and exposes
 * the difference between those two maxima as `iqr`.
 *
 * Template params: `_column`.
 */
var iqrQueryTpl = dot.template([
    'iqrange AS (',
    ' SELECT max(quartile_max) - min(quartile_max) AS iqr',
    ' FROM (',
    ' SELECT quartile, max(_cdb_iqr_column) AS quartile_max from (',
    ' SELECT {{=it._column}} AS _cdb_iqr_column, ntile(4) over (order by {{=it._column}}',
    ' ) AS quartile',
    ' FROM filtered_source) _cdb_quartiles',
    ' WHERE quartile = 1 or quartile = 3',
    ' GROUP BY quartile',
    ' ) _cdb_iqr',
    ')'
].join('\n'));
|
|
|
|
|
|
|
|
/**
 * CTE `bins`: derives `bins_number` from the `basics` and `iqrange` CTEs.
 * Yields 1 when there are no rows or the IQR is 0; otherwise the computed
 * value is capped at `_maxBins` and raised to at least
 * LEAST(`_minBins`, total_rows).
 *
 * Template params: `_minBins`, `_maxBins`.
 *
 * NOTE(review): if this runs on PostgreSQL, `1/3` is an integer division and
 * evaluates to 0, which makes `power(total_rows, 1/3)` always 1. Left as is
 * because the result presumably has to match the non-overview histogram --
 * confirm before changing.
 */
var binsQueryTpl = dot.template([
    'bins AS (',
    ' SELECT CASE WHEN total_rows = 0 OR iqr = 0',
    ' THEN 1',
    ' ELSE GREATEST(',
    ' LEAST({{=it._minBins}}, CAST(total_rows AS INT)),',
    ' LEAST(',
    ' CAST(((max_val - min_val) / (2 * iqr * power(total_rows, 1/3))) AS INT),',
    ' {{=it._maxBins}}',
    ' )',
    ' )',
    ' END AS bins_number',
    ' FROM basics, iqrange, filtered_source',
    ' LIMIT 1',
    ')'
].join('\n'));
|
|
|
|
|
|
|
|
/**
 * CTE `bins` for the overridden case: `bins_number` is the caller-supplied
 * `_bins` value, interpolated as is.
 *
 * Template params: `_bins`.
 */
var overrideBinsQueryTpl = dot.template([
    'bins AS (',
    ' SELECT {{=it._bins}} AS bins_number',
    ')'
].join('\n'));
|
|
|
|
|
|
|
|
/**
 * CTE `nulls`: number of rows of the source query whose column is NULL,
 * exposed as `nulls_count`.
 *
 * Template params: `_query`, `_column`.
 */
var nullsQueryTpl = dot.template([
    'nulls AS (',
    ' SELECT',
    ' count(*) AS nulls_count',
    ' FROM ({{=it._query}}) _cdb_histogram_nulls',
    ' WHERE {{=it._column}} IS NULL',
    ')'
].join('\n'));
|
|
|
|
|
2017-06-15 01:00:37 +08:00
|
|
|
/**
 * CTE `infinities`: number of rows of the source query whose column is
 * +infinity or -infinity, exposed as `infinities_count`.
 *
 * Template params: `_query`, `_column`.
 */
var infinitiesQueryTpl = dot.template([
    'infinities AS (',
    ' SELECT',
    ' count(*) AS infinities_count',
    ' FROM ({{=it._query}}) _cdb_histogram_infinities',
    ' WHERE',
    ' {{=it._column}} = \'infinity\'::float',
    ' OR',
    ' {{=it._column}} = \'-infinity\'::float',
    ')'
].join('\n'));
|
|
|
|
|
|
|
|
/**
 * CTE `nans`: number of rows of the source query whose column is NaN,
 * exposed as `nans_count`.
 *
 * Template params: `_query`, `_column`.
 */
var nansQueryTpl = dot.template([
    'nans AS (',
    ' SELECT',
    ' count(*) AS nans_count',
    // The subquery alias reuses the name from the infinities CTE; it is only a
    // local alias, so it is kept unchanged.
    ' FROM ({{=it._query}}) _cdb_histogram_infinities',
    ' WHERE {{=it._column}} = \'NaN\'::float',
    ')'
].join('\n'));
|
|
|
|
|
2016-05-24 00:11:10 +08:00
|
|
|
/**
 * Final SELECT of the histogram query. It reads the `filtered_source`,
 * `basics`, `nulls` and `bins` CTEs (plus `infinities` and `nans` when
 * `it._isFloatColumn` is set) and emits one row per bin with its min, max,
 * weighted average and frequency, alongside the global bin width, bin count,
 * nulls count and average.
 *
 * Template params: `_column`, `_isFloatColumn`.
 */
var histogramQueryTpl = dot.template([
    'SELECT',
    ' (max_val - min_val) / cast(bins_number as float) AS bin_width,',
    ' bins_number,',
    ' nulls_count,',
    ' {{?it._isFloatColumn}}infinities_count,',
    ' nans_count,{{?}}',
    ' avg_val,',
    // Bin index is zero-based: WIDTH_BUCKET's result is clamped to
    // [1, bins_number] and then shifted down by one.
    ' CASE WHEN min_val = max_val',
    ' THEN 0',
    ' ELSE GREATEST(1, LEAST(WIDTH_BUCKET({{=it._column}}, min_val, max_val, bins_number), bins_number)) - 1',
    ' END AS bin,',
    ' min({{=it._column}})::numeric AS min,',
    ' max({{=it._column}})::numeric AS max,',
    ' sum({{=it._column}}*_feature_count)/sum(_feature_count)::numeric AS avg,',
    ' sum(_feature_count) AS freq',
    'FROM filtered_source, basics, nulls, bins{{?it._isFloatColumn}},infinities, nans{{?}}',
    'GROUP BY bin, bins_number, bin_width, nulls_count, avg_val',
    ' {{?it._isFloatColumn}}, infinities_count, nans_count{{?}}',
    'ORDER BY bin'
].join('\n'));
|
|
|
|
|
2019-10-22 01:07:24 +08:00
|
|
|
function Histogram (query, options, queryRewriter, queryRewriteData, params, queries) {
|
2017-06-23 22:53:16 +08:00
|
|
|
BaseOverviewsDataview.call(this, query, options, BaseDataview, queryRewriter, queryRewriteData, params, queries);
|
2016-05-24 00:11:10 +08:00
|
|
|
|
|
|
|
this.query = query;
|
2017-03-17 02:15:34 +08:00
|
|
|
this.queries = queries;
|
2016-05-24 00:11:10 +08:00
|
|
|
this.column = options.column;
|
|
|
|
this.bins = options.bins;
|
|
|
|
|
|
|
|
this._columnType = null;
|
2016-05-23 23:06:52 +08:00
|
|
|
}
|
|
|
|
|
2016-05-31 21:30:38 +08:00
|
|
|
Histogram.prototype = Object.create(BaseOverviewsDataview.prototype);
|
2016-05-23 23:06:52 +08:00
|
|
|
Histogram.prototype.constructor = Histogram;
|
|
|
|
|
|
|
|
module.exports = Histogram;
|
2016-05-24 00:11:10 +08:00
|
|
|
|
2019-10-22 01:07:24 +08:00
|
|
|
/**
 * Produces the histogram SQL and hands it to `callback`.
 *
 * On the first call the column type is unknown (`_columnType === null`), so it
 * is resolved through `getColumnType` and `sql` is re-entered from that
 * callback; in that case this call returns `null`. Date columns are delegated
 * to `defaultSql`; every other type is built by `_buildQuery`.
 *
 * @param {*} psql - database handle, forwarded to `getColumnType` / `defaultSql`
 * @param {Object} [override] - optional overrides; may be omitted, in which
 *     case the second argument is taken as the callback
 * @param {Function} callback - called as `callback(null, sql, { usesOverviews: true })`
 * @returns {*} `null` while the column type is being resolved, otherwise
 *     whatever `defaultSql` or `callback` returns
 */
Histogram.prototype.sql = function (psql, override, callback) {
    var self = this;

    // Two-argument form: sql(psql, callback).
    if (!callback) {
        callback = override;
        override = {};
    }

    if (this._columnType === null) {
        this.getColumnType(psql, this.column, this.queries.no_filters, function (err, type) {
            var detectedType;

            if (!err && !!type) {
                // `type` maps type names to flags; pick the first truthy one.
                detectedType = Object.keys(type).find(function (key) {
                    return type[key];
                });
            }

            // Assume numeric (the query will fail later if it is not). This also
            // covers a type map with no truthy flag, where `find` yields undefined
            // and used to overwrite the 'numeric' default.
            self._columnType = detectedType || 'numeric';

            self.sql(psql, override, callback);
        }, true); // use read-only transaction
        return null;
    }

    if (this._columnType === 'date') {
        // overviews currently aggregate dates to NULL
        // to avoid problem we don't use overviews for histograms of date columns
        return this.defaultSql(psql, override, callback);
    }

    var histogramSql = this._buildQuery(override);

    return callback(null, histogramSql, { usesOverviews: true });
};
|
|
|
|
|
|
|
|
/**
 * Assembles the full histogram statement: a `WITH` clause made of the CTE
 * templates followed by the final histogram SELECT.
 *
 * CTE order: filtered_source, basics, bins (preceded by iqrange when the bin
 * count is computed), nulls and, for float columns, infinities and nans.
 *
 * @param {Object} [override] - optional `start` / `end` / `bins` overrides
 * @returns {string} the SQL text
 *
 * NOTE(review): override values are interpolated into the SQL as is;
 * presumably they are validated upstream -- confirm.
 */
Histogram.prototype._buildQuery = function (override) {
    var column = this.column;
    var sourceQuery = this.rewrittenQuery(this.query);
    var isFloatColumn = this._columnType === 'float';

    // Fresh params object for each template that takes the query and the column.
    function sourceParams () {
        return {
            _query: sourceQuery,
            _column: column
        };
    }

    // `bins` CTE with the caller-supplied number of bins.
    function overriddenBins () {
        return [
            overrideBinsQueryTpl({
                _bins: override.bins
            })
        ].join(',\n');
    }

    var ctes = [
        filteredQueryTpl({
            _isFloatColumn: isFloatColumn,
            _query: sourceQuery,
            _column: column
        })
    ];

    if (this._shouldOverride(override)) {
        // start, end and bins were all supplied: nothing is derived from the data.
        debug('overriding with %j', override);

        ctes.push(overrideBasicsQueryTpl({
            _query: sourceQuery,
            _column: column,
            _start: override.start,
            _end: override.end
        }));
        ctes.push(overriddenBins());
    } else {
        ctes.push(basicsQueryTpl(sourceParams()));

        if (this._shouldOverrideBins(override)) {
            ctes.push(overriddenBins());
        } else {
            // The computed bin count needs the iqrange CTE in front of it.
            ctes.push(iqrQueryTpl(sourceParams()));
            ctes.push(binsQueryTpl({
                _query: sourceQuery,
                _minBins: BIN_MIN_NUMBER,
                _maxBins: BIN_MAX_NUMBER
            }));
        }
    }

    ctes.push(nullsQueryTpl(sourceParams()));

    if (isFloatColumn) {
        ctes.push(infinitiesQueryTpl(sourceParams()));
        ctes.push(nansQueryTpl(sourceParams()));
    }

    var histogramSql = [
        'WITH',
        ctes.join(',\n'),
        histogramQueryTpl({
            _isFloatColumn: isFloatColumn,
            _query: sourceQuery,
            _column: column
        })
    ].join('\n');

    debug(histogramSql);

    return histogramSql;
};
|
|
|
|
|
|
|
|
/**
 * Tells whether `override` carries all of `start`, `end` and `bins`.
 *
 * @param {Object} [override]
 * @returns {*} the falsy `override` itself when none was given, otherwise a
 *     boolean
 */
Histogram.prototype._shouldOverride = function (override) {
    if (!override) {
        return override;
    }

    return ['start', 'end', 'bins'].every(function (key) {
        return _.has(override, key);
    });
};
|
2017-06-16 18:57:46 +08:00
|
|
|
|
|
|
|
/**
 * Tells whether `override` carries a `bins` value.
 *
 * @param {Object} [override]
 * @returns {*} the falsy `override` itself when none was given, otherwise the
 *     result of `_.has(override, 'bins')`
 */
Histogram.prototype._shouldOverrideBins = function (override) {
    return override ? _.has(override, 'bins') : override;
};
|