diff --git a/lib/cartodb/models/dataview/date-histogram.js b/lib/cartodb/models/dataview/date-histogram.js new file mode 100644 index 00000000..2e55ea2d --- /dev/null +++ b/lib/cartodb/models/dataview/date-histogram.js @@ -0,0 +1,441 @@ +const BaseDataview = require('./base'); +const debug = require('debug')('windshaft:dataview:histogram'); + +const dateIntervalQueryTpl = ctx => ` + WITH + __cdb_dates AS ( + SELECT + MAX(${ctx.column}::timestamp) AS __cdb_end, + MIN(${ctx.column}::timestamp) AS __cdb_start + FROM (${ctx.query}) __cdb_source + ), + __cdb_interval_in_days AS ( + SELECT + DATE_PART('day', __cdb_end - __cdb_start) AS __cdb_days + FROM __cdb_dates + ), + __cdb_interval_in_hours AS ( + SELECT + __cdb_days * 24 + DATE_PART('hour', __cdb_end - __cdb_start) AS __cdb_hours + FROM __cdb_interval_in_days, __cdb_dates + ), + __cdb_interval_in_minutes AS ( + SELECT + __cdb_hours * 60 + DATE_PART('minute', __cdb_end - __cdb_start) AS __cdb_minutes + FROM __cdb_interval_in_hours, __cdb_dates + ), + __cdb_interval_in_seconds AS ( + SELECT + __cdb_minutes * 60 + DATE_PART('second', __cdb_end - __cdb_start) AS __cdb_seconds + FROM __cdb_interval_in_minutes, __cdb_dates + ) + SELECT + ROUND(__cdb_days / 365) AS year, + ROUND(__cdb_days / 90) AS quarter, + ROUND(__cdb_days / 30) AS month, + ROUND(__cdb_days / 7) AS week, + __cdb_days AS day, + __cdb_hours AS hour, + __cdb_minutes AS minute, + __cdb_seconds AS second + FROM __cdb_interval_in_days, __cdb_interval_in_hours, __cdb_interval_in_minutes, __cdb_interval_in_seconds +`; + +const nullsQueryTpl = ctx => ` + __cdb_nulls AS ( + SELECT + count(*) AS __cdb_nulls_count + FROM (${ctx._query}) __cdb_histogram_nulls + WHERE ${ctx._column} IS NULL + ) +`; + +const dateBasicsQueryTpl = ctx => ` + __cdb_basics AS ( + SELECT + max(date_part('epoch', ${ctx._column})) AS __cdb_max_val, + min(date_part('epoch', ${ctx._column})) AS __cdb_min_val, + avg(date_part('epoch', ${ctx._column})) AS __cdb_avg_val, + min( + date_trunc( + '${ctx._aggregation}', ${ctx._column}::timestamp AT TIME ZONE '${ctx._offset}' + ) + ) AS __cdb_start_date, + max(${ctx._column}::timestamp AT TIME ZONE '${ctx._offset}') AS __cdb_end_date, + count(1) AS __cdb_total_rows + FROM (${ctx._query}) __cdb_basics_query + ) +`; + +const dateOverrideBasicsQueryTpl = ctx => ` + __cdb_basics AS ( + SELECT + max(${ctx._end}) AS __cdb_max_val, + min(${ctx._start}) AS __cdb_min_val, + avg(date_part('epoch', ${ctx._column})) AS __cdb_avg_val, + min( + date_trunc( + '${ctx._aggregation}', + TO_TIMESTAMP(${ctx._start})::timestamp AT TIME ZONE '${ctx._offset}' + ) + ) AS __cdb_start_date, + max( + TO_TIMESTAMP(${ctx._end})::timestamp AT TIME ZONE '${ctx._offset}' + ) AS __cdb_end_date, + count(1) AS __cdb_total_rows + FROM (${ctx._query}) __cdb_basics_query + ) +`; + +const dateBinsQueryTpl = ctx => ` + __cdb_bins AS ( + SELECT + __cdb_bins_array, + ARRAY_LENGTH(__cdb_bins_array, 1) AS __cdb_bins_number + FROM ( + SELECT + ARRAY( + SELECT GENERATE_SERIES( + __cdb_start_date::timestamptz, + __cdb_end_date::timestamptz, + ${ctx._aggregation === 'quarter' ? `'3 month'::interval` : `'1 ${ctx._aggregation}'::interval`} + ) + ) AS __cdb_bins_array + FROM __cdb_basics + ) __cdb_bins_array_query + ) +`; + +const dateHistogramQueryTpl = ctx => ` + SELECT + (__cdb_max_val - __cdb_min_val) / cast(__cdb_bins_number as float) AS bin_width, + __cdb_bins_number AS bins_number, + __cdb_nulls_count AS nulls_count, + CASE WHEN __cdb_min_val = __cdb_max_val + THEN 0 + ELSE GREATEST( + 1, + LEAST( + WIDTH_BUCKET( + ${ctx._column}::timestamp AT TIME ZONE '${ctx._offset}', + __cdb_bins_array + ), + __cdb_bins_number + ) + ) - 1 + END AS bin, + min( + date_part( + 'epoch', + date_trunc( + '${ctx._aggregation}', ${ctx._column}::timestamp AT TIME ZONE '${ctx._offset}' + ) AT TIME ZONE '${ctx._offset}' + ) + )::numeric AS timestamp, + date_part('epoch', __cdb_start_date)::numeric AS timestamp_start, + min(date_part('epoch', ${ctx._column}))::numeric AS min, + max(date_part('epoch', ${ctx._column}))::numeric AS max, + avg(date_part('epoch', ${ctx._column}))::numeric AS avg, + count(*) AS freq + FROM (${ctx._query}) __cdb_histogram, __cdb_basics, __cdb_bins, __cdb_nulls + WHERE date_part('epoch', ${ctx._column}) IS NOT NULL + GROUP BY bin, bins_number, bin_width, nulls_count, timestamp_start + ORDER BY bin +`; + +const MAX_INTERVAL_VALUE = 366; + +const TYPE = 'histogram'; + +const DATE_AGGREGATIONS = { + 'auto': true, + 'minute': true, + 'hour': true, + 'day': true, + 'week': true, + 'month': true, + 'quarter': true, + 'year': true +}; + +/** + date_histogram: { + type: 'histogram', + options: { + column: 'date', // column data type: date + aggregation: 'day' // auto by default + offset: -7200 // OPTIONAL (UTC offset in seconds) + } + } +*/ +module.exports = class Histogram extends BaseDataview { + constructor (query, options, queries) { + super(); + + if (typeof options.column !== 'string') { + throw new Error('Histogram expects `column` in widget options'); + } + + this.query = query; + this.queries = queries; + this.column = options.column; + this.bins = options.bins; + this.aggregation = options.aggregation; + this.offset = options.offset; + + this._columnType = null; + } + + sql (psql, override, callback) { + var self = this; + + if (!callback) { + callback = override; + override = {}; + } + + if (this._columnType === null) { + this.getColumnType(psql, this.column, this.queries.no_filters, function (err, type) { + // assume numeric, will fail later + self._columnType = 'numeric'; + if (!err && !!type) { + self._columnType = Object.keys(type).find(function (key) { + return type[key]; + }); + } + self.sql(psql, override, callback); + }, true); // use read-only transaction + return null; + } + + if (this.isDateHistogram(override)) { + return this._buildQuery(psql, override, callback); + } + + } + + isDateHistogram (override) { + return this._columnType === 'date' && (this.aggregation !== undefined || override.aggregation !== undefined); + } + + buildDateHistogramQueryTpl (ctx) { + return ` + WITH + ${ctx._override && ctx._override.hasOwnProperty('start') && ctx._override.hasOwnProperty('end') ? + dateOverrideBasicsQueryTpl(ctx) : + dateBasicsQueryTpl(ctx)}, + ${dateBinsQueryTpl(ctx)}, + ${nullsQueryTpl(ctx)} + ${dateHistogramQueryTpl(ctx)} + `; + } + + _buildQuery (psql, override, callback) { + var _aggregation = override && override.aggregation ? override.aggregation : this.aggregation; + var _offset = override && Number.isFinite(override.offset) ? override.offset : this.offset; + + if (!DATE_AGGREGATIONS.hasOwnProperty(_aggregation)) { + return callback(new Error('Invalid aggregation value. Valid ones: ' + + Object.keys(DATE_AGGREGATIONS).join(', ') + )); + } + + if (_aggregation === 'auto') { + this.getAutomaticAggregation(psql, function (err, aggregation) { + if (err || aggregation === 'none') { + this.aggregation = 'day'; + } else { + this.aggregation = aggregation; + } + override.aggregation = this.aggregation; + this._buildDateHistogramQuery(psql, override, callback); + }.bind(this)); + return null; + } + + const histogramSql = this.buildDateHistogramQueryTpl({ + _override: override, + _query: this.query, + _column: this.column, + _aggregation: _aggregation, + _start: this.getBinStart(override), + _end: this.getBinEnd(override), + _offset: this.parseOffset(_offset, _aggregation) + }); + + debug(histogramSql); + + return callback(null, histogramSql); + } + + getAutomaticAggregation (psql, callback) { + var dateIntervalQuery = dateIntervalQueryTpl({ + query: this.query, + column: this.column + }); + + debug(dateIntervalQuery); + + psql.query(dateIntervalQuery, function (err, result) { + if (err) { + return callback(err); + } + + var aggegations = result.rows[0]; + var aggregation = Object.keys(aggegations) + .map(function (key) { + return { + name: key, + value: aggegations[key] + }; + }) + .reduce(function (closer, current) { + if (current.value > MAX_INTERVAL_VALUE) { + return closer; + } + + var closerDiff = MAX_INTERVAL_VALUE - closer.value; + var currentDiff = MAX_INTERVAL_VALUE - current.value; + + if (Number.isFinite(current.value) && closerDiff > currentDiff) { + return current; + } + + return closer; + }, { name: 'none', value: -1 }); + + callback(null, aggregation.name); + }); + } + + format (result, override) { + override = override || {}; + var buckets = []; + + var binsCount = this.getBinsCount(override); + var width = this.getWidth(override); + var binsStart = this.getBinStart(override); + var nulls = 0; + var infinities = 0; + var nans = 0; + var avg; + var timestampStart; + var aggregation; + var offset; + + if (result.rows.length) { + var firstRow = result.rows[0]; + binsCount = firstRow.bins_number; + width = firstRow.bin_width || width; + avg = firstRow.avg_val; + nulls = firstRow.nulls_count; + timestampStart = firstRow.timestamp_start; + infinities = firstRow.infinities_count; + nans = firstRow.nans_count; + binsStart = this.populateBinStart(override, firstRow); + + if (Number.isFinite(timestampStart)) { + aggregation = this.getAggregation(override); + offset = this.getOffset(override); + } + + buckets = result.rows.map(row => ({ bin, min, max, avg, freq, timestamp } = row)); + } + + return { + aggregation: aggregation, + offset: offset, + timestamp_start: timestampStart, + bin_width: width, + bins_count: binsCount, + bins_start: binsStart, + nulls: nulls, + infinities: infinities, + nans: nans, + avg: avg, + bins: buckets + }; + } + + getType () { + return TYPE; + } + + toString () { + return JSON.stringify({ + _type: TYPE, + _column: this.column, + _query: this.query + }); + } + + getAggregation (override) { + return override && override.aggregation ? override.aggregation : this.aggregation; + } + + getOffset (override) { + if (override && override.offset) { + return override.offset; + } + if (this.offset) { + return this.offset; + } + + return 0; + } + + getBinStart (override) { + if (override.hasOwnProperty('start') && override.hasOwnProperty('end')) { + return Math.min(override.start, override.end); + } + return override.start || 0; + } + + populateBinStart (override, firstRow) { + var binStart; + + if (firstRow.hasOwnProperty('timestamp')) { + binStart = firstRow.timestamp; + } else if (override.hasOwnProperty('start')) { + binStart = this.getBinStart(override); + } else { + binStart = firstRow.min; + } + + return binStart; + } + + getBinEnd (override) { + if (override.hasOwnProperty('start') && override.hasOwnProperty('end')) { + return Math.max(override.start, override.end); + } + return override.end || 0; + } + + getBinsCount (override) { + return override.bins || 0; + } + + getWidth (override) { + var width = 0; + var binsCount = override.bins; + + if (binsCount && Number.isFinite(override.start) && Number.isFinite(override.end)) { + width = (override.end - override.start) / binsCount; + } + + return width; + } + + parseOffset (offset, aggregation) { + if (!offset) { + return '0'; + } + if (aggregation === 'hour' || aggregation === 'minute') { + return '0'; + } + + var offsetInHours = Math.ceil(offset / 3600); + return '' + offsetInHours; + } +}; diff --git a/lib/cartodb/models/dataview/numeric-histogram.js b/lib/cartodb/models/dataview/numeric-histogram.js new file mode 100644 index 00000000..e5149793 --- /dev/null +++ b/lib/cartodb/models/dataview/numeric-histogram.js @@ -0,0 +1,369 @@ +const BaseDataview = require('./base'); +const debug = require('debug')('windshaft:dataview:histogram'); + +const columnCastTpl = ctx => `date_part('epoch', ${ctx.column})`; + +const filterOutSpecialNumericValues = ctx => ` + ${ctx._column} != 'infinity'::float + AND + ${ctx._column} != '-infinity'::float + AND + ${ctx._column} != 'NaN'::float +`; + +const filteredQueryTpl = ctx => ` + __cdb_filtered_source AS ( + SELECT * + FROM (${ctx._query}) __cdb_filtered_source_query + WHERE ${ctx._column} IS NOT NULL + ${ctx._isFloatColumn ? `AND ${filterOutSpecialNumericValues(ctx)}` : ''} + ) +`; + +const basicsQueryTpl = ctx => ` + __cdb_basics AS ( + SELECT + max(${ctx._column}) AS __cdb_max_val, min(${ctx._column}) AS __cdb_min_val, + avg(${ctx._column}) AS __cdb_avg_val, count(1) AS __cdb_total_rows + FROM __cdb_filtered_source + ) +`; + +const overrideBasicsQueryTpl = ctx => ` + __cdb_basics AS ( + SELECT + max(${ctx._end}) AS __cdb_max_val, min(${ctx._start}) AS __cdb_min_val, + avg(${ctx._column}) AS __cdb_avg_val, count(1) AS __cdb_total_rows + FROM __cdb_filtered_source + ) +`; + +const iqrQueryTpl = ctx => ` + __cdb_iqrange AS ( + SELECT max(quartile_max) - min(quartile_max) AS __cdb_iqr + FROM ( + SELECT quartile, max(_cdb_iqr_column) AS quartile_max from ( + SELECT ${ctx._column} AS _cdb_iqr_column, ntile(4) over (order by ${ctx._column} + ) AS quartile + FROM __cdb_filtered_source) _cdb_quartiles + WHERE quartile = 1 or quartile = 3 + GROUP BY quartile + ) __cdb_iqr + ) +`; + +const binsQueryTpl = ctx => ` + __cdb_bins AS ( + SELECT + CASE WHEN __cdb_total_rows = 0 OR __cdb_iqr = 0 + THEN 1 + ELSE GREATEST( + LEAST(${ctx._minBins}, CAST(__cdb_total_rows AS INT)), + LEAST( + CAST(((__cdb_max_val - __cdb_min_val) / (2 * __cdb_iqr * power(__cdb_total_rows, 1/3))) AS INT), + ${ctx._maxBins} + ) + ) + END AS __cdb_bins_number + FROM __cdb_basics, __cdb_iqrange, __cdb_filtered_source + LIMIT 1 + ) +`; + +const overrideBinsQueryTpl = ctx => ` + __cdb_bins AS ( + SELECT ${ctx._override.bins} AS __cdb_bins_number + ) +`; + +const nullsQueryTpl = ctx => ` + __cdb_nulls AS ( + SELECT + count(*) AS __cdb_nulls_count + FROM (${ctx._query}) __cdb_histogram_nulls + WHERE ${ctx._column} IS NULL + ) +`; + +const infinitiesQueryTpl = ctx => ` + __cdb_infinities AS ( + SELECT + count(*) AS __cdb_infinities_count + FROM (${ctx._query}) __cdb_infinities_query + WHERE + ${ctx._column} = 'infinity'::float + OR + ${ctx._column} = '-infinity'::float + ) +`; + +const nansQueryTpl = ctx => ` + __cdb_nans AS ( + SELECT + count(*) AS __cdb_nans_count + FROM (${ctx._query}) __cdb_nans_query + WHERE ${ctx._column} = 'NaN'::float + ) +`; + +const specialNumericValuesColumnDefinitionTpl = () => ` + __cdb_infinities_count AS infinities_count, + __cdb_nans_count AS nans_count +`; + +const specialNumericValuesCTETpl = () => ` + __cdb_infinities, __cdb_nans +`; + +const specialNumericValuesColumnTpl = () => ` + infinities_count, nans_count +`; + +const histogramQueryTpl = ctx => ` + SELECT + (__cdb_max_val - __cdb_min_val) / cast(__cdb_bins_number as float) AS bin_width, + __cdb_bins_number AS bins_number, + __cdb_nulls_count AS nulls_count, + ${ctx._isFloatColumn ? `${specialNumericValuesColumnDefinitionTpl()},` : ''} + __cdb_avg_val AS avg_val, + CASE WHEN __cdb_min_val = __cdb_max_val + THEN 0 + ELSE GREATEST( + 1, + LEAST( + WIDTH_BUCKET(${ctx._column}, __cdb_min_val, __cdb_max_val, __cdb_bins_number), + __cdb_bins_number + ) + ) - 1 + END AS bin, + min(${ctx._column})::numeric AS min, + max(${ctx._column})::numeric AS max, + avg(${ctx._column})::numeric AS avg, + count(*) AS freq + FROM __cdb_filtered_source, __cdb_basics, __cdb_nulls, __cdb_bins + ${ctx._isFloatColumn ? `, ${specialNumericValuesCTETpl()}` : ''} + GROUP BY bin, bins_number, bin_width, nulls_count, avg_val + ${ctx._isFloatColumn ? `, ${specialNumericValuesColumnTpl()}` : ''} + ORDER BY bin +`; + +const BIN_MIN_NUMBER = 6; +const BIN_MAX_NUMBER = 48; + +const TYPE = 'histogram'; + +/** +Numeric histogram: +{ + type: 'histogram', + options: { + column: 'name', // column data type: numeric + bins: 10 // OPTIONAL + } +} +*/ +module.exports = class NumericHistogram extends BaseDataview { + constructor (query, options, queries) { + super(); + + if (typeof options.column !== 'string') { + throw new Error('Histogram expects `column` in widget options'); + } + + this.query = query; + this.queries = queries; + this.column = options.column; + this.bins = options.bins; + this.aggregation = options.aggregation; + this.offset = options.offset; + + this._columnType = null; + } + + sql (psql, override, callback) { + var self = this; + + if (!callback) { + callback = override; + override = {}; + } + + if (this._columnType === null) { + this.getColumnType(psql, this.column, this.queries.no_filters, function (err, type) { + // assume numeric, will fail later + self._columnType = 'numeric'; + if (!err && !!type) { + self._columnType = Object.keys(type).find(function (key) { + return type[key]; + }); + } + self.sql(psql, override, callback); + }, true); // use read-only transaction + return null; + } + + this._buildQuery(psql, override, callback); + } + + + _buildQuery (psql, override, callback) { + const histogramSql = this.buildNumericHistogramQueryTpl({ + _override: override, + _column: this._columnType === 'date' ? columnCastTpl({ column: this.column }) : this.column, + _isFloatColumn: this._columnType === 'float', + _query: this.query, + _start: this.getBinStart(override), + _end: this.getBinEnd(override), + _minBins: BIN_MIN_NUMBER, + _maxBins: BIN_MAX_NUMBER, + }); + + debug(histogramSql); + + return callback(null, histogramSql); + } + + buildNumericHistogramQueryTpl (ctx) { + return ` + WITH + ${filteredQueryTpl(ctx)}, + ${this._shouldOverride(ctx._override) ? overrideBasicsQueryTpl(ctx) : basicsQueryTpl(ctx)}, + ${this._shouldOverrideRange(ctx._override) || this._shouldOverrideBins(ctx._override) ? + overrideBinsQueryTpl(ctx) : + `${iqrQueryTpl(ctx)}, ${binsQueryTpl(ctx)}` + }, + ${nullsQueryTpl(ctx)} + ${ctx._isFloatColumn ? `,${infinitiesQueryTpl(ctx)}, ${nansQueryTpl(ctx)}` : ''} + ${histogramQueryTpl(ctx)} + `; + } + + _shouldOverride (override) { + return this._shouldOverrideRange(override) && this._shouldOverrideBins(override); + } + + _shouldOverrideRange (override) { + return override && override.hasOwnProperty('start') && override.hasOwnProperty('end'); + } + + _shouldOverrideBins (override) { + return override && override.hasOwnProperty('bins'); + } + + format (result, override) { + override = override || {}; + var buckets = []; + + var binsCount = this.getBinsCount(override); + var width = this.getWidth(override); + var binsStart = this.getBinStart(override); + var nulls = 0; + var infinities = 0; + var nans = 0; + var avg; + var timestampStart; + var aggregation; + var offset; + + if (result.rows.length) { + var firstRow = result.rows[0]; + binsCount = firstRow.bins_number; + width = firstRow.bin_width || width; + avg = firstRow.avg_val; + nulls = firstRow.nulls_count; + timestampStart = firstRow.timestamp_start; + infinities = firstRow.infinities_count; + nans = firstRow.nans_count; + binsStart = this.populateBinStart(override, firstRow); + + if (Number.isFinite(timestampStart)) { + aggregation = this.getAggregation(override); + offset = this.getOffset(override); + } + + buckets = result.rows.map(row => ({ bin, min, max, avg, freq } = row)); + } + + return { + aggregation: aggregation, + offset: offset, + timestamp_start: timestampStart, + bin_width: width, + bins_count: binsCount, + bins_start: binsStart, + nulls: nulls, + infinities: infinities, + nans: nans, + avg: avg, + bins: buckets + }; + } + + getType () { + return TYPE; + } + + toString () { + return JSON.stringify({ + _type: TYPE, + _column: this.column, + _query: this.query + }); + } + + getAggregation (override) { + return override && override.aggregation ? override.aggregation : this.aggregation; + } + + getOffset (override) { + if (override && override.offset) { + return override.offset; + } + if (this.offset) { + return this.offset; + } + + return 0; + } + + getBinStart (override) { + if (override.hasOwnProperty('start') && override.hasOwnProperty('end')) { + return Math.min(override.start, override.end); + } + return override.start || 0; + } + + populateBinStart (override, firstRow) { + var binStart; + + if (override.hasOwnProperty('start')) { + binStart = this.getBinStart(override); + } else { + binStart = firstRow.min; + } + + return binStart; + } + + getBinEnd (override) { + if (override.hasOwnProperty('start') && override.hasOwnProperty('end')) { + return Math.max(override.start, override.end); + } + return override.end || 0; + } + + getBinsCount (override) { + return override.bins || 0; + } + + getWidth (override) { + var width = 0; + var binsCount = override.bins; + + if (binsCount && Number.isFinite(override.start) && Number.isFinite(override.end)) { + width = (override.end - override.start) / binsCount; + } + + return width; + } +};