Refactor time dimensions

This commit is contained in:
Javier Goizueta 2018-10-03 21:02:22 +02:00
parent aff55351ad
commit c588d4139e
2 changed files with 193 additions and 168 deletions

View File

@ -122,10 +122,11 @@ const timeDimensionParameters = definition => {
// definition.column should correspond to a wrapped date column
return {
time: `to_timestamp("${definition.column}")`,
timeZone: definition.timezone || 'utc',
groupBy: definition.group_by,
groupByCount: definition.group_by_count || 1,
starting: definition.starting
timezone: definition.timezone || 'utc',
grouping: definition.group_by,
count: definition.group_by_count || 1,
starting: definition.starting,
format: definition.format
};
};

View File

@ -1,3 +1,48 @@
// Builds the SQL literal used as the zone argument of `timezone(zone, t)`.
// Time zones can be defined either by a numeric offset in seconds or by
// a valid (case-insensitive) tz/PG name;
// names include abbreviations defined by PG (which have precedence and
// are fixed offsets, not handling DST) or general names that can handle DST.
// Returns `INTERVAL '<tz> seconds'` for numeric values and a quoted string
// literal for anything else.
function timezone(tz) {
    // The coercing global isFinite also accepts numeric strings such as '3600';
    // no value that coerces to a finite number can contain a single quote.
    if (isFinite(tz)) {
        return `INTERVAL '${tz} seconds'`;
    }
    // Double any single quote so the name cannot terminate the SQL string
    // literal it is embedded in.
    return `'${String(tz).replace(/'/g, "''")}'`;
}
// Returns the SQL expression for `t` converted to time zone `tz`,
// or `t` itself when no time zone is given (tz is undefined).
// `t` is assumed to be a TIMESTAMP WITH TIME ZONE.
// If this was to be used with a `t` which is a TIMESTAMP or TIME (no time zone)
// it should first be converted with `timezone('utc',t)` to a type with time zone.
// Note that by default CARTO uses timestamp with time zone columns for dates
// and VectorMapConfigAdapter converts them to epoch numbers.
// So, for using this with aggregations, which rely on dates & times
// converted to UTC UNIX epoch numbers, apply `to_timestamp` to the
// (converted) column.
function timeExpression(t, tz) {
    return tz === undefined ? t : `timezone(${timezone(tz)}, ${t})`;
}
// Builds a `TIMESTAMP '...'` literal (a timestamp without time zone) from an epoch.
// Epoch should be an ISO timestamp literal without time zone
// (it is interpreted as in the defined time zone for the input time).
// It can be partial, e.g. 'YYYY', 'YYYY-MM', 'YYYY-MM-DDTHH', etc.;
// the '-' and ':' separators are optional.
// Defaults are applied to missing parts: YYYY=0001, MM=01, DD=01, HH=00, MM=00, SS=00
// A missing epoch (undefined/null), or one that does not match the format,
// yields the default epoch '0001-01-01T00:00:00'.
function epochExpression(epoch) {
    const format = /^(\d\d\d\d)(?:-?(\d\d)(?:-?(\d\d)(?:[T\s]?(\d\d)(?::?(\d\d)(?::?(\d\d))?)?)?)?)?$/;
    // default for each capture group, in order: year, month, day, hour, minute, second
    const defaults = ['0001', '01', '01', '00', '00', '00'];
    // the epoch is optional for callers: treat a missing value as an empty string
    const text = (epoch === undefined || epoch === null) ? '' : String(epoch);
    const match = text.match(format) || [];
    const [year, month, day, hour, minute, second] =
        defaults.map((fallback, index) => match[index + 1] || fallback);
    // only digits captured by the regex reach the literal
    return `TIMESTAMP '${year}-${month}-${day}T${hour}:${minute}:${second}'`;
}
const YEARSPAN = "(date_part('year', $t)-date_part('year', $epoch))";
// Note that SECONDSPAN is not a UTC epoch, but an epoch in the specified TZ,
// so we can use it to compute any multiple of seconds with it without using date_part or date_trunc
@ -32,25 +77,47 @@ const serialParts = {
sql: `1 + date_part('quarter', $t) - date_part('quarter', $epoch) + 4*${YEARSPAN}`,
zeroBased: false
},
semester: {
sql: `1 + FLOOR((date_part('month', $t) - date_part('month', $epoch))/6) + 2*${YEARSPAN}`,
zeroBased: false
},
trimester: {
sql: `1 + FLOOR((date_part('month', $t) - date_part('month', $epoch))/4) + 3*${YEARSPAN}`,
zeroBased: false
},
year: {
// TODO: isn't it more meaningful to ignore the epoch here and return date_part('year', $t)?
// for the default epoch this coincides with date_part('year', $t)
sql: `1 + ${YEARSPAN}`,
zeroBased: false
},
decade: {
// for the default epoch this coincides with date_part('decade', $t)
sql: `FLOOR((${YEARSPAN} + 1)/10)`,
zeroBased: true
},
century: {
// for the default epoch this coincides with date_part('century', $t)
sql: `1 + FLOOR(${YEARSPAN}/100)`,
zeroBased: false
},
millennium: {
// for the default epoch this coincides with date_part('millennium', $t)
sql: `1 + FLOOR(${YEARSPAN}/1000)`,
zeroBased: false
}
};
function serialSqlExpr(time, timeZone, groupBy, count = 1, starting = undefined) {
[groupBy, count] = serialNormalize(groupBy, count);
let { sql, zeroBased } = serialParts[groupBy];
const column = timeExpression(time, timeZone);
const epoch = epochExpression(starting);
function serialSqlExpr(params) {
const { sql, zeroBased } = serialParts[params.grouping];
const column = timeExpression(params.time, params.timezone);
const epoch = epochExpression(params.starting);
const serial = sql.replace(/\$t/g, column).replace(/\$epoch/g, epoch);
let expr = serial;
if (count !== 1) {
if (params.count !== 1) {
if (zeroBased) {
expr = `FLOOR((${expr})/(${count}::double precision))::int`;
expr = `FLOOR((${expr})/(${params.count}::double precision))::int`;
} else {
expr = `CEIL((${expr})/(${count}::double precision))::int`;
expr = `CEIL((${expr})/(${params.count}::double precision))::int`;
}
} else {
expr = `(${expr})::int`;
@ -71,184 +138,141 @@ const isoParts = {
trimester: `to_char($t, 'YYYY"t"') || to_char(CEIL(date_part('month', $t)/4), '9')`,
decade: `to_char(date_part('decade', $t), '"D"999')`,
century: `to_char($t, '"C"CC')`,
millennium: `to_char(date_part('millenium', $t), '"M"999')`
millennium: `to_char(date_part('millennium', $t), '"M"999')`
};
function isoSqlExpr(time, timeZone, groupBy, count = 1) {
const column = timeExpression(time, timeZone);
if (count > 1) {
// TODO: it would be sensible to return the ISO of the firt unit in the period
function isoSqlExpr(params) {
const column = timeExpression(params.time, params.timezone);
if (params.count > 1) {
// TODO: it would be sensible to return the ISO of the first unit in the period
throw new Error('Multiple time units not supported for ISO format');
}
return isoParts[groupBy].replace(/\$t/g, column);
return isoParts[params.grouping].replace(/\$t/g, column);
}
function serialNormalize(groupBy, count) {
if (groupBy === 'semester') {
groupBy = 'month';
count *= 6;
} else if (groupBy === 'trimester') {
groupBy = 'month';
count *= 4;
} else if (groupBy === 'decade') {
groupBy = 'year';
count *= 10;
} else if (groupBy === 'century') {
groupBy = 'year';
count *= 100;
} else if (groupBy === 'millenium') {
groupBy = 'year';
count *= 1000;
// SQL templates for the cyclic time groupings, keyed by grouping name.
// `$t` stands for the time expression that is substituted into the template.
const cyclicParts = (() => {
    // template extracting a single `date_part` field from the time expression
    const datePart = field => `date_part('${field}', $t)`;
    // template numbering consecutive blocks of `months` months of the year from 1
    const monthBlock = months => `FLOOR((${datePart('month')}-1)/${months}.0) + 1`;
    return {
        dayOfWeek: datePart('isodow'), // 1 = monday to 7 = sunday
        dayOfMonth: datePart('day'), // 1 to 31
        dayOfYear: datePart('doy'), // 1 to 366
        hourOfDay: datePart('hour'), // 0 to 23
        monthOfYear: datePart('month'), // 1 to 12
        quarterOfYear: datePart('quarter'), // 1 to 4
        semesterOfYear: monthBlock(6), // 1 to 2
        trimesterOfYear: monthBlock(4), // 1 to 3
        weekOfYear: datePart('week'), // 1 to 53
        minuteOfHour: datePart('minute') // 0 to 59
    };
})();
// Returns the SQL expression that classifies `params.time` (evaluated in
// `params.timezone` through `timeExpression`) by the cyclic grouping
// `params.grouping`.
// The template comes from `cyclicParts`; every `$t` placeholder in it is
// replaced by the time expression.
function cyclicSqlExpr(params) {
    const column = timeExpression(params.time, params.timezone);
    // a replacer function keeps `$` sequences in the column text literal
    return cyclicParts[params.grouping].replace(/\$t/g, () => column);
}
const ACCEPTED_PARAMETERS = ['time', 'grouping', 'timezone', 'count', 'starting', 'format'];
const REQUIRED_PARAMETERS = ['time', 'grouping'];
function validateParameters(params, checker) {
const errors = [];
const presentParams = Object.keys(params);
const invalidParams = presentParams.filter(param => !ACCEPTED_PARAMETERS.includes(param));
if (invalidParams.length) {
errors.push(`Invalid parameters: ${invalidParams.join(', ')}`);
}
return [groupBy, count];
}
function cyclicNormalize(groupBy, count) {
if (groupBy === 'monthOfYear' && count === 3) {
groupBy = 'quarterOfYear';
count = 1;
} else if (groupBy === 'monthOfYear' && count === 6) {
groupBy = 'semesterOfYear';
count = 1;
} else if (groupBy === 'monthOfYear' && count === 4) {
groupBy = 'trimesterOfYear';
count = 1;
const missingParams = REQUIRED_PARAMETERS.filter(param => !presentParams.includes(param));
if (missingParams.length) {
errors.push(`Missing parameters: ${missingParams.join(', ')}`);
}
if (count !== 1) {
throw new Error(`invalid multiplicity ${count} for cyclic ${groupBy}`);
errors.push(...checker(params));
if (errors.length) {
throw new Error(`Invalid time dimension:\n${errors.join("\n")}`);
}
return [groupBy, count];
}
// Builds the SQL literal used as the zone argument of `timezone(zone, t)`.
// Time zones can be defined either by a numeric offset in seconds or by
// a valid (case-insensitive) tz/PG name;
// names include abbreviations defined by PG (which have precedence and
// are fixed offsets, not handling DST) or general names that can handle DST.
// Returns `INTERVAL '<tz> seconds'` for numeric values and a quoted string
// literal for anything else.
function timezone(tz) {
    // The coercing global isFinite also accepts numeric strings such as '3600';
    // no value that coerces to a finite number can contain a single quote.
    if (isFinite(tz)) {
        return `INTERVAL '${tz} seconds'`;
    }
    // Double any single quote so the name cannot terminate the SQL string
    // literal it is embedded in.
    return `'${String(tz).replace(/'/g, "''")}'`;
}
const VALID_CYCLIC_GROUPINGS = Object.keys(cyclicParts);
const VALID_SERIAL_GROUPINGS = Object.keys(serialParts);
const VALID_ISO_GROUPINGS = Object.keys(isoParts);
// Returns the SQL expression for `t` converted to time zone `tz`,
// or `t` itself when no time zone is given (tz is undefined).
// `t` is assumed to be a TIMESTAMP WITH TIME ZONE.
// If this was to be used with a `t` which is a TIMESTAMP or TIME (no time zone)
// it should first be converted with `timezone('utc',t)` to a type with time zone.
// Note that by default CARTO uses timestamp with time zone columns for dates
// and VectorMapConfigAdapter converts them to epoch numbers.
// So, for using this with aggregations, which rely on dates & times
// converted to UTC UNIX epoch numbers, apply `to_timestamp` to the
// (converted) column.
function timeExpression(t, tz) {
    return tz === undefined ? t : `timezone(${timezone(tz)}, ${t})`;
}
// Cyclic groupings equivalent to grouping months of the year in blocks:
// keys are month counts, values are the matching cyclic grouping names.
// cyclicCheckParams looks the count of a 'monthOfYear' grouping up here
// to rewrite it as the equivalent grouping with count 1.
const MONTH_GROUPING = {
3: 'quarterOfYear',
6: 'semesterOfYear',
4: 'trimesterOfYear'
};
// Builds a `TIMESTAMP '...'` literal (a timestamp without time zone) from an epoch.
// Epoch should be an ISO timestamp literal without time zone
// (it is interpreted as in the defined time zone for the input time).
// It can be partial, e.g. 'YYYY', 'YYYY-MM', 'YYYY-MM-DDTHH', etc.;
// the '-' and ':' separators are optional.
// Defaults are applied to missing parts: YYYY=0001, MM=01, DD=01, HH=00, MM=00, SS=00
// A missing epoch (undefined/null), or one that does not match the format,
// yields the default epoch '0001-01-01T00:00:00'.
function epochExpression(epoch) {
    const format = /^(\d\d\d\d)(?:-?(\d\d)(?:-?(\d\d)(?:[T\s]?(\d\d)(?::?(\d\d)(?::?(\d\d))?)?)?)?)?$/;
    // default for each capture group, in order: year, month, day, hour, minute, second
    const defaults = ['0001', '01', '01', '00', '00', '00'];
    // the epoch is optional for callers: treat a missing value as an empty string
    const text = (epoch === undefined || epoch === null) ? '' : String(epoch);
    const match = text.match(format) || [];
    const [year, month, day, hour, minute, second] =
        defaults.map((fallback, index) => match[index + 1] || fallback);
    // only digits captured by the regex reach the literal
    return `TIMESTAMP '${year}-${month}-${day}T${hour}:${minute}:${second}'`;
}
function cyclicSqlExpr(time, timeZone, groupBy, count = 1) {
[groupBy, count] = cyclicNormalize(groupBy, count);
const column = timeExpression(time, timeZone);
if (count === 1) {
switch (groupBy) {
case 'dayOfWeek':
// 1 = monday; 7 = sunday;
return `date_part('isodow', ${column})`;
case 'dayOfMonth':
// result: 1-31
return `date_part('day', ${column})`;
case 'dayOfYear':
// result: 1-366
return `date_part('doy', ${column})`;
case 'hourOfDay':
// result: 0-23
return `date_part('hour', ${column})`;
case 'monthOfYear':
// result 1-12
return `date_part('month', ${column})`;
case 'quarterOfYear':
// result 1-4
return `date_part('quarter', ${column})`;
case 'semesterOfYear':
// result 1-2
return `FLOOR((date_part('month', ${column})-1)/6.0) + 1`;
case 'trimesterOfYear':
// result 1-3
return `FLOOR((date_part('month', ${column})-1)/4.0) + 1`;
case 'weekOfYear':
// result 1-53
return `date_part('week', ${column})`;
case 'minuteOfHour':
// result 0-59
return `date_part('minute', ${column})`;
function cyclicCheckParams(params) {
const errors = [];
if (!VALID_CYCLIC_GROUPINGS.includes(params.grouping)) {
errors.push(`Invalid grouping "${params.grouping}"`);
} else {
if (params.count && params.count > 1) {
let fixed = false;
if (params.grouping === 'monthOfYear') {
const grouping = MONTH_GROUPING[params.count];
if (grouping) {
params.grouping = grouping;
params.count = 1;
fixed = true;
}
}
if (!fixed) {
errors.push(`Invalid count ${params.count} for cyclic ${params.grouping}`);
}
}
}
throw new Error(`Invalid cyclic time grouping ${groupBy} with count ${count}`)
return errors;
}
function validateParameters(_params) {
return true;
// Checks the parameters of a serial time dimension.
// Returns the list of error messages found: empty when `params.grouping`
// is one of VALID_SERIAL_GROUPINGS, a single "Invalid grouping" message otherwise.
function serialCheckParams(params) {
    const { grouping } = params;
    return VALID_SERIAL_GROUPINGS.includes(grouping)
        ? []
        : [`Invalid grouping "${grouping}"`];
}
// Checks the parameters of an ISO-formatted time dimension.
// Returns the list of error messages found (empty when there are none):
// the grouping must be one of VALID_ISO_GROUPINGS and `starting` must not be set.
function isoCheckParams(params) {
    // [failed, message] pairs, in the order the messages are reported
    const checks = [
        [!VALID_ISO_GROUPINGS.includes(params.grouping), `Invalid grouping "${params.grouping}"`],
        [Boolean(params.starting), "Parameter 'starting' not supported for ISO format"]
    ];
    return checks
        .filter(([failed]) => failed)
        .map(([, message]) => message);
}
// Dispatch table of the available classifiers, keyed by classifier name.
// Each entry pairs the function that builds the SQL expression (`sqlExpr`)
// with the function that checks the parameters for that classifier
// (`checkParams`, returning a list of error messages).
const CLASSIFIERS = {
cyclic: {
sqlExpr: cyclicSqlExpr,
checkParams: cyclicCheckParams
},
iso: {
sqlExpr: isoSqlExpr,
checkParams: isoCheckParams
},
serial: {
sqlExpr: serialSqlExpr,
checkParams: serialCheckParams
}
};
// Tells whether a grouping name denotes a cyclic grouping.
// Cyclic groupings are named `<part>Of<period>` (e.g. 'dayOfWeek',
// 'monthOfYear'), like the keys of `cyclicParts`.
// Returns the (truthy) match result for such names and null otherwise.
function isCyclic(groupBy) {
    return groupBy.match(/.+Of.+/);
}
function classificationSql(params) {
validateParameters(params);
if (isCyclic(params.group_by)) {
// TODO: validate group_by_count === 1, No epoch
return cyclicSqlExpr(
params.time,
params.timeZone,
params.groupBy,
params.groupByCount
);
function classifierFor(params) {
let classifier = 'serial';
if (params.grouping && isCyclic(params.grouping)) {
classifier = 'cyclic';
} else if (params.format === 'iso') {
// TODO: validate group_by_count === 1, No epoch
return isoSqlExpr(
params.time,
params.timeZone,
params.groupBy,
params.groupByCount
);
} else {
return serialSqlExpr(
params.time,
params.timeZone,
params.groupBy,
params.groupByCount,
params.starting
);
classifier = 'iso';
}
return CLASSIFIERS[classifier];
}
module.exports = classificationSql;
// Returns the SQL expression that classifies a time column as described by `params`.
// The classifier is chosen with `classifierFor`; the parameters are validated
// (`validateParameters` throws on invalid ones) before the SQL is generated.
function classificationSql(params) {
    const { checkParams, sqlExpr } = classifierFor(params);
    validateParameters(params, checkParams);
    return sqlExpr(params);
}
module.exports = classificationSql;