134 lines
4.5 KiB
Ruby
134 lines
4.5 KiB
Ruby
|
module Carto
|
||
|
class UserTableIndexService
|
||
|
|
||
|
include ::LoggerHelper
|
||
|
|
||
|
AUTO_INDEX_TTL_MS = 600000
|
||
|
AUTO_INDEX_PREFIX = '_auto_idx_'.freeze
|
||
|
INDEXABLE_WIDGET_TYPES = %w(histogram category time-series).freeze
|
||
|
|
||
|
# Not worth it to create indices on small tables, where a full scan is cheap.
|
||
|
# Maybe better to use number of pages instead of rows instead.
|
||
|
MINIMUM_ROW_COUNT_TO_INDEX = 10000
|
||
|
# Number of categories (most common) shown by default in widgets, used to determine the number
|
||
|
# of categories likely to be used for filtering.
|
||
|
CATEGORIES_SHOWN_IN_WIDGET = 5
|
||
|
# Number of different values in a column to consider indexing it, to avoid indexing boolean columns.
|
||
|
# Note: Also used to index columns with less values but an unbalanced probability distribution.
|
||
|
MINIMUM_COLUMN_VALUES_TO_INDEX = 5
|
||
|
# Regardless of anything, columns with high clustering factor (sorted in physical order) are also indexed.
|
||
|
MINIMUM_CORRELATION_TO_INDEX = 0.9
|
||
|
|
||
|
def initialize(user_table)
|
||
|
@user_table = user_table
|
||
|
@table = user_table.service
|
||
|
end
|
||
|
|
||
|
def update_auto_indices
|
||
|
bolt = Carto::Bolt.new("user_table:#{@user_table.id}:auto_index", ttl_ms: AUTO_INDEX_TTL_MS)
|
||
|
|
||
|
bolt.run_locked { generate_indices }
|
||
|
end
|
||
|
|
||
|
private
|
||
|
|
||
|
def generate_indices
|
||
|
auto_indices(valid: false).each do |idx|
|
||
|
@table.drop_index(idx[:column], AUTO_INDEX_PREFIX, concurrent: true)
|
||
|
end
|
||
|
|
||
|
widget_columns = (@table.estimated_row_count > MINIMUM_ROW_COUNT_TO_INDEX) ? columns_with_widgets : []
|
||
|
columns_to_index = widget_columns.select { |c| indexable_column?(c) }
|
||
|
|
||
|
indexed_columns = valid_indices.map { |i| i[:column] }
|
||
|
create_index_on = columns_to_index - indexed_columns
|
||
|
create_index_on.each do |col|
|
||
|
@table.create_index(col, AUTO_INDEX_PREFIX, concurrent: true)
|
||
|
end
|
||
|
|
||
|
auto_indexed_columns = auto_indices.map { |i| i[:column] }
|
||
|
drop_index_on = auto_indexed_columns - columns_to_index
|
||
|
drop_index_on.each do |col|
|
||
|
@table.drop_index(col, AUTO_INDEX_PREFIX, concurrent: true)
|
||
|
end
|
||
|
rescue StandardError => e
|
||
|
log_error(exception: e, message: 'Error auto-indexing table', table: @user_table)
|
||
|
end
|
||
|
|
||
|
def indexable_column?(column)
|
||
|
stats = pg_stats_by_column[column]
|
||
|
return false unless stats
|
||
|
|
||
|
# Accept columns with several different values
|
||
|
# Checks common values and accepts cases with several value
|
||
|
# or with few but unbalanced values (e.g: boolean with 80/20 distribution)
|
||
|
common_freqs = stats[:most_common_freqs] || stats[:most_common_elem_freqs]
|
||
|
if common_freqs.present?
|
||
|
check_common_freq = common_freqs[CATEGORIES_SHOWN_IN_WIDGET - 1] || common_freqs.last
|
||
|
if check_common_freq < (1.0 / MINIMUM_COLUMN_VALUES_TO_INDEX)
|
||
|
return true
|
||
|
end
|
||
|
else
|
||
|
# No histogram, rely on distinct values. Note: Values < 0 represent a proportion over the number of rows.
|
||
|
# They are generated when the analyzer gives up counting or identifies the value as a continuous magnitude
|
||
|
distinct = stats[:n_distinct]
|
||
|
return true if distinct < 0 || distinct > MINIMUM_COLUMN_VALUES_TO_INDEX
|
||
|
end
|
||
|
|
||
|
# Accept columns with high correlation (values related to physical row order)
|
||
|
if stats[:correlation].abs > MINIMUM_CORRELATION_TO_INDEX
|
||
|
return true
|
||
|
end
|
||
|
|
||
|
# Default
|
||
|
false
|
||
|
end
|
||
|
|
||
|
def auto_indices(valid: true)
|
||
|
indices.select { |i| i[:name].starts_with?(AUTO_INDEX_PREFIX) && i[:valid] == valid }
|
||
|
end
|
||
|
|
||
|
def valid_indices
|
||
|
indices.select { |i| i[:valid] }
|
||
|
end
|
||
|
|
||
|
def indices
|
||
|
@indices ||= @table.pg_indexes
|
||
|
end
|
||
|
|
||
|
def columns_with_widgets
|
||
|
columns = Set.new
|
||
|
table_widgets.select { |w| INDEXABLE_WIDGET_TYPES.include?(w.type) }.each do |w|
|
||
|
columns.add(w.column)
|
||
|
end
|
||
|
columns
|
||
|
end
|
||
|
|
||
|
def table_widgets
|
||
|
widgets.select do |w|
|
||
|
node = w.analysis_node
|
||
|
node && node.table_source?(@user_table.name)
|
||
|
end
|
||
|
end
|
||
|
|
||
|
def widgets
|
||
|
@user_table.layers.map(&:widgets).flatten
|
||
|
end
|
||
|
|
||
|
def pg_stats_by_column
|
||
|
@stats ||= get_pg_stats_by_column
|
||
|
end
|
||
|
|
||
|
def get_pg_stats_by_column
|
||
|
@table.update_table_pg_stats
|
||
|
stats = @table.pg_stats
|
||
|
if stats && !stats.empty?
|
||
|
stats.map { |s| { s[:attname] => s } }.reduce(:merge)
|
||
|
else
|
||
|
log_warning(message: 'Error retrieving stats for table', table: @user_table.attributes.slice(:id))
|
||
|
{}
|
||
|
end
|
||
|
end
|
||
|
end
|
||
|
end
|