cartodb-4.42/app/services/carto/user_table_index_service.rb

134 lines
4.5 KiB
Ruby
Raw Permalink Normal View History

2024-04-06 13:25:13 +08:00
module Carto
class UserTableIndexService
include ::LoggerHelper
AUTO_INDEX_TTL_MS = 600000
AUTO_INDEX_PREFIX = '_auto_idx_'.freeze
INDEXABLE_WIDGET_TYPES = %w(histogram category time-series).freeze
# Not worth it to create indices on small tables, where a full scan is cheap.
# Maybe better to use number of pages instead of rows instead.
MINIMUM_ROW_COUNT_TO_INDEX = 10000
# Number of categories (most common) shown by default in widgets, used to determine the number
# of categories likely to be used for filtering.
CATEGORIES_SHOWN_IN_WIDGET = 5
# Number of different values in a column to consider indexing it, to avoid indexing boolean columns.
# Note: Also used to index columns with less values but an unbalanced probability distribution.
MINIMUM_COLUMN_VALUES_TO_INDEX = 5
# Regardless of anything, columns with high clustering factor (sorted in physical order) are also indexed.
MINIMUM_CORRELATION_TO_INDEX = 0.9
def initialize(user_table)
@user_table = user_table
@table = user_table.service
end
def update_auto_indices
bolt = Carto::Bolt.new("user_table:#{@user_table.id}:auto_index", ttl_ms: AUTO_INDEX_TTL_MS)
bolt.run_locked { generate_indices }
end
private
def generate_indices
auto_indices(valid: false).each do |idx|
@table.drop_index(idx[:column], AUTO_INDEX_PREFIX, concurrent: true)
end
widget_columns = (@table.estimated_row_count > MINIMUM_ROW_COUNT_TO_INDEX) ? columns_with_widgets : []
columns_to_index = widget_columns.select { |c| indexable_column?(c) }
indexed_columns = valid_indices.map { |i| i[:column] }
create_index_on = columns_to_index - indexed_columns
create_index_on.each do |col|
@table.create_index(col, AUTO_INDEX_PREFIX, concurrent: true)
end
auto_indexed_columns = auto_indices.map { |i| i[:column] }
drop_index_on = auto_indexed_columns - columns_to_index
drop_index_on.each do |col|
@table.drop_index(col, AUTO_INDEX_PREFIX, concurrent: true)
end
rescue StandardError => e
log_error(exception: e, message: 'Error auto-indexing table', table: @user_table)
end
def indexable_column?(column)
stats = pg_stats_by_column[column]
return false unless stats
# Accept columns with several different values
# Checks common values and accepts cases with several value
# or with few but unbalanced values (e.g: boolean with 80/20 distribution)
common_freqs = stats[:most_common_freqs] || stats[:most_common_elem_freqs]
if common_freqs.present?
check_common_freq = common_freqs[CATEGORIES_SHOWN_IN_WIDGET - 1] || common_freqs.last
if check_common_freq < (1.0 / MINIMUM_COLUMN_VALUES_TO_INDEX)
return true
end
else
# No histogram, rely on distinct values. Note: Values < 0 represent a proportion over the number of rows.
# They are generated when the analyzer gives up counting or identifies the value as a continuous magnitude
distinct = stats[:n_distinct]
return true if distinct < 0 || distinct > MINIMUM_COLUMN_VALUES_TO_INDEX
end
# Accept columns with high correlation (values related to physical row order)
if stats[:correlation].abs > MINIMUM_CORRELATION_TO_INDEX
return true
end
# Default
false
end
def auto_indices(valid: true)
indices.select { |i| i[:name].starts_with?(AUTO_INDEX_PREFIX) && i[:valid] == valid }
end
def valid_indices
indices.select { |i| i[:valid] }
end
def indices
@indices ||= @table.pg_indexes
end
def columns_with_widgets
columns = Set.new
table_widgets.select { |w| INDEXABLE_WIDGET_TYPES.include?(w.type) }.each do |w|
columns.add(w.column)
end
columns
end
def table_widgets
widgets.select do |w|
node = w.analysis_node
node && node.table_source?(@user_table.name)
end
end
def widgets
@user_table.layers.map(&:widgets).flatten
end
def pg_stats_by_column
@stats ||= get_pg_stats_by_column
end
def get_pg_stats_by_column
@table.update_table_pg_stats
stats = @table.pg_stats
if stats && !stats.empty?
stats.map { |s| { s[:attname] => s } }.reduce(:merge)
else
log_warning(message: 'Error retrieving stats for table', table: @user_table.attributes.slice(:id))
{}
end
end
end
end