1167 Move the sampling code to a TableSampler class

pull/1186/head
Rafa de la Torre 10 years ago
parent ed3c4d89c6
commit c0f15a144d

@ -1,5 +1,7 @@
# encoding: utf-8
require_relative 'table_sampler'
module CartoDB
module Importer2
class ContentGuesser
@ -74,6 +76,10 @@ module CartoDB
matches.to_f / sample.count
end
# Rows sampled from the imported table.
# The work is delegated to TableSampler; the result is cached in @sample so
# the sampler is only built and queried once per guesser instance.
def sample
  return @sample if @sample
  sampler = TableSampler.new(@db, qualified_table_name, IDS_COLUMN, sample_size)
  @sample = sampler.sample
end
# Value stored under @options[:guessing][:threshold].
def threshold
  guessing_options = @options[:guessing]
  guessing_options[:threshold]
end
@ -82,67 +88,6 @@ module CartoDB
['character varying', 'varchar', 'text'].include? column[:data_type]
end
# Rows selected from the table, restricted by sample_where_clause.
# The query runs once; its result is cached in @sample.
def sample
  @sample ||= begin
    select_sql = "\nSELECT * FROM #{qualified_table_name}\n#{sample_where_clause}\n"
    @db[select_sql].all
  end
end
# SQL WHERE fragment used by the sampling query.
# Returns an empty string when the id range is no larger than the sample
# size (no filtering needed); otherwise an IN (...) list of the sampled ids.
def sample_where_clause
  return "" if ids_count <= sample_size
  id_list = sample_indices.to_a.join(',')
  "WHERE #{IDS_COLUMN} IN (#{id_list})"
end
# Size of the inclusive id range (max_id - min_id + 1), cached in @ids_count.
# This is the span of ids, not necessarily the number of rows.
def ids_count
  return @ids_count if @ids_count
  span = max_id - min_id
  @ids_count = span + 1
end
# Smallest id reported by index_limits, cached in @min_id.
def min_id
  return @min_id if @min_id
  limits = index_limits
  @min_id = limits[:min_id]
end
# Largest id reported by index_limits, cached in @max_id.
def max_id
  return @max_id if @max_id
  limits = index_limits
  @max_id = limits[:max_id]
end
# TODO: move to a collaborator
# Chooses how to build the set of sampled ids: when the sample is less than
# half of the id range, ids are added to an empty set; otherwise ids are
# removed from the full range.
def sample_indices
  return sample_indices_add_method if ids_count / 2 > sample_size
  sample_indices_delete_method
end
# Builds a Set of `sample_size` distinct ids by drawing random values from
# min_id..max_id until enough different ones have been collected.
#
# Goes through the min_id/max_id accessors rather than the raw instance
# variables, so it no longer depends on the memoized values having been
# populated by an earlier call.
#
# NOTE: only terminates when the id range holds at least `sample_size`
# values; sample_indices only picks this strategy in that situation.
def sample_indices_add_method
  id_range = min_id..max_id
  picked = Set.new
  picked.add(rand(id_range)) while picked.size < sample_size
  picked
end
# Builds a Set of `sample_size` distinct ids by starting from every id in
# min_id..max_id and deleting randomly drawn ids until only `sample_size`
# remain.
#
# Goes through the min_id/max_id accessors rather than the raw instance
# variables, so it no longer depends on the memoized values having been
# populated by an earlier call.
def sample_indices_delete_method
  id_range = min_id..max_id
  remaining = Set.new(id_range)
  # A draw that hits an already-removed id is a no-op; the loop just retries.
  remaining.delete(rand(id_range)) while remaining.size > sample_size
  remaining
end
# First row of a min/max aggregate over IDS_COLUMN, exposing :min_id and
# :max_id. The query runs once; its result is cached in @index_limits.
def index_limits
  return @index_limits if @index_limits
  limits_sql = "\nSELECT min(#{IDS_COLUMN}) AS min_id, max(#{IDS_COLUMN}) AS max_id\nFROM #{qualified_table_name}\n"
  @index_limits = @db[limits_sql].first
end
# Value stored under @options[:guessing][:sample_size].
def sample_size
  guessing_options = @options[:guessing]
  guessing_options[:sample_size]
end

@ -0,0 +1,78 @@
# encoding: utf-8
require 'set'

module CartoDB
  module Importer2
    # Fetches a random sample of rows from a table, choosing rows by drawing
    # random values of an integer id column between its minimum and maximum.
    #
    # `db` must respond to `[]` with an SQL string and return an object that
    # responds to `all` and `first` (presumably a Sequel database -- confirm
    # against the caller).
    class TableSampler
      attr_reader :db, :qualified_table_name, :ids_column, :sample_size

      # db                   - database handle used to run the queries
      # qualified_table_name - table name, interpolated verbatim into the SQL
      # ids_column           - name of the id column used for sampling
      # sample_size          - number of ids to sample
      def initialize(db, qualified_table_name, ids_column, sample_size)
        @db = db
        @qualified_table_name = qualified_table_name
        @ids_column = ids_column
        @sample_size = sample_size
      end

      # Runs the sampling query and returns all resulting rows. The query is
      # not cached here: every call hits the database again.
      def sample
        db[%Q(
          SELECT * FROM #{qualified_table_name}
          #{sample_where_clause}
        )].all
      end

      # SQL WHERE fragment for the sampling query: empty when the id range is
      # no larger than the sample size (which includes the empty-table case),
      # otherwise an IN (...) list of the sampled ids.
      def sample_where_clause
        return "" if ids_count <= sample_size
        "WHERE #{ids_column} IN (#{sample_indices.to_a.join(',')})"
      end

      # Set of sampled ids. Uses the cheaper strategy for the situation:
      # add random ids when the sample is under half of the id range,
      # otherwise delete random ids from the full range.
      def sample_indices
        if ids_count / 2 > sample_size
          sample_indices_add_method
        else
          sample_indices_delete_method
        end
      end

      # Draws random ids from min_id..max_id until `sample_size` distinct
      # values have been collected. Only terminates when the range holds at
      # least `sample_size` values, which sample_indices guarantees.
      def sample_indices_add_method
        id_range = min_id..max_id
        picked = Set.new
        picked.add(rand(id_range)) while picked.size < sample_size
        picked
      end

      # Starts from every id in min_id..max_id and deletes randomly drawn ids
      # until only `sample_size` remain.
      def sample_indices_delete_method
        id_range = min_id..max_id
        remaining = Set.new(id_range)
        # A draw that hits an already-removed id is a no-op; the loop retries.
        remaining.delete(rand(id_range)) while remaining.size > sample_size
        remaining
      end

      # Size of the inclusive id range (max_id - min_id + 1). This is the span
      # of ids, not necessarily the number of rows. Returns 0 when the
      # min/max query yields NULLs, i.e. the table has no ids to sample.
      def ids_count
        @ids_count ||=
          if min_id && max_id
            max_id - min_id + 1
          else
            0
          end
      end

      # Smallest value of the id column, or nil when the query returned NULL.
      def min_id
        @min_id ||= id_min_max[:min_id]
      end

      # Largest value of the id column, or nil when the query returned NULL.
      def max_id
        @max_id ||= id_min_max[:max_id]
      end

      # First row of the min/max aggregate over the id column, cached so the
      # aggregate query runs only once per sampler.
      def id_min_max
        @id_min_max ||= db[%Q(
          SELECT min(#{ids_column}) AS min_id, max(#{ids_column}) AS max_id
          FROM #{qualified_table_name}
        )].first
      end
    end
  end
end
Loading…
Cancel
Save