# RSpec examples for CartoDB::Importer2::ContentGuesser.
require_relative '../../lib/importer/content_guesser'
|
|
require_relative '../../../../spec/rspec_configuration.rb'
|
|
|
|
describe CartoDB::Importer2::ContentGuesser do
|
|
|
|
# Runs before every example: CartoDB::Stats::Aggregator.read_config is stubbed
# to return an empty hash (presumably so the examples do not depend on a real
# stats configuration -- confirm against the aggregator).
before do
  CartoDB::Stats::Aggregator.stubs(:read_config).returns({})
end
|
|
|
|
# Checks what #enabled? returns for different shapes of the options hash.
describe '#enabled?' do
  # Builds a guesser with nil db, table name and schema, and the given options.
  def guesser_with(options)
    CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, options)
  end

  it 'returns a true value if set so in options' do
    guesser_with({ guessing: { enabled: true } }).enabled?.should eq(true)
  end

  it 'returns a false value if set so in options' do
    guesser_with({ guessing: { enabled: false } }).enabled?.should eq(false)
  end

  it 'returns a false-like value if not set in options' do
    guesser_with({}).enabled?.should eq(false)
  end
end
|
|
|
|
# Checks #country_column with #columns and #is_country_column? stubbed out,
# so only the selection logic is exercised.
describe '#country_column' do
  # Builds a guesser whose only non-nil input is the guessing flag.
  def build_guesser(enabled)
    CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, { guessing: { enabled: enabled } })
  end

  it 'returns nil if guessing is not enabled' do
    build_guesser(false).country_column.should eq(nil)
  end

  it 'returns the first column name which contents are countries, if present' do
    content_guesser = build_guesser(true)
    # Column name => stubbed answer of is_country_column? for that column.
    verdicts = {
      'any_column'       => false,
      'country_column'   => true,
      'any_other_column' => false
    }
    content_guesser.stubs(:columns).returns(verdicts.keys.map { |name| { column_name: name } })
    verdicts.each do |name, is_country|
      content_guesser.stubs(:is_country_column?).with({ column_name: name }).returns(is_country)
    end

    content_guesser.country_column.should eq('country_column')
  end

  it "returns nil if there's no column containing countries" do
    content_guesser = build_guesser(true)
    plain_columns = %w[any_column any_other_column].map { |name| { column_name: name } }
    content_guesser.stubs(:columns).returns(plain_columns)
    content_guesser.stubs(:is_country_column?).returns(false)

    content_guesser.country_column.should be_nil
  end
end
|
|
|
|
# Checks that #columns returns whatever the db object's #[] call yields.
describe '#columns' do
  it 'queries the db to get a list of columns with their corresponding data types' do
    expected_columns = :any_iterable_list_of_columns
    connection = mock
    connection.expects(:[]).returns(expected_columns)

    guesser = CartoDB::Importer2::ContentGuesser.new(connection, 'any_table_name', 'any_schema', nil)

    guesser.columns.should == expected_columns
  end
end
|
|
|
|
# Checks #is_country_column? with #sample, #countries and #threshold stubbed.
describe '#is_country_column?' do
  let(:guesser) { CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, { guessing: { enabled: true } }) }
  let(:column) { { column_name: 'candidate_column_name', data_type: 'text' } }
  let(:known_countries) { Set.new(%w[usa spain france canada]) }
  # Three sampled rows: two country names and one value that is not a country.
  let(:mixed_sample) do
    ['USA', 'Spain', 'not a country'].map { |value| { candidate_column_name: value } }
  end

  # Every example in this group uses the same 0.5 threshold.
  before { guesser.stubs(:threshold).returns(0.5) }

  it 'returns true if a sample proportion is above a given threshold' do
    guesser.stubs(:sample).returns(mixed_sample)
    guesser.stubs(:countries).returns(known_countries)

    # The stats object must receive the proportion 2/3 exactly once.
    stats = mock
    stats.expects(:gauge).once.with('country_proportion', 2.0 / 3.0)
    guesser.set_importer_stats(stats)

    guesser.is_country_column?(column).should eq(true)
  end

  it 'returns false if sample.count == 0' do
    guesser.stubs(:sample).returns([])
    guesser.stubs(:countries).returns(known_countries)

    guesser.is_country_column?(column).should eq(false)
  end

  it 'returns false if countries.count == 0' do
    guesser.stubs(:sample).returns(mixed_sample)
    guesser.stubs(:countries).returns(Set.new([]))

    guesser.is_country_column?(column).should eq(false)
  end

  it 'returns false if sample.count == 0 and countries.count == 0' do
    guesser.stubs(:sample).returns([])
    guesser.stubs(:countries).returns(Set.new([]))

    guesser.is_country_column?(column).should eq(false)
  end
end
|
|
|
|
# Checks #is_text_type? against an 'integer' and a 'text' column description.
describe '#is_text_type?' do
  let(:guesser) { CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, nil) }

  it 'returns false if the column type is not compatible' do
    guesser.is_text_type?({ data_type: 'integer' }).should eq(false)
  end

  it 'returns true if the column type is of a compatible type' do
    guesser.is_text_type?({ data_type: 'text' }).should eq(true)
  end
end
|
|
|
|
# Checks #countries against a mocked geocoder SQL API.
describe '#countries' do
  # Returns a guesser wired to an SQL API mock. The mock expects the countries
  # query exactly once and answers with one row per entry in +values+, keyed by
  # COUNTRIES_COLUMN.
  def guesser_fetching(values)
    rows = values.map { |value| { CartoDB::Importer2::ContentGuesser::COUNTRIES_COLUMN => value } }

    sql_api = mock
    sql_api
      .expects(:fetch)
      .once
      .with(CartoDB::Importer2::ContentGuesser::COUNTRIES_QUERY)
      .returns(rows)

    content_guesser = CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, nil)
    content_guesser.geocoder_sql_api = sql_api
    content_guesser
  end

  it 'queries the sql api to get a Set of countries' do
    names = ['usa', 'united states', 'spain', 'es', 'france', 'fr']

    guesser_fetching(names).countries.should eq(Set.new(names))
  end

  it 'caches the response so no need to call the sql api on successive calls' do
    content_guesser = guesser_fetching([])

    # Two reads, while the mock only tolerates a single fetch.
    2.times { content_guesser.countries.should eq(Set.new([])) }
  end

  it 'shall not add countries from DB if length < 2' do
    content_guesser = guesser_fetching(['usa', 'united states', 'fr', 's', ''])

    content_guesser.countries.should eq(Set.new(['usa', 'united states', 'fr']))
  end
end
|
|
|
|
# Checks which column #id_column picks from the list the db mock returns.
describe '#id_column' do
  # Returns a guesser whose db mock expects a single #[] call and answers with
  # one column description per [name, type] pair.
  def guesser_for(name_type_pairs)
    described_columns = name_type_pairs.map do |name, type|
      { column_name: name, data_type: type }
    end

    connection = mock
    connection.expects(:[]).once.returns(described_columns)
    CartoDB::Importer2::ContentGuesser.new(connection, nil, nil, nil)
  end

  it 'should return a column name known to be sequential and with index' do
    content_guesser = guesser_for([%w[data string], %w[ogc_fid integer], %w[more_data string]])

    content_guesser.id_column.should eq('ogc_fid')
  end

  it 'should use objectid in case the file is a gdb one' do
    content_guesser = guesser_for([%w[data string], %w[objectid integer], %w[more_data string]])

    content_guesser.id_column.should eq('objectid')
  end

  it "should raise an exception if there's no suitable id column" do
    content_guesser = guesser_for([%w[data string], %w[more_data string]])

    expect { content_guesser.id_column }.to raise_error(CartoDB::Importer2::ContentGuesserException)
  end
end
|
|
|
|
# Compares #metric_entropy of a stubbed sample with and without the country
# name normalizer applied.
describe '#metric_entropy' do
  it 'should be low for repeated elements after normalization' do
    content_guesser = CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, nil)
    candidate = { column_name: 'candidate_column_name' }
    sampled_rows = %w[1400US600 1400US601 1400US602].map { |value| { candidate_column_name: value } }
    content_guesser.stubs(:sample).returns(sampled_rows)

    raw_entropy = content_guesser.metric_entropy(candidate)
    raw_entropy.should > 0.99

    normalized_entropy = content_guesser.metric_entropy(candidate, content_guesser.country_name_normalizer)
    normalized_entropy.should < 0.5
  end
end
|
|
|
|
# Checks that the callable returned by #country_name_normalizer maps nil to ''.
describe '#country_name_normalizer' do
  it 'should handle gracefully nil values' do
    normalizer = CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, nil).country_name_normalizer

    normalizer.call(nil).should == ''
  end
end
|
|
|
|
# Checks #is_ip_column? with #sample and #threshold (0.9) stubbed.
describe '#is_ip_column?' do
  # Returns what #is_ip_column? answers for a text column whose stubbed sample
  # holds the given values.
  def ip_verdict_for(values)
    content_guesser = CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, { guessing: { enabled: true } })
    candidate = { column_name: 'candidate_column_name', data_type: 'text' }
    content_guesser.stubs(:sample).returns(values.map { |value| { candidate_column_name: value } })
    content_guesser.stubs(:threshold).returns(0.9)

    content_guesser.is_ip_column?(candidate)
  end

  it "returns true if column contains IP's" do
    ip_verdict_for(%w[192.168.1.1 162.243.83.87 173.194.66.104]).should eq(true)
  end

  it 'returns false if sample contains a bunch of integers #1803' do
    ip_verdict_for(%w[12345 67891 1024]).should eq(false)
  end
end
|
|
|
|
end
|