cartodb-4.42/services/importer/spec/unit/content_guesser_spec.rb

285 lines
10 KiB
Ruby
Raw Normal View History

2024-04-06 13:25:13 +08:00
require_relative '../../lib/importer/content_guesser'
require_relative '../../../../spec/rspec_configuration.rb'
describe CartoDB::Importer2::ContentGuesser do
# Before every example, stub the stats aggregator's config read so it returns an empty hash.
before { CartoDB::Stats::Aggregator.stubs(:read_config).returns({}) }
# Checks what #enabled? reports for different shapes of the options hash.
describe '#enabled?' do
  # Builds a guesser with nil db, table name and schema; only the options vary here.
  def guesser_with(options)
    CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, options)
  end

  it 'returns a true value if set so in options' do
    guesser_with({ guessing: { enabled: true } }).enabled?.should eq(true)
  end

  it 'returns a false value if set so in options' do
    guesser_with({ guessing: { enabled: false } }).enabled?.should eq(false)
  end

  it 'returns a false-like value if not set in options' do
    guesser_with({}).enabled?.should eq(false)
  end
end
# Checks which column name (if any) #country_column picks, with #columns and
# #is_country_column? stubbed out.
describe '#country_column' do
  # Builds a guesser with nil db, table name and schema and the given guessing flag.
  def build_guesser(enabled)
    CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, { guessing: { enabled: enabled } })
  end

  it 'returns nil if guessing is not enabled' do
    build_guesser(false).country_column.should eq(nil)
  end

  it 'returns the first column name which contents are countries, if present' do
    candidate = build_guesser(true)
    # Each pair is [column name, stubbed answer of is_country_column? for that column].
    verdicts = [
      ['any_column', false],
      ['country_column', true],
      ['any_other_column', false]
    ]
    candidate.stubs(:columns).returns(verdicts.map { |name, _| { column_name: name } })
    verdicts.each do |name, is_country|
      candidate.stubs(:is_country_column?).with({ column_name: name }).returns(is_country)
    end
    candidate.country_column.should eq('country_column')
  end

  it "returns nil if there's no column containing countries" do
    candidate = build_guesser(true)
    listing = %w[any_column any_other_column].map { |name| { column_name: name } }
    candidate.stubs(:columns).returns(listing)
    candidate.stubs(:is_country_column?).returns(false)
    candidate.country_column.should be_nil
  end
end
# Checks that #columns hands back whatever the db object returns from its #[] call.
describe '#columns' do
  it 'queries the db to get a list of columns with their corresponding data types' do
    connection = mock
    connection.expects(:[]).returns(:any_iterable_list_of_columns)
    guesser = CartoDB::Importer2::ContentGuesser.new(connection, 'any_table_name', 'any_schema', nil)
    guesser.columns.should eq(:any_iterable_list_of_columns)
  end
end
# Checks #is_country_column? against stubbed samples, country sets and a 0.5 threshold.
describe '#is_country_column?' do
  let(:guesser) { CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, { guessing: { enabled: true } }) }
  let(:column) { { column_name: 'candidate_column_name', data_type: 'text' } }

  # Stubs the sample rows (one row per value), the known countries set and the threshold.
  def stub_inputs(sample_values, known_countries)
    guesser.stubs(:sample).returns(sample_values.map { |value| { candidate_column_name: value } })
    guesser.stubs(:countries).returns(Set.new(known_countries))
    guesser.stubs(:threshold).returns(0.5)
  end

  it 'returns true if a sample proportion is above a given threshold' do
    stub_inputs(['USA', 'Spain', 'not a country'], %w[usa spain france canada])
    stats = mock
    # Two of the three sampled values are in the stubbed countries set.
    stats.expects(:gauge).once.with('country_proportion', 2.0 / 3.0)
    guesser.set_importer_stats(stats)
    guesser.is_country_column?(column).should eq(true)
  end

  it 'returns false if sample.count == 0' do
    stub_inputs([], %w[usa spain france canada])
    guesser.is_country_column?(column).should eq(false)
  end

  it 'returns false if countries.count == 0' do
    stub_inputs(['USA', 'Spain', 'not a country'], [])
    guesser.is_country_column?(column).should eq(false)
  end

  it 'returns false if sample.count == 0 and countries.count == 0' do
    stub_inputs([], [])
    guesser.is_country_column?(column).should eq(false)
  end
end
# Checks #is_text_type? for an 'integer' and a 'text' data_type.
describe '#is_text_type?' do
  let(:guesser) { CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, nil) }

  it 'returns false if the column type is not compatible' do
    guesser.is_text_type?({ data_type: 'integer' }).should eq(false)
  end

  it 'returns true if the column type is of a compatible type' do
    guesser.is_text_type?({ data_type: 'text' }).should eq(true)
  end
end
# Checks #countries against a mocked geocoder sql api that answers COUNTRIES_QUERY.
describe '#countries' do
  # Turns plain names into rows keyed by COUNTRIES_COLUMN, as the mocked api returns them.
  def country_rows(*names)
    names.map { |name| { CartoDB::Importer2::ContentGuesser::COUNTRIES_COLUMN => name } }
  end

  # Builds a guesser whose sql api is a mock expecting exactly one fetch of
  # COUNTRIES_QUERY, answered with the given rows.
  def guesser_backed_by(rows)
    sql_api = mock
    sql_api
      .expects(:fetch)
      .once
      .with(CartoDB::Importer2::ContentGuesser::COUNTRIES_QUERY)
      .returns(rows)
    instance = CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, nil)
    instance.geocoder_sql_api = sql_api
    instance
  end

  it 'queries the sql api to get a Set of countries' do
    names = ['usa', 'united states', 'spain', 'es', 'france', 'fr']
    guesser = guesser_backed_by(country_rows(*names))
    guesser.countries.should eq(Set.new(['usa', 'united states', 'spain', 'es', 'france', 'fr']))
  end

  it 'caches the response so no need to call the sql api on successive calls' do
    guesser = guesser_backed_by([])
    # Two reads, while the mock only tolerates a single fetch.
    2.times { guesser.countries.should eq(Set.new([])) }
  end

  it 'shall not add countries from DB if length < 2' do
    guesser = guesser_backed_by(country_rows('usa', 'united states', 'fr', 's', ''))
    guesser.countries.should eq(Set.new(['usa', 'united states', 'fr']))
  end
end
# Checks which column #id_column selects from the list the mocked db returns.
describe '#id_column' do
  # Column fixture with the given name and a "string" data type.
  def string_column(name)
    { column_name: name, data_type: 'string' }
  end

  # Column fixture with the given name and an "integer" data type.
  def integer_column(name)
    { column_name: name, data_type: 'integer' }
  end

  # Builds a guesser over a mocked db whose #[] must be called exactly once and
  # returns the given column list.
  def guesser_over(column_list)
    connection = mock
    connection.expects(:[]).once.returns(column_list)
    CartoDB::Importer2::ContentGuesser.new(connection, nil, nil, nil)
  end

  it 'should return a column name known to be sequential and with index' do
    listing = [string_column('data'), integer_column('ogc_fid'), string_column('more_data')]
    guesser_over(listing).id_column.should eq('ogc_fid')
  end

  it "should use objectid in case the file is a gdb one" do
    listing = [string_column('data'), integer_column('objectid'), string_column('more_data')]
    guesser_over(listing).id_column.should eq('objectid')
  end

  it "should raise an exception if there's no suitable id column" do
    guesser = guesser_over([string_column('data'), string_column('more_data')])
    expect { guesser.id_column }.to raise_error(CartoDB::Importer2::ContentGuesserException)
  end
end
# Compares #metric_entropy of the same stubbed sample with and without the
# country name normalizer applied.
describe '#metric_entropy' do
  it 'should be low for repeated elements after normalization' do
    guesser = CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, nil)
    column = { column_name: 'candidate_column_name' }
    sample_rows = %w[1400US600 1400US601 1400US602].map { |code| { candidate_column_name: code } }
    guesser.stubs(:sample).returns(sample_rows)

    raw_entropy = guesser.metric_entropy(column)
    raw_entropy.should be > 0.99

    normalized_entropy = guesser.metric_entropy(column, guesser.country_name_normalizer)
    normalized_entropy.should be < 0.5
  end
end
# Checks that the callable returned by #country_name_normalizer maps nil to an empty string.
describe '#country_name_normalizer' do
  it 'should handle gracefully nil values' do
    normalizer = CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, nil).country_name_normalizer
    normalizer.call(nil).should eq('')
  end
end
# Checks #is_ip_column? against stubbed samples with the threshold stubbed to 0.9.
describe '#is_ip_column?' do
  let(:guesser) { CartoDB::Importer2::ContentGuesser.new(nil, nil, nil, { guessing: { enabled: true } }) }
  let(:column) { { column_name: 'candidate_column_name', data_type: 'text' } }

  # Stubs the sample rows (one row per value) and the threshold.
  def stub_sample(values)
    guesser.stubs(:sample).returns(values.map { |value| { candidate_column_name: value } })
    guesser.stubs(:threshold).returns(0.9)
  end

  it "returns true if column contains IP's" do
    stub_sample(%w[192.168.1.1 162.243.83.87 173.194.66.104])
    guesser.is_ip_column?(column).should eq(true)
  end

  it 'returns false if sample contains a bunch of integers #1803' do
    stub_sample(%w[12345 67891 1024])
    guesser.is_ip_column?(column).should eq(false)
  end
end
end