# cartodb/spec/models/data_import_spec.rb
# 2020-06-15 10:58:47 +08:00
#
# 667 lines
# 24 KiB
# Ruby

require_relative '../spec_helper'
require_relative '../helpers/file_server_helper'
require_relative 'data_import_shared_examples'
require_relative '../../lib/carto/ghost_tables_manager'
describe DataImport do
# Class under test; presumably consumed by the shared examples included on the next line.
let(:data_import_class) { ::DataImport }
it_behaves_like 'DataImport model'
# Every example starts with a freshly created user and one table owned by it.
before(:each) do
@user = create_user
bypass_named_maps
@table = create_table(user_id: @user.id)
end
# Destroy the per-example user once the example has finished.
after(:each) do
@user.destroy
end
# Group-level cleanup; `try` turns the destroy into a no-op when @user is nil.
# NOTE(review): @user is only assigned in before(:each) — confirm it is
# actually visible (and not already destroyed) inside this after(:all) hook.
after(:all) do
bypass_named_maps
@user.try(:destroy)
end
# Imports the same query twice into 'target_table'. The second import uses
# the 'skip' collision strategy and must report error code 1022 (the
# description previously announced 1014, which is not what is asserted).
# NOTE(review): the skipped import is still expected to end in state
# 'complete' — confirm this is the intended state for a skipped import.
it "raises a 1022 error when strategy is set to skip and there's already a table with that name" do
  source_table = create_table(user_id: @user.id)
  source_table.insert_row!(name: 1.0)
  source_table.insert_row!(name: 2.0)
  query = "select * from #{source_table.name}"

  # First import creates the target table.
  data_import = DataImport.create(
    user_id: @user.id,
    table_name: 'target_table',
    from_query: query
  )
  data_import.run_import!
  data_import.state.should eq 'complete'

  # Second import collides with the table created above.
  data_import = DataImport.create(
    user_id: @user.id,
    table_name: 'target_table',
    from_query: query,
    collision_strategy: 'skip'
  )
  data_import.run_import!
  data_import.state.should eq 'complete'
  data_import.error_code.should eq 1022
end
# Joins a table whose `name` column is double precision with one whose
# `name` column holds the string '1', and expects the query import to
# finish with error code 8004.
it 'raises an 8004 error when merging tables
through columns with different types' do
  numeric_table = create_table(user_id: @user.id)
  text_table = create_table(user_id: @user.id)

  numeric_table.modify_column!(name: 'name', type: 'double precision')
  numeric_table.insert_row!(name: 1.0)
  text_table.insert_row!(name: '1')

  # The literal below is left unindented so the SQL text stays exactly as it was.
  merge_query = %(
SELECT #{text_table.name}.the_geom,
#{text_table.name}.description,
#{text_table.name}.name,
#{numeric_table.name}.the_geom AS #{numeric_table.name}_the_geom,
#{numeric_table.name}.description AS #{numeric_table.name}_description,
#{numeric_table.name}.name AS #{numeric_table.name}_name
FROM #{text_table.name} FULL OUTER JOIN #{numeric_table.name}
ON #{text_table.name}.name = #{numeric_table.name}.name
)

  import_attributes = {
    user_id: @user.id,
    table_name: 'merged_table',
    from_query: merge_query
  }
  data_import = DataImport.create(import_attributes)
  data_import.run_import!

  data_import.error_code.should == 8004
end
# With Table#cartodbfy stubbed to raise CartoDBfyInvalidID, the import must
# end up with error code 2011.
it 'raises a meaningful error if cartodb_id is not valid' do
  Table.any_instance.stubs(:cartodbfy).raises(CartoDB::CartoDBfyInvalidID)

  import_attributes = {
    user_id: @user.id,
    data_source: fake_data_path('clubbing.csv'),
    updated_at: Time.now
  }
  data_import = DataImport.create(import_attributes)
  data_import.run_import!

  data_import.error_code.should == 2011
end
# Three imports of the walmart_latlon fixture: two plain ones (the second
# gets the `_1` suffix) and a final truncated one with the overwrite
# strategy, which must replace the first table instead of adding a new one.
it 'should overwrite dataset if collision_strategy is set to overwrite' do
  # Number of rows in the table produced by the given import.
  rows_in = lambda do |import|
    import.user.in_database["select count(*) from #{import.table_name}"].all[0][:count]
  end

  carto_user = Carto::User.find(@user.id)
  carto_user.visualizations.count.should eq 1

  # 1) Plain import of the full file.
  first_import = create_import(overwrite: false, truncated: false)
  first_import.run_import!
  carto_user.reload
  carto_user.visualizations.count.should eq 2
  first_import.state.should eq 'complete'
  first_import.table_name.should eq 'walmart_latlon'
  rows_in.call(first_import).should eq 3176
  user_tables_should_be_registered

  # 2) Same import again without overwrite: a renamed copy is created.
  second_import = create_import(overwrite: false, truncated: false)
  second_import.run_import!
  carto_user.reload
  carto_user.visualizations.count.should eq 3
  second_import.state.should eq 'complete'
  second_import.table_name.should eq 'walmart_latlon_1'
  rows_in.call(second_import).should eq 3176
  user_tables_should_be_registered

  # 3) Truncated file with overwrite: no new visualization, contents replaced.
  overwriting_import = create_import(overwrite: true, truncated: true)
  overwriting_import.run_import!
  carto_user.reload
  carto_user.visualizations.count.should eq 3
  overwriting_import.state.should eq 'complete'
  overwriting_import.table_name.should eq 'walmart_latlon'
  rows_in.call(overwriting_import).should eq 2
  user_tables_should_be_registered
end
# Imports the walmart_latlon fixture, then overwrites that table with the
# result of a one-row query over itself.
it 'should overwrite dataset with query if collision_strategy is set to overwrite' do
  # Overwriting from SQL is a separate case that still needs to be tackled:
  # https://github.com/CartoDB/cartodb/issues/13139

  # Number of rows in the table produced by the given import.
  rows_in = lambda do |import|
    import.user.in_database["select count(*) from #{import.table_name}"].all[0][:count]
  end

  carto_user = Carto::User.find(@user.id)
  carto_user.visualizations.count.should eq 1

  file_import = create_import(overwrite: false, truncated: false)
  file_import.run_import!
  carto_user.reload
  carto_user.visualizations.count.should eq 2
  file_import.state.should eq 'complete'
  file_import.table_name.should eq 'walmart_latlon'
  rows_in.call(file_import).should eq 3176
  user_tables_should_be_registered

  query = 'select * from walmart_latlon limit 1'
  query_import = create_import_from_query(overwrite: true, from_query: query)
  query_import.run_import!
  carto_user.reload
  carto_user.visualizations.count.should eq 2
  query_import.state.should eq 'complete'
  query_import.table_name.should eq 'walmart_latlon'
  rows_in.call(query_import).should eq 1
  user_tables_should_be_registered
end
# An overwrite import whose query references a missing relation must fail
# with error code 8003 and log the database error.
it 'should raise an error if overwriting with non existent table in query' do
  missing_table_query = 'select * from walmart_latlon_wtf limit 1'
  data_import = create_import_from_query(overwrite: true, from_query: missing_table_query)

  data_import.run_import!

  data_import.state.should eq 'failure'
  data_import.error_code.should eq 8003
  data_import.log.entries.should match(/relation.*does not exist/)
end
# Overwriting an existing table with a file from the incomplete_schema
# fixture directory must raise IncompatibleSchemas and log it.
it 'should raise an exception if overwriting with missing data' do
  carto_user = Carto::User.find(@user.id)
  carto_user.visualizations.count.should eq 1

  # Seed walmart_latlon from the truncated fixture.
  seed_import = create_import(overwrite: false, truncated: true)
  seed_import.run_import!
  carto_user.reload
  carto_user.visualizations.count.should eq 2
  seed_import.state.should eq 'complete'
  seed_import.table_name.should eq 'walmart_latlon'
  seed_counts = seed_import.user.in_database["select count(*) from #{seed_import.table_name}"].all
  seed_counts[0][:count].should eq 2
  user_tables_should_be_registered

  overwriting_import = create_import(overwrite: true, truncated: true, incomplete_schema: true)
  expect { overwriting_import.run_import! }.to raise_error(::CartoDB::Importer2::IncompatibleSchemas)
  overwriting_import.log.entries.should match(/Exception: Incompatible Schemas/)
end
describe 'organization behaviour' do
  include_context 'organization with users helper'

  # Same overwrite flow as for a regular user, run for the organization
  # owner, whose username contains a hyphen.
  # NOTE(review): user_tables_should_be_registered checks @user, not
  # @org_user_owner — confirm that is intended here.
  it 'should overwrite even in users with hyphens in the schema name' do
    # Number of rows in the table produced by the given import.
    rows_in = lambda do |import|
      import.user.in_database["select count(*) from #{import.table_name}"].all[0][:count]
    end

    carto_user = Carto::User.find(@org_user_owner.id)
    expect(carto_user.username).to include '-' # Fixture check

    initial_import = create_import(user: @org_user_owner, overwrite: false, truncated: false)
    initial_import.run_import!
    carto_user.reload
    carto_user.visualizations.count.should eq 1
    initial_import.state.should eq 'complete'
    initial_import.table_name.should eq 'walmart_latlon'
    rows_in.call(initial_import).should eq 3176
    user_tables_should_be_registered

    overwriting_import = create_import(user: @org_user_owner, overwrite: true, truncated: true)
    overwriting_import.run_import!
    carto_user.reload
    carto_user.visualizations.count.should eq 1
    overwriting_import.state.should eq 'complete'
    overwriting_import.table_name.should eq 'walmart_latlon'
    rows_in.call(overwriting_import).should eq 2
    user_tables_should_be_registered
  end
end
# Seeds the table from the incomplete_schema fixture, then overwrites it
# with the regular truncated fixture; the overwrite must complete.
it 'should not raise exceptions if overwriting with more data' do
  # Number of rows in the table produced by the given import.
  rows_in = lambda do |import|
    import.user.in_database["select count(*) from #{import.table_name}"].all[0][:count]
  end

  carto_user = Carto::User.find(@user.id)
  carto_user.visualizations.count.should eq 1

  seed_import = create_import(overwrite: false, truncated: true, incomplete_schema: true)
  seed_import.run_import!
  carto_user.reload
  carto_user.visualizations.count.should eq 2
  seed_import.state.should eq 'complete'
  seed_import.table_name.should eq 'walmart_latlon'
  rows_in.call(seed_import).should eq 2
  user_tables_should_be_registered

  overwriting_import = create_import(overwrite: true, truncated: true, incomplete_schema: false)
  overwriting_import.run_import!
  carto_user.reload
  carto_user.visualizations.count.should eq 2
  overwriting_import.state.should eq 'complete'
  overwriting_import.table_name.should eq 'walmart_latlon'
  rows_in.call(overwriting_import).should eq 2
  user_tables_should_be_registered
end
# Expects Carto::GhostTablesManager to report @user's tables as synced with
# the database, failing with a custom message otherwise.
def user_tables_should_be_registered
  ghost_tables_manager = Carto::GhostTablesManager.new(@user.id)
  synced = ghost_tables_manager.user_tables_synced_with_db?
  synced.should(eq(true), "Tables not properly registered")
end
# Creates a pending file DataImport for a walmart_latlon.csv fixture.
#
# @param user the import owner (defaults to @user)
# @param overwrite [Boolean] use the 'overwrite' collision strategy when true
# @param truncated [Boolean] take the fixture from the truncated/ directory
# @param incomplete_schema [Boolean] take the fixture from the
#   incomplete_schema/ subdirectory
# @return the value returned by DataImport.create
def create_import(user: @user, overwrite:, truncated:, incomplete_schema: false)
  fixture_dir = 'spec/support/data/'
  fixture_dir += 'truncated/' if truncated
  fixture_dir += 'incomplete_schema/' if incomplete_schema

  import_attributes = {
    user_id: user.id,
    data_source: Rails.root.join("#{fixture_dir}walmart_latlon.csv").to_s,
    data_type: 'file',
    table_name: 'walmart_latlon',
    state: 'pending',
    success: false,
    updated_at: Time.now,
    created_at: Time.now,
    original_url: Rails.root.join('spec/support/data/walmart_latlon.csv').to_s,
    cartodbfy_time: 0.0,
    collision_strategy: ('overwrite' if overwrite)
  }
  DataImport.create(import_attributes)
end
# Creates a pending query DataImport targeting the walmart_latlon table.
#
# @param user the import owner (defaults to @user)
# @param overwrite [Boolean] use the 'overwrite' collision strategy when true
# @param from_query [String] SQL used both as data_source and from_query
# @return the value returned by DataImport.create
def create_import_from_query(user: @user, overwrite:, from_query: '')
  import_attributes = {
    user_id: user.id,
    data_source: from_query,
    from_query: from_query,
    data_type: 'query',
    table_name: 'walmart_latlon',
    state: 'pending',
    success: false,
    updated_at: Time.now,
    created_at: Time.now,
    original_url: Rails.root.join('spec/support/data/walmart_latlon.csv').to_s,
    cartodbfy_time: 0.0,
    collision_strategy: ('overwrite' if overwrite)
  }
  DataImport.create(import_attributes)
end
# With the user's byte quota set to zero the import must report error 8001.
it 'raises a meaningful error if over storage quota' do
  original_quota = @user.quota_in_bytes
  @user.quota_in_bytes = 0
  @user.save

  import_attributes = {
    user_id: @user.id,
    data_source: fake_data_path('clubbing.csv'),
    updated_at: Time.now
  }
  data_import = DataImport.create(import_attributes).run_import!

  # Put the quota back before asserting.
  @user.quota_in_bytes = original_quota
  @user.save

  data_import.error_code.should == 8001
end
# With the user's table quota set to zero the import must report error 8002.
it 'raises a meaningful error if over table quota' do
  original_table_quota = @user.table_quota
  @user.table_quota = 0
  @user.save

  import_attributes = {
    user_id: @user.id,
    data_source: fake_data_path('clubbing.csv'),
    updated_at: Time.now
  }
  data_import = DataImport.create(import_attributes).run_import!

  # Put the quota back before asserting.
  @user.table_quota = original_table_quota
  @user.save

  data_import.error_code.should == 8002
end
# Copies the table created in the before hook via `table_copy` and checks
# that a user table named 'duplicated_table' exists afterwards.
it 'should allow to duplicate an existing table' do
  import_attributes = {
    user_id: @user.id,
    table_name: 'duplicated_table',
    updated_at: Time.now,
    table_copy: @table.name
  }
  data_import = DataImport.create(import_attributes).run_import!

  data_import.data_type.should eq 'query'

  copy = ::UserTable.where(id: data_import.table_id).first
  copy.should_not be_nil
  copy.name.should be == 'duplicated_table'
end
# Imports clubbing.csv, then builds a second table from a LIMIT 5 query
# over the first one.
it 'should allow to create a table from a query' do
  file_import = DataImport.create(
    user_id: @user.id,
    data_source: fake_data_path('clubbing.csv'),
    updated_at: Time.now
  ).run_import!
  file_import.state.should be == 'complete'

  query_import = DataImport.create(
    user_id: @user.id,
    table_name: 'from_query',
    updated_at: Time.now,
    from_query: "SELECT * FROM #{file_import.table_name} LIMIT 5"
  ).run_import!
  query_import.state.should be == 'complete'
  query_import.data_type.should eq 'query'

  query_table = ::UserTable.where(id: query_import.table_id).first
  query_table.should_not be_nil
  query_table.name.should be == 'from_query'
  query_table.service.records[:rows].should have(5).items
end
# Importing clubbing.csv must yield a 'clubbing' table with 10 rows.
it 'imports a simple file' do
  import_attributes = {
    user_id: @user.id,
    data_source: fake_data_path('clubbing.csv'),
    updated_at: Time.now
  }
  data_import = DataImport.create(import_attributes).run_import!

  imported_table = ::UserTable.where(id: data_import.table_id).first
  imported_table.should_not be_nil
  imported_table.name.should be == 'clubbing'
  imported_table.service.records[:rows].should have(10).items
end
# Importing the csv_with_geojson.csv importer fixture must register a user table.
it 'imports a simple file with latlon' do
  fixture_path = Rails.root.join('services/importer/spec/fixtures/csv_with_geojson.csv').to_s
  data_import = DataImport.create(
    user_id: @user.id,
    data_source: fixture_path,
    updated_at: Time.now
  ).run_import!

  imported_table = ::UserTable.where(id: data_import.table_id).first
  imported_table.should_not be_nil
end
# Importing the no_coordinate_system_dataset.gpkg fixture must yield a
# table of the same name with a single row.
it 'imports a gpkg with no coordinate system' do
  import_attributes = {
    user_id: @user.id,
    data_source: fake_data_path('no_coordinate_system_dataset.gpkg'),
    updated_at: Time.now
  }
  data_import = DataImport.create(import_attributes).run_import!

  imported_table = ::UserTable.where(id: data_import.table_id).first
  imported_table.should_not be_nil
  imported_table.name.should be == 'no_coordinate_system_dataset'
  imported_table.service.records[:rows].should have(1).items
end
# Serves clubbing.csv through the file server helper and imports it by URL,
# with the downloader's URL validation stubbed out.
it 'should allow to create a table from a url' do
  data_import = nil
  CartoDB::Importer2::Downloader.any_instance.stubs(:validate_url!).returns(true)

  csv_fixture = Rails.root.join('db/fake_data/clubbing.csv')
  serve_file(csv_fixture) do |url|
    data_import = DataImport.create(
      user_id: @user.id,
      data_source: url,
      updated_at: Time.now
    ).run_import!
  end

  imported_table = ::UserTable.where(id: data_import.table_id).first
  imported_table.should_not be_nil
  imported_table.name.should be == 'clubbing'
  imported_table.service.records[:rows].should have(10).items
end
# A queued synchronization linked to a successful import must end up in
# STATE_SUCCESS.
it 'updates synch_job state after success data import' do
  data_import = nil
  sync_job = FactoryGirl.create(:enqueued_sync)
  Carto::Synchronization.find(sync_job.id).state.should eq Carto::Synchronization::STATE_QUEUED

  CartoDB::Importer2::Downloader.any_instance.stubs(:validate_url!).returns(true)
  csv_fixture = Rails.root.join('db/fake_data/clubbing.csv')
  serve_file(csv_fixture) do |url|
    import_attributes = {
      user_id: @user.id,
      data_source: url,
      synchronization_id: sync_job.id,
      updated_at: Time.now
    }
    data_import = DataImport.create(import_attributes).run_import!
  end

  data_import.state.should eq DataImport::STATE_COMPLETE
  Carto::Synchronization.find(sync_job.id).state.should eq Carto::Synchronization::STATE_SUCCESS
end
# A queued synchronization linked to a failing import must end up in
# STATE_FAILURE.
it 'updates synch_job state after failed data import' do
  sync_job = FactoryGirl.create(:enqueued_sync)
  Carto::Synchronization.find(sync_job.id).state.should eq Carto::Synchronization::STATE_QUEUED

  import_attributes = {
    user_id: @user.id,
    data_source: 'http://mydatasource.cartodb.wadus.com/foo.csv',
    synchronization_id: sync_job.id,
    updated_at: Time.now
  }
  data_import = DataImport.create(import_attributes).run_import!

  data_import.state.should eq DataImport::STATE_FAILURE
  Carto::Synchronization.find(sync_job.id).state.should eq Carto::Synchronization::STATE_FAILURE
end
# Same as the plain URL import, but the served path carries a query string
# and the response is sent with a text/plain content type.
it 'should allow to create a table from a url with params' do
  data_import = nil
  CartoDB::Importer2::Downloader.any_instance.stubs(:validate_url!).returns(true)

  served_path = Rails.root.join('db/fake_data/clubbing.csv?param=wadus')
  serve_file(served_path, headers: { 'content-type' => 'text/plain' }) do |url|
    data_import = DataImport.create(
      user_id: @user.id,
      data_source: url,
      updated_at: Time.now
    ).run_import!
  end

  imported_table = ::UserTable.where(id: data_import.table_id).first
  imported_table.should_not be_nil
  imported_table.name.should be == 'clubbing'
  imported_table.service.records[:rows].should have(10).items
end
# Imports clubbing.csv, then builds a table from a query that selects only
# the cartodb_id column.
it "can create a table from a query selecting only the cartodb_id" do
  file_import = DataImport.create(
    user_id: @user.id,
    data_source: fake_data_path('clubbing.csv'),
    updated_at: Time.now
  ).run_import!
  file_import.state.should be == 'complete'

  query_import = DataImport.create(
    user_id: @user.id,
    table_name: 'from_query',
    updated_at: Time.now,
    from_query: "SELECT cartodb_id FROM #{file_import.table_name} LIMIT 5"
  ).run_import!
  query_import.state.should be == 'complete'

  query_table = ::UserTable.where(id: query_import.table_id).first
  query_table.should_not be_nil
  query_table.name.should be == 'from_query'
  query_table.service.records[:rows].should have(5).items
end
# Copies a CSV into an uploads directory, creates an import pointing at it
# and checks that destroying the import removes the uploaded file.
#
# The previous version asserted `Dir.exists?(file_path)`, which is always
# false for a regular file, so the example could never fail. It also used
# the return value of FileUtils.mkdir_p as the directory path.
# NOTE(review): this assumes DataImport#destroy cleans up under
# Rails.root/public/uploads — confirm against the configured uploads path.
it "should remove any uploaded files after deletion" do
  upload_dir = Rails.root.join('public', 'uploads', 'test0000000000000000')
  FileUtils.mkdir_p(upload_dir)
  file_path = upload_dir.join('wadus.csv').to_s
  FileUtils.cp Rails.root.join('db/fake_data/clubbing.csv'), file_path

  data_import = DataImport.create(
    user_id: @user.id,
    data_source: file_path,
    updated_at: Time.now
  )
  data_import.destroy

  File.exist?(file_path).should be_false
end
# When from_common_data? is stubbed to true, the persisted import must
# carry `common_data: true` in its extra options.
it 'should add a common_data extra_option' do
  DataImport.any_instance.stubs(:from_common_data?).returns(true)

  import_attributes = {
    user_id: @user.id,
    data_source: 'http://127.0.0.1/foo.csv'
  }
  data_import = DataImport.create(import_attributes)
  data_import.reload

  data_import.extra_options[:common_data].should eq true
end
# With a common_data config whose username/host match the data source URL,
# from_common_data? must be true.
it 'should know that the import is from common data' do
  common_data_config = { 'username' => 'mycommondata', 'host' => 'cartodb.wadus.com' }

  Cartodb.with_config(common_data: common_data_config) do
    data_import = DataImport.create(
      user_id: @user.id,
      data_source: 'http://mycommondata.cartodb.wadus.com/foo.csv'
    )
    data_import.from_common_data?.should eq true
  end
end
# With the common_data config set to nil, from_common_data? must be false.
it 'should not consider a import as common data if common_data config does not exist' do
  Cartodb.with_config(common_data: nil) do
    import_attributes = {
      user_id: @user.id,
      data_source: 'http://mycommondata.cartodb.wadus.com/foo.csv'
    }
    data_import = DataImport.create(import_attributes)
    data_import.from_common_data?.should eq false
  end
end
# With a common_data config that does not match the data source URL,
# from_common_data? must be false.
it 'should not consider a import as common data if common_data config does not match with url' do
  common_data_config = { 'username' => 'mycommondata', 'host' => 'cartodb.wadus.com' }

  Cartodb.with_config(common_data: common_data_config) do
    data_import = DataImport.create(
      user_id: @user.id,
      data_source: 'http://mydatasource.cartodb.wadus.com/foo.csv'
    )
    data_import.from_common_data?.should eq false
  end
end
# An import created in STATE_STUCK is marked as failed (error 6671) by the
# first call to mark_as_failed_if_stuck!; a second call returns false.
it 'mark as failure a stuck job' do
  import_attributes = {
    user_id: @user.id,
    data_source: 'http://mydatasource.cartodb.wadus.com/foo.csv',
    state: DataImport::STATE_STUCK
  }
  data_import = DataImport.create(import_attributes)

  data_import.mark_as_failed_if_stuck!.should eq true
  data_import.success.should eq false
  data_import.error_code.should eq 6671

  data_import.mark_as_failed_if_stuck!.should eq false
end
describe 'arcgis connector' do
  # Remove the any_instance stubs installed by individual examples so they
  # are cleared even when an expectation fails mid-example. The timeout stub
  # used to be removed at the end of its example with
  # `Typhoeus::Response.unstub`, i.e. on the class rather than on
  # `any_instance` where it had been installed.
  after :each do
    CartoDB::Importer2::QueryBatcher.any_instance.unstub(:execute_update)
    Typhoeus::Response.any_instance.unstub(:timed_out?)
  end

  # Creates the ArcGIS DataImport shared by every example in this group.
  def create_arcgis_import
    DataImport.create(
      user_id: @user.id,
      service_name: 'arcgis',
      service_item_id: 'https://wtf.com/arcgis/rest/services/Planning/EPI_Primary_Planning_Layers/MapServer/2'
    )
  end

  it 'should complete with a warning when the query batcher raises a timeout exception' do
    stub_arcgis_response_with_file(File.expand_path('spec/fixtures/arcgis_response_valid.json'))
    CartoDB::Importer2::QueryBatcher.any_instance
                                    .stubs(:execute_update)
                                    .raises(Sequel::DatabaseError, 'canceling statement due to statement timeout')

    data_import = create_arcgis_import
    data_import.run_import!

    data_import.state.should eq 'complete'
    data_import.log.entries.should include 'Error fixing geometries during import, skipped'
  end

  it 'should raise invalid data error when the query batcher raise any other exception' do
    stub_arcgis_response_with_file(File.expand_path('spec/fixtures/arcgis_response_valid.json'))
    CartoDB::Importer2::QueryBatcher.any_instance
                                    .stubs(:execute_update)
                                    .raises(Sequel::DatabaseError, 'GEOSisValid(): InterruptedException: Interrupted!')

    data_import = create_arcgis_import
    data_import.run_import!

    data_import.state.should eq 'failure'
    data_import.error_code.should eq 1012
  end

  it 'should import this supposed invalid dataset for ogr2ogr 2.1.1' do
    stub_arcgis_response_with_file(File.expand_path('spec/fixtures/arcgis_response_invalid.json'))

    data_import = create_arcgis_import
    data_import.run_import!

    data_import.state.should eq 'complete'
  end

  it 'fail with error 1020 if timeout' do
    # Stub is removed by the after :each hook of this group.
    Typhoeus::Response.any_instance.stubs(:timed_out?).returns(true)
    stub_arcgis_response_with_file(File.expand_path('spec/fixtures/arcgis_response_missing_ogc_fid.json'))

    data_import = create_arcgis_import
    data_import.run_import!

    data_import.state.should eq 'failure'
    data_import.error_code.should eq 1020
  end

  it 'should import files with missing ogc_fid' do
    stub_arcgis_response_with_file(File.expand_path('spec/fixtures/arcgis_response_missing_ogc_fid.json'))

    data_import = create_arcgis_import
    data_import.run_import!

    data_import.state.should eq 'complete'
  end
end
describe 'log' do
  # A freshly created import exposes a CartoDB::Log.
  it 'is initialized to a CartoDB::Log instance' do
    import_attributes = {
      user_id: @user.id,
      data_source: 'http://mydatasource.cartodb.wadus.com/foo.csv'
    }
    data_import = DataImport.create(import_attributes)

    data_import.log.should be_instance_of CartoDB::Log
  end

  # Text appended to the log is visible in its string form after saving.
  it 'allows messages to be appended' do
    import_attributes = { user_id: @user.id, table_name: 'foo', from_query: 'bogus' }
    data_import = DataImport.new(import_attributes)

    data_import.log.append('sample message')
    data_import.save

    data_import.log.to_s.should =~ /sample message/
  end

  # The log of a reloaded import matches the log of the original object.
  it 'is fetched after retrieving the data_import object from DB' do
    import_attributes = { user_id: @user.id, table_name: 'foo', from_query: 'bogus' }
    data_import = DataImport.new(import_attributes)
    data_import.log.append('sample message')

    # Logs are only persisted at checkpoints or by certain operations, so
    # store explicitly before saving.
    data_import.log.store
    data_import.save

    reloaded_import = DataImport[id: data_import.id]
    data_import.log.to_s.should == reloaded_import.log.to_s
  end

  # Appending to the log and saving must leave the `logger` column untouched.
  it 'will not overwrite an existing logger field' do
    import_attributes = { user_id: @user.id, table_name: 'foo', from_query: 'bogus' }
    data_import = DataImport.new(import_attributes)
    data_import.save

    # Set the value both on the object and directly on the row.
    data_import.logger = 'existing log'
    data_import.this.update(logger: 'existing log')
    data_import.logger.should == 'existing log'

    data_import.log.append('sample message')
    data_import.log.to_s.should =~ /sample message/
    data_import.save

    data_import.logger.should == 'existing log'
  end
end
context 'viewer users' do
  # Reset the viewer flag set by the example below.
  after(:each) do
    @user.viewer = false
    @user.save
  end

  # Saving an import owned by a viewer user must fail validation.
  it "can't create new data imports" do
    @user.viewer = true
    @user.save

    import_attributes = {
      user_id: @user.id,
      table_name: 'fromviewer',
      from_query: 'fromviewer_q'
    }
    data_import = DataImport.new(import_attributes)

    expect { data_import.save }.to raise_error(
      Sequel::ValidationFailed,
      "user Viewer users can't create data imports"
    )
  end
end
end