cartodb-4.42/services/importer/spec/unit/downloader_spec.rb
2024-04-06 05:25:13 +00:00

404 lines
16 KiB
Ruby

require_relative '../../../../spec/spec_helper_min'
require_relative '../../lib/importer/downloader'
require_relative '../../../../lib/carto/url_validator'
require_relative '../../../../spec/helpers/file_server_helper'
include CartoDB::Importer2
include FileServerHelper
describe Downloader do
before do
@file_url =
"http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/110m/physical/ne_110m_lakes.zip"
@file_filepath = path_to('ne_110m_lakes.zip')
@file_url_without_extension = "http://www.example.com/foowithoutextension"
@file_filepath_without_extension = path_to('foowithoutextension')
@file_url_with_wrong_extension = "http://www.example.com/csvwithwrongextension.xml"
@file_filepath_with_wrong_extension = path_to('csvwithwrongextension.xml')
@fusion_tables_url =
"https://www.google.com/fusiontables/exporttable" +
"?query=select+*+from+1dimNIKKwROG1yTvJ6JlMm4-B4LxMs2YbncM4p9g"
@fusion_tables_filepath = path_to('forest_change.csv')
@ftp_url = "ftp://ftp.nlm.nih.gov/nlmdata/sample/INDEX"
@ftp_filepath = path_to('INDEX.txt')
end
before(:all) { @user = FactoryGirl.create(:carto_user) }
after(:all) { @user.destroy }
describe '#run' do
it 'downloads a file from a url' do
stub_download(url: @file_url, filepath: @file_filepath)
downloader = Downloader.new(@user.id, @file_url)
downloader.run
File.exists?(downloader.source_file.fullpath).should eq true
end
it 'extracts the source_file name from the URL' do
stub_download(url: @file_url, filepath: @file_filepath, content_disposition: false)
downloader = Downloader.new(@user.id, @file_url)
downloader.run
downloader.source_file.name.should eq 'ne_110m_lakes'
end
it 'extracts the source_file name from the URL for S3 actual paths' do
url = 'http://s3.amazonaws.com/com.cartodb.imports.staging/XXXXXXXXXXXXXXXXXXXX/ne_110m_lakes.csv' +
'?AWSAccessKeyId=XXXXXXXXXXXXXXXXXXXX&Expires=1461934764&Signature=XXXXXXXXXXXXXXXXXXXXXXXXXXM%3D'
stub_download(url: url, filepath: @file_filepath, content_disposition: false)
downloader = Downloader.new(@user.id, url)
downloader.run
downloader.source_file.name.should eq 'ne_110m_lakes'
end
it 'extracts the source_file name from the URL for S3 paths without extra parameters' do
url = "http://s3.amazonaws.com/com.cartodb.imports.staging/XXXXXXXXXXXXXXXXXXXX/ne_110m_lakes.csv"
stub_download(url: url, filepath: @file_filepath, content_disposition: false)
downloader = Downloader.new(@user.id, url)
downloader.run
downloader.source_file.name.should eq 'ne_110m_lakes'
end
it 'extracts the source_file name from the URL for FGDB ZIP files' do
url = "http://s3.amazonaws.com/filegeodatabase.gdb.zip"
stub_download(url: url, filepath: @file_filepath, content_disposition: false)
downloader = Downloader.new(@user.id, url)
downloader.run
downloader.source_file.name.should eq 'filegeodatabase.gdb'
end
it 'uses Content-Type header for files without extension' do
stub_download(url: @file_url_without_extension, filepath: @file_filepath_without_extension, headers: { 'Content-Type' => 'text/csv' })
downloader = Downloader.new(@user.id, @file_url_without_extension)
downloader.run
downloader.source_file.filename.should eq 'foowithoutextension.csv'
end
it 'uses file name for file without extension and with unknown Content-Type header' do
stub_download(
url: @file_url_without_extension,
filepath: @file_filepath_without_extension,
headers: { 'Content-Type' => 'application/octet-stream' }
)
downloader = Downloader.new(@user.id, @file_url_without_extension)
downloader.run
downloader.source_file.filename.should eq 'foowithoutextension'
end
it 'uses file name for file with extension and with unknown Content-Type header' do
url_csv_with_extension = "http://www.example.com/ngos.csv"
csv_filepath_with_extension = path_to('ngos.csv')
stub_download(
url: url_csv_with_extension,
filepath: csv_filepath_with_extension,
headers: { 'Content-Type' => 'application/octet-stream' }
)
downloader = Downloader.new(@user.id, url_csv_with_extension)
downloader.run
downloader.source_file.filename.should eq 'ngos.csv'
end
it 'ignores extra type parameters in Content-Type header' do
stub_download(url: @file_url_without_extension, filepath: @file_filepath_without_extension, headers: { 'Content-Type' => 'vnd.ms-excel;charset=UTF-8' })
downloader = Downloader.new(@user.id, @file_url_without_extension)
downloader.run
downloader.send(:content_type).should eq 'vnd.ms-excel'
end
it 'uses Content-Type header extension for files with different extension' do
stub_download(
url: @file_url_with_wrong_extension,
filepath: @file_filepath_with_wrong_extension,
headers: { 'Content-Type' => 'text/csv' }
)
downloader = Downloader.new(@user.id, @file_url_with_wrong_extension)
downloader.run
downloader.source_file.filename.should eq 'csvwithwrongextension.csv'
end
it 'sets the right file extension for file without extension in a multi extension Content-Type' do
url_tgz_without_extension = "http://www.example.com/csvwithwrongextension.xml"
tgz_filepath_without_extension = path_to('csvwithwrongextension.xml')
stub_download(
url: url_tgz_without_extension,
filepath: tgz_filepath_without_extension,
headers: { 'Content-Type' => 'text/csv' }
)
downloader = Downloader.new(@user.id, url_tgz_without_extension)
downloader.run
downloader.source_file.filename.should eq 'csvwithwrongextension.csv'
end
it 'uses the right file extension based in a multiple file extension Content-Type scenario' do
url_tgz_without_extension = "http://www.example.com/ok_data.csv.gz"
tgz_filepath_without_extension = path_to('ok_data.csv.gz')
stub_download(
url: url_tgz_without_extension,
filepath: tgz_filepath_without_extension,
headers: { 'Content-Type' => 'application/x-gzip' }
)
downloader = Downloader.new(@user.id, url_tgz_without_extension)
downloader.run
downloader.source_file.filename.should eq 'ok_data.csv.gz'
end
it 'uses the geojson extension if the header is text/plain' do
url_geojson = "http://www.example.com/tm_world_borders_simpl_0_8.geojson"
filepath_geojson = path_to('tm_world_borders_simpl_0_8.geojson')
stub_download(
url: url_geojson,
filepath: filepath_geojson,
headers: { 'Content-Type' => 'text/plain' }
)
downloader = Downloader.new(@user.id, url_geojson)
downloader.run
downloader.source_file.filename.should eq 'tm_world_borders_simpl_0_8.geojson'
end
it 'uses the kml extension if the header is text/plain' do
url_kml = "http://www.example.com/abandoned.kml"
filepath_kml = path_to('abandoned.kml')
stub_download(
url: url_kml,
filepath: filepath_kml,
headers: { 'Content-Type' => 'text/plain' }
)
downloader = Downloader.new(@user.id, url_kml)
downloader.run
downloader.source_file.filename.should eq 'abandoned.kml'
end
it 'extracts the source_file name from Content-Disposition header' do
stub_download(
url: @fusion_tables_url,
filepath: @fusion_tables_filepath
)
downloader = Downloader.new(@user.id, @fusion_tables_url)
downloader.run
downloader.source_file.name.should eq 'forest_change'
end
it 'supports FTP urls' do
stub_download(url: @ftp_url, filepath: @ftp_filepath)
downloader = Downloader.new(@user.id, @ftp_url)
downloader.run
downloader.source_file.name.should eq 'INDEX'
end
it 'supports accented URLs' do
CartoDB::Importer2::Downloader.any_instance.stubs(:validate_url!).returns(true)
[
{ url: 'spec/fixtures/política_agraria_común.csv', name: 'política_agraria_común' },
{ url: 'spec/fixtures/many_characters_áÁñÑçÇàÀ.csv', name: 'many_characters_áÁñÑçÇàÀ' }
].each do |url_and_name|
serve_file url_and_name[:url] do |url|
downloader = Downloader.new(@user.id, url)
downloader.run
downloader.source_file.name.should eq url_and_name[:name]
end
end
end
it 'does not break urls with % on it' do
# INFO: notice this URL is fake
url_with_percentage = 'https://s3.amazonaws.com/com.cartodb.imports.staging/03b0c2199fc814ceeb75/a_file.zip?AWSAccessKeyId=AKIAIUI5FFFJIRAMEEMA&Expires=1433349484&Signature=t6m%2Bji%2BlKsnrOVqPsptXajPiozw%3D'
downloader = Downloader.new(@user.id, url_with_percentage)
downloader.instance_variable_get("@translated_url").should == url_with_percentage
end
it 'does not break local filenames with special characters on it' do
# INFO: notice this URL is fake
path_with_percentage = '/public/uploads/tést file%.csv'
downloader = Downloader.new(@user.id, path_with_percentage)
downloader.instance_variable_get("@translated_url").should == path_with_percentage
end
it "doesn't download the file if ETag hasn't changed" do
etag = 'bogus'
stub_download(
url: @file_url,
filepath: @file_filepath,
headers: { "ETag" => etag }
)
downloader = Downloader.new(@user.id, @file_url, etag: etag)
downloader.run
downloader.modified?.should be_false
end
it "raises if remote URL doesn't respond with a 2XX code" do
stub_failed_download(
url: @file_url,
filepath: @file_filepath,
headers: {}
)
downloader = Downloader.new(@user.id, @file_url)
lambda { downloader.run }.should raise_error DownloadError
end
it "raises if download fails with partial file error" do
stub_download(
url: @file_url,
filepath: @file_filepath,
headers: {}
)
Typhoeus::Response.any_instance.stubs(:mock).returns(false)
Typhoeus::Response.any_instance.stubs(:return_code).returns(:partial_file)
downloader = Downloader.new(@user.id, @file_url)
lambda { downloader.run }.should raise_error PartialDownloadError
end
describe '#etag' do
it "reads etag from download" do
etag = 'whatever'
stub_download(
url: @file_url,
filepath: @file_filepath,
headers: { "ETag" => etag }
)
downloader = Downloader.new(@user.id, @file_url)
downloader.etag.should eq etag
end
end
describe('#quota_checks') do
before(:all) do
@old_max_import_file_size = @user.max_import_file_size
@user.max_import_file_size = 1024
@user.save
end
after(:all) do
@user.max_import_file_size = @old_max_import_file_size
@user.save
end
it 'raises when file size is bigger than available quota before download' do
CartoDB::Importer2::Downloader.any_instance.stubs(:validate_url!).returns(true)
serve_file 'spec/support/data/ne_110m_lakes.zip' do |url|
downloader = Downloader.new(@user.id, url)
expect { downloader.run }.to raise_error(CartoDB::Importer2::StorageQuotaExceededError)
end
end
it 'raises when file size is bigger than available quota during download' do
CartoDB::Importer2::Downloader.any_instance.stubs(:validate_url!).returns(true)
# We stub the `content_length` so to simulate a situation where we can't infer the
# file size from the headers, and we're forced to do it counting chunk sizes during
# download time.
CartoDB::Importer2::Downloader.any_instance.stubs(:content_length)
serve_file 'spec/support/data/ne_110m_lakes.zip' do |url|
downloader = Downloader.new(@user.id, url)
expect { downloader.run }.to raise_error(CartoDB::Importer2::FileTooBigError)
end
end
end
end
describe '#source_file' do
it 'returns nil if no download initiated' do
downloader = Downloader.new(@user.id, @file_url)
downloader.source_file.should_not be
end
it 'returns a source file based on the path if passed a file path' do
downloader = Downloader.new(@user.id, '/foo/bar')
downloader.run
downloader.source_file.fullpath.should eq '/foo/bar'
end
it 'returns a source_file name' do
CartoDB::Importer2::Downloader.any_instance.stubs(:validate_url!).returns(true)
serve_file 'spec/support/data/ne_110m_lakes.zip' do |url|
downloader = Downloader.new(@user.id, url)
downloader.run
downloader.source_file.name.should eq 'ne_110m_lakes'
end
end
it 'returns a local filepath' do
CartoDB::Importer2::Downloader.any_instance.stubs(:validate_url!).returns(true)
serve_file 'spec/support/data/ne_110m_lakes.zip' do |url|
downloader = Downloader.new(@user.id, url)
downloader.run
downloader.source_file.fullpath.should match /#{@file_url.split('/').last}/
end
end
end
describe '#name inference' do
it 'gets the file name from the Content-Disposition header if present' do
headers = { "Content-Disposition" => %{attachment; filename="bar.csv"} }
downloader = Downloader.new(@user.id, @file_url, headers)
downloader.send(:set_headers, headers)
downloader.instance_variable_get(:@filename).should eq 'bar.csv'
headers = { "Content-Disposition" => %{attachment; filename=bar.csv} }
downloader = Downloader.new(@user.id, @file_url, headers)
downloader.send(:set_headers, headers)
downloader.instance_variable_get(:@filename).should eq 'bar.csv'
disposition = "attachment; filename=map_gaudi3d.geojson; " +
'modification-date="Tue, 06 Aug 2013 15:05:35 GMT'
headers = { "Content-Disposition" => disposition }
downloader = Downloader.new(@user.id, @file_url, headers)
downloader.send(:set_headers, headers)
downloader.instance_variable_get(:@filename).should eq 'map_gaudi3d.geojson'
end
it 'gets the file name from the URL if no Content-Disposition header' do
downloader = Downloader.new(@user.id, @file_url)
downloader.send(:set_headers, Hash.new)
downloader.instance_variable_get(:@filename).should eq 'ne_110m_lakes.zip'
end
it 'gets the file name from the URL if no Content-Disposition header and custom params schema is used' do
hard_url = "https://manolo.escobar.es/param&myfilenameparam&zip_file.csv.zip&otherinfo"
downloader = Downloader.new(@user.id, hard_url)
downloader.send(:set_headers, Hash.new)
downloader.instance_variable_get(:@filename).should eq 'zip_file.csv.zip'
end
it 'uses random name in no name can be found in url or http headers' do
empty_url = "https://manolo.escobar.es/param&myfilenameparam&nothing&otherinfo"
downloader = Downloader.new(@user.id, empty_url)
downloader.send(:set_headers, Hash.new)
downloader.instance_variable_get(:@filename).should_not eq nil
end
it 'discards url query params' do
downloader = Downloader.new(@user.id, "#{@file_url}?foo=bar&woo=wee")
downloader.send(:set_headers, Hash.new)
downloader.instance_variable_get(:@filename).should eq 'ne_110m_lakes.zip'
end
it 'matches longer extension available from filename' do
hard_url = "https://cartofante.net/my_file.xlsx"
downloader = Downloader.new(@user.id, hard_url)
downloader.send(:set_headers, Hash.new)
downloader.instance_variable_get(:@filename).should eq 'my_file.xlsx'
end
end
def path_to(filename)
File.join(File.dirname(__FILE__), '..', 'fixtures', filename)
end
end