404 lines
16 KiB
Ruby
404 lines
16 KiB
Ruby
require_relative '../../../../spec/spec_helper_min'
|
|
require_relative '../../lib/importer/downloader'
|
|
require_relative '../../../../lib/carto/url_validator'
|
|
require_relative '../../../../spec/helpers/file_server_helper'
|
|
|
|
include CartoDB::Importer2
|
|
include FileServerHelper
|
|
|
|
describe Downloader do
|
|
# Per-example fixtures. Every remote URL is paired with the local fixture
# path that the examples hand to stub_download alongside it.
before do
  # ZIP archive addressed by a plain HTTP URL.
  @file_url =
    "http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/110m/physical/ne_110m_lakes.zip"
  @file_filepath = path_to('ne_110m_lakes.zip')

  # URL whose last path segment carries no extension.
  @file_url_without_extension = "http://www.example.com/foowithoutextension"
  @file_filepath_without_extension = path_to('foowithoutextension')

  # URL ending in .xml; the examples serve it with a text/csv Content-Type.
  @file_url_with_wrong_extension = "http://www.example.com/csvwithwrongextension.xml"
  @file_filepath_with_wrong_extension = path_to('csvwithwrongextension.xml')

  # Query-style URL with no file name in its path.
  @fusion_tables_url =
    "https://www.google.com/fusiontables/exporttable?query=select+*+from+1dimNIKKwROG1yTvJ6JlMm4-B4LxMs2YbncM4p9g"
  @fusion_tables_filepath = path_to('forest_change.csv')

  # FTP resource.
  @ftp_url = "ftp://ftp.nlm.nih.gov/nlmdata/sample/INDEX"
  @ftp_filepath = path_to('INDEX.txt')
end
|
|
|
|
# A single user is created once for the whole example group and destroyed
# after the last example has run.
before(:all) do
  @user = FactoryGirl.create(:carto_user)
end

after(:all) do
  @user.destroy
end
|
|
|
|
describe '#run' do
|
|
# Smoke test: after running against a stubbed download, the path reported by
# source_file must exist on disk.
it 'downloads a file from a url' do
  stub_download(url: @file_url, filepath: @file_filepath)

  downloader = Downloader.new(@user.id, @file_url)
  downloader.run

  # File.exist? replaces File.exists?, which is deprecated and was removed
  # in Ruby 3.2.
  File.exist?(downloader.source_file.fullpath).should eq true
end
|
|
|
|
# The stub is asked for content_disposition: false (presumably omitting the
# Content-Disposition header -- confirm in stub_download), and the expected
# name is the URL's file name without its extension.
it 'extracts the source_file name from the URL' do
  stub_download(
    url: @file_url,
    filepath: @file_filepath,
    content_disposition: false
  )

  zip_downloader = Downloader.new(@user.id, @file_url).tap(&:run)

  zip_downloader.source_file.name.should eq 'ne_110m_lakes'
end
|
|
|
|
# Signed S3-style URL: the expected name ignores the query string.
it 'extracts the source_file name from the URL for S3 actual paths' do
  signed_url = 'http://s3.amazonaws.com/com.cartodb.imports.staging/XXXXXXXXXXXXXXXXXXXX/ne_110m_lakes.csv' \
               '?AWSAccessKeyId=XXXXXXXXXXXXXXXXXXXX&Expires=1461934764&Signature=XXXXXXXXXXXXXXXXXXXXXXXXXXM%3D'
  stub_download(
    url: signed_url,
    filepath: @file_filepath,
    content_disposition: false
  )

  s3_downloader = Downloader.new(@user.id, signed_url)
  s3_downloader.run

  s3_downloader.source_file.name.should eq 'ne_110m_lakes'
end
|
|
|
|
# Same S3-style path as the signed-URL example, but without a query string.
it 'extracts the source_file name from the URL for S3 paths without extra parameters' do
  plain_s3_url = "http://s3.amazonaws.com/com.cartodb.imports.staging/XXXXXXXXXXXXXXXXXXXX/ne_110m_lakes.csv"
  stub_download(
    url: plain_s3_url,
    filepath: @file_filepath,
    content_disposition: false
  )

  s3_downloader = Downloader.new(@user.id, plain_s3_url).tap(&:run)

  s3_downloader.source_file.name.should eq 'ne_110m_lakes'
end
|
|
|
|
# For a ".gdb.zip" URL only the trailing ".zip" is expected to be dropped
# from the source file name.
it 'extracts the source_file name from the URL for FGDB ZIP files' do
  fgdb_url = "http://s3.amazonaws.com/filegeodatabase.gdb.zip"
  stub_download(
    url: fgdb_url,
    filepath: @file_filepath,
    content_disposition: false
  )

  fgdb_downloader = Downloader.new(@user.id, fgdb_url)
  fgdb_downloader.run

  fgdb_downloader.source_file.name.should eq 'filegeodatabase.gdb'
end
|
|
|
|
# An extensionless URL served as text/csv is expected to get a .csv file name.
it 'uses Content-Type header for files without extension' do
  csv_headers = { 'Content-Type' => 'text/csv' }
  stub_download(
    url: @file_url_without_extension,
    filepath: @file_filepath_without_extension,
    headers: csv_headers
  )

  downloader = Downloader.new(@user.id, @file_url_without_extension).tap(&:run)

  downloader.source_file.filename.should eq 'foowithoutextension.csv'
end
|
|
|
|
# With application/octet-stream as the Content-Type, the extensionless file
# name is expected to be kept untouched.
it 'uses file name for file without extension and with unknown Content-Type header' do
  binary_headers = { 'Content-Type' => 'application/octet-stream' }
  stub_download(url: @file_url_without_extension,
                filepath: @file_filepath_without_extension,
                headers: binary_headers)

  extensionless = Downloader.new(@user.id, @file_url_without_extension)
  extensionless.run

  extensionless.source_file.filename.should eq 'foowithoutextension'
end
|
|
|
|
# With application/octet-stream as the Content-Type, the .csv extension taken
# from the URL is expected to be kept.
it 'uses file name for file with extension and with unknown Content-Type header' do
  csv_url = "http://www.example.com/ngos.csv"
  csv_fixture = path_to('ngos.csv')
  binary_headers = { 'Content-Type' => 'application/octet-stream' }

  stub_download(url: csv_url, filepath: csv_fixture, headers: binary_headers)

  csv_downloader = Downloader.new(@user.id, csv_url).tap(&:run)

  csv_downloader.source_file.filename.should eq 'ngos.csv'
end
|
|
|
|
# Checks the non-public content_type reader: the ";charset=..." parameter
# must not be part of the reported type.
it 'ignores extra type parameters in Content-Type header' do
  excel_headers = { 'Content-Type' => 'vnd.ms-excel;charset=UTF-8' }
  stub_download(
    url: @file_url_without_extension,
    filepath: @file_filepath_without_extension,
    headers: excel_headers
  )

  downloader = Downloader.new(@user.id, @file_url_without_extension).tap(&:run)

  # Invoked through send rather than as a regular method call.
  downloader.send(:content_type).should eq 'vnd.ms-excel'
end
|
|
|
|
# A ".xml" URL served as text/csv is expected to end up with a .csv file name.
it 'uses Content-Type header extension for files with different extension' do
  csv_headers = { 'Content-Type' => 'text/csv' }
  stub_download(url: @file_url_with_wrong_extension,
                filepath: @file_filepath_with_wrong_extension,
                headers: csv_headers)

  mislabelled = Downloader.new(@user.id, @file_url_with_wrong_extension)
  mislabelled.run

  mislabelled.source_file.filename.should eq 'csvwithwrongextension.csv'
end
|
|
|
|
# NOTE(review): this uses the same URL, fixture and Content-Type as the
# "different extension" example; the locals are named after what they hold.
it 'sets the right file extension for file without extension in a multi extension Content-Type' do
  xml_named_url = "http://www.example.com/csvwithwrongextension.xml"
  xml_named_fixture = path_to('csvwithwrongextension.xml')
  csv_headers = { 'Content-Type' => 'text/csv' }

  stub_download(url: xml_named_url, filepath: xml_named_fixture, headers: csv_headers)

  downloader = Downloader.new(@user.id, xml_named_url).tap(&:run)

  downloader.source_file.filename.should eq 'csvwithwrongextension.csv'
end
|
|
|
|
# A ".csv.gz" URL served as application/x-gzip is expected to keep its full
# double extension.
it 'uses the right file extension based in a multiple file extension Content-Type scenario' do
  gzipped_csv_url = "http://www.example.com/ok_data.csv.gz"
  gzipped_csv_fixture = path_to('ok_data.csv.gz')
  gzip_headers = { 'Content-Type' => 'application/x-gzip' }

  stub_download(url: gzipped_csv_url, filepath: gzipped_csv_fixture, headers: gzip_headers)

  gz_downloader = Downloader.new(@user.id, gzipped_csv_url)
  gz_downloader.run

  gz_downloader.source_file.filename.should eq 'ok_data.csv.gz'
end
|
|
|
|
# A text/plain Content-Type must not replace the .geojson extension from the URL.
it 'uses the geojson extension if the header is text/plain' do
  geojson_url = "http://www.example.com/tm_world_borders_simpl_0_8.geojson"
  geojson_fixture = path_to('tm_world_borders_simpl_0_8.geojson')
  plain_text_headers = { 'Content-Type' => 'text/plain' }

  stub_download(url: geojson_url, filepath: geojson_fixture, headers: plain_text_headers)

  geojson_downloader = Downloader.new(@user.id, geojson_url).tap(&:run)

  geojson_downloader.source_file.filename.should eq 'tm_world_borders_simpl_0_8.geojson'
end
|
|
|
|
# A text/plain Content-Type must not replace the .kml extension from the URL.
it 'uses the kml extension if the header is text/plain' do
  kml_url = "http://www.example.com/abandoned.kml"
  kml_fixture = path_to('abandoned.kml')
  plain_text_headers = { 'Content-Type' => 'text/plain' }

  stub_download(url: kml_url, filepath: kml_fixture, headers: plain_text_headers)

  kml_downloader = Downloader.new(@user.id, kml_url)
  kml_downloader.run

  kml_downloader.source_file.filename.should eq 'abandoned.kml'
end
|
|
|
|
# The URL path holds no usable file name, so the expected name matches the
# fixture's. NOTE(review): presumably stub_download emits a
# Content-Disposition header built from filepath by default -- confirm.
it 'extracts the source_file name from Content-Disposition header' do
  stub_download(url: @fusion_tables_url, filepath: @fusion_tables_filepath)

  fusion_downloader = Downloader.new(@user.id, @fusion_tables_url).tap(&:run)

  fusion_downloader.source_file.name.should eq 'forest_change'
end
|
|
|
|
# The same flow must work when the URL uses the ftp:// scheme.
it 'supports FTP urls' do
  stub_download(
    url: @ftp_url,
    filepath: @ftp_filepath
  )

  ftp_downloader = Downloader.new(@user.id, @ftp_url).tap(&:run)

  ftp_downloader.source_file.name.should eq 'INDEX'
end
|
|
|
|
# Serves fixtures with non-ASCII file names via serve_file and checks that
# the name survives the download. URL validation is stubbed out.
it 'supports accented URLs' do
  CartoDB::Importer2::Downloader.any_instance.stubs(:validate_url!).returns(true)

  # fixture path => expected source_file name
  expected_names = {
    'spec/fixtures/política_agraria_común.csv' => 'política_agraria_común',
    'spec/fixtures/many_characters_áÁñÑçÇàÀ.csv' => 'many_characters_áÁñÑçÇàÀ'
  }

  expected_names.each do |fixture_path, expected_name|
    serve_file fixture_path do |served_url|
      accented = Downloader.new(@user.id, served_url)
      accented.run
      accented.source_file.name.should eq expected_name
    end
  end
end
|
|
|
|
# A URL that already contains percent-escapes must be stored unchanged in
# @translated_url. No download is run.
it 'does not break urls with % on it' do
  # INFO: this URL is made up
  percent_encoded_url = 'https://s3.amazonaws.com/com.cartodb.imports.staging/03b0c2199fc814ceeb75/a_file.zip?AWSAccessKeyId=AKIAIUI5FFFJIRAMEEMA&Expires=1433349484&Signature=t6m%2Bji%2BlKsnrOVqPsptXajPiozw%3D'

  translated_url = Downloader.new(@user.id, percent_encoded_url).instance_variable_get("@translated_url")

  translated_url.should eq percent_encoded_url
end
|
|
|
|
# A local path with accents, a space and a literal % must be stored
# unchanged in @translated_url. No download is run.
it 'does not break local filenames with special characters on it' do
  # INFO: this is a made-up local path, not a URL
  special_chars_path = '/public/uploads/tést file%.csv'

  translated_url = Downloader.new(@user.id, special_chars_path).instance_variable_get("@translated_url")

  translated_url.should eq special_chars_path
end
|
|
|
|
# The stubbed response and the Downloader are given the same ETag, so the
# resource must be reported as not modified.
it "doesn't download the file if ETag hasn't changed" do
  unchanged_etag = 'bogus'
  stub_download(url: @file_url,
                filepath: @file_filepath,
                headers: { "ETag" => unchanged_etag })

  cached_downloader = Downloader.new(@user.id, @file_url, etag: unchanged_etag).tap(&:run)

  cached_downloader.modified?.should be_false
end
|
|
|
|
# A download stubbed through stub_failed_download must surface as DownloadError.
it "raises if remote URL doesn't respond with a 2XX code" do
  stub_failed_download(url: @file_url, filepath: @file_filepath, headers: {})

  failing_downloader = Downloader.new(@user.id, @file_url)

  expect { failing_downloader.run }.to raise_error(DownloadError)
end
|
|
|
|
# Forces every Typhoeus response to report a :partial_file return code and
# expects PartialDownloadError.
it "raises if download fails with partial file error" do
  stub_download(url: @file_url, filepath: @file_filepath, headers: {})

  # NOTE(review): `mock` is stubbed to false presumably so the response is
  # not treated as a stubbed one and return_code is consulted -- confirm.
  any_response = Typhoeus::Response.any_instance
  any_response.stubs(:mock).returns(false)
  any_response.stubs(:return_code).returns(:partial_file)

  truncated_downloader = Downloader.new(@user.id, @file_url)

  expect { truncated_downloader.run }.to raise_error(PartialDownloadError)
end
|
|
|
|
# #etag must report the ETag header of the stubbed response; the example
# never calls run explicitly.
describe '#etag' do
  it "reads etag from download" do
    served_etag = 'whatever'
    stub_download(url: @file_url,
                  filepath: @file_filepath,
                  headers: { "ETag" => served_etag })

    Downloader.new(@user.id, @file_url).etag.should eq served_etag
  end
end
|
|
|
|
# Size-limit enforcement. The shared user's max_import_file_size is lowered to
# 1024 for this group and restored afterwards.
describe('#quota_checks') do
  before(:all) do
    @old_max_import_file_size = @user.max_import_file_size
    @user.max_import_file_size = 1024
    @user.save
  end

  after(:all) do
    @user.max_import_file_size = @old_max_import_file_size
    @user.save
  end

  it 'raises when file size is bigger than available quota before download' do
    CartoDB::Importer2::Downloader.any_instance.stubs(:validate_url!).returns(true)

    serve_file 'spec/support/data/ne_110m_lakes.zip' do |served_url|
      oversized = Downloader.new(@user.id, served_url)

      expect { oversized.run }.to raise_error(CartoDB::Importer2::StorageQuotaExceededError)
    end
  end

  it 'raises when file size is bigger than available quota during download' do
    downloader_instances = CartoDB::Importer2::Downloader.any_instance
    downloader_instances.stubs(:validate_url!).returns(true)

    # With `content_length` stubbed, the size cannot be taken from the
    # response headers, which forces the check to happen while the chunks are
    # being counted during the download itself.
    downloader_instances.stubs(:content_length)

    serve_file 'spec/support/data/ne_110m_lakes.zip' do |served_url|
      oversized = Downloader.new(@user.id, served_url)

      expect { oversized.run }.to raise_error(CartoDB::Importer2::FileTooBigError)
    end
  end
end
|
|
end
|
|
|
|
describe '#source_file' do
|
|
# Before run is called there must be no source file yet.
it 'returns nil if no download initiated' do
  idle_downloader = Downloader.new(@user.id, @file_url)

  idle_downloader.source_file.should_not be
end
|
|
|
|
# When given a filesystem path instead of a URL, the source file must point
# at that same path.
it 'returns a source file based on the path if passed a file path' do
  local_path = '/foo/bar'

  local_downloader = Downloader.new(@user.id, local_path).tap(&:run)

  local_downloader.source_file.fullpath.should eq '/foo/bar'
end
|
|
|
|
# Served through serve_file with URL validation stubbed out; the name is
# expected without the .zip extension.
it 'returns a source_file name' do
  CartoDB::Importer2::Downloader.any_instance.stubs(:validate_url!).returns(true)

  serve_file 'spec/support/data/ne_110m_lakes.zip' do |served_url|
    served_downloader = Downloader.new(@user.id, served_url).tap(&:run)

    served_downloader.source_file.name.should eq 'ne_110m_lakes'
  end
end
|
|
|
|
# The downloaded file's full path must contain the file name taken from
# @file_url ('ne_110m_lakes.zip').
it 'returns a local filepath' do
  CartoDB::Importer2::Downloader.any_instance.stubs(:validate_url!).returns(true)

  serve_file 'spec/support/data/ne_110m_lakes.zip' do |url|
    downloader = Downloader.new(@user.id, url)
    downloader.run

    # Escape the name so that '.' only matches a literal dot, and parenthesise
    # the argument so the regexp literal is not parsed ambiguously.
    expected_basename = Regexp.escape(File.basename(@file_url))
    downloader.source_file.fullpath.should match(/#{expected_basename}/)
  end
end
|
|
end
|
|
|
|
describe '#name inference' do
|
|
# Feeds several Content-Disposition variants through the non-public
# set_headers and checks the resulting @filename: a quoted name, an unquoted
# name, and a name followed by another parameter.
it 'gets the file name from the Content-Disposition header if present' do
  with_trailing_parameter = "attachment; filename=map_gaudi3d.geojson; " +
                            'modification-date="Tue, 06 Aug 2013 15:05:35 GMT'

  # [header value, expected file name]
  cases = [
    [%{attachment; filename="bar.csv"}, 'bar.csv'],
    [%{attachment; filename=bar.csv}, 'bar.csv'],
    [with_trailing_parameter, 'map_gaudi3d.geojson']
  ]

  cases.each do |disposition, expected_filename|
    headers = { "Content-Disposition" => disposition }
    downloader = Downloader.new(@user.id, @file_url, headers)
    downloader.send(:set_headers, headers)

    downloader.instance_variable_get(:@filename).should eq expected_filename
  end
end
|
|
|
|
# With empty headers, @filename must be the URL's last path segment.
it 'gets the file name from the URL if no Content-Disposition header' do
  no_headers = {}
  downloader = Downloader.new(@user.id, @file_url)

  downloader.send(:set_headers, no_headers)

  downloader.instance_variable_get(:@filename).should eq 'ne_110m_lakes.zip'
end
|
|
|
|
# The file name sits between '&' separators rather than in a regular path
# segment or query value; it must still be found.
it 'gets the file name from the URL if no Content-Disposition header and custom params schema is used' do
  ampersand_url = "https://manolo.escobar.es/param&myfilenameparam&zip_file.csv.zip&otherinfo"
  no_headers = {}

  downloader = Downloader.new(@user.id, ampersand_url)
  downloader.send(:set_headers, no_headers)

  downloader.instance_variable_get(:@filename).should eq 'zip_file.csv.zip'
end
|
|
|
|
# Neither the URL nor the (empty) headers contain anything that looks like a
# file name, yet @filename must still be set to something.
it 'uses random name in no name can be found in url or http headers' do
  # The third segment is "&nothing"; it had been mangled into "¬hing" by an
  # HTML-entity decode of "&not".
  empty_url = "https://manolo.escobar.es/param&myfilenameparam&nothing&otherinfo"

  downloader = Downloader.new(@user.id, empty_url)
  downloader.send(:set_headers, Hash.new)

  downloader.instance_variable_get(:@filename).should_not eq nil
end
|
|
|
|
# A query string appended to the URL must not leak into @filename.
it 'discards url query params' do
  url_with_query = "#{@file_url}?foo=bar&woo=wee"
  no_headers = {}

  downloader = Downloader.new(@user.id, url_with_query)
  downloader.send(:set_headers, no_headers)

  downloader.instance_variable_get(:@filename).should eq 'ne_110m_lakes.zip'
end
|
|
|
|
# The full ".xlsx" extension must be kept. NOTE(review): presumably guards
# against matching a shorter known extension such as ".xls" -- confirm.
it 'matches longer extension available from filename' do
  xlsx_url = "https://cartofante.net/my_file.xlsx"
  no_headers = {}

  downloader = Downloader.new(@user.id, xlsx_url)
  downloader.send(:set_headers, no_headers)

  downloader.instance_variable_get(:@filename).should eq 'my_file.xlsx'
end
|
|
end
|
|
|
|
# Builds the path of a fixture file, relative to this spec's directory.
#
# @param filename [String] name of a file inside ../fixtures
# @return [String] "<dir of this file>/../fixtures/<filename>"
def path_to(filename)
  fixtures_dir = File.join(File.dirname(__FILE__), '..', 'fixtures')
  File.join(fixtures_dir, filename)
end
|
|
end
|