# Ruby spec file (284 lines, 8.0 KiB)
require 'fileutils'
|
|
require_relative '../../../../spec/spec_helper'
|
|
require_relative '../../lib/importer/csv_normalizer'
|
|
require_relative '../doubles/log'
|
|
require_relative '../doubles/user'
|
|
|
|
include CartoDB::Importer2::Doubles
|
|
|
|
describe CartoDB::Importer2::CsvNormalizer do
|
|
|
|
# CSV fixture stored in the same directory as this spec; it is copied to a
# temp location by bug_columns_wrong_split_factory.
# NOTE(review): judging by its name it reproduces a wrong-column-split bug.
BUG_COLUMNS_WRONG_SPLIT_FIXTURE_FILE = "#{File.dirname(__FILE__)}/bug_columns_wrong_split.csv"

# A single user is created once and handed to every Log double below.
before(:all) { @user = FactoryGirl.create(:user) }

# Remove the shared user once all examples have finished.
after(:all) { @user.destroy }
|
|
|
|
describe '#run' do
  # A tab-separated fixture is first detected as tab-delimited; after #run
  # the normalizer reports a comma delimiter.
  it 'transforms the file using a proper comma delimiter' do
    fixture = tab_delimiter_factory
    begin
      csv = CartoDB::Importer2::CsvNormalizer.new(fixture, Log.new(@user))

      csv.detect_delimiter

      csv.delimiter.should eq "\t"
      csv.run
      csv.delimiter.should eq ','
    ensure
      # Always remove the temp fixture, even when an expectation fails.
      # rm_f does not raise if the file is already gone.
      FileUtils.rm_f(fixture)
    end
  end

  # A zero-byte file must be rejected with EmptyFileError.
  it 'raise if detects an empty file' do
    fixture = empty_file_factory
    begin
      csv = CartoDB::Importer2::CsvNormalizer.new(fixture, Log.new(@user))

      expect { csv.run }.to raise_exception CartoDB::Importer2::EmptyFileError
    ensure
      FileUtils.rm_f(fixture)
    end
  end
end
|
|
|
|
describe '#detect_delimiter' do
  # Every example writes a temp fixture; each one is removed in an ensure
  # clause so that neither passing nor failing runs leave files behind.

  it 'detects the delimiter' do
    fixture = tab_delimiter_factory
    begin
      csv = CartoDB::Importer2::CsvNormalizer.new(fixture, Log.new(@user))
      csv.detect_delimiter.should eq "\t"
    ensure
      FileUtils.rm_f(fixture)
    end
  end

  # Semicolons and commas inside quoted cells must not skew the detection.
  it 'detects it correctly even with quoted strings containing delimiters' do
    fixture = quoted_string_with_delimiter_factory
    begin
      csv = CartoDB::Importer2::CsvNormalizer.new(fixture, Log.new(@user))
      csv.detect_delimiter.should eq ','
    ensure
      FileUtils.rm_f(fixture)
    end
  end

  # Same as above, but the quoted cells also contain backslash-escaped quotes.
  it 'detects it correctly with escaped quotes' do
    fixture = string_with_escaped_quote_factory
    begin
      csv = CartoDB::Importer2::CsvNormalizer.new(fixture, Log.new(@user))
      csv.detect_delimiter.should eq ','
    ensure
      FileUtils.rm_f(fixture)
    end
  end

  # Uses a temp copy of the checked-in bug_columns_wrong_split.csv fixture.
  it 'detects it correctly with triple quotes, quoted strings and all' do
    fixture = bug_columns_wrong_split_factory
    begin
      csv = CartoDB::Importer2::CsvNormalizer.new(fixture, Log.new(@user))
      csv.detect_delimiter.should eq ','
    ensure
      # Only the temp copy is removed; the checked-in fixture is untouched.
      FileUtils.rm_f(fixture)
    end
  end
end
|
|
|
|
describe '#encoding' do
  # NOTE(review): despite its name, utf16le_factory encodes its cells as
  # ISO-8859-1, which is what this example expects to be guessed.
  it 'guesses the encoding' do
    path = utf16le_factory
    normalizer = CartoDB::Importer2::CsvNormalizer.new(path, Log.new(@user))

    normalizer.encoding.should eq('ISO-8859-1')

    FileUtils.rm(path)
  end
end
|
|
|
|
describe '#encoding_fuzzy' do
  # Reads a checked-in fixture (not a temp file), so nothing is removed here.
  it 'guesses the encoding of an ISO-8859-1 with a problematic character' do
    fixture_path = File.join(File.dirname(__FILE__), '../fixtures/charlock_holmes_utf_8_instead_of_iso.csv')
    normalizer = CartoDB::Importer2::CsvNormalizer.new(fixture_path, Log.new(@user))

    normalizer.encoding.should eq('ISO-8859-1')
  end
end
|
|
|
|
describe '#encoding_utf8' do
  # The fixture contains accented characters written as UTF-8.
  it 'guesses UTF-8 encoding' do
    path = utf8_factory
    normalizer = CartoDB::Importer2::CsvNormalizer.new(path, Log.new(@user))

    normalizer.encoding.should eq('UTF-8')

    FileUtils.rm(path)
  end
end
|
|
|
|
describe '#single_column?' do
  # The positive and negative cases are separate examples so that each
  # failure is reported under an accurate description, and every temp
  # fixture is removed in an ensure clause.

  it 'returns true if CSV header has only one column' do
    fixture = single_column_factory
    begin
      csv = CartoDB::Importer2::CsvNormalizer.new(fixture, Log.new(@user))
      csv.detect_delimiter

      csv.single_column?.should eq true
    ensure
      FileUtils.rm_f(fixture)
    end
  end

  it 'returns false if CSV header has more than one column' do
    fixture = tab_delimiter_factory
    begin
      csv = CartoDB::Importer2::CsvNormalizer.new(fixture, Log.new(@user))
      csv.detect_delimiter

      csv.single_column?.should eq false
    ensure
      FileUtils.rm_f(fixture)
    end
  end
end
|
|
|
|
describe '#multiple_column' do
  # Expected values are written as fresh literals rather than derived from
  # `row`: if the method returns (or mutates) the very array it was given,
  # comparing against `row` or `row << nil` would pass no matter what.

  it 'returns the passed row if it has more than one cell' do
    fixture = tab_delimiter_factory
    begin
      row = ['bogus', 'wadus']
      csv = CartoDB::Importer2::CsvNormalizer.new(fixture, Log.new(@user))

      csv.multiple_column(row).should eq(['bogus', 'wadus'])
    ensure
      FileUtils.rm_f(fixture)
    end
  end

  it 'adds an empty cell to the row if it has a single cell' do
    fixture = tab_delimiter_factory
    begin
      # A genuinely single-cell row, as the example description requires.
      row = ['bogus']
      csv = CartoDB::Importer2::CsvNormalizer.new(fixture, Log.new(@user))

      csv.multiple_column(row).should eq(['bogus', nil])
    ensure
      FileUtils.rm_f(fixture)
    end
  end
end
|
|
|
|
describe '#spaces_and_commas_delimiter_detector' do
  # The fixture is also shorter than
  # CartoDB::Importer2::CsvNormalizer::LINES_FOR_DETECTION, so this doubles
  # as a check that the detector copes with files that small.
  it 'properly detects delimiter on a CSV containing many spaces and commas' do
    path = spaces_and_commas_factory
    normalizer = CartoDB::Importer2::CsvNormalizer.new(path, Log.new(@user))

    normalizer.detect_delimiter.should eq(",")

    FileUtils.rm(path)
  end
end
|
|
|
|
describe '#remove_newlines' do
  # The fixture has newlines inside quoted cells; running the normalizer on
  # it is expected to raise MalformedCSVException.
  it 'tests the cleaning of non row-separating newlines inside CSVs' do
    path = newlines_factory
    normalizer = CartoDB::Importer2::CsvNormalizer.new(path, Log.new(@user))

    expect { normalizer.run }.to raise_exception(CartoDB::Importer2::MalformedCSVException)

    FileUtils.rm(path)
  end
end
|
|
|
|
# Helpers
|
|
|
|
# Writes a temp CSV whose quoted cells contain embedded newlines.
#
# @return [String] path of the file that was written
def newlines_factory
  get_temp_csv_fullpath.tap do |path|
    File.open(path, 'wb') do |out|
      # puts appends the final row terminator.
      out.puts "field1,\"field\n2\",field3\na,b,c\na2,\"b\n2\",c2\na3,b3,c3\na4,\"\nb\n4\n\",\"c\n4\""
    end
  end
end
|
|
|
|
# Writes a tab-separated temp CSV that includes one row of accented,
# UTF-8 encoded cells.
#
# @return [String] path of the file that was written
def utf8_factory
  rows = [
    ["name", "description", "field_3"],
    ["normal 1 1 ", "normal 1 2", "normal 1 3"],
    ["normal 2 1", "normal 2 2", "normal 2_3"],
    ["normal 3 1", "normal 3 2", "normal 3 3"],
    ["áÁéÉíÍ", "óÓúÚ", "ñÑ"].map { |cell| cell.encode('UTF-8') },
    ["normal 5 1", "normal 5 2", "normal 5 3"]
  ]

  target = get_temp_csv_fullpath
  ::CSV.open(target, 'wb', col_sep: "\t") do |out|
    rows.each { |row| out << row }
  end
  target
end
|
|
|
|
# Writes a comma-separated temp CSV whose cells contain many spaces and dots.
#
# @return [String] path of the file that was written
def spaces_and_commas_factory
  rows = [
    ["name", "description", "field_3"],
    ["normal 1 1 .", "normal 1 2 .", "normal 1 3 ."],
    ["normal 2 1", "normal 2 2", "normal 2 3"],
    ["normal 3 1 .", "normal 3 2 .", "normal 3 3 ."],
    ["normal 4 1", "normal 4 2", "normal 4 3"],
    ["normal 5 1 . . ", "normal 5 2 . . ", "normal 5 3 . . "]
  ]

  target = get_temp_csv_fullpath
  ::CSV.open(target, 'wb', col_sep: ",") do |out|
    rows.each { |row| out << row }
  end
  target
end
|
|
|
|
# Writes a two-row, tab-separated temp CSV whose cells are encoded as
# ISO-8859-1.
# NOTE(review): the method name says UTF-16LE but nothing here produces
# UTF-16 — confirm whether the name is simply a leftover.
#
# @return [String] path of the file that was written
def utf16le_factory
  rows = [
    ["name", "description", "wadus"],
    ["bogus_1", "bogus_2", "bogus_3"]
  ]

  target = get_temp_csv_fullpath
  ::CSV.open(target, 'wb', col_sep: "\t") do |out|
    rows.each do |row|
      out << (row.map { |cell| cell.encode('ISO-8859-1') })
    end
  end
  target
end
|
|
|
|
# Writes a two-row, three-column temp CSV that uses tabs as separators.
#
# @return [String] path of the file that was written
def tab_delimiter_factory
  rows = [
    ["name", "description", "wadus"],
    ["bogus_1", "bogus_2", "bogus_3"]
  ]

  target = get_temp_csv_fullpath
  ::CSV.open(target, 'w', col_sep: "\t") do |out|
    rows.each { |row| out << row }
  end
  target
end
|
|
|
|
# Writes a temp CSV made of one header cell and one data cell, using the
# CSV library's default separator.
#
# @return [String] path of the file that was written
def single_column_factory
  target = get_temp_csv_fullpath
  ::CSV.open(target, 'w') do |out|
    [['header_1'], ['row 1']].each { |row| out << row }
  end
  target
end
|
|
|
|
# Creates a temp file via FileUtils.touch without writing anything to it.
#
# @return [String] path of the file that was created
def empty_file_factory
  get_temp_csv_fullpath.tap { |target| FileUtils.touch(target) }
end
|
|
|
|
# Writes a comma-separated temp CSV whose quoted cells contain semicolons
# and, in one row, a comma.
#
# @return [String] path of the file that was written
def quoted_string_with_delimiter_factory
  lines = [
    'name,description ; with semicolon,wadus',
    'foo,"this description contains; a semicolon and a, comma to affect frequency table",bar',
    'foobar,"this description contains; a semicolon but no comma",barfoo'
  ]

  target = get_temp_csv_fullpath
  ::File.open(target, 'w') do |out|
    lines.each { |line| out << line << "\n" }
  end
  target
end
|
|
|
|
# Writes a comma-separated temp CSV whose quoted cells contain
# backslash-escaped double quotes as well as semicolons.
#
# @return [String] path of the file that was written
def string_with_escaped_quote_factory
  # Single-quoted literals: the backslash before each inner quote is written
  # to the file verbatim.
  lines = [
    'name,description ; with semicolon,wadus',
    'foo,"this description contains an escaped \" quote; a semicolon, and a comma",bar',
    'foobar,"this description contains \"; a semicolon but no comma",barfoo'
  ]

  target = get_temp_csv_fullpath
  ::File.open(target, 'w') do |out|
    lines.each { |line| out << line << "\n" }
  end
  target
end
|
|
|
|
# Copies the fixture named by BUG_COLUMNS_WRONG_SPLIT_FIXTURE_FILE to a fresh
# temp path, so callers work on a copy rather than the original file.
#
# @return [String] path of the temp copy
def bug_columns_wrong_split_factory
  get_temp_csv_fullpath.tap do |copy_path|
    ::FileUtils.copy(BUG_COLUMNS_WRONG_SPLIT_FIXTURE_FILE, copy_path)
  end
end
|
|
|
|
|
|
|
|
|
|
# Builds a path under /var/tmp from the current time and a random number
# below 999. No file is created here.
#
# @return [String] the generated path, ending in ".csv"
def get_temp_csv_fullpath
  timestamp = Time.now.to_f
  suffix = rand(999)
  "/var/tmp/#{timestamp}-#{suffix}.csv"
end
|
|
|
|
end
|
|
|