From 2c5958f11e18e1e6259fdbddc0e3b9baa6bfc472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mois=C3=A9s=20Calzado?= Date: Fri, 14 Jan 2022 17:44:51 +0100 Subject: [PATCH 1/3] Add autoguessing capabilities to ArcGIS connector --- .../lib/importer/arcgis_autoguessing.rb | 31 +++++++++++++++++++ services/importer/lib/importer/loader.rb | 17 +++++++--- services/importer/lib/importer/runner.rb | 2 +- 3 files changed, 45 insertions(+), 5 deletions(-) create mode 100644 services/importer/lib/importer/arcgis_autoguessing.rb diff --git a/services/importer/lib/importer/arcgis_autoguessing.rb b/services/importer/lib/importer/arcgis_autoguessing.rb new file mode 100644 index 0000000000..0c5f07acdc --- /dev/null +++ b/services/importer/lib/importer/arcgis_autoguessing.rb @@ -0,0 +1,31 @@ +require 'open3' +require_relative './shp_helper' + +module CartoDB + module Importer2 + class ArcGISAutoguessing + + def initialize(db, schema_name, table_name, fields_metadata) + @db = db + @schema_name = schema_name + @table_name = table_name + @fields_metadata = fields_metadata + end + + def run + autoguess_dates + end + + def autoguess_dates + date_fields = @fields_metadata.select { |field| field['type'] == 'esriFieldTypeDate' } + date_fields.each do |field| + @db.run(%Q{ + ALTER TABLE #{@schema_name}.#{@table_name} ALTER COLUMN #{field['name'].downcase} TYPE DATE + using to_timestamp(cast(#{field['name'].downcase}::bigint/1000 as bigint))::date + }) + end + end + + end + end +end diff --git a/services/importer/lib/importer/loader.rb b/services/importer/lib/importer/loader.rb index 57d319243d..ee952211b4 100644 --- a/services/importer/lib/importer/loader.rb +++ b/services/importer/lib/importer/loader.rb @@ -10,6 +10,7 @@ require_relative './georeferencer' require_relative '../importer/post_import_handler' require_relative './geometry_fixer' require_relative './typecaster' +require_relative './arcgis_autoguessing' require_relative '../../../../lib/cartodb/stats/importer' @@ -92,13 +93,13 @@ 
module CartoDB run_ogr2ogr(append_mode=true) end - def streamed_run_finish(post_import_handler_instance=nil) + def streamed_run_finish(post_import_handler_instance=nil, datasource_name) @post_import_handler = post_import_handler_instance - post_ogr2ogr_tasks + post_ogr2ogr_tasks(datasource_name) end - def post_ogr2ogr_tasks + def post_ogr2ogr_tasks(datasource_name) georeferencer.mark_as_from_geojson_with_transform if post_import_handler.has_transform_geojson_geom_column? job.log 'Georeferencing...' @@ -117,6 +118,14 @@ module CartoDB job.log "Error fixing geometries during import, skipped (#{e.message})" end end + + # If autoguessing is enabled, we try it on arcgis data + if datasource_name == 'arcgis' && options[:ogr2ogr_csv_guessing] + job.log 'Autoguessing ArcGIS data types...' + file = File.open @source_file.fullpath + file_content = JSON.load file + ArcGISAutoguessing.new(job.db, SCHEMA, job.table_name, file_content['fields']).run + end rescue StandardError => e raise CartoDB::Datasources::InvalidInputDataError.new(e.to_s, ERRORS_MAP[CartoDB::Datasources::InvalidInputDataError]) unless statement_timeout?(e.to_s) raise StatementTimeoutError.new(e.to_s, ERRORS_MAP[CartoDB::Importer2::StatementTimeoutError]) @@ -244,7 +253,7 @@ module CartoDB attr_accessor :source_file, :options - + private attr_writer :ogr2ogr, :georeferencer diff --git a/services/importer/lib/importer/runner.rb b/services/importer/lib/importer/runner.rb index eb702cf8c0..3afdba49c0 100644 --- a/services/importer/lib/importer/runner.rb +++ b/services/importer/lib/importer/runner.rb @@ -230,7 +230,7 @@ module CartoDB loader.streamed_run_continue(downloader.source_file) if got_data end while got_data - loader.streamed_run_finish(@post_import_handler) + loader.streamed_run_finish(@post_import_handler, @downloader.datasource.class::DATASOURCE_NAME) end def file_based_loader_run(job, loader) From 507b7754c65b2bbc79ad346fb9f56e080ef9e655 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mois=C3=A9s=20Calzado?= 
Date: Fri, 14 Jan 2022 18:48:17 +0100 Subject: [PATCH 2/3] Update NEWS.md --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index b962aee228..5585e2cabc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,7 @@ Development * Show user's database location in profile [16349](https://github.com/CartoDB/cartodb/pull/16349) * Setting to enable/disable import notifications [16354](https://github.com/CartoDB/cartodb/pull/16354) * Setting to enable/disable random username generation on SAML authentication process [16372](https://github.com/CartoDB/cartodb/pull/16372) +* Add type guessing capabilities to the ArcGIS connector [#16385](https://github.com/CartoDB/cartodb/pull/16385) ### Bug fixes / enhancements - Fix rubocop integration [#16382](https://github.com/CartoDB/cartodb/pull/16382) From 5aa761b6ed13aef12fb46860cb3c51a540044e8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mois=C3=A9s=20Calzado?= Date: Mon, 17 Jan 2022 11:19:36 +0100 Subject: [PATCH 3/3] Fix some rubocop offenses --- .../lib/importer/arcgis_autoguessing.rb | 20 ++++----- services/importer/lib/importer/loader.rb | 41 +++++++++++-------- 2 files changed, 33 insertions(+), 28 deletions(-) diff --git a/services/importer/lib/importer/arcgis_autoguessing.rb b/services/importer/lib/importer/arcgis_autoguessing.rb index 0c5f07acdc..ea4ef0e0e5 100644 --- a/services/importer/lib/importer/arcgis_autoguessing.rb +++ b/services/importer/lib/importer/arcgis_autoguessing.rb @@ -5,12 +5,12 @@ module CartoDB module Importer2 class ArcGISAutoguessing - def initialize(db, schema_name, table_name, fields_metadata) - @db = db - @schema_name = schema_name - @table_name = table_name - @fields_metadata = fields_metadata - end + def initialize(db, schema_name, table_name, fields_metadata) + @db = db + @schema_name = schema_name + @table_name = table_name + @fields_metadata = fields_metadata + end def run autoguess_dates @@ -19,10 +19,10 @@ module CartoDB def autoguess_dates date_fields = 
@fields_metadata.select { |field| field['type'] == 'esriFieldTypeDate' } date_fields.each do |field| - @db.run(%Q{ - ALTER TABLE #{@schema_name}.#{@table_name} ALTER COLUMN #{field['name'].downcase} TYPE DATE - using to_timestamp(cast(#{field['name'].downcase}::bigint/1000 as bigint))::date - }) + @db.run(%{ + ALTER TABLE #{@schema_name}.#{@table_name} ALTER COLUMN #{field['name'].downcase} TYPE DATE + using to_timestamp(cast(#{field['name'].downcase}::bigint/1000 as bigint))::date + }) end end diff --git a/services/importer/lib/importer/loader.rb b/services/importer/lib/importer/loader.rb index ee952211b4..0af5bfbf26 100644 --- a/services/importer/lib/importer/loader.rb +++ b/services/importer/lib/importer/loader.rb @@ -93,13 +93,13 @@ module CartoDB run_ogr2ogr(append_mode=true) end - def streamed_run_finish(post_import_handler_instance=nil, datasource_name) + def streamed_run_finish(post_import_handler_instance = nil, datasource_name = nil) @post_import_handler = post_import_handler_instance post_ogr2ogr_tasks(datasource_name) end - def post_ogr2ogr_tasks(datasource_name) + def post_ogr2ogr_tasks(datasource_name = nil) georeferencer.mark_as_from_geojson_with_transform if post_import_handler.has_transform_geojson_geom_column? job.log 'Georeferencing...' @@ -108,29 +108,27 @@ module CartoDB if post_import_handler.has_fix_geometries_task? job.log 'Fixing geometry...' - # At this point the_geom column is renamed - begin - GeometryFixer.new(job.db, job.table_name, SCHEMA, 'the_geom', job).run - rescue StandardError => e - raise e unless statement_timeout?(e.to_s) - # Ignore timeouts in query batcher - log_warning(exception: e, message: 'Could not fix geometries during import') - job.log "Error fixing geometries during import, skipped (#{e.message})" - end + fix_geometries(job) end # If autoguessing is enabled, we try it on arcgis data - if datasource_name == 'arcgis' && options[:ogr2ogr_csv_guessing] - job.log 'Autoguessing ArcGIS data types...' 
- file = File.open @source_file.fullpath - file_content = JSON.load file - ArcGISAutoguessing.new(job.db, SCHEMA, job.table_name, file_content['fields']).run - end + autoguessing_on_arcgis_import if datasource_name == 'arcgis' && options[:ogr2ogr_csv_guessing] rescue StandardError => e raise CartoDB::Datasources::InvalidInputDataError.new(e.to_s, ERRORS_MAP[CartoDB::Datasources::InvalidInputDataError]) unless statement_timeout?(e.to_s) raise StatementTimeoutError.new(e.to_s, ERRORS_MAP[CartoDB::Importer2::StatementTimeoutError]) end + def fix_geometries(job) + # At this point the_geom column is renamed + GeometryFixer.new(job.db, job.table_name, SCHEMA, 'the_geom', job).run + rescue StandardError => e + raise e unless statement_timeout?(e.to_s) + + # Ignore timeouts in query batcher + log_warning(exception: e, message: 'Could not fix geometries during import') + job.log "Error fixing geometries during import, skipped (#{e.message})" + end + def normalize converted_filepath = normalizers_for(source_file.extension) .inject(source_file.fullpath) { |filepath, normalizer_klass| @@ -253,7 +251,7 @@ module CartoDB attr_accessor :source_file, :options - + private attr_writer :ogr2ogr, :georeferencer @@ -404,6 +402,13 @@ module CartoDB csv_content[line][column] = "\"#{csv_content[line][column]}\"" File.open(filepath, 'w') { |file| file.puts csv_content.to_s } end + + def autoguessing_on_arcgis_import + job.log 'Autoguessing ArcGIS data types...' + file = File.read(@source_file.fullpath) + file_content = JSON.parse(file) + ArcGISAutoguessing.new(job.db, SCHEMA, job.table_name, file_content['fields']).run + end end end end