# cartodb-4.42/lib/importer/misc/shp_normalizer.py
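"""Shapefile normalizer used by the CartoDB importer.

Given a .shp path and a table name, guess the SRID from the companion .prj
file and the character encoding of the companion .dbf file, then print a
single CSV line of the form "srid,encoding,shp_file,name".

Example invocation (file name and output values are illustrative):
    python shp_normalizer.py world_borders.shp world_borders
    4326,LATIN1,world_borders.shp,world_borders
"""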
import itertools
import os.path
import sys

from chardet.universaldetector import UniversalDetector
from osgeo import osr

import dbfUtils

if len(sys.argv) != 3:
    print("usage: python %s shp_file name" % sys.argv[0])
    sys.exit()

shp_file = sys.argv[1]
name = sys.argv[2]
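# The companion .dbf and .prj files are expected next to the .shp, sharing its
# basename (note: the extension swap below assumes a three-character extension)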
dbf_file = shp_file[0:-4] + '.dbf'
prj_file = shp_file[0:-4] + '.prj'
shape_name = os.path.splitext(shp_file)[0]


def get_spatial_reference(shapefile):
    # Build a spatial reference from the .prj contents (read into the
    # module-level prj_string before this function is called)
    srs = osr.SpatialReference()
    shpfile = os.path.basename(shape_name)
    srs.SetFromUserInput(prj_string)
    proj4 = srs.ExportToProj4()
    if not proj4:
        # Plain WKT import failed; retry treating the .prj as ESRI-flavoured WKT
        prj_file = open(shape_name + '.prj', 'r')
        prj_lines = prj_file.readlines()
        prj_file.close()
        for i in range(len(prj_lines)):
            prj_lines[i] = prj_lines[i].rstrip()
        srs = osr.SpatialReference()
        srs.ImportFromESRI(prj_lines)
        proj4 = srs.ExportToProj4()
        """
        if not proj4:
            #print 'Failed to convert prj of %s, giving up...' % shpfile
        else:
            #print 'Second try assuming ESRI wkt worked for %s!' % shpfile
        """
        # from_esri is a plain attribute recorded for callers, not part of the osr API
        srs.from_esri = True
    else:
        srs.from_esri = False
    srs.AutoIdentifyEPSG()
    return srs


def to_epsg(srs):
    # Prefer the EPSG code of the projected CS, falling back to the geographic CS
    if srs.IsGeographic():
        return srs.GetAuthorityCode('GEOGCS')
    else:
        c = srs.GetAuthorityCode('PROJCS')
        if c:
            return c
        else:
            try:
                return srs.GetAuthorityCode('GEOGCS')
            except Exception:
                return None


srid = None

# Try detecting the SRID from the .prj file, if one is present
if os.path.isfile(prj_file):
    prj_string = open(prj_file, 'r').read()
    code = to_epsg(get_spatial_reference(shp_file))
    srid = code if code else None

try:
    # Try to detect the encoding of the .dbf attribute data
    dbf = open(dbf_file.strip(), 'rb')
    db = dbfUtils.dbfreader(dbf)
    fnames = next(db)
    ftypes = next(db)

    # find the string ('C'haracter) fields
    sfields = []
    for fno in range(len(fnames)):
        if ftypes[fno][0] == 'C':
            sfields.append(fno)

    detector = UniversalDetector()
    # TODO: sample a percentage of the total table size instead of a fixed 1000 rows
    for row in itertools.islice(db, 1000):
        # Feed the detector with the concatenated string fields
        detector.feed(''.join(row[fno] for fno in sfields))
        if detector.done:
            break
    dbf.close()
    detector.close()
    encoding = detector.result["encoding"]
    confidence = detector.result["confidence"]
    if encoding == "ascii":
        encoding = "LATIN1"  # why not UTF8 here ?
    # There are problems detecting LATIN1 encodings: the detector reports KOI8-R instead of LATIN1
    if encoding == "KOI8-R":
        encoding = "LATIN1"
    # Fix for #1336: since ISO-8859-2 is unlikely and UniversalDetector doesn't support ISO-8859-1,
    # fall back to ISO-8859-1 when confidence is not high
    if encoding == "ISO-8859-2" and confidence < 0.75:
        encoding = "ISO-8859-1"
except Exception as err:
    encoding = "None"  # why not UTF8 here ?
    #sys.stderr.write(repr(err)+'\n')
    #sys.exit(1)

print("%s,%s,%s,%s" % (srid,encoding,shp_file,name))