from chardet.universaldetector import UniversalDetector
import itertools
import os.path
import sys
import dbfUtils
from osgeo import osr
import json
import subprocess

if len(sys.argv) != 3:
    print("usage: python %s shp_file name" % sys.argv[0])
    sys.exit()
shp_file = sys.argv[1]
name = sys.argv[2]

# The .dbf and .prj sidecar files share the shapefile's basename
dbf_file = shp_file[0:-4] + '.dbf'
prj_file = shp_file[0:-4] + '.prj'

shape_name = os.path.splitext(shp_file)[0]

def get_spatial_reference(shapefile):
    # Parse the .prj contents (read into the global prj_string before this
    # is called), retrying as ESRI-flavoured WKT when the plain parse fails
    srs = osr.SpatialReference()
    shpfile = os.path.basename(shape_name)
    ret = srs.SetFromUserInput(prj_string)
    proj4 = srs.ExportToProj4()
    if not proj4:
        prj_file = open(shape_name + '.prj', 'r')
        prj_lines = prj_file.readlines()
        prj_file.close()
        for i in range(len(prj_lines)):
            prj_lines[i] = prj_lines[i].rstrip()
        srs = osr.SpatialReference()
        srs.ImportFromESRI(prj_lines)
        proj4 = srs.ExportToProj4()
        # if not proj4:
        #     print('Failed to convert prj of %s, giving up...' % shpfile)
        # else:
        #     print('Second try assuming ESRI wkt worked for %s!' % shpfile)
        srs.from_esri = True
    else:
        srs.from_esri = False
    srs.AutoIdentifyEPSG()
    return srs
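
# A minimal sketch of that fallback path (assumes the GDAL Python bindings;
# the truncated WKT literal is an illustrative placeholder, not taken from
# this repo). SetFromUserInput() accepts plain/OGC WKT, while
# ImportFromESRI() takes the raw .prj lines as a list and applies
# ESRI-specific morphing before export:
#
#   srs = osr.SpatialReference()
#   srs.ImportFromESRI(['PROJCS["NAD_1983_UTM_Zone_10N", ...]'])
#   print(srs.ExportToProj4())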

def to_epsg(srs):
    # Prefer the projected CRS authority code, falling back to the
    # geographic one
    if srs.IsGeographic():
        return srs.GetAuthorityCode('GEOGCS')
    else:
        c = srs.GetAuthorityCode('PROJCS')
        if c:
            return c
        else:
            try:
                return srs.GetAuthorityCode('GEOGCS')
            except Exception:
                return None
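
# For instance, run by hand (assuming the GDAL bindings are available;
# EPSG:4326 is plain WGS84 geographic, so the GEOGCS branch is taken):
#
#   srs = osr.SpatialReference()
#   srs.ImportFromEPSG(4326)
#   to_epsg(srs)  # -> '4326'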

srid = None

# Try detecting the SRID
if os.path.isfile(prj_file):
    with open(prj_file, 'r') as f:
        prj_string = f.read()
    code = to_epsg(get_spatial_reference(shp_file))

    srid = code if code else None

try:
    # Try to detect the encoding from the .dbf's text fields
    dbf = open(dbf_file.strip(), 'rb')
    db = dbfUtils.dbfreader(dbf)

    fnames = next(db)
    ftypes = next(db)

    # Find the string fields ('C' is the dBase character type)
    sfields = []
    for fno in range(len(fnames)):
        if ftypes[fno][0] == 'C':
            sfields.append(fno)

    detector = UniversalDetector()

    # TODO: sample a % of the total table size instead of a fixed row count
    for row in itertools.islice(db, 1000):
        # Feed the detector the concatenated string fields of each row
        detector.feed(''.join(row[fno] for fno in sfields))
        if detector.done:
            break
    dbf.close()
    detector.close()
    encoding = detector.result["encoding"]
    confidence = detector.result["confidence"]
    if encoding == "ascii":
        encoding = "LATIN1"  # why not UTF8 here?
    # There are problems detecting LATIN1 encodings: chardet often reports
    # KOI8-R instead of LATIN1
    if encoding == "KOI8-R":
        encoding = "LATIN1"
    # Fix for #1336: ISO-8859-2 is unlikely and UniversalDetector doesn't
    # support ISO-8859-1, so fall back to ISO-8859-1 when confidence is low
    if encoding == "ISO-8859-2" and confidence < 0.75:
        encoding = "ISO-8859-1"
except Exception as err:
    encoding = "None"  # why not UTF8 here?
    # sys.stderr.write(repr(err) + '\n')
    # sys.exit(1)
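
# The incremental chardet API used above, as a standalone sketch (assumes
# the chardet package is installed; the sample bytes are made up for
# illustration):
#
#   det = UniversalDetector()
#   det.feed(b'S\xe3o Paulo, M\xfcnchen')  # Latin-1 encoded text
#   det.close()
#   det.result  # e.g. {'encoding': ..., 'confidence': ...}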

# Emit one CSV row for the caller: srid,encoding,shp_file,name
print("%s,%s,%s,%s" % (srid, encoding, shp_file, name))
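
# Usage sketch (hypothetical script name, input file and output; the values
# shown are illustrative only):
#
#   $ python detect_shp_metadata.py places.shp places
#   4326,LATIN1,places.shp,places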