# observatory-extension/scripts/generate_fixtures.py
#
# Regenerates the SQL test fixtures (load_fixtures.sql / drop_fixtures.sql)
# for the observatory extension from a live Data Observatory database.

import os
import psycopg2
import subprocess
# Connection parameters come from the standard PG* environment variables,
# falling back to a local default postgres instance. The connection and
# cursor are module-level singletons shared by query()/commit() below.
_CONN_TEMPLATE = 'postgres://{user}:{password}@{host}:{port}/{database}'

DB_CONN = psycopg2.connect(_CONN_TEMPLATE.format(
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD', ''),
    host=os.environ.get('PGHOST', 'localhost'),
    port=os.environ.get('PGPORT', '5432'),
    database=os.environ.get('PGDATABASE', 'postgres'),
))
CURSOR = DB_CONN.cursor()
def query(q):
    """Execute *q* on the shared cursor and return the cursor.

    Any failure rolls the transaction back before re-raising, so the
    shared connection is left usable for the next statement.
    """
    try:
        # Bare except is intentional: we always re-raise, and even a
        # KeyboardInterrupt mid-statement should trigger a rollback.
        CURSOR.execute(q)
    except:
        DB_CONN.rollback()
        raise
    return CURSOR
def commit():
    """Commit the open transaction; roll back and re-raise on failure."""
    try:
        DB_CONN.commit()
    except:
        # Mirror query(): never leave the shared connection in a broken
        # transaction state.
        DB_CONN.rollback()
        raise
def get_tablename_query(column_id, boundary_id, timespan):
    """Return SQL resolving the current obs_table hash for a fixture triple.

    Given a data column_id, a boundary column id (e.g.
    us.census.tiger.block_group) and a timespan, the produced query looks
    up the physical tablename together with the geoid column name from the
    Data Observatory metadata tables.
    """
    template = """
    SELECT t.tablename, geoid_ct.colname colname
    FROM observatory.obs_table t,
         observatory.obs_column_table geoid_ct,
         observatory.obs_column_table data_ct
    WHERE
      t.id = geoid_ct.table_id AND
      t.id = data_ct.table_id AND
      geoid_ct.column_id =
        (SELECT source_id
         FROM observatory.obs_column_to_column
         WHERE target_id = '{boundary_id}'
           AND reltype = 'geom_ref'
        ) AND
      data_ct.column_id = '{column_id}' AND
      timespan = '{timespan}'
    """
    # NOTE(review): plain string interpolation into SQL -- acceptable for
    # this trusted fixture-generation script, never for untrusted input.
    return template.format(column_id=column_id,
                           boundary_id=boundary_id,
                           timespan=timespan)
# Metadata tables copied (filtered down where main() knows a filter column)
# into the fixture file alongside the data tables themselves.
METADATA_TABLES = [
    'obs_table', 'obs_column_table', 'obs_column', 'obs_column_tag',
    'obs_tag', 'obs_column_to_column', 'obs_dump_version', 'obs_meta',
    'obs_meta_numer', 'obs_meta_denom', 'obs_meta_geom',
    'obs_meta_timespan',
]

# (column_id, boundary_id, timespan) triples driving which data tables are
# dumped. Geometry-only fixtures repeat the boundary id as the column id.
fixtures = [
    ('us.census.tiger.census_tract', 'us.census.tiger.census_tract', '2014'),
    ('us.census.tiger.block_group', 'us.census.tiger.block_group', '2014'),
    ('us.census.tiger.zcta5', 'us.census.tiger.zcta5', '2014'),
    ('us.census.tiger.county', 'us.census.tiger.county', '2014'),
    ('us.census.acs.B01003001', 'us.census.tiger.census_tract', '2010 - 2014'),
    ('us.census.acs.B01003001_quantile', 'us.census.tiger.census_tract', '2010 - 2014'),
    ('us.census.acs.B01003001', 'us.census.tiger.block_group', '2010 - 2014'),
    ('us.census.spielman_singleton_segments.X10', 'us.census.tiger.census_tract', '2010 - 2014'),
    ('us.zillow.AllHomes_Zhvi', 'us.census.tiger.zcta5', '2014-01'),
    ('us.zillow.AllHomes_Zhvi', 'us.census.tiger.zcta5', '2016-03'),
    ('whosonfirst.wof_country_geom', 'whosonfirst.wof_country_geom', '2016'),
    ('us.census.tiger.zcta5_clipped', 'us.census.tiger.zcta5_clipped', '2014'),
    ('us.census.tiger.block_group_clipped', 'us.census.tiger.block_group_clipped', '2014'),
]

# Generated fixture files (relative to the repository root).
OUTFILE_PATH = 'src/pg/test/fixtures/load_fixtures.sql'
DROPFILE_PATH = 'src/pg/test/fixtures/drop_fixtures.sql'
def dump(cols, tablename, where=''):
    """Append schema + data for observatory.<tablename> to the fixture files.

    Writes a matching DROP statement to DROPFILE_PATH, then appends to
    OUTFILE_PATH in order: the pg_dump pre-data DDL, a COPY ... FROM stdin
    header, the CSV rows selected by *cols*/*where*, and the `\\.` COPY
    terminator. The interleaving of Python writes and shell `>>` appends is
    order-critical, so each open() is closed before the next subprocess runs.

    Assumes pg_dump/psql are on PATH and pick up the same PG* environment
    variables as DB_CONN -- TODO confirm when running outside CI.
    """
    # Record the teardown statement for this table.
    with open(DROPFILE_PATH, 'a') as dropfile:
        dropfile.write('DROP TABLE IF EXISTS observatory.{tablename};\n'.format(
            tablename=tablename,
        ))
    # Schema only (--section=pre-data); the sed filters strip pg_catalog
    # type prefixes, force the observatory schema onto CREATE TABLE, and
    # drop ownership so the fixture loads under any role.
    # NOTE(review): shell=True with interpolated identifiers -- trusted,
    # fixture-local input only.
    subprocess.check_call('pg_dump -x --section=pre-data -t observatory.{tablename} '
                          ' | sed "s:, pg_catalog::" '
                          ' | sed "s:CREATE TABLE :CREATE TABLE observatory.:" '
                          ' | sed "s:ALTER TABLE.*OWNER.*::" '
                          ' >> {outfile}'.format(
                              tablename=tablename,
                              outfile=OUTFILE_PATH,
                          ), shell=True)
    # COPY header must be in the file before psql appends the CSV rows.
    with open(OUTFILE_PATH, 'a') as outfile:
        outfile.write('COPY observatory."{}" FROM stdin WITH CSV HEADER;\n'.format(tablename))
    subprocess.check_call('''
    psql -c "COPY (SELECT {cols} \
    FROM observatory.{tablename} {where}) \
    TO STDOUT WITH CSV HEADER" >> {outfile}'''.format(
        cols=cols,
        tablename=tablename,
        where=where,
        outfile=OUTFILE_PATH,
    ), shell=True)
    # End-of-COPY marker expected by psql when replaying the fixture.
    with open(OUTFILE_PATH, 'a') as outfile:
        outfile.write('\\.\n\n')
def _metadata_where(tablename):
    """Return the WHERE clause limiting a metadata table to fixture rows.

    Each known metadata table is filtered on its id/timespan columns against
    the module-level `fixtures` triples; tables with no recognized filter
    column (e.g. obs_tag, obs_dump_version) are dumped in full ('').
    """
    if tablename == 'obs_meta':
        return "WHERE " + " OR ".join([
            "(numer_id, geom_id, numer_timespan) = ('{}', '{}', '{}')".format(
                numer_id, geom_id, timespan)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_meta_numer':
        return "WHERE " + " OR ".join([
            "numer_id IN ('{}', '{}')".format(numer_id, geom_id)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_meta_denom':
        return "WHERE " + " OR ".join([
            "denom_id IN ('{}', '{}')".format(numer_id, geom_id)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_meta_geom':
        return "WHERE " + " OR ".join([
            "geom_id IN ('{}', '{}')".format(numer_id, geom_id)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_meta_timespan':
        return "WHERE " + " OR ".join([
            "timespan_id = ('{}')".format(timespan)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_column':
        return "WHERE " + " OR ".join([
            "id IN ('{}', '{}')".format(numer_id, geom_id)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename in ('obs_column_table', 'obs_column_tag'):
        return "WHERE " + " OR ".join([
            "column_id IN ('{}', '{}')".format(numer_id, geom_id)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_column_to_column':
        return "WHERE " + " OR ".join([
            "source_id IN ('{}', '{}') OR target_id IN ('{}', '{}')".format(
                numer_id, geom_id, numer_id, geom_id)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_table':
        return "WHERE " + " OR ".join([
            "timespan = '{}'".format(timespan)
            for numer_id, geom_id, timespan in fixtures
        ])
    return ''


def main():
    """Regenerate load_fixtures.sql and drop_fixtures.sql.

    Resolves each fixture triple to its physical (tablename, geoid colname)
    via the metadata, truncates both output files with fresh headers, dumps
    the filtered metadata tables, then dumps each data table restricted to a
    small geographic sample (Brooklyn geoids, '11%' ZCTAs, or a handful of
    whosonfirst country ids).

    Fix: the original mixed Python 2 print statements with Python 3 print()
    calls; all output now uses the print() function.
    """
    unique_tables = set()
    for column_id, boundary_id, timespan in fixtures:
        tablename_query = get_tablename_query(column_id, boundary_id, timespan)
        tablename, colname = query(tablename_query).fetchone()
        table_colname = (tablename, colname, boundary_id)
        if table_colname not in unique_tables:
            print(table_colname)
            unique_tables.add(table_colname)
    print(unique_tables)

    # Truncate both fixture files and write their psql headers.
    with open(OUTFILE_PATH, 'w') as outfile:
        outfile.write('SET client_min_messages TO WARNING;\n\\set ECHO none\n')
        outfile.write('CREATE SCHEMA IF NOT EXISTS observatory;\n\n')
    with open(DROPFILE_PATH, 'w') as dropfile:
        dropfile.write('SET client_min_messages TO WARNING;\n\\set ECHO none\n')

    # Metadata tables, filtered to the rows the fixtures actually reference.
    for tablename in METADATA_TABLES:
        print(tablename)
        dump('*', tablename, _metadata_where(tablename))

    # Data tables, restricted by geoid so the fixtures stay small.
    for tablename, colname, boundary_id in unique_tables:
        if 'zcta5' in boundary_id:
            where, compare = "'11%'", 'LIKE'
        elif 'whosonfirst' in boundary_id:
            where = "('85632785','85633051','85633111','85633147','85633253','85633267')"
            compare = 'IN'
        else:
            where, compare = "'36047%'", 'LIKE'
        filter_clause = "WHERE {}::text {} {}".format(colname, compare, where)
        print(' '.join(['*', tablename, filter_clause]))
        dump('*', tablename, filter_clause)


if __name__ == '__main__':
    main()