# observatory-extension/scripts/generate_fixtures.py
#
# Regenerates the SQL test fixtures (load_fixtures.sql / drop_fixtures.sql)
# for the observatory extension from a live Data Observatory database.

import os
import psycopg2
import subprocess
# Connection parameters come from the standard PG* environment variables,
# falling back to a local default postgres instance. The connection and
# cursor are module-level singletons shared by query()/commit() below.
_CONN_TEMPLATE = 'postgres://{user}:{password}@{host}:{port}/{database}'

DB_CONN = psycopg2.connect(_CONN_TEMPLATE.format(
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD', ''),
    host=os.environ.get('PGHOST', 'localhost'),
    port=os.environ.get('PGPORT', '5432'),
    database=os.environ.get('PGDATABASE', 'postgres'),
))
CURSOR = DB_CONN.cursor()
def query(q):
    """Execute *q* on the shared cursor and return the cursor.

    Any failure rolls the transaction back before re-raising, so the
    shared connection is left usable for the next statement.
    """
    try:
        # Bare except is intentional: we always re-raise, and even a
        # KeyboardInterrupt mid-statement should trigger a rollback.
        CURSOR.execute(q)
    except:
        DB_CONN.rollback()
        raise
    return CURSOR
def commit():
    """Commit the open transaction; roll back and re-raise on failure."""
    try:
        DB_CONN.commit()
    except:
        # Mirror query(): never leave the shared connection in a broken
        # transaction state.
        DB_CONN.rollback()
        raise
def get_tablename_query(column_id, boundary_id, timespan):
    """Return SQL resolving the current obs_table hash for a fixture triple.

    Given a data column_id, a boundary column id (e.g.
    us.census.tiger.block_group) and a timespan, the produced query looks
    up the physical tablename together with the geoid column name from the
    Data Observatory metadata tables.
    """
    template = """
    SELECT t.tablename, geoid_ct.colname colname
    FROM observatory.obs_table t,
         observatory.obs_column_table geoid_ct,
         observatory.obs_column_table data_ct
    WHERE
      t.id = geoid_ct.table_id AND
      t.id = data_ct.table_id AND
      geoid_ct.column_id =
        (SELECT source_id
         FROM observatory.obs_column_to_column
         WHERE target_id = '{boundary_id}'
           AND reltype = 'geom_ref'
        ) AND
      data_ct.column_id = '{column_id}' AND
      timespan = '{timespan}'
    """
    # NOTE(review): plain string interpolation into SQL -- acceptable for
    # this trusted fixture-generation script, never for untrusted input.
    return template.format(column_id=column_id,
                           boundary_id=boundary_id,
                           timespan=timespan)
# Metadata tables copied (filtered down where main() knows a filter column)
# into the fixture file alongside the data tables themselves.
METADATA_TABLES = [
    'obs_table', 'obs_column_table', 'obs_column', 'obs_column_tag',
    'obs_tag', 'obs_column_to_column', 'obs_dump_version', 'obs_meta',
    'obs_meta_numer', 'obs_meta_denom', 'obs_meta_geom',
    'obs_meta_timespan',
]

# (column_id, boundary_id, timespan) triples driving which data tables are
# dumped. Geometry-only fixtures repeat the boundary id as the column id.
fixtures = [
    ('us.census.tiger.census_tract', 'us.census.tiger.census_tract', '2014'),
    ('us.census.tiger.block_group', 'us.census.tiger.block_group', '2014'),
    ('us.census.tiger.zcta5', 'us.census.tiger.zcta5', '2014'),
    ('us.census.tiger.county', 'us.census.tiger.county', '2014'),
    ('us.census.acs.B01003001', 'us.census.tiger.census_tract', '2010 - 2014'),
    ('us.census.acs.B01003001_quantile', 'us.census.tiger.census_tract', '2010 - 2014'),
    ('us.census.acs.B01003001', 'us.census.tiger.block_group', '2010 - 2014'),
    ('us.census.spielman_singleton_segments.X10', 'us.census.tiger.census_tract', '2010 - 2014'),
    ('us.zillow.AllHomes_Zhvi', 'us.census.tiger.zcta5', '2014-01'),
    ('us.zillow.AllHomes_Zhvi', 'us.census.tiger.zcta5', '2016-03'),
    ('whosonfirst.wof_country_geom', 'whosonfirst.wof_country_geom', '2016'),
    ('us.census.tiger.zcta5_clipped', 'us.census.tiger.zcta5_clipped', '2014'),
    ('us.census.tiger.block_group_clipped', 'us.census.tiger.block_group_clipped', '2014'),
]

# Generated fixture files (relative to the repository root).
OUTFILE_PATH = 'src/pg/test/fixtures/load_fixtures.sql'
DROPFILE_PATH = 'src/pg/test/fixtures/drop_fixtures.sql'
def dump(cols, tablename, where=''):
    """Append schema + data for observatory.<tablename> to the fixture files.

    Writes a matching DROP statement to DROPFILE_PATH, then appends to
    OUTFILE_PATH in order: the pg_dump pre-data DDL, a COPY ... FROM stdin
    header, the CSV rows selected by *cols*/*where*, and the `\\.` COPY
    terminator. The interleaving of Python writes and shell `>>` appends is
    order-critical, so each open() is closed before the next subprocess runs.

    Assumes pg_dump/psql are on PATH and pick up the same PG* environment
    variables as DB_CONN -- TODO confirm when running outside CI.
    """
    # Record the teardown statement for this table.
    with open(DROPFILE_PATH, 'a') as dropfile:
        dropfile.write('DROP TABLE IF EXISTS observatory.{tablename};\n'.format(
            tablename=tablename,
        ))
    # Schema only (--section=pre-data); the sed filters strip pg_catalog
    # type prefixes, force the observatory schema onto CREATE TABLE, and
    # drop ownership so the fixture loads under any role.
    # NOTE(review): shell=True with interpolated identifiers -- trusted,
    # fixture-local input only.
    subprocess.check_call('pg_dump -x --section=pre-data -t observatory.{tablename} '
                          ' | sed "s:, pg_catalog::" '
                          ' | sed "s:CREATE TABLE :CREATE TABLE observatory.:" '
                          ' | sed "s:ALTER TABLE.*OWNER.*::" '
                          ' >> {outfile}'.format(
                              tablename=tablename,
                              outfile=OUTFILE_PATH,
                          ), shell=True)
    # COPY header must be in the file before psql appends the CSV rows.
    with open(OUTFILE_PATH, 'a') as outfile:
        outfile.write('COPY observatory."{}" FROM stdin WITH CSV HEADER;\n'.format(tablename))
    subprocess.check_call('''
    psql -c "COPY (SELECT {cols} \
    FROM observatory.{tablename} {where}) \
    TO STDOUT WITH CSV HEADER" >> {outfile}'''.format(
        cols=cols,
        tablename=tablename,
        where=where,
        outfile=OUTFILE_PATH,
    ), shell=True)
    # End-of-COPY marker expected by psql when replaying the fixture.
    with open(OUTFILE_PATH, 'a') as outfile:
        outfile.write('\\.\n\n')
def _metadata_where(tablename):
    """Return the WHERE clause limiting a metadata table to fixture rows.

    Each known metadata table is filtered on its id/timespan columns against
    the module-level `fixtures` triples; tables with no recognized filter
    column (e.g. obs_tag, obs_dump_version) are dumped in full ('').
    """
    if tablename == 'obs_meta':
        return "WHERE " + " OR ".join([
            "(numer_id, geom_id, numer_timespan) = ('{}', '{}', '{}')".format(
                numer_id, geom_id, timespan)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_meta_numer':
        return "WHERE " + " OR ".join([
            "numer_id IN ('{}', '{}')".format(numer_id, geom_id)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_meta_denom':
        return "WHERE " + " OR ".join([
            "denom_id IN ('{}', '{}')".format(numer_id, geom_id)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_meta_geom':
        return "WHERE " + " OR ".join([
            "geom_id IN ('{}', '{}')".format(numer_id, geom_id)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_meta_timespan':
        return "WHERE " + " OR ".join([
            "timespan_id = ('{}')".format(timespan)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_column':
        return "WHERE " + " OR ".join([
            "id IN ('{}', '{}')".format(numer_id, geom_id)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename in ('obs_column_table', 'obs_column_tag'):
        return "WHERE " + " OR ".join([
            "column_id IN ('{}', '{}')".format(numer_id, geom_id)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_column_to_column':
        return "WHERE " + " OR ".join([
            "source_id IN ('{}', '{}') OR target_id IN ('{}', '{}')".format(
                numer_id, geom_id, numer_id, geom_id)
            for numer_id, geom_id, timespan in fixtures
        ])
    elif tablename == 'obs_table':
        return "WHERE " + " OR ".join([
            "timespan = '{}'".format(timespan)
            for numer_id, geom_id, timespan in fixtures
        ])
    return ''


def main():
    """Regenerate load_fixtures.sql and drop_fixtures.sql.

    Resolves each fixture triple to its physical (tablename, geoid colname)
    via the metadata, truncates both output files with fresh headers, dumps
    the filtered metadata tables, then dumps each data table restricted to a
    small geographic sample (Brooklyn geoids, '11%' ZCTAs, or a handful of
    whosonfirst country ids).

    Fix: the original mixed Python 2 print statements with Python 3 print()
    calls; all output now uses the print() function.
    """
    unique_tables = set()
    for column_id, boundary_id, timespan in fixtures:
        tablename_query = get_tablename_query(column_id, boundary_id, timespan)
        tablename, colname = query(tablename_query).fetchone()
        table_colname = (tablename, colname, boundary_id)
        if table_colname not in unique_tables:
            print(table_colname)
            unique_tables.add(table_colname)
    print(unique_tables)

    # Truncate both fixture files and write their psql headers.
    with open(OUTFILE_PATH, 'w') as outfile:
        outfile.write('SET client_min_messages TO WARNING;\n\\set ECHO none\n')
        outfile.write('CREATE SCHEMA IF NOT EXISTS observatory;\n\n')
    with open(DROPFILE_PATH, 'w') as dropfile:
        dropfile.write('SET client_min_messages TO WARNING;\n\\set ECHO none\n')

    # Metadata tables, filtered to the rows the fixtures actually reference.
    for tablename in METADATA_TABLES:
        print(tablename)
        dump('*', tablename, _metadata_where(tablename))

    # Data tables, restricted by geoid so the fixtures stay small.
    for tablename, colname, boundary_id in unique_tables:
        if 'zcta5' in boundary_id:
            where, compare = "'11%'", 'LIKE'
        elif 'whosonfirst' in boundary_id:
            where = "('85632785','85633051','85633111','85633147','85633253','85633267')"
            compare = 'IN'
        else:
            where, compare = "'36047%'", 'LIKE'
        filter_clause = "WHERE {}::text {} {}".format(colname, compare, where)
        print(' '.join(['*', tablename, filter_clause]))
        dump('*', tablename, filter_clause)


if __name__ == '__main__':
    main()