break apart multipolygons as part of getgeometryscores

This commit is contained in:
John Krauss 2017-01-31 17:21:04 +00:00
parent 0f372604db
commit 9b115baf32

View File

@ -413,6 +413,173 @@ END
$$ LANGUAGE plpgsql; $$ LANGUAGE plpgsql;
-- select st_geometrytype(
-- st_collect(array_agg(st_envelope(the_geom)))::text)
-- from observatory.obs_9812b21f90f3e6a885dc546a3c6ad32e0190d723 tablesample system(
-- (select least(100, 100.0 - 100 * ((count(*) - 50.0) / count(*)))
-- from observatory.obs_9812b21f90f3e6a885dc546a3c6ad32e0190d723))
-- ;
--
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_collect(st_setsrid(st_envelope(the_geom), 4326))
-- from observatory.obs_78fb6c1d6ff6505225175922c2c389ce48d7632c
-- where geoid like '360470001%'), null, 1
-- )
-- order by score desc;
--
-- -- block group -> finds block group, 2s
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_collect(st_setsrid(st_envelope(the_geom), 4326))
-- from observatory.obs_78fb6c1d6ff6505225175922c2c389ce48d7632c tablesample system(
-- (select least(100, 100.0 - 100 * ((count(*) - 50.0) / count(*)))
-- from observatory.obs_78fb6c1d6ff6505225175922c2c389ce48d7632c)))
-- , null, 3
-- )
-- order by score desc;
-- -- simple, alternative approach -> finds block group, 2s
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_setsrid(st_extent(the_geom), 4326)
-- from observatory.obs_78fb6c1d6ff6505225175922c2c389ce48d7632c),
-- null, (select count(*) from observatory.obs_78fb6c1d6ff6505225175922c2c389ce48d7632c)::int
-- )
-- order by score desc;
--
-- -- census tract -> finds census tract, 1.6s
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_collect(st_setsrid(st_envelope(the_geom), 4326))
-- from observatory.obs_87a814e485deabe3b12545a537f693d16ca702c2 tablesample system(
-- (select least(100, 100.0 - 100 * ((count(*) - 50.0) / count(*)))
-- from observatory.obs_87a814e485deabe3b12545a537f693d16ca702c2)))
-- , null, 3
-- )
-- order by score desc;
-- -- simple, alternative approach -> finds census tract, 1.3s
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_setsrid(st_extent(the_geom), 4326)
-- from observatory.obs_87a814e485deabe3b12545a537f693d16ca702c2),
-- null, (select count(*) from observatory.obs_87a814e485deabe3b12545a537f693d16ca702c2)::int
-- )
-- order by score desc;
--
-- -- zcta5 -> finds zcta5
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_collect(st_setsrid(st_envelope(the_geom), 4326))
-- from observatory.obs_c4411eba732408d47d73281772dbf03d60645dec tablesample system(
-- (select least(100, 100.0 - 100 * ((count(*) - 50.0) / count(*)))
-- from observatory.obs_c4411eba732408d47d73281772dbf03d60645dec)))
-- , null, 3
-- )
-- order by score desc;
-- -- simple, alternative approach -> finds zcta5, 1.6s
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_setsrid(st_extent(the_geom), 4326)
-- from observatory.obs_c4411eba732408d47d73281772dbf03d60645dec),
-- null, (select count(*) from observatory.obs_c4411eba732408d47d73281772dbf03d60645dec)::int
-- )
-- order by score desc;
--
-- -- county -> finds unified school district or county
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_collect(st_setsrid(st_envelope(the_geom), 4326))
-- from observatory.obs_0310c639744a2014bb1af82709228f05b59e7d3d tablesample system(
-- (select least(100, 100.0 - 100 * ((count(*) - 50.0) / count(*)))
-- from observatory.obs_0310c639744a2014bb1af82709228f05b59e7d3d)))
-- , null, 3
-- )
-- order by score desc;
-- -- fails for subset (prefers puma)
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_collect(st_setsrid(st_envelope(the_geom), 4326))
-- from observatory.obs_0310c639744a2014bb1af82709228f05b59e7d3d tablesample system(
-- (select least(100, 100.0 - 100 * ((count(*) - 50.0) / count(*)))
-- from observatory.obs_0310c639744a2014bb1af82709228f05b59e7d3d where geoid like '36%')) where geoid like '36%')
-- , null, 3
-- )
-- order by score desc;
-- -- simple, alternative approach -> finds county, 1s
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_setsrid(st_extent(the_geom), 4326)
-- from observatory.obs_0310c639744a2014bb1af82709228f05b59e7d3d),
-- null, (select count(*) from observatory.obs_0310c639744a2014bb1af82709228f05b59e7d3d)::int
-- )
-- order by score desc;
-- -- fails for subset (prefers congressional district)
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_setsrid(st_extent(the_geom), 4326)
-- from observatory.obs_0310c639744a2014bb1af82709228f05b59e7d3d where geoid like '36%'),
-- null, (select count(*) from observatory.obs_0310c639744a2014bb1af82709228f05b59e7d3d where geoid like '36%')::int
-- )
-- order by score desc;
--
-- -- state -> finds congressional district (we're not adding up properly)
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_collect(st_setsrid(st_envelope(the_geom), 4326))
-- from observatory.obs_9812b21f90f3e6a885dc546a3c6ad32e0190d723 tablesample system(
-- (select least(100, 100.0 - 100 * ((count(*) - 50.0) / count(*)))
-- from observatory.obs_9812b21f90f3e6a885dc546a3c6ad32e0190d723)))
-- , null, 3
-- )
-- order by score desc;
-- -- simple, alternative approach -> finds congressional district (we're not adding up properly)
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_setsrid(st_extent(the_geom), 4326)
-- from observatory.obs_9812b21f90f3e6a885dc546a3c6ad32e0190d723),
-- null, (select count(*) from observatory.obs_9812b21f90f3e6a885dc546a3c6ad32e0190d723)::int
-- )
-- order by score desc;
--
-- -- au SA4 (only 106 of them) -> finds au.geo.SUA
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_collect(st_setsrid(st_envelope(the_geom), 4326))
-- from observatory.obs_b5b7ab6bbfc1acdd58e7bf770655274468c10988 tablesample system(
-- (select least(100, 100.0 - 100 * ((count(*) - 200.0) / count(*)))
-- from observatory.obs_b5b7ab6bbfc1acdd58e7bf770655274468c10988)))
-- , null, 3
-- )
-- order by score desc;
-- -- simple, alternative approach -> finds SA3 (are we not adding up properly?)
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_setsrid(st_extent(the_geom), 4326)
-- from observatory.obs_b5b7ab6bbfc1acdd58e7bf770655274468c10988),
-- null, (select count(*) from observatory.obs_b5b7ab6bbfc1acdd58e7bf770655274468c10988)::int
-- )
-- order by score desc;
--
-- -- au SA4 (54K of them) -> finds au.geo.SA1, sometimes SOSR
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_collect(st_setsrid(st_envelope(the_geom), 4326))
-- from observatory.obs_c846ae4ea19a71b5026754ec730334e1828c09f2 tablesample system(
-- (select least(100, 100.0 - 100 * ((count(*) - 50.0) / count(*)))
-- from observatory.obs_c846ae4ea19a71b5026754ec730334e1828c09f2)))
-- , null, 3
-- )
-- order by score desc;
-- -- simple, alternative approach -> finds SA1, fraction of a sec
-- select *
-- from cdb_observatory._obs_getgeometryscores(
-- (select st_setsrid(st_extent(the_geom), 4326)
-- from observatory.obs_c846ae4ea19a71b5026754ec730334e1828c09f2),
-- null, (select count(*) from observatory.obs_c846ae4ea19a71b5026754ec730334e1828c09f2)::int
-- )
-- order by score desc;
CREATE OR REPLACE FUNCTION cdb_observatory._OBS_GetGeometryScores( CREATE OR REPLACE FUNCTION cdb_observatory._OBS_GetGeometryScores(
bounds Geometry(Geometry, 4326) DEFAULT NULL, bounds Geometry(Geometry, 4326) DEFAULT NULL,
filter_geom_ids TEXT[] DEFAULT NULL, filter_geom_ids TEXT[] DEFAULT NULL,
@ -435,27 +602,34 @@ BEGIN
filter_geom_ids := COALESCE(filter_geom_ids, (ARRAY[])::TEXT[]); filter_geom_ids := COALESCE(filter_geom_ids, (ARRAY[])::TEXT[]);
-- Very complex geometries simply fail. For a boundary check, we can -- Very complex geometries simply fail. For a boundary check, we can
-- comfortably get away with the simplicity of an envelope -- comfortably get away with the simplicity of an envelope
IF ST_Npoints(bounds) > 10000 THEN --IF ST_Npoints(bounds) > 10000 THEN
bounds := ST_Envelope(bounds); -- bounds := ST_Envelope(bounds);
END IF; --END IF;
RETURN QUERY RETURN QUERY
EXECUTE $string$ EXECUTE $string$
WITH clipped_geom AS ( WITH expanded_geom AS (
SELECT column_id, table_id SELECT CASE WHEN $1 IS NOT NULL THEN generate_series(1, ST_NumGeometries($1))
, CASE WHEN $1 IS NOT NULL THEN ST_Clip(tile, $1, True) -- -20 ELSE 1 END AS exgeom_id,
CASE WHEN $1 IS NOT NULL THEN (ST_Dump($1)).geom
ELSE NULL END AS exgeom
),
clipped_geom AS (
SELECT column_id, table_id, exgeom_id, exgeom
, CASE WHEN exgeom IS NOT NULL THEN ST_Clip(tile, exgeom, True) -- -20
ELSE tile END clipped_tile ELSE tile END clipped_tile
, tile , tile
FROM observatory.obs_column_table_tile_simple FROM observatory.obs_column_table_tile_simple, expanded_geom
WHERE ($1 IS NULL OR ST_Intersects($1, tile)) WHERE (exgeom IS NULL OR ST_Intersects(exgeom, tile))
AND (column_id = ANY($2) OR cardinality($2) = 0) AND (column_id = ANY($2) OR cardinality($2) = 0)
), clipped_geom_countagg AS ( ), clipped_geom_countagg AS (
SELECT column_id, table_id SELECT column_id, table_id, exgeom_id
, BOOL_AND(ST_BandIsNoData(clipped_tile, 1)) nodata , BOOL_AND(ST_BandIsNoData(clipped_tile, 1)) nodata
, ST_CountAgg(clipped_tile, 1, False)::Numeric pixels -- -10 , ST_CountAgg(clipped_tile, 1, False)::Numeric pixels -- -10
FROM clipped_geom FROM clipped_geom
GROUP BY column_id, table_id GROUP BY column_id, table_id, exgeom_id
), clipped_geom_reagg AS ( ), clipped_geom_reagg AS (
SELECT COUNT(*)::BIGINT cnt, a.column_id, a.table_id, SELECT COUNT(*)::BIGINT cnt, a.column_id, a.table_id, a.exgeom_id,
cdb_observatory.FIRST(exgeom) exgeom,
cdb_observatory.FIRST(nodata) first_nodata, cdb_observatory.FIRST(nodata) first_nodata,
cdb_observatory.FIRST(pixels) first_pixel, cdb_observatory.FIRST(pixels) first_pixel,
cdb_observatory.FIRST(tile) first_tile, cdb_observatory.FIRST(tile) first_tile,
@ -464,26 +638,28 @@ BEGIN
FROM clipped_geom_countagg a, clipped_geom b FROM clipped_geom_countagg a, clipped_geom b
WHERE a.table_id = b.table_id WHERE a.table_id = b.table_id
AND a.column_id = b.column_id AND a.column_id = b.column_id
GROUP BY a.column_id, a.table_id AND a.exgeom_id = b.exgeom_id
GROUP BY a.column_id, a.table_id, a.exgeom_id
), final AS ( ), final AS (
SELECT SELECT
cnt, table_id, column_id MAX(cnt) cnt, table_id, column_id
, NULL::Numeric AS notnull_percent , NULL::Numeric AS notnull_percent
, (CASE WHEN first_nodata IS FALSE , (percentile_cont(0.5) within group (order by (CASE WHEN first_nodata IS FALSE
THEN sum_geoms THEN sum_geoms
ELSE COALESCE(ST_Value(first_tile, 1, ST_PointOnSurface($1)), 0) ELSE COALESCE(ST_Value(first_tile, 1, ST_PointOnSurface(exgeom)), 0)
* (ST_Area($1) / ST_Area(ST_PixelAsPolygon(first_tile, 0, 0)) * (ST_Area(exgeom) / ST_Area(ST_PixelAsPolygon(first_tile, 0, 0))
* first_pixel) -- -20 * first_pixel) -- -20
END)::Numeric END)::Numeric))::numeric
AS numgeoms AS numgeoms
, (CASE WHEN first_nodata IS FALSE , (percentile_cont(0.5) within group (order by (CASE WHEN first_nodata IS FALSE
THEN mean_fill THEN mean_fill
ELSE COALESCE(ST_Value(first_tile, 2, ST_PointOnSurface($1))::Numeric / 255, 0) -- -2 ELSE COALESCE(ST_Value(first_tile, 2, ST_PointOnSurface(exgeom))::Numeric / 255, 0) -- -2
END)::Numeric END)::Numeric))::numeric
AS percentfill AS percentfill
, null::numeric estnumgeoms , null::numeric estnumgeoms
, null::numeric meanmediansize , null::numeric meanmediansize
FROM clipped_geom_reagg FROM clipped_geom_reagg
GROUP BY table_id, column_id
) SELECT ) SELECT
((100.0 / (1+abs(log(0.0001 + $3) - log(0.0001 + numgeoms::Numeric)))) * percentfill)::Numeric ((100.0 / (1+abs(log(0.0001 + $3) - log(0.0001 + numgeoms::Numeric)))) * percentfill)::Numeric
AS score, * AS score, *