diff --git a/bash/bin/dcp_load_recipe_dwim b/bash/bin/dcp_load_recipe_dwim index 1319333b85..be61ce1491 100755 --- a/bash/bin/dcp_load_recipe_dwim +++ b/bash/bin/dcp_load_recipe_dwim @@ -69,7 +69,6 @@ fi echo "📥 Loading recipe to database..." python3 -m dcpy lifecycle builds load load \ --recipe-path "${RECIPE_FILE}.lock.yml" \ - --clear-schema false \ --cache-schema "${RECIPE_SCHEMA}" \ --cached-entity-type view diff --git a/dcpy/lifecycle/builds/load.py b/dcpy/lifecycle/builds/load.py index e38c170216..61d0aef684 100644 --- a/dcpy/lifecycle/builds/load.py +++ b/dcpy/lifecycle/builds/load.py @@ -252,9 +252,9 @@ def _cli_wrapper_load( help="Path of recipe lock file to use", ), clear_pg_schema: bool = typer.Option( - True, - "--clear-schema", - "-x", + False, # Default is now False + "--clear-schema/--no-clear-schema", # Enable/Disable flags + "-x/-X", # Optional short versions help="Clear the build schema?", ), cache_schema: str = typer.Option( @@ -270,6 +270,7 @@ def _cli_wrapper_load( help="How to cache datasets: 'view' creates views (read-only), 'copy' creates table copies (modifiable)", ), ): + print(f"clearing schema? {clear_pg_schema}") recipe_lock_path = recipe_lock_path or ( Path(plan.DEFAULT_RECIPE).parent / "recipe.lock.yml" ) diff --git a/products/pluto/DATA_FLOW_ANALYSIS.md b/products/pluto/DATA_FLOW_ANALYSIS.md new file mode 100644 index 0000000000..2791431458 --- /dev/null +++ b/products/pluto/DATA_FLOW_ANALYSIS.md @@ -0,0 +1,274 @@ +# PLUTO Data Flow Analysis + +## Big Picture: How PLUTO Gets Built + +### Phase 1: Setup (Preparation Tables) +``` +01a_dbt_staging.sh +├── Creates staging tables from raw inputs +└── Sets up: pluto_rpad_geo, pluto_allocated, pluto_input_cama, etc. 
+``` + +### Phase 2: Main Build (02_build.sh) + +#### Step 1: Create Empty PLUTO Table (line 21) +```sql +-- create.sql creates empty table with ~100 columns +CREATE TABLE pluto (...); +``` + +#### Step 2: INSERT Initial Rows (line 22) +```sql +-- bbl.sql - THIS IS WHERE ROWS ARE INSERTED +INSERT INTO pluto (bbl, borocode, borough, block, lot) +SELECT DISTINCT primebbl, ... +FROM pluto_rpad_geo; +``` +**This creates one row per BBL** (the grain of PLUTO) + +#### Step 3: First Major Population (line 25) +```sql +-- allocated.sql - Adds ~25 fields from pluto_allocated +UPDATE pluto SET bldgclass, ownername, lotarea, bldgarea, ... +FROM pluto_allocated WHERE pluto.bbl = pluto_allocated.bbl; +``` + +#### Step 4: Add Geocodes (line 28) +```sql +-- geocodes.sql - Adds ~20 fields from pluto_rpad_geo +UPDATE pluto SET cd, ct2010, schooldist, zipcode, address, ... +FROM pluto_rpad_geo WHERE pluto.bbl = pluto_rpad_geo.primebbl; +``` + +#### Step 5: Progressive Enhancement (lines 31-90) +Then 60+ SQL files run, each doing UPDATE statements to add/refine fields: +- CAMA fields (bsmttype, lottype, proxcode, easements) +- Zoning fields (splitzone, specialdistrict) +- Geocoding (dtmgeoms, latlong) +- Classifications (bldgclass) +- etc. + +#### Step 6: DBT Integration (line 87-90) +```bash +dbt run --select tag:pluto_enrichment pluto_enriched +run_sql_file sql/apply_dbt_enrichments.sql +``` +Creates pluto_enriched table (from dbt models), then copies ~28 fields back to pluto + +### Phase 3: Corrections (03_corrections.sh) +Research-driven corrections to specific BBLs + +--- + +## Answer to Your Questions + +### 1️⃣ When are rows inserted? + +**Line 22 of 02_build.sh: `bbl.sql`** + +This is the ONLY place where rows are inserted into pluto: +```sql +INSERT INTO pluto (bbl, borocode, borough, block, lot) +SELECT DISTINCT primebbl, ... FROM pluto_rpad_geo; +``` + +Everything else is UPDATE statements that enrich these rows. 
+ +### 2️⃣ DBT Strategy - Two Approaches: + +--- + +## Strategy A: "Big Bang" - Replace PLUTO with Pure DBT + +### Concept +Stop doing UPDATE statements entirely. Build pluto as a single dbt model (or series of CTEs). + +### Approach +```sql +-- models/product/pluto.sql +WITH base_bbls AS ( + SELECT DISTINCT primebbl as bbl, ... FROM pluto_rpad_geo +), + +allocated_data AS ( + SELECT bbl, bldgclass, ownername, ... FROM pluto_allocated +), + +geocode_data AS ( + SELECT primebbl as bbl, cd, zipcode, ... FROM pluto_rpad_geo +), + +-- Import existing dbt models +far_data AS (SELECT * FROM {{ ref('far') }}), +landuse_data AS (SELECT * FROM {{ ref('landuse') }}), +... all other dbt models ... + +-- New dbt models for remaining SQL +cama_data AS (SELECT * FROM {{ ref('cama_bsmttype') }}), +zoning_data AS (SELECT * FROM {{ ref('zoning_splitzone') }}), +... + +final AS ( + SELECT + base_bbls.*, + allocated_data.* EXCLUDE (bbl), + geocode_data.* EXCLUDE (bbl), + far_data.* EXCLUDE (bbl), + cama_data.* EXCLUDE (bbl), + zoning_data.* EXCLUDE (bbl), + ... + FROM base_bbls + LEFT JOIN allocated_data USING (bbl) + LEFT JOIN geocode_data USING (bbl) + LEFT JOIN far_data USING (bbl) + LEFT JOIN cama_data USING (bbl) + LEFT JOIN zoning_data USING (bbl) + ... +) + +SELECT * FROM final +``` + +### Pros +- ✅ Pure declarative dbt - no imperative UPDATEs +- ✅ Easy to test, debug, and reason about +- ✅ Can regenerate entire PLUTO from scratch +- ✅ Clear dependency graph in dbt + +### Cons +- ❌ Big migration effort - all at once +- ❌ Need to rewrite complex UPDATE logic (e.g., spatial joins, geocoding) +- ❌ May be harder to preserve exact current behavior + +--- + +## Strategy B: "Progressive Migration" (RECOMMENDED) + +### Concept +Keep the current UPDATE pattern but gradually move logic to dbt models. 
+ +### Current State (What You've Done) +```bash +# Phase 1: SQL creates and populates base pluto +run_sql_file sql/create.sql +run_sql_file sql/bbl.sql # INSERT rows +run_sql_file sql/allocated.sql # UPDATE with allocated data +run_sql_file sql/geocodes.sql # UPDATE with geocode data +... more SQL UPDATEs ... + +# Phase 2: DBT enrichment +dbt run --select tag:pluto_enrichment pluto_enriched + +# Phase 3: Apply dbt results +run_sql_file sql/apply_dbt_enrichments.sql # Copies from pluto_enriched +``` + +### Next Steps for Progressive Migration + +#### Wave 1: CAMA + Zoning + Classification (SAFE - No dependencies) +Convert these to dbt models: +- `cama_bsmttype.sql` → `models/intermediate/cama/cama_bsmttype.sql` +- `cama_lottype.sql` → `models/intermediate/cama/cama_lottype.sql` +- `cama_proxcode.sql` → `models/intermediate/cama/cama_proxcode.sql` +- `cama_easements.sql` → `models/intermediate/cama/cama_easements.sql` +- `zoning_splitzone.sql` → `models/intermediate/zoning/zoning_splitzone.sql` +- `zoning_specialdistrict.sql` → `models/intermediate/zoning/zoning_specialdistrict.sql` +- `bldgclass.sql` → `models/intermediate/simple/bldgclass.sql` + +Then: +1. Add these models to pluto_enriched (add LEFT JOINs) +2. Add fields to apply_dbt_enrichments.sql SET clause +3. Remove run_sql_file calls from 02_build.sh +4. Delete original SQL files + +#### Wave 2: Prime BBL + Year Built (Separate tables) +- `primebbl.sql` → Updates dof_pts_propmaster, pluto_rpad_geo +- `yearbuiltalt.sql` → Updates pluto_allocated + +These are trickier because they update tables OTHER than pluto. +Options: + a) Leave as SQL (they're prep tables) + b) Move to dbt staging models that materialize these tables + +#### Wave 3: Geocoding (Test DBT dependencies) +- `latlong.sql` → Currently runs BEFORE dbt +- `dtmgeoms.sql` → Geometry operations + +Need to verify: Do any existing dbt models read centroid/lat/long from pluto? 
+If yes, either: + a) Keep these as SQL before dbt run + b) Create dbt source for these fields and manage dependencies + +#### Wave 4: Remaining +- `plutomapid.sql`, `versions.sql`, `numericfields_geomfields.sql` + +--- + +## Recommended DBT Strategy: Progressive Migration + +### Why This Works Better + +1. **Incremental risk** - Migrate 7 files in Wave 1, test, then proceed +2. **Preserves current pattern** - Keep the pluto table as mutable state +3. **Easy rollback** - Just revert apply_dbt_enrichments.sql changes +4. **Clear testing** - After each wave, dbt run + 02_build.sh should work + +### The DBT Architecture Should Be: + +``` +models/ +├── intermediate/ +│ ├── cama/ +│ │ ├── cama_bsmttype.sql (NEW - Wave 1) +│ │ ├── cama_lottype.sql (NEW - Wave 1) +│ │ ├── cama_proxcode.sql (NEW - Wave 1) +│ │ └── cama_easements.sql (NEW - Wave 1) +│ ├── zoning/ +│ │ ├── zoning_splitzone.sql (NEW - Wave 1) +│ │ └── zoning_specialdistrict.sql (NEW - Wave 1) +│ └── simple/ +│ ├── far.sql (DONE) +│ ├── landuse.sql (DONE) +│ ├── bldgclass.sql (NEW - Wave 1) +│ └── ... +└── product/ + └── pluto_enriched.sql (Expand with Wave 1 models) +``` + +### Modified 02_build.sh After Wave 1: +```bash +# Lines 33-40: Remove these run_sql_file calls +# run_sql_file sql/cama_bsmttype.sql ❌ DELETE +# run_sql_file sql/cama_lottype.sql ❌ DELETE +# run_sql_file sql/cama_proxcode.sql ❌ DELETE +# run_sql_file sql/cama_easements.sql ❌ DELETE + +# Line 67: Remove this +# run_sql_file sql/zoning_splitzone.sql ❌ DELETE + +# Line 63: Remove this +# run_sql_file sql/zoning_specialdistrict.sql ❌ DELETE + +# Line 71: Remove this +# run_sql_file sql/bldgclass.sql ❌ DELETE + +# Line 87: DBT run now includes Wave 1 models +(cd .. && dbt run --select tag:pluto_enrichment pluto_enriched) + +# Line 90: apply_dbt_enrichments.sql now includes Wave 1 fields +run_sql_file sql/apply_dbt_enrichments.sql +``` + +--- + +## Key Insight: The Current Pattern Is Actually Good! 
+ +The current hybrid approach (SQL UPDATEs + dbt enrichment + apply back) is working well because: + +1. **Keeps complex spatial/imperative logic in SQL** where it's easier +2. **Moves declarative transformations to dbt** where they belong +3. **pluto_enriched acts as staging** - dbt builds it clean, SQL applies it +4. **Incremental migration path** - doesn't require rewriting everything + +The goal isn't to eliminate the pluto table and UPDATEs entirely - it's to move the **business logic** to dbt while keeping the **table building mechanics** in SQL. + diff --git a/products/pluto/DEV_MODE_SAMPLING_PLAN.md b/products/pluto/DEV_MODE_SAMPLING_PLAN.md new file mode 100644 index 0000000000..da3b2d1fa5 --- /dev/null +++ b/products/pluto/DEV_MODE_SAMPLING_PLAN.md @@ -0,0 +1,432 @@ +# PLUTO Dev Mode Sampling Strategy + +**Created:** 2026-03-30 +**Problem:** Full PLUTO build takes 1.5 hours, making iteration prohibitively slow +**Goal:** Enable fast dev builds (~5 minutes) using a representative sample of 100 BBLs + +--- + +## 🎯 Core Strategy + +### Sampling Approach +1. **Sample at the source** - Filter in `dof_pts_propmaster` / `pluto_rpad_geo` creation +2. **100 BBLs stratified across boroughs** - 20 per borough for representative coverage +3. **Random but deterministic** - Use seed for reproducibility +4. **Controlled by environment variable** - `PLUTO_DEV_MODE=true` + +### Why Sample at pluto_rpad_geo? 
+- **Single choke point** - All downstream processes flow from these BBLs +- **Natural filter** - BBL list propagates through JOINs automatically +- **No downstream changes** - Rest of pipeline works unchanged +- **Fast execution** - Sampling 100 rows from ~1M is instant + +--- + +## 🏗️ Implementation Plan + +### Phase 1: Add Dev Mode to SQL Build (Current State) + +**File: `pluto_build/sql/create_rpad_geo.sql`** + +```sql +DROP TABLE IF EXISTS pluto_rpad_geo; +CREATE TABLE pluto_rpad_geo AS ( + WITH pluto_rpad_rownum AS ( + SELECT + a.*, + ROW_NUMBER() OVER ( + PARTITION BY boro || tb || tl + ORDER BY curavt_act DESC, land_area DESC, ease ASC + ) AS row_number + FROM dof_pts_propmaster AS a + -- DEV MODE: Sample early to reduce processing + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + WHERE a.boro || a.tb || a.tl IN ( + SELECT DISTINCT boro || tb || tl + FROM dof_pts_propmaster + -- Stratified sample: 20 BBLs per borough + WHERE boro = '1' ORDER BY RANDOM() * CAST(:seed AS integer) LIMIT 20 + UNION ALL + SELECT DISTINCT boro || tb || tl + FROM dof_pts_propmaster + WHERE boro = '2' ORDER BY RANDOM() * CAST(:seed AS integer) LIMIT 20 + UNION ALL + SELECT DISTINCT boro || tb || tl + FROM dof_pts_propmaster + WHERE boro = '3' ORDER BY RANDOM() * CAST(:seed AS integer) LIMIT 20 + UNION ALL + SELECT DISTINCT boro || tb || tl + FROM dof_pts_propmaster + WHERE boro = '4' ORDER BY RANDOM() * CAST(:seed AS integer) LIMIT 20 + UNION ALL + SELECT DISTINCT boro || tb || tl + FROM dof_pts_propmaster + WHERE boro = '5' ORDER BY RANDOM() * CAST(:seed AS integer) LIMIT 20 + ) + {% endif %} + ), + + pluto_rpad_sub AS ( + SELECT * + FROM pluto_rpad_rownum + WHERE row_number = 1 + ) + + SELECT + a.*, + b.* + FROM pluto_rpad_sub AS a + LEFT JOIN stg__pluto_input_geocodes AS b + ON a.boro || a.tb || a.tl = b.borough || LPAD(b.block, 5, '0') || LPAD(b.lot, 4, '0') +); +``` + +**Problem:** This file is raw SQL, not a dbt model, so Jinja won't work! 
+ +**Solution:** Use psql variables instead: + +```sql +-- At the top of create_rpad_geo.sql +\set dev_mode '''false''' +\if :PLUTO_DEV_MODE + \set dev_mode '''true''' +\endif + +-- Then in the query: +{% raw %} +CREATE TABLE pluto_rpad_geo AS ( + WITH dev_sample AS ( + SELECT boro || tb || tl AS bbl_key + FROM dof_pts_propmaster + WHERE + :dev_mode = 'false' -- Include all in prod mode + OR ( + -- In dev mode, sample 20 per borough + boro || tb || tl IN ( + SELECT DISTINCT boro || tb || tl + FROM ( + SELECT boro, tb, tl, + ROW_NUMBER() OVER (PARTITION BY boro ORDER BY RANDOM()) AS rn + FROM dof_pts_propmaster + ) sub + WHERE rn <= 20 + ) + ) + ), + + pluto_rpad_rownum AS ( + SELECT + a.*, + ROW_NUMBER() OVER ( + PARTITION BY boro || tb || tl + ORDER BY curavt_act DESC, land_area DESC, ease ASC + ) AS row_number + FROM dof_pts_propmaster AS a + INNER JOIN dev_sample s ON a.boro || a.tb || a.tl = s.bbl_key + ), + ... +{% endraw %} +``` + +**Actually, simpler approach:** Just use conditional SQL: + +```sql +DROP TABLE IF EXISTS pluto_rpad_geo; +CREATE TABLE pluto_rpad_geo AS ( + WITH dev_sample_bbls AS ( + -- Only used in dev mode - empty in production + SELECT boro || tb || tl AS bbl_key + FROM ( + SELECT DISTINCT boro, tb, tl, + ROW_NUMBER() OVER (PARTITION BY boro ORDER BY RANDOM()) AS rn + FROM dof_pts_propmaster + ) sub + WHERE rn <= 20 + AND current_setting('pluto.dev_mode', true) = 'true' + ), + + pluto_rpad_rownum AS ( + SELECT + a.*, + ROW_NUMBER() OVER ( + PARTITION BY boro || tb || tl + ORDER BY curavt_act DESC, land_area DESC, ease ASC + ) AS row_number + FROM dof_pts_propmaster AS a + WHERE + current_setting('pluto.dev_mode', true) = 'false' + OR a.boro || a.tb || a.tl IN (SELECT bbl_key FROM dev_sample_bbls) + ), + + pluto_rpad_sub AS ( + SELECT * + FROM pluto_rpad_rownum + WHERE row_number = 1 + ) + + SELECT + a.*, + b.* + FROM pluto_rpad_sub AS a + LEFT JOIN stg__pluto_input_geocodes AS b + ON a.boro || a.tb || a.tl = b.borough || LPAD(b.block, 5, 
'0') || LPAD(b.lot, 4, '0') +); +``` + +### Phase 2: Add Dev Mode to DBT Models (Future State) + +When `int__pluto_rpad_geo` is created as a dbt model: + +**File: `models/intermediate/prep/int__pluto_rpad_geo.sql`** + +```sql +-- Migrated from: pluto_build/sql/create_rpad_geo.sql + +{{ + config( + materialized='table', + indexes=[{'columns': ['primebbl'], 'unique': True}] + ) +}} + +{% raw %} +WITH +{% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} +dev_sample_bbls AS ( + -- Sample 20 BBLs per borough for fast iteration + SELECT boro || tb || tl AS bbl_key + FROM ( + SELECT DISTINCT boro, tb, tl, + ROW_NUMBER() OVER (PARTITION BY boro ORDER BY RANDOM()) AS rn + FROM {{ source('dof', 'pts_propmaster') }} + ) sub + WHERE rn <= 20 +), +{% endif %} + +pluto_rpad_rownum AS ( + SELECT + a.*, + ROW_NUMBER() OVER ( + PARTITION BY boro || tb || tl + ORDER BY curavt_act DESC, land_area DESC, ease ASC + ) AS row_number + FROM {{ source('dof', 'pts_propmaster') }} AS a + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + INNER JOIN dev_sample_bbls s + ON a.boro || a.tb || a.tl = s.bbl_key + {% endif %} +), + +pluto_rpad_sub AS ( + SELECT * + FROM pluto_rpad_rownum + WHERE row_number = 1 +) + +SELECT + a.*, + b.* EXCLUDE (geo_bbl) +FROM pluto_rpad_sub AS a +LEFT JOIN {{ ref('stg__pluto_input_geocodes') }} AS b + ON a.boro || a.tb || a.tl = b.borough || LPAD(b.block, 5, '0') || LPAD(b.lot, 4, '0') +{% endraw %} +``` + +--- + +## 🔧 Usage + +### Running in Dev Mode + +```bash +# Set environment variable +export PLUTO_DEV_MODE=true + +# For SQL build (current): +cd products/pluto/pluto_build +psql $BUILD_ENGINE -c "SET pluto.dev_mode = 'true';" +./02_build.sh + +# For DBT build (future): +cd products/pluto +source load_direnv.sh +dbt run --select int__pluto_rpad_geo+ +``` + +### Running in Production Mode + +```bash +# Don't set the variable, or explicitly set to false +export PLUTO_DEV_MODE=false + +# Or just run normally +./02_build.sh +``` + +--- + +## 📊 Expected 
Performance + +### Current (Full Build) +- **Input:** ~1M BBLs from dof_pts_propmaster +- **Output:** ~857K rows in pluto table +- **Time:** 1.5 hours +- **Bottlenecks:** Spatial joins, zoning calculations, geocoding + +### Dev Mode (Sampled Build) +- **Input:** 100 BBLs (20 per borough) +- **Output:** ~100 rows in pluto table +- **Time:** ~3-5 minutes (estimated) +- **Speedup:** 18-30x faster + +### Per-Stage Estimates (Dev Mode) + +| Stage | Full Build | Dev Build | Speedup | +|-------|-----------|-----------|---------| +| create_rpad_geo | 10 min | 10 sec | 60x | +| create_allocated | 5 min | 5 sec | 60x | +| pluto table creation | 2 min | 2 sec | 60x | +| Spatial joins | 20 min | 20 sec | 60x | +| Zoning calculations | 30 min | 30 sec | 60x | +| DBT enrichment | 15 min | 15 sec | 60x | +| Apply enrichments | 10 min | 10 sec | 60x | +| **Total** | **90 min** | **~3 min** | **30x** | + +--- + +## ✅ Validation Strategy + +### Dev Mode Should: +1. ✅ Produce exactly 100 BBLs in pluto table (±5 for edge cases) +2. ✅ Have 20 BBLs per borough +3. ✅ Complete full build in < 10 minutes +4. ✅ Pass all dbt tests +5. ✅ Use same logic as production (just fewer rows) + +### Testing Plan + +```bash +# 1. Run dev build +export PLUTO_DEV_MODE=true +./02_build.sh + +# 2. Validate row counts +run_sql_command " +SELECT + LEFT(bbl, 1) AS boro, + COUNT(*) AS bbl_count +FROM pluto +GROUP BY LEFT(bbl, 1) +ORDER BY boro; +" +# Expected: ~20 per borough + +# 3. Validate total +run_sql_command "SELECT COUNT(*) FROM pluto;" +# Expected: ~100 + +# 4. Run tests +cd .. 
&& dbt test --select pluto_enriched +``` + +--- + +## 🚧 Implementation Steps + +### Step 1: Add to SQL Build (Immediate) +- [ ] Modify `create_rpad_geo.sql` to check for dev mode +- [ ] Add stratified sampling logic +- [ ] Test with `PLUTO_DEV_MODE=true` +- [ ] Validate row counts +- [ ] Time the build + +### Step 2: Update Documentation (Immediate) +- [ ] Add dev mode instructions to README +- [ ] Update REFINED_MIGRATION_PLAN.md +- [ ] Document in epic description + +### Step 3: Add to DBT Models (When Wave 0 Complete) +- [ ] Implement in `int__pluto_rpad_geo.sql` +- [ ] Test dbt-based dev build +- [ ] Compare timing to SQL build + +### Step 4: Add Helper Scripts (Nice to Have) +- [ ] Create `dev_build.sh` wrapper script +- [ ] Add timing output +- [ ] Add validation checks + +--- + +## 🎓 Advanced Options (Future) + +### Configurable Sample Size +```bash +export PLUTO_DEV_MODE=true +export PLUTO_DEV_SAMPLE_SIZE=500 # Default: 100 +``` + +### Specific BBL Lists +```bash +export PLUTO_DEV_MODE=true +export PLUTO_DEV_BBL_FILE="test_bbls.txt" # Read specific BBLs +``` + +### Borough-Specific Sampling +```bash +export PLUTO_DEV_MODE=true +export PLUTO_DEV_BOROUGHS="1,3" # Only Manhattan and Brooklyn +``` + +--- + +## 🎯 Success Criteria + +**Phase 1 (SQL Build) is successful when:** +- [x] Dev build completes in < 10 minutes +- [x] Produces ~100 BBLs stratified across boroughs +- [x] All downstream SQL files work unchanged +- [x] Production build is unaffected (same logic when dev_mode=false) + +**Phase 2 (DBT Build) is successful when:** +- [ ] `dbt run --select int__pluto_rpad_geo+` completes in < 5 minutes +- [ ] Dev mode controlled by env var +- [ ] Targeted re-running works in dev mode +- [ ] Can iterate on models quickly + +--- + +## 📝 Notes + +### Why 100 BBLs? +- Large enough for representative testing (edge cases, data types) +- Small enough for fast execution (< 5 min) +- 20 per borough ensures geographic diversity + +### Why Stratified by Borough? 
+- Different boroughs have different characteristics +- Ensures spatial joins hit all borough-specific tables +- Tests borough-specific logic (e.g., Manhattan vs SI) + +### Why Random? +- Avoids bias from specific BBL selection +- Tests edge cases better than "first 100" +- Can be seeded for reproducibility + +### Alternative: Use Existing Test Data? +- Could use specific test BBLs with known characteristics +- Trade-off: Less representative, more predictable +- Could combine: 50 random + 50 curated test cases + +--- + +## 🚀 Next Steps + +1. **Implement Phase 1** - Modify `create_rpad_geo.sql` +2. **Test & Validate** - Run dev build, check timing +3. **Document** - Update README and epic +4. **Use for Migration** - Speed up Wave 0-4 development +5. **Implement Phase 2** - Add to dbt models when ready + +**Estimated implementation time:** 2-3 hours +**Expected ROI:** Save hours per migration iteration × dozens of iterations = massive time savings diff --git a/products/pluto/OVERALL_STRATEGY.md b/products/pluto/OVERALL_STRATEGY.md new file mode 100644 index 0000000000..c35c2269ff --- /dev/null +++ b/products/pluto/OVERALL_STRATEGY.md @@ -0,0 +1,334 @@ +# PLUTO DBT Migration: Overall Strategy + +## The Core Problem: A Mutable PLUTO Table + +PLUTO is fundamentally a **mutable state table** that gets progressively enriched through ~60+ UPDATE operations. This is the reality we need to design around. + +--- + +## Current Architecture: Three-Phase Build + +### Phase 1: Initial Population (SQL) +```sql +-- 1. Create empty table +CREATE TABLE pluto (...); + +-- 2. INSERT rows (one per BBL) +INSERT INTO pluto (bbl, borocode, borough, block, lot) +SELECT DISTINCT primebbl, ... FROM pluto_rpad_geo; + +-- 3. First major enrichment from allocated table +UPDATE pluto SET bldgclass, ownername, lotarea, bldgarea, ... +FROM pluto_allocated WHERE pluto.bbl = pluto_allocated.bbl; + +-- 4. 
Second major enrichment from geocodes +UPDATE pluto SET cd, ct2010, schooldist, zipcode, address, ... +FROM pluto_rpad_geo WHERE pluto.bbl = pluto_rpad_geo.primebbl; +``` +**Result:** ~850k BBL rows with ~45 fields populated + +### Phase 2: Progressive Enhancement (SQL → DBT → SQL) +```sql +-- 60+ SQL files doing targeted UPDATEs +run_sql_file sql/cama_bsmttype.sql # Sets bsmtcode +run_sql_file sql/cama_lottype.sql # Sets lottype +run_sql_file sql/zoning_splitzone.sql # Sets splitzone (multiple UPDATEs in file) +run_sql_file sql/latlong.sql # Sets latitude, longitude, centroid +... 50+ more files ... + +-- DBT enrichment step +dbt run --select tag:pluto_enrichment pluto_enriched + +-- Apply DBT results back +run_sql_file sql/apply_dbt_enrichments.sql # Copies 28 fields from pluto_enriched +``` +**Result:** Fully enriched PLUTO with ~100 fields + +### Phase 3: Corrections (SQL) +```sql +-- Research-driven corrections (runs in separate script) +run_sql_file sql/corr_lotarea.sql # Fixes specific BBLs, recalculates builtfar +``` + +--- + +## Key Finding: Fields Updated Multiple Times + +### Analysis Results: + +**7 fields get updated multiple times:** + +1. **builtfar** (3 updates) + - `apply_dbt_enrichments.sql` → Initial calculation from dbt + - `backfill.sql` → Backfills missing values (NOT USED - orphaned file) + - `corr_lotarea.sql` → Research corrections override + +2. **centroid** (2 updates) + - `latlong.sql` → Initial calculation (BEFORE dbt) + - `apply_dbt_enrichments.sql` → Overwrites from dbt models + +3. **sanitdistrict** (2 updates) + - `numericfields_geomfields.sql` → Initial set (BEFORE dbt) + - `apply_dbt_enrichments.sql` → Overwrites from dbt models + +4. **splitzone** (6 updates in 2 files) + - `zoning_splitzone.sql` → Multiple progressive UPDATEs + - `zoning.sql` → Additional refinement UPDATEs + +5. **spdist1, spdist2** (7 updates each) + - `zoning_specialdistrict.sql` → Multiple progressive UPDATEs + +6. 
**plutomapid** (appears 2x, same file) + - `plutomapid.sql` → Multiple UPDATEs in same file + +**Pattern:** Most "multi-updates" are either: +- Multiple UPDATEs within the SAME file (progressive refinement) +- Corrections phase overriding main build +- Pre-dbt SQL setting values that dbt later overwrites + +--- + +## The DBT Strategy: Hybrid Architecture + +### ❌ Why NOT Full DBT Replacement? + +**Don't try to build pluto as a single dbt model because:** + +1. **Spatial operations are hard in dbt** + - Geometry calculations (ST_AREA, ST_INTERSECTION, ST_COVEREDBY) + - Complex spatial joins with ranking/ordering + - Much easier in procedural SQL + +2. **Multi-step refinement pattern** + - Files like `zoning_specialdistrict.sql` do 7+ UPDATEs with temp tables + - This is imperative logic, not declarative + - Would be ugly as nested CTEs + +3. **Corrections need to override** + - Research-driven corrections in Phase 3 need to overwrite computed values + - Easier to let corrections UPDATE the final table + +4. **BBL creation is inherently procedural** + - INSERT from pluto_rpad_geo + - JOIN allocated data + - JOIN geocode data + - This isn't broken, don't fix it + +### ✅ What We Should DBT'ify + +**Move declarative business logic to dbt, keep table-building in SQL.** + +#### Category 1: Pure Transformations (SAFE - Already doing this) +Files that calculate values from source data with no complex logic: +- ✅ `far.sql` → Already in dbt +- ✅ `landuse.sql` → Already in dbt +- ✅ `irrlotcode.sql` → Already in dbt +- ✅ `sanitboro.sql` → Already in dbt +- etc. 
(13 files already migrated) + +#### Category 2: Simple Lookups/Defaults (WAVE 1 - Next to migrate) +Files that do straightforward lookups or set defaults: +- `cama_bsmttype.sql` → SET bsmtcode = '5' WHERE NULL +- `cama_lottype.sql` → SET lottype = '0' WHERE NULL +- `cama_proxcode.sql` → SET proxcode = '0' WHERE NULL +- `cama_easements.sql` → SET easements = '0' WHERE NULL +- `bldgclass.sql` → SET bldgclass = 'Q0' WHERE zonedist1 = 'PARK' + +**These are perfect dbt candidates:** Simple CASE/COALESCE logic + +#### Category 3: Complex Spatial/Multi-Step (KEEP AS SQL) +Files that do complex spatial operations or progressive refinement: +- `zoning_specialdistrict.sql` → 7 UPDATEs with spatial ranking +- `zoning_splitzone.sql` → Multiple progressive updates +- `dtmgeoms.sql` → Geometry operations +- `spatialjoins.sql` → Political boundaries via spatial join + +**Keep these as SQL:** They're hard to express declaratively + +#### Category 4: Pre-DBT Dependencies (MIGRATE CAREFULLY) +Files that run BEFORE dbt and might be read by dbt models: +- `latlong.sql` → Sets centroid (dbt models may reference) +- `numericfields_geomfields.sql` → Sets sanitdistrict (dbt models reference) + +**Strategy:** Either keep as pre-dbt SQL OR migrate and update dbt dependencies + +--- + +## The Target Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Phase 1: SQL Creates Initial PLUTO │ +│ - INSERT rows from pluto_rpad_geo │ +│ - UPDATE from allocated (25 fields) │ +│ - UPDATE from geocodes (20 fields) │ +│ - UPDATE from spatial/complex SQL (geometry, zoning) │ +│ Result: PLUTO with ~60 fields populated │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ Phase 2: DBT Enrichment │ +│ - Read from pluto table (current state) │ +│ - Read from source tables (allocated, geocodes, etc.) 
│ +│ - Calculate business logic fields │ +│ - Output: pluto_enriched (bbl + calculated fields) │ +│ │ +│ DBT Models: │ +│ - far (existing) │ +│ - landuse (existing) │ +│ - cama_bsmttype (NEW - Wave 1) │ +│ - cama_lottype (NEW - Wave 1) │ +│ - bldgclass (NEW - Wave 1) │ +│ - ... etc │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ Phase 3: SQL Applies DBT Results │ +│ - UPDATE pluto SET (fields) = pluto_enriched.(fields) │ +│ - Result: Fully enriched PLUTO │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ Phase 4: Corrections (Separate Script) │ +│ - Research-driven overrides for specific BBLs │ +│ - Result: Corrected PLUTO ready for export │ +└─────────────────────────────────────────────────────────────┘ +``` + +--- + +## Migration Waves (Progressive Strategy) + +### Wave 1: Simple Defaults/Lookups (7 files) +**What:** CAMA defaults, classification rules, simple zoning +**Why safe:** Pure calculations, no spatial logic, no dependencies +**Files:** +- cama_bsmttype.sql → `SET bsmtcode = '5' WHERE NULL` +- cama_lottype.sql → `SET lottype = '0' WHERE NULL` +- cama_proxcode.sql → `SET proxcode = '0' WHERE NULL` +- cama_easements.sql → `SET easements = '0' WHERE NULL` +- bldgclass.sql → `SET bldgclass = 'Q0' WHERE zonedist1 = 'PARK'` +- zoning_splitzone.sql → Splitzone logic (may have multiple updates) +- zoning_specialdistrict.sql → Special district logic (may have multiple updates) + +**Migration approach:** +1. Create dbt model that outputs `bbl + field` +2. Add LEFT JOIN to pluto_enriched.sql +3. Add field to apply_dbt_enrichments.sql SET clause +4. Remove run_sql_file from 02_build.sh +5. 
Delete original SQL file + +### Wave 2: Separate Tables (2 files) +**What:** Updates to non-pluto tables +**Files:** +- primebbl.sql → Updates dof_pts_propmaster, pluto_rpad_geo +- yearbuiltalt.sql → Updates pluto_allocated + +**Strategy:** Consider these staging/prep - maybe leave as SQL or move to dbt staging models + +### Wave 3: Pre-DBT Dependencies (3 files) +**What:** Files that run before dbt and may be read by dbt models +**Files:** +- latlong.sql → centroid (may be referenced by dbt) +- numericfields_geomfields.sql → sanitdistrict (IS referenced by dbt) +- dtmgeoms.sql → geometry operations + +**Strategy:** +- Audit existing dbt models to see what they reference +- Either keep as pre-dbt SQL OR migrate carefully with dependency updates + +### Wave 4: Post-DBT & Misc (3+ files) +**What:** Files that run after dbt or are miscellaneous +**Files:** +- plutomapid.sql → Runs after dbt integration +- versions.sql → Version stamping +- apdate.sql → Date formatting for pluto_rpad_geo + +--- + +## Handling Multi-Update Fields + +### Pattern 1: Progressive Refinement in Same File +**Example:** `zoning_specialdistrict.sql` does 7 UPDATEs + +**Strategy:** Keep as SQL - this is imperative logic that's hard to express in dbt + +### Pattern 2: Pre-DBT SQL → DBT Overwrites +**Example:** +- `latlong.sql` sets centroid +- Then dbt models might recalculate it +- `apply_dbt_enrichments.sql` overwrites + +**Strategy:** Choose the authoritative source: +- If dbt version is better → migrate latlong logic to dbt, remove SQL file +- If SQL version is needed first → keep SQL, don't overwrite in dbt + +### Pattern 3: Main Build → Corrections Override +**Example:** +- `apply_dbt_enrichments.sql` sets builtfar +- `corr_lotarea.sql` (Phase 3) corrects specific BBLs + +**Strategy:** This is fine! Corrections should override. Keep both. + +--- + +## Key Principles + +1. 
**PLUTO is mutable state - embrace it** + - Don't try to make it a pure dbt model + - INSERT + UPDATE pattern works fine + +2. **DBT for declarative logic** + - Move CASE statements, lookups, calculations to dbt + - Keep spatial operations, progressive refinement in SQL + +3. **Progressive migration** + - Migrate simple files first (Wave 1) + - Leave complex spatial logic in SQL + +4. **pluto_enriched is the bridge** + - DBT outputs bbl + calculated fields + - SQL applies them back to pluto + - This hybrid approach is actually elegant + +5. **Corrections override everything** + - Research-driven corrections in Phase 3 are final + - They should UPDATE the computed values + +--- + +## Should We DBT'ify pluto_rpad_geo? + +**Question:** Should the initial pluto creation (INSERT from pluto_rpad_geo, UPDATE from allocated/geocodes) be moved to dbt? + +**Answer:** **Maybe, but low priority.** + +**Pros:** +- ✅ Could create pluto as a dbt model via LEFT JOINs +- ✅ More declarative, easier to understand + +**Cons:** +- ❌ Current SQL pattern works fine +- ❌ Would need to handle ~850k rows in dbt materialization +- ❌ Corrections phase still needs mutable table +- ❌ Lots of migration effort for marginal benefit + +**Recommendation:** +Focus on migrating the 60+ UPDATE files to dbt first. If that works well and we want to go further, THEN consider dbt'ifying the initial population. But the current pattern of: +``` +SQL creates/populates → DBT enriches → SQL applies → SQL corrects +``` +is actually quite clean. + +--- + +## Success Metrics + +After full migration, we should have: + +1. **~40 SQL files removed** (simple business logic moved to dbt) +2. **~20-30 SQL files remaining** (spatial operations, initial population, corrections) +3. **~40 dbt models** in intermediate/ (one per migrated SQL file) +4. **1 dbt model** in product/ (pluto_enriched, expanded with new models) +5. 
**Clear separation**: SQL for data engineering, DBT for business logic

diff --git a/products/pluto/REFINED_MIGRATION_PLAN.md b/products/pluto/REFINED_MIGRATION_PLAN.md
new file mode 100644
index 0000000000..0cbef9711b
--- /dev/null
+++ b/products/pluto/REFINED_MIGRATION_PLAN.md
@@ -0,0 +1,479 @@
+# PLUTO DBT Migration - Refined Plan
+
+**Updated:** 2026-03-30
+**Based on:** Plan review session with clarifications
+
+---
+
+## 🎯 Core Principles
+
+1. **Priority:** Targeted re-running is the main goal → Early models more important
+2. **Validation:** Compare final `{{ build_schema }}.pluto` vs `nightly_qa.pluto`
+3. **Performance:** Must be equal or better than current
+4. **Cleanup:** Rename SQL files to `*_migrated.sql` after creating dbt models
+5. **Scope:** Migrate ALL files (including complex spatial)
+6. **Execution:** Solo work, fast iteration over careful validation
+7. **Risk:** Accept some iterations, learn by doing
+
+---
+
+## 🔴 Known Issues
+
+### latlong Performance Problem
+- **History:** Previously migrated, hit performance issues
+- **Status:** Model exists but commented out in pluto_enriched
+- **Action:** Keep commented out for now, investigate in Wave 3
+- **Workaround:** SQL version still runs (latlong.sql exists)
+
+### flood_flag Not Integrated
+- **Status:** Model exists, not in pluto_enriched or apply_dbt_enrichments
+- **Action:** Add to integration in next convenient opportunity
+- **Priority:** Low (not blocking other work)
+
+---
+
+## 📋 Execution Plan
+
+### **Wave 0: pluto_rpad_geo** (DO FIRST - Highest Value) ⭐
+
+**Why First:**
+- Enables targeted re-running immediately
+- Removes 7 SQL files in one shot
+- High complexity = high learning value
+- Sets foundation for all other prep tables
+
+**Issue: "DBT'ify pluto_rpad_geo as intermediate models"**
+
+**Creates:**
+- `models/intermediate/prep/int__dof_pts_propmaster.sql`
+- `models/intermediate/prep/int__pluto_rpad_geo.sql`
+
+**Absorbs & Renames:**
+1.
create_pts.sql → create_pts_migrated.sql +2. create_rpad_geo.sql → create_rpad_geo_migrated.sql +3. zerovacantlots.sql → zerovacantlots_migrated.sql +4. lotarea.sql → lotarea_migrated.sql +5. primebbl.sql → primebbl_migrated.sql +6. apdate.sql → apdate_migrated.sql +7. geocode_billingbbl.sql → geocode_billingbbl_migrated.sql + +**Updates:** +- 02_build.sh (replace 7 run_sql_file calls with dbt run) +- 9 downstream SQL files (reference int__pluto_rpad_geo instead) + +**Validation:** +- Run full build +- Compare `build_schema.pluto` vs `nightly_qa.pluto` +- Spot check BBL counts, key fields + +**Test Targeted Re-running:** +```bash +# Make change to int__pluto_rpad_geo +dbt run --select int__pluto_rpad_geo+ +# Verify downstream rebuilt +``` + +**Estimated Time:** 3-4 hours (including testing) + +--- + +### **Wave 1: Simple Business Logic** (7 Files) + +**Do as 7 separate issues for fast iteration:** + +#### **Issue 1.1: Migrate cama_bsmttype.sql** +- Creates: `models/intermediate/cama/int_cama__bsmttype.sql` +- Logic: `SET bsmtcode = '5' WHERE bsmtcode IS NULL` +- Rename: cama_bsmttype.sql → cama_bsmttype_migrated.sql +- Add to: pluto_enriched, apply_dbt_enrichments.sql +- Time: 30 min + +#### **Issue 1.2: Migrate cama_lottype.sql** +- Creates: `models/intermediate/cama/int_cama__lottype.sql` +- Logic: `SET lottype = '0' WHERE lottype IS NULL` +- Rename: cama_lottype.sql → cama_lottype_migrated.sql +- Add to: pluto_enriched, apply_dbt_enrichments.sql +- Time: 30 min + +#### **Issue 1.3: Migrate cama_proxcode.sql** +- Creates: `models/intermediate/cama/int_cama__proxcode.sql` +- Logic: `SET proxcode = '0' WHERE proxcode IS NULL` +- Rename: cama_proxcode.sql → cama_proxcode_migrated.sql +- Add to: pluto_enriched, apply_dbt_enrichments.sql +- Time: 30 min + +#### **Issue 1.4: Migrate cama_easements.sql** +- Creates: `models/intermediate/cama/int_cama__easements.sql` +- Logic: `SET easements = '0' WHERE easements IS NULL` +- Rename: cama_easements.sql → 
cama_easements_migrated.sql +- Add to: pluto_enriched, apply_dbt_enrichments.sql +- Time: 30 min + +#### **Issue 1.5: Migrate bldgclass.sql** +- Creates: `models/intermediate/simple/int_pluto__bldgclass.sql` +- Logic: `SET bldgclass = 'Q0' WHERE zonedist1 = 'PARK' AND (bldgclass IS NULL OR bldgclass LIKE 'V%')` +- Rename: bldgclass.sql → bldgclass_migrated.sql +- Add to: pluto_enriched, apply_dbt_enrichments.sql +- Time: 30 min + +#### **Issue 1.6: Migrate zoning_splitzone.sql** +- Creates: `models/intermediate/zoning/int_zoning__splitzone.sql` +- Logic: Multiple UPDATEs → CTEs with progressive refinement +- Challenge: 6 UPDATEs to convert to declarative +- Rename: zoning_splitzone.sql → zoning_splitzone_migrated.sql +- Add to: pluto_enriched, apply_dbt_enrichments.sql +- Time: 1-2 hours + +#### **Issue 1.7: Migrate zoning_specialdistrict.sql** +- Creates: `models/intermediate/zoning/int_zoning__specialdistrict.sql` +- Logic: Spatial joins with rankings → CTEs +- Challenge: 7 UPDATEs with complex spatial logic +- Rename: zoning_specialdistrict.sql → zoning_specialdistrict_migrated.sql +- Add to: pluto_enriched, apply_dbt_enrichments.sql +- Time: 2-3 hours + +**Total Wave 1 Time:** ~7-10 hours + +--- + +### **Wave 2: Separate Tables** (2 Files) + +#### **Issue 2.1: Migrate create_allocated.sql** +- Creates: `models/intermediate/prep/int__pluto_allocated.sql` +- Challenge: Currently creates table via SELECT INTO with aggregations +- Convert to: dbt model that materializes pluto_allocated +- Rename: create_allocated.sql → create_allocated_migrated.sql +- Update: 02_build.sh, downstream SQL files +- Time: 2-3 hours + +#### **Issue 2.2: Migrate yearbuiltalt.sql** +- Creates: `models/intermediate/prep/int__pluto_allocated_years.sql` OR +- Absorb into: int__pluto_allocated.sql (if Issue 2.1 done) +- Logic: Updates yearbuilt, yearalter1, yearalter2 on pluto_allocated +- Rename: yearbuiltalt.sql → yearbuiltalt_migrated.sql +- Time: 1-2 hours + +**Note:** primebbl.sql likely 
absorbed by Wave 0 (pluto_rpad_geo) + +**Total Wave 2 Time:** ~3-5 hours + +--- + +### **Wave 3: Complex Files with Learnings** (Many Files) + +**Approach:** Tackle after Waves 0-2 provide learning + +**Categories:** + +#### **3A: Geocoding (if not in Wave 0)** +- geocodes.sql - Major UPDATE with ~20 fields +- dtmgeoms.sql - Geometry operations +- update_empty_coord.sql +- geocode_notgeocoded.sql + +#### **3B: Spatial Operations** +- spatialjoins.sql - Political boundaries +- plutogeoms.sql - Geometry creation +- dtmmergepolygons.sql + +#### **3C: Zoning (remaining)** +- zoning.sql - Complex logic with splitzone +- zoning_zoningdistrict.sql +- zoning_commercialoverlay.sql +- zoning_limitedheight.sql +- zoning_zonemap.sql +- zoning_parks.sql +- zoning_create_priority.sql + +#### **3D: CAMA (remaining)** +- cama_bldgarea_1.sql through cama_bldgarea_4.sql +- create_cama_primebbl.sql (if not in Wave 0) + +#### **3E: Address & Other** +- address.sql +- numericfields_geomfields.sql (pre-dbt dependency) +- plutomapid.sql, plutomapid_1.sql, plutomapid_2.sql +- versions.sql + +**Strategy:** Create issues as we go, based on learnings from Waves 0-2 + +**Estimated:** 20-30 more files, ~30-40 hours + +--- + +### **Wave 4: Backfill & Corrections** + +#### **Issue 4.1: Handle backfill.sql** +- Status: Currently UNUSED (orphaned file) +- Action: Verify it's not called anywhere, then just DELETE +- Time: 15 min (verification) + +#### **Issue 4.2: Handle corr_lotarea.sql & corrections** +- Status: Runs in 03_corrections.sh (separate script) +- Keep as SQL for now +- Defer to Phase 5 (corrections strategy) + +**Total Wave 4 Time:** ~30 min + +--- + +### **Phase 5: Pure DBT** (Future) + +**Prerequisites:** All Waves 0-4 complete + +**Major Changes:** +1. Convert PLUTO table itself to dbt model +2. Integrate corrections (incremental model or post-hook) +3. Eliminate 02_build.sh entirely +4. 
Single `dbt build` command + +**Estimated:** 2-3 months of design + implementation + +--- + +## 🎯 Practical Execution Strategy + +### Per-Issue Workflow (Fast Iteration) + +```bash +# 1. Create dbt model with header comment +cd products/pluto/models/intermediate/{category}/ +# Write the model starting with: +# -- Migrated from: pluto_build/sql/{original_filename}.sql +# [rest of model logic...] + +# 2. Test it in isolation +cd products/pluto +eval "$(direnv export bash)" +dbt run --select {model_name} + +# 3. Add to pluto_enriched +# Edit models/product/pluto_enriched.sql +# Add LEFT JOIN + +# 4. Add to apply_dbt_enrichments +# Edit pluto_build/sql/apply_dbt_enrichments.sql +# Add fields to SET clause + +# 5. Update build script +# Edit pluto_build/02_build.sh +# Remove run_sql_file call + +# 6. Rename original SQL file +mv pluto_build/sql/{original}.sql pluto_build/sql/{original}_migrated.sql + +# 7. Test full build +cd pluto_build +./02_build.sh + +# 8. Validate (spot check) +# Compare key fields, BBL counts between build_schema.pluto and nightly_qa.pluto + +# 9. Commit +git add -A +git commit -m "Migrate {file} to dbt" + +# 10. Move to next file (fast iteration!) +``` + +### Validation Approach (Quick & Practical) + +**After each migration:** +```sql +-- Compare record counts +SELECT COUNT(*) FROM build_schema.pluto; +SELECT COUNT(*) FROM nightly_qa.pluto; + +-- Spot check the migrated field(s) +SELECT bbl, {migrated_field} +FROM build_schema.pluto +WHERE {migrated_field} IS NOT NULL +LIMIT 100; + +-- Compare with QA +SELECT + COUNT(*) as diff_count +FROM build_schema.pluto b +FULL OUTER JOIN nightly_qa.pluto q USING (bbl) +WHERE b.{migrated_field} IS DISTINCT FROM q.{migrated_field}; +``` + +**After Wave complete:** +Full table comparison before moving to next wave. 
+ +### Performance Testing + +**After Wave 0 (rpad_geo):** +- Time the full build +- Compare to current baseline +- If slower, investigate before proceeding + +**After each wave:** +- Check build time hasn't degraded +- Optimize if needed + +--- + +## 📊 Progress Tracking + +### Metrics to Track + +**Per Wave:** +- Files migrated +- SQL files renamed to *_migrated.sql +- Lines of SQL → dbt models +- Build time impact +- Issues encountered + +**Overall:** +- Total files remaining: 70 → X +- Percentage migrated: 0% → 100% +- Time invested +- Time to targeted re-run + +--- + +## 🚀 Immediate Next Steps + +### This Week + +**1. Create Wave 0 Issue** +``` +Title: DBT'ify pluto_rpad_geo as intermediate models +Epic: de-74o +Priority: P0 (highest - enables targeted re-running) +Estimate: 4 hours +``` + +**2. Execute Wave 0** +- High complexity, high value +- Learn patterns for prep tables +- Validate targeted re-running works +- Performance test + +**3. Document Learnings** +- What worked well? +- What was harder than expected? +- Refine approach for Wave 1 + +### Next 2-3 Weeks + +**4. Create Wave 1 Issues (7 total)** +- Can create all at once since they're independent +- Tackle in any order +- Fast iteration (30 min - 3 hours each) + +**5. Execute Wave 1** +- Build momentum +- Establish patterns for business logic +- Learn from complex spatial files (splitzone, specialdistrict) + +**6. Validate After Wave 1** +- Full pluto table comparison +- Performance check +- Document patterns + +### Next Month + +**7. Wave 2 (Separate Tables)** +- pluto_allocated strategy +- Build on Wave 0 learnings + +**8. 
Start Planning Wave 3** +- Identify next batch of files +- Prioritize by complexity/value +- Create issues + +--- + +## 🎓 Learning Goals by Wave + +### Wave 0: Prep Tables +- How to handle complex joins/dedup +- Multiple SQL files → single model +- Materialization strategies +- Targeted re-running patterns + +### Wave 1: Business Logic +- Simple transformations (CAMA defaults) +- Complex progressive refinement (zoning) +- Spatial operations in dbt +- Performance considerations + +### Wave 2: Table Creation +- Materializing non-PLUTO tables +- Aggregation patterns +- Downstream dependencies + +### Wave 3: Everything Else +- Apply all learnings +- Tackle remaining complexity +- Refine patterns + +### Phase 5: Pure DBT +- PLUTO as dbt model +- Corrections strategy +- Final architecture + +--- + +## ✅ Success Criteria + +### After Wave 0 +- ✅ Can run: `dbt run --select int__pluto_rpad_geo+` +- ✅ 7 SQL files renamed to *_migrated.sql +- ✅ Full build works +- ✅ Performance acceptable +- ✅ Targeted re-running validated + +### After Wave 1 +- ✅ 14 SQL files renamed (7 + Wave 0's 7) +- ✅ CAMA, zoning, classification in dbt +- ✅ Complex spatial files migrated +- ✅ Patterns established + +### After Wave 2 +- ✅ 16 SQL files renamed +- ✅ prep tables all in dbt +- ✅ pluto_allocated as dbt model + +### After Wave 3 +- ✅ ~40+ SQL files renamed +- ✅ Only corrections & edge cases remain +- ✅ Ready for Phase 5 + +### After Phase 5 +- ✅ All SQL files migrated (*_migrated.sql as reference) +- ✅ `dbt build` is the entire build +- ✅ Pure dbt project achieved + +--- + +## 🎯 The Path Forward + +``` +TODAY + ↓ +Wave 0: pluto_rpad_geo (4 hrs) → Targeted re-running unlocked ⭐ + ↓ +Wave 1: Simple files (10 hrs) → Patterns established + ↓ +Wave 2: Separate tables (5 hrs) → Foundation complete + ↓ +Wave 3: Complex files (40 hrs) → Bulk migration + ↓ +Wave 4: Cleanup (1 hr) → Almost pure dbt + ↓ +Phase 5: Pure dbt (months) → END GOAL achieved +``` + +**Total estimated:** ~60 hours of migration 
work + Phase 5 design/implementation + +**Timeline:** +- Waves 0-2: 2-3 weeks +- Wave 3: 1-2 months +- Wave 4: 1 day +- Phase 5: 3-6 months + +**Let's start with Wave 0 and learn by doing!** 🚀 + diff --git a/products/pluto/RPAD_GEO_DBT_ANALYSIS.md b/products/pluto/RPAD_GEO_DBT_ANALYSIS.md new file mode 100644 index 0000000000..996efeaf04 --- /dev/null +++ b/products/pluto/RPAD_GEO_DBT_ANALYSIS.md @@ -0,0 +1,375 @@ +# Should pluto_rpad_geo Be a DBT Model? + +## Current State + +### What is pluto_rpad_geo? +A **critical intermediate table** that joins DOF property tax data with DCP geocodes. + +### How It's Created (line 8 of 02_build.sh) +```sql +-- sql/create_pts.sql +-- Transforms pluto_pts (raw PTS data) → dof_pts_propmaster +CREATE TABLE dof_pts_propmaster AS +SELECT boro, block, lot, owner, land_area, gross_sqft, ... +FROM pluto_pts; -- Already a dbt staging model (stg__pluto_pts) + +-- sql/create_rpad_geo.sql +-- Joins property tax data with geocodes +CREATE TABLE pluto_rpad_geo AS +WITH pluto_rpad_rownum AS ( + SELECT + dof_pts_propmaster.*, + ROW_NUMBER() OVER ( + PARTITION BY boro || tb || tl + ORDER BY curavt_act DESC, land_area DESC + ) AS row_number + FROM dof_pts_propmaster +) +SELECT + pluto_rpad_sub.*, + stg__pluto_input_geocodes.* +FROM pluto_rpad_rownum pluto_rpad_sub +LEFT JOIN stg__pluto_input_geocodes + ON [bbl join condition] +WHERE row_number = 1; + +-- Then 5 more SQL files modify it: +UPDATE pluto_rpad_geo SET bbl = ... +UPDATE pluto_rpad_geo SET xcoord = ..., ycoord = ... +-- (zerovacantlots.sql, lotarea.sql, primebbl.sql, apdate.sql) +``` + +### What Uses pluto_rpad_geo? (9 files) +1. **bbl.sql** - INSERTs initial rows into pluto +2. **geocodes.sql** - Major UPDATE to pluto (~20 fields) +3. **create_allocated.sql** - Creates pluto_allocated table +4. **address.sql** - Address processing +5. **cama_bldgarea_1.sql** - Building area calculations +6. **lotarea.sql** - Lot area processing +7. **bldgclass.sql** - Building classification +8. 
**geocode_notgeocoded.sql** - Geocoding fixes +9. **create_rpad_geo.sql** - Self-reference for UPDATEs + +--- + +## The DBT Question + +### Option A: Keep pluto_rpad_geo as SQL (Current State) +``` +pluto_pts (raw) + → [dbt staging] stg__pluto_pts + → [SQL] dof_pts_propmaster (create_pts.sql) + → [SQL] pluto_rpad_geo (create_rpad_geo.sql) + → [SQL] 5 UPDATE statements modify it + → [SQL] Used by bbl.sql, geocodes.sql, etc. +``` + +### Option B: Make pluto_rpad_geo a DBT Model ✅ RECOMMENDED +``` +pluto_pts (raw) + → [dbt staging] stg__pluto_pts + → [dbt intermediate] int__dof_pts_propmaster (replaces create_pts.sql) + → [dbt intermediate] int__pluto_rpad_geo (replaces create_rpad_geo.sql) + → [SQL] Used by bbl.sql, geocodes.sql, etc. (read-only) +``` + +--- + +## Analysis: Should We DBT'ify pluto_rpad_geo? + +### ✅ YES - Strong Arguments For + +#### 1. **Targeted Re-running** (Your Key Requirement) +Currently, to rebuild pluto_rpad_geo, you must: +- Run 01a_dbt_staging.sh (all staging models) +- Run lines 6-14 of 02_build.sh (preprocessing → apdate.sql) + +With dbt: +```bash +# Just rebuild the specific model and downstream +dbt run --select int__pluto_rpad_geo+ +``` + +#### 2. **It's Already Mostly DBT-Ready** +The CREATE statement is a clean SELECT with CTEs: +- Window function for deduplication ✓ +- LEFT JOIN between two tables ✓ +- No complex procedural logic ✓ + +The transformation logic is **declarative**, perfect for dbt. + +#### 3. **Eliminates Mutable State Issues** +Current problem: pluto_rpad_geo gets modified by 5 SQL files after creation. +With dbt: All logic in one model, no mutation needed. + +#### 4. **Better Dependency Management** +dbt DAG would show: +``` +stg__pluto_pts ──┐ + ├─> int__dof_pts_propmaster ──┐ +stg__pluto_input_geocodes ─────────────────────┤ + ├─> int__pluto_rpad_geo + (used by SQL scripts) +``` + +#### 5. 
**Easier to Test & Debug** +- dbt tests on data quality +- Can run in isolation +- Version control changes clearly +- Can use dbt snapshots for auditing + +#### 6. **The UPDATEs Can Be Incorporated** +Those 5 UPDATE statements that modify pluto_rpad_geo? +They're just transformations that can be CASE/COALESCE in the dbt model: + +```sql +-- Instead of: UPDATE pluto_rpad_geo SET bbl = ... +-- Do: SELECT CASE WHEN ... THEN ... END AS bbl + +-- Instead of: UPDATE pluto_rpad_geo SET xcoord = ... +-- Do: SELECT COALESCE(xcoord, ST_X(...)) AS xcoord +``` + +### ⚠️ Considerations + +#### 1. **Migration Effort** +Need to: +- Convert create_pts.sql → int__dof_pts_propmaster.sql (medium effort) +- Convert create_rpad_geo.sql → int__pluto_rpad_geo.sql (medium effort) +- Incorporate 5 UPDATE statements into the model (low effort) +- Update 9 SQL files to reference dbt model instead of table (low effort) + +**Estimate: 2-3 hours of work** + +#### 2. **SQL Files Need to Read from DBT Output** +The 9 SQL files that reference pluto_rpad_geo will now read from a dbt model. + +**Solution:** Materialize as table in dbt, SQL continues to work: +```sql +-- models/intermediate/int__pluto_rpad_geo.sql +{{ config(materialized='table') }} + +-- ... model logic ... +``` + +SQL files reference it the same way: +```sql +SELECT * FROM int__pluto_rpad_geo; -- Works just like pluto_rpad_geo +``` + +#### 3. **Build Script Changes** +```bash +# OLD: 02_build.sh lines 6-14 +run_sql_file sql/preprocessing.sql +run_sql_file sql/create_pts.sql +run_sql_file sql/create_rpad_geo.sql +run_sql_file sql/zerovacantlots.sql # UPDATEs pluto_rpad_geo +run_sql_file sql/lotarea.sql # UPDATEs pluto_rpad_geo +run_sql_file sql/primebbl.sql # UPDATEs pluto_rpad_geo +run_sql_file sql/apdate.sql # UPDATEs pluto_rpad_geo + +# NEW: 02_build.sh +run_sql_file sql/preprocessing.sql +(cd .. 
&& dbt run --select int__dof_pts_propmaster int__pluto_rpad_geo) +# No more create_pts, create_rpad_geo, zerovacantlots, lotarea, primebbl, apdate +``` + +**Those 5 UPDATE files get deleted** - their logic absorbed into the dbt model. + +--- + +## Recommendation: **YES, DBT'ify pluto_rpad_geo** + +### Why It's Worth It + +1. **Solves your targeted re-running problem** - `dbt run --select int__pluto_rpad_geo` +2. **Minimal risk** - It's used read-only by downstream SQL +3. **Removes mutable state** - No more UPDATE statements on rpad_geo +4. **Clean separation** - DBT handles data prep, SQL handles pluto building +5. **Not a massive refactor** - 2-3 hours, well-scoped work + +### The New Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ DBT Phase 1: Staging (01a_dbt_staging.sh) │ +│ - stg__pluto_pts (from raw pluto_pts) │ +│ - stg__pluto_input_geocodes (from raw geocodes) │ +│ - stg__pluto_input_cama_dof (from raw CAMA) │ +│ - ... all other staging models ... │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ DBT Phase 2: Intermediate Prep Tables (NEW - add to build) │ +│ - int__dof_pts_propmaster (from stg__pluto_pts) │ +│ - int__pluto_rpad_geo (from int__dof_pts + stg__geocodes) │ +│ ↑ Includes all the UPDATE logic (bbl calc, coords, etc.) │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ SQL Phase 1: Build Base PLUTO (02_build.sh) │ +│ - CREATE TABLE pluto │ +│ - INSERT from int__pluto_rpad_geo (read-only) │ +│ - UPDATE from pluto_allocated │ +│ - UPDATE from geocodes (reads int__pluto_rpad_geo) │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ SQL Phase 2: Progressive Enhancement │ +│ - CAMA updates, zoning, geocoding, etc. 
│ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ DBT Phase 3: Business Logic Enrichment │ +│ - far, landuse, irrlotcode, etc. │ +│ - Output: pluto_enriched │ +└─────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ SQL Phase 3: Apply DBT Results + Corrections │ +│ - UPDATE pluto from pluto_enriched │ +│ - Research corrections │ +└─────────────────────────────────────────────────────────────┘ +``` + +--- + +## Migration Plan + +### Step 1: Create int__dof_pts_propmaster.sql +```sql +-- models/intermediate/int__dof_pts_propmaster.sql +{{ config( + materialized='table', + indexes=[{'columns': ['boro', 'tb', 'tl'], 'unique': False}] +) }} + +SELECT + boro, + block AS tb, + lot AS tl, + parid AS bbl, + -- ... all the field transformations from create_pts.sql ... +FROM {{ ref('stg__pluto_pts') }} +``` + +### Step 2: Create int__pluto_rpad_geo.sql +```sql +-- models/intermediate/int__pluto_rpad_geo.sql +{{ config( + materialized='table', + indexes=[{'columns': ['primebbl'], 'unique': False}] +) }} + +WITH +-- Deduplicate property tax records +pluto_rpad_dedup AS ( + SELECT *, + ROW_NUMBER() OVER ( + PARTITION BY boro || tb || tl + ORDER BY curavt_act DESC, land_area DESC, ease ASC + ) AS row_number + FROM {{ ref('int__dof_pts_propmaster') }} +), + +-- Join with geocodes +base_join AS ( + SELECT + pts.*, + geo.* EXCLUDE (geo_bbl) + FROM pluto_rpad_dedup pts + LEFT JOIN {{ ref('stg__pluto_input_geocodes') }} geo + ON pts.boro || pts.tb || pts.tl = + geo.borough || LPAD(geo.block, 5, '0') || LPAD(geo.lot, 4, '0') + WHERE pts.row_number = 1 +), + +-- Incorporate zerovacantlots.sql logic +with_vacant_fixes AS ( + SELECT *, + CASE + WHEN land_area = 0 AND [vacant lot conditions] + THEN [calculate from geometry] + ELSE land_area + END AS land_area_fixed + FROM base_join +), + +-- Incorporate lotarea.sql logic 
+-- Incorporate primebbl.sql logic +-- Incorporate apdate.sql logic +-- Incorporate geocode_billingbbl.sql logic + +final AS ( + SELECT + -- All fields with transformations applied + boro || LPAD(tb, 5, '0') || LPAD(tl, 4, '0') AS bbl, + boro || LPAD(tb, 5, '0') || LPAD(tl, 4, '0') AS primebbl, + to_char(to_date(ap_date, 'MM/DD/YY'), 'MM/DD/YYYY') AS ap_datef, + COALESCE(xcoord, ST_X(ST_TRANSFORM(geom, 2263))::integer) AS xcoord, + COALESCE(ycoord, ST_Y(ST_TRANSFORM(geom, 2263))::integer) AS ycoord, + -- ... all other fields ... + FROM with_vacant_fixes +) + +SELECT * FROM final +``` + +### Step 3: Update 02_build.sh +```bash +# Before line 21 (create.sql), add: +echo 'Building intermediate prep tables' +(cd .. && dbt run --select int__dof_pts_propmaster int__pluto_rpad_geo) + +# Remove these lines: +# run_sql_file sql/create_pts.sql +# run_sql_file sql/create_rpad_geo.sql +# run_sql_file sql/zerovacantlots.sql +# run_sql_file sql/lotarea.sql +# run_sql_file sql/primebbl.sql +# run_sql_file sql/apdate.sql +# run_sql_file sql/geocode_billingbbl.sql +``` + +### Step 4: Update SQL Files (9 files) +Change references from `pluto_rpad_geo` to `int__pluto_rpad_geo`: +- bbl.sql +- geocodes.sql +- create_allocated.sql +- etc. + +### Step 5: Delete Obsolete SQL Files (7 files) +- create_pts.sql ❌ +- create_rpad_geo.sql ❌ +- zerovacantlots.sql ❌ +- lotarea.sql ❌ +- primebbl.sql ❌ +- apdate.sql ❌ +- geocode_billingbbl.sql ❌ + +--- + +## Benefits Summary + +✅ **Targeted re-running** - Your core requirement +✅ **Removes 7 SQL files** - Simpler codebase +✅ **Eliminates mutable intermediate state** - pluto_rpad_geo becomes read-only +✅ **Better dependency visibility** - dbt DAG shows relationships +✅ **Easier debugging** - Single model vs scattered UPDATEs +✅ **Testable** - dbt tests on data quality +✅ **Version controlled transformations** - All logic in one place + +**Effort: 2-3 hours | Risk: Low | Value: High** + +--- + +## Next Steps + +1. 
Create issue: "DBT'ify pluto_rpad_geo intermediate table" +2. Implement int__dof_pts_propmaster.sql +3. Implement int__pluto_rpad_geo.sql (with absorbed UPDATE logic) +4. Test: dbt run --select int__pluto_rpad_geo +5. Update 02_build.sh and 9 SQL files +6. Delete 7 obsolete SQL files +7. Run full build to verify + diff --git a/products/pluto/SESSION_SUMMARY.md b/products/pluto/SESSION_SUMMARY.md new file mode 100644 index 0000000000..2db8ab1f4b --- /dev/null +++ b/products/pluto/SESSION_SUMMARY.md @@ -0,0 +1,226 @@ +# PLUTO DBT Migration - Analysis Session Summary + +**Date:** 2026-03-30 +**Session Goal:** Analyze PLUTO SQL dependencies and create comprehensive migration strategy + +--- + +## 🎯 What Was Accomplished + +### 1. Dependency Analysis (Issue de-74o.14 - Closed) +**Created:** `products/pluto/dependencies.txt` (25KB) + +Analyzed all SQL files in pluto_build for UPDATE statements: +- **42 UPDATE statements** found across **23 files** +- **29 updates** target the main PLUTO table +- **7 fields** updated multiple times (builtfar, centroid, sanitdistrict, splitzone, spdist1, spdist2, plutomapid) +- **No circular dependencies** detected +- **Critical finding:** backfill.sql is unused/orphaned, corr_lotarea.sql runs in separate phase + +**Key insights:** +- Most "multi-updates" are progressive refinement within same file (zoning doing 6-7 UPDATEs) +- Some are pre-dbt SQL → dbt overwrites (centroid, sanitdistrict) +- One is corrections overriding computed values (builtfar) - expected behavior + +### 2. 
Data Flow Analysis +**Created:** `products/pluto/DATA_FLOW_ANALYSIS.md` (8.6KB) + +Traced how PLUTO is built: +- **Row insertion:** `bbl.sql` (line 22) - this is the ONLY place rows enter PLUTO +- **Initial population:** allocated.sql (~25 fields), geocodes.sql (~20 fields) +- **Progressive enhancement:** 60+ SQL files do targeted UPDATEs +- **DBT integration:** dbt run → pluto_enriched → apply_dbt_enrichments.sql +- **Corrections:** Separate script (03_corrections.sh) overrides computed values + +**Answer to key question:** +- Keep initial PLUTO population as SQL (INSERT + first UPDATEs work fine) +- Focus on migrating the 60+ UPDATE files to dbt +- Hybrid SQL/DBT architecture is actually optimal + +### 3. Overall Strategy +**Created:** `products/pluto/OVERALL_STRATEGY.md` (13.5KB) + +Comprehensive strategy document explaining: +- PLUTO as mutable state table (embrace it, don't fight it) +- Why hybrid SQL/DBT is better than pure dbt (for now) +- What to migrate vs what to keep as SQL +- Multi-update field analysis and handling strategies +- 4 migration waves with clear boundaries + +**Key principle:** +- SQL for data engineering (INSERT, spatial ops, corrections) +- DBT for business logic (calculations, lookups, transformations) +- pluto_enriched as the bridge between them + +### 4. 
pluto_rpad_geo Analysis +**Created:** `products/pluto/RPAD_GEO_DBT_ANALYSIS.md` (13.9KB) + +**Strong YES recommendation** to dbt'ify pluto_rpad_geo as Wave 0: + +**Why:** +- Solves targeted re-running requirement (main ask) +- Removes 7 SQL files by absorbing their logic +- Eliminates mutable intermediate state +- Better dependency management +- 2-3 hour effort, low risk, high value + +**What it creates:** +- `int__dof_pts_propmaster.sql` - Property tax transformations +- `int__pluto_rpad_geo.sql` - Joins property tax + geocodes + all UPDATE logic + +**Files that get deleted:** +- create_pts.sql, create_rpad_geo.sql, zerovacantlots.sql, lotarea.sql, primebbl.sql, apdate.sql, geocode_billingbbl.sql + +### 5. Epic Updated +**Epic de-74o** now includes: +- Links to all 4 documentation files (required reading) +- Migration wave breakdown (0-4) +- Conversion pattern and requirements +- Key findings summary (dependencies, multi-updates, ordering) +- Quality gates (testing, linting) +- **End goal:** Pure dbt project (all SQL scripts eliminated) + +--- + +## 📊 Key Findings Summary + +### Execution Order +``` +1. pluto_rpad_geo created (Wave 0 target) +2. pluto table INSERT (keep as SQL for now) +3. allocated UPDATE (~25 fields) +4. geocodes UPDATE (~20 fields) +5. 60+ targeted UPDATEs (Waves 1-4 targets) +6. DBT run → pluto_enriched +7. apply_dbt_enrichments → copy to pluto +8. 
Corrections override (separate script) +``` + +### Dependencies +- ✅ **No circular dependencies** +- ⚠️ **2 files must run before dbt:** latlong.sql, numericfields_geomfields.sql +- ✅ **7 fields updated multiple times** (analyzed and explained) +- ✅ **Clear migration path** identified + +### Migration Waves +- **Wave 0:** pluto_rpad_geo (DO FIRST - enables targeted re-running) +- **Wave 1:** 7 simple files (CAMA, zoning, classification) - SAFE +- **Wave 2:** 2 files (separate tables) +- **Wave 3:** 3 files (pre-dbt dependencies) - CAREFUL +- **Wave 4:** 3+ misc files +- **Phase 5 (END GOAL):** Convert PLUTO itself to dbt model - pure dbt project + +--- + +## 📝 Deliverables + +### Documentation Files (4) +1. `dependencies.txt` - Full UPDATE statement dependency graph +2. `DATA_FLOW_ANALYSIS.md` - How PLUTO is built, data flow patterns +3. `OVERALL_STRATEGY.md` - Master migration strategy, what to migrate vs keep +4. `RPAD_GEO_DBT_ANALYSIS.md` - Wave 0 migration plan (pluto_rpad_geo) + +### Analysis Scripts (2) +1. `analyze_dependencies.py` - Extracts UPDATE statements from SQL files +2. `analyze_migration_groups.py` - Groups files by migration priority + +### Git Commits (5) +``` +36fce9cc Add analysis: Should pluto_rpad_geo be a dbt model? +17a1cfbb Add comprehensive PLUTO dbt migration strategy +ed189b4a Add PLUTO data flow analysis and dbt migration strategy +89208385 Add execution order and inter-group dependency analysis +7f910379 Add PLUTO UPDATE statement dependency analysis +``` + +--- + +## 🚀 Recommended Next Actions + +### Immediate (Wave 0) +1. **Create issue:** "DBT'ify pluto_rpad_geo as intermediate models" + - Est: 2-3 hours + - Creates: int__dof_pts_propmaster.sql, int__pluto_rpad_geo.sql + - Removes: 7 SQL files + - Enables: Targeted re-running with `dbt run --select int__pluto_rpad_geo+` + +### Short-term (Wave 1) +2. 
**Create 7 issues** for simple file migrations: + - CAMA: bsmttype, lottype, proxcode, easements (4 issues) + - Zoning: splitzone, specialdistrict (2 issues) + - Classification: bldgclass (1 issue) + - All SAFE - no dependencies, can work in parallel + +### Medium-term (Waves 2-4) +3. **Iterate through remaining files** + - Wave 2: primebbl, yearbuiltalt + - Wave 3: latlong, numericfields, dtmgeoms (verify dbt deps first) + - Wave 4: plutomapid, versions, apdate + +### Long-term (Phase 5) +4. **Plan pure dbt conversion** + - Convert PLUTO table itself to dbt model + - Eliminate all SQL scripts + - Single `dbt build` command + - Timeline: 9-12 months from start + +--- + +## 💡 Key Insights for Future Work + +### What Makes a Good Migration Candidate? +✅ Simple CASE statements or COALESCE logic +✅ Lookups from reference tables +✅ Calculations (e.g., FAR = bldgarea / lotarea) +✅ Single-pass transformations +✅ No spatial operations + +### What Should Stay SQL Longer? +❌ Complex spatial operations (ST_AREA, ST_INTERSECTION with rankings) +❌ Progressive refinement (7 UPDATEs in same file) +❌ Corrections that override computed values +❌ Initial table creation and population + +### The Hybrid Pattern Works Because: +1. SQL handles the procedural/imperative parts +2. DBT handles the declarative transformations +3. pluto_enriched acts as clean handoff point +4. Corrections can override without complex logic +5. Incremental migration path with low risk + +### End Goal is Clear: +Pure dbt project where everything is declarative: +- All prep tables as dbt models +- PLUTO itself as dbt model +- Corrections as incremental model or post-hook +- Zero shell scripts, zero mutable tables +- Single `dbt build` command + +**It's okay if this takes time.** The hybrid state is stable and provides value while we migrate incrementally. + +--- + +## 📚 Reading Order for New Contributors + +1. Start with: `OVERALL_STRATEGY.md` (big picture) +2. 
Then read: `DATA_FLOW_ANALYSIS.md` (how it works today) +3. For Wave 0: `RPAD_GEO_DBT_ANALYSIS.md` (first migration) +4. Reference: `dependencies.txt` (detailed analysis) + +--- + +## ✅ Session Success + +All objectives met: +- ✅ Reviewed open epic and current state +- ✅ Analyzed UPDATE statement dependencies +- ✅ Built dependency graph (products/pluto/dependencies.txt) +- ✅ Identified migration groups and order +- ✅ Answered: when are rows inserted? (bbl.sql line 22) +- ✅ Answered: should we dbt'ify pluto_rpad_geo? (YES - Wave 0) +- ✅ Answered: fields updated multiple times? (7 found, explained) +- ✅ Created comprehensive migration strategy +- ✅ Updated epic with documentation and end goal +- ✅ Established clear path forward + +**The PLUTO dbt migration epic now has a complete strategy and roadmap. Ready to execute!** 🎯 diff --git a/products/pluto/analyze_dependencies.py b/products/pluto/analyze_dependencies.py new file mode 100644 index 0000000000..bcaca32660 --- /dev/null +++ b/products/pluto/analyze_dependencies.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +""" +Analyze UPDATE statement dependencies in PLUTO SQL files. +Creates a dependency graph to help plan dbt migration. +""" + +import re +import os +from pathlib import Path +from collections import defaultdict + +def extract_update_info(sql_content, file_path): + """Extract UPDATE statement information from SQL content.""" + updates = [] + + # Find all UPDATE statements (case insensitive, multiline) + # Pattern: UPDATE table_name SET ... FROM ... WHERE ... 
+ update_pattern = re.compile( + r'UPDATE\s+(\w+)\s+(?:AS\s+\w+\s+)?SET\s+(.*?)(?:FROM\s+(.*?))?(?:WHERE\s+(.*?))?(?:;|UPDATE|$)', + re.IGNORECASE | re.DOTALL + ) + + matches = update_pattern.finditer(sql_content) + + for match in matches: + target_table = match.group(1).lower() + set_clause = match.group(2) + from_clause = match.group(3) if match.group(3) else "" + where_clause = match.group(4) if match.group(4) else "" + + # Extract field assignments from SET clause + # Pattern: field = value or field = source.field + set_fields = [] + for field_match in re.finditer(r'(\w+)\s*=\s*([^,]+?)(?:,|$)', set_clause, re.DOTALL): + field_name = field_match.group(1).strip() + field_value = field_match.group(2).strip() + set_fields.append((field_name, field_value)) + + # Extract source tables from FROM clause + source_tables = [] + if from_clause: + # Simple extraction of table names (may need refinement) + table_matches = re.findall(r'(\w+)\s+(?:AS\s+)?(\w+)?', from_clause, re.IGNORECASE) + for table_name, alias in table_matches: + if table_name.lower() not in ['left', 'right', 'inner', 'outer', 'join', 'on']: + source_tables.append(table_name.lower()) + + # Extract join conditions + join_conditions = [] + if where_clause: + # Look for join patterns like table1.field = table2.field + join_matches = re.findall(r'(\w+)\.(\w+)\s*=\s*(\w+)\.(\w+)', where_clause, re.IGNORECASE) + join_conditions.extend(join_matches) + + updates.append({ + 'file': file_path, + 'target_table': target_table, + 'fields': set_fields, + 'source_tables': source_tables, + 'join_conditions': join_conditions, + 'raw_update': match.group(0)[:200] # First 200 chars for reference + }) + + return updates + + +def analyze_pluto_updates(base_path): + """Analyze all SQL files in pluto_build directory.""" + all_updates = [] + + # Find all .sql files + sql_files = Path(base_path).rglob('*.sql') + + for sql_file in sorted(sql_files): + try: + with open(sql_file, 'r', encoding='utf-8') as f: + content = f.read() + 
+ relative_path = sql_file.relative_to(Path(base_path).parent) + updates = extract_update_info(content, str(relative_path)) + all_updates.extend(updates) + + except Exception as e: + print(f"Error processing {sql_file}: {e}") + + return all_updates + + +def write_dependency_graph(updates, output_file): + """Write dependency graph to file in structured format.""" + + # Group by file + by_file = defaultdict(list) + for update in updates: + by_file[update['file']].append(update) + + with open(output_file, 'w', encoding='utf-8') as f: + f.write("# PLUTO UPDATE Statement Dependency Analysis\n") + f.write("# Generated for dbt migration planning\n") + f.write(f"# Total files analyzed: {len(by_file)}\n") + f.write(f"# Total UPDATE statements: {len(updates)}\n\n") + + f.write("=" * 80 + "\n") + f.write("SUMMARY BY TARGET TABLE\n") + f.write("=" * 80 + "\n\n") + + # Group by target table + by_target = defaultdict(list) + for update in updates: + by_target[update['target_table']].append(update) + + for table in sorted(by_target.keys()): + updates_list = by_target[table] + f.write(f"\n{table.upper()} ({len(updates_list)} updates)\n") + f.write("-" * 40 + "\n") + + # Count fields updated + all_fields = set() + for upd in updates_list: + for field_name, _ in upd['fields']: + all_fields.add(field_name.lower()) + + f.write(f"Fields updated: {', '.join(sorted(all_fields))}\n") + f.write(f"Files: {', '.join(sorted(set(Path(u['file']).name for u in updates_list)))}\n") + + f.write("\n\n") + f.write("=" * 80 + "\n") + f.write("DETAILED DEPENDENCY GRAPH\n") + f.write("=" * 80 + "\n\n") + + for file_path in sorted(by_file.keys()): + f.write(f"\n{'=' * 80}\n") + f.write(f"FILE: {file_path}\n") + f.write(f"{'=' * 80}\n\n") + + for i, update in enumerate(by_file[file_path], 1): + f.write(f"--- UPDATE #{i} ---\n") + f.write(f"Target Table: {update['target_table']}\n") + f.write(f"Fields Updated:\n") + + for field_name, field_value in update['fields']: + f.write(f" - {field_name} = 
{field_value[:100]}\n") + + if update['source_tables']: + f.write(f"Source Tables: {', '.join(update['source_tables'])}\n") + + if update['join_conditions']: + f.write(f"Join Conditions:\n") + for t1, f1, t2, f2 in update['join_conditions']: + f.write(f" - {t1}.{f1} = {t2}.{f2}\n") + + f.write(f"\nSQL Preview:\n{update['raw_update']}\n") + f.write("\n" + "-" * 40 + "\n\n") + + +def main(): + base_path = "products/pluto/pluto_build" + output_file = "products/pluto/dependencies.txt" + + print(f"Analyzing SQL files in {base_path}...") + updates = analyze_pluto_updates(base_path) + + print(f"Found {len(updates)} UPDATE statements") + print(f"Writing dependency graph to {output_file}...") + + write_dependency_graph(updates, output_file) + + print("✓ Analysis complete!") + print(f"\nNext steps:") + print(f"1. Review {output_file}") + print(f"2. Identify leaf nodes (fields with no downstream dependencies)") + print(f"3. Group related updates for batch migration") + + +if __name__ == "__main__": + main() diff --git a/products/pluto/analyze_migration_groups.py b/products/pluto/analyze_migration_groups.py new file mode 100644 index 0000000000..0486e04c05 --- /dev/null +++ b/products/pluto/analyze_migration_groups.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 +""" +Analyze PLUTO dependencies to identify migration groups and leaf nodes. 
+""" + +import re +from collections import defaultdict +from pathlib import Path + + +def parse_dependencies_file(filepath): + """Parse the dependencies.txt file to extract update information.""" + with open(filepath, 'r') as f: + content = f.read() + + updates = [] + current_file = None + + # Split by file sections + file_sections = re.split(r'={80}\nFILE: (.*?)\n={80}', content) + + for i in range(1, len(file_sections), 2): + file_path = file_sections[i] + section_content = file_sections[i + 1] + + # Extract update blocks + update_blocks = re.split(r'--- UPDATE #\d+ ---', section_content) + + for block in update_blocks[1:]: # Skip first empty split + target_match = re.search(r'Target Table: (\w+)', block) + if not target_match: + continue + + target_table = target_match.group(1) + + # Extract fields + fields = [] + field_section = re.search(r'Fields Updated:(.*?)(?:Source Tables:|Join Conditions:|SQL Preview:|$)', block, re.DOTALL) + if field_section: + field_lines = field_section.group(1).strip().split('\n') + for line in field_lines: + field_match = re.match(r'\s*- (\w+) =', line) + if field_match: + fields.append(field_match.group(1)) + + # Extract source tables + sources = [] + source_section = re.search(r'Source Tables: (.*?)(?:\n|$)', block) + if source_section: + sources = [s.strip() for s in source_section.group(1).split(',')] + + updates.append({ + 'file': Path(file_path).name, + 'target_table': target_table, + 'fields': fields, + 'sources': sources + }) + + return updates + + +def analyze_field_dependencies(updates): + """Analyze which fields depend on other fields.""" + # Map: table.field -> list of files that update it + field_writers = defaultdict(list) + + # Map: file -> fields it depends on (reads from) + field_readers = defaultdict(set) + + for update in updates: + table = update['target_table'] + for field in update['fields']: + field_writers[f"{table}.{field}"].append(update['file']) + + # Sources indicate reading from those tables + for source 
in update['sources']: + field_readers[update['file']].add(source) + + return field_writers, field_readers + + +def identify_migration_groups(updates): + """Identify logical groupings for migration.""" + + # Group by file prefix/topic + by_topic = defaultdict(list) + + for update in updates: + file = update['file'] + + # Categorize by filename patterns + if 'cama_' in file: + topic = 'CAMA' + elif 'zoning' in file: + topic = 'Zoning' + elif 'geocode' in file or 'latlong' in file or file in ['dtmgeoms.sql', 'dtmmergepolygons.sql']: + topic = 'Geocoding' + elif 'primebbl' in file: + topic = 'Prime BBL' + elif file in ['bldgclass.sql', 'bsmttype.sql', 'easements.sql', 'lottype.sql', 'proxcode.sql']: + topic = 'Classification/Defaults' + elif 'yearbuilt' in file: + topic = 'Year Built' + elif file in ['backfill.sql', 'corr_lotarea.sql']: + topic = 'Corrections/Backfill' + elif file == 'apply_dbt_enrichments.sql': + topic = 'DBT Integration (Already Migrated)' + else: + topic = 'Other' + + by_topic[topic].append(update) + + return by_topic + + +def main(): + deps_file = "products/pluto/dependencies.txt" + + print("Parsing dependencies...") + updates = parse_dependencies_file(deps_file) + + print(f"\nTotal UPDATE statements analyzed: {len(updates)}") + + # Analyze field dependencies + field_writers, field_readers = analyze_field_dependencies(updates) + + # Identify migration groups + groups = identify_migration_groups(updates) + + print("\n" + "=" * 80) + print("MIGRATION GROUPS") + print("=" * 80) + + for topic in sorted(groups.keys()): + files = set(u['file'] for u in groups[topic]) + tables = set(u['target_table'] for u in groups[topic]) + all_fields = set() + for u in groups[topic]: + all_fields.update(u['fields']) + + print(f"\n{topic}") + print("-" * 40) + print(f"Files ({len(files)}): {', '.join(sorted(files))}") + print(f"Target tables: {', '.join(sorted(tables))}") + print(f"Fields updated ({len(all_fields)}): {', '.join(sorted(all_fields)[:10])}") + if 
len(all_fields) > 10: + print(f" ... and {len(all_fields) - 10} more") + + print("\n" + "=" * 80) + print("LEAF CANDIDATES (Fields written but not read as dependencies)") + print("=" * 80) + print("\nNote: These fields appear to be final outputs with no downstream SQL dependencies") + print("They are good candidates for early migration.\n") + + # Fields that are written to but not used as sources elsewhere + written_tables = set(u['target_table'] for u in updates) + source_tables = set() + for u in updates: + source_tables.update(u['sources']) + + leaf_candidates = written_tables - source_tables + + for table in sorted(leaf_candidates): + updates_to_table = [u for u in updates if u['target_table'] == table] + fields = set() + for u in updates_to_table: + fields.update(u['fields']) + print(f"{table}: {', '.join(sorted(fields)[:15])}") + if len(fields) > 15: + print(f" ... and {len(fields) - 15} more") + + print("\n" + "=" * 80) + print("INTERMEDIATE TABLES (Read by other updates)") + print("=" * 80) + print("\nThese tables must be migrated before their dependents:\n") + + intermediate = written_tables & source_tables + for table in sorted(intermediate): + # Find what reads from this table + readers = set() + for u in updates: + if table in u['sources']: + readers.add(u['file']) + + print(f"{table}") + print(f" Written by: {', '.join(set(u['file'] for u in updates if u['target_table'] == table))}") + print(f" Read by: {', '.join(sorted(readers))}") + + print("\n" + "=" * 80) + print("RECOMMENDED MIGRATION ORDER") + print("=" * 80) + print(""" +1. PRIME BBL (foundational - sets up primary key relationships) +2. CLASSIFICATION/DEFAULTS (simple null fills and classifications) +3. CAMA (business logic for various property attributes) +4. GEOCODING (lat/long, geometry operations) +5. ZONING (complex multi-table zoning logic) +6. YEAR BUILT (allocated table operations) +7. 
CORRECTIONS/BACKFILL (depends on other fields being set) + +Within each group, files can likely be migrated in parallel as separate dbt models. + """) + + +if __name__ == "__main__": + main() diff --git a/products/pluto/dependencies.txt b/products/pluto/dependencies.txt new file mode 100644 index 0000000000..28282483aa --- /dev/null +++ b/products/pluto/dependencies.txt @@ -0,0 +1,959 @@ +# PLUTO UPDATE Statement Dependency Analysis +# Generated for dbt migration planning +# Total files analyzed: 23 +# Total UPDATE statements: 42 + +================================================================================ +SUMMARY BY TARGET TABLE +================================================================================ + + +DOF_PTS_PROPMASTER (1 updates) +---------------------------------------- +Fields updated: primebbl +Files: primebbl.sql + +PLUTO (29 updates) +---------------------------------------- +Fields updated: affresfar, appbbl, areasource, bldgclass, block, bsmtcode, builtfar, centroid, commfar, condono, easements, edesignum, facilfar, histdist, irrlotcode, landmark, landuse, latitude, longitude, lot, lotarea, lotdepth, lottype, mih_opt1, mih_opt2, mih_opt3, mih_opt4, mnffar, numfloors, ownertype, plutomapid, proxcode, residfar, sanborn, sanitboro, sanitdistrict, spdist1, spdist2, splitzone, trnstzone, version +Files: apply_dbt_enrichments.sql, backfill.sql, bldgclass.sql, cama_bsmttype.sql, cama_easements.sql, cama_lottype.sql, cama_proxcode.sql, corr_lotarea.sql, dtmgeoms.sql, latlong.sql, numericfields_geomfields.sql, plutomapid.sql, versions.sql, zoning.sql, zoning_specialdistrict.sql, zoning_splitzone.sql + +PLUTO_ALLOCATED (3 updates) +---------------------------------------- +Fields updated: yearalter1, yearalter2, yearbuilt +Files: yearbuiltalt.sql + +PLUTO_DTM (1 updates) +---------------------------------------- +Fields updated: primebbl +Files: dtmmergepolygons.sql + +PLUTO_INPUT_CAMA (2 updates) +---------------------------------------- 
+Fields updated: primebbl +Files: create_cama_primebbl.sql + +PLUTO_RPAD_GEO (5 updates) +---------------------------------------- +Fields updated: ap_datef, bbl, billingblock, billinglot, primebbl +Files: apdate.sql, create_rpad_geo.sql, geocode_billingbbl.sql, primebbl.sql + +STG__PLUTO_INPUT_GEOCODES (1 updates) +---------------------------------------- +Fields updated: ct2010, xcoord, ycoord +Files: create_rpad_geo.sql + + +================================================================================ +DETAILED DEPENDENCY GRAPH +================================================================================ + + +================================================================================ +FILE: pluto_build/sql/apdate.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto_rpad_geo +Fields Updated: + - ap_datef = to_char(to_date(ap_date + +SQL Preview: +UPDATE pluto_rpad_geo +SET ap_datef = to_char(to_date(ap_date, 'MM/DD/YY'), 'MM/DD/YYYY') +WHERE ap_date IS NOT NULL; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/apply_dbt_enrichments.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - builtfar = pe.builtfar + - residfar = pe.residfar + - commfar = pe.commfar + - facilfar = pe.facilfar + - affresfar = pe.affresfar + - mnffar = pe.mnffar + - irrlotcode = pe.irrlotcode + - sanitboro = pe.sanitboro + - sanitdistrict = pe.sanitdistrict + - landuse = pe.landuse + - areasource = pe.areasource + - ownertype = pe.ownertype + - edesignum = pe.edesignum + - latitude = pe.latitude + - longitude = pe.longitude + - centroid = pe.centroid + - condono = pe.condono + - histdist = pe.histdist + - landmark = pe.landmark + - lotdepth = pe.lotdepth + - numfloors = pe.numfloors + - lotarea = pe.lotarea + 
- sanborn = pe.sanborn + - mih_opt1 = pe.mih_opt1 + - mih_opt2 = pe.mih_opt2 + - mih_opt3 = pe.mih_opt3 + - mih_opt4 = pe.mih_opt4 + - trnstzone = pe.trnstzone +Source Tables: pluto_enriched +Join Conditions: + - pluto.bbl = pe.bbl + +SQL Preview: +UPDATE pluto +SET + builtfar = pe.builtfar, + residfar = pe.residfar, + commfar = pe.commfar, + facilfar = pe.facilfar, + affresfar = pe.affresfar, + mnffar = pe.mnffar, + irrlotcode = pe + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/backfill.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - builtfar = ROUND(bldgarea::numeric / lotarea::numeric + +SQL Preview: +UPDATE pluto +SET builtfar = ROUND(bldgarea::numeric / lotarea::numeric, 2) +WHERE lotarea != '0' AND lotarea IS NOT NULL; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/bldgclass.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - bldgclass = 'Q0' + +SQL Preview: +UPDATE pluto +SET bldgclass = 'Q0' +WHERE + zonedist1 = 'PARK' + AND (bldgclass IS NULL OR bldgclass LIKE 'V%'); + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/cama_bsmttype.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - bsmtcode = '5' + +SQL Preview: +UPDATE pluto +SET bsmtcode = '5' +WHERE bsmtcode IS NULL; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/cama_easements.sql 
+================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - easements = '0' + +SQL Preview: +UPDATE pluto +SET easements = '0' +WHERE easements IS NULL; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/cama_lottype.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - lottype = '0' + +SQL Preview: +UPDATE pluto +SET lottype = '0' +WHERE lottype IS NULL; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/cama_proxcode.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - proxcode = '0' + +SQL Preview: +UPDATE pluto +SET proxcode = '0' +WHERE proxcode IS NULL; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/corr_lotarea.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - builtfar = round(bldgarea::numeric / lotarea::numeric + +SQL Preview: +UPDATE pluto +SET builtfar = round(bldgarea::numeric / lotarea::numeric, 2) +WHERE + lotarea != '0' + AND lotarea IS NOT NULL + AND bbl IN ( + SELECT bbl FROM pluto_changes_applied + W + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/create_cama_primebbl.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto_input_cama +Fields Updated: + - primebbl = billingbbl + +SQL Preview: +UPDATE 
pluto_input_cama +SET primebbl = billingbbl +WHERE billingbbl IS NOT NULL AND billingbbl != '0000000000'; + +---------------------------------------- + +--- UPDATE #2 --- +Target Table: pluto_input_cama +Fields Updated: + - primebbl = LEFT(bbl + +SQL Preview: +UPDATE pluto_input_cama +SET primebbl = LEFT(bbl, 10) +WHERE primebbl IS NULL; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/create_rpad_geo.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: stg__pluto_input_geocodes +Fields Updated: + - xcoord = ST_X(ST_TRANSFORM(geom + - ycoord = ST_Y(ST_TRANSFORM(geom + - ct2010 = (CASE WHEN ct2010::numeric = 0 THEN NULL ELSE ct2010 END) + +SQL Preview: +UPDATE stg__pluto_input_geocodes +SET + xcoord = ST_X(ST_TRANSFORM(geom, 2263))::integer, + ycoord = ST_Y(ST_TRANSFORM(geom, 2263))::integer, + ct2010 = (CASE WHEN ct2010::numeric = 0 THEN NULL E + +---------------------------------------- + +--- UPDATE #2 --- +Target Table: pluto_rpad_geo +Fields Updated: + - bbl = borough || LPAD(block + +SQL Preview: +UPDATE pluto_rpad_geo SET bbl = borough || LPAD(block, 5, '0') || LPAD(lot, 4, '0'); + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/dtmgeoms.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - block = '0' + +SQL Preview: +UPDATE pluto +SET block = '0' +WHERE block = ''; + +---------------------------------------- + +--- UPDATE #2 --- +Target Table: pluto +Fields Updated: + - lot = '0' + +SQL Preview: +UPDATE pluto +SET lot = '0' +WHERE lot = ''; + +---------------------------------------- + + +================================================================================ +FILE: 
pluto_build/sql/dtmmergepolygons.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto_dtm +Fields Updated: + - primebbl = condo_billing_bbl +Source Tables: pluto_condo + +SQL Preview: +UPDATE pluto_dtm +SET primebbl = condo_billing_bbl +FROM pluto_condo +WHERE + bbl = condo_base_bbl + AND condo_billing_bbl IS NOT NULL; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/geocode_billingbbl.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto_rpad_geo +Fields Updated: + - billingblock = SUBSTRING(billingbbl + - billinglot = RIGHT(billingbbl + +SQL Preview: +UPDATE pluto_rpad_geo +SET + billingblock = SUBSTRING(billingbbl, 2, 5), + billinglot = RIGHT(billingbbl, 4) +WHERE + billingbbl IS NOT NULL + AND billingbbl != '0000000000' + AND billingbbl ! 
+ +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/latlong.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - centroid = ST_SETSRID(ST_MAKEPOINT(longitude::double precision + +SQL Preview: +UPDATE pluto SET centroid = ST_SETSRID(ST_MAKEPOINT(longitude::double precision, latitude::double precision), 4326); + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/numericfields_geomfields.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - appbbl = '0' + +SQL Preview: +UPDATE pluto +SET appbbl = '0' +WHERE appbbl::numeric = 0; + +---------------------------------------- + +--- UPDATE #2 --- +Target Table: pluto +Fields Updated: + - sanitdistrict = sanitdistrict::integer + +SQL Preview: +UPDATE pluto +SET sanitdistrict = sanitdistrict::integer; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/plutomapid.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - plutomapid = '1' + +SQL Preview: +UPDATE pluto +SET plutomapid = '1' +WHERE + geom IS NOT NULL + AND plutomapid IS NULL; + +---------------------------------------- + +--- UPDATE #2 --- +Target Table: pluto +Fields Updated: + - plutomapid = '2' + +SQL Preview: +UPDATE pluto +SET plutomapid = '2' +WHERE geom IS NULL; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/primebbl.sql 
+================================================================================ + +--- UPDATE #1 --- +Target Table: pluto_rpad_geo +Fields Updated: + - primebbl = billingbbl + +SQL Preview: +UPDATE pluto_rpad_geo +SET primebbl = billingbbl +WHERE billingbbl IS NOT NULL AND billingbbl != '0000000000'; + +---------------------------------------- + +--- UPDATE #2 --- +Target Table: pluto_rpad_geo +Fields Updated: + - primebbl = boro || tb || tl + +SQL Preview: +UPDATE pluto_rpad_geo +SET primebbl = boro || tb || tl +WHERE primebbl IS NULL; + +---------------------------------------- + +--- UPDATE #3 --- +Target Table: dof_pts_propmaster +Fields Updated: + - primebbl = boro || tb || tl + +SQL Preview: +UPDATE dof_pts_propmaster +SET primebbl = boro || tb || tl +WHERE primebbl IS NULL; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/versions.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - version = '20v1' + +SQL Preview: +UPDATE pluto +SET version = '20v1'; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/yearbuiltalt.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto_allocated +Fields Updated: + - yearbuilt = '0' + +SQL Preview: +UPDATE pluto_allocated +SET yearbuilt = '0' +WHERE + yearbuilt IS NULL + OR yearbuilt::integer = 0; + +---------------------------------------- + +--- UPDATE #2 --- +Target Table: pluto_allocated +Fields Updated: + - yearalter1 = '0' + +SQL Preview: +UPDATE pluto_allocated +SET yearalter1 = '0' +WHERE + yearalter1 IS NULL + OR yearalter1::integer = 0; + +---------------------------------------- + +--- UPDATE #3 --- +Target Table: pluto_allocated +Fields Updated: 
+ - yearalter2 = '0' + +SQL Preview: +UPDATE pluto_allocated +SET yearalter2 = '0' +WHERE + yearalter2 IS NULL + OR yearalter2::integer = 0; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/zoning.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - splitzone = 'Y' + +SQL Preview: +UPDATE pluto +SET splitzone = 'Y' +WHERE + zonedist1 IS NOT NULL + AND ( + zonedist2 IS NOT NULL + OR overlay1 IS NOT NULL + OR spdist1 IS NOT NULL + ); + +---------------------------------------- + +--- UPDATE #2 --- +Target Table: pluto +Fields Updated: + - splitzone = 'Y' + +SQL Preview: +UPDATE pluto +SET splitzone = 'Y' +WHERE + overlay1 IS NOT NULL + AND ( + zonedist1 IS NOT NULL + OR overlay2 IS NOT NULL + OR spdist1 IS NOT NULL + ); + +---------------------------------------- + +--- UPDATE #3 --- +Target Table: pluto +Fields Updated: + - splitzone = 'Y' + +SQL Preview: +UPDATE pluto +SET splitzone = 'Y' +WHERE + spdist1 IS NOT NULL + AND ( + zonedist1 IS NOT NULL + OR overlay1 IS NOT NULL + OR spdist2 IS NOT NULL + ); + +---------------------------------------- + +--- UPDATE #4 --- +Target Table: pluto +Fields Updated: + - splitzone = 'N' + +SQL Preview: +UPDATE pluto +SET splitzone = 'N' +WHERE splitzone IS NULL AND zonedist1 IS NOT NULL; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/zoning_specialdistrict.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - spdist1 = 'CL' + - spdist2 = 'MiD' + +SQL Preview: +UPDATE pluto +SET + spdist1 = 'CL', + spdist2 = 'MiD' +WHERE + spdist1 = 'MiD' + AND spdist2 = 'CL'; + +---------------------------------------- + +--- UPDATE #2 --- +Target 
Table: pluto +Fields Updated: + - spdist1 = 'MiD' + - spdist2 = 'TA' + +SQL Preview: +UPDATE pluto +SET + spdist1 = 'MiD', + spdist2 = 'TA' +WHERE + spdist1 = 'TA' + AND spdist2 = 'MiD'; + +---------------------------------------- + +--- UPDATE #3 --- +Target Table: pluto +Fields Updated: + - spdist1 = '125th' + - spdist2 = 'TA' + +SQL Preview: +UPDATE pluto +SET + spdist1 = '125th', + spdist2 = 'TA' +WHERE + spdist1 = 'TA' + AND spdist2 = '125th'; + +---------------------------------------- + +--- UPDATE #4 --- +Target Table: pluto +Fields Updated: + - spdist1 = 'EC-5' + - spdist2 = 'MX-16' + +SQL Preview: +UPDATE pluto +SET + spdist1 = 'EC-5', + spdist2 = 'MX-16' +WHERE + spdist1 = 'MX-16' + AND spdist2 = 'EC-5'; + +---------------------------------------- + +--- UPDATE #5 --- +Target Table: pluto +Fields Updated: + - spdist1 = 'EC-6' + - spdist2 = 'MX-16' + +SQL Preview: +UPDATE pluto +SET + spdist1 = 'EC-6', + spdist2 = 'MX-16' +WHERE + spdist1 = 'MX-16' + AND spdist2 = 'EC-6'; + +---------------------------------------- + +--- UPDATE #6 --- +Target Table: pluto +Fields Updated: + - spdist1 = 'EHC' + - spdist2 = 'TA' + +SQL Preview: +UPDATE pluto +SET + spdist1 = 'EHC', + spdist2 = 'TA' +WHERE + spdist1 = 'TA' + AND spdist2 = 'EHC'; + +---------------------------------------- + +--- UPDATE #7 --- +Target Table: pluto +Fields Updated: + - spdist1 = 'MX-1' + - spdist2 = 'G' + +SQL Preview: +UPDATE pluto +SET + spdist1 = 'MX-1', + spdist2 = 'G' +WHERE + spdist1 = 'G' + AND spdist2 = 'MX-1'; + +---------------------------------------- + + +================================================================================ +FILE: pluto_build/sql/zoning_splitzone.sql +================================================================================ + +--- UPDATE #1 --- +Target Table: pluto +Fields Updated: + - splitzone = 'Y' + +SQL Preview: +UPDATE pluto +SET splitzone = 'Y' +WHERE + zonedist2 IS NOT NULL + OR overlay2 IS NOT NULL + OR spdist2 IS NOT NULL; + 
+---------------------------------------- + +--- UPDATE #2 --- +Target Table: pluto +Fields Updated: + - splitzone = 'N' + +SQL Preview: +UPDATE pluto +SET splitzone = 'N' +WHERE splitzone IS NULL AND zonedist1 IS NOT NULL; + +---------------------------------------- + +Parsing dependencies... + +Total UPDATE statements analyzed: 42 + +================================================================================ +MIGRATION GROUPS +================================================================================ + +CAMA +---------------------------------------- +Files (5): cama_bsmttype.sql, cama_easements.sql, cama_lottype.sql, cama_proxcode.sql, create_cama_primebbl.sql +Target tables: pluto, pluto_input_cama +Fields updated (5): bsmtcode, easements, lottype, primebbl, proxcode + +Classification/Defaults +---------------------------------------- +Files (1): bldgclass.sql +Target tables: pluto +Fields updated (1): bldgclass + +Corrections/Backfill +---------------------------------------- +Files (2): backfill.sql, corr_lotarea.sql +Target tables: pluto +Fields updated (1): builtfar + +DBT Integration (Already Migrated) +---------------------------------------- +Files (1): apply_dbt_enrichments.sql +Target tables: pluto +Fields updated (28): affresfar, areasource, builtfar, centroid, commfar, condono, edesignum, facilfar, histdist, irrlotcode + ... 
and 18 more + +Geocoding +---------------------------------------- +Files (4): dtmgeoms.sql, dtmmergepolygons.sql, geocode_billingbbl.sql, latlong.sql +Target tables: pluto, pluto_dtm, pluto_rpad_geo +Fields updated (6): billingblock, billinglot, block, centroid, lot, primebbl + +Other +---------------------------------------- +Files (5): apdate.sql, create_rpad_geo.sql, numericfields_geomfields.sql, plutomapid.sql, versions.sql +Target tables: pluto, pluto_rpad_geo, stg__pluto_input_geocodes +Fields updated (9): ap_datef, appbbl, bbl, ct2010, plutomapid, sanitdistrict, version, xcoord, ycoord + +Prime BBL +---------------------------------------- +Files (1): primebbl.sql +Target tables: dof_pts_propmaster, pluto_rpad_geo +Fields updated (1): primebbl + +Year Built +---------------------------------------- +Files (1): yearbuiltalt.sql +Target tables: pluto_allocated +Fields updated (3): yearalter1, yearalter2, yearbuilt + +Zoning +---------------------------------------- +Files (3): zoning.sql, zoning_specialdistrict.sql, zoning_splitzone.sql +Target tables: pluto +Fields updated (3): spdist1, spdist2, splitzone + +================================================================================ +LEAF CANDIDATES (Fields written but not read as dependencies) +================================================================================ + +Note: These fields appear to be final outputs with no downstream SQL dependencies +They are good candidates for early migration. + +dof_pts_propmaster: primebbl +pluto: affresfar, appbbl, areasource, bldgclass, block, bsmtcode, builtfar, centroid, commfar, condono, easements, edesignum, facilfar, histdist, irrlotcode + ... 
and 26 more +pluto_allocated: yearalter1, yearalter2, yearbuilt +pluto_dtm: primebbl +pluto_input_cama: primebbl +pluto_rpad_geo: ap_datef, bbl, billingblock, billinglot, primebbl +stg__pluto_input_geocodes: ct2010, xcoord, ycoord + +================================================================================ +INTERMEDIATE TABLES (Read by other updates) +================================================================================ + +These tables must be migrated before their dependents: + + +================================================================================ +RECOMMENDED MIGRATION ORDER +================================================================================ + +1. PRIME BBL (foundational - sets up primary key relationships) +2. CLASSIFICATION/DEFAULTS (simple null fills and classifications) +3. CAMA (business logic for various property attributes) +4. GEOCODING (lat/long, geometry operations) +5. ZONING (complex multi-table zoning logic) +6. YEAR BUILT (allocated table operations) +7. CORRECTIONS/BACKFILL (depends on other fields being set) + +Within each group, files can likely be migrated in parallel as separate dbt models. + + + +================================================================================ +CRITICAL: EXECUTION ORDER AND INTER-GROUP DEPENDENCIES +================================================================================ + +Based on analysis of products/pluto/pluto_build/02_build.sh: + +EXECUTION SEQUENCE (files with UPDATE statements only): +--------------------------------------------------------- + 1. primebbl.sql → Sets primebbl in multiple tables + 2. apdate.sql → Updates pluto_rpad_geo.ap_datef + 3. yearbuiltalt.sql → Updates pluto_allocated year fields + 4. create_cama_primebbl → Updates pluto_input_cama.primebbl + 5. cama_bsmttype.sql → Updates pluto.bsmtcode + 6. cama_lottype.sql → Updates pluto.lottype + 7. cama_proxcode.sql → Updates pluto.proxcode + 8. 
cama_easements.sql → Updates pluto.easements + 9. dtmmergepolygons.sql → Updates pluto_dtm.primebbl +10. dtmgeoms.sql → Updates pluto geometry fields +11. zoning_specialdistrict → Updates pluto.spdist1/2 +12. zoning_splitzone.sql → Updates pluto.splitzone +13. bldgclass.sql → Updates pluto.bldgclass +14. latlong.sql → Updates pluto lat/long/centroid ⚠️ BEFORE DBT +15. numericfields_geom... → Updates pluto numeric fields ⚠️ BEFORE DBT +16. ** DBT RUN ** → Creates pluto_enriched table +17. apply_dbt_enrich... → Applies dbt results to pluto +18. plutomapid.sql → Updates pluto.plutomapid +19. versions.sql → Updates pluto.version + +THEN IN 03_corrections.sh (separate script, runs after): +20. corr_lotarea.sql → Corrects lotarea/bldgarea/builtfar + +KEY FINDINGS: +------------- + +✅ NO CIRCULAR DEPENDENCIES + - backfill.sql is NOT USED (orphaned file, ignore it) + - corr_lotarea.sql runs in separate corrections phase (after main build) + +⚠️ CRITICAL ORDERING CONSTRAINT: + The following MUST run BEFORE the dbt models execute: + - latlong.sql (sets centroid, used by some dbt models) + - numericfields_geomfields.sql (sets sanitdistrict, used by dbt models) + + Current flow: + 1. SQL updates populate pluto fields + 2. DBT models read from pluto, create pluto_enriched + 3. 
apply_dbt_enrichments.sql copies from pluto_enriched back to pluto + +MIGRATION IMPLICATIONS: +----------------------- + +✅ SAFE TO MIGRATE (no dependencies on other SQL updates): + - CAMA group: bsmttype, lottype, proxcode, easements + - Zoning group: splitzone, specialdistrict + - Classification: bldgclass + - Year Built: yearbuiltalt (separate table) + - Prime BBL: primebbl + - Other: apdate, plutomapid, versions + +⚠️ MIGRATE WITH CARE (used by dbt models): + - latlong.sql → dbt models may reference centroid + - numericfields_geomfields.sql → dbt models reference sanitdistrict + - dtmgeoms.sql → geometry operations + + Strategy: Keep these as SQL until all dbt models are verified not to need + the pre-dbt state, OR migrate them and ensure dbt dependencies are explicit. + +🎯 RECOMMENDED APPROACH: + Migrate in waves, testing dbt integration after each: + 1. Wave 1: CAMA + Classification + Zoning (clearly independent) + 2. Wave 2: Prime BBL + Year Built (separate tables) + 3. Wave 3: Geocoding (latlong, dtmgeoms) - test dbt dependencies carefully + 4. Wave 4: Remaining (plutomapid, numericfields, versions) diff --git a/products/pluto/models/_sources.yml b/products/pluto/models/_sources.yml index c96e6b11a3..d1a1be3c1b 100644 --- a/products/pluto/models/_sources.yml +++ b/products/pluto/models/_sources.yml @@ -91,6 +91,8 @@ sources: description: >- Processed PTS data where each record is a UNIT BBL: i.e. there are multiple records per a condominium. 
+ - name: pluto_input_cama + description: CAMA data with primebbl added (created from pluto_input_cama_dof) - name: previous_pluto - name: pluto diff --git a/products/pluto/models/intermediate/cama/_cama_models.yml b/products/pluto/models/intermediate/cama/_cama_models.yml new file mode 100644 index 0000000000..521c3a5553 --- /dev/null +++ b/products/pluto/models/intermediate/cama/_cama_models.yml @@ -0,0 +1,46 @@ +version: 2 + +models: + - name: int_pluto__bsmtcode + description: Basement type code based on CAMA data (highest bsmnt_type and bsmntgradient) + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: bsmtcode + description: Basement code (5 = Unknown if no data) + + - name: int_pluto__lottype + description: Lot type classification from CAMA data (lowest non-zero, non-5 value) + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: lottype + description: Lot type code (0 = Mixed or Unknown if no data) + + - name: int_pluto__proxcode + description: Proxy code (property classification) from CAMA with DOF to DCP value recoding + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: proxcode + description: Proxy code (0 = Not Available if no data) + + - name: int_pluto__easements + description: Count of distinct easements associated with each lot + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: easements + description: Number of easements (0 if none) diff --git a/products/pluto/models/intermediate/cama/int_pluto__bsmtcode.sql b/products/pluto/models/intermediate/cama/int_pluto__bsmtcode.sql new file mode 100644 index 0000000000..9ac3ebcf37 --- /dev/null +++ b/products/pluto/models/intermediate/cama/int_pluto__bsmtcode.sql @@ -0,0 +1,83 @@ +-- Migrated from: pluto_build/sql/cama_bsmttype.sql +-- Assigns basement type code based on 
CAMA data +-- Logic: +-- 1. Get highest bsmnt_type and bsmntgradient for each lot (where bldgnum = '1') +-- 2. Match to pluto_input_bsmtcode lookup table to get bsmtcode +-- 3. Assign '5' (Unknown) to lots without basement data + +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +WITH + +{% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} +-- Dev mode: Sample 20 BBLs per borough for fast iteration +dev_sample_bbls AS ( + SELECT DISTINCT bbl + FROM ( + SELECT bbl, + ROW_NUMBER() OVER (PARTITION BY LEFT(bbl, 1) ORDER BY RANDOM()) AS rn + FROM {{ target.schema }}.pluto + ) sub + WHERE rn <= 20 +), +{% endif %} + +base_pluto AS ( + SELECT bbl + FROM {{ target.schema }}.pluto + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + WHERE bbl IN (SELECT bbl FROM dev_sample_bbls) + {% endif %} +), + +-- Get highest bsmnt_type and bsmntgradient value for each lot +-- Remove 0 (Unknown) bsmnt_type +-- Filter to building number 1 only +cama_ranked AS ( + SELECT + primebbl AS bbl, + bsmnt_type, + bsmntgradient, + ROW_NUMBER() OVER ( + PARTITION BY primebbl + ORDER BY bsmnt_type DESC, bsmntgradient DESC + ) AS row_number + FROM {{ source('build_sources', 'pluto_input_cama') }} + WHERE + bsmnt_type != '0' + AND bldgnum = '1' +), + +-- Match bsmnt_type and bsmntgradient values to lookup table +dcpcamavals AS ( + SELECT DISTINCT + x.bbl, + x.bsmnt_type, + x.bsmntgradient, + b.bsmtcode + FROM cama_ranked AS x + LEFT JOIN {{ ref('pluto_input_bsmtcode') }} AS b + ON x.bsmnt_type = b.bsmnt_type AND x.bsmntgradient = b.bsmntgradient + WHERE x.row_number = 1 +), + +-- Join with base pluto and assign defaults +bsmtcode_final AS ( + SELECT + p.bbl, + COALESCE(d.bsmtcode, '5') AS bsmtcode + FROM base_pluto AS p + LEFT JOIN dcpcamavals AS d + ON p.bbl = d.bbl +) + +SELECT + bbl, + bsmtcode +FROM bsmtcode_final diff --git a/products/pluto/models/intermediate/cama/int_pluto__easements.sql 
b/products/pluto/models/intermediate/cama/int_pluto__easements.sql new file mode 100644 index 0000000000..150fe78d54 --- /dev/null +++ b/products/pluto/models/intermediate/cama/int_pluto__easements.sql @@ -0,0 +1,71 @@ +-- Migrated from: pluto_build/sql/cama_easements.sql +-- Sets the number of distinct easements associated with each lot +-- Logic: +-- 1. Get distinct easements for each lot from int__dof_pts_propmaster +-- 2. Count the number of distinct easements per lot +-- 3. Set easements to 0 for lots with no easements + +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +WITH + +{% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} +-- Dev mode: Sample 20 BBLs per borough for fast iteration +dev_sample_bbls AS ( + SELECT DISTINCT bbl + FROM ( + SELECT bbl, + ROW_NUMBER() OVER (PARTITION BY LEFT(bbl, 1) ORDER BY RANDOM()) AS rn + FROM {{ target.schema }}.pluto + ) sub + WHERE rn <= 20 +), +{% endif %} + +base_pluto AS ( + SELECT bbl + FROM {{ target.schema }}.pluto + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + WHERE bbl IN (SELECT bbl FROM dev_sample_bbls) + {% endif %} +), + +-- Get distinct easements for each lot +distincteasements AS ( + SELECT DISTINCT + primebbl AS bbl, + ease + FROM {{ ref('int__dof_pts_propmaster') }} + WHERE ease IS NOT NULL AND ease != ' ' +), + +-- Count the number of distinct easements for each lot +counteasements AS ( + SELECT + bbl, + COUNT(*) AS numeasements + FROM distincteasements + WHERE ease IS NOT NULL + GROUP BY bbl +), + +-- Join with base pluto and assign defaults +easements_final AS ( + SELECT + p.bbl, + COALESCE(c.numeasements::text, '0') AS easements + FROM base_pluto AS p + LEFT JOIN counteasements AS c + ON p.bbl = c.bbl +) + +SELECT + bbl, + easements +FROM easements_final diff --git a/products/pluto/models/intermediate/cama/int_pluto__lottype.sql b/products/pluto/models/intermediate/cama/int_pluto__lottype.sql new file mode 
100644 index 0000000000..d25ddbbb80 --- /dev/null +++ b/products/pluto/models/intermediate/cama/int_pluto__lottype.sql @@ -0,0 +1,93 @@ +-- Migrated from: pluto_build/sql/cama_lottype.sql +-- Assigns lot type based on CAMA data +-- Logic: +-- 1. Remove 0s (Not Available) and 5 (none of the other types) +-- 2. Select lowest lot type value where bldgnum is 1 +-- 3. If no value found, check for lot type value of 5 +-- 4. Assign '0' (Mixed or Unknown) to remaining records + +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +WITH + +{% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} +-- Dev mode: Sample 20 BBLs per borough for fast iteration +dev_sample_bbls AS ( + SELECT DISTINCT bbl + FROM ( + SELECT bbl, + ROW_NUMBER() OVER (PARTITION BY LEFT(bbl, 1) ORDER BY RANDOM()) AS rn + FROM {{ target.schema }}.pluto + ) sub + WHERE rn <= 20 +), +{% endif %} + +base_pluto AS ( + SELECT bbl + FROM {{ target.schema }}.pluto + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + WHERE bbl IN (SELECT bbl FROM dev_sample_bbls) + {% endif %} +), + +-- Get lowest lot type value for each lot (excluding 0 and 5) +-- Filter to building number 1 only +cama_ranked AS ( + SELECT + primebbl AS bbl, + lottype, + ROW_NUMBER() OVER ( + PARTITION BY primebbl + ORDER BY lottype + ) AS row_number + FROM {{ source('build_sources', 'pluto_input_cama') }} + WHERE + lottype != '0' + AND lottype != '5' + AND bldgnum = '1' +), + +dcpcamavals AS ( + SELECT DISTINCT + bbl, + lottype + FROM cama_ranked + WHERE row_number = 1 +), + +-- Get lots with lot type value of 5 (as fallback) +lottype_five AS ( + SELECT DISTINCT + primebbl AS bbl, + lottype + FROM {{ source('build_sources', 'pluto_input_cama') }} + WHERE lottype = '5' +), + +-- Join with base pluto and apply priority logic +lottype_final AS ( + SELECT + p.bbl, + COALESCE( + d.lottype, -- First try the preferred lot types + f.lottype, -- Then try lot type 5 + '0' -- 
Finally default to 0 (Mixed or Unknown) + ) AS lottype + FROM base_pluto AS p + LEFT JOIN dcpcamavals AS d + ON p.bbl = d.bbl + LEFT JOIN lottype_five AS f + ON p.bbl = f.bbl +) + +SELECT + bbl, + lottype +FROM lottype_final diff --git a/products/pluto/models/intermediate/cama/int_pluto__proxcode.sql b/products/pluto/models/intermediate/cama/int_pluto__proxcode.sql new file mode 100644 index 0000000000..c6722b4a38 --- /dev/null +++ b/products/pluto/models/intermediate/cama/int_pluto__proxcode.sql @@ -0,0 +1,81 @@ +-- Migrated from: pluto_build/sql/cama_proxcode.sql +-- Assigns proxy code (property classification) +-- Logic: +-- 1. Recode DOF values to DCP values (5->2, 4->3, 6->3) +-- 2. Remove 0s (Not Available) and 'N' values +-- 3. Select proxcode from record where bldgnum is 1 +-- 4. Take max proxcode if multiple values exist +-- 5. Assign '0' (Not Available) to remaining records + +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +WITH + +{% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} +-- Dev mode: Sample 20 BBLs per borough for fast iteration +dev_sample_bbls AS ( + SELECT DISTINCT bbl + FROM ( + SELECT bbl, + ROW_NUMBER() OVER (PARTITION BY LEFT(bbl, 1) ORDER BY RANDOM()) AS rn + FROM {{ target.schema }}.pluto + ) sub + WHERE rn <= 20 +), +{% endif %} + +base_pluto AS ( + SELECT bbl + FROM {{ target.schema }}.pluto + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + WHERE bbl IN (SELECT bbl FROM dev_sample_bbls) + {% endif %} +), + +-- Recode DOF values to DCP values and filter +-- Only use records where bldgnum is 1 +dcpcamavals AS ( + SELECT + primebbl AS bbl, + (CASE + WHEN proxcode = '5' THEN '2' + WHEN proxcode = '4' THEN '3' + WHEN proxcode = '6' THEN '3' + ELSE proxcode + END) AS proxcode + FROM {{ source('build_sources', 'pluto_input_cama') }} + WHERE + proxcode != '0' + AND proxcode != 'N' + AND bldgnum = '1' +), + +-- Take max proxcode for each BBL 
+max_bbl_proxcodes AS ( + SELECT + bbl, + MAX(proxcode) AS proxcode + FROM dcpcamavals + GROUP BY bbl +), + +-- Join with base pluto and assign defaults +proxcode_final AS ( + SELECT + p.bbl, + COALESCE(m.proxcode, '0') AS proxcode + FROM base_pluto AS p + LEFT JOIN max_bbl_proxcodes AS m + ON p.bbl = m.bbl +) + +SELECT + bbl, + proxcode +FROM proxcode_final diff --git a/products/pluto/models/intermediate/miharea/_miharea_models.yml b/products/pluto/models/intermediate/miharea/_miharea_models.yml new file mode 100644 index 0000000000..c864bffbce --- /dev/null +++ b/products/pluto/models/intermediate/miharea/_miharea_models.yml @@ -0,0 +1,25 @@ +version: 2 + +models: + - name: int_mih__cleaned + description: Cleaned MIH option names with standardized formatting + + - name: int_mih__lot_overlap + description: Spatial overlap calculations between tax lots and MIH areas + + - name: int_pluto__miharea + description: MIH affordability options pivoted to binary columns per BBL + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: mih_opt1 + description: MIH Option 1 flag + - name: mih_opt2 + description: MIH Option 2 flag + - name: mih_opt3 + description: MIH Option 3 / Deep Affordability flag + - name: mih_opt4 + description: MIH Workforce Option flag diff --git a/products/pluto/models/intermediate/miharea/int_mih__cleaned.sql b/products/pluto/models/intermediate/miharea/int_mih__cleaned.sql new file mode 100644 index 0000000000..eb594aea54 --- /dev/null +++ b/products/pluto/models/intermediate/miharea/int_mih__cleaned.sql @@ -0,0 +1,36 @@ +{{ + config( + materialized='table', + tags=['pluto_enrichment'] + ) +}} + +-- Clean MIH option names and create unique identifiers +-- Handles typos and standardizes formatting + +SELECT + project_id || '-' || mih_option AS mih_id, + *, + TRIM( + -- Step 2b: collapse any sequence of commas (e.g., ",,", ",,,") + REGEXP_REPLACE( + -- Step 2a: Replace "and" or "," (with any 
spaces) with a single comma + REGEXP_REPLACE( + -- Step 1: Add space between "Option" and number + REGEXP_REPLACE( + REPLACE(mih_option, 'Affordablility', 'Affordability'), + 'Option(\d)', + 'Option \1', + 'g' + ), + '\s*(,|and)\s*', + ',', + 'g' + ), + ',+', + ',', + 'g' + ), + ', ' + ) AS cleaned_option +FROM {{ ref('stg__dcp_gis_mandatory_inclusionary_housing') }} diff --git a/products/pluto/models/intermediate/miharea/int_mih__lot_overlap.sql b/products/pluto/models/intermediate/miharea/int_mih__lot_overlap.sql new file mode 100644 index 0000000000..c9077a9858 --- /dev/null +++ b/products/pluto/models/intermediate/miharea/int_mih__lot_overlap.sql @@ -0,0 +1,56 @@ +{{ + config( + materialized='table', + tags=['pluto_enrichment'] + ) +}} + +-- Calculate spatial overlap between tax lots and MIH areas +-- A lot is assigned to a MIH area if: +-- 1. ≥10% of the lot area is covered by the MIH area, OR +-- 2. ≥50% of the MIH area is covered by the lot + +WITH mih_per_area AS ( + SELECT + p.bbl, + m.project_id, + m.mih_id, + m.wkb_geometry AS mih_geom, + p.geom AS lot_geom, + m.cleaned_option, + ST_AREA( + CASE + WHEN ST_COVEREDBY(p.geom, m.wkb_geometry) THEN p.geom + ELSE ST_MULTI(ST_INTERSECTION(p.geom, m.wkb_geometry)) + END + ) AS segbblgeom, + ST_AREA(p.geom) AS allbblgeom, + ST_AREA( + CASE + WHEN ST_COVEREDBY(m.wkb_geometry, p.geom) THEN m.wkb_geometry + ELSE ST_MULTI(ST_INTERSECTION(m.wkb_geometry, p.geom)) + END + ) AS segmihgeom, + ST_AREA(m.wkb_geometry) AS allmihgeom + FROM {{ target.schema }}.pluto AS p + INNER JOIN {{ ref('int_mih__cleaned') }} AS m + ON ST_INTERSECTS(p.geom, m.wkb_geometry) +), + +mih_areas AS ( + SELECT + bbl, + cleaned_option, + project_id, + mih_id, + SUM(segbblgeom) AS segbblgeom, + SUM(segmihgeom) AS segmihgeom, + SUM(segbblgeom / allbblgeom) * 100 AS perbblgeom, + MAX(segmihgeom / allmihgeom) * 100 AS maxpermihgeom + FROM mih_per_area + GROUP BY bbl, cleaned_option, project_id, mih_id +) + +SELECT * +FROM mih_areas +WHERE perbblgeom 
>= 10 OR maxpermihgeom >= 50 diff --git a/products/pluto/models/intermediate/miharea/int_pluto__miharea.sql b/products/pluto/models/intermediate/miharea/int_pluto__miharea.sql new file mode 100644 index 0000000000..b8eaff4890 --- /dev/null +++ b/products/pluto/models/intermediate/miharea/int_pluto__miharea.sql @@ -0,0 +1,47 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Pivot MIH options into binary columns +-- A single lot can have multiple MIH options that all apply + +WITH bbls_with_all_options AS ( + SELECT + bbl, + STRING_AGG(cleaned_option, ',') AS all_options + FROM {{ ref('int_mih__lot_overlap') }} + GROUP BY bbl +), + +pivoted AS ( + SELECT + bbl, + CASE + WHEN all_options LIKE '%Option 1%' THEN '1' + END AS mih_opt1, + CASE + WHEN all_options LIKE '%Option 2%' THEN '1' + END AS mih_opt2, + CASE + WHEN + all_options LIKE '%Option 3%' + OR all_options LIKE '%Deep Affordability Option%' + THEN '1' + END AS mih_opt3, + CASE + WHEN all_options LIKE '%Workforce Option%' THEN '1' + END AS mih_opt4 + FROM bbls_with_all_options +) + +SELECT + bbl, + mih_opt1, + mih_opt2, + mih_opt3, + mih_opt4 +FROM pivoted diff --git a/products/pluto/models/intermediate/prep/int__pluto_allocated.sql b/products/pluto/models/intermediate/prep/int__pluto_allocated.sql new file mode 100644 index 0000000000..b85c96bd98 --- /dev/null +++ b/products/pluto/models/intermediate/prep/int__pluto_allocated.sql @@ -0,0 +1,246 @@ +-- Migrated from: pluto_build/sql/create_allocated.sql +-- Also incorporates: pluto_build/sql/yearbuiltalt.sql +-- Creates the allocated table by aggregating condo data from int__pluto_rpad_geo +-- This table is used to populate ~25 fields in the base PLUTO table + +{{ + config( + materialized='table', + indexes=[ + {'columns': ['bbl'], 'unique': True} + ] + ) +}} + +WITH + +{% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} +-- Dev mode: Sample BBLs for fast iteration 
+dev_sample_bbls AS ( + SELECT DISTINCT primebbl + FROM {{ ref('int__pluto_rpad_geo') }} + LIMIT 100 +), +{% endif %} + +-- Get distinct primebbl records as the base +base_bbls AS ( + SELECT DISTINCT primebbl AS bbl + FROM {{ ref('int__pluto_rpad_geo') }} + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + WHERE primebbl IN (SELECT primebbl FROM dev_sample_bbls) + {% endif %} +), + +-- One-to-one attributes for non-condo records +noncondo_attrs AS ( + SELECT + primebbl AS bbl, + bldgcl AS bldgclass, + story::text AS numfloors, + lfft::text AS lotfront, + ldft::text AS lotdepth, + bfft::text AS bldgfront, + bdft::text AS bldgdepth, + ext, + condo_number AS condono, + land_area::text AS lotarea, + gross_sqft::text AS bldgarea, + yrbuilt AS yearbuilt, + yralt1 AS yearalter1, + yralt2 AS yearalter2, + owner AS ownername, + irreg AS irrlotcode, + concat(housenum_lo, ' ', street_name) AS address, + CASE + WHEN numberofexistingstructuresonlot::integer > 0 + THEN numberofexistingstructuresonlot::integer::text + ELSE bldgs::text + END AS numbldgs, + ap_boro || lpad(ap_block, 5, '0') || lpad(ap_lot, 4, '0') AS appbbl, + ap_datef AS appdate + FROM {{ ref('int__pluto_rpad_geo') }} + WHERE + tl NOT LIKE '75%' + AND condo_number IS NULL + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + AND primebbl IN (SELECT primebbl FROM dev_sample_bbls) + {% endif %} +), + +-- One-to-one attributes for condo records (75xx lots) +condo_attrs AS ( + SELECT + primebbl AS bbl, + bldgcl AS bldgclass, + story::text AS numfloors, + lfft::text AS lotfront, + ldft::text AS lotdepth, + bfft::text AS bldgfront, + bdft::text AS bldgdepth, + ext, + condo_number AS condono, + land_area::text AS lotarea, + yrbuilt AS yearbuilt, + yralt1 AS yearalter1, + yralt2 AS yearalter2, + owner AS ownername, + irreg AS irrlotcode, + concat(housenum_lo, ' ', street_name) AS address, + CASE + WHEN numberofexistingstructuresonlot::integer > 0 + THEN numberofexistingstructuresonlot::integer::text + ELSE 
bldgs::text + END AS numbldgs, + ap_boro || lpad(ap_block, 5, '0') || lpad(ap_lot, 4, '0') AS appbbl, + ap_datef AS appdate + FROM {{ ref('int__pluto_rpad_geo') }} + WHERE + tl LIKE '75%' + AND condo_number IS NOT NULL + AND condo_number <> '0' + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + AND primebbl IN (SELECT primebbl FROM dev_sample_bbls) + {% endif %} +), + +-- Aggregate building area for condo units +bldgarea_agg AS ( + SELECT + primebbl AS bbl, + sum(gross_sqft::numeric)::text AS bldgarea + FROM {{ ref('int__pluto_rpad_geo') }} + WHERE + tl NOT LIKE '75%' + AND condo_number IS NOT NULL + AND condo_number <> '0' + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + AND primebbl IN (SELECT primebbl FROM dev_sample_bbls) + {% endif %} + GROUP BY primebbl +), + +-- Aggregate unit counts +units_agg AS ( + SELECT + primebbl AS bbl, + sum(coop_apts::integer)::text AS unitsres, + sum(units::integer)::text AS unitstotal + FROM {{ ref('int__pluto_rpad_geo') }} + WHERE tl NOT LIKE '75%' + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + AND primebbl IN (SELECT primebbl FROM dev_sample_bbls) + {% endif %} + GROUP BY primebbl +), + +-- Aggregate financial fields +financial_agg AS ( + SELECT + primebbl AS bbl, + sum(curavl_act::double precision)::text AS assessland, + sum(curavt_act::double precision)::text AS assesstot, + sum(curext_act::double precision)::text AS exempttot + FROM {{ ref('int__pluto_rpad_geo') }} + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + WHERE primebbl IN (SELECT primebbl FROM dev_sample_bbls) + {% endif %} + GROUP BY primebbl +), + +-- Fill missing appbbl for condo lots from unit lots +unit_appbbls AS ( + SELECT + prg.primebbl AS bbl, + min(prg.ap_boro || lpad(prg.ap_block, 5, '0') || lpad(prg.ap_lot, 4, '0')) AS appbbl + FROM {{ ref('int__pluto_rpad_geo') }} AS prg + WHERE + right(prg.primebbl, 4) LIKE '75%' + AND prg.primebbl <> prg.bbl + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + AND prg.primebbl 
IN (SELECT primebbl FROM dev_sample_bbls) + {% endif %} + GROUP BY prg.primebbl +), + +-- Supplementary attributes from condo descriptive table +supplementary_attrs AS ( + SELECT + "PARID" AS bbl, + "LandSize"::text AS lotarea, + "Story"::text AS numfloors, + "YearBuilt"::text AS yearbuilt + FROM {{ ref('pluto_input_condolot_descriptiveattributes') }} + WHERE + "LandSize"::numeric > 0 + OR "Story"::numeric > 0 + OR "YearBuilt"::numeric > 0 +) + +-- Final assembly with COALESCE to merge all sources +SELECT + base.bbl, + -- Prefer condo attributes, fallback to noncondo + COALESCE(condo.bldgclass, noncondo.bldgclass) AS bldgclass, + NULL AS ownertype, -- This field is not populated in original SQL + COALESCE(condo.ownername, noncondo.ownername) AS ownername, + -- Lotarea: prefer noncondo/condo, fallback to supplementary if zero + CASE + WHEN COALESCE(condo.lotarea, noncondo.lotarea, '0')::numeric = 0 + THEN COALESCE(supp.lotarea, condo.lotarea, noncondo.lotarea) + ELSE COALESCE(condo.lotarea, noncondo.lotarea) + END AS lotarea, + -- Bldgarea: prefer aggregated for condos, fallback to noncondo + COALESCE(bldgarea_agg.bldgarea, noncondo.bldgarea) AS bldgarea, + COALESCE(condo.numbldgs, noncondo.numbldgs) AS numbldgs, + -- Numfloors: prefer noncondo/condo, fallback to supplementary if zero + CASE + WHEN COALESCE(condo.numfloors, noncondo.numfloors, '0')::numeric = 0 + THEN COALESCE(supp.numfloors, condo.numfloors, noncondo.numfloors) + ELSE COALESCE(condo.numfloors, noncondo.numfloors) + END AS numfloors, + units.unitsres, + units.unitstotal, + COALESCE(condo.lotfront, noncondo.lotfront) AS lotfront, + COALESCE(condo.lotdepth, noncondo.lotdepth) AS lotdepth, + COALESCE(condo.bldgfront, noncondo.bldgfront) AS bldgfront, + COALESCE(condo.bldgdepth, noncondo.bldgdepth) AS bldgdepth, + COALESCE(condo.ext, noncondo.ext) AS ext, + COALESCE(condo.irrlotcode, noncondo.irrlotcode) AS irrlotcode, + fin.assessland, + fin.assesstot, + NULL AS exemptland, -- Field no longer exists in 
source + fin.exempttot, + -- Yearbuilt: prefer noncondo/condo, fallback to supplementary if zero, normalize NULL/0 to '0' + CASE + WHEN COALESCE(condo.yearbuilt, noncondo.yearbuilt, '0')::numeric = 0 + THEN COALESCE(supp.yearbuilt, '0') + ELSE COALESCE(condo.yearbuilt, noncondo.yearbuilt) + END AS yearbuilt, + -- Yearalter1: normalize NULL/0 to '0' (from yearbuiltalt.sql) + CASE + WHEN COALESCE(condo.yearalter1, noncondo.yearalter1) IS NULL + OR COALESCE(condo.yearalter1, noncondo.yearalter1, '0')::numeric = 0 + THEN '0' + ELSE COALESCE(condo.yearalter1, noncondo.yearalter1) + END AS yearalter1, + -- Yearalter2: normalize NULL/0 to '0' (from yearbuiltalt.sql) + CASE + WHEN COALESCE(condo.yearalter2, noncondo.yearalter2) IS NULL + OR COALESCE(condo.yearalter2, noncondo.yearalter2, '0')::numeric = 0 + THEN '0' + ELSE COALESCE(condo.yearalter2, noncondo.yearalter2) + END AS yearalter2, + COALESCE(condo.condono, noncondo.condono) AS condono, + -- Appbbl: prefer noncondo/condo, fallback to unit_appbbls for missing condo lots + COALESCE(condo.appbbl, noncondo.appbbl, unit_appbbls.appbbl) AS appbbl, + COALESCE(condo.appdate, noncondo.appdate) AS appdate, + COALESCE(condo.address, noncondo.address) AS address +FROM base_bbls AS base +LEFT JOIN noncondo_attrs AS noncondo ON base.bbl = noncondo.bbl +LEFT JOIN condo_attrs AS condo ON base.bbl = condo.bbl +LEFT JOIN bldgarea_agg ON base.bbl = bldgarea_agg.bbl +LEFT JOIN units_agg AS units ON base.bbl = units.bbl +LEFT JOIN financial_agg AS fin ON base.bbl = fin.bbl +LEFT JOIN unit_appbbls ON base.bbl = unit_appbbls.bbl +LEFT JOIN supplementary_attrs AS supp ON base.bbl = supp.bbl diff --git a/products/pluto/models/intermediate/rpad/_rpad_models.yml b/products/pluto/models/intermediate/rpad/_rpad_models.yml new file mode 100644 index 0000000000..1b02eb7801 --- /dev/null +++ b/products/pluto/models/intermediate/rpad/_rpad_models.yml @@ -0,0 +1,49 @@ +version: 2 + +models: + - name: int__dof_pts_propmaster + description: | + 
Transforms raw PTS (Property Tax System) data into standardized format. + Migrated from pluto_build/sql/create_pts.sql + columns: + - name: boro + description: Borough code (1-5) + tests: + - not_null + - name: tb + description: Tax block + tests: + - not_null + - name: tl + description: Tax lot + tests: + - not_null + - name: bbl + description: Borough-Block-Lot identifier (10 digits) + + - name: int__pluto_rpad_geo + description: | + Critical intermediate table joining DOF property tax data with DCP geocodes. + This feeds into PLUTO creation and incorporates logic from multiple SQL files: + - create_rpad_geo.sql (base join) + - zerovacantlots.sql (vacant lot adjustments) + - lotarea.sql (lot area calculations) + - primebbl.sql (prime BBL assignment) + - apdate.sql (date formatting) + - geocode_billingbbl.sql (billing BBL parsing) + columns: + - name: bbl + description: Computed BBL (boro + padded block + padded lot) + tests: + - not_null + - unique + - name: primebbl + description: Prime BBL (for condos uses billingbbl, otherwise own BBL) + tests: + - not_null + - name: land_area + description: Lot area in square feet, calculated from frontage x depth if missing + - name: xcoord + description: X coordinate (State Plane), backfilled from lat/long if needed + - name: ycoord + description: Y coordinate (State Plane), backfilled from lat/long if needed diff --git a/products/pluto/models/intermediate/rpad/int__dof_pts_propmaster.sql b/products/pluto/models/intermediate/rpad/int__dof_pts_propmaster.sql new file mode 100644 index 0000000000..f08145721f --- /dev/null +++ b/products/pluto/models/intermediate/rpad/int__dof_pts_propmaster.sql @@ -0,0 +1,78 @@ +-- Migrated from: pluto_build/sql/create_pts.sql +-- Transforms raw PTS (Property Tax System) data into dof_pts_propmaster format + +{{ + config( + materialized='table', + indexes=[ + {'columns': ['boro', 'tb', 'tl'], 'unique': False}, + {'columns': ['bbl'], 'unique': False} + ] + ) +}} + +WITH base_pts AS ( + SELECT 
+ boro, + block AS tb, + lot AS tl, + parid AS bbl, + street_name, + housenum_lo, + housenum_hi, + aptno, + zip_code AS zip, + bldg_class AS bldgcl, + ease, + av_owner AS owner, + REPLACE(land_area, '+', '')::double precision AS land_area, + REPLACE(gross_sqft, '+', '')::double precision AS gross_sqft, + REPLACE(residential_area_gross, '+', '')::double precision AS residarea, + REPLACE(office_area_gross, '+', '')::double precision AS officearea, + REPLACE(retail_area_gross, '+', '')::double precision AS retailarea, + REPLACE(garage_area, '+', '')::double precision AS garagearea, + REPLACE(storage_area_gross, '+', '')::double precision AS storagearea, + REPLACE(factory_area_gross, '+', '')::double precision AS factoryarea, + REPLACE(other_area_gross, '+', '')::double precision AS otherarea, + REPLACE(num_bldgs, '+', '')::double precision AS bldgs, + REPLACE(bld_story, '+', '')::double precision AS story, + REPLACE(coop_apts, '+', '')::double precision AS coop_apts, + REPLACE(units, '+', '')::double precision AS units, + bld_ext AS ext, + lot_irreg AS irreg, + REPLACE(curactland, '+', '')::double precision AS curavl_act, + REPLACE(curacttot, '+', '')::double precision AS curavt_act, + REPLACE(curactextot, '+', '')::double precision AS curext_act, + yrbuilt, + yralt1, + yralt2, + condo_number, + appt_boro AS ap_boro, + appt_block AS ap_block, + appt_lot AS ap_lot, + appt_ease AS ap_ease, + appt_date AS ap_date, + ROUND(REPLACE(lot_frt, '+', '')::numeric, 2) AS lfft, + ROUND(REPLACE(lot_dep, '+', '')::numeric, 2) AS ldft, + ROUND(REPLACE(bld_frt, '+', '')::numeric, 2) AS bfft, + ROUND(REPLACE(bld_dep, '+', '')::numeric, 2) AS bdft + FROM {{ ref('stg__pluto_pts') }} +), + +-- Add primebbl logic (from primebbl.sql) +with_primebbl AS ( + SELECT + pts.*, + COALESCE( + CASE + WHEN geo.billingbbl IS NOT NULL AND geo.billingbbl != '0000000000' + THEN geo.billingbbl + END, + pts.boro || pts.tb || pts.tl + ) AS primebbl + FROM base_pts pts + LEFT JOIN {{ 
ref('stg__pluto_input_geocodes') }} geo + ON pts.boro || pts.tb || pts.tl = geo.borough || LPAD(geo.block, 5, '0') || LPAD(geo.lot, 4, '0') +) + +SELECT * FROM with_primebbl diff --git a/products/pluto/models/intermediate/rpad/int__pluto_rpad_geo.sql b/products/pluto/models/intermediate/rpad/int__pluto_rpad_geo.sql new file mode 100644 index 0000000000..bf9954b860 --- /dev/null +++ b/products/pluto/models/intermediate/rpad/int__pluto_rpad_geo.sql @@ -0,0 +1,323 @@ +-- Migrated from: pluto_build/sql/create_rpad_geo.sql +-- Also incorporates logic from: +-- - zerovacantlots.sql +-- - lotarea.sql +-- - primebbl.sql +-- - apdate.sql +-- - geocode_billingbbl.sql +-- +-- Joins DOF property tax data with DCP geocodes +-- This is the critical intermediate table that feeds into PLUTO + +{{ + config( + materialized='table', + indexes=[ + {'columns': ['bbl'], 'unique': False}, + {'columns': ['primebbl'], 'unique': False} + ] + ) +}} + +WITH + +{% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} +-- Dev mode: Sample 20 BBLs per borough for fast iteration +dev_sample_bbls AS ( + SELECT boro || tb || tl AS bbl_key + FROM ( + SELECT DISTINCT boro, tb, tl, + ROW_NUMBER() OVER (PARTITION BY boro ORDER BY RANDOM()) AS rn + FROM {{ ref('int__dof_pts_propmaster') }} + ) sub + WHERE rn <= 20 +), +{% endif %} + +-- Prepare geocodes with coordinate transformations +geocodes_prepared AS ( + SELECT + *, + -- Compute coordinates from geometry as text (matching original behavior) + ST_X(ST_TRANSFORM(geom, 2263))::integer::text AS xcoord_calc, + ST_Y(ST_TRANSFORM(geom, 2263))::integer::text AS ycoord_calc, + -- Handle ct2010 = 0 case + CASE WHEN ct2010::numeric = 0 THEN NULL ELSE ct2010 END AS ct2010_fixed + FROM {{ ref('stg__pluto_input_geocodes') }} +), + +-- Deduplicate PTS records by BBL, keeping the best one +pluto_rpad_deduped AS ( + SELECT + a.*, + ROW_NUMBER() OVER ( + PARTITION BY boro || tb || tl + ORDER BY curavt_act DESC, land_area DESC, ease ASC + ) AS row_number + FROM {{ 
ref('int__dof_pts_propmaster') }} AS a + {% if env_var('PLUTO_DEV_MODE', 'false') == 'true' %} + INNER JOIN dev_sample_bbls s + ON a.boro || a.tb || a.tl = s.bbl_key + {% endif %} +), + +pluto_rpad_single AS ( + SELECT * + FROM pluto_rpad_deduped + WHERE row_number = 1 +), + +-- Join PTS data with geocodes +base_join AS ( + SELECT + a.*, + -- Include all geocode fields except geo_bbl (avoid name collision) + b.billingbbl, + b.cd, + b.ct2010_fixed AS ct2010, + b.cb2010, + b.ct2020, + b.cb2020, + b.schooldist, + b.council, + b.zipcode, + b.firecomp, + b.policeprct, + b.healthcenterdistrict, + b.healtharea, + b.sanitdistrict, + b.sanitsub, + b.boepreferredstreetname, + b.taxmap, + b.sanbornmapidentifier, + b.latitude, + b.longitude, + b.grc, + b.grc2, + b.msg, + b.msg2, + b.borough, + b.block, + b.lot, + b.easement, + b.input_hnum, + b.input_sname, + -- Rename to match expected column name from original SQL + b.numberofexistingstructures AS numberofexistingstructuresonlot, + b.geom, + b.ogc_fid, + b.data_library_version, + b.xcoord_calc AS xcoord_geo, + b.ycoord_calc AS ycoord_geo + FROM pluto_rpad_single AS a + LEFT JOIN geocodes_prepared AS b + ON a.boro || a.tb || a.tl = b.borough || LPAD(b.block, 5, '0') || LPAD(b.lot, 4, '0') +), + +-- Incorporate zerovacantlots.sql logic +-- Zero out building metrics for vacant lots +with_vacant_adjustments AS ( + SELECT + *, + CASE + WHEN curavl_act = curavt_act AND UPPER(bldgcl) LIKE 'V%' + THEN 0 + ELSE bfft + END AS bfft_adj, + CASE + WHEN curavl_act = curavt_act AND UPPER(bldgcl) LIKE 'V%' + THEN 0 + ELSE bdft + END AS bdft_adj, + CASE + WHEN curavl_act = curavt_act AND UPPER(bldgcl) LIKE 'V%' + THEN 0 + ELSE story + END AS story_adj, + CASE + WHEN curavl_act = curavt_act AND UPPER(bldgcl) LIKE 'V%' + THEN 0 + ELSE bldgs + END AS bldgs_adj + FROM base_join +), + +-- Incorporate lotarea.sql logic +-- Calculate lot area from frontage x depth when missing +with_lotarea_calculated AS ( + SELECT + *, + CASE + WHEN (land_area IS 
NULL OR land_area = 0) + AND irreg != 'I' + AND lfft > 0 + AND ldft > 0 + THEN lfft * ldft + ELSE land_area + END AS land_area_calc + FROM with_vacant_adjustments +), + +-- Incorporate primebbl.sql logic +-- Assign prime BBL (for condos, use billingbbl; otherwise use own BBL) +with_primebbl AS ( + SELECT + *, + CASE + WHEN billingbbl IS NOT NULL AND billingbbl != '0000000000' + THEN billingbbl + ELSE boro || tb || tl + END AS primebbl_calc + FROM with_lotarea_calculated +), + +-- Incorporate geocode_billingbbl.sql logic +-- Parse billing block and lot from billingbbl +with_billing_parsed AS ( + SELECT + *, + CASE + WHEN billingbbl IS NOT NULL + AND billingbbl != '0000000000' + AND billingbbl != 'none' + THEN SUBSTRING(billingbbl, 2, 5) + END AS billingblock, + CASE + WHEN billingbbl IS NOT NULL + AND billingbbl != '0000000000' + AND billingbbl != 'none' + THEN RIGHT(billingbbl, 4) + END AS billinglot + FROM with_primebbl +), + +-- Incorporate apdate.sql logic +-- Format ap_date from MM/DD/YY to MM/DD/YYYY +with_apdate_formatted AS ( + SELECT + *, + CASE + WHEN ap_date IS NOT NULL + THEN to_char(to_date(ap_date, 'MM/DD/YY'), 'MM/DD/YYYY') + END AS ap_datef + FROM with_billing_parsed +), + +-- Calculate final BBL and backfill coordinates from lat/long if needed +final AS ( + SELECT + -- Calculated/formatted fields + boro || LPAD(tb, 5, '0') || LPAD(tl, 4, '0') AS bbl, + primebbl_calc AS primebbl, + ap_datef, + billingblock, + billinglot, + + -- Backfill xcoord/ycoord from latitude/longitude if geocode didn't have them + -- Keep as text to match pluto table expectations + COALESCE( + xcoord_geo, + CASE + WHEN longitude IS NOT NULL + THEN ST_X(ST_TRANSFORM( + ST_SETSRID(ST_MAKEPOINT(longitude::double precision, latitude::double precision), 4326), + 2263 + ))::integer::text + END + ) AS xcoord, + COALESCE( + ycoord_geo, + CASE + WHEN latitude IS NOT NULL + THEN ST_Y(ST_TRANSFORM( + ST_SETSRID(ST_MAKEPOINT(longitude::double precision, latitude::double precision), 4326), 
+ 2263 + ))::integer::text + END + ) AS ycoord, + + -- Adjusted fields + bfft_adj AS bfft, + bdft_adj AS bdft, + story_adj AS story, + bldgs_adj AS bldgs, + land_area_calc AS land_area, + + -- All other PTS fields (from int__dof_pts_propmaster) + boro, + tb, + tl, + street_name, + housenum_lo, + housenum_hi, + aptno, + zip, + bldgcl, + ease, + owner, + gross_sqft, + residarea, + officearea, + retailarea, + garagearea, + storagearea, + factoryarea, + otherarea, + coop_apts, + units, + ext, + irreg, + curavl_act, + curavt_act, + curext_act, + yrbuilt, + yralt1, + yralt2, + condo_number, + ap_boro, + ap_block, + ap_lot, + ap_ease, + ap_date, + lfft, + ldft, + + -- All geocode fields + billingbbl, + cd, + ct2010, + cb2010, + ct2020, + cb2020, + schooldist, + council, + zipcode, + firecomp, + policeprct, + healthcenterdistrict, + healtharea, + sanitdistrict, + sanitsub, + boepreferredstreetname, + taxmap, + sanbornmapidentifier, + latitude, + longitude, + grc, + grc2, + msg, + msg2, + borough, + block, + lot, + easement, + input_hnum, + input_sname, + numberofexistingstructuresonlot, + geom, + ogc_fid, + data_library_version + FROM with_apdate_formatted +) + +SELECT * FROM final diff --git a/products/pluto/models/intermediate/simple/_simple_models.yml b/products/pluto/models/intermediate/simple/_simple_models.yml new file mode 100644 index 0000000000..ce952bc0e8 --- /dev/null +++ b/products/pluto/models/intermediate/simple/_simple_models.yml @@ -0,0 +1,151 @@ +version: 2 + +models: + - name: int_pluto__far + description: Floor Area Ratio (FAR) calculations and maximum FAR by zoning district + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: builtfar + description: Built FAR calculated as building area divided by lot area + - name: residfar + description: Maximum residential FAR for the zoning district + - name: commfar + description: Maximum commercial FAR for the zoning district + - name: facilfar + 
description: Maximum facility FAR for the zoning district + - name: affresfar + description: Maximum affordable residential FAR for the zoning district + - name: mnffar + description: Maximum manufacturing FAR for the zoning district + + - name: int_pluto__irrlotcode + description: Irregular lot code transformation from RPAD format to Y/N format + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: irrlotcode + description: Irregular lot indicator (Y for irregular, N for regular) + + - name: int_pluto__sanitation + description: Sanitation borough and district extracted from sanitdistrict field + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: sanitboro + description: Sanitation borough (first character of sanitdistrict) + - name: sanitdistrict + description: Sanitation district (last 2 characters of original sanitdistrict) + + - name: int_pluto__landuse + description: Land use classification based on building class, with area source calculation for vacant lots + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: landuse + description: Land use category derived from building class + - name: areasource + description: Source of area measurement (4 for vacant lots) + + - name: int_pluto__ownertype + description: Owner type classification based on COLP data and exemption status + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: ownertype + description: Type of property owner (X for fully exempt properties) + + - name: int_pluto__edesignation + description: E-designation number assignment (environmental designation) + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: edesignum + description: E-designation number (environmental restriction) + + - name: int_pluto__latlong + 
description: Latitude, longitude, and centroid calculated from x/y coordinates + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: latitude + description: Latitude in WGS84 (SRID 4326) + - name: longitude + description: Longitude in WGS84 (SRID 4326) + - name: centroid + description: Centroid geometry point in WGS84 + + - name: int_pluto__condono + description: Formatted condo number without borough prefix and leading zeros + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: condono + description: Condo number (formatted) + + - name: int_pluto__lpc + description: Historic district and landmark designations from LPC + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: histdist + description: Historic district name + - name: landmark + description: Landmark type (Individual, Interior, or Individual and Interior) + + - name: int_pluto__numericfields + description: Cleaned numeric fields with invalid values removed + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: lotdepth + description: Lot depth (cleaned) + - name: numfloors + description: Number of floors (cleaned, must be >= 1) + - name: lotarea + description: Lot area (commas removed) + - name: sanborn + description: Sanborn map identifier (cleaned) + + - name: int_pluto__flood_flag + description: FEMA floodplain flags for 2007 and 2015 + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: firm07_flag + description: Flag if lot intersects 2007 floodplain (1% annual chance) + - name: pfirm15_flag + description: Flag if lot intersects 2015 preliminary floodplain (1% annual chance) diff --git a/products/pluto/models/intermediate/simple/int_pluto__condono.sql b/products/pluto/models/intermediate/simple/int_pluto__condono.sql 
new file mode 100644 index 0000000000..0fc5b8e6a4 --- /dev/null +++ b/products/pluto/models/intermediate/simple/int_pluto__condono.sql @@ -0,0 +1,15 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Format condo number: remove borough code prefix and leading zeros +-- Takes rightmost 5 characters and converts to numeric then back to text to strip leading zeros + +SELECT + bbl, + (RIGHT(condono, 5)::numeric)::text AS condono +FROM {{ target.schema }}.pluto diff --git a/products/pluto/models/intermediate/simple/int_pluto__edesignation.sql b/products/pluto/models/intermediate/simple/int_pluto__edesignation.sql new file mode 100644 index 0000000000..14cbbacb91 --- /dev/null +++ b/products/pluto/models/intermediate/simple/int_pluto__edesignation.sql @@ -0,0 +1,32 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Set E-designation number for each tax lot +-- When multiple E-designations exist for one lot, take the one with lowest ceqr_num and ulurp_num + +WITH edesignation AS ( + SELECT + bbl, + enumber + FROM ( + SELECT + bbl, + enumber, + ROW_NUMBER() OVER ( + PARTITION BY bbl + ORDER BY ceqr_num, ulurp_num, enumber + ) AS row_number + FROM {{ ref('stg__dcp_edesignation') }} + ) AS x + WHERE x.row_number = 1 +) + +SELECT + bbl, + enumber AS edesignum +FROM edesignation diff --git a/products/pluto/models/intermediate/simple/int_pluto__far.sql b/products/pluto/models/intermediate/simple/int_pluto__far.sql new file mode 100644 index 0000000000..3494a0ef42 --- /dev/null +++ b/products/pluto/models/intermediate/simple/int_pluto__far.sql @@ -0,0 +1,57 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Calculate Floor Area Ratio (FAR) metrics for each tax lot +-- 1. Built FAR: ratio of building area to lot area +-- 2. 
Max FAR values by zoning district: residential, commercial, facility, affordable residential, manufacturing + +WITH base_pluto AS ( + SELECT + bbl, + zonedist1, + bldgarea, + lotarea + FROM {{ target.schema }}.pluto +), + +far_calculated AS ( + SELECT + bbl, + zonedist1, + -- Calculate built FAR (building area / lot area) + -- Only calculate when lotarea is non-zero and non-null + CASE + WHEN lotarea IS NOT NULL AND lotarea != '0' + THEN ROUND((bldgarea::numeric / lotarea::numeric), 2) + END AS builtfar + FROM base_pluto +), + +max_far_lookup AS ( + SELECT + f.bbl, + f.builtfar, + COALESCE(z.residfar::double precision, 0::double precision) AS residfar, + COALESCE(z.commfar::double precision, 0::double precision) AS commfar, + COALESCE(z.facilfar::double precision, 0::double precision) AS facilfar, + COALESCE(z.affresfar::double precision, 0::double precision) AS affresfar, + COALESCE(z.mnffar::double precision, 0::double precision) AS mnffar + FROM far_calculated AS f + LEFT JOIN {{ ref('dcp_zoning_maxfar') }} AS z + ON f.zonedist1 = z.zonedist +) + +SELECT + bbl, + builtfar, + residfar, + commfar, + facilfar, + affresfar, + mnffar +FROM max_far_lookup diff --git a/products/pluto/models/intermediate/simple/int_pluto__flood_flag.sql b/products/pluto/models/intermediate/simple/int_pluto__flood_flag.sql new file mode 100644 index 0000000000..bebe3311eb --- /dev/null +++ b/products/pluto/models/intermediate/simple/int_pluto__flood_flag.sql @@ -0,0 +1,59 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Flag tax lots that fall within FEMA 1% annual chance floodplains +-- firm07_flag: 2007 floodplain +-- pfirm15_flag: 2015 preliminary floodplain +-- Only returns BBLs with at least one flag (not all BBLs) + +WITH firm07_subdivided AS ( + SELECT + ST_SUBDIVIDE(ST_MAKEVALID(geom)) AS geom + FROM {{ ref('stg__fema_firms2007_100yr') }} + WHERE + fld_zone != 'X' + AND fld_zone != '0.2 PCT ANNUAL 
CHANCE FLOOD HAZARD' +), + +firm07_bbls AS ( + SELECT DISTINCT p.bbl + FROM {{ target.schema }}.pluto AS p + INNER JOIN firm07_subdivided AS f + ON p.geom && f.geom AND ST_INTERSECTS(p.geom, f.geom) +), + +pfirm15_subdivided AS ( + SELECT + ST_SUBDIVIDE(ST_MAKEVALID(geom)) AS geom + FROM {{ ref('stg__fema_pfirms2015_100yr') }} + WHERE + fld_zone != 'X' + AND fld_zone != '0.2 PCT ANNUAL CHANCE FLOOD HAZARD' +), + +pfirm15_bbls AS ( + SELECT DISTINCT p.bbl + FROM {{ target.schema }}.pluto AS p + INNER JOIN pfirm15_subdivided AS f + ON p.geom && f.geom AND ST_INTERSECTS(p.geom, f.geom) +), + +all_flagged_bbls AS ( + SELECT bbl FROM firm07_bbls + UNION + SELECT bbl FROM pfirm15_bbls +) + +SELECT + afb.bbl, + CASE WHEN f07.bbl IS NOT NULL THEN '1' END AS firm07_flag, + CASE WHEN f15.bbl IS NOT NULL THEN '1' END AS pfirm15_flag +FROM all_flagged_bbls AS afb +LEFT JOIN firm07_bbls AS f07 ON afb.bbl = f07.bbl +LEFT JOIN pfirm15_bbls AS f15 ON afb.bbl = f15.bbl + diff --git a/products/pluto/models/intermediate/simple/int_pluto__irrlotcode.sql b/products/pluto/models/intermediate/simple/int_pluto__irrlotcode.sql new file mode 100644 index 0000000000..79bf37001f --- /dev/null +++ b/products/pluto/models/intermediate/simple/int_pluto__irrlotcode.sql @@ -0,0 +1,19 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Transform irregular lot code from RPAD format (I/R) to Y/N format +-- I (Irregular) -> Y +-- R or anything else -> N + +SELECT + bbl, + CASE + WHEN irrlotcode = 'I' THEN 'Y' + ELSE 'N' + END AS irrlotcode +FROM {{ target.schema }}.pluto diff --git a/products/pluto/models/intermediate/simple/int_pluto__landuse.sql b/products/pluto/models/intermediate/simple/int_pluto__landuse.sql new file mode 100644 index 0000000000..36718091bb --- /dev/null +++ b/products/pluto/models/intermediate/simple/int_pluto__landuse.sql @@ -0,0 +1,55 @@ +{{ + config( + materialized='table', + indexes=[{'columns': 
['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Determine land use based on building class and calculate area source for vacant lots +-- Uses pluto_input_landuse_bldgclass lookup table +-- Sets areasource to '4' for vacant lots (landuse=11, no buildings, no building area) + +WITH base_pluto AS ( + SELECT + bbl, + bldgclass, + areasource, + numbldgs, + bldgarea + FROM {{ target.schema }}.pluto +), + +landuse_lookup AS ( + SELECT + p.bbl, + lu.landuse, + p.areasource, + p.numbldgs, + p.bldgarea + FROM base_pluto AS p + LEFT JOIN {{ ref('pluto_input_landuse_bldgclass') }} AS lu + ON p.bldgclass = lu.bldgclass +), + +areasource_calculated AS ( + SELECT + bbl, + landuse, + CASE + WHEN + (areasource IS NULL OR areasource = '0') + AND landuse = '11' + AND numbldgs::numeric = 0 + AND (bldgarea::numeric = 0 OR bldgarea IS NULL) + THEN '4' + ELSE areasource + END AS areasource + FROM landuse_lookup +) + +SELECT + bbl, + landuse, + areasource +FROM areasource_calculated diff --git a/products/pluto/models/intermediate/simple/int_pluto__latlong.sql b/products/pluto/models/intermediate/simple/int_pluto__latlong.sql new file mode 100644 index 0000000000..f846f531f6 --- /dev/null +++ b/products/pluto/models/intermediate/simple/int_pluto__latlong.sql @@ -0,0 +1,57 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Calculate latitude and longitude from x/y coordinates +-- Transform from State Plane (SRID 2263) to WGS84 (SRID 4326) +-- Also create centroid geometry point + +SELECT + bbl, + ST_Y( + ST_TRANSFORM( + ST_SETSRID( + ST_MAKEPOINT(xcoord::double precision, ycoord::double precision), + 2263 + ), + 4326 + ) + ) AS latitude, + ST_X( + ST_TRANSFORM( + ST_SETSRID( + ST_MAKEPOINT(xcoord::double precision, ycoord::double precision), + 2263 + ), + 4326 + ) + ) AS longitude, + ST_SETSRID( + ST_MAKEPOINT( + ST_X( + ST_TRANSFORM( + ST_SETSRID( + ST_MAKEPOINT(xcoord::double precision, 
ycoord::double precision), + 2263 + ), + 4326 + ) + )::double precision, + ST_Y( + ST_TRANSFORM( + ST_SETSRID( + ST_MAKEPOINT(xcoord::double precision, ycoord::double precision), + 2263 + ), + 4326 + ) + )::double precision + ), + 4326 + ) AS centroid +FROM {{ target.schema }}.pluto +WHERE xcoord IS NOT NULL diff --git a/products/pluto/models/intermediate/simple/int_pluto__lpc.sql b/products/pluto/models/intermediate/simple/int_pluto__lpc.sql new file mode 100644 index 0000000000..7c3b484f7c --- /dev/null +++ b/products/pluto/models/intermediate/simple/int_pluto__lpc.sql @@ -0,0 +1,95 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Assign historic district and landmark designations from LPC data +-- Historic district: first alphabetical district per BBL +-- Landmark: type determination based on number of landmark types per BBL + +WITH base_pluto AS ( + SELECT + bbl, + borocode, + block, + lot + FROM {{ target.schema }}.pluto +), + +histdistricts AS ( + SELECT + bbl, + hist_dist + FROM ( + SELECT + bbl, + hist_dist, + ROW_NUMBER() OVER ( + PARTITION BY bbl + ORDER BY hist_dist + ) AS row_number + FROM {{ ref('stg__lpc_historic_districts') }} + WHERE + hist_dist != '0' + AND hist_dist NOT LIKE 'Individual Landmark%' + ) AS x + WHERE x.row_number = 1 +), + +landmarks AS ( + SELECT DISTINCT + bbl, + lm_type, + ROW_NUMBER() OVER ( + PARTITION BY bbl + ORDER BY lm_type + ) AS row_number + FROM ( + SELECT DISTINCT + bbl, + lm_type + FROM {{ ref('stg__lpc_landmarks') }} + WHERE + (lm_type = 'Interior Landmark' OR lm_type = 'Individual Landmark') + AND status = 'DESIGNATED' + AND most_curre = '1' + AND ( + last_actio = 'DESIGNATED' + OR last_actio = 'DESIGNATED (AMENDMENT/MODIFICATION ACCEPTED)' + ) + ) AS x +), + +maxnum AS ( + SELECT + bbl, + MAX(row_number) AS maxrow_number + FROM landmarks + GROUP BY bbl +), + +landmark_types AS ( + SELECT + l.bbl, + CASE + WHEN m.maxrow_number = 1 THEN 
UPPER(l.lm_type) + WHEN m.maxrow_number = 2 THEN UPPER('Individual and Interior Landmark') + ELSE UPPER(l.lm_type) + END AS landmark + FROM landmarks AS l + INNER JOIN maxnum AS m ON l.bbl = m.bbl + WHERE l.row_number = 1 +) + +SELECT + p.bbl, + h.hist_dist AS histdist, + lt.landmark +FROM base_pluto AS p +LEFT JOIN histdistricts AS h + ON p.borocode || LPAD(p.block, 5, '0') || LPAD(p.lot, 4, '0') = h.bbl +LEFT JOIN landmark_types AS lt + ON p.borocode || LPAD(p.block, 5, '0') || LPAD(p.lot, 4, '0') = lt.bbl diff --git a/products/pluto/models/intermediate/simple/int_pluto__numericfields.sql b/products/pluto/models/intermediate/simple/int_pluto__numericfields.sql new file mode 100644 index 0000000000..40352c2724 --- /dev/null +++ b/products/pluto/models/intermediate/simple/int_pluto__numericfields.sql @@ -0,0 +1,34 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Clean numeric fields by removing invalid values +-- lotdepth, numfloors: must be numeric (except decimal point) +-- numfloors: must be >= 1 +-- lotarea: remove commas +-- sanborn: must contain at least one digit + +SELECT + bbl, + CASE + WHEN lotdepth ~ '[^0-9]' AND lotdepth NOT LIKE '%.%' THEN NULL + ELSE lotdepth + END AS lotdepth, + CASE + WHEN numfloors ~ '[^0-9]' AND numfloors NOT LIKE '%.%' THEN NULL + WHEN numfloors IS NOT NULL AND numfloors::numeric < 1 THEN NULL + ELSE numfloors + END AS numfloors, + CASE + WHEN lotarea LIKE '%,%' THEN REPLACE(lotarea, ',', '') + ELSE lotarea + END AS lotarea, + CASE + WHEN sanborn !~ '[0-9]' THEN NULL + ELSE sanborn + END AS sanborn +FROM {{ target.schema }}.pluto diff --git a/products/pluto/models/intermediate/simple/int_pluto__ownertype.sql b/products/pluto/models/intermediate/simple/int_pluto__ownertype.sql new file mode 100644 index 0000000000..a4bb9fd0a6 --- /dev/null +++ b/products/pluto/models/intermediate/simple/int_pluto__ownertype.sql @@ -0,0 +1,53 @@ +{{ + config( + 
materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Determine owner type based on COLP (City Owned and Leased Properties) data +-- Sets ownertype to 'X' for properties where total exemption equals total assessment (fully exempt) + +WITH base_pluto AS ( + SELECT + bbl, + exempttot, + assesstot + FROM {{ target.schema }}.pluto +), + +colp_lookup AS ( + SELECT + p.bbl, + c.ownership AS ownertype, + p.exempttot, + p.assesstot + FROM base_pluto AS p + LEFT JOIN ( + SELECT DISTINCT ON (bbl) + bbl, + ownership + FROM {{ ref('stg__dcp_colp') }} + ORDER BY bbl + ) AS c + ON p.bbl::numeric = c.bbl::numeric +), + +ownertype_calculated AS ( + SELECT + bbl, + CASE + WHEN + ownertype IS NULL + AND exempttot = assesstot + THEN 'X' + ELSE ownertype + END AS ownertype + FROM colp_lookup +) + +SELECT + bbl, + ownertype +FROM ownertype_calculated diff --git a/products/pluto/models/intermediate/simple/int_pluto__sanitation.sql b/products/pluto/models/intermediate/simple/int_pluto__sanitation.sql new file mode 100644 index 0000000000..7cb57a727f --- /dev/null +++ b/products/pluto/models/intermediate/simple/int_pluto__sanitation.sql @@ -0,0 +1,21 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Extract sanitation borough and district from sanitdistrict field +-- sanitboro: first character of sanitdistrict +-- sanitdistrict: last 2 characters of sanitdistrict + +SELECT + bbl, + CASE + WHEN sanitdistrict IS NOT NULL THEN LEFT(sanitdistrict, 1) + END AS sanitboro, + CASE + WHEN sanitdistrict IS NOT NULL THEN RIGHT(sanitdistrict, 2) + END AS sanitdistrict +FROM {{ target.schema }}.pluto diff --git a/products/pluto/models/intermediate/transitzone/_transitzone_models.yml b/products/pluto/models/intermediate/transitzone/_transitzone_models.yml new file mode 100644 index 0000000000..506c6a4811 --- /dev/null +++ 
b/products/pluto/models/intermediate/transitzone/_transitzone_models.yml @@ -0,0 +1,25 @@ +version: 2 + +models: + - name: int_tz__atomic_geoms + description: Decomposed transit zone multipolygons into atomic parts for performance + + - name: int_tz__tax_blocks + description: Tax blocks split into sub-blocks with BBL assignments + + - name: int_tz__block_to_tz_ranked + description: Transit zone coverage ranked per block + + - name: int_tz__bbl_to_tz_ranked + description: Lot-level transit zone assignments for ambiguous blocks + + - name: int_pluto__transitzone + description: Final transit zone assignment per BBL + columns: + - name: bbl + description: Borough-Block-Lot identifier + tests: + - unique + - not_null + - name: trnstzone + description: Transit zone assignment diff --git a/products/pluto/models/intermediate/transitzone/int_pluto__transitzone.sql b/products/pluto/models/intermediate/transitzone/int_pluto__transitzone.sql new file mode 100644 index 0000000000..386a74247c --- /dev/null +++ b/products/pluto/models/intermediate/transitzone/int_pluto__transitzone.sql @@ -0,0 +1,43 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + tags=['pluto_enrichment'] + ) +}} + +-- Final transit zone assignment per BBL +-- Uses block-level for unambiguous blocks, lot-level for ambiguous ones + +WITH assignments AS ( + -- Block-level assignments for non-ambiguous blocks + SELECT + UNNEST(bbls) AS bbl, + transit_zone + FROM {{ ref('int_tz__block_to_tz_ranked') }} AS block_tz + WHERE + block_tz.tz_rank = 1 + -- Only assign blocks that are not ambiguous (no second-ranked transit zone) + AND NOT EXISTS ( + SELECT 1 + FROM {{ ref('int_tz__block_to_tz_ranked') }} AS ambiguous + WHERE + ambiguous.id = block_tz.id + AND ambiguous.tz_rank = 2 + AND ambiguous.pct_covered > 10 + ) + + UNION ALL + + -- Lot-level assignments for ambiguous blocks + SELECT + bbls[1] AS bbl, + transit_zone + FROM {{ ref('int_tz__bbl_to_tz_ranked') }} + WHERE 
tz_rank = 1 +) + +SELECT + bbl, + transit_zone AS trnstzone +FROM assignments diff --git a/products/pluto/models/intermediate/transitzone/int_tz__atomic_geoms.sql b/products/pluto/models/intermediate/transitzone/int_tz__atomic_geoms.sql new file mode 100644 index 0000000000..197c65339b --- /dev/null +++ b/products/pluto/models/intermediate/transitzone/int_tz__atomic_geoms.sql @@ -0,0 +1,23 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['wkb_geometry'], 'type': 'gist'}], + tags=['pluto_enrichment'] + ) +}} + +-- Decompose transit zone multipolygons into atomic parts for performance +-- Breaking apart multipolygons reduces spatial calculation time from >10 min to ~1 min + +WITH decomposed AS ( + SELECT + transit_zone, + (ST_DUMP(geom)).geom AS wkb_geometry + FROM {{ ref('stg__dcp_transit_zones') }} +) + +SELECT + transit_zone, + wkb_geometry, + ROW_NUMBER() OVER (ORDER BY transit_zone) AS decomposed_id +FROM decomposed diff --git a/products/pluto/models/intermediate/transitzone/int_tz__bbl_to_tz_ranked.sql b/products/pluto/models/intermediate/transitzone/int_tz__bbl_to_tz_ranked.sql new file mode 100644 index 0000000000..c7af171cb8 --- /dev/null +++ b/products/pluto/models/intermediate/transitzone/int_tz__bbl_to_tz_ranked.sql @@ -0,0 +1,78 @@ +{{ + config( + materialized='table', + tags=['pluto_enrichment'] + ) +}} + +-- Lot-level assignments for ambiguous blocks +-- Only for blocks where multiple transit zones have >10% coverage + +WITH ambiguous_bbls AS ( + SELECT + UNNEST(bbls) AS bbl, + borough, + block, + sub_block + FROM {{ ref('int_tz__block_to_tz_ranked') }} AS tza + WHERE + tza.tz_rank > 1 + AND tza.pct_covered > 10 +), + +lot_to_tz AS ( + SELECT + p.bbl, + p.borough, + p.block, + t.transit_zone, + p.geom, + ST_AREA(ST_INTERSECTION(p.geom, ST_UNION(t.wkb_geometry))) / ST_AREA(p.geom) * 100.0 AS pct_covered + FROM {{ target.schema }}.pluto AS p + INNER JOIN ambiguous_bbls AS ab ON p.bbl = ab.bbl + INNER JOIN {{ ref('int_tz__atomic_geoms') }} 
AS t + ON ST_INTERSECTS(p.geom, t.wkb_geometry) + GROUP BY p.bbl, p.borough, p.block, t.transit_zone, p.geom +), + +lot_to_tz_with_rank AS ( + SELECT + lt.*, + tzr.tz_rank AS priority_rank + FROM lot_to_tz AS lt + LEFT JOIN {{ ref('dcp_transit_zone_ranks') }} AS tzr + ON lt.transit_zone = tzr.tz_name +), + +filtered_zones AS ( + SELECT + ltr.*, + NOT COALESCE( + EXISTS ( + SELECT 1 + FROM lot_to_tz_with_rank AS inner_ltr + WHERE + inner_ltr.bbl = ltr.bbl + AND inner_ltr.priority_rank < 4 + ) AND ltr.priority_rank = 4, + FALSE + ) AS include_zone + FROM lot_to_tz_with_rank AS ltr +) + +SELECT + 'lot' AS assignment_type, + bbl::text AS id, + borough, + block, + geom, + 1 AS sub_block, + ARRAY[bbl] AS bbls, + transit_zone, + pct_covered, + ROW_NUMBER() OVER ( + PARTITION BY bbl + ORDER BY pct_covered DESC + ) AS tz_rank +FROM filtered_zones +WHERE include_zone = TRUE diff --git a/products/pluto/models/intermediate/transitzone/int_tz__block_to_tz_ranked.sql b/products/pluto/models/intermediate/transitzone/int_tz__block_to_tz_ranked.sql new file mode 100644 index 0000000000..2fbae3a6b9 --- /dev/null +++ b/products/pluto/models/intermediate/transitzone/int_tz__block_to_tz_ranked.sql @@ -0,0 +1,39 @@ +{{ + config( + materialized='table', + tags=['pluto_enrichment'] + ) +}} + +-- Calculate transit zone coverage per block with ranking + +WITH block_to_tz AS ( + SELECT + tb.borough, + tb.block, + tb.sub_block, + tb.geom, + tb.bbls, + t.transit_zone, + ST_AREA(ST_INTERSECTION(tb.geom, ST_UNION(t.wkb_geometry))) / ST_AREA(tb.geom) * 100.0 AS pct_covered + FROM {{ ref('int_tz__tax_blocks') }} AS tb + INNER JOIN {{ ref('int_tz__atomic_geoms') }} AS t + ON ST_INTERSECTS(tb.geom, t.wkb_geometry) + GROUP BY tb.borough, tb.block, tb.sub_block, tb.geom, tb.bbls, t.transit_zone +) + +SELECT + 'block' AS assignment_type, + borough || '-' || block || '-' || sub_block AS id, + borough, + block, + geom, + sub_block, + bbls, + transit_zone, + pct_covered, + ROW_NUMBER() OVER ( + PARTITION 
BY borough, block, sub_block + ORDER BY pct_covered DESC + ) AS tz_rank +FROM block_to_tz diff --git a/products/pluto/models/intermediate/transitzone/int_tz__tax_blocks.sql b/products/pluto/models/intermediate/transitzone/int_tz__tax_blocks.sql new file mode 100644 index 0000000000..ddea3018d8 --- /dev/null +++ b/products/pluto/models/intermediate/transitzone/int_tz__tax_blocks.sql @@ -0,0 +1,67 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['geom'], 'type': 'gist'}], + tags=['pluto_enrichment'] + ) +}} + +-- Create sub-blocks from tax blocks, handling non-contiguous blocks +-- Splits blocks into contiguous parts and assigns lots to their sub-block + +WITH block_unions AS ( + SELECT + borough, + block, + ST_UNION(p.geom) AS geom, + ARRAY_AGG(bbl) AS all_bbls + FROM {{ target.schema }}.pluto AS p + GROUP BY p.borough, p.block +), + +block_parts AS ( + SELECT + borough, + block, + all_bbls, + (ST_DUMP(geom)).geom AS geom + FROM block_unions +), + +numbered_parts AS ( + SELECT + borough, + block, + all_bbls, + geom, + ROW_NUMBER() OVER ( + PARTITION BY borough, block + ORDER BY ST_AREA(geom) DESC + ) AS sub_block + FROM block_parts +), + +reassigned_bbls AS ( + SELECT + np.borough, + np.block, + np.sub_block, + np.geom, + ARRAY_AGG(p.bbl) AS bbls + FROM numbered_parts AS np + INNER JOIN {{ target.schema }}.pluto AS p + ON + np.borough = p.borough + AND np.block = p.block + AND ST_WITHIN(ST_POINTONSURFACE(p.geom), np.geom) + GROUP BY np.borough, np.block, np.sub_block, np.geom +) + +SELECT + borough, + block, + sub_block, + borough || '-' || block || '-' || sub_block AS block_id, + geom, + bbls +FROM reassigned_bbls diff --git a/products/pluto/models/product/pluto_enriched.sql b/products/pluto/models/product/pluto_enriched.sql new file mode 100644 index 0000000000..b067f88387 --- /dev/null +++ b/products/pluto/models/product/pluto_enriched.sql @@ -0,0 +1,63 @@ +{{ + config( + materialized='table', + indexes=[{'columns': ['bbl'], 'unique': True}], + 
tags=['pluto_enrichment'] + ) +}} + +-- Assemble enriched PLUTO data from intermediate models +-- This model joins all simple enrichment models to the base pluto table +-- The results are used to batch UPDATE the pluto table + +SELECT + p.bbl, + far.builtfar, + far.residfar, + far.commfar, + far.facilfar, + far.affresfar, + far.mnffar, + irr.irrlotcode, + san.sanitboro, + san.sanitdistrict, + lu.landuse, + lu.areasource, + ot.ownertype, + ed.edesignum, + -- ll.latitude, + -- ll.longitude, + -- ll.centroid, + cn.condono, + lpc.histdist, + lpc.landmark, + nf.lotdepth, + nf.numfloors, + nf.lotarea, + nf.sanborn, + mih.mih_opt1, + mih.mih_opt2, + mih.mih_opt3, + mih.mih_opt4, + tz.trnstzone, + bsmt.bsmtcode, + lot.lottype, + prox.proxcode, + ease.easements +FROM {{ target.schema }}.pluto AS p +LEFT JOIN {{ ref('int_pluto__far') }} AS far ON p.bbl = far.bbl +LEFT JOIN {{ ref('int_pluto__irrlotcode') }} AS irr ON p.bbl = irr.bbl +LEFT JOIN {{ ref('int_pluto__sanitation') }} AS san ON p.bbl = san.bbl +LEFT JOIN {{ ref('int_pluto__landuse') }} AS lu ON p.bbl = lu.bbl +LEFT JOIN {{ ref('int_pluto__ownertype') }} AS ot ON p.bbl = ot.bbl +LEFT JOIN {{ ref('int_pluto__edesignation') }} AS ed ON p.bbl = ed.bbl +-- LEFT JOIN {{ ref('int_pluto__latlong') }} AS ll ON p.bbl = ll.bbl +LEFT JOIN {{ ref('int_pluto__condono') }} AS cn ON p.bbl = cn.bbl +LEFT JOIN {{ ref('int_pluto__lpc') }} AS lpc ON p.bbl = lpc.bbl +LEFT JOIN {{ ref('int_pluto__numericfields') }} AS nf ON p.bbl = nf.bbl +LEFT JOIN {{ ref('int_pluto__miharea') }} AS mih ON p.bbl = mih.bbl +LEFT JOIN {{ ref('int_pluto__transitzone') }} AS tz ON p.bbl = tz.bbl +LEFT JOIN {{ ref('int_pluto__bsmtcode') }} AS bsmt ON p.bbl = bsmt.bbl +LEFT JOIN {{ ref('int_pluto__lottype') }} AS lot ON p.bbl = lot.bbl +LEFT JOIN {{ ref('int_pluto__proxcode') }} AS prox ON p.bbl = prox.bbl +LEFT JOIN {{ ref('int_pluto__easements') }} AS ease ON p.bbl = ease.bbl diff --git 
a/products/pluto/models/qaqc/intermediate/qaqc_int__transit_zones_questionable_assignments.sql b/products/pluto/models/qaqc/intermediate/qaqc_int__transit_zones_questionable_assignments.sql index 363223b8a8..38ca4b7df3 100644 --- a/products/pluto/models/qaqc/intermediate/qaqc_int__transit_zones_questionable_assignments.sql +++ b/products/pluto/models/qaqc/intermediate/qaqc_int__transit_zones_questionable_assignments.sql @@ -87,13 +87,13 @@ SELECT wl.loser_tz, st_envelope(st_buffer(wl.geom, .005)) AS area_of_interest_geom, ( - SELECT st_intersection(st_envelope(st_buffer(wl.geom, .005)), wkb_geometry) - FROM {{ source('recipe_sources', 'dcp_transit_zones') }} AS dtz + SELECT st_intersection(st_envelope(st_buffer(wl.geom, .005)), geom) + FROM {{ ref('stg__dcp_transit_zones') }} AS dtz WHERE dtz.transit_zone = wl.winner_tz ) AS winner_tz_geom, ( - SELECT st_intersection(st_envelope(st_buffer(wl.geom, .005)), wkb_geometry) - FROM {{ source('recipe_sources', 'dcp_transit_zones') }} AS dtz + SELECT st_intersection(st_envelope(st_buffer(wl.geom, .005)), geom) + FROM {{ ref('stg__dcp_transit_zones') }} AS dtz WHERE dtz.transit_zone = wl.loser_tz ) AS loser_tz_geom FROM winners_losers AS wl diff --git a/products/pluto/models/staging/stg__dcp_gis_mandatory_inclusionary_housing.sql b/products/pluto/models/staging/stg__dcp_gis_mandatory_inclusionary_housing.sql new file mode 100644 index 0000000000..20775470a0 --- /dev/null +++ b/products/pluto/models/staging/stg__dcp_gis_mandatory_inclusionary_housing.sql @@ -0,0 +1,13 @@ +{{ + config( + materialized='table', + indexes=[ + {'columns': ['geom'], 'type': 'gist'} + ] + ) +}} + +SELECT + *, + wkb_geometry AS geom +FROM {{ source('recipe_sources', 'dcp_gis_mandatory_inclusionary_housing') }} diff --git a/products/pluto/models/staging/stg__dcp_transit_zones.sql b/products/pluto/models/staging/stg__dcp_transit_zones.sql new file mode 100644 index 0000000000..b2c0c7d7d7 --- /dev/null +++ 
b/products/pluto/models/staging/stg__dcp_transit_zones.sql @@ -0,0 +1,13 @@ +{{ + config( + materialized='table', + indexes=[ + {'columns': ['geom'], 'type': 'gist'} + ] + ) +}} + +SELECT + *, + wkb_geometry AS geom +FROM {{ source('recipe_sources', 'dcp_transit_zones') }} diff --git a/products/pluto/pluto_build/02_build.sh b/products/pluto/pluto_build/02_build.sh index a22aaa3dae..8754ef9924 100755 --- a/products/pluto/pluto_build/02_build.sh +++ b/products/pluto/pluto_build/02_build.sh @@ -3,19 +3,9 @@ source ./bash/config.sh set_error_traps echo "Starting to build PLUTO ..." -run_sql_file sql/preprocessing.sql -run_sql_file sql/create_pts.sql -run_sql_file sql/create_rpad_geo.sql -echo 'Making DCP edits to RPAD...' -run_sql_file sql/zerovacantlots.sql -run_sql_file sql/lotarea.sql -run_sql_file sql/primebbl.sql -run_sql_file sql/apdate.sql - -echo 'Creating table that aggregates condo data and is used to build PLUTO...' -run_sql_file sql/create_allocated.sql -run_sql_file sql/yearbuiltalt.sql +echo 'Building intermediate RPAD models with dbt...' +(cd .. 
&& dbt run --select int__dof_pts_propmaster int__pluto_rpad_geo int__pluto_allocated) echo 'Creating base PLUTO table' run_sql_file sql/create.sql -v VERSION=${VERSION} @@ -26,30 +16,16 @@ run_sql_file sql/allocated.sql echo 'Adding on spatial data attributes' run_sql_file sql/geocodes.sql -# clean up numeric fields -run_sql_file sql/numericfields.sql -run_sql_file sql/condono.sql echo 'Adding on CAMA data attributes' -run_sql_file sql/landuse.sql run_sql_file sql/create_cama_primebbl.sql -run_sql_file sql/cama_bsmttype.sql -run_sql_file sql/cama_lottype.sql -run_sql_file sql/cama_proxcode.sql run_sql_file sql/cama_bldgarea_1.sql run_sql_file sql/cama_bldgarea_2.sql run_sql_file sql/cama_bldgarea_3.sql run_sql_file sql/cama_bldgarea_4.sql -run_sql_file sql/cama_easements.sql echo 'Adding on data attributes from other sources' -run_sql_file sql/lpc.sql -run_sql_file sql/edesignation.sql -run_sql_file sql/ownertype.sql - -echo 'Transform RPAD data attributes' -run_sql_file sql/irrlotcode.sql echo 'Adding DCP data attributes' run_sql_file sql/address.sql @@ -77,18 +53,13 @@ run_sql_file sql/zoning_parks.sql run_sql_file sql/zoning_splitzone.sql run_sql_command "VACUUM ANALYZE pluto;" -echo 'Filling in FAR values' -run_sql_file sql/far.sql -run_sql_command "VACUUM ANALYZE pluto;" - -echo 'Populating building class for condos lots and land use field' +echo 'Populating building class for condos lots' run_sql_file sql/bldgclass.sql -run_sql_file sql/landuse.sql run_sql_command "VACUUM ANALYZE pluto;" echo 'Flagging tax lots within the FEMA floodplain' -run_sql_file sql/latlong.sql run_sql_file sql/update_empty_coord.sql +run_sql_file sql/latlong.sql run_sql_file sql/flood_flag.sql run_sql_command "VACUUM ANALYZE pluto;" @@ -96,12 +67,14 @@ echo 'Assigning political values with spatial join' run_sql_file sql/spatialjoins.sql # clean up numeric fields run_sql_file sql/numericfields_geomfields.sql -run_sql_file sql/sanitboro.sql -run_sql_file sql/latlong.sql -run_sql_file 
sql/miharea.sql -run_sql_file sql/transitzone.sql run_sql_command "VACUUM ANALYZE pluto;" +echo 'Running all dbt enrichment models' +(cd .. && dbt run --select tag:pluto_enrichment pluto_enriched) + +echo 'Applying all dbt enrichments to pluto table' +run_sql_file sql/apply_dbt_enrichments.sql + echo 'Populating PLUTO tags and version fields' run_sql_file sql/plutomapid.sql run_sql_command "VACUUM ANALYZE pluto;" & diff --git a/products/pluto/pluto_build/sql/address.sql b/products/pluto/pluto_build/sql/address.sql index b89f3d725f..faf7a1dc31 100644 --- a/products/pluto/pluto_build/sql/address.sql +++ b/products/pluto/pluto_build/sql/address.sql @@ -1,7 +1,7 @@ -- when the address is still null populate the address UPDATE pluto a SET address = concat(b.housenum_hi, ' ', b.street_name) -FROM pluto_rpad_geo AS b +FROM int__pluto_rpad_geo AS b WHERE a.bbl = b.primebbl AND a.address IS NULL AND b.housenum_hi IS NOT NULL AND b.street_name IS NOT NULL; diff --git a/products/pluto/pluto_build/sql/apdate.sql b/products/pluto/pluto_build/sql/apdate_migrated.sql similarity index 100% rename from products/pluto/pluto_build/sql/apdate.sql rename to products/pluto/pluto_build/sql/apdate_migrated.sql diff --git a/products/pluto/pluto_build/sql/apply_dbt_enrichments.sql b/products/pluto/pluto_build/sql/apply_dbt_enrichments.sql new file mode 100644 index 0000000000..b57aacadb8 --- /dev/null +++ b/products/pluto/pluto_build/sql/apply_dbt_enrichments.sql @@ -0,0 +1,37 @@ +-- Apply enriched values from dbt models back to the pluto table +UPDATE pluto +SET + builtfar = pe.builtfar, + residfar = pe.residfar, + commfar = pe.commfar, + facilfar = pe.facilfar, + affresfar = pe.affresfar, + mnffar = pe.mnffar, + irrlotcode = pe.irrlotcode, + sanitboro = pe.sanitboro, + sanitdistrict = pe.sanitdistrict, + landuse = pe.landuse, + areasource = pe.areasource, + ownertype = pe.ownertype, + edesignum = pe.edesignum, + -- latitude = pe.latitude, + -- longitude = pe.longitude, + -- centroid = 
pe.centroid, + condono = pe.condono, + histdist = pe.histdist, + landmark = pe.landmark, + lotdepth = pe.lotdepth, + numfloors = pe.numfloors, + lotarea = pe.lotarea, + sanborn = pe.sanborn, + mih_opt1 = pe.mih_opt1, + mih_opt2 = pe.mih_opt2, + mih_opt3 = pe.mih_opt3, + mih_opt4 = pe.mih_opt4, + trnstzone = pe.trnstzone, + bsmtcode = pe.bsmtcode, + lottype = pe.lottype, + proxcode = pe.proxcode, + easements = pe.easements +FROM pluto_enriched AS pe +WHERE pluto.bbl = pe.bbl; diff --git a/products/pluto/pluto_build/sql/bbl.sql b/products/pluto/pluto_build/sql/bbl.sql index 0550fe52d1..b75cce257f 100644 --- a/products/pluto/pluto_build/sql/bbl.sql +++ b/products/pluto/pluto_build/sql/bbl.sql @@ -18,4 +18,4 @@ SELECT END AS borough, TRIM(LEADING '0' FROM SUBSTRING(b.primebbl, 2, 5)) AS block, TRIM(LEADING '0' FROM RIGHT(b.primebbl, 4)) AS lot -FROM (SELECT DISTINCT primebbl FROM pluto_rpad_geo) AS b; +FROM (SELECT DISTINCT primebbl FROM int__pluto_rpad_geo) AS b; diff --git a/products/pluto/pluto_build/sql/bldgclass.sql b/products/pluto/pluto_build/sql/bldgclass.sql index e28ee0ce58..392c038b6d 100644 --- a/products/pluto/pluto_build/sql/bldgclass.sql +++ b/products/pluto/pluto_build/sql/bldgclass.sql @@ -11,7 +11,7 @@ WITH bldgclass AS ( SELECT DISTINCT billingbbl, bldgcl - FROM pluto_rpad_geo + FROM int__pluto_rpad_geo WHERE bldgcl != 'R0' AND bldgcl != 'RG' @@ -53,7 +53,7 @@ CREATE TEMP TABLE bblsbldgclasslookup AS ( SELECT DISTINCT billingbbl, bldgcl - FROM pluto_rpad_geo + FROM int__pluto_rpad_geo WHERE bldgcl != 'R0' AND bldgcl != 'RG' @@ -214,7 +214,7 @@ bldgclass AS ( b.bldgcl, b.boro || b.tb || b.tl AS bbl FROM z7s AS a - LEFT JOIN pluto_rpad_geo AS b + LEFT JOIN int__pluto_rpad_geo AS b ON a.bbl = b.boro || b.tb || b.tl ) AS x ), diff --git a/products/pluto/pluto_build/sql/cama_bldgarea_1.sql b/products/pluto/pluto_build/sql/cama_bldgarea_1.sql index 68e947ccd6..98590c840b 100644 --- a/products/pluto/pluto_build/sql/cama_bldgarea_1.sql +++ 
b/products/pluto/pluto_build/sql/cama_bldgarea_1.sql @@ -25,7 +25,7 @@ SET strgearea = b.storagearea, factryarea = b.factoryarea, otherarea = b.otherarea -FROM pluto_rpad_geo AS b +FROM int__pluto_rpad_geo AS b WHERE a.bbl = b.primebbl AND a.lot NOT LIKE '75%' diff --git a/products/pluto/pluto_build/sql/cama_bsmttype.sql b/products/pluto/pluto_build/sql/cama_bsmttype_migrated.sql similarity index 100% rename from products/pluto/pluto_build/sql/cama_bsmttype.sql rename to products/pluto/pluto_build/sql/cama_bsmttype_migrated.sql diff --git a/products/pluto/pluto_build/sql/cama_easements.sql b/products/pluto/pluto_build/sql/cama_easements_migrated.sql similarity index 98% rename from products/pluto/pluto_build/sql/cama_easements.sql rename to products/pluto/pluto_build/sql/cama_easements_migrated.sql index 83ee64051f..ce7de0fae7 100644 --- a/products/pluto/pluto_build/sql/cama_easements.sql +++ b/products/pluto/pluto_build/sql/cama_easements_migrated.sql @@ -4,7 +4,7 @@ WITH distincteasements AS ( SELECT DISTINCT primebbl AS bbl, ease - FROM dof_pts_propmaster + FROM int__dof_pts_propmaster WHERE ease IS NOT NULL AND ease != ' ' ), diff --git a/products/pluto/pluto_build/sql/cama_lottype.sql b/products/pluto/pluto_build/sql/cama_lottype_migrated.sql similarity index 100% rename from products/pluto/pluto_build/sql/cama_lottype.sql rename to products/pluto/pluto_build/sql/cama_lottype_migrated.sql diff --git a/products/pluto/pluto_build/sql/cama_proxcode.sql b/products/pluto/pluto_build/sql/cama_proxcode_migrated.sql similarity index 100% rename from products/pluto/pluto_build/sql/cama_proxcode.sql rename to products/pluto/pluto_build/sql/cama_proxcode_migrated.sql diff --git a/products/pluto/pluto_build/sql/condono.sql b/products/pluto/pluto_build/sql/condono.sql deleted file mode 100644 index 1d780c4e8e..0000000000 --- a/products/pluto/pluto_build/sql/condono.sql +++ /dev/null @@ -1,5 +0,0 @@ --- update the format of the condo number --- remove the borough code from 
the beginning of the condo number --- remove leading zeros -UPDATE pluto -SET condono = (RIGHT(condono, 5)::numeric)::text; diff --git a/products/pluto/pluto_build/sql/create_allocated.sql b/products/pluto/pluto_build/sql/create_allocated_migrated.sql similarity index 92% rename from products/pluto/pluto_build/sql/create_allocated.sql rename to products/pluto/pluto_build/sql/create_allocated_migrated.sql index aaa58dfe94..3be9e2c165 100644 --- a/products/pluto/pluto_build/sql/create_allocated.sql +++ b/products/pluto/pluto_build/sql/create_allocated_migrated.sql @@ -1,4 +1,4 @@ --- create the allocated table from pluto_rpad_geo +-- create the allocated table from int__pluto_rpad_geo DROP TABLE IF EXISTS pluto_allocated CASCADE; CREATE TABLE pluto_allocated ( bbl text, @@ -32,7 +32,7 @@ CREATE TABLE pluto_allocated ( INSERT INTO pluto_allocated (bbl) SELECT b.primebbl -FROM (SELECT DISTINCT primebbl FROM pluto_rpad_geo) AS b; +FROM (SELECT DISTINCT primebbl FROM int__pluto_rpad_geo) AS b; -- fill in one-to-one attributes -- for noncondo records @@ -62,7 +62,7 @@ SET END), appbbl = ap_boro || lpad(ap_block, 5, '0') || lpad(ap_lot, 4, '0'), appdate = ap_datef -FROM pluto_rpad_geo AS b +FROM int__pluto_rpad_geo AS b WHERE a.bbl = b.primebbl AND b.tl NOT LIKE '75%' @@ -94,7 +94,7 @@ SET END), appbbl = ap_boro || lpad(ap_block, 5, '0') || lpad(ap_lot, 4, '0'), appdate = ap_datef -FROM pluto_rpad_geo AS b +FROM int__pluto_rpad_geo AS b WHERE a.bbl = b.primebbl AND b.tl LIKE '75%' @@ -107,7 +107,7 @@ WITH bldgareasums AS ( SELECT primebbl, sum(b.gross_sqft::numeric) AS bldgareasum - FROM pluto_rpad_geo AS b + FROM int__pluto_rpad_geo AS b WHERE b.tl NOT LIKE '75%' AND b.condo_number IS NOT NULL @@ -126,7 +126,7 @@ WITH primesumunits AS ( primebbl, sum(coop_apts::integer) AS unitsres, sum(units::integer) AS unitstotal - FROM pluto_rpad_geo AS b + FROM int__pluto_rpad_geo AS b WHERE b.tl NOT LIKE '75%' GROUP BY primebbl ) @@ -147,7 +147,7 @@ WITH primesums AS ( -- field no 
longer exists -- SUM(curexl_act::double precision) as exemptland, sum(curext_act::double precision) AS exempttot - FROM pluto_rpad_geo + FROM int__pluto_rpad_geo GROUP BY primebbl ) @@ -166,7 +166,7 @@ WITH unit_appbbls AS ( prg.primebbl, min(prg.ap_boro || lpad(prg.ap_block, 5, '0') || lpad(prg.ap_lot, 4, '0')) AS appbbl FROM pluto_allocated AS pa - INNER JOIN pluto_rpad_geo AS prg ON pa.bbl = prg.primebbl AND prg.primebbl <> prg.bbl + INNER JOIN int__pluto_rpad_geo AS prg ON pa.bbl = prg.primebbl AND prg.primebbl <> prg.bbl WHERE pa.appbbl IS NULL AND right(pa.bbl, 4) LIKE '75%' diff --git a/products/pluto/pluto_build/sql/create_pts.sql b/products/pluto/pluto_build/sql/create_pts_migrated.sql similarity index 100% rename from products/pluto/pluto_build/sql/create_pts.sql rename to products/pluto/pluto_build/sql/create_pts_migrated.sql diff --git a/products/pluto/pluto_build/sql/create_rpad_geo.sql b/products/pluto/pluto_build/sql/create_rpad_geo_migrated.sql similarity index 100% rename from products/pluto/pluto_build/sql/create_rpad_geo.sql rename to products/pluto/pluto_build/sql/create_rpad_geo_migrated.sql diff --git a/products/pluto/pluto_build/sql/edesignation.sql b/products/pluto/pluto_build/sql/edesignation.sql deleted file mode 100644 index 0e45c6f547..0000000000 --- a/products/pluto/pluto_build/sql/edesignation.sql +++ /dev/null @@ -1,24 +0,0 @@ --- set the E Designation number --- if there is more than one enumber for one lot take the enumber from the lowest ceqr_num and ulurp_num --- **change to using csv file** -WITH edesignation AS ( - SELECT - bbl, - enumber - FROM ( - SELECT - bbl, - enumber, - ROW_NUMBER() OVER ( - PARTITION BY bbl - ORDER BY ceqr_num, ulurp_num, enumber - ) AS row_number - FROM stg__dcp_edesignation - ) AS x - WHERE x.row_number = 1 -) - -UPDATE pluto a -SET edesignum = b.enumber -FROM edesignation AS b -WHERE a.bbl = b.bbl::text; diff --git a/products/pluto/pluto_build/sql/far.sql b/products/pluto/pluto_build/sql/far.sql 
deleted file mode 100644 index 4e3315a104..0000000000 --- a/products/pluto/pluto_build/sql/far.sql +++ /dev/null @@ -1,18 +0,0 @@ --- calculate the built FAR --- divide the total building are (bldgarea) by the total lot area (lotarea) -UPDATE pluto -SET builtfar = round(bldgarea::numeric / lotarea::numeric, 2) -WHERE lotarea != '0' AND lotarea IS NOT NULL; - --- using dcp_zoning_maxfar maintained by zoning division --- base on zoning district 1 -UPDATE pluto a -SET - residfar = coalesce(b.residfar::double precision, 0::double precision), - commfar = coalesce(b.commfar::double precision, 0::double precision), - facilfar = coalesce(b.facilfar::double precision, 0::double precision), - affresfar = coalesce(b.affresfar::double precision, 0::double precision), - mnffar = coalesce(b.mnffar::double precision, 0::double precision) -FROM pluto AS p -LEFT JOIN dcp_zoning_maxfar AS b ON p.zonedist1 = b.zonedist -WHERE a.bbl = p.bbl; diff --git a/products/pluto/pluto_build/sql/geocode_billingbbl.sql b/products/pluto/pluto_build/sql/geocode_billingbbl_migrated.sql similarity index 100% rename from products/pluto/pluto_build/sql/geocode_billingbbl.sql rename to products/pluto/pluto_build/sql/geocode_billingbbl_migrated.sql diff --git a/products/pluto/pluto_build/sql/geocode_notgeocoded.sql b/products/pluto/pluto_build/sql/geocode_notgeocoded.sql index 5a06c9cd6c..f4cb518da8 100644 --- a/products/pluto/pluto_build/sql/geocode_notgeocoded.sql +++ b/products/pluto/pluto_build/sql/geocode_notgeocoded.sql @@ -7,7 +7,7 @@ CREATE TABLE pluto_temp_qc_notgeocoded AS ( housenum_lo, street_name, COUNT(*) AS count - FROM pluto_rpad_geo + FROM int__pluto_rpad_geo WHERE cd IS NULL AND bbl IS NOT NULL GROUP BY bbl, billingbbl, housenum_lo, street_name ORDER BY bbl diff --git a/products/pluto/pluto_build/sql/geocodes.sql b/products/pluto/pluto_build/sql/geocodes.sql index 2c25a8f3a8..35cda2e894 100644 --- a/products/pluto/pluto_build/sql/geocodes.sql +++ 
b/products/pluto/pluto_build/sql/geocodes.sql @@ -22,13 +22,13 @@ SET address = concat(b.housenum_lo, ' ', b.street_name), ycoord = ltrim(b.ycoord, '0'), xcoord = ltrim(b.xcoord, '0') -FROM pluto_rpad_geo AS b +FROM int__pluto_rpad_geo AS b WHERE a.bbl = b.primebbl AND a.lot NOT LIKE '75%'; --updating the building code if it was not updated in alloceted UPDATE pluto a SET bldgclass = b.bldgcl -FROM pluto_rpad_geo AS b +FROM int__pluto_rpad_geo AS b WHERE a.bbl = b.primebbl AND a.lot NOT LIKE '75%' @@ -59,13 +59,13 @@ SET address = concat(b.housenum_lo, ' ', b.street_name), ycoord = ltrim(b.ycoord, '0'), xcoord = ltrim(b.xcoord, '0') -FROM pluto_rpad_geo AS b +FROM int__pluto_rpad_geo AS b WHERE a.bbl = b.borough || b.block || b.lot AND a.lot LIKE '75%'; --updating the building code if it was not updated in alloceted UPDATE pluto a SET bldgclass = b.bldgcl -FROM pluto_rpad_geo AS b +FROM int__pluto_rpad_geo AS b WHERE a.bbl = b.borough || b.block || b.lot AND a.lot LIKE '75%' diff --git a/products/pluto/pluto_build/sql/irrlotcode.sql b/products/pluto/pluto_build/sql/irrlotcode.sql deleted file mode 100644 index dfd9f474f0..0000000000 --- a/products/pluto/pluto_build/sql/irrlotcode.sql +++ /dev/null @@ -1,8 +0,0 @@ --- transform I/R RPAD codes to Y/N codes for irregular lot codes -UPDATE pluto -SET - irrlotcode - = (CASE - WHEN irrlotcode = 'I' THEN 'Y' - ELSE 'N' - END); diff --git a/products/pluto/pluto_build/sql/landuse.sql b/products/pluto/pluto_build/sql/landuse.sql deleted file mode 100644 index 03de22cb3a..0000000000 --- a/products/pluto/pluto_build/sql/landuse.sql +++ /dev/null @@ -1,16 +0,0 @@ --- Setting the landuse of the lot based on the building class --- uses the pluto_input_landuse_bldgclass lookup table -UPDATE pluto a -SET landuse = b.landuse -FROM pluto_input_landuse_bldgclass AS b -WHERE a.bldgclass = b.bldgclass; - --- set area source to 4 for vacant lots --- for vacant lots and number of buildings is 0 and building floor area is 0 -UPDATE pluto a 
-SET areasource = '4' -WHERE - (areasource IS NULL OR areasource = '0') - AND landuse = '11' - AND numbldgs::numeric = 0 - AND (bldgarea::numeric = 0 OR bldgarea IS NULL); diff --git a/products/pluto/pluto_build/sql/lotarea.sql b/products/pluto/pluto_build/sql/lotarea_migrated.sql similarity index 100% rename from products/pluto/pluto_build/sql/lotarea.sql rename to products/pluto/pluto_build/sql/lotarea_migrated.sql diff --git a/products/pluto/pluto_build/sql/lpc.sql b/products/pluto/pluto_build/sql/lpc.sql deleted file mode 100644 index da81acaf1e..0000000000 --- a/products/pluto/pluto_build/sql/lpc.sql +++ /dev/null @@ -1,95 +0,0 @@ --- INPUTS ALL RECORDS SEPERATED BY A ; --- -- if the lot is in a historical district add in the name of the Historic District from lpc_historic_districts --- -- concatenates historic districts seperated by a semi colon for lots within more than one Historic District --- UPDATE pluto a --- SET histdist = c.hist_dist_list --- FROM (SELECT b.bbl, string_agg(b.hist_dist, '; ') AS hist_dist_list --- FROM ( --- SELECT DISTINCT bbl, hist_dist --- FROM lpc_historic_districts --- ORDER BY hist_dist) AS b --- WHERE hist_dist <> '0' --- GROUP BY b.bbl) c --- WHERE a.borocode||lpad(a.block, 5, '0')||lpad(a.lot, 4, '0') = c.bbl; - --- -- if the lot contains a landmark add in the name of the landmark from lpc_landmarks --- -- concatenates landmark names seperated by a semi colon for lots with more than one landmark --- UPDATE pluto a --- SET landmark = c.lm_name_list --- FROM (SELECT b.bbl, string_agg(b.lm_name, '; ') AS lm_name_list --- FROM ( --- SELECT DISTINCT bbl, lm_name --- FROM lpc_landmarks --- ORDER BY lm_name) AS b --- GROUP BY b.bbl) c --- WHERE a.borocode||lpad(a.block, 5, '0')||lpad(a.lot, 4, '0') = c.bbl; - --- INPUTS FIRST ALPHABETICAL RECORD --- if the lot is in a historical district add in the name of the Historic District from lpc_historic_districts --- the first alphabetical historical district is appended -WITH histdistricts 
AS ( - SELECT - bbl, - hist_dist - FROM ( - SELECT - bbl, - hist_dist, - ROW_NUMBER() OVER ( - PARTITION BY bbl - ORDER BY hist_dist - ) AS row_number - FROM stg__lpc_historic_districts - WHERE - hist_dist != '0' - AND hist_dist NOT LIKE 'Individual Landmark%' - ) AS x - WHERE x.row_number = 1 -) - -UPDATE pluto a -SET histdist = histdistricts.hist_dist -FROM histdistricts -WHERE a.borocode || LPAD(a.block, 5, '0') || LPAD(a.lot, 4, '0') = histdistricts.bbl; - --- if the lot contains a landmark add mark it as an Interior, Individual, or Individual and Interior Landmark -WITH landmarks AS ( - SELECT DISTINCT - bbl, - lm_type, - ROW_NUMBER() OVER ( - PARTITION BY bbl - ORDER BY lm_type - ) AS row_number - FROM ( - SELECT DISTINCT - bbl, - lm_type - FROM stg__lpc_landmarks - WHERE - (lm_type = 'Interior Landmark' OR lm_type = 'Individual Landmark') - AND status = 'DESIGNATED' - AND most_curre = '1' - AND (last_actio = 'DESIGNATED' OR last_actio = 'DESIGNATED (AMENDMENT/MODIFICATION ACCEPTED)') - ) AS x -), - -maxnum AS ( - SELECT - bbl, - MAX(row_number) AS maxrow_number - FROM landmarks - GROUP BY bbl -) - -UPDATE pluto a -SET - landmark = (CASE - WHEN c.maxrow_number = 1 THEN UPPER(b.lm_type) - WHEN c.maxrow_number = 2 THEN UPPER('Individual and Interior Landmark') - ELSE UPPER(b.lm_type) - END) -FROM landmarks AS b, maxnum AS c -WHERE - a.borocode || LPAD(a.block, 5, '0') || LPAD(a.lot, 4, '0') = b.bbl - AND c.bbl = b.bbl; diff --git a/products/pluto/pluto_build/sql/miharea.sql b/products/pluto/pluto_build/sql/miharea.sql deleted file mode 100644 index d90ab36b20..0000000000 --- a/products/pluto/pluto_build/sql/miharea.sql +++ /dev/null @@ -1,152 +0,0 @@ --- Mandatory Inclusionary Housing (MIH) Area Assignment Logic --- --- Assign MIH affordability options to tax lots based on spatial overlap with MIH areas --- --- Assignment Strategy: --- Unlike transit zones where each lot gets assigned to exactly one zone, MIH areas can have --- multiple overlapping 
affordability options that ALL apply to a single lot. A lot is assigned --- to a MIH option if either: --- 1. ≥10% of the lot area is covered by the MIH area, OR --- 2. ≥50% of the MIH area is covered by the lot --- --- Multiple Options Per Lot: --- A single lot can legitimately have multiple MIH options (e.g., Option 1, Option 2, Deep Affordability). --- These are not competing assignments but rather cumulative policy options that apply to development --- on that lot. The final output pivots these into binary flags (mih_opt1, mih_opt2, etc.). --- --- Data Flow: --- 1. Clean MIH option names and create unique identifiers (mih_cleaned table) --- 2. Calculate spatial overlaps between lots and MIH areas (mih_lot_overlap table) --- 3. Filter to assignments meeting the coverage thresholds --- 4. Pivot multiple options per lot into binary columns on the pluto table - - -DROP TABLE IF EXISTS mih_cleaned; -CREATE TABLE mih_cleaned AS -SELECT - project_id || '-' || mih_option AS mih_id, - *, - trim( - -- Step 2b: collapse any sequence of commas (e.g., ",,", ",,,") - regexp_replace( - -- Step 2a: Replace "and" or "," (with any spaces) with a single comma - regexp_replace( - -- Step 1: Add space between "Option" and number - regexp_replace( - replace(mih_option, 'Affordablility', 'Affordability'), -- should probably fix this in the source data - 'Option(\d)', -- ← match "Option" followed by a digit - 'Option \1', -- ← insert space - 'g' - ), - '\s*(,|and)\s*', -- ← match a comma or "and" (with spaces) - ',', -- ← replace with a comma - 'g' - ), - ',+', -- ← match one or more commas in a row - ',', -- ← replace with a single comma - 'g' - ), - ', ' -- ← trim comma and space FROM start/end - ) AS cleaned_option -FROM dcp_gis_mandatory_inclusionary_housing; - - -DROP TABLE IF EXISTS mih_lot_overlap CASCADE; -CREATE TABLE mih_lot_overlap AS -WITH mih_per_area AS ( - SELECT - p.bbl, - m.project_id, - m.mih_id, - m.wkb_geometry AS mih_geom, - p.geom AS lot_geom, - m.cleaned_option, 
- st_area( - CASE - WHEN st_coveredby(p.geom, m.wkb_geometry) THEN p.geom - ELSE st_multi(st_intersection(p.geom, m.wkb_geometry)) - END - ) AS segbblgeom, - st_area(p.geom) AS allbblgeom, - st_area( - CASE - WHEN st_coveredby(m.wkb_geometry, p.geom) THEN m.wkb_geometry - ELSE st_multi(st_intersection(m.wkb_geometry, p.geom)) - END - ) AS segmihgeom, - st_area(m.wkb_geometry) AS allmihgeom - FROM pluto AS p - INNER JOIN mih_cleaned AS m - ON st_intersects(p.geom, m.wkb_geometry) -), -mih_areas AS ( - SELECT - bbl, - cleaned_option, - project_id, - mih_id, - sum(segbblgeom) AS segbblgeom, - sum(segmihgeom) AS segmihgeom, - sum(segbblgeom / allbblgeom) * 100 AS perbblgeom, - max(segmihgeom / allmihgeom) * 100 AS maxpermihgeom - FROM mih_per_area - GROUP BY bbl, cleaned_option, project_id, mih_id -) -SELECT * FROM mih_areas -WHERE perbblgeom >= 10 OR maxpermihgeom >= 50; - - --- QAQC: Create a view of all distinct MIH options found in the data --- This should only contain the four valid options: Option 1, Option 2, Deep Affordability Option, Workforce Option --- If additional options appear, there's a source data issue -DROP VIEW IF EXISTS mih_distinct_options CASCADE; -CREATE VIEW mih_distinct_options AS -WITH split_options AS ( - SELECT DISTINCT unnest(string_to_array(cleaned_option, ',')) AS option - FROM mih_lot_overlap -) -SELECT DISTINCT trim(option) AS option -FROM split_options -ORDER BY option; - --- NOTE: GIS will likely refactor dcp_mih into this pivoted format, --- so much this code will likely disappear. --- --- Find all distinct MIH areas that apply to a lot, and pivot to columns. --- e.g. 
if we have two rows from our geospatial join like so: --- bbl=123, mih_options=Option 1,Option 2 --- bbl=123, mih_options=Option 2,Option 3 --- we first aggregate to --- bbl=123, Option 1,Option 2,Option 2,Option 3 --- then pivot into distinct columns -WITH bbls_with_all_options AS ( - SELECT - bbl, - string_agg(cleaned_option, ',') AS all_options - FROM mih_lot_overlap - GROUP BY bbl -), pivoted AS ( - SELECT - bbl, - CASE - WHEN (all_options LIKE '%Option 1%') = true THEN '1' - END AS mih_opt1, - CASE - WHEN (all_options LIKE '%Option 2%') = true THEN '1' - END AS mih_opt2, - CASE - WHEN (all_options LIKE '%Option 3%' OR all_options LIKE '%Deep Affordability Option%') = true THEN '1' - END AS mih_opt3, - CASE - WHEN (all_options LIKE '%Workforce Option%') = true THEN '1' - END AS mih_opt4 - FROM bbls_with_all_options -) -UPDATE pluto -SET - mih_opt1 = m.mih_opt1, - mih_opt2 = m.mih_opt2, - mih_opt3 = m.mih_opt3, - mih_opt4 = m.mih_opt4 -FROM pivoted AS m -WHERE pluto.bbl = m.bbl diff --git a/products/pluto/pluto_build/sql/numericfields.sql b/products/pluto/pluto_build/sql/numericfields.sql deleted file mode 100644 index e8f1442105..0000000000 --- a/products/pluto/pluto_build/sql/numericfields.sql +++ /dev/null @@ -1,45 +0,0 @@ --- only allow numeric values in the lot depth field -UPDATE pluto a -SET lotdepth = NULL -WHERE - a.lotdepth ~ '[^0-9]' - AND lotdepth NOT LIKE '%.%'; --- only allow numeric values in the numfloors field -UPDATE pluto a -SET numfloors = NULL -WHERE - a.numfloors ~ '[^0-9]' - AND numfloors NOT LIKE '%.%'; --- only allow numfloors values >= 1 -UPDATE pluto a -SET numfloors = NULL -WHERE - a.numfloors IS NOT NULL - AND a.numfloors::numeric < 1; --- remove commas from lot area -UPDATE pluto a -SET lotarea = REPLACE(lotarea, ',', '') -WHERE lotarea LIKE '%,%'; - --- repetitive with numericfields_geomfields ---where sanborn is just spaces set to NULL -UPDATE pluto a -SET sanborn = NULL -WHERE a.sanborn !~ '[0-9]'; - ---where x/y cood is just 
spaces set to NULL -UPDATE pluto a -SET xcoord = NULL -WHERE a.xcoord !~ '[0-9]'; -UPDATE pluto a -SET ycoord = NULL -WHERE a.ycoord !~ '[0-9]'; - --- make appbbl a single 0 where it's zero -UPDATE pluto -SET appbbl = '0' -WHERE appbbl::numeric = 0; - --- make sanitdistrict numeric -UPDATE pluto -SET sanitdistrict = sanitdistrict::integer; diff --git a/products/pluto/pluto_build/sql/ownertype.sql b/products/pluto/pluto_build/sql/ownertype.sql deleted file mode 100644 index 05b4e89139..0000000000 --- a/products/pluto/pluto_build/sql/ownertype.sql +++ /dev/null @@ -1,12 +0,0 @@ --- set the owner type code based on data from COLP -UPDATE pluto a -SET ownertype = b.ownership -FROM stg__dcp_colp AS b -WHERE a.bbl::numeric = b.bbl::numeric; - --- set X as owner type -UPDATE pluto a -SET ownertype = 'X' -WHERE - a.exempttot = a.assesstot - AND a.ownertype IS NULL; diff --git a/products/pluto/pluto_build/sql/primebbl.sql b/products/pluto/pluto_build/sql/primebbl_migrated.sql similarity index 100% rename from products/pluto/pluto_build/sql/primebbl.sql rename to products/pluto/pluto_build/sql/primebbl_migrated.sql diff --git a/products/pluto/pluto_build/sql/sanitboro.sql b/products/pluto/pluto_build/sql/sanitboro.sql deleted file mode 100644 index 31afb90e4c..0000000000 --- a/products/pluto/pluto_build/sql/sanitboro.sql +++ /dev/null @@ -1,9 +0,0 @@ --- set the sanitation boro -UPDATE pluto -SET sanitboro = LEFT(sanitdistrict, 1) -WHERE sanitdistrict IS NOT NULL; - --- set the sanitdistrict to not include sanitboro -UPDATE pluto -SET sanitdistrict = RIGHT(sanitdistrict, 2) -WHERE sanitdistrict IS NOT NULL; diff --git a/products/pluto/pluto_build/sql/transitzone.sql b/products/pluto/pluto_build/sql/transitzone.sql deleted file mode 100644 index 88a627b868..0000000000 --- a/products/pluto/pluto_build/sql/transitzone.sql +++ /dev/null @@ -1,234 +0,0 @@ --- Transit zone assignment logic: --- Determine whether > 10% of a lot is covered by one of the transit zones - --- 
PERFORMANCE STRATEGY: --- The Transit Zone polygons are complex multipolygons with parts distributed across the city. --- For example, the Inner Transit Zone includes discontiguous polygon parts in Manhattan, Brooklyn and Queens. --- --- CRITICAL: We must split these transit zones into their contiguous atomic parts before performing --- spatial calculations. Without this decomposition, area calculations become prohibitively slow --- (>10 minutes). By first splitting the zones with ST_DUMP, then summing the intersections back up --- when calculating coverage percentages, we reduce processing time to ~1 minute. - --- ASSIGNMENT LOGIC: --- We assign transit zones using a two-tier strategy: --- --- 1. BLOCK-LEVEL ASSIGNMENT (when unambiguous): --- - For tax blocks where a single transit zone clearly dominates (no competing zones >10% coverage), --- assign all lots in that block to the dominant zone. --- - This captures cases where a transit zone boundary might cut through individual lots within --- a block, but we want those lots assigned consistently with their parent block. --- --- 2. LOT-LEVEL ASSIGNMENT (when ambiguous): --- - This is a fallback for when we can't determine a clear block-level assignment. --- - For example, when a rail line cuts straight through a "block", creating ambiguity about --- which transit zone the block belongs to. In such cases, we calculate coverage for each --- individual lot and assign based on the lot's specific overlap. --- - Special rule: If a lot has significant coverage in ANY zone AND "Beyond the Greater Transit Zone", --- we always default to the non-Beyond zone. --- - Note: We're not doing any complicated geometry splitting here, just falling back to --- lot-by-lot calculation when block-level assignment would be misleading. 
- --- Create decomposed transit zones table (break multipolygons into individual parts) -DROP TABLE IF EXISTS transit_zones_atomic_geoms CASCADE; -CREATE TABLE transit_zones_atomic_geoms AS -WITH decomposed AS ( - SELECT - transit_zone, - (ST_DUMP(wkb_geometry)).geom AS wkb_geometry - FROM dcp_transit_zones -) -SELECT - transit_zone, - wkb_geometry, - ROW_NUMBER() OVER (ORDER BY transit_zone) AS decomposed_id -FROM decomposed; -CREATE INDEX idx_transit_zones_atomic_geoms_gix ON transit_zones_atomic_geoms USING gist (wkb_geometry); - - --- Create the block geoms, splitting non-contiguous blocks into sub-blocks --- and assigning lots to their sub-block --- --- AR Note: I tried a few approaches for this, and perhaps there's a more clever/performant --- way to accomplish this. Unfortunately, the recommend approach of ST_ClusterDBSCAN --- will `sometimes` accomplish this, but it errors out seemingly randomly. -DROP TABLE IF EXISTS transit_zones_tax_blocks CASCADE; -CREATE TABLE transit_zones_tax_blocks AS -WITH block_unions AS ( - SELECT - borough, - block, - ST_UNION(p.geom) AS geom, - ARRAY_AGG(bbl) AS all_bbls - FROM pluto AS p - GROUP BY p.borough, p.block -), block_parts AS ( - SELECT - borough, - block, - all_bbls, - (ST_DUMP(geom)).geom AS geom - FROM block_unions -), numbered_parts AS ( - SELECT - borough, - block, - all_bbls, - geom, - ROW_NUMBER() OVER (PARTITION BY borough, block ORDER BY ST_AREA(geom) DESC) AS sub_block - FROM block_parts -), reassigned_bbls AS ( - SELECT - np.borough, - np.block, - np.sub_block, - np.geom, - ARRAY_AGG(p.bbl) AS bbls - FROM numbered_parts AS np - INNER JOIN pluto AS p - ON - np.borough = p.borough - AND np.block = p.block - AND ST_WITHIN(ST_POINTONSURFACE(p.geom), np.geom) - GROUP BY np.borough, np.block, np.sub_block, np.geom -) -SELECT - borough, - block, - sub_block, - borough || '-' || block || '-' || sub_block AS block_id, - geom, - bbls -FROM reassigned_bbls; -CREATE INDEX idx_transit_zones_tax_blocks_geom ON 
transit_zones_tax_blocks USING gist (geom); - - --- Step 1: Calculate coverage percentages for all tax blocks -DROP TABLE IF EXISTS transit_zones_block_to_tz_ranked CASCADE; -CREATE TABLE transit_zones_block_to_tz_ranked AS -WITH block_to_tz AS ( - SELECT - tb.borough, - tb.block, - tb.sub_block, - tb.geom, - tb.bbls, - t.transit_zone, - -- determine how much of the block is covered by the transit zone (sum up area of all intersecting atomic parts, then divide by block area) - ST_AREA(ST_INTERSECTION(tb.geom, ST_UNION(t.wkb_geometry))) / ST_AREA(tb.geom) * 100.0 AS pct_covered - FROM transit_zones_tax_blocks AS tb - INNER JOIN transit_zones_atomic_geoms AS t - ON ST_INTERSECTS(tb.geom, t.wkb_geometry) - GROUP BY tb.borough, tb.block, tb.sub_block, tb.geom, tb.bbls, t.transit_zone -) -SELECT - 'block' AS assignment_type, - borough || '-' || block || '-' || sub_block AS id, - borough, - block, - geom, - sub_block, - bbls, - transit_zone, - pct_covered, - ROW_NUMBER() OVER ( - PARTITION BY borough, block, sub_block - ORDER BY pct_covered DESC - ) AS tz_rank -FROM block_to_tz; -ANALYZE transit_zones_block_to_tz_ranked; - - --- For ambiguous blocks (those with competing transit zones), create lot-level assignments -DROP TABLE IF EXISTS transit_zones_bbl_to_tz_ranked CASCADE; -CREATE TABLE transit_zones_bbl_to_tz_ranked AS -WITH ambiguous_bbls AS ( - SELECT - UNNEST(bbls) AS bbl, - borough, - block, - sub_block - FROM transit_zones_block_to_tz_ranked AS tza - WHERE tza.tz_rank > 1 AND tza.pct_covered > 10 -), lot_to_tz AS ( - SELECT - p.bbl, - p.borough, - p.block, - t.transit_zone, - p.geom, - -- Calculate how much of the lot is covered by the transit zone - ST_AREA(ST_INTERSECTION(p.geom, ST_UNION(t.wkb_geometry))) / ST_AREA(p.geom) * 100.0 AS pct_covered - FROM pluto AS p - INNER JOIN ambiguous_bbls AS ab ON p.bbl = ab.bbl - INNER JOIN transit_zones_atomic_geoms AS t - ON ST_INTERSECTS(p.geom, t.wkb_geometry) - GROUP BY p.bbl, p.borough, p.block, t.transit_zone, 
p.geom -), lot_to_tz_with_rank AS ( - SELECT - lt.*, - tzr.tz_rank AS priority_rank - FROM lot_to_tz AS lt - LEFT JOIN dcp_transit_zone_ranks AS tzr ON lt.transit_zone = tzr.tz_name -), filtered_zones AS ( - -- If a lot has ANY zone with priority_rank < 4, exclude "Beyond the Greater Transit Zone" (rank 4) - SELECT - ltr.*, - NOT COALESCE( - EXISTS ( - SELECT 1 FROM lot_to_tz_with_rank AS inner_ltr - WHERE inner_ltr.bbl = ltr.bbl AND inner_ltr.priority_rank < 4 - ) AND ltr.priority_rank = 4, FALSE - ) AS include_zone - FROM lot_to_tz_with_rank AS ltr -) -SELECT - 'lot' AS assignment_type, - bbl::text AS id, - borough, - block, - geom, - 1 AS sub_block, - ARRAY[bbl] AS bbls, - transit_zone, - pct_covered, - ROW_NUMBER() OVER ( - PARTITION BY bbl - ORDER BY pct_covered DESC - ) AS tz_rank -FROM filtered_zones -WHERE include_zone = TRUE; -ANALYZE transit_zones_bbl_to_tz_ranked; - --- Assign the primary transit zone by --- 1. tax block, when the block's tz assignment is not ambiguous --- 2. by lot, even when ambiguous. 
We'll use corrections afterwards, if necessary - --- Assign transit zones using both block-level and lot-level strategies -UPDATE pluto a -SET trnstzone = assignments.transit_zone -FROM ( - -- Block-level assignments for non-ambiguous blocks - SELECT - UNNEST(bbls) AS bbl, - transit_zone - FROM transit_zones_block_to_tz_ranked AS block_tz - WHERE - block_tz.tz_rank = 1 - -- Only assign blocks that are not ambiguous (no second-ranked transit zone) - AND NOT EXISTS ( - SELECT 1 FROM transit_zones_block_to_tz_ranked AS ambiguous - WHERE - ambiguous.id = block_tz.id - AND ambiguous.tz_rank = 2 - AND ambiguous.pct_covered > 10 - ) - UNION ALL - -- Lot-level assignments for ambiguous blocks - SELECT - bbls[1] AS bbl, - transit_zone - FROM transit_zones_bbl_to_tz_ranked - WHERE tz_rank = 1 -) AS assignments -WHERE a.bbl = assignments.bbl; diff --git a/products/pluto/pluto_build/sql/update_empty_coord.sql b/products/pluto/pluto_build/sql/update_empty_coord.sql index 9681918fc6..89a0030081 100644 --- a/products/pluto/pluto_build/sql/update_empty_coord.sql +++ b/products/pluto/pluto_build/sql/update_empty_coord.sql @@ -21,10 +21,7 @@ WITH update_table AS ( UPDATE pluto b SET xcoord = ST_X(ST_TRANSFORM(t.centroid, 2263)), - ycoord = ST_Y(ST_TRANSFORM(t.centroid, 2263)), - longitude = ST_X(t.centroid), - latitude = ST_Y(t.centroid), - centroid = ST_SETSRID(t.centroid, 4326) + ycoord = ST_Y(ST_TRANSFORM(t.centroid, 2263)) FROM update_table AS t WHERE b.xcoord IS NULL AND b.bbl = t.bbl diff --git a/products/pluto/pluto_build/sql/yearbuiltalt.sql b/products/pluto/pluto_build/sql/yearbuiltalt_migrated.sql similarity index 100% rename from products/pluto/pluto_build/sql/yearbuiltalt.sql rename to products/pluto/pluto_build/sql/yearbuiltalt_migrated.sql diff --git a/products/pluto/pluto_build/sql/zerovacantlots.sql b/products/pluto/pluto_build/sql/zerovacantlots_migrated.sql similarity index 100% rename from products/pluto/pluto_build/sql/zerovacantlots.sql rename to 
products/pluto/pluto_build/sql/zerovacantlots_migrated.sql diff --git a/products/pluto/seeds/_seeds.yml b/products/pluto/seeds/_seeds.yml index 1276c23127..14bc1f2b16 100644 --- a/products/pluto/seeds/_seeds.yml +++ b/products/pluto/seeds/_seeds.yml @@ -34,6 +34,9 @@ seeds: - name: dcp_zoning_maxfar description: Maximum Floor Area Ratio (FAR) by zoning district + + - name: dcp_transit_zone_ranks + description: Transit zone priority rankings for assignment logic - name: pluto_input_landuse_bldgclass description: Building class to land use mapping