Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,8 @@ package-lock.yml

# uv
uv.lock

# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
148 changes: 148 additions & 0 deletions bash/bin/dcp_distribute
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#!/bin/bash
# Trigger GitHub Actions distribute workflow for a product
# See action workflow file: .github/workflows/distribute_socrata_from_bytes.yml
# Usage: dcp_distribute [product] [version] [--source <source>] [--datasets <d1,d2>]
#        [--destination-tags <t1,t2>] [--destination-types <t1,t2>]
#        [--publish] [--metadata-only] [--validate] [--dry-run] [--no-tail]

set -e

# Defaults
TAIL_LOGS=true
SOURCE="bytes"
DESTINATION_TYPES="open_data"
PUBLISH=false
METADATA_ONLY=false
VALIDATE=false
DRY_RUN=false
DATASETS=""
DESTINATION_TAGS=""

# Parse flags; any unrecognized argument is collected as a positional arg
# (product, then version).
POSITIONAL=()
while [[ $# -gt 0 ]]; do
    case "$1" in
        --no-tail)
            TAIL_LOGS=false
            shift
            ;;
        --source)
            SOURCE="$2"
            shift 2
            ;;
        --datasets)
            DATASETS="$2"
            shift 2
            ;;
        --destination-tags)
            DESTINATION_TAGS="$2"
            shift 2
            ;;
        --destination-types)
            DESTINATION_TYPES="$2"
            shift 2
            ;;
        --publish)
            PUBLISH=true
            shift
            ;;
        --metadata-only)
            METADATA_ONLY=true
            shift
            ;;
        --validate)
            VALIDATE=true
            shift
            ;;
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        *)
            POSITIONAL+=("$1")
            shift
            ;;
    esac
done

# Positional args win; fall back to PRODUCT/VERSION env vars.
PRODUCT="${POSITIONAL[0]:-${PRODUCT}}"
VERSION="${POSITIONAL[1]:-${VERSION}}"
BRANCH=$(git branch --show-current)

# `git branch --show-current` prints nothing on a detached HEAD; without this
# guard the workflow would be dispatched with an empty --ref.
if [ -z "$BRANCH" ]; then
    echo "Error: could not determine current git branch (detached HEAD?)"
    exit 1
fi

if [ -z "$PRODUCT" ]; then
    echo "Error: PRODUCT must be set as env var or passed as first argument"
    echo "Usage: dcp_distribute <product> <version> [options]"
    echo "   or: PRODUCT=pluto VERSION=25v1 dcp_distribute [options]"
    echo ""
    echo "Options:"
    echo "  --source <source>            Source ID (default: bytes)"
    echo "  --datasets <d1,d2>           Comma-separated list of datasets to push"
    echo "  --destination-tags <t1,t2>   Comma-separated list of destination tags"
    echo "  --destination-types <t1,t2>  Comma-separated list of destination types (default: open_data)"
    echo "  --publish                    Publish the Socrata Revision"
    echo "  --metadata-only              Only push metadata (including attachments)"
    echo "  --validate                   Validate assembled dataset files"
    echo "  --dry-run                    Perform a dry run (will just list destinations)"
    echo "  --no-tail                    Do not tail workflow logs after dispatch"
    exit 1
fi

if [ -z "$VERSION" ]; then
    echo "Error: VERSION must be set as env var or passed as second argument"
    echo "Usage: dcp_distribute <product> <version> [options]"
    exit 1
fi

echo "Triggering distribute workflow for:"
echo "  Product: $PRODUCT"
echo "  Version: $VERSION"
echo "  Source: $SOURCE"
echo "  Branch: $BRANCH"
[ -n "$DATASETS" ] && echo "  Datasets: $DATASETS"
[ -n "$DESTINATION_TAGS" ] && echo "  Destination tags: $DESTINATION_TAGS"
[ -n "$DESTINATION_TYPES" ] && echo "  Destination types: $DESTINATION_TYPES"
[ "$PUBLISH" = true ] && echo "  Publish: true"
[ "$METADATA_ONLY" = true ] && echo "  Metadata only: true"
[ "$VALIDATE" = true ] && echo "  Validate files: true"
[ "$DRY_RUN" = true ] && echo "  Dry run: true"
echo ""

# Build the `gh workflow run` argument list; optional inputs are appended
# only when provided so the workflow's own defaults apply otherwise.
WORKFLOW_ARGS=(
    --ref "$BRANCH"
    -f product="$PRODUCT"
    -f version="$VERSION"
    -f source="$SOURCE"
    -f destination_types="$DESTINATION_TYPES"
    -f publish="$PUBLISH"
    -f metadata_only="$METADATA_ONLY"
    -f validate_dataset_files="$VALIDATE"
    -f dry_run="$DRY_RUN"
)

[ -n "$DATASETS" ] && WORKFLOW_ARGS+=(-f datasets="$DATASETS")
[ -n "$DESTINATION_TAGS" ] && WORKFLOW_ARGS+=(-f destination_tags="$DESTINATION_TAGS")

gh workflow run distribute_socrata_from_bytes.yml "${WORKFLOW_ARGS[@]}"

echo ""
echo "✓ Workflow dispatched!"
echo ""

# Wait a moment for the run to appear in the API
sleep 2

# Get the most recent run for this workflow and branch.
# NOTE: this picks up the newest run on the branch, which is *presumably* the
# one just dispatched — a concurrent dispatch could race it.
RUN_INFO=$(gh run list --workflow=distribute_socrata_from_bytes.yml --branch="$BRANCH" --limit 1 --json databaseId,url --jq '.[0]')
# `.[0]` is the JSON literal `null` when no runs exist yet; `// empty` maps
# null/missing to an empty string so the -n test below behaves correctly
# (plain `jq -r '.url'` would emit the 4-char string "null").
RUN_URL=$(echo "$RUN_INFO" | jq -r '.url // empty')
RUN_ID=$(echo "$RUN_INFO" | jq -r '.databaseId // empty')

if [ -n "$RUN_URL" ]; then
    # Create clickable hyperlink using ANSI escape codes (OSC 8)
    echo -e "View run: \033]8;;${RUN_URL}\033\\${RUN_URL}\033]8;;\033\\"
    if [ "$TAIL_LOGS" = true ]; then
        echo ""
        echo "Tailing logs..."
        gh run watch "$RUN_ID"
    fi
else
    echo "View runs: gh run list --workflow=distribute_socrata_from_bytes.yml --branch=$BRANCH"
fi
1 change: 1 addition & 0 deletions bash/bin/dcp_trigger_build
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/bin/bash
# Trigger GitHub Actions build workflow for current branch
# See action workflow file: .github/workflows/build.yml
# Usage: dcp_trigger_build [dataset] [recipe] [build_note] [--no-tail]

set -e
Expand Down
6 changes: 6 additions & 0 deletions dcpy/connectors/edm/bytes/_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,12 @@ def get_latest_version(self, key: str, **_) -> str:
*self._key_to_product_dataset(key)
)

def get_page_url(self, key: str) -> str | None:
    """Return the BYTES web page URL for *key*.

    Returns None when the key cannot be resolved — i.e. a KeyError is
    raised while splitting the key into (product, dataset) or while
    looking it up in the sitemap.
    """
    try:
        resource_parts = self._key_to_product_dataset(key)
        return _sitemap.get_page_url(*resource_parts)
    except KeyError:
        return None

def fetch_all_latest_versions(self) -> list[tuple[str, str, str]]:
"""Fetch latest versions for all datasets.

Expand Down
5 changes: 5 additions & 0 deletions dcpy/connectors/edm/bytes/_sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
BYTES_API_PREFIX = (
"https://apps.nyc.gov/content-api/v1/content/planning/resources/datasets"
)
BYTES_PAGE_URL_PREFIX = "https://www.nyc.gov/content/planning/pages/resources/datasets"


class _FileUrlInfo(BaseModel, extra="forbid"):
Expand Down Expand Up @@ -92,6 +93,10 @@ def get_most_recent_version_url(product, dataset) -> str:
return f"{BYTES_API_PREFIX}/{get_product_dataset_bytes_resource(product, dataset)}"


def get_page_url(product, dataset) -> str:
    """Return the public BYTES dataset page URL for product/dataset."""
    resource = get_product_dataset_bytes_resource(product, dataset)
    return "/".join([BYTES_PAGE_URL_PREFIX, resource])


def get_dataset_catalog_json_url(product, dataset) -> str | None:
conf = SITE_MAP[product][dataset]
if conf.no_archived_versions:
Expand Down
2 changes: 1 addition & 1 deletion dcpy/connectors/edm/bytes/site_map.json
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@
"digital_city_map__geodatabase": {
"files": {
"fgdb": {
"filename": "dcm_{v}fdgb.zip"
"filename": "dcm_{v}fgdb.zip"
},
"dcm_street_name_changes_areas_pdf": {
"filename": "dcm_street_name_changes_areas.pdf"
Expand Down
17 changes: 16 additions & 1 deletion dcpy/lifecycle/scripts/version_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ def __hash__(self):
return hash(self.original)


def open_data_page_url(four_four: str) -> str:
    """Return the NYC Open Data landing-page URL for a Socrata asset.

    *four_four* is the Socrata "four-four" asset identifier
    (e.g. "abcd-1234"); no validation of its shape is performed.
    """
    base = "https://data.cityofnewyork.us/d"
    return f"{base}/{four_four}"


def sort_by_outdated_products(df):
"""
Sort dataframe to show products with outdated datasets first.
Expand Down Expand Up @@ -161,7 +165,7 @@ def sort_by_outdated_products(df):
return df_sorted.set_index(["product", "dataset"])


def get_all_open_data_keys():
def get_all_open_data_keys() -> list[str]:
"""retrieve all product.dataset.destination_ids"""
return product_metadata.load(version="dummy").query_product_dataset_destinations(
destination_filter={"types": {"open_data"}},
Expand Down Expand Up @@ -193,9 +197,18 @@ def get_bytes_versions(all_keys):


def make_comparison_dataframe(bytes_versions, open_data_versions):
metadata = product_metadata.load(version="dummy")
rows = []
for key in open_data_versions:
product, dataset, destination_id = key.split(".")
four_four = (
metadata.product(product)
.dataset(dataset)
.get_destination(destination_id)
.custom.get("four_four")
)
open_data_url = open_data_page_url(four_four) if four_four else None
bytes_url = connectors["bytes"].get_page_url(f"{product}.{dataset}")
bytes_version = bytes_versions.get(f"{product}.{dataset}")
open_data_vers = open_data_versions.get(key, [])

Expand All @@ -216,6 +229,8 @@ def make_comparison_dataframe(bytes_versions, open_data_versions):
"bytes_version": bytes_version,
"open_data_versions": open_data_vers,
"up_to_date": up_to_date,
"bytes_url": bytes_url,
"open_data_url": open_data_url,
}
)
df = pd.DataFrame(rows).set_index(["product", "dataset"]).sort_index()
Expand Down
1 change: 0 additions & 1 deletion notebooks/marimo/admin/version_comparison.py

This file was deleted.

37 changes: 24 additions & 13 deletions notebooks/marimo/lifecycle/distribution/bytes_socrata_versions.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,49 @@
import marimo

__generated_with = "0.18.3"
__generated_with = "0.19.7"
app = marimo.App(width="full")

with app.setup:
import marimo as mo

from dcpy.lifecycle.scripts import version_compare


@app.cell(hide_code=True)
def _(mo):
def _():
mo.md(r"""
# Socrata <> Bytes version comparison tracker

FYI, this might take a minute or two to run.
""")
return


@app.cell
@app.cell(hide_code=True)
def _():
from dcpy.lifecycle.scripts import version_compare

versions = version_compare.run()
return (versions,)
mo.md(r"""
## Helpful links
- Github action to distribute from Bytes to Open Data: https://github.com/NYCPlanning/data-engineering/actions/workflows/distribute_socrata_from_bytes.yml
- Open Data page to sign in and publish revisions: https://opendata.cityofnewyork.us/
- Product Metadata repo: https://github.com/NYCPlanning/product-metadata
""")
return


@app.cell
def _(versions):
versions
mo.ui.table(versions.reset_index(), page_size=25, selection=None)
return


@app.cell
@app.cell(hide_code=True)
def _():
import marimo as mo
with mo.status.spinner(title="Fetching versions from Bytes and Open Data ..."):
versions = version_compare.run()
return (versions,)


return (mo,)
@app.cell
def _():
return


if __name__ == "__main__":
Expand Down
Loading