diff --git a/.gitignore b/.gitignore index 3faba4e613..308df4a122 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,8 @@ package-lock.yml # uv uv.lock + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ diff --git a/bash/bin/dcp_distribute b/bash/bin/dcp_distribute new file mode 100755 index 0000000000..5595f5c361 --- /dev/null +++ b/bash/bin/dcp_distribute @@ -0,0 +1,148 @@ +#!/bin/bash +# Trigger GitHub Actions distribute workflow for a product +# See action workflow file: .github/workflows/distribute_socrata_from_bytes.yml +# Usage: dcp_distribute [product] [version] [--source <source>] [--datasets <datasets>] +# [--destination-tags <tags>] [--destination-types <types>] +# [--publish] [--metadata-only] [--validate] [--dry-run] [--no-tail] + +set -e + +# Defaults +TAIL_LOGS=true +SOURCE="bytes" +DESTINATION_TYPES="open_data" +PUBLISH=false +METADATA_ONLY=false +VALIDATE=false +DRY_RUN=false +DATASETS="" +DESTINATION_TAGS="" + +POSITIONAL=() +while [[ $# -gt 0 ]]; do + case "$1" in + --no-tail) + TAIL_LOGS=false + shift + ;; + --source) + SOURCE="$2" + shift 2 + ;; + --datasets) + DATASETS="$2" + shift 2 + ;; + --destination-tags) + DESTINATION_TAGS="$2" + shift 2 + ;; + --destination-types) + DESTINATION_TYPES="$2" + shift 2 + ;; + --publish) + PUBLISH=true + shift + ;; + --metadata-only) + METADATA_ONLY=true + shift + ;; + --validate) + VALIDATE=true + shift + ;; + --dry-run) + DRY_RUN=true + shift + ;; + *) + POSITIONAL+=("$1") + shift + ;; + esac +done + +PRODUCT="${POSITIONAL[0]:-${PRODUCT}}" +VERSION="${POSITIONAL[1]:-${VERSION}}" +BRANCH=$(git branch --show-current) + +if [ -z "$PRODUCT" ]; then + echo "Error: PRODUCT must be set as env var or passed as first argument" + echo "Usage: dcp_distribute <product> <version> [options]" + echo " or: PRODUCT=pluto VERSION=25v1 dcp_distribute [options]" + echo "" + echo "Options:" + echo " --source <source> Source ID (default: bytes)" + echo " --datasets <datasets> Comma-separated list of datasets to push" + echo " --destination-tags <tags> Comma-separated list of destination tags" +
echo " --destination-types <types> Comma-separated list of destination types (default: open_data)" + echo " --publish Publish the Socrata Revision" + echo " --metadata-only Only push metadata (including attachments)" + echo " --validate Validate assembled dataset files" + echo " --dry-run Perform a dry run (will just list destinations)" + echo " --no-tail Do not tail workflow logs after dispatch" + exit 1 +fi + +if [ -z "$VERSION" ]; then + echo "Error: VERSION must be set as env var or passed as second argument" + echo "Usage: dcp_distribute <product> <version> [options]" + exit 1 +fi + +echo "Triggering distribute workflow for:" +echo " Product: $PRODUCT" +echo " Version: $VERSION" +echo " Source: $SOURCE" +echo " Branch: $BRANCH" +[ -n "$DATASETS" ] && echo " Datasets: $DATASETS" +[ -n "$DESTINATION_TAGS" ] && echo " Destination tags: $DESTINATION_TAGS" +[ -n "$DESTINATION_TYPES" ] && echo " Destination types: $DESTINATION_TYPES" +[ "$PUBLISH" = true ] && echo " Publish: true" +[ "$METADATA_ONLY" = true ] && echo " Metadata only: true" +[ "$VALIDATE" = true ] && echo " Validate files: true" +[ "$DRY_RUN" = true ] && echo " Dry run: true" +echo "" + +WORKFLOW_ARGS=( + --ref "$BRANCH" + -f product="$PRODUCT" + -f version="$VERSION" + -f source="$SOURCE" + -f destination_types="$DESTINATION_TYPES" + -f publish="$PUBLISH" + -f metadata_only="$METADATA_ONLY" + -f validate_dataset_files="$VALIDATE" + -f dry_run="$DRY_RUN" +) + +[ -n "$DATASETS" ] && WORKFLOW_ARGS+=(-f datasets="$DATASETS") +[ -n "$DESTINATION_TAGS" ] && WORKFLOW_ARGS+=(-f destination_tags="$DESTINATION_TAGS") + +gh workflow run distribute_socrata_from_bytes.yml "${WORKFLOW_ARGS[@]}" + +echo "" +echo "✓ Workflow dispatched!" 
+echo "" + +# Wait a moment for the run to appear in the API +sleep 2 + +# Get the most recent run for this workflow and branch +RUN_INFO=$(gh run list --workflow=distribute_socrata_from_bytes.yml --branch="$BRANCH" --limit 1 --json databaseId,url --jq '.[0]') +RUN_URL=$(echo "$RUN_INFO" | jq -r '.url') +RUN_ID=$(echo "$RUN_INFO" | jq -r '.databaseId') + +if [ -n "$RUN_URL" ]; then + # Create clickable hyperlink using ANSI escape codes + echo -e "View run: \033]8;;${RUN_URL}\033\\${RUN_URL}\033]8;;\033\\" + if [ "$TAIL_LOGS" = true ]; then + echo "" + echo "Tailing logs..." + gh run watch "$RUN_ID" + fi +else + echo "View runs: gh run list --workflow=distribute_socrata_from_bytes.yml --branch=$BRANCH" +fi diff --git a/bash/bin/dcp_trigger_build b/bash/bin/dcp_trigger_build index f5ef960c13..66a14bf05a 100755 --- a/bash/bin/dcp_trigger_build +++ b/bash/bin/dcp_trigger_build @@ -1,5 +1,6 @@ #!/bin/bash # Trigger GitHub Actions build workflow for current branch +# See action workflow file: .github/workflows/build.yml # Usage: dcp_trigger_build [dataset] [recipe] [build_note] [--no-tail] set -e diff --git a/dcpy/connectors/edm/bytes/_connector.py b/dcpy/connectors/edm/bytes/_connector.py index 9ee3be1c2f..d045907427 100644 --- a/dcpy/connectors/edm/bytes/_connector.py +++ b/dcpy/connectors/edm/bytes/_connector.py @@ -148,6 +148,12 @@ def get_latest_version(self, key: str, **_) -> str: *self._key_to_product_dataset(key) ) + def get_page_url(self, key: str) -> str | None: + try: + return _sitemap.get_page_url(*self._key_to_product_dataset(key)) + except KeyError: + return None + def fetch_all_latest_versions(self) -> list[tuple[str, str, str]]: """Fetch latest versions for all datasets. 
diff --git a/dcpy/connectors/edm/bytes/_sitemap.py b/dcpy/connectors/edm/bytes/_sitemap.py index 0bd0d7fcb7..2510a624b3 100644 --- a/dcpy/connectors/edm/bytes/_sitemap.py +++ b/dcpy/connectors/edm/bytes/_sitemap.py @@ -20,6 +20,7 @@ BYTES_API_PREFIX = ( "https://apps.nyc.gov/content-api/v1/content/planning/resources/datasets" ) +BYTES_PAGE_URL_PREFIX = "https://www.nyc.gov/content/planning/pages/resources/datasets" class _FileUrlInfo(BaseModel, extra="forbid"): @@ -92,6 +93,10 @@ def get_most_recent_version_url(product, dataset) -> str: return f"{BYTES_API_PREFIX}/{get_product_dataset_bytes_resource(product, dataset)}" +def get_page_url(product, dataset) -> str: + return f"{BYTES_PAGE_URL_PREFIX}/{get_product_dataset_bytes_resource(product, dataset)}" + + def get_dataset_catalog_json_url(product, dataset) -> str | None: conf = SITE_MAP[product][dataset] if conf.no_archived_versions: diff --git a/dcpy/connectors/edm/bytes/site_map.json b/dcpy/connectors/edm/bytes/site_map.json index db748c1d74..1a4e1fae54 100644 --- a/dcpy/connectors/edm/bytes/site_map.json +++ b/dcpy/connectors/edm/bytes/site_map.json @@ -146,7 +146,7 @@ "digital_city_map__geodatabase": { "files": { "fgdb": { - "filename": "dcm_{v}fdgb.zip" + "filename": "dcm_{v}fgdb.zip" }, "dcm_street_name_changes_areas_pdf": { "filename": "dcm_street_name_changes_areas.pdf" diff --git a/dcpy/lifecycle/scripts/version_compare.py b/dcpy/lifecycle/scripts/version_compare.py index 26ec439ba6..7abbe1a071 100644 --- a/dcpy/lifecycle/scripts/version_compare.py +++ b/dcpy/lifecycle/scripts/version_compare.py @@ -114,6 +114,10 @@ def __hash__(self): return hash(self.original) +def open_data_page_url(four_four: str) -> str: + return f"https://data.cityofnewyork.us/d/{four_four}" + + def sort_by_outdated_products(df): """ Sort dataframe to show products with outdated datasets first. 
@@ -161,7 +165,7 @@ def sort_by_outdated_products(df): return df_sorted.set_index(["product", "dataset"]) -def get_all_open_data_keys(): +def get_all_open_data_keys() -> list[str]: """retrieve all product.dataset.destination_ids""" return product_metadata.load(version="dummy").query_product_dataset_destinations( destination_filter={"types": {"open_data"}}, @@ -193,9 +197,18 @@ def get_bytes_versions(all_keys): def make_comparison_dataframe(bytes_versions, open_data_versions): + metadata = product_metadata.load(version="dummy") rows = [] for key in open_data_versions: product, dataset, destination_id = key.split(".") + four_four = ( + metadata.product(product) + .dataset(dataset) + .get_destination(destination_id) + .custom.get("four_four") + ) + open_data_url = open_data_page_url(four_four) if four_four else None + bytes_url = connectors["bytes"].get_page_url(f"{product}.{dataset}") bytes_version = bytes_versions.get(f"{product}.{dataset}") open_data_vers = open_data_versions.get(key, []) @@ -216,6 +229,8 @@ def make_comparison_dataframe(bytes_versions, open_data_versions): "bytes_version": bytes_version, "open_data_versions": open_data_vers, "up_to_date": up_to_date, + "bytes_url": bytes_url, + "open_data_url": open_data_url, } ) df = pd.DataFrame(rows).set_index(["product", "dataset"]).sort_index() diff --git a/notebooks/marimo/admin/version_comparison.py b/notebooks/marimo/admin/version_comparison.py deleted file mode 100644 index 4203c53cd4..0000000000 --- a/notebooks/marimo/admin/version_comparison.py +++ /dev/null @@ -1 +0,0 @@ -# We'll implement the little dashboard here for bytes <> socrata version comparison diff --git a/notebooks/marimo/lifecycle/distribution/bytes_socrata_versions.py b/notebooks/marimo/lifecycle/distribution/bytes_socrata_versions.py index e54c133b79..2deb358814 100644 --- a/notebooks/marimo/lifecycle/distribution/bytes_socrata_versions.py +++ b/notebooks/marimo/lifecycle/distribution/bytes_socrata_versions.py @@ -1,38 +1,49 @@ import 
marimo -__generated_with = "0.18.3" +__generated_with = "0.19.7" app = marimo.App(width="full") +with app.setup: + import marimo as mo + + from dcpy.lifecycle.scripts import version_compare + @app.cell(hide_code=True) -def _(mo): +def _(): mo.md(r""" # Socrata <> Bytes version comparison tracker - - FYI, this might take a minute or two to run. """) return -@app.cell +@app.cell(hide_code=True) def _(): - from dcpy.lifecycle.scripts import version_compare - - versions = version_compare.run() - return (versions,) + mo.md(r""" + ## Helpful links + - Github action to distribute from Bytes to Open Data: https://github.com/NYCPlanning/data-engineering/actions/workflows/distribute_socrata_from_bytes.yml + - Open Data page to sign in and publish revisions: https://opendata.cityofnewyork.us/ + - Product Metadata repo: https://github.com/NYCPlanning/product-metadata + """) + return @app.cell def _(versions): - versions + mo.ui.table(versions.reset_index(), page_size=25, selection=None) return -@app.cell +@app.cell(hide_code=True) def _(): - import marimo as mo + with mo.status.spinner(title="Fetching versions from Bytes and Open Data ..."): + versions = version_compare.run() + return (versions,) + - return (mo,) +@app.cell +def _(): + return if __name__ == "__main__":