diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index d027e2df..3831e438 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -5,12 +5,9 @@ on: - develop tags: - 'v*' - pull_request: - types: [opened, synchronize, reopened, closed] permissions: contents: write - pull-requests: write jobs: deploy-versions: @@ -20,20 +17,24 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 # Important for mike to work with tags - - - uses: astral-sh/setup-uv@v3 + + - uses: astral-sh/setup-uv@v7 with: - version: "0.5.*" + version: "0.9.*" enable-cache: true - name: Sync - run: | + run: | uv sync git restore uv.lock - + - name: Configure Git run: | git config --local user.name "GitHub Actions" git config --local user.email "actions@github.com" + + - name: Fetch gh-pages branch + run: | + git fetch origin gh-pages --depth=1 || true - name: Deploy docs env: @@ -56,166 +57,3 @@ jobs: uv run mike deploy --push --update-aliases dev uv run mike set-default --push dev fi - - # Job for PR previews - preview: - if: github.event_name == 'pull_request' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - uses: astral-sh/setup-uv@v3 - with: - version: "0.5.*" - enable-cache: true - - name: Sync - run: | - uv sync - git diff - - # Only build and deploy if PR is opened/synchronized/reopened - - name: Build docs - if: github.event.action != 'closed' - env: - EARTHDATA_USERNAME: ${{ secrets.earthdata_username }} - EARTHDATA_PASSWORD: ${{ secrets.earthdata_password }} - run: | - echo "machine urs.earthdata.nasa.gov\nlogin ${EARTHDATA_USERNAME}\npassword ${EARTHDATA_PASSWORD}" > ~/.netrc - uv run mkdocs build --site-dir site/pr-${{ github.event.pull_request.number }} - - - name: Deploy preview - id: deploy - if: github.event.action != 'closed' - run: | - git config --global user.name "GitHub Actions" - git config --global user.email "actions@github.com" - - # Checkout gh-pages branch - git restore . - git fetch origin gh-pages --depth=1 - git checkout gh-pages - - # Create preview directory if it doesn't exist - mkdir -p pr-previews - - # Copy new preview - rm -rf pr-previews/pr-${{ github.event.pull_request.number }} - cp -r site/pr-${{ github.event.pull_request.number }} pr-previews/ - - # Check if there are actual changes in git - git add pr-previews - if git diff --staged --quiet; then - echo "No changes in documentation. Skipping deployment." - echo "has_changes=false" >> $GITHUB_OUTPUT - exit 0 - fi - - # If we get here, there are changes - git commit -m "Deploy preview for PR #${{ github.event.pull_request.number }}" - git push origin gh-pages - echo "has_changes=true" >> $GITHUB_OUTPUT - - - name: Post/Update PR Comment - if: github.event.action != 'closed' - uses: actions/github-script@v6 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - console.log('Starting PR comment update...'); - const has_changes = '${{ steps.deploy.outputs.has_changes }}' === 'true'; - const preview_url = `https://${context.repo.owner}.github.io/${context.repo.repo}/pr-previews/pr-${context.payload.pull_request.number}/`; - const message = `📚 Documentation preview will be available at: ${preview_url} - - Status: ${has_changes ? '✅ Preview is ready!' 
: '🔄 No changes in documentation since last update'}`; - - const comments = await github.rest.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.payload.pull_request.number, - }); - - const docComment = comments.data.find(comment => - comment.user.login === 'github-actions[bot]' && - comment.body.includes('Documentation preview will be available at:') - ); - - if (docComment) { - await github.rest.issues.updateComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: docComment.id, - body: message - }); - } else { - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.payload.pull_request.number, - body: message - }); - } - - - name: Cleanup preview - if: github.event.action == 'closed' - run: | - git config --global user.name "GitHub Actions" - git config --global user.email "actions@github.com" - - # Checkout gh-pages branch - git restore uv.lock - git fetch origin gh-pages --depth=1 - git checkout gh-pages - - # Remove the preview directory for this PR - rm -rf pr-previews/pr-${{ github.event.pull_request.number }} - - # Commit and push if there are changes - git add pr-previews - git diff --staged --quiet || (git commit -m "Remove preview for PR #${{ github.event.pull_request.number }}" && git push origin gh-pages) - - # Optional job to cleanup old PR previews - cleanup-old-previews: - if: github.event_name == 'push' && github.ref == 'refs/heads/develop' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - ref: gh-pages - - - name: Get closed PRs - id: closed-prs - uses: actions/github-script@v7 - with: - script: | - const prs = await github.rest.pulls.list({ - owner: context.repo.owner, - repo: context.repo.repo, - state: 'closed' - }); - return prs.data.map(pr => pr.number); - result-encoding: string - - - name: Cleanup old previews - run: | - git config --global user.name "GitHub Actions" - git config --global user.email "actions@github.com" - - if [ -d "pr-previews" ]; then - cd pr-previews - for preview in pr-*; do - if [ -d "$preview" ]; then - PR_NUM=$(echo $preview | sed 's/pr-//') - if ! echo "${{ steps.closed-prs.outputs.result }}" | grep -q "$PR_NUM"; then - rm -rf "$preview" - echo "Removed old preview: $preview" - fi - fi - done - - git add . - git diff --staged --quiet || (git commit -m "Cleanup old PR previews" && git push origin gh-pages) - else - echo "No preview directores found. Nothing to clean up." - fi - - diff --git a/CHANGELOG.md b/CHANGELOG.md index 21fb8ea5..5aeaf2b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
- interpolate {datetime} in if sel includes {dim}={datetime} ([#78](https://github.com/developmentseed/titiler-cmr/pull/78)) - /compatibility and /concept_metadata endpoints ([#80](https://github.com/developmentseed/titiler-cmr/pull/80)) +- add dataset statistics to /compatibility endpoint output for xarray datasets ([#80](https://github.com/developmentseed/titiler-cmr/pull/82)) ## [v0.2.0] diff --git a/docs/examples/rasterio_backend_example.ipynb b/docs/examples/rasterio_backend_example.ipynb index 226922c1..451c1c98 100644 --- a/docs/examples/rasterio_backend_example.ipynb +++ b/docs/examples/rasterio_backend_example.ipynb @@ -27,9 +27,10 @@ }, "outputs": [], "source": [ + "import json\n", + "\n", "import earthaccess\n", "import httpx\n", - "import json\n", "\n", "from folium import Map, TileLayer" ] @@ -76,9 +77,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Examine a granule\n", + "## Explore the collection using the `/compatibility` endpoint\n", "\n", - "Each granule contains the data for a single point in time for an MGRS tile. " + "The `/compatibility` endpoint will display information about the collection and return some details about a sample granule. The output is helpful for understanding the structure of the collection and the granules so that you can craft the right set of parameters for visualization or statistics requests." ] }, { @@ -87,27 +88,20 @@ "metadata": {}, "outputs": [], "source": [ - "import earthaccess\n", - "import morecantile\n", - "\n", - "tms = morecantile.tms.get(\"WebMercatorQuad\")\n", - "\n", - "bounds = tms.bounds(62, 44, 7)\n", - "xmin, ymin, xmax, ymax = (round(n, 8) for n in bounds)\n", - "concept_id = \"C2021957295-LPCLOUD\"\n", + "compatibility_response = httpx.get(\n", + " f\"{titiler_endpoint}/compatibility\",\n", + " params={\"concept_id\": concept_id},\n", + " timeout=None,\n", + ").json()\n", "\n", - "results = earthaccess.search_data(\n", - " bounding_box=(xmin, ymin, xmax, ymax),\n", - " count=1,\n", - " concept_id=concept_id,\n", - " temporal=(\"2024-02-11\", \"2024-02-13\"),\n", - ")\n", - "print(\"Granules:\")\n", - "print(results)\n", - "print()\n", - "print(\"Example of COGs URL: \")\n", - "for link in results[0].data_links(access=\"direct\"):\n", - " print(link)" + "print(json.dumps(compatibility_response, indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The details from the sample granule show that it has 18 assets (you would need to look more into what each of the assets represents). To properly configure the assets for titiler-cmr we will need to use the `bands_regex` parameter to identify the bands that we want to be available for visualizations. The `datetime` key shows the reported temporal range from CMR which indicates that the dataset has granules from `2015-11-28` to present." 
] }, { @@ -138,7 +132,7 @@ " temporal=(\"2024-02-11\", \"2024-02-13\"),\n", " )\n", "\n", - "print(assets[0])" + "print(json.dumps(assets[0], indent=2))" ] }, { diff --git a/docs/examples/xarray_backend_example.ipynb b/docs/examples/xarray_backend_example.ipynb index 8cc3f8a6..ebbf8163 100644 --- a/docs/examples/xarray_backend_example.ipynb +++ b/docs/examples/xarray_backend_example.ipynb @@ -30,7 +30,6 @@ "\n", "import earthaccess\n", "import httpx\n", - "import xarray as xr\n", "from folium import Map, TileLayer\n", "\n", "# titiler_endpoint = \"http://localhost:8081\" # docker network endpoint\n", @@ -73,9 +72,9 @@ "id": "2a4cffa6-0059-4033-a708-db60d743f0e3", "metadata": {}, "source": [ - "## Examine a granule\n", + "## Explore the collection using the `/compatibility` endpoint\n", "\n", - "Each granule contains a single day record for the entire globe and has a single data file. " + "The `/compatibility` endpoint will display information about the collection and return some details about a sample granule. The output is helpful for understanding the structure of the collection and the granules so that you can craft the right set of parameters for visualization or statistics requests." ] }, { @@ -85,47 +84,21 @@ "metadata": {}, "outputs": [], "source": [ - "results = earthaccess.search_data(\n", - " count=1,\n", - " concept_id=concept_id,\n", - " temporal=(\"2024-10-12\", \"2024-10-13\"),\n", - ")\n", - "print(\"Granules:\")\n", - "print(results)\n", - "print()\n", - "print(\"Example of NetCDF URL: \")\n", - "for link in results[0].data_links(access=\"external\"):\n", - " print(link)" - ] - }, - { - "cell_type": "markdown", - "id": "eaa3f378-95fa-4c5a-9ccb-24b3064fb5a7", - "metadata": {}, - "source": [ - "## Explore the available variables\n", + "compatibility_response = httpx.get(\n", + " f\"{titiler_endpoint}/compatibility\",\n", + " params={\"concept_id\": concept_id},\n", + " timeout=None,\n", + ").json()\n", "\n", - "The NetCDF file can be opened with xarray using the `h5netcdf` engine. When running outside of AWS region us-west-2 you will need to access the data using \"external\" `https` links (rather than \"direct\" `s3` links). Those links will require authentication which is handled by `earthaccess` as long as you have your Earthdata credentials stored in the `~/.netrc` file!" + "print(json.dumps(compatibility_response, indent=2))" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "61ec4071-bf37-421f-bf58-ac399f827052", + "cell_type": "markdown", + "id": "04014a32-9c11-4b75-b40a-e5ad4efd686b", "metadata": {}, - "outputs": [], "source": [ - "fs = earthaccess.get_fsspec_https_session()\n", - "\n", - "ds = xr.open_dataset(\n", - " fs.open(results[0].data_links(access=\"external\")[0]),\n", - " engine=\"h5netcdf\",\n", - ")\n", - "print(\"Data Variables:\")\n", - "for var in ds.data_vars:\n", - " print(str(var))\n", - "\n", - "display(ds)" + "The details from the sample granule show that it is a NetCDF file with four variables (`analysed_sst`, `analysis_error`, `mask`, and `sea_ice_fraction`) and each contains an array with a single time coordinate. The `datetime` key shows the reported temporal range from CMR which indicates that the dataset has granules from `2002-05-31` to present. For each variable several summary statistics are available to help you craft min/max values for the `rescale` parameter." 
] }, { diff --git a/tests/test_compatibility.py b/tests/test_compatibility.py index 118231de..ba1d2e9c 100644 --- a/tests/test_compatibility.py +++ b/tests/test_compatibility.py @@ -2,6 +2,7 @@ from unittest.mock import MagicMock, patch +import numpy as np import pytest from fastapi import HTTPException @@ -24,14 +25,14 @@ def test_extract_basic_metadata(self): # Mock data variables mock_var = MagicMock() mock_var.shape = (365, 1800, 3600) - mock_var.dtype = "float32" + mock_var.dtype = np.dtype("float32") mock_ds.data_vars = ["temperature"] mock_ds.__getitem__ = lambda self, key: mock_var # Mock coordinates mock_coord = MagicMock() mock_coord.size = 365 - mock_coord.dtype.kind = "f" + mock_coord.dtype = np.dtype("float64") mock_coord.min.return_value = 0.0 mock_coord.max.return_value = 364.0 @@ -61,14 +62,14 @@ def test_extract_metadata_with_non_numeric_coord(self): # Mock data variables mock_var = MagicMock() mock_var.shape = (10,) - mock_var.dtype = "float32" + mock_var.dtype = np.dtype("float32") mock_ds.data_vars = ["data"] mock_ds.__getitem__ = lambda self, key: mock_var # Mock string coordinate (no min/max) mock_coord = MagicMock() mock_coord.size = 10 - mock_coord.dtype.kind = "U" # Unicode string + mock_coord.dtype = np.dtype("U10") # Unicode string # Create a proper mock for coords mock_coords = MagicMock() @@ -114,7 +115,7 @@ def test_xarray_success(self, mock_xarray_open, mock_backend): mock_ds = MagicMock() mock_var = MagicMock() mock_var.shape = (10, 20) - mock_var.dtype = "float32" + mock_var.dtype = np.dtype("float32") mock_ds.data_vars = ["temp"] mock_ds.__getitem__ = lambda self, key: mock_var diff --git a/titiler/cmr/compatibility.py b/titiler/cmr/compatibility.py index 37fee738..59467ed4 100644 --- a/titiler/cmr/compatibility.py +++ b/titiler/cmr/compatibility.py @@ -2,6 +2,7 @@ from typing import Any, Dict, List, Literal, Optional +import numpy as np from fastapi import HTTPException from pydantic import BaseModel from rio_tiler.constants import WEB_MERCATOR_TMS @@ -18,35 +19,120 @@ from titiler.xarray.io import Reader as XarrayReader +class VariableInfo(BaseModel): + """Metadata for a single xarray variable""" + + shape: List[int] + dtype: str + min: Optional[float] = None + max: Optional[float] = None + mean: Optional[float] = None + p01: Optional[float] = None + p05: Optional[float] = None + p95: Optional[float] = None + p99: Optional[float] = None + + +class CoordinateInfo(BaseModel): + """Metadata for a single xarray coordinate""" + + size: int + dtype: str + min: Optional[float] = None + max: Optional[float] = None + + class CompatibilityResponse(BaseModel): """Compatibility endpoint response model""" concept_id: ConceptID backend: Literal["rasterio", "xarray"] datetime: List[Dict[str, Any]] - variables: Optional[Dict[str, Dict[str, Any]]] = None + variables: Optional[Dict[str, VariableInfo]] = None dimensions: Optional[Dict[str, int]] = None - coordinates: Optional[Dict[str, Dict[str, Any]]] = None + coordinates: Optional[Dict[str, CoordinateInfo]] = None example_assets: Optional[Dict[str, str] | str] = None sample_asset_raster_info: Optional[Info] = None -def extract_xarray_metadata(ds: Any) -> Dict[str, Any]: +def extract_xarray_metadata( + ds: Any, max_sample_size: float = 100_000.0 +) -> Dict[str, Any]: """Extract comprehensive metadata from an xarray Dataset. + For large arrays, uses sampling along each dimension to avoid memory issues. + Args: ds: xarray Dataset instance + max_sample_size: Maximum number of elements to sample for statistics. 
+ Arrays larger than this will be sampled. Default: 1,000,000 Returns: Dictionary containing variables, dimensions, and coordinates metadata """ - variables = { - var: { + variables = {} + for var in ds.data_vars: + var_info: Dict[str, Any] = { "shape": list(ds[var].shape), "dtype": str(ds[var].dtype), } - for var in ds.data_vars - } + + if ds[var].dtype.kind in ["i", "f", "u"]: + try: + var_data = ds[var] + total_size = var_data.size + + # Use sampling for large arrays to avoid memory issues + if total_size > max_sample_size: + # Calculate exact sample size per dimension to stay within budget + indexers = {} + actual_sample_size = 1 + remaining_budget = max_sample_size + + for i, dim in enumerate(var_data.dims): + dim_size = var_data.sizes[dim] + # Distribute budget across remaining dimensions + dims_remaining = len(var_data.dims) - i + samples_per_dim = int( + remaining_budget ** (1.0 / dims_remaining) + ) + sample_size = min(dim_size, max(1, samples_per_dim)) + + # Random sample of indices along this dimension + indices = np.sort( + np.random.choice(dim_size, size=sample_size, replace=False) + ) + indexers[dim] = indices + actual_sample_size *= sample_size + remaining_budget = max_sample_size / actual_sample_size + + # Sample using integer indexing (efficient with chunked data) + sampled = var_data.isel(indexers) + values = sampled.values + + logger.info( + f"Sampled {actual_sample_size:,} of {total_size:,} elements " + f"from variable '{var}' for statistics" + ) + else: + # Load entire array for smaller datasets + values = var_data.values + + var_info["min"] = float(np.nanmin(values)) + var_info["max"] = float(np.nanmax(values)) + var_info["mean"] = float(np.nanmean(values)) + + # Calculate multiple percentiles in a single pass, filtering out NaNs + p01, p05, p95, p99 = np.nanpercentile(values, [1, 5, 95, 99]) + var_info["p01"] = float(p01) + var_info["p05"] = float(p05) + var_info["p95"] = float(p95) + var_info["p99"] = float(p99) + except Exception: + # Skip statistics if computation fails (e.g., too large, all NaN values) + pass + + variables[var] = var_info coordinates = {} for coord, coord_data in ds.coords.items(): diff --git a/titiler/cmr/factory.py b/titiler/cmr/factory.py index ef1e8eba..a39f56e1 100644 --- a/titiler/cmr/factory.py +++ b/titiler/cmr/factory.py @@ -482,15 +482,22 @@ def concept_metadata_endpoint( "**Returns:**\n" "- `backend`: Which reader to use ('xarray' or 'rasterio')\n" "- `datetime`: Temporal extent(s) of the dataset\n" - "- `variables`: Available data variables with shape and dtype (xarray only)\n" + "- `variables`: Available data variables with shape, dtype, and statistics (xarray only)\n" + " - For numeric variables, includes: min, max, mean, and percentiles (p01, p05, p95, p99)\n" + " - These statistics are computed from the sample asset to characterize the data distribution\n" "- `dimensions`: Dimension names and sizes (xarray only)\n" - "- `coordinates`: Coordinate information with ranges (xarray only)\n" - "- `example_assets`: Sample data URL from the collection\n\n" + "- `coordinates`: Coordinate information with value ranges (xarray only)\n" + "- `example_assets`: Sample data URL from the collection\n" + "- `sample_asset_raster_info`: Rasterio metadata for sample asset (rasterio only)\n\n" "**Use this information to:**\n" "- Set `backend` parameter for /tiles, /bbox, and /feature endpoints\n" "- Choose valid `variable` names for xarray datasets\n" "- Identify available dimensions for selection/interpolation\n" "- Determine temporal coverage for 
time-based queries\n" + "- **Parameterize visualization requests** using variable statistics:\n" + " - Set `rescale` parameter based on min/max or percentile ranges (e.g., `rescale={p01},{p99}` for contrast stretching)\n" + " - Choose appropriate colormaps based on data range and distribution\n" + " - Avoid clipping by understanding the actual data value range before rendering\n" + "- Understand data structure before making tile/image requests" ), response_model_exclude_none=True,
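For example, a client can feed these statistics back into a visualization request. A minimal sketch, assuming a running titiler-cmr instance and an xarray-backed collection such as the GHRSST example from the notebook; the endpoint URL and concept ID below are placeholders, and the variable name follows the notebook example:

import httpx

titiler_endpoint = "http://localhost:8081"  # placeholder, as in the example notebooks
concept_id = "C0000000000-PROVIDER"  # placeholder: use a real CMR collection concept ID

# Ask /compatibility for collection metadata, including the new per-variable statistics
compat = httpx.get(
    f"{titiler_endpoint}/compatibility",
    params={"concept_id": concept_id},
    timeout=None,
).json()

# For numeric variables the response includes min/max/mean and p01/p05/p95/p99
# computed from a sample asset (keys may be absent if statistics could not be computed)
stats = compat["variables"]["analysed_sst"]
low = stats.get("p01", stats.get("min"))
high = stats.get("p99", stats.get("max"))

# Stretch between the 1st and 99th percentiles for contrast; pass this string as the
# `rescale` query parameter on subsequent tile/image requests
rescale = f"{low},{high}"
print(rescale)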