From ba7c9e42d8d33e15e0b527aef34a1c39d32ba60d Mon Sep 17 00:00:00 2001 From: Kyle Lesinger Date: Mon, 16 Mar 2026 16:07:35 -0500 Subject: [PATCH] add workflow for changing event name and adding to metadata --- Jupyterhub/extract_eventname_metadata.ipynb | 950 ++++++++++++++++++++ _quarto.yml | 2 + 2 files changed, 952 insertions(+) create mode 100644 Jupyterhub/extract_eventname_metadata.ipynb diff --git a/Jupyterhub/extract_eventname_metadata.ipynb b/Jupyterhub/extract_eventname_metadata.ipynb new file mode 100644 index 0000000..50afc26 --- /dev/null +++ b/Jupyterhub/extract_eventname_metadata.ipynb @@ -0,0 +1,950 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "title: S3 GeoTIFF Event Metadata Extractor\n", + "description: This notebook processes GeoTIFF files from S3 that contain activation events in their filenames and extracts event metadata.\n", + "author: \n", + " - NASA Disasters Team\n", + "date: March 16, 2026\n", + "execute:\n", + " freeze: true\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook processes GeoTIFF files from S3 that contain activation events in their filenames.\n", + "\n", + "**Workflow:**\n", + "\n", + "1. Read files from AWS S3\n", + "2. Detect activation event pattern: `YYYYMM___*.tif`\n", + "3. Add event metadata to GeoTIFF attributes\n", + "4. Move file to new S3 location\n", + "5. Optionally delete from original location" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Set your S3 bucket names and paths here." + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "# S3 Configuration\n", + "SOURCE_BUCKET = \"nasa-disasters\"\n", + "SOURCE_PREFIX = \"drcs_activations_new/Landsat/burnRatio\" # Path within source bucket\n", + "\n", + "DESTINATION_BUCKET = \"nasa-disasters-dev\"\n", + "DESTINATION_PREFIX = \"ProgramData/Landsat/burnRatio/\" # Path within destination bucket (must have / at the very end of it!!!)\n", + "\n", + "# AWS Region\n", + "AWS_REGION = \"us-west-2\"\n", + "\n", + "# Processing options\n", + "DELETE_AFTER_MOVE = False # Set to True to delete original files after successful processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Libraries" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Required Libraries\n", + "\n", + "Before running this notebook, install the required Python libraries:\n", + "\n", + "```bash\n", + "# Install core dependencies\n", + "pip install boto3 rasterio\n", + "\n", + "# Install GDAL (required for COG operations)\n", + "# On macOS with Homebrew:\n", + "brew install gdal\n", + "pip install gdal\n", + "\n", + "# On Ubuntu/Debian:\n", + "sudo apt-get install gdal-bin libgdal-dev\n", + "pip install gdal==$(gdal-config --version)\n", + "\n", + "# On conda:\n", + "conda install -c conda-forge gdal\n", + "\n", + "# Install rio-cogeo for COG validation\n", + "pip install rio-cogeo\n", + "```\n", + "\n", + "**Note:** GDAL can be tricky to install. If you encounter issues, use conda instead of pip for all geospatial libraries:\n", + "\n", + "```bash\n", + "conda install -c conda-forge boto3 rasterio gdal rio-cogeo\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Libraries imported successfully\n" + ] + } + ], + "source": [ + "import boto3\n", + "import re\n", + "import io\n", + "from typing import Dict, Optional, Tuple, List\n", + "import rasterio\n", + "from rasterio.io import MemoryFile\n", + "from datetime import datetime\n", + "from osgeo import gdal\n", + "from rio_cogeo.cogeo import cog_validate, cog_translate\n", + "from rio_cogeo.profiles import cog_profiles\n", + "\n", + "# Enable GDAL exceptions\n", + "gdal.UseExceptions()\n", + "\n", + "print(\"Libraries imported successfully\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## AWS Session and Role Assumption\n", + "\n", + "Assume the current role that's already active in the account." + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current AWS Identity:\n", + " Account: 515966502221\n", + " ARN: arn:aws:sts::515966502221:assumed-role/disasters-prod/botocore-session-1773693036\n", + " User ID: AROAXQIQAAVGWA2LHXCPW:botocore-session-1773693036\n", + "\n", + "S3 client created successfully\n" + ] + } + ], + "source": [ + "# Create boto3 session using current credentials\n", + "session = boto3.Session(region_name=AWS_REGION)\n", + "\n", + "# Get STS client to verify current identity\n", + "sts_client = session.client('sts')\n", + "identity = sts_client.get_caller_identity()\n", + "\n", + "print(f\"Current AWS Identity:\")\n", + "print(f\" Account: {identity['Account']}\")\n", + "print(f\" ARN: {identity['Arn']}\")\n", + "print(f\" User ID: {identity['UserId']}\")\n", + "\n", + "# Create S3 client\n", + "s3_client = session.client('s3')\n", + "\n", + "print(\"\\nS3 client created successfully\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Helper Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Helper functions defined successfully\n" + ] + } + ], + "source": [ + "def detect_activation_event(filename: str) -> Optional[Dict[str, str]]:\n", + " \"\"\"\n", + " Detect activation event pattern in filename.\n", + " \n", + " Pattern: YYYYMM___*.tif\n", + " Example: 202401_EMSR123_FLOOD_extent.tif\n", + " \n", + " Args:\n", + " filename: The filename to check\n", + " \n", + " Returns:\n", + " Dictionary with event metadata if pattern matches, None otherwise\n", + " \"\"\"\n", + " # Pattern: YYYYMM___*.tif\n", + " pattern = r'^(\\d{6})_([A-Za-z0-9]+)_([A-Za-z0-9]+)_(.*)\\.tif$'\n", + " \n", + " match = re.match(pattern, filename)\n", + " \n", + " if match:\n", + " year_month, hazard, location, remainder = match.groups()\n", + " \n", + " return {\n", + " 'year_month': year_month,\n", + " 'hazard': hazard,\n", + " 'location': location,\n", + " 'remainder': remainder,\n", + " 'original_filename': filename\n", + " }\n", + " \n", + " return None\n", + "\n", + "\n", + "def list_s3_files(bucket: str, prefix: str, pattern: str = '.tif') -> List[str]:\n", + " \"\"\"\n", + " List all files in S3 bucket matching the pattern.\n", + " \n", + " Args:\n", + " bucket: S3 bucket name\n", + " prefix: Prefix (folder path) within bucket\n", + " pattern: File extension or pattern to match\n", + " \n", + " Returns:\n", + " List of S3 keys\n", + " \"\"\"\n", + " files = []\n", + " paginator = s3_client.get_paginator('list_objects_v2')\n", + " \n", + " for page in paginator.paginate(Bucket=bucket, Prefix=prefix):\n", + " if 'Contents' in page:\n", + " for obj in page['Contents']:\n", + " key = obj['Key']\n", + " if pattern in key:\n", + " files.append(key)\n", + " \n", + " return files\n", + "\n", + "\n", + "def read_geotiff_from_s3(bucket: str, key: str) -> Tuple[bytes, dict]:\n", + " \"\"\"\n", + " Read GeoTIFF file from S3 into memory.\n", + " \n", + " Args:\n", + " bucket: S3 bucket name\n", + " key: S3 object key\n", + " \n", + " Returns:\n", + " Tuple of (file bytes, rasterio metadata)\n", + " \"\"\"\n", + " # Download file to memory\n", + " obj = s3_client.get_object(Bucket=bucket, Key=key)\n", + " file_bytes = obj['Body'].read()\n", + " \n", + " # Read metadata with rasterio\n", + " with MemoryFile(file_bytes) as memfile:\n", + " with memfile.open() as dataset:\n", + " metadata = dataset.meta.copy()\n", + " tags = dataset.tags()\n", + " \n", + " return file_bytes, metadata, tags\n", + "\n", + "\n", + "def validate_cog(file_bytes: bytes, filename: str = \"temp.tif\") -> Tuple[bool, dict]:\n", + " \"\"\"\n", + " Validate if a GeoTIFF is a valid Cloud-Optimized GeoTIFF (COG).\n", + " \n", + " Args:\n", + " file_bytes: GeoTIFF file bytes\n", + " filename: Optional filename for display purposes\n", + " \n", + " Returns:\n", + " Tuple of (is_valid: bool, validation_info: dict)\n", + " \"\"\"\n", + " vsimem_path = f'/vsimem/{filename}'\n", + " \n", + " try:\n", + " # Write to virtual memory\n", + " gdal.FileFromMemBuffer(vsimem_path, file_bytes)\n", + " \n", + " # Validate COG structure\n", + " is_valid, errors, warnings = cog_validate(vsimem_path)\n", + " \n", + " # Get detailed info\n", + " ds = gdal.Open(vsimem_path)\n", + " if ds:\n", + " metadata = {\n", + " 'is_cog': is_valid,\n", + " 'errors': errors,\n", + " 'warnings': warnings,\n", + " 'width': ds.RasterXSize,\n", + " 'height': ds.RasterYSize,\n", + " 'bands': ds.RasterCount,\n", + " 'compression': ds.GetMetadataItem('COMPRESSION', 'IMAGE_STRUCTURE') or 'None',\n", + " 'tiled': ds.GetMetadataItem('INTERLEAVE', 'IMAGE_STRUCTURE') == 'PIXEL',\n", + " 'blocksize': (ds.GetRasterBand(1).GetBlockSize() if ds.RasterCount > 0 else (None, None)),\n", + " 'overviews': ds.GetRasterBand(1).GetOverviewCount() if ds.RasterCount > 0 else 0\n", + " }\n", + " ds = None\n", + " else:\n", + " metadata = {'is_cog': False, 'errors': ['Could not open file'], 'warnings': []}\n", + " \n", + " # Clean up\n", + " gdal.Unlink(vsimem_path)\n", + " \n", + " return is_valid, metadata\n", + " \n", + " except Exception as e:\n", + " gdal.Unlink(vsimem_path)\n", + " return False, {'is_cog': False, 'errors': [str(e)], 'warnings': []}\n", + "\n", + "\n", + "def add_metadata_and_create_cog(file_bytes: bytes, event_metadata: Dict[str, str]) -> bytes:\n", + " \"\"\"\n", + " Rebuild GeoTIFF as a valid COG with metadata using rio-cogeo library.\n", + " \n", + " Preserves all original compression settings (compression type, level, predictor)\n", + " from the source file and only adds activation event metadata.\n", + " \n", + " Args:\n", + " file_bytes: Original GeoTIFF file bytes\n", + " event_metadata: Dictionary with event information\n", + " \n", + " Returns:\n", + " Modified GeoTIFF file bytes as a valid COG with original compression settings\n", + " \"\"\"\n", + " input_path = '/vsimem/input.tif'\n", + " output_path = '/vsimem/output_cog.tif'\n", + " \n", + " try:\n", + " # Write input to virtual memory\n", + " gdal.FileFromMemBuffer(input_path, file_bytes)\n", + " \n", + " # Read original file settings with rasterio\n", + " print(\" Reading original file settings...\")\n", + " with MemoryFile(file_bytes) as memfile:\n", + " with memfile.open() as src:\n", + " original_profile = src.profile.copy()\n", + " \n", + " # Extract compression settings from original file\n", + " compress = original_profile.get('compress', 'ZSTD')\n", + " predictor = original_profile.get('predictor', None)\n", + " \n", + " # Get compression level if available\n", + " compress_level = None\n", + " if 'zstd_level' in original_profile:\n", + " compress_level = original_profile['zstd_level']\n", + " elif 'zlevel' in original_profile: # For DEFLATE compression\n", + " compress_level = original_profile['zlevel']\n", + " \n", + " print(f\" Original settings: compress={compress}, predictor={predictor}, level={compress_level}\")\n", + " print(\" Rebuilding as COG (preserving original compression settings)...\")\n", + " \n", + " # Prepare metadata tags\n", + " tags = {\n", + " 'ACTIVATION_EVENT': f\"{event_metadata['year_month']}_{event_metadata['hazard']}_{event_metadata['location']}\",\n", + " 'YEAR_MONTH': event_metadata['year_month'],\n", + " 'HAZARD': event_metadata['hazard'],\n", + " 'LOCATION': event_metadata['location'],\n", + " 'PROCESSING_DATE': datetime.utcnow().isoformat(),\n", + " 'PROCESSOR': 'S3 GeoTIFF COG Processor v1.0'\n", + " }\n", + " \n", + " # Create COG profile using original compression settings\n", + " cog_profile = {\n", + " 'driver': 'GTiff',\n", + " 'interleave': 'pixel',\n", + " 'tiled': True,\n", + " 'blockxsize': 512,\n", + " 'blockysize': 512,\n", + " 'compress': compress\n", + " }\n", + " \n", + " # Add compression level if present\n", + " if compress_level is not None:\n", + " if compress.upper() == 'ZSTD':\n", + " cog_profile['zstd_level'] = compress_level\n", + " elif compress.upper() in ['DEFLATE', 'LZW']:\n", + " cog_profile['zlevel'] = compress_level\n", + " \n", + " # Add predictor if present in original\n", + " if predictor is not None:\n", + " cog_profile['predictor'] = predictor\n", + " print(f\" Using original predictor: {predictor}\")\n", + " \n", + " # Use cog_translate to create proper COG\n", + " cog_translate(\n", + " source=input_path,\n", + " dst_path=output_path,\n", + " dst_kwargs=cog_profile,\n", + " add_mask=False,\n", + " overview_level=4,\n", + " overview_resampling='average',\n", + " web_optimized=True,\n", + " additional_cog_metadata=tags,\n", + " quiet=False\n", + " )\n", + " \n", + " print(\" COG creation complete with embedded metadata\")\n", + " \n", + " # Read output from virtual memory\n", + " vsi_file = gdal.VSIFOpenL(output_path, 'rb')\n", + " if not vsi_file:\n", + " raise Exception(\"Could not read output COG from virtual memory\")\n", + " \n", + " gdal.VSIFSeekL(vsi_file, 0, 2) # Seek to end\n", + " file_size = gdal.VSIFTellL(vsi_file)\n", + " gdal.VSIFSeekL(vsi_file, 0, 0) # Seek to start\n", + " \n", + " output_bytes = gdal.VSIFReadL(1, file_size, vsi_file)\n", + " gdal.VSIFCloseL(vsi_file)\n", + " \n", + " # Clean up virtual files\n", + " gdal.Unlink(input_path)\n", + " gdal.Unlink(output_path)\n", + " \n", + " print(f\" Output size: {len(output_bytes):,} bytes\")\n", + " \n", + " return output_bytes\n", + " \n", + " except Exception as e:\n", + " # Print full traceback for debugging\n", + " import traceback\n", + " print(f\"\\n ERROR DETAILS:\")\n", + " traceback.print_exc()\n", + " \n", + " # Clean up on error\n", + " try:\n", + " gdal.Unlink(input_path)\n", + " except:\n", + " pass\n", + " try:\n", + " gdal.Unlink(output_path)\n", + " except:\n", + " pass\n", + " \n", + " raise Exception(f\"COG creation failed: {str(e)}\")\n", + "\n", + "\n", + "def upload_to_s3(file_bytes: bytes, bucket: str, key: str) -> bool:\n", + " \"\"\"\n", + " Upload file bytes to S3.\n", + " \n", + " Args:\n", + " file_bytes: File content as bytes\n", + " bucket: Destination S3 bucket\n", + " key: Destination S3 key\n", + " \n", + " Returns:\n", + " True if successful\n", + " \"\"\"\n", + " try:\n", + " s3_client.put_object(\n", + " Bucket=bucket,\n", + " Key=key,\n", + " Body=file_bytes,\n", + " ContentType='image/tiff'\n", + " )\n", + " return True\n", + " except Exception as e:\n", + " print(f\"Error uploading to S3: {e}\")\n", + " return False\n", + "\n", + "\n", + "def delete_from_s3(bucket: str, key: str) -> bool:\n", + " \"\"\"\n", + " Delete file from S3.\n", + " \n", + " Args:\n", + " bucket: S3 bucket name\n", + " key: S3 object key\n", + " \n", + " Returns:\n", + " True if successful\n", + " \"\"\"\n", + " try:\n", + " s3_client.delete_object(Bucket=bucket, Key=key)\n", + " return True\n", + " except Exception as e:\n", + " print(f\"Error deleting from S3: {e}\")\n", + " return False\n", + "\n", + "\n", + "print(\"Helper functions defined successfully\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test Pattern Detection\n", + "\n", + "Test the filename pattern detection with some examples." + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing pattern detection:\n", + "\n", + "✓ 202302_Earthquake_Turkiye_Turkey_UNW_2022-04-06_to_2023-02-08_day.tif\n", + " Year/Month: 202302\n", + " Location: Turkiye\n", + " Hazard: Earthquake\n", + " Remainder: Turkey_UNW_2022-04-06_to_2023-02-08_day\n", + "\n" + ] + } + ], + "source": [ + "# Test cases\n", + "test_filenames = [\n", + " \"202302_Earthquake_Turkiye_Turkey_UNW_2022-04-06_to_2023-02-08_day.tif\"\n", + "]\n", + "\n", + "print(\"Testing pattern detection:\\n\")\n", + "for filename in test_filenames:\n", + " result = detect_activation_event(filename)\n", + " if result:\n", + " print(f\"✓ {filename}\")\n", + " print(f\" Year/Month: {result['year_month']}\")\n", + " print(f\" Location: {result['location']}\")\n", + " print(f\" Hazard: {result['hazard']}\")\n", + " print(f\" Remainder: {result['remainder']}\")\n", + " else:\n", + " print(f\"✗ {filename} - No match\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List Files in Source Bucket" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Listing files in s3://nasa-disasters/drcs_activations_new/Landsat/burnRatio\n", + "\n", + "Found 1 .tif files:\n", + " ✓ drcs_activations_new/Landsat/burnRatio/202501_Fire_CA_LC09_NBR_182831_041036_2025-01-14_day.tif - Location: CA, Hazard: Fire\n" + ] + } + ], + "source": [ + "# List all .tif files in source bucket\n", + "print(f\"Listing files in s3://{SOURCE_BUCKET}/{SOURCE_PREFIX}\\n\")\n", + "\n", + "tif_files = list_s3_files(SOURCE_BUCKET, SOURCE_PREFIX, '.tif')\n", + "\n", + "print(f\"Found {len(tif_files)} .tif files:\")\n", + "for file_key in tif_files:\n", + " filename = file_key.split('/')[-1]\n", + " event_info = detect_activation_event(filename)\n", + " \n", + " if event_info:\n", + " print(f\" ✓ {file_key} - Location: {event_info['location']}, Hazard: {event_info['hazard']}\")\n", + " else:\n", + " print(f\" ✗ {file_key} - No activation event pattern\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Process Files\n", + "\n", + "Main processing loop: detect activation events, add metadata, move to destination." + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "================================================================================\n", + "Starting batch processing\n", + "================================================================================\n", + "\n", + "Processing: 202501_Fire_CA_LC09_NBR_182831_041036_2025-01-14_day.tif\n", + " Location: CA, Hazard: Fire\n", + " Reading from S3...\n", + " Size: 158,238,568 bytes\n", + " Dimensions: 8554x7185\n", + " CRS: EPSG:4326\n", + " Validating input COG structure...\n", + " ✓ Input is valid COG\n", + " Compression: ZSTD\n", + " Tile size: [512, 512]\n", + " Overviews: 4 levels\n", + " Creating COG with metadata and ZSTD compression...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1272/2684847193.py:178: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).\n", + " 'PROCESSING_DATE': datetime.utcnow().isoformat(),\n", + "Reading input: /vsimem/input.tif\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Reading original file settings...\n", + " Original settings: compress=zstd, predictor=None, level=None\n", + " Rebuilding as COG (preserving original compression settings)...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Adding overviews...\n", + "Updating dataset tags...\n", + "Writing output to: /vsimem/output_cog.tif\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " COG creation complete with embedded metadata\n", + " Output size: 150,638,215 bytes\n", + " Output size: 150,638,215 bytes\n", + " Size change: -4.8%\n", + " Validating output COG structure...\n", + " ✓ Output is valid COG\n", + " Compression: ZSTD\n", + " Tile size: [512, 512]\n", + " Overviews: 4 levels\n", + " Uploading to s3://nasa-disasters-dev/ProgramData/Landsat/burnRatio/202501_Fire_CA_LC09_NBR_182831_041036_2025-01-14_day.tif...\n", + " ✓ Upload successful\n", + "\n", + "================================================================================\n", + "Processing Complete\n", + "================================================================================\n" + ] + } + ], + "source": [ + "def process_file(source_key: str) -> Dict[str, any]:\n", + " \"\"\"\n", + " Process a single GeoTIFF file.\n", + " \n", + " Returns:\n", + " Dictionary with processing results\n", + " \"\"\"\n", + " filename = source_key.split('/')[-1]\n", + " result = {\n", + " 'filename': filename,\n", + " 'source_key': source_key,\n", + " 'success': False,\n", + " 'message': ''\n", + " }\n", + " \n", + " # Step 1: Detect activation event pattern\n", + " event_metadata = detect_activation_event(filename)\n", + " \n", + " if not event_metadata:\n", + " result['message'] = 'No activation event pattern detected - skipping'\n", + " return result\n", + " \n", + " print(f\"\\nProcessing: {filename}\")\n", + " print(f\" Location: {event_metadata['location']}, Hazard: {event_metadata['hazard']}\")\n", + " \n", + " try:\n", + " # Step 2: Read file from S3\n", + " print(\" Reading from S3...\")\n", + " file_bytes, metadata, tags = read_geotiff_from_s3(SOURCE_BUCKET, source_key)\n", + " print(f\" Size: {len(file_bytes):,} bytes\")\n", + " print(f\" Dimensions: {metadata['width']}x{metadata['height']}\")\n", + " print(f\" CRS: {metadata.get('crs', 'Unknown')}\")\n", + " \n", + " # Step 2.5: Validate input COG status\n", + " print(\" Validating input COG structure...\")\n", + " input_is_cog, input_cog_info = validate_cog(file_bytes, f\"input_{filename}\")\n", + " \n", + " if input_is_cog:\n", + " print(f\" ✓ Input is valid COG\")\n", + " print(f\" Compression: {input_cog_info.get('compression', 'Unknown')}\")\n", + " print(f\" Tile size: {input_cog_info.get('blocksize', (0, 0))}\")\n", + " print(f\" Overviews: {input_cog_info.get('overviews', 0)} levels\")\n", + " else:\n", + " print(f\" ⚠ Input is NOT a valid COG - will convert\")\n", + " if input_cog_info.get('errors'):\n", + " for error in input_cog_info['errors'][:3]: # Show first 3 errors\n", + " print(f\" - {error}\")\n", + " \n", + " # Step 3: Add metadata and create/optimize COG\n", + " print(\" Creating COG with metadata and ZSTD compression...\")\n", + " modified_bytes = add_metadata_and_create_cog(file_bytes, event_metadata)\n", + " print(f\" Output size: {len(modified_bytes):,} bytes\")\n", + " \n", + " size_change = ((len(modified_bytes) - len(file_bytes)) / len(file_bytes)) * 100\n", + " print(f\" Size change: {size_change:+.1f}%\")\n", + " \n", + " # Step 3.5: Validate output COG status\n", + " print(\" Validating output COG structure...\")\n", + " output_is_cog, output_cog_info = validate_cog(modified_bytes, f\"output_{filename}\")\n", + " \n", + " if output_is_cog:\n", + " print(f\" ✓ Output is valid COG\")\n", + " print(f\" Compression: {output_cog_info.get('compression', 'Unknown')}\")\n", + " print(f\" Tile size: {output_cog_info.get('blocksize', (0, 0))}\")\n", + " print(f\" Overviews: {output_cog_info.get('overviews', 0)} levels\")\n", + " result['cog_valid'] = True\n", + " else:\n", + " print(f\" ✗ WARNING: Output is NOT a valid COG!\")\n", + " if output_cog_info.get('errors'):\n", + " for error in output_cog_info['errors'][:3]:\n", + " print(f\" - {error}\")\n", + " result['cog_valid'] = False\n", + " \n", + " # Step 4: Upload to destination\n", + " dest_key = DESTINATION_PREFIX + filename\n", + " print(f\" Uploading to s3://{DESTINATION_BUCKET}/{dest_key}...\")\n", + " \n", + " if upload_to_s3(modified_bytes, DESTINATION_BUCKET, dest_key):\n", + " print(\" ✓ Upload successful\")\n", + " result['destination_key'] = dest_key\n", + " result['success'] = True\n", + " result['message'] = 'Processed successfully'\n", + " result['input_was_cog'] = input_is_cog\n", + " result['output_is_cog'] = output_is_cog\n", + " \n", + " # Step 5: Optionally delete from source\n", + " if DELETE_AFTER_MOVE:\n", + " print(f\" Deleting from source...\")\n", + " if delete_from_s3(SOURCE_BUCKET, source_key):\n", + " print(\" ✓ Deleted from source\")\n", + " result['deleted_from_source'] = True\n", + " else:\n", + " print(\" ✗ Failed to delete from source\")\n", + " result['deleted_from_source'] = False\n", + " else:\n", + " result['message'] = 'Upload to destination failed'\n", + " \n", + " except Exception as e:\n", + " result['message'] = f'Error: {str(e)}'\n", + " print(f\" ✗ Error: {e}\")\n", + " \n", + " return result\n", + "\n", + "\n", + "# Process all files\n", + "print(f\"\\n{'='*80}\")\n", + "print(\"Starting batch processing\")\n", + "print(f\"{'='*80}\")\n", + "\n", + "results = []\n", + "for file_key in tif_files:\n", + " result = process_file(file_key)\n", + " results.append(result)\n", + "\n", + "print(f\"\\n{'='*80}\")\n", + "print(\"Processing Complete\")\n", + "print(f\"{'='*80}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary Report" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Summary statistics\n", + "total_files = len(results)\n", + "successful = sum(1 for r in results if r['success'])\n", + "skipped = sum(1 for r in results if 'skipping' in r['message'])\n", + "failed = total_files - successful - skipped\n", + "\n", + "# COG statistics\n", + "input_cogs = sum(1 for r in results if r.get('input_was_cog', False))\n", + "output_cogs = sum(1 for r in results if r.get('output_is_cog', False))\n", + "converted_to_cog = sum(1 for r in results if r.get('success') and not r.get('input_was_cog', False) and r.get('output_is_cog', False))\n", + "\n", + "print(\"\\nProcessing Summary:\")\n", + "print(f\" Total files: {total_files}\")\n", + "print(f\" ✓ Successful: {successful}\")\n", + "print(f\" ○ Skipped (no pattern): {skipped}\")\n", + "print(f\" ✗ Failed: {failed}\")\n", + "\n", + "print(\"\\nCOG Status:\")\n", + "print(f\" Input COGs: {input_cogs}/{total_files}\")\n", + "print(f\" Output COGs: {output_cogs}/{total_files}\")\n", + "print(f\" Converted to COG: {converted_to_cog}\")\n", + "\n", + "if DELETE_AFTER_MOVE:\n", + " deleted = sum(1 for r in results if r.get('deleted_from_source', False))\n", + " print(f\"\\n 🗑 Deleted from source: {deleted}\")\n", + "\n", + "print(\"\\nDetailed Results:\")\n", + "for result in results:\n", + " status = '✓' if result['success'] else ('○' if 'skipping' in result['message'] else '✗')\n", + " cog_status = ''\n", + " if result.get('success'):\n", + " if result.get('output_is_cog'):\n", + " cog_status = ' [COG ✓]'\n", + " else:\n", + " cog_status = ' [COG ✗]'\n", + " print(f\" {status} {result['filename']}: {result['message']}{cog_status}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Verify Metadata in Processed Files\n", + "\n", + "Read a processed file and verify the metadata was added correctly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get first successful result\n", + "successful_results = [r for r in results if r['success']]\n", + "\n", + "if successful_results:\n", + " sample_result = successful_results[0]\n", + " dest_key = sample_result['destination_key']\n", + " \n", + " print(f\"Verifying metadata in: s3://{DESTINATION_BUCKET}/{dest_key}\\n\")\n", + " \n", + " # Read processed file\n", + " file_bytes, metadata, tags = read_geotiff_from_s3(DESTINATION_BUCKET, dest_key)\n", + " \n", + " print(\"=\" * 60)\n", + " print(\"GeoTIFF Metadata Tags:\")\n", + " print(\"=\" * 60)\n", + " for key, value in tags.items():\n", + " if 'ACTIVATION' in key or 'PROCESSING' in key or 'ORIGINAL' in key or 'PROCESSOR' in key or 'COG' in key:\n", + " print(f\" {key}: {value}\")\n", + " \n", + " print(\"\\n\" + \"=\" * 60)\n", + " print(\"COG Validation Results:\")\n", + " print(\"=\" * 60)\n", + " \n", + " # Validate COG structure\n", + " is_cog, cog_info = validate_cog(file_bytes, dest_key.split('/')[-1])\n", + " \n", + " if is_cog:\n", + " print(\"✓ File is a VALID Cloud-Optimized GeoTIFF (COG)\\n\")\n", + " \n", + " print(\"Structure Details:\")\n", + " print(f\" Dimensions: {cog_info.get('width')}x{cog_info.get('height')}\")\n", + " print(f\" Bands: {cog_info.get('bands')}\")\n", + " print(f\" Compression: {cog_info.get('compression')}\")\n", + " print(f\" Tiled: {cog_info.get('tiled')}\")\n", + " print(f\" Block/Tile Size: {cog_info.get('blocksize')}\")\n", + " print(f\" Overview Levels: {cog_info.get('overviews')}\")\n", + " \n", + " if cog_info.get('warnings'):\n", + " print(\"\\nWarnings:\")\n", + " for warning in cog_info['warnings']:\n", + " print(f\" ⚠ {warning}\")\n", + " else:\n", + " print(\"✗ File is NOT a valid COG\\n\")\n", + " \n", + " if cog_info.get('errors'):\n", + " print(\"Errors:\")\n", + " for error in cog_info['errors']:\n", + " print(f\" ✗ {error}\")\n", + " \n", + " if cog_info.get('warnings'):\n", + " print(\"\\nWarnings:\")\n", + " for warning in cog_info['warnings']:\n", + " print(f\" ⚠ {warning}\")\n", + " \n", + " print(\"\\n\" + \"=\" * 60)\n", + " \n", + "else:\n", + " print(\"No successfully processed files to verify\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/_quarto.yml b/_quarto.yml index 649bb8b..9a5ad53 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -60,6 +60,8 @@ website: - Jupyterhub/clone_conversion_repo.ipynb - Jupyterhub/sentinel2_workflow-testUpdates.ipynb - Jupyterhub/CSDA-demo.ipynb + - text: "GeoTIFF Event Processor" + href: Jupyterhub/extract_eventname_metadata.ipynb - section: workflow.qmd text: Data Workflow Diagrams contents: