From 0bc078f1659c7e409cd5e8f622c2674270550753 Mon Sep 17 00:00:00 2001 From: Claudio Date: Thu, 2 Oct 2025 10:19:40 +1300 Subject: [PATCH 01/21] Remove merge_ts --- workflow/scripts/merge_ts.py | 136 ----------------------------- workflow/scripts/merge_ts_loop.pyx | 79 ----------------- 2 files changed, 215 deletions(-) delete mode 100644 workflow/scripts/merge_ts.py delete mode 100644 workflow/scripts/merge_ts_loop.pyx diff --git a/workflow/scripts/merge_ts.py b/workflow/scripts/merge_ts.py deleted file mode 100644 index c1143232..00000000 --- a/workflow/scripts/merge_ts.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python3 -"""Merge EMOD3D Timeslices. - -Description ------------ -Merge the output timeslice files of EMOD3D. - -Inputs ------- -1. A directory containing EMOD3D timeslice files. - -Outputs -------- -1. A merged output timeslice file. - -Environment ------------ -Can be run in the cybershake container. Can also be run from your own computer using the `merge-ts` command which is installed after running `pip install workflow@git+https://github.com/ucgmsim/workflow`. - -Usage ------ -`merge_ts XYTS_DIRECTORY XYTS_DIRECTORY/output.e3d` - -For More Help -------------- -See the output of `merge-ts --help`. -""" - -import os -from pathlib import Path -from typing import Annotated - -import typer - -from qcore import cli, xyts -from workflow.scripts import merge_ts_loop - -app = typer.Typer() - - -@cli.from_docstring(app) -def merge_ts( - component_xyts_directory: Annotated[ - Path, - typer.Argument( - dir_okay=True, - file_okay=False, - exists=True, - readable=True, - ), - ], - output: Annotated[ - Path, - typer.Argument(dir_okay=False, writable=True), - ], - glob_pattern: Annotated[str, typer.Option()] = "*xyts-*.e3d", -) -> None: - """Merge XYTS files. - - Parameters - ---------- - component_xyts_directory : Path - The input xyts directory containing files to merge. - output : Path - The output xyts file. - glob_pattern : str, optional - Set a custom glob pattern for merging the xyts files, by default "*xyts-*.e3d". - """ - component_xyts_files = sorted( - [ - xyts.XYTSFile( - xyts_file_path, proc_local_file=True, meta_only=True, round_dt=False - ) - for xyts_file_path in component_xyts_directory.glob(glob_pattern) - ], - key=lambda xyts_file: (xyts_file.y0, xyts_file.x0), - ) - top_left = component_xyts_files[0] - merged_ny = top_left.ny - merged_nt = top_left.nt - - xyts_proc_header_size = 72 - - xyts_file_descriptors: list[int] = [] - for xyts_file in component_xyts_files: - xyts_file_descriptor = os.open(xyts_file.xyts_path, os.O_RDONLY) - # Skip the header for each file descriptor - head_skip = os.lseek(xyts_file_descriptor, xyts_proc_header_size, os.SEEK_SET) - if head_skip != xyts_proc_header_size: - raise ValueError( - f"Failed to skip header for {xyts_file.xyts_path} at {head_skip}" - ) - xyts_file_descriptors.append(xyts_file_descriptor) - - # If output doesn't exist when we os.open it, we'll get an error. - output.touch() - merged_fd = os.open(output, os.O_WRONLY) - - xyts_header: bytes = ( - top_left.x0.tobytes() - + top_left.y0.tobytes() - + top_left.z0.tobytes() - + top_left.t0.tobytes() - + top_left.nx.tobytes() - + top_left.ny.tobytes() - + top_left.nz.tobytes() - + top_left.nt.tobytes() - + top_left.dx.tobytes() - + top_left.dy.tobytes() - + top_left.hh.tobytes() - + top_left.dt.tobytes() - + top_left.mrot.tobytes() - + top_left.mlat.tobytes() - + top_left.mlon.tobytes() - ) - - written = os.write(merged_fd, xyts_header) - if written != len(xyts_header): - raise ValueError( - f"Failed to write header for {output} at {written} bytes written" - ) - - merge_ts_loop.merge_fds( - merged_fd, - xyts_file_descriptors, - merged_nt, - merged_ny, - [f.local_nx for f in component_xyts_files], - [f.local_ny for f in component_xyts_files], - [f.y0 for f in component_xyts_files], - ) - - for xyts_file_descriptor in xyts_file_descriptors: - os.close(xyts_file_descriptor) - - os.close(merged_fd) diff --git a/workflow/scripts/merge_ts_loop.pyx b/workflow/scripts/merge_ts_loop.pyx deleted file mode 100644 index d83a57c3..00000000 --- a/workflow/scripts/merge_ts_loop.pyx +++ /dev/null @@ -1,79 +0,0 @@ -import os - -import cython - -from libc.stdlib cimport free, malloc - -cdef extern from "sys/types.h": - ctypedef long off_t -cdef extern from "sys/sendfile.h": - ssize_t sendfile(int out_fd, int in_fd, off_t * offset, size_t count) - -def merge_fds( - int merged_fd, _component_xyts_files, int merged_nt, int merged_ny, _local_nxs, _local_nys, _y0s -): - ''' Merge a list of XYTS files by copying their values in order. - - To merge the files we copy the velocity values in each component, for each - timestep, in the order they would fit in the merged domain. That is if four - files tile the domain like so: - - ┌───────────┬──────────┐ - │***********│##########│ - │!!!!!!!!!!!│++++++++++│ - │ f1 │ f2 │ - │ │ │ - ├───────────┼──────────┤ - │$$$$$$$$$$$│%%%%%%%%%%│ - │ │ │ - │ f3 │ f4 │ - │ │ │ - │ │ │ - └───────────┴──────────┘ - - Then they are concatenated in the output domain as: - - ***********##########!!!!!!!!!!!++++++++++ ... $$$$$$$$$$$%%%%%%%%%% - - It is assumed _component_xyts_files is a list of xyts files sorted by their - top left corner. - ''' - cdef int float_size, cur_timestep, cur_component, cur_y, i, y0, local_ny, local_nx, xyts_fd, n_files - cdef int *component_xyts_files, *local_nxs, *local_nys, *y0s - - n_files = len(_component_xyts_files) - - # The lists _component_xyts_files, ... are CPython lists. - # If we access these inside our copying loop everything becomes very slow - # because we need to go to the Python interpreter. So we first copy each - # list into an equivalent C list. - component_xyts_files = malloc(len(_component_xyts_files) * cython.sizeof(int)) - local_nxs = malloc(len(_local_nxs) * cython.sizeof(int)) - local_nys = malloc(len(_local_nys) * cython.sizeof(int)) - y0s = malloc(len(_y0s) * cython.sizeof(int)) - # NOTE: we cannot use memcpy() here because CPython lists are not continuous - # chunks of memory as they are in C. - for i in range(len(_component_xyts_files)): - component_xyts_files[i] = _component_xyts_files[i] - local_nxs[i] = _local_nxs[i] - local_nys[i] = _local_nys[i] - y0s[i] = _y0s[i] - for cur_timestep in range(merged_nt): - for cur_component in range(3): # for each component - for cur_y in range(merged_ny): - for i in range(n_files): - y0 = y0s[i] - local_ny = local_nys[i] - local_nx = local_nxs[i] - xyts_fd = component_xyts_files[i] - if y0 > cur_y: - break - if cur_y >= y0 + local_ny: - continue - # By passing NULL as the offset, sendfile() will read from - # the current position in xyts_fd - sendfile(merged_fd, xyts_fd, NULL, local_nx * 4) - free(component_xyts_files) - free(local_nxs) - free(local_nys) - free(y0s) From b2ac444ebdb47328b478ef8b0adba46fe0f4be2d Mon Sep 17 00:00:00 2001 From: Claudio Date: Thu, 2 Oct 2025 10:36:42 +1300 Subject: [PATCH 02/21] Remove merge_ts --- pyproject.toml | 1 - setup.py | 14 -------------- wiki/Stages.md | 14 -------------- workflow/__init__.py | 1 - workflow/scripts/plan_workflow.py | 1 - 5 files changed, 31 deletions(-) delete mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml index 497e5ba8..f1ee0c0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,6 @@ generate-rupture-propagation = "workflow.scripts.generate_rupture_propagation:ap copy-domain-parameters = "workflow.scripts.copy_velocity_model_parameters:app" create-e3d-par = "workflow.scripts.create_e3d_par:app" generate-stoch = "workflow.scripts.generate_stoch:app" -merge-ts = "workflow.scripts.merge_ts:app" hf-sim = "workflow.scripts.hf_sim:app" bb-sim = "workflow.scripts.bb_sim:app" im-calc = "workflow.scripts.im_calc:app" diff --git a/setup.py b/setup.py deleted file mode 100644 index 9a458991..00000000 --- a/setup.py +++ /dev/null @@ -1,14 +0,0 @@ -from Cython.Build import cythonize -from setuptools import Extension, setup - -extensions = [ - Extension( - "workflow.scripts.merge_ts_loop", - ["workflow/scripts/merge_ts_loop.pyx"], - ), -] - -setup( - name="workflow", - ext_modules=cythonize(extensions), -) diff --git a/wiki/Stages.md b/wiki/Stages.md index af14ecaf..49a54d40 100644 --- a/wiki/Stages.md +++ b/wiki/Stages.md @@ -219,20 +219,6 @@ Can be run in the cybershake container. Can also be run from your own computer u See the output of `create-e3d-par --help` or [create_e3d_par.py](https://github.com/ucgmsim/workflow/blob/pegasus/workflow/scripts/create_e3d_par.py). See our description of the [EMOD3D Parameters](https://wiki.canterbury.ac.nz/pages/viewpage.action?pageId=100794983) for documentation on the EMOD3D parameter file format. -## Merge Timeslices -### Description -Merge the output timeslice files of EMOD3D. -### Inputs -1. A directory containing EMOD3D timeslice files. -### Outputs -1. A merged output timeslice file. -### Environment -Can be run in the cybershake container. Can also be run from your own computer using the `merge-ts` command which is installed after running `pip install workflow@git+https://github.com/ucgmsim/workflow`. -### Usage -`merge_ts XYTS_DIRECTORY XYTS_DIRECTORY/output.e3d` -### For More Help -See the output of `merge-ts --help` or [merge_ts.py](https://github.com/ucgmsim/workflow/blob/pegasus/merge_ts/merge_ts.py). - ## Create Simulation Video ### Description Create a simulation video from the low frequency simulation output. diff --git a/workflow/__init__.py b/workflow/__init__.py index 26c75bf3..a8e2b769 100644 --- a/workflow/__init__.py +++ b/workflow/__init__.py @@ -70,7 +70,6 @@ | EMOD3D | See below. | | High Frequency Simulation | `workflow.scripts.hf_sim` | | Broadband Simulation | `workflow.scripts.bb_sim` | -| Merge Timeslices | `merge_ts.merge_ts` | | Create Simulation Video | `workflow.scripts.plot_ts` | | Intensity Measure Calculation | `workflow.scripts.im_calc` | diff --git a/workflow/scripts/plan_workflow.py b/workflow/scripts/plan_workflow.py index 6d256e2b..b6f51d12 100644 --- a/workflow/scripts/plan_workflow.py +++ b/workflow/scripts/plan_workflow.py @@ -52,7 +52,6 @@ class StageIdentifier(StrEnum): LowFrequency = "emod3d" Broadband = "bb_sim" IntensityMeasureCalculation = "im_calc" - MergeTimeslices = "merge_ts" NSHMToRealisation = "nshm_to_realisation" From eb60f74bf6e8b40bcc1b718fb3549b101a920e85 Mon Sep 17 00:00:00 2001 From: Claudio Date: Thu, 2 Oct 2025 20:17:27 +1300 Subject: [PATCH 03/21] Initial version of virtual sites grid code. --- wiki/Virtual-Sites.md | 29 ++++ workflow/scripts/virt_sites_cmds.py | 247 ++++++++++++++++++++++++++++ workflow/virt_sites_grid.py | 170 +++++++++++++++++++ 3 files changed, 446 insertions(+) create mode 100644 wiki/Virtual-Sites.md create mode 100644 workflow/scripts/virt_sites_cmds.py create mode 100644 workflow/virt_sites_grid.py diff --git a/wiki/Virtual-Sites.md b/wiki/Virtual-Sites.md new file mode 100644 index 00000000..65c8a62f --- /dev/null +++ b/wiki/Virtual-Sites.md @@ -0,0 +1,29 @@ +## Virtual Simulation Sites + +### Background + +As every set of simulation has different goals and priorities, a single set of virtual stations that meets all requirements is not feasible. On the other hand we want to prevent every simulation set developing their own custom grid of virtual stations, as this would make comparison between simulation sets difficult. + +Instead a high density uniform "general" grid has been developed, and the code implemented here allows the generation of custom virtual station grid for each simulation set by sub-sampling this "general" grid as needed. This "general" grid ensures consistency across simulation sets while also allowing for customization. + +### Custom Grid Generation + +A custom grid can be generated using the 'virt_sites_cmds.py' script using the 'gen-custom-grid-from-rel' command. +For details/help on the command run 'python virt_sites_cmds.py gen-custom-grid-from-rel --help' +**Note that you will need about 16Gb of free memory.** + +Running the custom grid generation will produce a parquet file with the columns 'site_id', 'lon', 'lat' and 'in_basin'. +Note that the 'in_basin' column is False for all sites unless a velocity model version was specified. + +### Plot Generation + +Visualisation of the custom grid can be done using the 'gen-plot' command. +For details/help 'run python virt_sites_cmds.py gen-plot --help'. + +Running this will produce a html file that can be viewed in any browser. +Note that for large domains and high density grids this might be slow to load. + + + + + diff --git a/workflow/scripts/virt_sites_cmds.py b/workflow/scripts/virt_sites_cmds.py new file mode 100644 index 00000000..741de1eb --- /dev/null +++ b/workflow/scripts/virt_sites_cmds.py @@ -0,0 +1,247 @@ +from pathlib import Path +from typing import Annotated + +import geopandas as gpd +import numpy as np +import pandas as pd +import plotly.graph_objects as go +import pygmt +import shapely +import typer +import xarray as xr + +from qcore import cli, coordinates +from workflow.realisations import DomainParameters, SourceConfig +from workflow.virt_sites_grid import GRID_DATA, CustomGrid, get_basin_boundaries + +app = typer.Typer() + + +@app.command("gen-general-grid") +def gen_general_grid(grid_spacing: str, output_ffp: Path): + """Generate the general grid.""" + x_spacing, y_spacing = grid_spacing.split("/") + if x_spacing != y_spacing: + raise ValueError("Currently only supports equal x and y spacing.") + + try: + spacing = int(x_spacing.rstrip("e")) + except ValueError: + raise ValueError("Grid spacing must be an integer in metres.") + + land_df = gpd.read_parquet( + GRID_DATA.fetch("nz_coastline.parquet") + ) + # Combine into a single polygon + land_polygon = shapely.coverage_union_all(land_df.geometry) + land_polygon = shapely.transform( + land_polygon, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]) + ) + + # Generate grid + print("Generating grid...") + land_mask_grid = pygmt.grdlandmask(region="NZ", spacing=grid_spacing).astype(bool) + land_mask_grid[:] = False + land_mask_grid.attrs = {"spacing": spacing} + + # Use float32 for coords + land_mask_grid = land_mask_grid.assign_coords( + lat=land_mask_grid.lat.astype(np.float32), + lon=land_mask_grid.lon.astype(np.float32), + ) + + grid_lat, grid_lon = xr.broadcast(land_mask_grid.lat, land_mask_grid.lon) + grid_nztm = coordinates.wgs_depth_to_nztm( + np.vstack((grid_lat.values.ravel(), grid_lon.values.ravel())).T + ) + + # Apply land masking + print("Applying land mask...") + mask = shapely.contains_xy(land_polygon, grid_nztm[:, 0], grid_nztm[:, 1]).reshape( + land_mask_grid.shape + ) + land_mask_grid.values[mask] = 1 + + print(f"Saving to {output_ffp}...") + land_mask_grid.to_netcdf(output_ffp) + + +@cli.from_docstring(app) +def gen_custom_grid_from_rel( + rel_ffp: Annotated[Path, typer.Argument()], + uniform_grid_spacing: Annotated[int, typer.Argument()], + output_ffp: Annotated[Path, typer.Argument()], + basin_spacing: Annotated[int | None, typer.Option()] = None, + vel_model_version: Annotated[str | None, typer.Option()] = None, +) -> None: + """ + Generate a custom grid from a realisation config. + + Parameters + ---------- + rel_ffp : Path + The path to the realisation config. + uniform_grid_spacing : int + The uniform grid spacing in metres. + This must be a multiple of the general grid spacing. + output_ffp : Path + The path to save the output parquet file. + basin_spacing : int, optional + The grid spacing in metres to use within basins. + This must be a multiple of the general grid spacing. + If not provided, no basin spacing will be applied. + vel_model_version : str, optional + The velocity model version to use for basin spacing. + This must be provided if basin_spacing is specified. + """ + domain_config = DomainParameters.read_from_realisation(rel_ffp) + + region = shapely.Polygon(domain_config.domain.corners) + custom_grid = ( + CustomGrid() + .add_land_only_filter() + .add_region_filter(region) + .add_uniform_spacing_filter(uniform_grid_spacing) + ) + if basin_spacing is not None: + if vel_model_version is None: + raise ValueError( + "vel_model_version must be provided if basin_spacing is provided." + ) + custom_grid.add_basin_spacing_filter(vel_model_version, basin_spacing) + + site_df = custom_grid.get_site_df() + site_df.to_parquet(output_ffp) + + +@cli.from_docstring(app) +def gen_plot( + site_df_ffp: Annotated[Path, typer.Argument()], + output_ffp: Annotated[Path, typer.Argument()], + rel_ffp: Annotated[Path | None, typer.Option()] = None, + vel_model_version: Annotated[str | None, typer.Option()] = None, + nzgmdb_site_ffp: Annotated[Path | None, typer.Option()] = None, +) -> None: + """ + Generate a plot of the custom grid. + + Parameters + ---------- + site_df_ffp : Path + The path to the custom grid site dataframe file (parquet). + output_ffp : Path + The path to save the output HTML file. + rel_ffp : Path, optional + The path to the realisation config. + If provided, the domain and source will be plotted. + vel_model_version : str, optional + The velocity model version to use for plotting basin boundaries. + If provided, the basin boundaries will be plotted. + nzgmdb_site_ffp : Path, optional + Path to the NZGMDB site file (csv). + If provided, the NZGMDB sites will be plotted. + """ + site_df = pd.read_parquet(site_df_ffp) + + fig = go.Figure() + + fig.add_trace( + go.Scattermap( + lon=site_df["lon"], + lat=site_df["lat"], + mode="markers", + marker=dict(color="blue", size=4, symbol="circle"), + hoverinfo="skip", + ) + ) + + if vel_model_version is not None: + # Plot the basin boundaries + basin_boundaries = get_basin_boundaries(vel_model_version) + basin_line_properties = dict(color="red", width=1) + + for cur_basin_boundary in basin_boundaries.values(): + if isinstance(cur_basin_boundary, shapely.MultiPolygon): + for poly in cur_basin_boundary.geoms: + fig.add_trace( + go.Scattermap( + lon=np.array(poly.exterior.xy[0]), + lat=np.array(poly.exterior.xy[1]), + mode="lines", + line=basin_line_properties, + hoverinfo="skip", + ) + ) + else: + fig.add_trace( + go.Scattermap( + lon=np.array(cur_basin_boundary.exterior.xy[0]), + lat=np.array(cur_basin_boundary.exterior.xy[1]), + mode="lines", + line=basin_line_properties, + hoverinfo="skip", + ) + ) + + if rel_ffp is not None: + # Plot the domain + domain_corners = DomainParameters.read_from_realisation(rel_ffp).domain.corners + domain_corners = shapely.Polygon(domain_corners[:, ::-1]) + fig.add_trace( + go.Scattermap( + lon=np.array(domain_corners.exterior.xy[0]), + lat=np.array(domain_corners.exterior.xy[1]), + mode="lines", + line=dict(color="black"), + hoverinfo="skip", + ) + ) + + # Plot the source + source_config = SourceConfig.read_from_realisation(rel_ffp) + for _, fault in source_config.source_geometries.items(): + for cur_plane in fault.planes: + cur_polygon = shapely.transform( + cur_plane.geometry, + lambda x: coordinates.nztm_to_wgs_depth(x)[:, ::-1], + ) + fig.add_trace( + go.Scattermap( + lon=np.array(cur_polygon.exterior.xy[0]), + lat=np.array(cur_polygon.exterior.xy[1]), + mode="lines", + line=dict(color="black"), + hoverinfo="skip", + ) + ) + + if nzgmdb_site_ffp is not None: + nzgmdb_site_df = pd.read_csv( + nzgmdb_site_ffp, index_col="sta", usecols=["sta", "lon", "lat"] + ) + fig.add_trace( + go.Scattermap( + lon=nzgmdb_site_df["lon"], + lat=nzgmdb_site_df["lat"], + mode="markers", + marker=dict(symbol="triangle"), + hovertext=nzgmdb_site_df.index, + hoverinfo="text", + ) + ) + + fig.update_layout( + map=dict(zoom=6, center=dict(lat=site_df.lat.mean(), lon=site_df.lon.mean())), + showlegend=False, + title=( + f"Virtual Sites Grid - Velocity Model {vel_model_version} - " + f"Total Sites: {len(site_df)} - " + f"Basin Sites: {site_df['in_basin'].sum()}" + ), + ) + fig.write_html(output_ffp) + + +if __name__ == "__main__": + app() + app() diff --git a/workflow/virt_sites_grid.py b/workflow/virt_sites_grid.py new file mode 100644 index 00000000..c67417c7 --- /dev/null +++ b/workflow/virt_sites_grid.py @@ -0,0 +1,170 @@ +from pathlib import Path + +import pooch +import numpy as np +import pandas as pd +import shapely +import xarray as xr + +from qcore import coordinates +from velocity_modelling.constants import get_data_root +from velocity_modelling.registry import CVMRegistry + +# # GENERAL_GRID_FFP = Path("/home/claudy/dev/tmp_share/virt_sites/general_land_mask_grid.nc") +# GENERAL_GRID_FFP = Path( +# "/Users/claudy/dev/tmp_share/virt_sites/general_grid.nc" +# ) + +GRID_DATA = pooch.create( + path=pooch.os_cache("virt_sites"), + base_url="", + registry={ + "general_grid.nc": "sha256:06d675c60d90545f29573ab505db17e7662f7958270093d4a8808d6fc379a984", + "nz_coastline.parquet": "sha256:9a965326690c560d372720b4b64b57480ede10e479b2fe62ba75760707bc521c", + }, + urls={ + "general_grid.nc": "https://www.dropbox.com/scl/fi/4x987t01kvbz8bbyeid81/general_grid.nc?rlkey=rpztofwihh839500jn2bkw255&st=rpazzciz&dl=1", + "nz_coastline.parquet": "https://www.dropbox.com/scl/fi/nx7hm82ern7s2b4v7u6zq/nz_coastline.parquet?rlkey=xzvh7gumefaigzhbgyjtuxu2p&st=e4gwa9a9&dl=1", + }, +) + + +class GeneralGrid: + + def __init__(self, land_mask_grid: xr.DataArray, spacing: int): + self.land_mask_grid = land_mask_grid + self.site_ids = np.arange(land_mask_grid.size).reshape(land_mask_grid.shape) + self.spacing = spacing + + self.grid_lat, self.grid_lon = xr.broadcast( + self.land_mask_grid.lat, self.land_mask_grid.lon + ) + # TODO: Optimize memory usage, high atm + self.grid_lat, self.grid_lon = ( + self.grid_lat.values, + self.grid_lon.values, + ) + + grid_nztm = coordinates.wgs_depth_to_nztm( + np.vstack((self.grid_lat.ravel(), self.grid_lon.ravel())).T + ) + self.grid_nztm_x = grid_nztm[:, 1].reshape(self.land_mask_grid.shape) + self.grid_nztm_y = grid_nztm[:, 0].reshape(self.land_mask_grid.shape) + + @property + def shape(self): + return self.land_mask_grid.shape + + @classmethod + def load(cls, land_mask_grid_ffp: Path): + return cls(xr.load_dataarray(land_mask_grid_ffp), 50) + + +class CustomGrid: + + def __init__(self): + self.general_grid = GeneralGrid.load(GRID_DATA.fetch("general_grid.nc")) + + self._and_mask = np.ones(self.general_grid.shape, dtype=bool) + self._or_mask = np.ones(self.general_grid.shape, dtype=bool) + + self._in_basin_mask = None + + @property + def mask(self): + return self._and_mask & self._or_mask + + def add_region_filter(self, region: shapely.Polygon): + # Convert to NZTM + region = shapely.transform(region, lambda x: coordinates.wgs_depth_to_nztm(x)) + + region_mask = shapely.contains_xy( + region, + self.general_grid.grid_nztm_y.ravel(), + self.general_grid.grid_nztm_x.ravel(), + ).reshape(self.general_grid.shape) + self._and_mask &= region_mask + + return self + + def add_uniform_spacing_filter(self, spacing: int): + if spacing < self.general_grid.spacing: + raise ValueError( + "Uniform spacing must be greater than or equal to the general grid spacing." + ) + if spacing % self.general_grid.spacing != 0: + raise ValueError( + f"Uniform spacing must be a multiple of the general grid spacing {self.general_grid.spacing}." + ) + + idx_interval = spacing // self.general_grid.spacing + spacing_mask = np.zeros(self.general_grid.shape, dtype=bool) + spacing_mask[::idx_interval, ::idx_interval] = True + self._or_mask &= spacing_mask + + return self + + def add_land_only_filter(self): + self._and_mask &= self.general_grid.land_mask_grid.values.astype(bool) + + return self + + def add_basin_spacing_filter(self, vel_model_version: str, spacing: int): + if spacing < self.general_grid.spacing: + raise ValueError( + "Basin spacing must be greater than or equal to the general grid spacing." + ) + if spacing % self.general_grid.spacing != 0: + raise ValueError( + f"Basin spacing must be a multiple of the general grid spacing {self.general_grid.spacing}." + ) + + basin_boundaries = get_basin_boundaries(vel_model_version) + comb_basin_boundaries = shapely.union_all(list(basin_boundaries.values())) + comb_basin_boundaries = shapely.transform( + comb_basin_boundaries, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]) + ) + self._in_basin_mask = shapely.contains_xy( + comb_basin_boundaries, + self.general_grid.grid_nztm_y.ravel(), + self.general_grid.grid_nztm_x.ravel(), + ).reshape(self.general_grid.shape) + + idx_interval = spacing // self.general_grid.spacing + spacing_mask = np.zeros(self.general_grid.shape, dtype=bool) + spacing_mask[::idx_interval, ::idx_interval] = True + + self._or_mask |= self._in_basin_mask & spacing_mask + return self + + def get_site_df(self): + site_ids = self.general_grid.site_ids[self.mask] + lons = self.general_grid.grid_lon[self.mask] + lats = self.general_grid.grid_lat[self.mask] + + site_df = pd.DataFrame( + {"site_id": site_ids, "lon": lons, "lat": lats, "in_basin": False} + ).set_index("site_id") + + if self._in_basin_mask is not None: + site_df["in_basin"] = self._in_basin_mask[self.mask] + + return site_df + + +def get_basin_boundaries(vel_model_version: str): + cvm_registry = CVMRegistry(vel_model_version, get_data_root()) + basin_data = cvm_registry.load_basin_data(cvm_registry.global_params["basins"]) + + basin_boundaries = {} + for cur_basin_data in basin_data: + if len(cur_basin_data.boundaries) == 1: + basin_boundaries[cur_basin_data.name] = shapely.Polygon( + cur_basin_data.boundaries[0] + ) + else: + basin_boundaries[cur_basin_data.name] = shapely.union_all( + [shapely.Polygon(b) for b in cur_basin_data.boundaries] + ) + + return basin_boundaries From 1d228f4960429d41290ae7cadb303142ce04d21b Mon Sep 17 00:00:00 2001 From: Claudio Date: Mon, 20 Oct 2025 14:03:44 +1300 Subject: [PATCH 04/21] Improvements --- wiki/Virtual-Sites.md | 36 ++- workflow/scripts/virt_sites_cmds.py | 109 +++++--- workflow/virt_sites_grid.py | 373 +++++++++++++++++++++++++--- 3 files changed, 449 insertions(+), 69 deletions(-) diff --git a/wiki/Virtual-Sites.md b/wiki/Virtual-Sites.md index 65c8a62f..310b668c 100644 --- a/wiki/Virtual-Sites.md +++ b/wiki/Virtual-Sites.md @@ -12,8 +12,40 @@ A custom grid can be generated using the 'virt_sites_cmds.py' script using the ' For details/help on the command run 'python virt_sites_cmds.py gen-custom-grid-from-rel --help' **Note that you will need about 16Gb of free memory.** -Running the custom grid generation will produce a parquet file with the columns 'site_id', 'lon', 'lat' and 'in_basin'. -Note that the 'in_basin' column is False for all sites unless a velocity model version was specified. +Running the custom grid generation will produce a parquet file with the following columns: + +| Column | Description | Notes | +|--------|-------------|-------| +| `site_id` | Unique identifier for each virtual site | Made up as follows `{4 character lat/lon hash}{2 character region code}` | +| `lon` | Longitude coordinate in decimal degrees | | +| `lat` | Latitude coordinate in decimal degrees | | +| `nztm_x` | X coordinate in New Zealand Transverse Mercator (NZTM) projection | | +| `nztm_y` | Y coordinate in New Zealand Transverse Mercator (NZTM) projection | | +| `source` | Source of the site | Either `virtual` or `real` | +| `basin` | Basin name | None if not in a basin | +| `Z1.0` | Depth to 1.0 km/s shear-wave velocity (meters) | | +| `Z2.5` | Depth to 2.5 km/s shear-wave velocity (meters) | | +| `vs30` | Time-averaged shear-wave velocity in the top 30 meters (m/s) | | + +Additionally, it will also produce a metadata file that contains the setting used to generate the custom grid: + +| Setting | Type | Description | +|---------|------|-------------| +| `land_only` | bool | Whether to include only land-based sites | +| `region` | geojson | Geographic region for the grid | +| `uniform_spacing` | int | Spacing between grid points | +| `vel_model_version` | str | Version of the velocity model used | + +in addition to some additional information such as: + +| Field | Description | +|----------------|-------------| +| `num_sites` | Total number of sites in the grid | +| `num_virtual_sites` | Number of virtual sites | +| `num_real_sites` | Number of real sites | +| `num_basin_sites` | Number of sites located within basins | +| `sites_per_basin` | Number of sites per basin | + ### Plot Generation diff --git a/workflow/scripts/virt_sites_cmds.py b/workflow/scripts/virt_sites_cmds.py index 741de1eb..ab9a873c 100644 --- a/workflow/scripts/virt_sites_cmds.py +++ b/workflow/scripts/virt_sites_cmds.py @@ -1,6 +1,8 @@ +import logging from pathlib import Path from typing import Annotated +import yaml import geopandas as gpd import numpy as np import pandas as pd @@ -12,26 +14,38 @@ from qcore import cli, coordinates from workflow.realisations import DomainParameters, SourceConfig -from workflow.virt_sites_grid import GRID_DATA, CustomGrid, get_basin_boundaries +from workflow.virt_sites_grid import ( + GRID_DATA, + CustomGrid, + NZGMDBVersion, + get_basin_boundaries, +) + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler()], + force=True, +) +logger = logging.getLogger(__name__) app = typer.Typer() @app.command("gen-general-grid") -def gen_general_grid(grid_spacing: str, output_ffp: Path): +def gen_general_grid(grid_spacing: str, output_ffp: Path) -> None: """Generate the general grid.""" x_spacing, y_spacing = grid_spacing.split("/") if x_spacing != y_spacing: raise ValueError("Currently only supports equal x and y spacing.") - + try: spacing = int(x_spacing.rstrip("e")) except ValueError: raise ValueError("Grid spacing must be an integer in metres.") - - land_df = gpd.read_parquet( - GRID_DATA.fetch("nz_coastline.parquet") - ) + + land_df = gpd.read_parquet(GRID_DATA.fetch("nz_coastline.parquet")) # Combine into a single polygon land_polygon = shapely.coverage_union_all(land_df.geometry) land_polygon = shapely.transform( @@ -39,7 +53,7 @@ def gen_general_grid(grid_spacing: str, output_ffp: Path): ) # Generate grid - print("Generating grid...") + logger.info("Generating grid...") land_mask_grid = pygmt.grdlandmask(region="NZ", spacing=grid_spacing).astype(bool) land_mask_grid[:] = False land_mask_grid.attrs = {"spacing": spacing} @@ -56,13 +70,13 @@ def gen_general_grid(grid_spacing: str, output_ffp: Path): ) # Apply land masking - print("Applying land mask...") + logger.info("Applying land mask...") mask = shapely.contains_xy(land_polygon, grid_nztm[:, 0], grid_nztm[:, 1]).reshape( land_mask_grid.shape ) land_mask_grid.values[mask] = 1 - print(f"Saving to {output_ffp}...") + logger.info(f"Saving to {output_ffp}...") land_mask_grid.to_netcdf(output_ffp) @@ -73,6 +87,7 @@ def gen_custom_grid_from_rel( output_ffp: Annotated[Path, typer.Argument()], basin_spacing: Annotated[int | None, typer.Option()] = None, vel_model_version: Annotated[str | None, typer.Option()] = None, + nzgmdb_version: Annotated[NZGMDBVersion | None, typer.Option()] = None, ) -> None: """ Generate a custom grid from a realisation config. @@ -82,7 +97,7 @@ def gen_custom_grid_from_rel( rel_ffp : Path The path to the realisation config. uniform_grid_spacing : int - The uniform grid spacing in metres. + The uniform grid spacing in metres. This must be a multiple of the general grid spacing. output_ffp : Path The path to save the output parquet file. @@ -93,6 +108,9 @@ def gen_custom_grid_from_rel( vel_model_version : str, optional The velocity model version to use for basin spacing. This must be provided if basin_spacing is specified. + If basin_spacing is not provided, and this is provided, + then this velocity model version will be used to set + basin membership in the output site dataframe. """ domain_config = DomainParameters.read_from_realisation(rel_ffp) @@ -110,9 +128,16 @@ def gen_custom_grid_from_rel( ) custom_grid.add_basin_spacing_filter(vel_model_version, basin_spacing) - site_df = custom_grid.get_site_df() + site_df = custom_grid.get_site_df( + nzgmdb_version=nzgmdb_version, + vel_model_version=vel_model_version if not basin_spacing else None, + ) site_df.to_parquet(output_ffp) + metadata = custom_grid.get_metadata(site_df) + with (output_ffp.parent / f"{output_ffp.stem}_metadata.yaml").open("w") as meta_ffp: + yaml.dump(metadata, meta_ffp) + @cli.from_docstring(app) def gen_plot( @@ -137,28 +162,56 @@ def gen_plot( vel_model_version : str, optional The velocity model version to use for plotting basin boundaries. If provided, the basin boundaries will be plotted. - nzgmdb_site_ffp : Path, optional - Path to the NZGMDB site file (csv). - If provided, the NZGMDB sites will be plotted. """ site_df = pd.read_parquet(site_df_ffp) fig = go.Figure() + virt_sites_mask = site_df.loc[:, "source"] == "virtual" fig.add_trace( go.Scattermap( - lon=site_df["lon"], - lat=site_df["lat"], + lon=site_df.loc[virt_sites_mask, "lon"], + lat=site_df.loc[virt_sites_mask, "lat"], mode="markers", marker=dict(color="blue", size=4, symbol="circle"), hoverinfo="skip", + hovertemplate=( + "Site ID: %{customdata[0]}
" + "Lat: %{lat:.6f}
" + "Lon: %{lon:.6f}
" + "" + ), + customdata=np.asarray( + [site_df.loc[virt_sites_mask].index.values.astype(str)] + ).T, ) ) + if "real" in site_df.source.unique(): + real_sites_mask = site_df.loc[:, "source"] == "real" + fig.add_trace( + go.Scattermap( + lon=site_df.loc[real_sites_mask, "lon"], + lat=site_df.loc[real_sites_mask, "lat"], + mode="markers", + marker=dict(color="darkgreen", size=6, symbol="circle"), + hovertemplate=( + "Site ID: %{customdata[0]}
" + "Lat: %{lat:.6f}
" + "Lon: %{lon:.6f}
" + "" + ), + customdata=np.asarray( + [site_df.loc[real_sites_mask].index.values.astype(str)] + ).T, + ) + ) + if vel_model_version is not None: # Plot the basin boundaries basin_boundaries = get_basin_boundaries(vel_model_version) basin_line_properties = dict(color="red", width=1) + basin_fill_color = "rgba(255,0,0,0.05)" for cur_basin_boundary in basin_boundaries.values(): if isinstance(cur_basin_boundary, shapely.MultiPolygon): @@ -168,6 +221,8 @@ def gen_plot( lon=np.array(poly.exterior.xy[0]), lat=np.array(poly.exterior.xy[1]), mode="lines", + fill="toself", + fillcolor=basin_fill_color, line=basin_line_properties, hoverinfo="skip", ) @@ -178,6 +233,8 @@ def gen_plot( lon=np.array(cur_basin_boundary.exterior.xy[0]), lat=np.array(cur_basin_boundary.exterior.xy[1]), mode="lines", + fill="toself", + fillcolor=basin_fill_color, line=basin_line_properties, hoverinfo="skip", ) @@ -215,28 +272,12 @@ def gen_plot( ) ) - if nzgmdb_site_ffp is not None: - nzgmdb_site_df = pd.read_csv( - nzgmdb_site_ffp, index_col="sta", usecols=["sta", "lon", "lat"] - ) - fig.add_trace( - go.Scattermap( - lon=nzgmdb_site_df["lon"], - lat=nzgmdb_site_df["lat"], - mode="markers", - marker=dict(symbol="triangle"), - hovertext=nzgmdb_site_df.index, - hoverinfo="text", - ) - ) - fig.update_layout( map=dict(zoom=6, center=dict(lat=site_df.lat.mean(), lon=site_df.lon.mean())), showlegend=False, title=( f"Virtual Sites Grid - Velocity Model {vel_model_version} - " - f"Total Sites: {len(site_df)} - " - f"Basin Sites: {site_df['in_basin'].sum()}" + f"Total Sites: {len(site_df)}" ), ) fig.write_html(output_ffp) diff --git a/workflow/virt_sites_grid.py b/workflow/virt_sites_grid.py index c67417c7..7c33eb06 100644 --- a/workflow/virt_sites_grid.py +++ b/workflow/virt_sites_grid.py @@ -1,19 +1,52 @@ +import time +import base64 +import enum +import hashlib +import logging from pathlib import Path -import pooch +import geopandas as gpd import numpy as np import pandas as pd +import pooch import shapely import xarray as xr from qcore import coordinates from velocity_modelling.constants import get_data_root from velocity_modelling.registry import CVMRegistry +from velocity_modelling.threshold import compute_station_thresholds + +logger = logging.getLogger(__name__) + +NZTM_CRS = "EPSG:2193" +WSG84_CRS = "EPSG:4326" + + +class NZGMDBVersion(enum.StrEnum): + """NZGMDB Version Enum.""" + + V4p3 = "v4.3" + + +NZGMDB_VERSION_TO_TABLE_NAME = { + NZGMDBVersion.V4p3: "nzgmdb_4p3_site_table.csv", +} -# # GENERAL_GRID_FFP = Path("/home/claudy/dev/tmp_share/virt_sites/general_land_mask_grid.nc") -# GENERAL_GRID_FFP = Path( -# "/Users/claudy/dev/tmp_share/virt_sites/general_grid.nc" -# ) +REGION_CODES = { + "North Auckland": "NAK", + "South Auckland": "SAK", + "Hawkes Bay": "HBY", + "Gisborne": "GIS", + "Taranaki": "TAR", + "Wellington": "WEL", + "Nelson": "NEL", + "Marlborough": "MAR", + "Westland": "WES", + "Canterbury": "CAN", + "Otago": "OTA", + "Southland": "STL", +} GRID_DATA = pooch.create( path=pooch.os_cache("virt_sites"), @@ -21,17 +54,28 @@ registry={ "general_grid.nc": "sha256:06d675c60d90545f29573ab505db17e7662f7958270093d4a8808d6fc379a984", "nz_coastline.parquet": "sha256:9a965326690c560d372720b4b64b57480ede10e479b2fe62ba75760707bc521c", + "nzgmdb_4p3_site_table.csv": "sha256:0144c6e066a461b63441e9c1a1608741c5f056fd3e2926f2cb16748e24296317", + "nz_regions.parquet": "sha256:c08b250332052126d4e68ef0894d72416791c4184b5bfc23da93adf39582f2f2", + "nz_territory.parquet": "sha256:afe5a70572d6d42cbcbe4b26291983b03619bd0b567eb6f6cafa20306d03b0f7", }, urls={ "general_grid.nc": "https://www.dropbox.com/scl/fi/4x987t01kvbz8bbyeid81/general_grid.nc?rlkey=rpztofwihh839500jn2bkw255&st=rpazzciz&dl=1", "nz_coastline.parquet": "https://www.dropbox.com/scl/fi/nx7hm82ern7s2b4v7u6zq/nz_coastline.parquet?rlkey=xzvh7gumefaigzhbgyjtuxu2p&st=e4gwa9a9&dl=1", + "nzgmdb_4p3_site_table.csv": "https://www.dropbox.com/scl/fi/m3l559m2nmjgufipyq03s/nzgmdb_4p3_site_table.csv?rlkey=gg1hn2mm38zxew9fn45ac8p0v&st=iq0kagtv&dl=1", + "nz_regions.parquet": "https://www.dropbox.com/scl/fi/gvvjf16vpe69zqbglh8ns/nz_regions.parquet?rlkey=14y2exb6wr48h7d0pj7dcvxwa&st=t23pxjsv&dl=1", + "nz_territory.parquet": "https://www.dropbox.com/scl/fi/3ywi83r8pucp5ajs26do7/nz_territory.parquet?rlkey=viemibogajrtomm0n9xypwq1v&st=ph5pkg2z&dl=1", }, ) class GeneralGrid: + """ + Represents the general base grid of virtual sites. + Any custom grid is built off this grid. + """ - def __init__(self, land_mask_grid: xr.DataArray, spacing: int): + def __init__(self, land_mask_grid: xr.DataArray, spacing: int) -> "GeneralGrid": + """Initialize GeneralGrid.""" self.land_mask_grid = land_mask_grid self.site_ids = np.arange(land_mask_grid.size).reshape(land_mask_grid.shape) self.spacing = spacing @@ -39,7 +83,6 @@ def __init__(self, land_mask_grid: xr.DataArray, spacing: int): self.grid_lat, self.grid_lon = xr.broadcast( self.land_mask_grid.lat, self.land_mask_grid.lon ) - # TODO: Optimize memory usage, high atm self.grid_lat, self.grid_lon = ( self.grid_lat.values, self.grid_lon.values, @@ -52,29 +95,62 @@ def __init__(self, land_mask_grid: xr.DataArray, spacing: int): self.grid_nztm_y = grid_nztm[:, 0].reshape(self.land_mask_grid.shape) @property - def shape(self): + def shape(self) -> tuple[int, int]: + """Shape of the grid.""" return self.land_mask_grid.shape @classmethod - def load(cls, land_mask_grid_ffp: Path): - return cls(xr.load_dataarray(land_mask_grid_ffp), 50) + def load(cls, land_mask_grid_ffp: Path) -> "GeneralGrid": + """Load GeneralGrid from file.""" + logger.info(f"Loading general grid from {land_mask_grid_ffp}...") + return cls(xr.load_dataarray(land_mask_grid_ffp, engine="h5netcdf"), 50) class CustomGrid: - - def __init__(self): + """ + Represents a custom grid of virtual sites + built from the general grid. + """ + + def __init__(self) -> "CustomGrid": + """Initialize CustomGrid.""" + logger.info("Loading general grid...") self.general_grid = GeneralGrid.load(GRID_DATA.fetch("general_grid.nc")) self._and_mask = np.ones(self.general_grid.shape, dtype=bool) self._or_mask = np.ones(self.general_grid.shape, dtype=bool) + self._land_only = None + self._uniform_spacing = None + self._region = None self._in_basin_mask = None - + self._vel_model_version = None + @property - def mask(self): + def mask(self) -> np.ndarray: + """ + Site mask that gives the custom grid + when applied to the the general grid. + """ return self._and_mask & self._or_mask - def add_region_filter(self, region: shapely.Polygon): + def add_region_filter(self, region: shapely.Polygon) -> "CustomGrid": + """ + Adds a region filter. Only sites within the region are kept. + This is an AND filter, i.e., it removes sites outside the region. + Can only be applied once. + + Parameters + ---------- + region : shapely.Polygon + The region polygon in WGS84 coordinates. + """ + logger.info("Adding region filter to custom grid...") + if self._region is not None: + logger.error("Region filter has already been added.") + raise + self._region = region + # Convert to NZTM region = shapely.transform(region, lambda x: coordinates.wgs_depth_to_nztm(x)) @@ -87,16 +163,35 @@ def add_region_filter(self, region: shapely.Polygon): return self - def add_uniform_spacing_filter(self, spacing: int): + def add_uniform_spacing_filter(self, spacing: int) -> "CustomGrid": + """ + Adds a uniform spacing filter. + This is an OR filter, i.e., it only adds sites. + Can only be applied once. + + Parameters + ---------- + spacing : int + The uniform grid spacing in metres. + """ + logger.info("Adding uniform spacing filter...") + if self._uniform_spacing is not None: + logger.error("Uniform spacing filter has already been added.") + raise if spacing < self.general_grid.spacing: - raise ValueError( - "Uniform spacing must be greater than or equal to the general grid spacing." + logger.error( + "Uniform spacing must be greater than or" + " equal to the general grid spacing." ) + raise if spacing % self.general_grid.spacing != 0: - raise ValueError( - f"Uniform spacing must be a multiple of the general grid spacing {self.general_grid.spacing}." + logger.error( + f"Uniform spacing must be a multiple of" + f" the general grid spacing {self.general_grid.spacing}." ) - + raise + self._uniform_spacing = spacing + idx_interval = spacing // self.general_grid.spacing spacing_mask = np.zeros(self.general_grid.shape, dtype=bool) spacing_mask[::idx_interval, ::idx_interval] = True @@ -104,12 +199,38 @@ def add_uniform_spacing_filter(self, spacing: int): return self - def add_land_only_filter(self): + def add_land_only_filter(self) -> "CustomGrid": + """Adds a land only filter, only land sites are kept.""" + logger.info("Adding land only filter...") + self._land_only = True self._and_mask &= self.general_grid.land_mask_grid.values.astype(bool) return self - def add_basin_spacing_filter(self, vel_model_version: str, spacing: int): + def add_basin_spacing_filter( + self, vel_model_version: str, spacing: int + ) -> "CustomGrid": + """ + Adds a basin spacing filter. + Note that this is an OR filter, i.e., it only adds sites + + Parameters + ---------- + vel_model_version : str + The velocity model version to use for basin spacing. + spacing : int + The grid spacing in metres to use within basins. + """ + logger.info("Adding basin spacing filter...") + if ( + self._vel_model_version is not None + and vel_model_version != self._vel_model_version + ): + raise ValueError( + "Basin spacing filter has already been added with a different velocity model version." + ) + self._vel_model_version = vel_model_version + if spacing < self.general_grid.spacing: raise ValueError( "Basin spacing must be greater than or equal to the general grid spacing." @@ -136,23 +257,182 @@ def add_basin_spacing_filter(self, vel_model_version: str, spacing: int): self._or_mask |= self._in_basin_mask & spacing_mask return self + + def get_metadata(self, site_df: pd.DataFrame = None) -> dict: + """ + Gets the metadata dictionary for the custom grid. + """ + region = None + if self._region is not None: + region = shapely.geometry.mapping(self._region) + + site_metadata = {} + if site_df is not None: + site_metadata = { + "num_sites": len(site_df), + "num_virtual_sites": len(site_df[site_df.source == "virtual"]), + "num_real_sites": len(site_df[site_df.source == "real"]), + "num_basin_sites": len(site_df[site_df.basin.notna()]), + } + + # Number of sites per basin + site_metadata["sites_per_basin"] = site_df.groupby("basin").size().to_dict() + + metadata = site_metadata | { + "land_only": self._land_only, + "region": region, + "uniform_spacing": self._uniform_spacing, + "vel_model_version": self._vel_model_version, + } + return metadata + + def get_site_df( + self, + nzgmdb_version: NZGMDBVersion | None = None, + vel_model_version: str | None = None, + ) -> pd.DataFrame: + """ + Gets the site dataframe for the custom grid. + + Parameters + ---------- + nzgmdb_version : NZGMDBVersion, optional + The NZGMDB version to include real stations from. + vel_model_version : str, optional + The velocity model version to use for basin membership + and Z-values. Is only used if the basin spacing filter + has not already been added to the custom grid. + + Returns + ------- + pd.DataFrame + The site dataframe with columns: + - site_id: The site ID. + - lon: The site longitude. + - lat: The site latitude. + - nztm_x: The site NZTM X coordinate. + - nztm_y: The site NZTM Y coordinate. + - source: "virtual" for virtual sites, "real" for NZGMDB sites. + - region_name: The name of the region the site is in. + - region_code: The code of the region the site is in. + - territory_name: The name of the territory the site is in. + """ + logger.info("Getting site dataframe...") + site_df = pd.DataFrame( + { + "general_site_id": self.general_grid.site_ids[self.mask], + "lon": self.general_grid.grid_lon[self.mask], + "lat": self.general_grid.grid_lat[self.mask], + "nztm_x": self.general_grid.grid_nztm_x[self.mask], + "nztm_y": self.general_grid.grid_nztm_y[self.mask], + "source": "virtual", + } + ).set_index("general_site_id") + + # Add region + logger.info("Adding region information...") + region_df = ( + gpd.read_parquet(GRID_DATA.fetch("nz_regions.parquet")) + .to_crs(NZTM_CRS) + .astype({"name": "category", "code": "category"}) + ) - def get_site_df(self): - site_ids = self.general_grid.site_ids[self.mask] - lons = self.general_grid.grid_lon[self.mask] - lats = self.general_grid.grid_lat[self.mask] + site_points_df = gpd.GeoDataFrame( + site_df[["nztm_x", "nztm_y"]], + geometry=gpd.points_from_xy(site_df.nztm_x, site_df.nztm_y), + crs=NZTM_CRS, + ) + joined = gpd.sjoin(site_points_df, region_df, how="left", predicate="within") + site_df["region_name"] = joined["name"] + site_df["region_code"] = joined["code"] + + # Add territory + logger.info("Adding territory information...") + territory_df = ( + gpd.read_parquet(GRID_DATA.fetch("nz_territory.parquet")) + .to_crs(NZTM_CRS) + .astype({"name": "category"}) + ) + joined = gpd.sjoin( + site_points_df, + territory_df, + how="left", + predicate="within", + ) + site_df["territory_name"] = joined["name"] + + # Add basin membership & Z-values + if self._vel_model_version or vel_model_version: + if ( + self._vel_model_version + and vel_model_version + and self._vel_model_version != vel_model_version + ): + err_msg = "Basin spacing filter has been added with a different velocity model version." + logger.error(err_msg) + raise ValueError(err_msg) + vel_model_version = self._vel_model_version or vel_model_version + + # Basin membership + logger.info("Adding basin membership information...") + basin_boundaries = get_basin_boundaries(vel_model_version) + basin_df = gpd.GeoDataFrame( + {"basin": list(basin_boundaries.keys())}, + geometry=list(basin_boundaries.values()), + crs=WSG84_CRS, + ).to_crs(NZTM_CRS) + joined = gpd.sjoin( + site_points_df, + basin_df, + how="left", + predicate="within", + ) + # Keep first basin if in multiple basins + joined = joined.groupby(level=0).first() + site_df["basin"] = joined["basin"] + + # Get Z-values + logger.info("Adding Z-values...") + start = time.time() + z_values =compute_station_thresholds(site_df, model_version=vel_model_version, show_progress=False) + logger.info(f"Took: {time.time() - start} to compute Z-values") + site_df["Z1.0"] = z_values["Z_1.0(km)"] + site_df["Z2.5"] = z_values["Z_2.5(km)"] + + # Add site ids + logger.info("Adding site IDs...") + site_df["site_id"] = np.char.add( + latlon_ids(site_df.lat.values, site_df.lon.values, length=5), + site_df.region_code.astype(str).values, + ) + site_df = site_df.set_index("site_id") + + # Add NZGMDB sites + if nzgmdb_version is not None: + logger.info("Adding NZGMDB sites...") + nzgmdb_site_df = pd.read_csv( + GRID_DATA.fetch(NZGMDB_VERSION_TO_TABLE_NAME[nzgmdb_version]), + index_col="sta", + usecols=["sta", "lat", "lon"], + ) + nzgmdb_site_df["source"] = "real" + nzgmdb_nztm_values = coordinates.wgs_depth_to_nztm( + nzgmdb_site_df[["lat", "lon"]].values + ) + nzgmdb_site_df["nztm_x"] = nzgmdb_nztm_values[:, 1] + nzgmdb_site_df["nztm_y"] = nzgmdb_nztm_values[:, 0] - site_df = pd.DataFrame( - {"site_id": site_ids, "lon": lons, "lat": lats, "in_basin": False} - ).set_index("site_id") + site_df = pd.concat([site_df, nzgmdb_site_df], axis=0) - if self._in_basin_mask is not None: - site_df["in_basin"] = self._in_basin_mask[self.mask] + site_df = site_df.astype({"source": "category"}) + # Sanity check + assert site_df.index.is_unique, "Site IDs are not unique!" return site_df -def get_basin_boundaries(vel_model_version: str): +def get_basin_boundaries(vel_model_version: str) -> dict[str, shapely.Polygon]: + """Gets the basin boundaries for a given velocity model version.""" cvm_registry = CVMRegistry(vel_model_version, get_data_root()) basin_data = cvm_registry.load_basin_data(cvm_registry.global_params["basins"]) @@ -168,3 +448,30 @@ def get_basin_boundaries(vel_model_version: str): ) return basin_boundaries + + +def latlon_ids(lat: np.ndarray, lon: np.ndarray, length: int = 4) -> np.ndarray: + """Generate a short unique ID from latitude and longitude arrays. + + Parameters + ---------- + lat : np.ndarray + Array of latitude values. + lon : np.ndarray + Array of longitude values. + length : int, optional + Length of the hash ID, by default 4. + + Returns + ------- + np.ndarray + Array of hash IDs. + """ + + ids = np.empty(lat.shape, dtype=f"U{length}") + for i, (lat_val, lon_val) in enumerate(zip(lat.ravel(), lon.ravel())): + text = f"{lat_val:.6f},{lon_val:.6f}" + h = hashlib.sha256(text.encode()) + ids[i] = base64.urlsafe_b64encode(h.digest()).decode()[:length] + + return ids From c496e3a9e8f3c66ebe31edf6014d06ebe345940d Mon Sep 17 00:00:00 2001 From: Claudio Date: Mon, 20 Oct 2025 14:27:38 +1300 Subject: [PATCH 05/21] Renaming. --- wiki/{Virtual-Sites.md => Site-Generation.md} | 6 +++--- workflow/scripts/{virt_sites_cmds.py => site_gen_cmds.py} | 8 ++++---- workflow/{virt_sites_grid.py => site_gen.py} | 0 3 files changed, 7 insertions(+), 7 deletions(-) rename wiki/{Virtual-Sites.md => Site-Generation.md} (90%) rename workflow/scripts/{virt_sites_cmds.py => site_gen_cmds.py} (97%) rename workflow/{virt_sites_grid.py => site_gen.py} (100%) diff --git a/wiki/Virtual-Sites.md b/wiki/Site-Generation.md similarity index 90% rename from wiki/Virtual-Sites.md rename to wiki/Site-Generation.md index 310b668c..6260b74e 100644 --- a/wiki/Virtual-Sites.md +++ b/wiki/Site-Generation.md @@ -8,8 +8,8 @@ Instead a high density uniform "general" grid has been developed, and the code i ### Custom Grid Generation -A custom grid can be generated using the 'virt_sites_cmds.py' script using the 'gen-custom-grid-from-rel' command. -For details/help on the command run 'python virt_sites_cmds.py gen-custom-grid-from-rel --help' +A custom grid can be generated using the `site_gen_cmds.py` script using the `gen-custom-grid-from-rel` command. +For details/help on the command run `python site_gen_cmds.py gen-custom-grid-from-rel --help` **Note that you will need about 16Gb of free memory.** Running the custom grid generation will produce a parquet file with the following columns: @@ -50,7 +50,7 @@ in addition to some additional information such as: ### Plot Generation Visualisation of the custom grid can be done using the 'gen-plot' command. -For details/help 'run python virt_sites_cmds.py gen-plot --help'. +For details/help run `python site_gen_cmds.py gen-plot --help`. Running this will produce a html file that can be viewed in any browser. Note that for large domains and high density grids this might be slow to load. diff --git a/workflow/scripts/virt_sites_cmds.py b/workflow/scripts/site_gen_cmds.py similarity index 97% rename from workflow/scripts/virt_sites_cmds.py rename to workflow/scripts/site_gen_cmds.py index ab9a873c..fa71883b 100644 --- a/workflow/scripts/virt_sites_cmds.py +++ b/workflow/scripts/site_gen_cmds.py @@ -14,7 +14,7 @@ from qcore import cli, coordinates from workflow.realisations import DomainParameters, SourceConfig -from workflow.virt_sites_grid import ( +from workflow.site_gen import ( GRID_DATA, CustomGrid, NZGMDBVersion, @@ -142,10 +142,9 @@ def gen_custom_grid_from_rel( @cli.from_docstring(app) def gen_plot( site_df_ffp: Annotated[Path, typer.Argument()], + site_df_meta_ffp: Annotated[Path, typer.Argument()], output_ffp: Annotated[Path, typer.Argument()], rel_ffp: Annotated[Path | None, typer.Option()] = None, - vel_model_version: Annotated[str | None, typer.Option()] = None, - nzgmdb_site_ffp: Annotated[Path | None, typer.Option()] = None, ) -> None: """ Generate a plot of the custom grid. @@ -164,6 +163,7 @@ def gen_plot( If provided, the basin boundaries will be plotted. """ site_df = pd.read_parquet(site_df_ffp) + metadata = yaml.load(site_df_meta_ffp.open("r"), Loader=yaml.FullLoader) fig = go.Figure() @@ -207,7 +207,7 @@ def gen_plot( ) ) - if vel_model_version is not None: + if (vel_model_version := metadata.get("vel_model_version")) is not None: # Plot the basin boundaries basin_boundaries = get_basin_boundaries(vel_model_version) basin_line_properties = dict(color="red", width=1) diff --git a/workflow/virt_sites_grid.py b/workflow/site_gen.py similarity index 100% rename from workflow/virt_sites_grid.py rename to workflow/site_gen.py From f3e0e950f5a06a5bcca0d999b3fdaed6ac2509f8 Mon Sep 17 00:00:00 2001 From: Claudio Date: Mon, 20 Oct 2025 14:31:17 +1300 Subject: [PATCH 06/21] Documentation update --- wiki/Site-Generation.md | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/wiki/Site-Generation.md b/wiki/Site-Generation.md index 6260b74e..fa6edc5e 100644 --- a/wiki/Site-Generation.md +++ b/wiki/Site-Generation.md @@ -16,7 +16,7 @@ Running the custom grid generation will produce a parquet file with the followin | Column | Description | Notes | |--------|-------------|-------| -| `site_id` | Unique identifier for each virtual site | Made up as follows `{4 character lat/lon hash}{2 character region code}` | +| `site_id` | Unique identifier for each virtual site | `{4 character lat/lon hash}{2 character region code}`, e.g. "ijSBAOT", is a site in Otago as last two characters are "OT" | | `lon` | Longitude coordinate in decimal degrees | | | `lat` | Latitude coordinate in decimal degrees | | | `nztm_x` | X coordinate in New Zealand Transverse Mercator (NZTM) projection | | @@ -27,6 +27,24 @@ Running the custom grid generation will produce a parquet file with the followin | `Z2.5` | Depth to 2.5 km/s shear-wave velocity (meters) | | | `vs30` | Time-averaged shear-wave velocity in the top 30 meters (m/s) | | +Region code mapping: + +| Region | Code | +|--------|------| +| North Auckland | NA | +| South Auckland | SA | +| Hawkes Bay | HB | +| Gisborne | GI | +| Taranaki | TA | +| Wellington | WE | +| Nelson | NE | +| Marlborough | MA | +| Westland | WL | +| Canterbury | CA | +| Otago | OT | +| Southland | SL | + + Additionally, it will also produce a metadata file that contains the setting used to generate the custom grid: | Setting | Type | Description | From e3e4a40dda74e2490a81f6bb7be5d9d9fc831032 Mon Sep 17 00:00:00 2001 From: Claudio Date: Wed, 22 Oct 2025 14:56:07 +1300 Subject: [PATCH 07/21] Fixed site name generation, bug fixes, plotting improvments, added support for generating NZ wide custom grid. --- workflow/scripts/site_gen_cmds.py | 63 ++++++++++------ workflow/site_gen.py | 120 ++++++++++++++++++++---------- 2 files changed, 122 insertions(+), 61 deletions(-) diff --git a/workflow/scripts/site_gen_cmds.py b/workflow/scripts/site_gen_cmds.py index fa71883b..6d1ce5d7 100644 --- a/workflow/scripts/site_gen_cmds.py +++ b/workflow/scripts/site_gen_cmds.py @@ -2,7 +2,6 @@ from pathlib import Path from typing import Annotated -import yaml import geopandas as gpd import numpy as np import pandas as pd @@ -11,6 +10,7 @@ import shapely import typer import xarray as xr +import yaml from qcore import cli, coordinates from workflow.realisations import DomainParameters, SourceConfig @@ -24,7 +24,7 @@ # Configure logging logging.basicConfig( level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + format="%(asctime)s - %(name)s - %(funcName)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler()], force=True, ) @@ -81,21 +81,19 @@ def gen_general_grid(grid_spacing: str, output_ffp: Path) -> None: @cli.from_docstring(app) -def gen_custom_grid_from_rel( - rel_ffp: Annotated[Path, typer.Argument()], +def gen_custom_grid( uniform_grid_spacing: Annotated[int, typer.Argument()], output_ffp: Annotated[Path, typer.Argument()], basin_spacing: Annotated[int | None, typer.Option()] = None, vel_model_version: Annotated[str | None, typer.Option()] = None, nzgmdb_version: Annotated[NZGMDBVersion | None, typer.Option()] = None, + rel_ffp: Annotated[Path | None, typer.Option()] = None, ) -> None: """ - Generate a custom grid from a realisation config. + Generate a NZ-wide custom grid. Parameters ---------- - rel_ffp : Path - The path to the realisation config. uniform_grid_spacing : int The uniform grid spacing in metres. This must be a multiple of the general grid spacing. @@ -112,15 +110,17 @@ def gen_custom_grid_from_rel( then this velocity model version will be used to set basin membership in the output site dataframe. """ - domain_config = DomainParameters.read_from_realisation(rel_ffp) - - region = shapely.Polygon(domain_config.domain.corners) - custom_grid = ( - CustomGrid() - .add_land_only_filter() - .add_region_filter(region) - .add_uniform_spacing_filter(uniform_grid_spacing) - ) + custom_grid = CustomGrid().add_land_only_filter() + + # If a realisation file is provided, use its domain to filter the grid + if rel_ffp is not None: + domain_config = DomainParameters.read_from_realisation(rel_ffp) + region = shapely.Polygon(domain_config.domain.corners) + custom_grid.add_region_filter(region) + + custom_grid.add_uniform_spacing_filter(uniform_grid_spacing) + + # Add basin spacing filter if specified if basin_spacing is not None: if vel_model_version is None: raise ValueError( @@ -128,12 +128,14 @@ def gen_custom_grid_from_rel( ) custom_grid.add_basin_spacing_filter(vel_model_version, basin_spacing) + # Generate site dataframe and save site_df = custom_grid.get_site_df( nzgmdb_version=nzgmdb_version, vel_model_version=vel_model_version if not basin_spacing else None, ) site_df.to_parquet(output_ffp) + # Save metadata metadata = custom_grid.get_metadata(site_df) with (output_ffp.parent / f"{output_ffp.stem}_metadata.yaml").open("w") as meta_ffp: yaml.dump(metadata, meta_ffp) @@ -176,13 +178,21 @@ def gen_plot( marker=dict(color="blue", size=4, symbol="circle"), hoverinfo="skip", hovertemplate=( - "Site ID: %{customdata[0]}
" - "Lat: %{lat:.6f}
" - "Lon: %{lon:.6f}
" - "" + "Site ID: %{customdata[0]}
" + "Lat: %{lat:.6f}
" + "Lon: %{lon:.6f}
" + "Z1.0: %{customdata[1]:.3f} km
" + "Z2.5: %{customdata[2]:.3f} km
" + "Vs30: %{customdata[3]:.1f} m/s
" + "" ), customdata=np.asarray( - [site_df.loc[virt_sites_mask].index.values.astype(str)] + [ + site_df.loc[virt_sites_mask].index.values.astype(str), + site_df.loc[virt_sites_mask, "Z1.0"].values, + site_df.loc[virt_sites_mask, "Z2.5"].values, + site_df.loc[virt_sites_mask, "Vs30"].values, + ] ).T, ) ) @@ -199,10 +209,18 @@ def gen_plot( "Site ID: %{customdata[0]}
" "Lat: %{lat:.6f}
" "Lon: %{lon:.6f}
" + "Z1.0: %{customdata[1]:.3f} km
" + "Z2.5: %{customdata[2]:.3f} km
" + "Vs30: %{customdata[3]:.1f} m/s
" "" ), customdata=np.asarray( - [site_df.loc[real_sites_mask].index.values.astype(str)] + [ + site_df.loc[real_sites_mask].index.values.astype(str), + site_df.loc[real_sites_mask, "Z1.0"].values, + site_df.loc[real_sites_mask, "Z2.5"].values, + site_df.loc[real_sites_mask, "Vs30"].values, + ] ).T, ) ) @@ -285,4 +303,3 @@ def gen_plot( if __name__ == "__main__": app() - app() diff --git a/workflow/site_gen.py b/workflow/site_gen.py index 7c33eb06..7de69dcc 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -1,8 +1,7 @@ -import time -import base64 import enum -import hashlib import logging +import string +import time from pathlib import Path import geopandas as gpd @@ -77,7 +76,9 @@ class GeneralGrid: def __init__(self, land_mask_grid: xr.DataArray, spacing: int) -> "GeneralGrid": """Initialize GeneralGrid.""" self.land_mask_grid = land_mask_grid - self.site_ids = np.arange(land_mask_grid.size).reshape(land_mask_grid.shape) + self.site_ids = np.arange(land_mask_grid.size, dtype=np.uint32).reshape( + land_mask_grid.shape + ) self.spacing = spacing self.grid_lat, self.grid_lon = xr.broadcast( @@ -125,7 +126,7 @@ def __init__(self) -> "CustomGrid": self._region = None self._in_basin_mask = None self._vel_model_version = None - + @property def mask(self) -> np.ndarray: """ @@ -146,6 +147,7 @@ def add_region_filter(self, region: shapely.Polygon) -> "CustomGrid": The region polygon in WGS84 coordinates. """ logger.info("Adding region filter to custom grid...") + start_time = time.time() if self._region is not None: logger.error("Region filter has already been added.") raise @@ -160,6 +162,7 @@ def add_region_filter(self, region: shapely.Polygon) -> "CustomGrid": self.general_grid.grid_nztm_x.ravel(), ).reshape(self.general_grid.shape) self._and_mask &= region_mask + logger.info(f"Region filter added in {time.time() - start_time} seconds.") return self @@ -175,6 +178,7 @@ def add_uniform_spacing_filter(self, spacing: int) -> "CustomGrid": The uniform grid spacing in metres. """ logger.info("Adding uniform spacing filter...") + start_time = time.time() if self._uniform_spacing is not None: logger.error("Uniform spacing filter has already been added.") raise @@ -191,20 +195,25 @@ def add_uniform_spacing_filter(self, spacing: int) -> "CustomGrid": ) raise self._uniform_spacing = spacing - + idx_interval = spacing // self.general_grid.spacing spacing_mask = np.zeros(self.general_grid.shape, dtype=bool) spacing_mask[::idx_interval, ::idx_interval] = True self._or_mask &= spacing_mask + logger.info( + f"Uniform spacing filter added in {time.time() - start_time} seconds." + ) return self def add_land_only_filter(self) -> "CustomGrid": """Adds a land only filter, only land sites are kept.""" logger.info("Adding land only filter...") + start_time = time.time() self._land_only = True self._and_mask &= self.general_grid.land_mask_grid.values.astype(bool) + logger.info(f"Land only filter added in {time.time() - start_time} seconds.") return self def add_basin_spacing_filter( @@ -222,6 +231,7 @@ def add_basin_spacing_filter( The grid spacing in metres to use within basins. """ logger.info("Adding basin spacing filter...") + start_time = time.time() if ( self._vel_model_version is not None and vel_model_version != self._vel_model_version @@ -256,8 +266,11 @@ def add_basin_spacing_filter( spacing_mask[::idx_interval, ::idx_interval] = True self._or_mask |= self._in_basin_mask & spacing_mask + logger.info( + f"Basin spacing filter added in {time.time() - start_time} seconds." + ) return self - + def get_metadata(self, site_df: pd.DataFrame = None) -> dict: """ Gets the metadata dictionary for the custom grid. @@ -272,11 +285,14 @@ def get_metadata(self, site_df: pd.DataFrame = None) -> dict: "num_sites": len(site_df), "num_virtual_sites": len(site_df[site_df.source == "virtual"]), "num_real_sites": len(site_df[site_df.source == "real"]), - "num_basin_sites": len(site_df[site_df.basin.notna()]), } # Number of sites per basin - site_metadata["sites_per_basin"] = site_df.groupby("basin").size().to_dict() + if "basin" in site_df.columns: + site_metadata["num_basin_sites"] = len(site_df[site_df.basin.notna()]) + site_metadata["sites_per_basin"] = ( + site_df.groupby("basin").size().to_dict() + ) metadata = site_metadata | { "land_only": self._land_only, @@ -316,6 +332,9 @@ def get_site_df( - region_name: The name of the region the site is in. - region_code: The code of the region the site is in. - territory_name: The name of the territory the site is in. + - basin: The basin the site is in (if any). + - Z1.0: The Z1.0 value for the site (if vel_model_version is provided). + - Z2.5: The Z2.5 value for the site (if vel_model_version is provided). """ logger.info("Getting site dataframe...") site_df = pd.DataFrame( @@ -331,6 +350,7 @@ def get_site_df( # Add region logger.info("Adding region information...") + start = time.time() region_df = ( gpd.read_parquet(GRID_DATA.fetch("nz_regions.parquet")) .to_crs(NZTM_CRS) @@ -343,11 +363,15 @@ def get_site_df( crs=NZTM_CRS, ) joined = gpd.sjoin(site_points_df, region_df, how="left", predicate="within") + # Keep first region if in multiple regions + joined = joined.groupby(level=0).first() site_df["region_name"] = joined["name"] site_df["region_code"] = joined["code"] + logger.info(f"Took: {time.time() - start} ") # Add territory logger.info("Adding territory information...") + start = time.time() territory_df = ( gpd.read_parquet(GRID_DATA.fetch("nz_territory.parquet")) .to_crs(NZTM_CRS) @@ -360,9 +384,13 @@ def get_site_df( predicate="within", ) site_df["territory_name"] = joined["name"] + # Keep first territory if in multiple territories + site_df["territory_name"] = site_df["territory_name"].groupby(level=0).first() + logger.info(f"Took: {time.time() - start} ") # Add basin membership & Z-values if self._vel_model_version or vel_model_version: + start = time.time() if ( self._vel_model_version and vel_model_version @@ -390,30 +418,39 @@ def get_site_df( # Keep first basin if in multiple basins joined = joined.groupby(level=0).first() site_df["basin"] = joined["basin"] + logger.info(f"Took: {time.time() - start} to add basin membership") # Get Z-values - logger.info("Adding Z-values...") start = time.time() - z_values =compute_station_thresholds(site_df, model_version=vel_model_version, show_progress=False) - logger.info(f"Took: {time.time() - start} to compute Z-values") + logger.info("Adding Z-values...") + logging.getLogger("nzcvm.threshold").setLevel(logging.WARNING) + z_values = compute_station_thresholds( + site_df, + model_version=vel_model_version, + show_progress=False, + include_sigma=False, + logger=logger, + ) site_df["Z1.0"] = z_values["Z_1.0(km)"] site_df["Z2.5"] = z_values["Z_2.5(km)"] + logger.info(f"Took: {time.time() - start} to add Z-values") # Add site ids - logger.info("Adding site IDs...") - site_df["site_id"] = np.char.add( - latlon_ids(site_df.lat.values, site_df.lon.values, length=5), + logger.info("Adding site code...") + site_df["site_code"] = np.char.add( + encode_base62_fixed_array(site_df.index.values, length=5), site_df.region_code.astype(str).values, ) - site_df = site_df.set_index("site_id") + site_df = site_df.set_index("site_code") # Add NZGMDB sites if nzgmdb_version is not None: + start = time.time() logger.info("Adding NZGMDB sites...") nzgmdb_site_df = pd.read_csv( GRID_DATA.fetch(NZGMDB_VERSION_TO_TABLE_NAME[nzgmdb_version]), index_col="sta", - usecols=["sta", "lat", "lon"], + usecols=["sta", "lat", "lon", "Vs30", "Z1.0", "Z2.5"], ) nzgmdb_site_df["source"] = "real" nzgmdb_nztm_values = coordinates.wgs_depth_to_nztm( @@ -421,8 +458,22 @@ def get_site_df( ) nzgmdb_site_df["nztm_x"] = nzgmdb_nztm_values[:, 1] nzgmdb_site_df["nztm_y"] = nzgmdb_nztm_values[:, 0] + # Convert Z1.0 to km + nzgmdb_site_df["Z1.0"] /= 1000 + + if self._region is not None: + region_nztm = shapely.transform( + self._region, lambda x: coordinates.wgs_depth_to_nztm(x) + ) + region_mask = shapely.contains_xy( + region_nztm, + nzgmdb_site_df["nztm_y"].values, + nzgmdb_site_df["nztm_x"].values, + ) + nzgmdb_site_df = nzgmdb_site_df[region_mask] site_df = pd.concat([site_df, nzgmdb_site_df], axis=0) + logger.info(f"Took: {time.time() - start} to add NZGMDB sites") site_df = site_df.astype({"source": "category"}) @@ -450,28 +501,21 @@ def get_basin_boundaries(vel_model_version: str) -> dict[str, shapely.Polygon]: return basin_boundaries -def latlon_ids(lat: np.ndarray, lon: np.ndarray, length: int = 4) -> np.ndarray: - """Generate a short unique ID from latitude and longitude arrays. +def encode_base62_fixed_array(nums: np.ndarray, length: int) -> np.ndarray: + """Vectorized Base62 encoder for many integers.""" + alphabet = np.array(list(string.digits + string.ascii_letters)) + alphabet_size = len(alphabet) - Parameters - ---------- - lat : np.ndarray - Array of latitude values. - lon : np.ndarray - Array of longitude values. - length : int, optional - Length of the hash ID, by default 4. + if np.any(nums >= alphabet_size**length): + raise ValueError(f"Some numbers too large for {length}-char Base62 ID.") - Returns - ------- - np.ndarray - Array of hash IDs. - """ + # Create empty char array: shape (N, MAX_LEN) + out = np.empty((nums.size, length), dtype=" Date: Thu, 23 Oct 2025 12:39:03 +1300 Subject: [PATCH 08/21] Switched to config based approach. --- workflow/scripts/site_gen_cmds.py | 63 ++++++---- workflow/site_gen.py | 193 +++++++++++++++++++----------- 2 files changed, 163 insertions(+), 93 deletions(-) diff --git a/workflow/scripts/site_gen_cmds.py b/workflow/scripts/site_gen_cmds.py index 6d1ce5d7..e34aef44 100644 --- a/workflow/scripts/site_gen_cmds.py +++ b/workflow/scripts/site_gen_cmds.py @@ -17,6 +17,7 @@ from workflow.site_gen import ( GRID_DATA, CustomGrid, + CustomGridConfig, NZGMDBVersion, get_basin_boundaries, ) @@ -82,8 +83,9 @@ def gen_general_grid(grid_spacing: str, output_ffp: Path) -> None: @cli.from_docstring(app) def gen_custom_grid( - uniform_grid_spacing: Annotated[int, typer.Argument()], output_ffp: Annotated[Path, typer.Argument()], + config_ffp: Annotated[Path | None, typer.Option()] = None, + uniform_grid_spacing: Annotated[int | None, typer.Option()] = None, basin_spacing: Annotated[int | None, typer.Option()] = None, vel_model_version: Annotated[str | None, typer.Option()] = None, nzgmdb_version: Annotated[NZGMDBVersion | None, typer.Option()] = None, @@ -94,45 +96,58 @@ def gen_custom_grid( Parameters ---------- - uniform_grid_spacing : int - The uniform grid spacing in metres. - This must be a multiple of the general grid spacing. output_ffp : Path The path to save the output parquet file. + config_ffp : Path, optional + The path to the custom grid config file. + If provided, all other config options will be ignored. + uniform_grid_spacing : int, optional + The uniform grid spacing in metres. + This must be a multiple of the general grid spacing. + Ignored if a config file is provided. basin_spacing : int, optional The grid spacing in metres to use within basins. This must be a multiple of the general grid spacing. If not provided, no basin spacing will be applied. + Ignored if a config file is provided. vel_model_version : str, optional The velocity model version to use for basin spacing. This must be provided if basin_spacing is specified. If basin_spacing is not provided, and this is provided, then this velocity model version will be used to set basin membership in the output site dataframe. + nzgmdb_version : NZGMDBVersion, optional + The NZGMDB version to use for site parameters. + Ignored if a config file is provided. + rel_ffp : Path, optional + The path to the realisation config. + If provided, the domain will be used to filter the grid. + Ignored if a config file is provided. """ - custom_grid = CustomGrid().add_land_only_filter() + if config_ffp is not None: + logger.info(f"Reading custom grid config from {config_ffp}...") + config_dict = yaml.safe_load(config_ffp.open("r")) + grid_config = CustomGridConfig.from_dict(config_dict) + rel_ffp = config_dict.get("rel_ffp") + else: + logger.info("Generating custom grid config from command line options...") + grid_config = CustomGridConfig( + land_only=True, + uniform_spacing=uniform_grid_spacing, + vel_model_version=vel_model_version, + basin_spacing=basin_spacing, + ) # If a realisation file is provided, use its domain to filter the grid if rel_ffp is not None: domain_config = DomainParameters.read_from_realisation(rel_ffp) - region = shapely.Polygon(domain_config.domain.corners) - custom_grid.add_region_filter(region) + region = shapely.Polygon(domain_config.domain.corners[:, ::-1]) + grid_config.region = region - custom_grid.add_uniform_spacing_filter(uniform_grid_spacing) - - # Add basin spacing filter if specified - if basin_spacing is not None: - if vel_model_version is None: - raise ValueError( - "vel_model_version must be provided if basin_spacing is provided." - ) - custom_grid.add_basin_spacing_filter(vel_model_version, basin_spacing) + custom_grid = CustomGrid().apply_config(grid_config) # Generate site dataframe and save - site_df = custom_grid.get_site_df( - nzgmdb_version=nzgmdb_version, - vel_model_version=vel_model_version if not basin_spacing else None, - ) + site_df = custom_grid.get_site_df() site_df.to_parquet(output_ffp) # Save metadata @@ -165,7 +180,7 @@ def gen_plot( If provided, the basin boundaries will be plotted. """ site_df = pd.read_parquet(site_df_ffp) - metadata = yaml.load(site_df_meta_ffp.open("r"), Loader=yaml.FullLoader) + config_dict = yaml.load(site_df_meta_ffp.open("r"), Loader=yaml.FullLoader)["config"] fig = go.Figure() @@ -181,6 +196,7 @@ def gen_plot( "Site ID: %{customdata[0]}
" "Lat: %{lat:.6f}
" "Lon: %{lon:.6f}
" + "Basin: %{customdata[4]}
" "Z1.0: %{customdata[1]:.3f} km
" "Z2.5: %{customdata[2]:.3f} km
" "Vs30: %{customdata[3]:.1f} m/s
" @@ -192,6 +208,7 @@ def gen_plot( site_df.loc[virt_sites_mask, "Z1.0"].values, site_df.loc[virt_sites_mask, "Z2.5"].values, site_df.loc[virt_sites_mask, "Vs30"].values, + site_df.loc[virt_sites_mask, "basin"].values, ] ).T, ) @@ -209,6 +226,7 @@ def gen_plot( "Site ID: %{customdata[0]}
" "Lat: %{lat:.6f}
" "Lon: %{lon:.6f}
" + "Basin: %{customdata[4]}
" "Z1.0: %{customdata[1]:.3f} km
" "Z2.5: %{customdata[2]:.3f} km
" "Vs30: %{customdata[3]:.1f} m/s
" @@ -220,12 +238,13 @@ def gen_plot( site_df.loc[real_sites_mask, "Z1.0"].values, site_df.loc[real_sites_mask, "Z2.5"].values, site_df.loc[real_sites_mask, "Vs30"].values, + site_df.loc[real_sites_mask, "basin"].values, ] ).T, ) ) - if (vel_model_version := metadata.get("vel_model_version")) is not None: + if (vel_model_version := config_dict.get("vel_model_version")) is not None: # Plot the basin boundaries basin_boundaries = get_basin_boundaries(vel_model_version) basin_line_properties = dict(color="red", width=1) diff --git a/workflow/site_gen.py b/workflow/site_gen.py index 7de69dcc..6bf4fab5 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -2,6 +2,7 @@ import logging import string import time +from dataclasses import dataclass from pathlib import Path import geopandas as gpd @@ -107,6 +108,63 @@ def load(cls, land_mask_grid_ffp: Path) -> "GeneralGrid": return cls(xr.load_dataarray(land_mask_grid_ffp, engine="h5netcdf"), 50) +@dataclass +class CustomGridConfig: + """Configuration for CustomGrid.""" + + land_only: bool | None = None + """Whether to only include land sites.""" + + region: shapely.Polygon | None = None + """Region polygon in WGS84 coordinates (lon/lat).""" + + uniform_spacing: int | None = None + """Uniform grid spacing in metres.""" + + vel_model_version: str | None = None + """Velocity model version for basin spacing.""" + + basin_spacing: int | None = None + """Grid spacing in metres within basins.""" + + per_basin_spacing: dict[str, int] | None = None + """Per-basin specific spacing in metres.""" + + nzgmdb_version: NZGMDBVersion | None = None + """NZGMDB version for real stations.""" + + @classmethod + def from_dict(cls, config_dict: dict) -> "CustomGridConfig": + """Create CustomGridConfig from dictionary.""" + return cls( + land_only=config_dict.get("land_only"), + region=config_dict.get("region"), + uniform_spacing=config_dict.get("uniform_spacing"), + vel_model_version=config_dict.get("vel_model_version"), + basin_spacing=config_dict.get("basin_spacing"), + per_basin_spacing=config_dict.get("per_basin_spacing"), + nzgmdb_version=NZGMDBVersion(config_dict.get("nzgmdb_version")), + ) + + def as_dict(self) -> dict: + """Convert CustomGridConfig to dictionary.""" + return { + "land_only": self.land_only, + "region": ( + shapely.geometry.mapping(self.region) + if self.region is not None + else None + ), + "uniform_spacing": self.uniform_spacing, + "vel_model_version": self.vel_model_version, + "basin_spacing": self.basin_spacing, + "per_basin_spacing": self.per_basin_spacing, + "nzgmdb_version": ( + self.nzgmdb_version.value if self.nzgmdb_version is not None else None + ), + } + + class CustomGrid: """ Represents a custom grid of virtual sites @@ -118,14 +176,13 @@ def __init__(self) -> "CustomGrid": logger.info("Loading general grid...") self.general_grid = GeneralGrid.load(GRID_DATA.fetch("general_grid.nc")) + self._reset() + + def _reset(self) -> None: + """Resets the custom grid.""" self._and_mask = np.ones(self.general_grid.shape, dtype=bool) self._or_mask = np.ones(self.general_grid.shape, dtype=bool) - - self._land_only = None - self._uniform_spacing = None - self._region = None - self._in_basin_mask = None - self._vel_model_version = None + self.config = None @property def mask(self) -> np.ndarray: @@ -135,7 +192,42 @@ def mask(self) -> np.ndarray: """ return self._and_mask & self._or_mask - def add_region_filter(self, region: shapely.Polygon) -> "CustomGrid": + def apply_config(self, config: CustomGridConfig) -> "CustomGrid": + """ + Applies a CustomGridConfig to the CustomGrid. + Resets any previously applied filters. + """ + self._reset() + self.config = config + + if self.config.land_only: + self._add_land_only_filter() + if self.config.region is not None: + self._add_region_filter(self.config.region) + if self.config.uniform_spacing is not None: + self._add_uniform_spacing_filter(self.config.uniform_spacing) + if ( + self.config.vel_model_version is not None + and self.config.basin_spacing is not None + ): + self._add_basin_spacing_filter( + self.config.vel_model_version, self.config.basin_spacing + ) + if self.config.per_basin_spacing is not None: + # Group by spacing + basin_spacing_series = pd.Series(self.config.per_basin_spacing) + spacing_groups = basin_spacing_series.groupby(basin_spacing_series) + + for spacing, basins in spacing_groups.groups.items(): + self._add_basin_spacing_filter( + self.config.vel_model_version, + spacing, + basins.tolist(), + ) + + return self + + def _add_region_filter(self, region: shapely.Polygon) -> None: """ Adds a region filter. Only sites within the region are kept. This is an AND filter, i.e., it removes sites outside the region. @@ -144,29 +236,23 @@ def add_region_filter(self, region: shapely.Polygon) -> "CustomGrid": Parameters ---------- region : shapely.Polygon - The region polygon in WGS84 coordinates. + The region polygon in WGS84 coordinates (lon/lat). """ logger.info("Adding region filter to custom grid...") start_time = time.time() - if self._region is not None: - logger.error("Region filter has already been added.") - raise - self._region = region # Convert to NZTM - region = shapely.transform(region, lambda x: coordinates.wgs_depth_to_nztm(x)) + region_nztm = shapely.transform(region, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1])) region_mask = shapely.contains_xy( - region, + region_nztm, self.general_grid.grid_nztm_y.ravel(), self.general_grid.grid_nztm_x.ravel(), ).reshape(self.general_grid.shape) self._and_mask &= region_mask logger.info(f"Region filter added in {time.time() - start_time} seconds.") - return self - - def add_uniform_spacing_filter(self, spacing: int) -> "CustomGrid": + def _add_uniform_spacing_filter(self, spacing: int) -> None: """ Adds a uniform spacing filter. This is an OR filter, i.e., it only adds sites. @@ -179,9 +265,6 @@ def add_uniform_spacing_filter(self, spacing: int) -> "CustomGrid": """ logger.info("Adding uniform spacing filter...") start_time = time.time() - if self._uniform_spacing is not None: - logger.error("Uniform spacing filter has already been added.") - raise if spacing < self.general_grid.spacing: logger.error( "Uniform spacing must be greater than or" @@ -194,7 +277,6 @@ def add_uniform_spacing_filter(self, spacing: int) -> "CustomGrid": f" the general grid spacing {self.general_grid.spacing}." ) raise - self._uniform_spacing = spacing idx_interval = spacing // self.general_grid.spacing spacing_mask = np.zeros(self.general_grid.shape, dtype=bool) @@ -204,21 +286,17 @@ def add_uniform_spacing_filter(self, spacing: int) -> "CustomGrid": logger.info( f"Uniform spacing filter added in {time.time() - start_time} seconds." ) - return self - def add_land_only_filter(self) -> "CustomGrid": + def _add_land_only_filter(self) -> None: """Adds a land only filter, only land sites are kept.""" logger.info("Adding land only filter...") start_time = time.time() - self._land_only = True self._and_mask &= self.general_grid.land_mask_grid.values.astype(bool) - logger.info(f"Land only filter added in {time.time() - start_time} seconds.") - return self - def add_basin_spacing_filter( - self, vel_model_version: str, spacing: int - ) -> "CustomGrid": + def _add_basin_spacing_filter( + self, vel_model_version: str, spacing: int, basins: list[str] | None = None + ) -> None: """ Adds a basin spacing filter. Note that this is an OR filter, i.e., it only adds sites @@ -230,17 +308,8 @@ def add_basin_spacing_filter( spacing : int The grid spacing in metres to use within basins. """ - logger.info("Adding basin spacing filter...") + logger.info(f"Adding basin {spacing} spacing filter for {basins if basins is not None else 'all'} basins...") start_time = time.time() - if ( - self._vel_model_version is not None - and vel_model_version != self._vel_model_version - ): - raise ValueError( - "Basin spacing filter has already been added with a different velocity model version." - ) - self._vel_model_version = vel_model_version - if spacing < self.general_grid.spacing: raise ValueError( "Basin spacing must be greater than or equal to the general grid spacing." @@ -251,11 +320,15 @@ def add_basin_spacing_filter( ) basin_boundaries = get_basin_boundaries(vel_model_version) + if basins is not None: + basin_boundaries = { + k: v for k, v in basin_boundaries.items() if k in basins + } comb_basin_boundaries = shapely.union_all(list(basin_boundaries.values())) comb_basin_boundaries = shapely.transform( comb_basin_boundaries, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]) ) - self._in_basin_mask = shapely.contains_xy( + in_basin_mask = shapely.contains_xy( comb_basin_boundaries, self.general_grid.grid_nztm_y.ravel(), self.general_grid.grid_nztm_x.ravel(), @@ -265,20 +338,15 @@ def add_basin_spacing_filter( spacing_mask = np.zeros(self.general_grid.shape, dtype=bool) spacing_mask[::idx_interval, ::idx_interval] = True - self._or_mask |= self._in_basin_mask & spacing_mask + self._or_mask |= in_basin_mask & spacing_mask logger.info( f"Basin spacing filter added in {time.time() - start_time} seconds." ) - return self def get_metadata(self, site_df: pd.DataFrame = None) -> dict: """ Gets the metadata dictionary for the custom grid. """ - region = None - if self._region is not None: - region = shapely.geometry.mapping(self._region) - site_metadata = {} if site_df is not None: site_metadata = { @@ -294,18 +362,11 @@ def get_metadata(self, site_df: pd.DataFrame = None) -> dict: site_df.groupby("basin").size().to_dict() ) - metadata = site_metadata | { - "land_only": self._land_only, - "region": region, - "uniform_spacing": self._uniform_spacing, - "vel_model_version": self._vel_model_version, - } + metadata = {"metadata": site_metadata, "config": self.config.as_dict()} return metadata def get_site_df( self, - nzgmdb_version: NZGMDBVersion | None = None, - vel_model_version: str | None = None, ) -> pd.DataFrame: """ Gets the site dataframe for the custom grid. @@ -389,21 +450,11 @@ def get_site_df( logger.info(f"Took: {time.time() - start} ") # Add basin membership & Z-values - if self._vel_model_version or vel_model_version: + if self.config.vel_model_version: start = time.time() - if ( - self._vel_model_version - and vel_model_version - and self._vel_model_version != vel_model_version - ): - err_msg = "Basin spacing filter has been added with a different velocity model version." - logger.error(err_msg) - raise ValueError(err_msg) - vel_model_version = self._vel_model_version or vel_model_version - # Basin membership logger.info("Adding basin membership information...") - basin_boundaries = get_basin_boundaries(vel_model_version) + basin_boundaries = get_basin_boundaries(self.config.vel_model_version) basin_df = gpd.GeoDataFrame( {"basin": list(basin_boundaries.keys())}, geometry=list(basin_boundaries.values()), @@ -426,7 +477,7 @@ def get_site_df( logging.getLogger("nzcvm.threshold").setLevel(logging.WARNING) z_values = compute_station_thresholds( site_df, - model_version=vel_model_version, + model_version=self.config.vel_model_version, show_progress=False, include_sigma=False, logger=logger, @@ -444,11 +495,11 @@ def get_site_df( site_df = site_df.set_index("site_code") # Add NZGMDB sites - if nzgmdb_version is not None: + if self.config.nzgmdb_version is not None: start = time.time() logger.info("Adding NZGMDB sites...") nzgmdb_site_df = pd.read_csv( - GRID_DATA.fetch(NZGMDB_VERSION_TO_TABLE_NAME[nzgmdb_version]), + GRID_DATA.fetch(NZGMDB_VERSION_TO_TABLE_NAME[self.config.nzgmdb_version]), index_col="sta", usecols=["sta", "lat", "lon", "Vs30", "Z1.0", "Z2.5"], ) @@ -461,9 +512,9 @@ def get_site_df( # Convert Z1.0 to km nzgmdb_site_df["Z1.0"] /= 1000 - if self._region is not None: + if self.config.region is not None: region_nztm = shapely.transform( - self._region, lambda x: coordinates.wgs_depth_to_nztm(x) + self.config.region, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]) ) region_mask = shapely.contains_xy( region_nztm, From 87b141739c1ae3c8d36acf8d3d637c7646423924 Mon Sep 17 00:00:00 2001 From: Claudio Date: Thu, 23 Oct 2025 14:49:35 +1300 Subject: [PATCH 09/21] Saving --- wiki/Site-Generation.md | 96 ++++++++++++++++++- workflow/scripts/site_gen_cmds.py | 36 +++++-- workflow/site_gen.py | 152 ++++++++++++++++++++++++++++-- 3 files changed, 267 insertions(+), 17 deletions(-) diff --git a/wiki/Site-Generation.md b/wiki/Site-Generation.md index fa6edc5e..07b53a22 100644 --- a/wiki/Site-Generation.md +++ b/wiki/Site-Generation.md @@ -10,7 +10,101 @@ Instead a high density uniform "general" grid has been developed, and the code i A custom grid can be generated using the `site_gen_cmds.py` script using the `gen-custom-grid-from-rel` command. For details/help on the command run `python site_gen_cmds.py gen-custom-grid-from-rel --help` -**Note that you will need about 16Gb of free memory.** +There are two options to generate the custom grid via the `gen-custom-grid-from-rel` either specify the spacing via the command line arguments, or pass a configuration yaml file. The configuration allows for more customization, such as specifying spacing per basin or custom defined region. + +An example configuration file might look something like this +```yaml +land_only: true +uniform_spacing: 5000 + +rel_ffp: null + +vel_model_version: "2.09" +basin_spacing: 2500 +per_basin_spacing: + Hanmer_v25p3: 1250 +per_region_spacing: + - name: "Christchurch" + geojson_ffp: "/home/claudy/dev/tmp_share/virt_sites/chch.json" + spacing: 1250 + +nzgmdb_version: v4.3 +``` +This configuration will select NZ land sites, with a default spacing of 5km, in basin spacing of 2.5km. +Additionally, 1250m spacing is applied in Hanmer basin, and the custom Christchurch region defined via the geojson file. The geojson needs the following format: +```json +{ + "type": "FeatureCollection", + "features": [ + { + "type": "Feature", + "properties": {}, + "geometry": { + "coordinates": [ + [ + [ + 172.64635288571822, + -43.46746584851593 + ], + [ + 172.5764348336035, + -43.46875527988916 + ], + [ + 172.48512720365642, + -43.510780173947616 + ], + [ + 172.48164483335898, + -43.581684944518635 + ], + [ + 172.51943709438905, + -43.605574705482184 + ], + [ + 172.60676846169588, + -43.61039335808902 + ], + [ + 172.74171895701926, + -43.575568637023046 + ], + [ + 172.811825205922, + -43.54170866114695 + ], + [ + 172.7825735380037, + -43.479415396924466 + ], + [ + 172.75289631792998, + -43.449932811038906 + ], + [ + 172.68739428988397, + -43.45477584526233 + ], + [ + 172.6620601942388, + -43.46177619980023 + ], + [ + 172.64635288571822, + -43.46746584851593 + ] + ] + ], + "type": "Polygon" + } + } + ] +} +``` + +**Note that you will need about 20Gb of free memory to generate custom grids.** + Running the custom grid generation will produce a parquet file with the following columns: diff --git a/workflow/scripts/site_gen_cmds.py b/workflow/scripts/site_gen_cmds.py index e34aef44..95a5510a 100644 --- a/workflow/scripts/site_gen_cmds.py +++ b/workflow/scripts/site_gen_cmds.py @@ -127,7 +127,7 @@ def gen_custom_grid( if config_ffp is not None: logger.info(f"Reading custom grid config from {config_ffp}...") config_dict = yaml.safe_load(config_ffp.open("r")) - grid_config = CustomGridConfig.from_dict(config_dict) + grid_config = CustomGridConfig.from_config(config_dict) rel_ffp = config_dict.get("rel_ffp") else: logger.info("Generating custom grid config from command line options...") @@ -136,6 +136,7 @@ def gen_custom_grid( uniform_spacing=uniform_grid_spacing, vel_model_version=vel_model_version, basin_spacing=basin_spacing, + nzgmdb_version=nzgmdb_version, ) # If a realisation file is provided, use its domain to filter the grid @@ -172,15 +173,17 @@ def gen_plot( The path to the custom grid site dataframe file (parquet). output_ffp : Path The path to save the output HTML file. - rel_ffp : Path, optional + rel_ffp: Path, optional The path to the realisation config. If provided, the domain and source will be plotted. - vel_model_version : str, optional - The velocity model version to use for plotting basin boundaries. - If provided, the basin boundaries will be plotted. """ site_df = pd.read_parquet(site_df_ffp) - config_dict = yaml.load(site_df_meta_ffp.open("r"), Loader=yaml.FullLoader)["config"] + + with site_df_meta_ffp.open("r") as f: + config = CustomGridConfig.from_metadata( + yaml.load(f, Loader=yaml.FullLoader)["config"] + ) + # config_dict = yaml.load(site_df_meta_ffp.open("r"), Loader=yaml.FullLoader)["config"] fig = go.Figure() @@ -244,9 +247,9 @@ def gen_plot( ) ) - if (vel_model_version := config_dict.get("vel_model_version")) is not None: + if config.vel_model_version is not None: # Plot the basin boundaries - basin_boundaries = get_basin_boundaries(vel_model_version) + basin_boundaries = get_basin_boundaries(config.vel_model_version) basin_line_properties = dict(color="red", width=1) basin_fill_color = "rgba(255,0,0,0.05)" @@ -276,7 +279,20 @@ def gen_plot( hoverinfo="skip", ) ) - + + if config.per_region_spacing is not None: + # Plot the regions with different spacing + for region_spacing in config.per_region_spacing: + fig.add_trace( + go.Scattermap( + lon=np.array(region_spacing.region.exterior.xy[0]), + lat=np.array(region_spacing.region.exterior.xy[1]), + mode="lines", + line=dict(color="orange", width=1), + hoverinfo="skip", + ) + ) + if rel_ffp is not None: # Plot the domain domain_corners = DomainParameters.read_from_realisation(rel_ffp).domain.corners @@ -313,7 +329,7 @@ def gen_plot( map=dict(zoom=6, center=dict(lat=site_df.lat.mean(), lon=site_df.lon.mean())), showlegend=False, title=( - f"Virtual Sites Grid - Velocity Model {vel_model_version} - " + f"Sites Grid - Velocity Model {config.vel_model_version} - " f"Total Sites: {len(site_df)}" ), ) diff --git a/workflow/site_gen.py b/workflow/site_gen.py index 6bf4fab5..f0a4806d 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -5,6 +5,8 @@ from dataclasses import dataclass from pathlib import Path + +import geojson import geopandas as gpd import numpy as np import pandas as pd @@ -108,6 +110,51 @@ def load(cls, land_mask_grid_ffp: Path) -> "GeneralGrid": return cls(xr.load_dataarray(land_mask_grid_ffp, engine="h5netcdf"), 50) +@dataclass +class RegionSpacingConfig: + """Configuration for region specific spacing.""" + + name: str + """Name of the region.""" + + region: shapely.Polygon + """Region polygon in WGS84 coordinates (lon/lat).""" + + spacing: int + """Grid spacing in metres within the region.""" + + def as_dict(self) -> dict: + """Convert RegionSpacingConfig to dictionary.""" + return { + "name": self.name, + "region": shapely.geometry.mapping(self.region), + "spacing": self.spacing, + } + + @classmethod + def from_config(cls, config_dict: dict) -> "RegionSpacingConfig": + with open(config_dict["geojson_ffp"], "r") as f: + geojson_obj = geojson.load(f) + region_polygon = shapely.geometry.shape( + geojson_obj["features"][0]["geometry"] + ) + + return RegionSpacingConfig( + name=config_dict["name"], + region=region_polygon, + spacing=config_dict["spacing"], + ) + + @classmethod + def from_metadata(cls, metadata_dict: dict) -> "RegionSpacingConfig": + return cls( + name=metadata_dict["name"], + region=shapely.geometry.shape(metadata_dict["region"]), + spacing=metadata_dict["spacing"], + ) + + + @dataclass class CustomGridConfig: """Configuration for CustomGrid.""" @@ -130,12 +177,21 @@ class CustomGridConfig: per_basin_spacing: dict[str, int] | None = None """Per-basin specific spacing in metres.""" + per_region_spacing: list[RegionSpacingConfig] | None = None + """Per-region specific spacing in metres.""" + nzgmdb_version: NZGMDBVersion | None = None """NZGMDB version for real stations.""" @classmethod - def from_dict(cls, config_dict: dict) -> "CustomGridConfig": + def from_config(cls, config_dict: dict) -> "CustomGridConfig": """Create CustomGridConfig from dictionary.""" + + # Get the per-region spacing + per_region_spacing = None + if (prs_configs := config_dict.get("per_region_spacing")) is not None: + per_region_spacing = [RegionSpacingConfig.from_config(config) for config in prs_configs] + return cls( land_only=config_dict.get("land_only"), region=config_dict.get("region"), @@ -143,9 +199,34 @@ def from_dict(cls, config_dict: dict) -> "CustomGridConfig": vel_model_version=config_dict.get("vel_model_version"), basin_spacing=config_dict.get("basin_spacing"), per_basin_spacing=config_dict.get("per_basin_spacing"), + per_region_spacing=per_region_spacing, nzgmdb_version=NZGMDBVersion(config_dict.get("nzgmdb_version")), ) - + + @classmethod + def from_metadata(cls, metadata_dict: dict) -> "CustomGridConfig": + """Create CustomGridConfig from metadata dictionary.""" + region_polygon = None + if (region_geojson := metadata_dict["region"]) is not None: + region_polygon = shapely.geometry.shape(region_geojson) + + per_region_spacing = None + if (prs_metadata := metadata_dict["per_region_spacing"]) is not None: + per_region_spacing = [RegionSpacingConfig.from_metadata(prs) for prs in prs_metadata] + + return cls( + land_only=metadata_dict["land_only"], + region=region_polygon, + uniform_spacing=metadata_dict["uniform_spacing"], + vel_model_version=metadata_dict["vel_model_version"], + basin_spacing=metadata_dict["basin_spacing"], + per_basin_spacing=metadata_dict["per_basin_spacing"], + per_region_spacing=per_region_spacing, + nzgmdb_version=NZGMDBVersion(metadata_dict["nzgmdb_version"]) + if metadata_dict["nzgmdb_version"] is not None + else None, + ) + def as_dict(self) -> dict: """Convert CustomGridConfig to dictionary.""" return { @@ -159,6 +240,11 @@ def as_dict(self) -> dict: "vel_model_version": self.vel_model_version, "basin_spacing": self.basin_spacing, "per_basin_spacing": self.per_basin_spacing, + "per_region_spacing": ( + [prs.as_dict() for prs in self.per_region_spacing] + if self.per_region_spacing is not None + else None + ), "nzgmdb_version": ( self.nzgmdb_version.value if self.nzgmdb_version is not None else None ), @@ -224,6 +310,9 @@ def apply_config(self, config: CustomGridConfig) -> "CustomGrid": spacing, basins.tolist(), ) + if self.config.per_region_spacing is not None: + for region_spacing_config in self.config.per_region_spacing: + self._add_region_spacing_filter(region_spacing_config) return self @@ -242,7 +331,9 @@ def _add_region_filter(self, region: shapely.Polygon) -> None: start_time = time.time() # Convert to NZTM - region_nztm = shapely.transform(region, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1])) + region_nztm = shapely.transform( + region, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]) + ) region_mask = shapely.contains_xy( region_nztm, @@ -294,6 +385,50 @@ def _add_land_only_filter(self) -> None: self._and_mask &= self.general_grid.land_mask_grid.values.astype(bool) logger.info(f"Land only filter added in {time.time() - start_time} seconds.") + def _add_region_spacing_filter( + self, region_spacing_config: RegionSpacingConfig + ) -> None: + """ + Adds a region specific spacing filter. + Note that this is an OR filter, i.e., it only adds sites. + + Parameters + ---------- + region_spacing_config : RegionSpacingConfig + The region spacing configuration. + """ + logger.info( + f"Adding region {region_spacing_config.spacing} spacing filter for {region_spacing_config.name}..." + ) + start_time = time.time() + if region_spacing_config.spacing < self.general_grid.spacing: + raise ValueError( + "Region spacing must be greater than or equal to the general grid spacing." + ) + if region_spacing_config.spacing % self.general_grid.spacing != 0: + raise ValueError( + f"Region spacing must be a multiple of the general grid spacing {self.general_grid.spacing}." + ) + + region_nztm = shapely.transform( + region_spacing_config.region, + lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]), + ) + in_region_mask = shapely.contains_xy( + region_nztm, + self.general_grid.grid_nztm_y.ravel(), + self.general_grid.grid_nztm_x.ravel(), + ).reshape(self.general_grid.shape) + + idx_interval = region_spacing_config.spacing // self.general_grid.spacing + spacing_mask = np.zeros(self.general_grid.shape, dtype=bool) + spacing_mask[::idx_interval, ::idx_interval] = True + + self._or_mask |= in_region_mask & spacing_mask + logger.info( + f"Region spacing filter added in {time.time() - start_time} seconds." + ) + def _add_basin_spacing_filter( self, vel_model_version: str, spacing: int, basins: list[str] | None = None ) -> None: @@ -308,7 +443,9 @@ def _add_basin_spacing_filter( spacing : int The grid spacing in metres to use within basins. """ - logger.info(f"Adding basin {spacing} spacing filter for {basins if basins is not None else 'all'} basins...") + logger.info( + f"Adding basin {spacing} spacing filter for {basins if basins is not None else 'all'} basins..." + ) start_time = time.time() if spacing < self.general_grid.spacing: raise ValueError( @@ -499,7 +636,9 @@ def get_site_df( start = time.time() logger.info("Adding NZGMDB sites...") nzgmdb_site_df = pd.read_csv( - GRID_DATA.fetch(NZGMDB_VERSION_TO_TABLE_NAME[self.config.nzgmdb_version]), + GRID_DATA.fetch( + NZGMDB_VERSION_TO_TABLE_NAME[self.config.nzgmdb_version] + ), index_col="sta", usecols=["sta", "lat", "lon", "Vs30", "Z1.0", "Z2.5"], ) @@ -514,7 +653,8 @@ def get_site_df( if self.config.region is not None: region_nztm = shapely.transform( - self.config.region, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]) + self.config.region, + lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]), ) region_mask = shapely.contains_xy( region_nztm, From 24a4456eac3da5b4032d9adda06eb46d16f416f8 Mon Sep 17 00:00:00 2001 From: Claudio Date: Tue, 28 Oct 2025 13:03:41 +1300 Subject: [PATCH 10/21] Testing, bug fixes, minor improvements --- tests/test_site_gen.py | 450 ++++++++++++++++++++++++++++++ wiki/Site-Generation.md | 1 + workflow/scripts/site_gen_cmds.py | 64 +---- workflow/site_gen.py | 186 +++++++----- 4 files changed, 580 insertions(+), 121 deletions(-) create mode 100644 tests/test_site_gen.py diff --git a/tests/test_site_gen.py b/tests/test_site_gen.py new file mode 100644 index 00000000..051a6324 --- /dev/null +++ b/tests/test_site_gen.py @@ -0,0 +1,450 @@ +import json +import string +import tempfile + +import numpy as np +import pytest +import shapely +import xarray as xr +from hypothesis import given, strategies as st, assume +from scipy.spatial import distance as sdist + +from workflow import site_gen + + +@pytest.fixture(scope="module", params=[50_000, 25_000]) +def land_mask_grid(request: pytest.FixtureRequest) -> tuple[xr.DataArray, int]: + spacing = request.param + return site_gen.gen_general_land_mask_grid(spacing), spacing + + +@pytest.fixture +def general_grid(land_mask_grid: tuple[xr.DataArray, int]) -> site_gen.GeneralGrid: + return site_gen.GeneralGrid(land_mask_grid[0], land_mask_grid[1]) + + +def test_general_grid(land_mask_grid: tuple[xr.DataArray, int]) -> None: + land_mask_grid, spacing = land_mask_grid + with tempfile.TemporaryDirectory() as tmpdir: + ffp = f"{tmpdir}/general_grid.nc" + land_mask_grid.to_netcdf(ffp) + loaded_grid = site_gen.GeneralGrid.load(ffp) + + assert loaded_grid.land_mask_grid.spacing == spacing + assert loaded_grid.shape == land_mask_grid.shape + assert np.array_equal(loaded_grid.land_mask_grid.values, land_mask_grid.values) + assert np.array_equal( + loaded_grid.land_mask_grid.lat.values, land_mask_grid.lat.values + ) + assert np.array_equal( + loaded_grid.land_mask_grid.lon.values, land_mask_grid.lon.values + ) + + +@pytest.fixture(scope="module") +def chch_region_polygon() -> shapely.Polygon: + polygon = shapely.Polygon( + [ + (172.60691125905976, -43.50338730757493), + (172.55414963918173, -43.52695730676956), + (172.64273442256302, -43.583742177757706), + (172.68415242515607, -43.5513507340181), + (172.6440052010451, -43.526126938652155), + (172.60691125905976, -43.50338730757493), + ] + ) + return polygon + + +@pytest.fixture(scope="module") +def land_region_polygon() -> shapely.Polygon: + polygon = shapely.Polygon( + [ + (170.78376767024793, -42.97481408776882), + (170.78376767024793, -43.68210554829009), + (172.66566136205932, -43.68210554829009), + (172.66566136205932, -42.97481408776882), + (170.78376767024793, -42.97481408776882), + ] + ) + return polygon + + +@pytest.fixture(scope="module") +def canterbury_region_polygon() -> shapely.Polygon: + polygon = shapely.Polygon( + [ + (172.86435627311448, -43.338682922734954), + (172.2989207589734, -43.342255579287965), + (171.53771742001976, -43.782575480660896), + (171.7830393264249, -44.18964091597699), + (173.3801704840081, -43.85958260231439), + (173.1293617392618, -43.60655002794223), + (172.86435627311448, -43.338682922734954), + ] + ) + return polygon + + +@pytest.fixture(scope="module") +def water_region_polygon() -> shapely.Polygon: + polygon = shapely.Polygon( + [ + (172.95789409381717, -39.97031515444409), + (172.95789409381717, -40.67305495760141), + (174.87451261095003, -40.67305495760141), + (174.87451261095003, -39.97031515444409), + (172.95789409381717, -39.97031515444409), + ] + ) + return polygon + + +def test_region_spacing_config_region_file( + chch_region_polygon: shapely.Polygon, +) -> None: + # Save region as geojson dict + with tempfile.TemporaryDirectory() as tmpdir: + geojson_dict = shapely.geometry.mapping(chch_region_polygon) + geojson_ffp = f"{tmpdir}/chch_region.geojson" + + with open(geojson_ffp, "w") as f: + json.dump( + { + "type": "FeatureCollection", + "features": [ + {"type": "Feature", "properties": {}, "geometry": geojson_dict} + ], + }, + f, + ) + + config_dict = {"name": "chch", "geojson_ffp": geojson_ffp, "spacing": 1000} + region_spacing_config = site_gen.RegionSpacingConfig.from_config(config_dict) + + assert region_spacing_config.name == "chch" + assert region_spacing_config.spacing == 1000 + assert shapely.equals_exact(region_spacing_config.region, chch_region_polygon) + + +def test_region_spacing_config_region_metadata( + chch_region_polygon: shapely.Polygon, +) -> None: + # Create region spacing config from metadata dict + metadata_dict = { + "name": "chch", + "region": shapely.geometry.mapping(chch_region_polygon), + "spacing": 1000, + } + + region_spacing_config = site_gen.RegionSpacingConfig.from_config(metadata_dict) + + assert region_spacing_config.name == "chch" + assert region_spacing_config.spacing == 1000 + assert shapely.equals_exact(region_spacing_config.region, chch_region_polygon) + + +def test_custom_grid_config(chch_region_polygon: shapely.Polygon) -> None: + # Create the custom grid config dict + with tempfile.TemporaryDirectory() as tmpdir: + geojson_dict = shapely.geometry.mapping(chch_region_polygon) + geojson_ffp = f"{tmpdir}/chch_region.geojson" + + with open(geojson_ffp, "w") as f: + json.dump( + { + "type": "FeatureCollection", + "features": [ + {"type": "Feature", "properties": {}, "geometry": geojson_dict} + ], + }, + f, + ) + + config_dict = { + "land_only": True, + "region": chch_region_polygon, + "uniform_spacing": 5000, + "vel_model_version": "2.09", + "basin_spacing": 2500, + "per_basin_spacing": {"Hanmer_v25p3": 1250}, + "per_region_spacing": [ + {"name": "Christchurch", "geojson_ffp": geojson_ffp, "spacing": 1250} + ], + "nzgmdb_version": "v4.3", + } + + custom_grid_config_1 = site_gen.CustomGridConfig.from_config(config_dict) + + assert custom_grid_config_1.land_only is True + assert shapely.equals_exact(custom_grid_config_1.region, chch_region_polygon) + assert custom_grid_config_1.uniform_spacing == 5000 + assert custom_grid_config_1.vel_model_version == "2.09" + assert custom_grid_config_1.basin_spacing == 2500 + assert custom_grid_config_1.per_basin_spacing == {"Hanmer_v25p3": 1250} + assert len(custom_grid_config_1.per_region_spacing) == 1 + per_region_config_1 = custom_grid_config_1.per_region_spacing[0] + assert per_region_config_1.name == "Christchurch" + assert per_region_config_1.spacing == 1250 + assert shapely.equals_exact(per_region_config_1.region, chch_region_polygon) + + config_dict = custom_grid_config_1.as_dict() + custom_grid_config_2 = site_gen.CustomGridConfig.from_config(config_dict) + + assert custom_grid_config_1.land_only == custom_grid_config_2.land_only + assert shapely.equals_exact( + custom_grid_config_1.region, custom_grid_config_2.region + ) + assert ( + custom_grid_config_1.uniform_spacing == custom_grid_config_2.uniform_spacing + ) + assert ( + custom_grid_config_1.vel_model_version + == custom_grid_config_2.vel_model_version + ) + assert custom_grid_config_1.basin_spacing == custom_grid_config_2.basin_spacing + assert ( + custom_grid_config_1.per_basin_spacing + == custom_grid_config_2.per_basin_spacing + ) + assert len(custom_grid_config_2.per_region_spacing) == 1 + per_region_config_2 = custom_grid_config_2.per_region_spacing[0] + assert per_region_config_1.name == per_region_config_2.name + assert per_region_config_1.spacing == per_region_config_2.spacing + assert shapely.equals_exact( + per_region_config_1.region, per_region_config_2.region + ) + + +def test_custom_grid_land_region_filtering( + general_grid: site_gen.GeneralGrid, land_region_polygon: shapely.Polygon +) -> None: + land_only_grid_config = site_gen.CustomGridConfig( + land_only=True, + region=land_region_polygon, + uniform_spacing=general_grid.spacing, + ) + land_only_grid = site_gen.CustomGrid(general_grid).apply_config( + land_only_grid_config + ) + + # All points in the custom grid should be land points + # and therefore all should be included + # Check via neighbour distances + site_df = land_only_grid.get_site_df() + distances = sdist.squareform(sdist.pdist(site_df[["nztm_x", "nztm_y"]].values)) + np.fill_diagonal(distances, np.inf) + min_distances = distances.min(axis=1) + assert np.allclose(min_distances, general_grid.spacing, rtol=0.05) + + region_config = site_gen.CustomGridConfig( + region=land_region_polygon, + uniform_spacing=general_grid.spacing, + ) + region_grid = site_gen.CustomGrid(general_grid).apply_config(region_config) + + assert site_df.equals(region_grid.get_site_df()) + + +def test_custom_grid_water_region_filtering( + general_grid: site_gen.GeneralGrid, water_region_polygon: shapely.Polygon +) -> None: + land_only_grid_config = site_gen.CustomGridConfig( + land_only=True, + region=water_region_polygon, + uniform_spacing=general_grid.spacing, + ) + land_only_grid = site_gen.CustomGrid(general_grid).apply_config( + land_only_grid_config + ) + + site_df = land_only_grid.get_site_df() + assert site_df.shape[0] == 0 + + +def test_uniform_too_small_spacing_error( + general_grid: site_gen.GeneralGrid, +) -> None: + custom_grid = site_gen.CustomGrid(general_grid) + spacing = general_grid.spacing - 1000 + + with pytest.raises(ValueError): + custom_grid._add_uniform_spacing_filter(spacing) + + +def test_uniform_non_multiple_spacing_error( + general_grid: site_gen.GeneralGrid, +) -> None: + custom_grid = site_gen.CustomGrid(general_grid) + spacing = general_grid.spacing + 3000 + + with pytest.raises(ValueError): + custom_grid._add_uniform_spacing_filter(spacing) + + +def test_uniform_valid_spacing( + general_grid: site_gen.GeneralGrid, +) -> None: + config = site_gen.CustomGridConfig( + uniform_spacing=general_grid.spacing * 2, + ) + + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + site_df = custom_grid.get_site_df() + + distances = sdist.squareform(sdist.pdist(site_df[["nztm_x", "nztm_y"]].values)) + np.fill_diagonal(distances, np.inf) + min_distances = distances.min(axis=1) + assert np.allclose(min_distances, general_grid.spacing * 2, rtol=0.12) + + +def test_basin_spacing( + general_grid: site_gen.GeneralGrid, canterbury_region_polygon: shapely.Polygon +) -> None: + uniform_spacing = general_grid.spacing * 2 + config = site_gen.CustomGridConfig( + region=canterbury_region_polygon, + uniform_spacing=uniform_spacing, + basin_spacing=general_grid.spacing, + vel_model_version="2.09", + ) + + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + site_df = custom_grid.get_site_df() + + distances = sdist.squareform(sdist.pdist(site_df[["nztm_x", "nztm_y"]].values)) + np.fill_diagonal(distances, np.inf) + min_distances = distances.min(axis=1) + assert np.allclose(min_distances, general_grid.spacing, rtol=0.12) + assert np.all(min_distances < uniform_spacing) + + +@pytest.mark.parametrize("uniform_spacing", [5_000, 10_000]) +def test_small_uniform_spacing_error( + general_grid: site_gen.GeneralGrid, uniform_spacing: float +) -> None: + config = site_gen.CustomGridConfig(uniform_spacing=uniform_spacing) + + with pytest.raises( + ValueError, + match="Uniform spacing must be greater than or equal to the general grid spacing", + ): + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + + +@pytest.mark.parametrize("factor", [1.2, 1.5]) +def test_multiple_error_uniform_spacing( + general_grid: site_gen.GeneralGrid, factor: float +) -> None: + config = site_gen.CustomGridConfig( + uniform_spacing=general_grid.spacing * factor, + ) + + with pytest.raises( + ValueError, + match="Uniform spacing must be a multiple of the general grid spacing", + ): + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + + +@pytest.mark.parametrize("basin_spacing", [5_000, 10_000]) +def test_small_basin_spacing_error( + general_grid: site_gen.GeneralGrid, basin_spacing: float +) -> None: + uniform_spacing = general_grid.spacing * 2 + config = site_gen.CustomGridConfig( + uniform_spacing=uniform_spacing, + basin_spacing=basin_spacing, + vel_model_version="2.09", + ) + + with pytest.raises( + ValueError, + match="Basin spacing must be greater than or equal to the general grid spacing.", + ): + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + + +@pytest.mark.parametrize("factor", [1.2, 1.5]) +def test_multiple_error_basin_spacing( + general_grid: site_gen.GeneralGrid, factor: float +) -> None: + uniform_spacing = general_grid.spacing * 2 + basin_spacing = general_grid.spacing * factor + config = site_gen.CustomGridConfig( + uniform_spacing=uniform_spacing, + basin_spacing=basin_spacing, + vel_model_version="2.09", + ) + + with pytest.raises( + ValueError, match="Basin spacing must be a multiple of the general grid spacing" + ): + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + + +# Hypothesis-based property tests for encode_base62_fixed_array + + +@given( + nums=st.lists(st.integers(min_value=0, max_value=62**4 - 1), min_size=2, max_size=500), + length=st.integers(min_value=3, max_value=5), +) +def test_encode_base62_combined_properties(nums: list[int], length: int) -> None: + """Property: encoded strings should have fixed length, be unique, and use valid base62 characters.""" + nums_array = np.array(nums) + assume(np.all(nums_array < 62**length)) # Ensure all inputs fit in the length + assume(np.unique(nums_array).size == len(nums_array)) # Ensure all inputs are unique + + result = site_gen.encode_base62_fixed_array(nums_array, length=length) + + # The number of output codes should match the number of input numbers + assert len(result) == len(nums) + + # All encoded strings should have the specified length + assert all(len(code) == length for code in result) + + # Different numbers should produce different codes (all inputs are unique) + assert len(np.unique(result)) == len(nums) + + # Encoded strings should only contain valid base62 characters + alphabet = set(string.digits + string.ascii_letters) + for code in result: + assert all(c in alphabet for c in code) + + +@given( + length=st.integers(min_value=1, max_value=6), +) +def test_encode_base62_boundary_values_property(length: int) -> None: + """Property: test boundary values (0 and max) for a given length.""" + max_val = 62**length - 1 + nums = np.array([0, max_val]) + + result = site_gen.encode_base62_fixed_array(nums, length=length) + + assert len(result) == 2 + assert all(len(code) == length for code in result) + assert result[0] == "0" * length + +@given( + too_large_num=st.integers(min_value=62**3, max_value=62**4), + length=st.just(3), +) +def test_encode_base62_too_large_error_property(too_large_num: int, length: int) -> None: + """Property: numbers too large for the specified length should raise ValueError.""" + nums = np.array([too_large_num]) + + with pytest.raises(ValueError, match="too large"): + site_gen.encode_base62_fixed_array(nums, length=length) + +@given( + num=st.integers(min_value=0, max_value=62**3 - 1), +) +def test_encode_base62_same_input_same_output_property(num: int) -> None: + """Property: encoding the same number multiple times should give the same result.""" + nums = np.array([num, num, num]) + + result = site_gen.encode_base62_fixed_array(nums, length=3) + + assert result[0] == result[1] == result[2] diff --git a/wiki/Site-Generation.md b/wiki/Site-Generation.md index 07b53a22..d4868c9c 100644 --- a/wiki/Site-Generation.md +++ b/wiki/Site-Generation.md @@ -137,6 +137,7 @@ Region code mapping: | Canterbury | CA | | Otago | OT | | Southland | SL | +| No Region | NR | Additionally, it will also produce a metadata file that contains the setting used to generate the custom grid: diff --git a/workflow/scripts/site_gen_cmds.py b/workflow/scripts/site_gen_cmds.py index 95a5510a..6d2da514 100644 --- a/workflow/scripts/site_gen_cmds.py +++ b/workflow/scripts/site_gen_cmds.py @@ -2,25 +2,16 @@ from pathlib import Path from typing import Annotated -import geopandas as gpd import numpy as np import pandas as pd import plotly.graph_objects as go -import pygmt import shapely import typer -import xarray as xr import yaml from qcore import cli, coordinates +from workflow import site_gen from workflow.realisations import DomainParameters, SourceConfig -from workflow.site_gen import ( - GRID_DATA, - CustomGrid, - CustomGridConfig, - NZGMDBVersion, - get_basin_boundaries, -) # Configure logging logging.basicConfig( @@ -37,45 +28,10 @@ @app.command("gen-general-grid") def gen_general_grid(grid_spacing: str, output_ffp: Path) -> None: """Generate the general grid.""" - x_spacing, y_spacing = grid_spacing.split("/") - if x_spacing != y_spacing: - raise ValueError("Currently only supports equal x and y spacing.") - - try: - spacing = int(x_spacing.rstrip("e")) - except ValueError: - raise ValueError("Grid spacing must be an integer in metres.") - - land_df = gpd.read_parquet(GRID_DATA.fetch("nz_coastline.parquet")) - # Combine into a single polygon - land_polygon = shapely.coverage_union_all(land_df.geometry) - land_polygon = shapely.transform( - land_polygon, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]) - ) - - # Generate grid - logger.info("Generating grid...") - land_mask_grid = pygmt.grdlandmask(region="NZ", spacing=grid_spacing).astype(bool) - land_mask_grid[:] = False - land_mask_grid.attrs = {"spacing": spacing} - - # Use float32 for coords - land_mask_grid = land_mask_grid.assign_coords( - lat=land_mask_grid.lat.astype(np.float32), - lon=land_mask_grid.lon.astype(np.float32), - ) - - grid_lat, grid_lon = xr.broadcast(land_mask_grid.lat, land_mask_grid.lon) - grid_nztm = coordinates.wgs_depth_to_nztm( - np.vstack((grid_lat.values.ravel(), grid_lon.values.ravel())).T - ) - - # Apply land masking - logger.info("Applying land mask...") - mask = shapely.contains_xy(land_polygon, grid_nztm[:, 0], grid_nztm[:, 1]).reshape( - land_mask_grid.shape + + land_mask_grid = site_gen.gen_general_land_mask_grid( + grid_spacing, ) - land_mask_grid.values[mask] = 1 logger.info(f"Saving to {output_ffp}...") land_mask_grid.to_netcdf(output_ffp) @@ -88,7 +44,7 @@ def gen_custom_grid( uniform_grid_spacing: Annotated[int | None, typer.Option()] = None, basin_spacing: Annotated[int | None, typer.Option()] = None, vel_model_version: Annotated[str | None, typer.Option()] = None, - nzgmdb_version: Annotated[NZGMDBVersion | None, typer.Option()] = None, + nzgmdb_version: Annotated[site_gen.NZGMDBVersion | None, typer.Option()] = None, rel_ffp: Annotated[Path | None, typer.Option()] = None, ) -> None: """ @@ -127,11 +83,11 @@ def gen_custom_grid( if config_ffp is not None: logger.info(f"Reading custom grid config from {config_ffp}...") config_dict = yaml.safe_load(config_ffp.open("r")) - grid_config = CustomGridConfig.from_config(config_dict) + grid_config = site_gen.CustomGridConfig.from_config(config_dict) rel_ffp = config_dict.get("rel_ffp") else: logger.info("Generating custom grid config from command line options...") - grid_config = CustomGridConfig( + grid_config = site_gen.CustomGridConfig( land_only=True, uniform_spacing=uniform_grid_spacing, vel_model_version=vel_model_version, @@ -145,7 +101,7 @@ def gen_custom_grid( region = shapely.Polygon(domain_config.domain.corners[:, ::-1]) grid_config.region = region - custom_grid = CustomGrid().apply_config(grid_config) + custom_grid = site_gen.CustomGrid().apply_config(grid_config) # Generate site dataframe and save site_df = custom_grid.get_site_df() @@ -180,7 +136,7 @@ def gen_plot( site_df = pd.read_parquet(site_df_ffp) with site_df_meta_ffp.open("r") as f: - config = CustomGridConfig.from_metadata( + config = site_gen.CustomGridConfig.from_metadata( yaml.load(f, Loader=yaml.FullLoader)["config"] ) # config_dict = yaml.load(site_df_meta_ffp.open("r"), Loader=yaml.FullLoader)["config"] @@ -249,7 +205,7 @@ def gen_plot( if config.vel_model_version is not None: # Plot the basin boundaries - basin_boundaries = get_basin_boundaries(config.vel_model_version) + basin_boundaries = site_gen.get_basin_boundaries(config.vel_model_version) basin_line_properties = dict(color="red", width=1) basin_fill_color = "rgba(255,0,0,0.05)" diff --git a/workflow/site_gen.py b/workflow/site_gen.py index f0a4806d..e912c97c 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -1,16 +1,16 @@ import enum +import json import logging import string import time from dataclasses import dataclass from pathlib import Path - -import geojson import geopandas as gpd import numpy as np import pandas as pd import pooch +import pygmt import shapely import xarray as xr @@ -35,21 +35,6 @@ class NZGMDBVersion(enum.StrEnum): NZGMDBVersion.V4p3: "nzgmdb_4p3_site_table.csv", } -REGION_CODES = { - "North Auckland": "NAK", - "South Auckland": "SAK", - "Hawkes Bay": "HBY", - "Gisborne": "GIS", - "Taranaki": "TAR", - "Wellington": "WEL", - "Nelson": "NEL", - "Marlborough": "MAR", - "Westland": "WES", - "Canterbury": "CAN", - "Otago": "OTA", - "Southland": "STL", -} - GRID_DATA = pooch.create( path=pooch.os_cache("virt_sites"), base_url="", @@ -107,7 +92,64 @@ def shape(self) -> tuple[int, int]: def load(cls, land_mask_grid_ffp: Path) -> "GeneralGrid": """Load GeneralGrid from file.""" logger.info(f"Loading general grid from {land_mask_grid_ffp}...") - return cls(xr.load_dataarray(land_mask_grid_ffp, engine="h5netcdf"), 50) + da = xr.load_dataarray(land_mask_grid_ffp, engine="h5netcdf") + return cls(da, int(da.attrs["spacing"])) + + +def gen_general_land_mask_grid(spacing: int) -> xr.DataArray: + """ + Generates the general land mask grid with given spacing. + + Parameters + ---------- + grid_spacing : str + Grid spacing in metres + + + Notes + ----- + Common grid spacing formats: + - To specify grid spacing of `x` units: `"{x}{unit}/{x}{unit}"`, + where `unit` can be metres (`e`) or kilometres (`k`). + + Returns + ------- + xr.DataArray + The land mask grid DataArray, with values of 1 for land + and 0 for ocean. + """ + land_df = gpd.read_parquet(GRID_DATA.fetch("nz_coastline.parquet")) + # Combine into a single polygon + land_polygon = shapely.coverage_union_all(land_df.geometry) + land_polygon = shapely.transform( + land_polygon, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]) + ) + + # Generate grid + logger.info("Generating grid...") + land_mask_grid = pygmt.grdlandmask(region="NZ", spacing=f"{spacing}e/{spacing}e").astype(bool) + land_mask_grid[:] = False + land_mask_grid.attrs = {"spacing": spacing} + + # Use float32 for coords + land_mask_grid = land_mask_grid.assign_coords( + lat=land_mask_grid.lat.astype(np.float32), + lon=land_mask_grid.lon.astype(np.float32), + ) + + grid_lat, grid_lon = xr.broadcast(land_mask_grid.lat, land_mask_grid.lon) + grid_nztm = coordinates.wgs_depth_to_nztm( + np.vstack((grid_lat.values.ravel(), grid_lon.values.ravel())).T + ) + + # Apply land masking + logger.info("Applying land mask...") + mask = shapely.contains_xy(land_polygon, grid_nztm[:, 0], grid_nztm[:, 1]).reshape( + land_mask_grid.shape + ) + land_mask_grid.values[mask] = 1 + + return land_mask_grid @dataclass @@ -133,27 +175,26 @@ def as_dict(self) -> dict: @classmethod def from_config(cls, config_dict: dict) -> "RegionSpacingConfig": - with open(config_dict["geojson_ffp"], "r") as f: - geojson_obj = geojson.load(f) - region_polygon = shapely.geometry.shape( - geojson_obj["features"][0]["geometry"] + """Create RegionSpacingConfig from configuration dictionary.""" + if "geojson_ffp" not in config_dict and "region" not in config_dict: + raise ValueError( + "Either 'geojson_ffp' or 'region' must be provided in the config dictionary." ) + if "geojson_ffp" in config_dict: + with open(config_dict["geojson_ffp"], "r") as f: + geojson_obj = json.load(f) + region_polygon = shapely.geometry.shape( + geojson_obj["features"][0]["geometry"] + ) + else: + region_polygon = shapely.geometry.shape(config_dict["region"]) + return RegionSpacingConfig( name=config_dict["name"], region=region_polygon, spacing=config_dict["spacing"], ) - - @classmethod - def from_metadata(cls, metadata_dict: dict) -> "RegionSpacingConfig": - return cls( - name=metadata_dict["name"], - region=shapely.geometry.shape(metadata_dict["region"]), - spacing=metadata_dict["spacing"], - ) - - @dataclass class CustomGridConfig: @@ -183,50 +224,45 @@ class CustomGridConfig: nzgmdb_version: NZGMDBVersion | None = None """NZGMDB version for real stations.""" + + def __post_init__(self) -> None: + """Post-initialization checks.""" + if self.basin_spacing is not None and self.vel_model_version is None: + raise ValueError( + "vel_model_version must be provided if basin_spacing is set." + ) + @classmethod def from_config(cls, config_dict: dict) -> "CustomGridConfig": """Create CustomGridConfig from dictionary.""" + # Get the region polygon + region = config_dict.get("region") + if region is not None and isinstance(region, dict): + region = shapely.geometry.shape(region) # Get the per-region spacing per_region_spacing = None if (prs_configs := config_dict.get("per_region_spacing")) is not None: - per_region_spacing = [RegionSpacingConfig.from_config(config) for config in prs_configs] + per_region_spacing = [ + RegionSpacingConfig.from_config(config) for config in prs_configs + ] + + # Get the NZGMDB version + nzgmdb_version = config_dict.get("nzgmdb_version") + if nzgmdb_version is not None: + nzgmdb_version = NZGMDBVersion(nzgmdb_version) return cls( land_only=config_dict.get("land_only"), - region=config_dict.get("region"), + region=region, uniform_spacing=config_dict.get("uniform_spacing"), vel_model_version=config_dict.get("vel_model_version"), basin_spacing=config_dict.get("basin_spacing"), per_basin_spacing=config_dict.get("per_basin_spacing"), per_region_spacing=per_region_spacing, - nzgmdb_version=NZGMDBVersion(config_dict.get("nzgmdb_version")), + nzgmdb_version=nzgmdb_version, ) - - @classmethod - def from_metadata(cls, metadata_dict: dict) -> "CustomGridConfig": - """Create CustomGridConfig from metadata dictionary.""" - region_polygon = None - if (region_geojson := metadata_dict["region"]) is not None: - region_polygon = shapely.geometry.shape(region_geojson) - per_region_spacing = None - if (prs_metadata := metadata_dict["per_region_spacing"]) is not None: - per_region_spacing = [RegionSpacingConfig.from_metadata(prs) for prs in prs_metadata] - - return cls( - land_only=metadata_dict["land_only"], - region=region_polygon, - uniform_spacing=metadata_dict["uniform_spacing"], - vel_model_version=metadata_dict["vel_model_version"], - basin_spacing=metadata_dict["basin_spacing"], - per_basin_spacing=metadata_dict["per_basin_spacing"], - per_region_spacing=per_region_spacing, - nzgmdb_version=NZGMDBVersion(metadata_dict["nzgmdb_version"]) - if metadata_dict["nzgmdb_version"] is not None - else None, - ) - def as_dict(self) -> dict: """Convert CustomGridConfig to dictionary.""" return { @@ -257,10 +293,13 @@ class CustomGrid: built from the general grid. """ - def __init__(self) -> "CustomGrid": + def __init__(self, general_grid: GeneralGrid = None) -> "CustomGrid": """Initialize CustomGrid.""" - logger.info("Loading general grid...") - self.general_grid = GeneralGrid.load(GRID_DATA.fetch("general_grid.nc")) + if general_grid is None: + logger.info("Loading general grid...") + self.general_grid = GeneralGrid.load(GRID_DATA.fetch("general_grid.nc")) + else: + self.general_grid = general_grid self._reset() @@ -357,17 +396,19 @@ def _add_uniform_spacing_filter(self, spacing: int) -> None: logger.info("Adding uniform spacing filter...") start_time = time.time() if spacing < self.general_grid.spacing: - logger.error( + error_msg = ( "Uniform spacing must be greater than or" " equal to the general grid spacing." ) - raise + logger.error(error_msg) + raise ValueError(error_msg) if spacing % self.general_grid.spacing != 0: - logger.error( + error_msg = ( f"Uniform spacing must be a multiple of" f" the general grid spacing {self.general_grid.spacing}." ) - raise + logger.error(error_msg) + raise ValueError(error_msg) idx_interval = spacing // self.general_grid.spacing spacing_mask = np.zeros(self.general_grid.shape, dtype=bool) @@ -534,6 +575,10 @@ def get_site_df( - Z1.0: The Z1.0 value for the site (if vel_model_version is provided). - Z2.5: The Z2.5 value for the site (if vel_model_version is provided). """ + if self.mask.sum() == 0: + logger.warning("Custom grid has no sites selected.") + return pd.DataFrame() + logger.info("Getting site dataframe...") site_df = pd.DataFrame( { @@ -561,6 +606,13 @@ def get_site_df( crs=NZTM_CRS, ) joined = gpd.sjoin(site_points_df, region_df, how="left", predicate="within") + + # Deal with sites that are not in a region + joined["name"] = joined["name"].cat.add_categories("No Region") + joined["name"] = joined["name"].fillna("No Region") + joined["code"] = joined["code"].cat.add_categories("NR") + joined["code"] = joined["code"].fillna("NR") + # Keep first region if in multiple regions joined = joined.groupby(level=0).first() site_df["region_name"] = joined["name"] @@ -627,7 +679,7 @@ def get_site_df( logger.info("Adding site code...") site_df["site_code"] = np.char.add( encode_base62_fixed_array(site_df.index.values, length=5), - site_df.region_code.astype(str).values, + site_df.region_code.values.astype(str), ) site_df = site_df.set_index("site_code") From 95c2c02bf57eca27577ead019fe39a205209655a Mon Sep 17 00:00:00 2001 From: Claudio Date: Wed, 29 Oct 2025 10:23:24 +1300 Subject: [PATCH 11/21] LLM PR comments --- wiki/Site-Generation.md | 12 ++++++------ workflow/scripts/site_gen_cmds.py | 7 ++++--- workflow/site_gen.py | 12 ++++++------ 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/wiki/Site-Generation.md b/wiki/Site-Generation.md index d4868c9c..66586378 100644 --- a/wiki/Site-Generation.md +++ b/wiki/Site-Generation.md @@ -8,9 +8,9 @@ Instead a high density uniform "general" grid has been developed, and the code i ### Custom Grid Generation -A custom grid can be generated using the `site_gen_cmds.py` script using the `gen-custom-grid-from-rel` command. -For details/help on the command run `python site_gen_cmds.py gen-custom-grid-from-rel --help` -There are two options to generate the custom grid via the `gen-custom-grid-from-rel` either specify the spacing via the command line arguments, or pass a configuration yaml file. The configuration allows for more customization, such as specifying spacing per basin or custom defined region. +A custom grid can be generated using the `site_gen_cmds.py` script using the `gen-custom-grid` command. +For details/help on the command run `python site_gen_cmds.py gen-custom-grid --help` +There are two options to generate the custom grid via the `gen-custom-grid` either specify the spacing via the command line arguments, or pass a configuration yaml file. The configuration allows for more customization, such as specifying spacing per basin or custom defined region. An example configuration file might look something like this ```yaml @@ -110,15 +110,15 @@ Running the custom grid generation will produce a parquet file with the followin | Column | Description | Notes | |--------|-------------|-------| -| `site_id` | Unique identifier for each virtual site | `{4 character lat/lon hash}{2 character region code}`, e.g. "ijSBAOT", is a site in Otago as last two characters are "OT" | +| `site_id` | Unique identifier for each virtual site | `{5 character base 62 encoding of integer site id}{2 character region code}` | | `lon` | Longitude coordinate in decimal degrees | | | `lat` | Latitude coordinate in decimal degrees | | | `nztm_x` | X coordinate in New Zealand Transverse Mercator (NZTM) projection | | | `nztm_y` | Y coordinate in New Zealand Transverse Mercator (NZTM) projection | | | `source` | Source of the site | Either `virtual` or `real` | | `basin` | Basin name | None if not in a basin | -| `Z1.0` | Depth to 1.0 km/s shear-wave velocity (meters) | | -| `Z2.5` | Depth to 2.5 km/s shear-wave velocity (meters) | | +| `Z1.0` | Depth to 1.0 km/s shear-wave velocity (km) | | +| `Z2.5` | Depth to 2.5 km/s shear-wave velocity (km) | | | `vs30` | Time-averaged shear-wave velocity in the top 30 meters (m/s) | | Region code mapping: diff --git a/workflow/scripts/site_gen_cmds.py b/workflow/scripts/site_gen_cmds.py index 6d2da514..e683fe59 100644 --- a/workflow/scripts/site_gen_cmds.py +++ b/workflow/scripts/site_gen_cmds.py @@ -116,7 +116,6 @@ def gen_custom_grid( @cli.from_docstring(app) def gen_plot( site_df_ffp: Annotated[Path, typer.Argument()], - site_df_meta_ffp: Annotated[Path, typer.Argument()], output_ffp: Annotated[Path, typer.Argument()], rel_ffp: Annotated[Path | None, typer.Option()] = None, ) -> None: @@ -135,11 +134,13 @@ def gen_plot( """ site_df = pd.read_parquet(site_df_ffp) + site_df_meta_ffp = ( + site_df_ffp.parent / f"{site_df_ffp.stem}_metadata.yaml" + ) with site_df_meta_ffp.open("r") as f: - config = site_gen.CustomGridConfig.from_metadata( + config = site_gen.CustomGridConfig.from_config( yaml.load(f, Loader=yaml.FullLoader)["config"] ) - # config_dict = yaml.load(site_df_meta_ffp.open("r"), Loader=yaml.FullLoader)["config"] fig = go.Figure() diff --git a/workflow/site_gen.py b/workflow/site_gen.py index e912c97c..692a6db1 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -22,7 +22,7 @@ logger = logging.getLogger(__name__) NZTM_CRS = "EPSG:2193" -WSG84_CRS = "EPSG:4326" +WGS84_CRS = "EPSG:4326" class NZGMDBVersion(enum.StrEnum): @@ -61,7 +61,7 @@ class GeneralGrid: Any custom grid is built off this grid. """ - def __init__(self, land_mask_grid: xr.DataArray, spacing: int) -> "GeneralGrid": + def __init__(self, land_mask_grid: xr.DataArray, spacing: int) -> None: """Initialize GeneralGrid.""" self.land_mask_grid = land_mask_grid self.site_ids = np.arange(land_mask_grid.size, dtype=np.uint32).reshape( @@ -293,7 +293,7 @@ class CustomGrid: built from the general grid. """ - def __init__(self, general_grid: GeneralGrid = None) -> "CustomGrid": + def __init__(self, general_grid: GeneralGrid = None) -> None: """Initialize CustomGrid.""" if general_grid is None: logger.info("Loading general grid...") @@ -306,7 +306,7 @@ def __init__(self, general_grid: GeneralGrid = None) -> "CustomGrid": def _reset(self) -> None: """Resets the custom grid.""" self._and_mask = np.ones(self.general_grid.shape, dtype=bool) - self._or_mask = np.ones(self.general_grid.shape, dtype=bool) + self._or_mask = np.zeros(self.general_grid.shape, dtype=bool) self.config = None @property @@ -413,7 +413,7 @@ def _add_uniform_spacing_filter(self, spacing: int) -> None: idx_interval = spacing // self.general_grid.spacing spacing_mask = np.zeros(self.general_grid.shape, dtype=bool) spacing_mask[::idx_interval, ::idx_interval] = True - self._or_mask &= spacing_mask + self._or_mask |= spacing_mask logger.info( f"Uniform spacing filter added in {time.time() - start_time} seconds." @@ -647,7 +647,7 @@ def get_site_df( basin_df = gpd.GeoDataFrame( {"basin": list(basin_boundaries.keys())}, geometry=list(basin_boundaries.values()), - crs=WSG84_CRS, + crs=WGS84_CRS, ).to_crs(NZTM_CRS) joined = gpd.sjoin( site_points_df, From 1d1279e96ef8af9d49c78b1b7f9273c032f3001d Mon Sep 17 00:00:00 2001 From: Claudio Date: Tue, 4 Nov 2025 12:35:26 +1300 Subject: [PATCH 12/21] Bug fix, more tests --- tests/test_site_gen.py | 42 ++++++++++++++++++---- workflow/site_gen.py | 79 +++++++++++++++++++++--------------------- 2 files changed, 76 insertions(+), 45 deletions(-) diff --git a/tests/test_site_gen.py b/tests/test_site_gen.py index 051a6324..33dadfa1 100644 --- a/tests/test_site_gen.py +++ b/tests/test_site_gen.py @@ -299,7 +299,7 @@ def test_uniform_valid_spacing( def test_basin_spacing( - general_grid: site_gen.GeneralGrid, canterbury_region_polygon: shapely.Polygon + general_grid: site_gen.GeneralGrid, canterbury_region_polygon: shapely.Polygon ) -> None: uniform_spacing = general_grid.spacing * 2 config = site_gen.CustomGridConfig( @@ -329,8 +329,7 @@ def test_small_uniform_spacing_error( ValueError, match="Uniform spacing must be greater than or equal to the general grid spacing", ): - custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) - + site_gen.CustomGrid(general_grid).apply_config(config) @pytest.mark.parametrize("factor", [1.2, 1.5]) def test_multiple_error_uniform_spacing( @@ -344,8 +343,7 @@ def test_multiple_error_uniform_spacing( ValueError, match="Uniform spacing must be a multiple of the general grid spacing", ): - custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) - + site_gen.CustomGrid(general_grid).apply_config(config) @pytest.mark.parametrize("basin_spacing", [5_000, 10_000]) def test_small_basin_spacing_error( @@ -382,9 +380,41 @@ def test_multiple_error_basin_spacing( ): custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) +def test_site_dataframe( + general_grid: site_gen.GeneralGrid +) -> None: + config = site_gen.CustomGridConfig( + uniform_spacing=general_grid.spacing * 2, + vel_model_version="2.09", + nzgmdb_version="v4.3", + ) + + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + site_df = custom_grid.get_site_df() + + assert (~site_df.region_name.isnull()).all() + assert (~site_df.region_code.isnull()).all() + assert site_df.source.isin(["virtual", "real"]).all() -# Hypothesis-based property tests for encode_base62_fixed_array +def test_site_dataframe_basin( + general_grid: site_gen.GeneralGrid, canterbury_region_polygon: shapely.Polygon +) -> None: + config = site_gen.CustomGridConfig( + region=canterbury_region_polygon, + uniform_spacing=general_grid.spacing * 2, + vel_model_version="2.09", + nzgmdb_version="v4.3", + ) + + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + site_df = custom_grid.get_site_df() + + assert (site_df.basin == "Canterbury_v25p9").all() + + + + @given( nums=st.lists(st.integers(min_value=0, max_value=62**4 - 1), min_size=2, max_size=500), diff --git a/workflow/site_gen.py b/workflow/site_gen.py index 692a6db1..b5f907ac 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -591,6 +591,41 @@ def get_site_df( } ).set_index("general_site_id") + # Add NZGMDB sites + if self.config.nzgmdb_version is not None: + start = time.time() + logger.info("Adding NZGMDB sites...") + nzgmdb_site_df = pd.read_csv( + GRID_DATA.fetch( + NZGMDB_VERSION_TO_TABLE_NAME[self.config.nzgmdb_version] + ), + index_col="sta", + usecols=["sta", "lat", "lon", "Vs30", "Z1.0", "Z2.5"], + ) + nzgmdb_site_df["source"] = "real" + nzgmdb_nztm_values = coordinates.wgs_depth_to_nztm( + nzgmdb_site_df[["lat", "lon"]].values + ) + nzgmdb_site_df["nztm_x"] = nzgmdb_nztm_values[:, 1] + nzgmdb_site_df["nztm_y"] = nzgmdb_nztm_values[:, 0] + # Convert Z1.0 to km + nzgmdb_site_df["Z1.0"] /= 1000 + + if self.config.region is not None: + region_nztm = shapely.transform( + self.config.region, + lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]), + ) + region_mask = shapely.contains_xy( + region_nztm, + nzgmdb_site_df["nztm_y"].values, + nzgmdb_site_df["nztm_x"].values, + ) + nzgmdb_site_df = nzgmdb_site_df[region_mask] + + site_df = pd.concat([site_df, nzgmdb_site_df], axis=0) + logger.info(f"Took: {time.time() - start} to add NZGMDB sites") + # Add region logger.info("Adding region information...") start = time.time() @@ -677,47 +712,13 @@ def get_site_df( # Add site ids logger.info("Adding site code...") - site_df["site_code"] = np.char.add( - encode_base62_fixed_array(site_df.index.values, length=5), - site_df.region_code.values.astype(str), + virt_mask = site_df.source == "virtual" + site_df.loc[virt_mask, "site_code"] = np.char.add( + encode_base62_fixed_array(site_df.loc[virt_mask].index.values.astype(int), length=5), + site_df.loc[virt_mask].region_code.values.astype(str), ) + site_df.loc[~virt_mask, "site_code"] = site_df.loc[~virt_mask].index.values site_df = site_df.set_index("site_code") - - # Add NZGMDB sites - if self.config.nzgmdb_version is not None: - start = time.time() - logger.info("Adding NZGMDB sites...") - nzgmdb_site_df = pd.read_csv( - GRID_DATA.fetch( - NZGMDB_VERSION_TO_TABLE_NAME[self.config.nzgmdb_version] - ), - index_col="sta", - usecols=["sta", "lat", "lon", "Vs30", "Z1.0", "Z2.5"], - ) - nzgmdb_site_df["source"] = "real" - nzgmdb_nztm_values = coordinates.wgs_depth_to_nztm( - nzgmdb_site_df[["lat", "lon"]].values - ) - nzgmdb_site_df["nztm_x"] = nzgmdb_nztm_values[:, 1] - nzgmdb_site_df["nztm_y"] = nzgmdb_nztm_values[:, 0] - # Convert Z1.0 to km - nzgmdb_site_df["Z1.0"] /= 1000 - - if self.config.region is not None: - region_nztm = shapely.transform( - self.config.region, - lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]), - ) - region_mask = shapely.contains_xy( - region_nztm, - nzgmdb_site_df["nztm_y"].values, - nzgmdb_site_df["nztm_x"].values, - ) - nzgmdb_site_df = nzgmdb_site_df[region_mask] - - site_df = pd.concat([site_df, nzgmdb_site_df], axis=0) - logger.info(f"Took: {time.time() - start} to add NZGMDB sites") - site_df = site_df.astype({"source": "category"}) # Sanity check From fb336fce285d7cfd659ab47f5a1e5c42e065f6ac Mon Sep 17 00:00:00 2001 From: Claudio Date: Tue, 4 Nov 2025 13:31:16 +1300 Subject: [PATCH 13/21] Address PR checks --- .github/workflows/pytest.yml | 2 + pyproject.toml | 5 + tests/test_site_gen.py | 3 +- workflow/scripts/site_gen_cmds.py | 27 +++-- workflow/site_gen.py | 188 +++++++++++++++++++++++++----- 5 files changed, 183 insertions(+), 42 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index e8a39e89..58199000 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -16,6 +16,8 @@ jobs: **/pyproject.toml - run: uv venv - run: uv pip install -e ".[test,dev]" + - name: Download velocity data + run: nzcvm-data-helper ensure --no-full - name: Run tests run: uv run pytest --cov=workflow --cov-report=html tests - name: Upload coverage data diff --git a/pyproject.toml b/pyproject.toml index 81308b5b..83a0eda3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,11 @@ dependencies = [ "tqdm", "typer", + # Plotting + "plotly", + "pygmt", + "pooch", + # Misc. "requests", # For gcmt-to-realisation "schema", # For loading realisations diff --git a/tests/test_site_gen.py b/tests/test_site_gen.py index 33dadfa1..32f9df57 100644 --- a/tests/test_site_gen.py +++ b/tests/test_site_gen.py @@ -6,7 +6,8 @@ import pytest import shapely import xarray as xr -from hypothesis import given, strategies as st, assume +from hypothesis import assume, given +from hypothesis import strategies as st from scipy.spatial import distance as sdist from workflow import site_gen diff --git a/workflow/scripts/site_gen_cmds.py b/workflow/scripts/site_gen_cmds.py index e683fe59..33c047a2 100644 --- a/workflow/scripts/site_gen_cmds.py +++ b/workflow/scripts/site_gen_cmds.py @@ -1,3 +1,4 @@ +"""Commands for generating simulation site grids.""" import logging from pathlib import Path from typing import Annotated @@ -25,10 +26,18 @@ app = typer.Typer() -@app.command("gen-general-grid") -def gen_general_grid(grid_spacing: str, output_ffp: Path) -> None: - """Generate the general grid.""" - +@cli.from_docstring(app) +def gen_general_grid(grid_spacing: int, output_ffp: Path) -> None: + """ + Generate the general grid. + + Parameters + ---------- + grid_spacing : int + The grid spacing in metres. + output_ffp : Path + The path to save the output netCDF file. + """ land_mask_grid = site_gen.gen_general_land_mask_grid( grid_spacing, ) @@ -128,15 +137,13 @@ def gen_plot( The path to the custom grid site dataframe file (parquet). output_ffp : Path The path to save the output HTML file. - rel_ffp: Path, optional + rel_ffp : Path, optional The path to the realisation config. If provided, the domain and source will be plotted. """ site_df = pd.read_parquet(site_df_ffp) - site_df_meta_ffp = ( - site_df_ffp.parent / f"{site_df_ffp.stem}_metadata.yaml" - ) + site_df_meta_ffp = site_df_ffp.parent / f"{site_df_ffp.stem}_metadata.yaml" with site_df_meta_ffp.open("r") as f: config = site_gen.CustomGridConfig.from_config( yaml.load(f, Loader=yaml.FullLoader)["config"] @@ -236,7 +243,7 @@ def gen_plot( hoverinfo="skip", ) ) - + if config.per_region_spacing is not None: # Plot the regions with different spacing for region_spacing in config.per_region_spacing: @@ -249,7 +256,7 @@ def gen_plot( hoverinfo="skip", ) ) - + if rel_ffp is not None: # Plot the domain domain_corners = DomainParameters.read_from_realisation(rel_ffp).domain.corners diff --git a/workflow/site_gen.py b/workflow/site_gen.py index b5f907ac..6131f3b7 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -62,7 +62,16 @@ class GeneralGrid: """ def __init__(self, land_mask_grid: xr.DataArray, spacing: int) -> None: - """Initialize GeneralGrid.""" + """ + Initialize GeneralGrid. + + Parameters + ---------- + land_mask_grid : xr.DataArray + The land mask grid as an xarray DataArray. + spacing : int + The grid spacing in metres. + """ self.land_mask_grid = land_mask_grid self.site_ids = np.arange(land_mask_grid.size, dtype=np.uint32).reshape( land_mask_grid.shape @@ -90,7 +99,19 @@ def shape(self) -> tuple[int, int]: @classmethod def load(cls, land_mask_grid_ffp: Path) -> "GeneralGrid": - """Load GeneralGrid from file.""" + """ + Load GeneralGrid from file. + + Parameters + ---------- + land_mask_grid_ffp : Path + The file path to the general grid file. + + Returns + ------- + GeneralGrid + The loaded GeneralGrid instance. + """ logger.info(f"Loading general grid from {land_mask_grid_ffp}...") da = xr.load_dataarray(land_mask_grid_ffp, engine="h5netcdf") return cls(da, int(da.attrs["spacing"])) @@ -102,21 +123,20 @@ def gen_general_land_mask_grid(spacing: int) -> xr.DataArray: Parameters ---------- - grid_spacing : str + spacing : int Grid spacing in metres + Returns + ------- + xr.DataArray + The land mask grid DataArray, with values of 1 for land + and 0 for ocean. Notes ----- Common grid spacing formats: - To specify grid spacing of `x` units: `"{x}{unit}/{x}{unit}"`, where `unit` can be metres (`e`) or kilometres (`k`). - - Returns - ------- - xr.DataArray - The land mask grid DataArray, with values of 1 for land - and 0 for ocean. """ land_df = gpd.read_parquet(GRID_DATA.fetch("nz_coastline.parquet")) # Combine into a single polygon @@ -127,7 +147,9 @@ def gen_general_land_mask_grid(spacing: int) -> xr.DataArray: # Generate grid logger.info("Generating grid...") - land_mask_grid = pygmt.grdlandmask(region="NZ", spacing=f"{spacing}e/{spacing}e").astype(bool) + land_mask_grid = pygmt.grdlandmask( + region="NZ", spacing=f"{spacing}e/{spacing}e" + ).astype(bool) land_mask_grid[:] = False land_mask_grid.attrs = {"spacing": spacing} @@ -166,7 +188,14 @@ class RegionSpacingConfig: """Grid spacing in metres within the region.""" def as_dict(self) -> dict: - """Convert RegionSpacingConfig to dictionary.""" + """ + Convert RegionSpacingConfig to dictionary. + + Returns + ------- + dict + Dictionary representation of the configuration. + """ return { "name": self.name, "region": shapely.geometry.mapping(self.region), @@ -175,7 +204,20 @@ def as_dict(self) -> dict: @classmethod def from_config(cls, config_dict: dict) -> "RegionSpacingConfig": - """Create RegionSpacingConfig from configuration dictionary.""" + """ + Create RegionSpacingConfig from configuration dictionary. + + Parameters + ---------- + config_dict : dict + Configuration dictionary with keys 'name', 'spacing', + and either 'region' or 'geojson_ffp'. + + Returns + ------- + RegionSpacingConfig + The created RegionSpacingConfig instance. + """ if "geojson_ffp" not in config_dict and "region" not in config_dict: raise ValueError( "Either 'geojson_ffp' or 'region' must be provided in the config dictionary." @@ -196,6 +238,7 @@ def from_config(cls, config_dict: dict) -> "RegionSpacingConfig": spacing=config_dict["spacing"], ) + @dataclass class CustomGridConfig: """Configuration for CustomGrid.""" @@ -224,7 +267,6 @@ class CustomGridConfig: nzgmdb_version: NZGMDBVersion | None = None """NZGMDB version for real stations.""" - def __post_init__(self) -> None: """Post-initialization checks.""" if self.basin_spacing is not None and self.vel_model_version is None: @@ -234,7 +276,19 @@ def __post_init__(self) -> None: @classmethod def from_config(cls, config_dict: dict) -> "CustomGridConfig": - """Create CustomGridConfig from dictionary.""" + """ + Create CustomGridConfig from dictionary. + + Parameters + ---------- + config_dict : dict + Configuration dictionary containing CustomGridConfig settings. + + Returns + ------- + CustomGridConfig + The created CustomGridConfig instance. + """ # Get the region polygon region = config_dict.get("region") if region is not None and isinstance(region, dict): @@ -264,7 +318,14 @@ def from_config(cls, config_dict: dict) -> "CustomGridConfig": ) def as_dict(self) -> dict: - """Convert CustomGridConfig to dictionary.""" + """ + Convert CustomGridConfig to dictionary. + + Returns + ------- + dict + Dictionary representation of the configuration. + """ return { "land_only": self.land_only, "region": ( @@ -294,7 +355,19 @@ class CustomGrid: """ def __init__(self, general_grid: GeneralGrid = None) -> None: - """Initialize CustomGrid.""" + """ + Initialize CustomGrid. + + Parameters + ---------- + general_grid : GeneralGrid, optional + The general grid to use. If None, it will be loaded from the + default location. + + Returns + ------- + None + """ if general_grid is None: logger.info("Loading general grid...") self.general_grid = GeneralGrid.load(GRID_DATA.fetch("general_grid.nc")) @@ -304,7 +377,9 @@ def __init__(self, general_grid: GeneralGrid = None) -> None: self._reset() def _reset(self) -> None: - """Resets the custom grid.""" + """ + Resets the custom grid. + """ self._and_mask = np.ones(self.general_grid.shape, dtype=bool) self._or_mask = np.zeros(self.general_grid.shape, dtype=bool) self.config = None @@ -321,6 +396,16 @@ def apply_config(self, config: CustomGridConfig) -> "CustomGrid": """ Applies a CustomGridConfig to the CustomGrid. Resets any previously applied filters. + + Parameters + ---------- + config : CustomGridConfig + The configuration to apply. + + Returns + ------- + CustomGrid + Returns self for method chaining. """ self._reset() self.config = config @@ -475,7 +560,7 @@ def _add_basin_spacing_filter( ) -> None: """ Adds a basin spacing filter. - Note that this is an OR filter, i.e., it only adds sites + Note that this is an OR filter, i.e., it only adds sites. Parameters ---------- @@ -483,6 +568,12 @@ def _add_basin_spacing_filter( The velocity model version to use for basin spacing. spacing : int The grid spacing in metres to use within basins. + basins : list[str] | None, optional + The specific basins to apply the spacing to. If None, applies to all basins. + + Returns + ------- + None """ logger.info( f"Adding basin {spacing} spacing filter for {basins if basins is not None else 'all'} basins..." @@ -524,6 +615,16 @@ def _add_basin_spacing_filter( def get_metadata(self, site_df: pd.DataFrame = None) -> dict: """ Gets the metadata dictionary for the custom grid. + + Parameters + ---------- + site_df : pd.DataFrame, optional + The site dataframe to generate metadata from. + + Returns + ------- + dict + Dictionary containing metadata and configuration information. """ site_metadata = {} if site_df is not None: @@ -549,20 +650,11 @@ def get_site_df( """ Gets the site dataframe for the custom grid. - Parameters - ---------- - nzgmdb_version : NZGMDBVersion, optional - The NZGMDB version to include real stations from. - vel_model_version : str, optional - The velocity model version to use for basin membership - and Z-values. Is only used if the basin spacing filter - has not already been added to the custom grid. - Returns ------- pd.DataFrame The site dataframe with columns: - - site_id: The site ID. + - general_site_id: The site ID from the general grid. - lon: The site longitude. - lat: The site latitude. - nztm_x: The site NZTM X coordinate. @@ -574,6 +666,12 @@ def get_site_df( - basin: The basin the site is in (if any). - Z1.0: The Z1.0 value for the site (if vel_model_version is provided). - Z2.5: The Z2.5 value for the site (if vel_model_version is provided). + - site_code: The site code. + + Notes + ----- + Basin membership and Z-values are only added if vel_model_version + is set in the configuration. """ if self.mask.sum() == 0: logger.warning("Custom grid has no sites selected.") @@ -714,7 +812,9 @@ def get_site_df( logger.info("Adding site code...") virt_mask = site_df.source == "virtual" site_df.loc[virt_mask, "site_code"] = np.char.add( - encode_base62_fixed_array(site_df.loc[virt_mask].index.values.astype(int), length=5), + encode_base62_fixed_array( + site_df.loc[virt_mask].index.values.astype(int), length=5 + ), site_df.loc[virt_mask].region_code.values.astype(str), ) site_df.loc[~virt_mask, "site_code"] = site_df.loc[~virt_mask].index.values @@ -727,7 +827,19 @@ def get_site_df( def get_basin_boundaries(vel_model_version: str) -> dict[str, shapely.Polygon]: - """Gets the basin boundaries for a given velocity model version.""" + """ + Gets the basin boundaries for a given velocity model version. + + Parameters + ---------- + vel_model_version : str + The velocity model version to load basin data for. + + Returns + ------- + dict[str, shapely.Polygon] + Dictionary mapping basin names to their boundary polygons. + """ cvm_registry = CVMRegistry(vel_model_version, get_data_root()) basin_data = cvm_registry.load_basin_data(cvm_registry.global_params["basins"]) @@ -746,7 +858,21 @@ def get_basin_boundaries(vel_model_version: str) -> dict[str, shapely.Polygon]: def encode_base62_fixed_array(nums: np.ndarray, length: int) -> np.ndarray: - """Vectorized Base62 encoder for many integers.""" + """ + Vectorized Base62 encoder for many integers. + + Parameters + ---------- + nums : np.ndarray + Array of integers to encode. + length : int + The fixed length for each encoded string. + + Returns + ------- + np.ndarray + Array of Base62 encoded strings of fixed length. + """ alphabet = np.array(list(string.digits + string.ascii_letters)) alphabet_size = len(alphabet) From 9d6e346c759f6f3ffef1f2518a5f284d1e2c84ed Mon Sep 17 00:00:00 2001 From: Claudio Date: Tue, 4 Nov 2025 13:42:14 +1300 Subject: [PATCH 14/21] PR test fixes. --- .github/workflows/pytest.yml | 1 + tests/test_site_gen.py | 4 +-- workflow/site_gen.py | 57 ++++++++++++++++++------------------ 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 58199000..a40d385b 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -17,6 +17,7 @@ jobs: - run: uv venv - run: uv pip install -e ".[test,dev]" - name: Download velocity data + shell: bash -l -e {0} run: nzcvm-data-helper ensure --no-full - name: Run tests run: uv run pytest --cov=workflow --cov-report=html tests diff --git a/tests/test_site_gen.py b/tests/test_site_gen.py index 32f9df57..381b2b0a 100644 --- a/tests/test_site_gen.py +++ b/tests/test_site_gen.py @@ -361,7 +361,7 @@ def test_small_basin_spacing_error( ValueError, match="Basin spacing must be greater than or equal to the general grid spacing.", ): - custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + site_gen.CustomGrid(general_grid).apply_config(config) @pytest.mark.parametrize("factor", [1.2, 1.5]) @@ -379,7 +379,7 @@ def test_multiple_error_basin_spacing( with pytest.raises( ValueError, match="Basin spacing must be a multiple of the general grid spacing" ): - custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + site_gen.CustomGrid(general_grid).apply_config(config) def test_site_dataframe( general_grid: site_gen.GeneralGrid diff --git a/workflow/site_gen.py b/workflow/site_gen.py index 6131f3b7..a2d37b29 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -1,3 +1,4 @@ +"""Module for generating simulation site grids.""" import enum import json import logging @@ -59,19 +60,17 @@ class GeneralGrid: """ Represents the general base grid of virtual sites. Any custom grid is built off this grid. + + Parameters + ---------- + land_mask_grid : xr.DataArray + The land mask grid as an xarray DataArray. + spacing : int + The grid spacing in metres. """ def __init__(self, land_mask_grid: xr.DataArray, spacing: int) -> None: - """ - Initialize GeneralGrid. - - Parameters - ---------- - land_mask_grid : xr.DataArray - The land mask grid as an xarray DataArray. - spacing : int - The grid spacing in metres. - """ + """Initialize GeneralGrid.""" self.land_mask_grid = land_mask_grid self.site_ids = np.arange(land_mask_grid.size, dtype=np.uint32).reshape( land_mask_grid.shape @@ -94,7 +93,14 @@ def __init__(self, land_mask_grid: xr.DataArray, spacing: int) -> None: @property def shape(self) -> tuple[int, int]: - """Shape of the grid.""" + """ + Shape of the grid. + + Returns + ------- + tuple[int, int] + The shape of the grid. + """ return self.land_mask_grid.shape @classmethod @@ -352,22 +358,16 @@ class CustomGrid: """ Represents a custom grid of virtual sites built from the general grid. + + Parameters + ---------- + general_grid : GeneralGrid, optional + The general grid to use. If None, it will be loaded from the + default location. """ def __init__(self, general_grid: GeneralGrid = None) -> None: - """ - Initialize CustomGrid. - - Parameters - ---------- - general_grid : GeneralGrid, optional - The general grid to use. If None, it will be loaded from the - default location. - - Returns - ------- - None - """ + """Initialize CustomGrid.""" if general_grid is None: logger.info("Loading general grid...") self.general_grid = GeneralGrid.load(GRID_DATA.fetch("general_grid.nc")) @@ -389,6 +389,11 @@ def mask(self) -> np.ndarray: """ Site mask that gives the custom grid when applied to the the general grid. + + Returns + ------- + np.ndarray + The custom grid mask. """ return self._and_mask & self._or_mask @@ -570,10 +575,6 @@ def _add_basin_spacing_filter( The grid spacing in metres to use within basins. basins : list[str] | None, optional The specific basins to apply the spacing to. If None, applies to all basins. - - Returns - ------- - None """ logger.info( f"Adding basin {spacing} spacing filter for {basins if basins is not None else 'all'} basins..." From bad818b9c57103651e07bce930ba780e8b628d0a Mon Sep 17 00:00:00 2001 From: Claudio Date: Mon, 10 Nov 2025 10:16:58 +1300 Subject: [PATCH 15/21] PR issues --- .github/workflows/pytest.yml | 6 +++-- tests/test_site_gen.py | 44 ++++++++++++++++++++---------------- workflow/scripts/merge_ts.py | 31 ------------------------- workflow/site_gen.py | 35 ++++++++++++++++------------ 4 files changed, 49 insertions(+), 67 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index a40d385b..564461dc 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -16,9 +16,11 @@ jobs: **/pyproject.toml - run: uv venv - run: uv pip install -e ".[test,dev]" + # - name: Download velocity data + # shell: bash -l -e {0} + # run: uv run nzcvm-data-helper ensure --no-full - name: Download velocity data - shell: bash -l -e {0} - run: nzcvm-data-helper ensure --no-full + run: uv run nzcvm-data-helper ensure --no-full - name: Run tests run: uv run pytest --cov=workflow --cov-report=html tests - name: Upload coverage data diff --git a/tests/test_site_gen.py b/tests/test_site_gen.py index 381b2b0a..dd49441d 100644 --- a/tests/test_site_gen.py +++ b/tests/test_site_gen.py @@ -300,7 +300,7 @@ def test_uniform_valid_spacing( def test_basin_spacing( - general_grid: site_gen.GeneralGrid, canterbury_region_polygon: shapely.Polygon + general_grid: site_gen.GeneralGrid, canterbury_region_polygon: shapely.Polygon ) -> None: uniform_spacing = general_grid.spacing * 2 config = site_gen.CustomGridConfig( @@ -332,6 +332,7 @@ def test_small_uniform_spacing_error( ): site_gen.CustomGrid(general_grid).apply_config(config) + @pytest.mark.parametrize("factor", [1.2, 1.5]) def test_multiple_error_uniform_spacing( general_grid: site_gen.GeneralGrid, factor: float @@ -346,6 +347,7 @@ def test_multiple_error_uniform_spacing( ): site_gen.CustomGrid(general_grid).apply_config(config) + @pytest.mark.parametrize("basin_spacing", [5_000, 10_000]) def test_small_basin_spacing_error( general_grid: site_gen.GeneralGrid, basin_spacing: float @@ -381,9 +383,8 @@ def test_multiple_error_basin_spacing( ): site_gen.CustomGrid(general_grid).apply_config(config) -def test_site_dataframe( - general_grid: site_gen.GeneralGrid -) -> None: + +def test_site_dataframe(general_grid: site_gen.GeneralGrid) -> None: config = site_gen.CustomGridConfig( uniform_spacing=general_grid.spacing * 2, vel_model_version="2.09", @@ -412,32 +413,33 @@ def test_site_dataframe_basin( site_df = custom_grid.get_site_df() assert (site_df.basin == "Canterbury_v25p9").all() - - - @given( - nums=st.lists(st.integers(min_value=0, max_value=62**4 - 1), min_size=2, max_size=500), + nums=st.lists( + st.integers(min_value=0, max_value=62**4 - 1), min_size=2, max_size=500 + ), length=st.integers(min_value=3, max_value=5), ) def test_encode_base62_combined_properties(nums: list[int], length: int) -> None: """Property: encoded strings should have fixed length, be unique, and use valid base62 characters.""" nums_array = np.array(nums) assume(np.all(nums_array < 62**length)) # Ensure all inputs fit in the length - assume(np.unique(nums_array).size == len(nums_array)) # Ensure all inputs are unique - + assume( + np.unique(nums_array).size == len(nums_array) + ) # Ensure all inputs are unique + result = site_gen.encode_base62_fixed_array(nums_array, length=length) - + # The number of output codes should match the number of input numbers assert len(result) == len(nums) # All encoded strings should have the specified length assert all(len(code) == length for code in result) - + # Different numbers should produce different codes (all inputs are unique) assert len(np.unique(result)) == len(nums) - + # Encoded strings should only contain valid base62 characters alphabet = set(string.digits + string.ascii_letters) for code in result: @@ -451,31 +453,35 @@ def test_encode_base62_boundary_values_property(length: int) -> None: """Property: test boundary values (0 and max) for a given length.""" max_val = 62**length - 1 nums = np.array([0, max_val]) - + result = site_gen.encode_base62_fixed_array(nums, length=length) - + assert len(result) == 2 assert all(len(code) == length for code in result) assert result[0] == "0" * length + @given( too_large_num=st.integers(min_value=62**3, max_value=62**4), length=st.just(3), ) -def test_encode_base62_too_large_error_property(too_large_num: int, length: int) -> None: +def test_encode_base62_too_large_error_property( + too_large_num: int, length: int +) -> None: """Property: numbers too large for the specified length should raise ValueError.""" nums = np.array([too_large_num]) - + with pytest.raises(ValueError, match="too large"): site_gen.encode_base62_fixed_array(nums, length=length) + @given( num=st.integers(min_value=0, max_value=62**3 - 1), ) def test_encode_base62_same_input_same_output_property(num: int) -> None: """Property: encoding the same number multiple times should give the same result.""" nums = np.array([num, num, num]) - + result = site_gen.encode_base62_fixed_array(nums, length=3) - + assert result[0] == result[1] == result[2] diff --git a/workflow/scripts/merge_ts.py b/workflow/scripts/merge_ts.py index aa82424e..4f1d4498 100644 --- a/workflow/scripts/merge_ts.py +++ b/workflow/scripts/merge_ts.py @@ -155,34 +155,3 @@ def merge_ts_hdf5( } }, ) - - -@cli.from_docstring(app, name="xyts") -def merge_ts( - component_xyts_directory: Annotated[ - Path, - typer.Argument( - dir_okay=True, - file_okay=False, - exists=True, - readable=True, - ), - ], - output: Annotated[ - Path, - typer.Argument(dir_okay=False, writable=True), - ], - glob_pattern: str = "*xyts-*.e3d", -) -> None: - """Merge XYTS files. - - Parameters - ---------- - component_xyts_directory : Path - The input xyts directory containing files to merge. - output : Path - The output xyts file. - glob_pattern : str, optional - Set a custom glob pattern for merging the xyts files, by default "*xyts-*.e3d". - """ - merge_ts_xyts(component_xyts_directory, output, glob_pattern) diff --git a/workflow/site_gen.py b/workflow/site_gen.py index a2d37b29..f30c7142 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -60,17 +60,20 @@ class GeneralGrid: """ Represents the general base grid of virtual sites. Any custom grid is built off this grid. - - Parameters - ---------- - land_mask_grid : xr.DataArray - The land mask grid as an xarray DataArray. - spacing : int - The grid spacing in metres. """ def __init__(self, land_mask_grid: xr.DataArray, spacing: int) -> None: - """Initialize GeneralGrid.""" + """ + Initialize GeneralGrid. + + Parameters + ---------- + land_mask_grid : xr.DataArray + The land mask grid as an xarray DataArray. + spacing : int + The grid spacing in metres. + + """ self.land_mask_grid = land_mask_grid self.site_ids = np.arange(land_mask_grid.size, dtype=np.uint32).reshape( land_mask_grid.shape @@ -358,16 +361,18 @@ class CustomGrid: """ Represents a custom grid of virtual sites built from the general grid. - - Parameters - ---------- - general_grid : GeneralGrid, optional - The general grid to use. If None, it will be loaded from the - default location. """ def __init__(self, general_grid: GeneralGrid = None) -> None: - """Initialize CustomGrid.""" + """ + Initialize CustomGrid. + + Parameters + ---------- + general_grid : GeneralGrid, optional + The general grid to use. If None, it will be loaded from the + default location. + """ if general_grid is None: logger.info("Loading general grid...") self.general_grid = GeneralGrid.load(GRID_DATA.fetch("general_grid.nc")) From 9bcebb2a44e13084423f5e1cb4575a631c4763b7 Mon Sep 17 00:00:00 2001 From: Claudio Date: Mon, 10 Nov 2025 11:05:15 +1300 Subject: [PATCH 16/21] Bug fix, fix tests --- tests/test_site_gen.py | 18 +++++++++--------- workflow/site_gen.py | 18 +++++++++++++++--- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/tests/test_site_gen.py b/tests/test_site_gen.py index dd49441d..9da5cd92 100644 --- a/tests/test_site_gen.py +++ b/tests/test_site_gen.py @@ -31,15 +31,15 @@ def test_general_grid(land_mask_grid: tuple[xr.DataArray, int]) -> None: land_mask_grid.to_netcdf(ffp) loaded_grid = site_gen.GeneralGrid.load(ffp) - assert loaded_grid.land_mask_grid.spacing == spacing - assert loaded_grid.shape == land_mask_grid.shape - assert np.array_equal(loaded_grid.land_mask_grid.values, land_mask_grid.values) - assert np.array_equal( - loaded_grid.land_mask_grid.lat.values, land_mask_grid.lat.values - ) - assert np.array_equal( - loaded_grid.land_mask_grid.lon.values, land_mask_grid.lon.values - ) + assert loaded_grid.land_mask_grid.spacing == spacing + assert loaded_grid.shape == land_mask_grid.shape + assert np.array_equal(loaded_grid.land_mask_grid.values, land_mask_grid.values) + assert np.array_equal( + loaded_grid.land_mask_grid.lat.values, land_mask_grid.lat.values + ) + assert np.array_equal( + loaded_grid.land_mask_grid.lon.values, land_mask_grid.lon.values + ) @pytest.fixture(scope="module") diff --git a/workflow/site_gen.py b/workflow/site_gen.py index f30c7142..a20a1497 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -60,6 +60,13 @@ class GeneralGrid: """ Represents the general base grid of virtual sites. Any custom grid is built off this grid. + + Parameters + ---------- + land_mask_grid : xr.DataArray + The land mask grid as an xarray DataArray. + spacing : int + The grid spacing in metres. """ def __init__(self, land_mask_grid: xr.DataArray, spacing: int) -> None: @@ -72,7 +79,6 @@ def __init__(self, land_mask_grid: xr.DataArray, spacing: int) -> None: The land mask grid as an xarray DataArray. spacing : int The grid spacing in metres. - """ self.land_mask_grid = land_mask_grid self.site_ids = np.arange(land_mask_grid.size, dtype=np.uint32).reshape( @@ -361,6 +367,12 @@ class CustomGrid: """ Represents a custom grid of virtual sites built from the general grid. + + Parameters + ---------- + general_grid : GeneralGrid, optional + The general grid to use. If None, it will be loaded from the + default location. """ def __init__(self, general_grid: GeneralGrid = None) -> None: @@ -810,8 +822,8 @@ def get_site_df( include_sigma=False, logger=logger, ) - site_df["Z1.0"] = z_values["Z_1.0(km)"] - site_df["Z2.5"] = z_values["Z_2.5(km)"] + site_df["Z1.0"] = z_values["Z1.0(km)"] + site_df["Z2.5"] = z_values["Z2.5(km)"] logger.info(f"Took: {time.time() - start} to add Z-values") # Add site ids From 72457bbbe51672059e048d10dc43d00cff752443 Mon Sep 17 00:00:00 2001 From: Claudio Date: Mon, 10 Nov 2025 11:37:09 +1300 Subject: [PATCH 17/21] PR test fix. --- tests/test_site_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_site_gen.py b/tests/test_site_gen.py index 9da5cd92..1cf7597f 100644 --- a/tests/test_site_gen.py +++ b/tests/test_site_gen.py @@ -28,7 +28,7 @@ def test_general_grid(land_mask_grid: tuple[xr.DataArray, int]) -> None: land_mask_grid, spacing = land_mask_grid with tempfile.TemporaryDirectory() as tmpdir: ffp = f"{tmpdir}/general_grid.nc" - land_mask_grid.to_netcdf(ffp) + land_mask_grid.to_netcdf(ffp, engine="scipy") loaded_grid = site_gen.GeneralGrid.load(ffp) assert loaded_grid.land_mask_grid.spacing == spacing From 086d3114ac955ae3db5862ca5a57189b9b09ef45 Mon Sep 17 00:00:00 2001 From: Claudio Date: Mon, 10 Nov 2025 12:33:55 +1300 Subject: [PATCH 18/21] PR test fixes. Add type checking. --- .github/workflows/types.yml | 29 +++++++++++++++++++++++++++++ tests/test_site_gen.py | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/types.yml diff --git a/.github/workflows/types.yml b/.github/workflows/types.yml new file mode 100644 index 00000000..70965a1b --- /dev/null +++ b/.github/workflows/types.yml @@ -0,0 +1,29 @@ +name: Type Check + +on: + push: + branches: [pegasus] + pull_request: + branches: [pegasus] + +jobs: + typecheck: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + - name: Install project with types + run: uv sync --all-extras --dev + + - name: Run type checking with ty + run: uv run ty check --exclude workflow/schemas.py --exclude setup.py \ No newline at end of file diff --git a/tests/test_site_gen.py b/tests/test_site_gen.py index 1cf7597f..61f1ae06 100644 --- a/tests/test_site_gen.py +++ b/tests/test_site_gen.py @@ -28,7 +28,7 @@ def test_general_grid(land_mask_grid: tuple[xr.DataArray, int]) -> None: land_mask_grid, spacing = land_mask_grid with tempfile.TemporaryDirectory() as tmpdir: ffp = f"{tmpdir}/general_grid.nc" - land_mask_grid.to_netcdf(ffp, engine="scipy") + land_mask_grid.to_netcdf(ffp, engine="h5netcdf") loaded_grid = site_gen.GeneralGrid.load(ffp) assert loaded_grid.land_mask_grid.spacing == spacing From 2cd7bef470be9b0dafb2734701879642c2e40052 Mon Sep 17 00:00:00 2001 From: Claudio Date: Wed, 12 Nov 2025 12:48:39 +1300 Subject: [PATCH 19/21] More test, typing --- tests/test_site_gen.py | 185 ++++++++++++++++++++++++++++++++++++++--- workflow/site_gen.py | 47 ++++++----- 2 files changed, 203 insertions(+), 29 deletions(-) diff --git a/tests/test_site_gen.py b/tests/test_site_gen.py index 61f1ae06..bb54ab2d 100644 --- a/tests/test_site_gen.py +++ b/tests/test_site_gen.py @@ -1,8 +1,12 @@ import json import string import tempfile +from pathlib import Path +from unittest.mock import patch +import geopandas as gpd import numpy as np +import pandas as pd import pytest import shapely import xarray as xr @@ -10,24 +14,31 @@ from hypothesis import strategies as st from scipy.spatial import distance as sdist +from qcore import coordinates from workflow import site_gen @pytest.fixture(scope="module", params=[50_000, 25_000]) -def land_mask_grid(request: pytest.FixtureRequest) -> tuple[xr.DataArray, int]: +def land_mask_grid_spacing_tuple( + request: pytest.FixtureRequest, +) -> tuple[xr.DataArray, int]: spacing = request.param return site_gen.gen_general_land_mask_grid(spacing), spacing @pytest.fixture -def general_grid(land_mask_grid: tuple[xr.DataArray, int]) -> site_gen.GeneralGrid: - return site_gen.GeneralGrid(land_mask_grid[0], land_mask_grid[1]) +def general_grid( + land_mask_grid_spacing_tuple: tuple[xr.DataArray, int], +) -> site_gen.GeneralGrid: + return site_gen.GeneralGrid( + land_mask_grid_spacing_tuple[0], land_mask_grid_spacing_tuple[1] + ) -def test_general_grid(land_mask_grid: tuple[xr.DataArray, int]) -> None: - land_mask_grid, spacing = land_mask_grid +def test_general_grid(land_mask_grid_spacing_tuple: tuple[xr.DataArray, int]) -> None: + land_mask_grid, spacing = land_mask_grid_spacing_tuple with tempfile.TemporaryDirectory() as tmpdir: - ffp = f"{tmpdir}/general_grid.nc" + ffp = Path(f"{tmpdir}/general_grid.nc") land_mask_grid.to_netcdf(ffp, engine="h5netcdf") loaded_grid = site_gen.GeneralGrid.load(ffp) @@ -101,6 +112,20 @@ def water_region_polygon() -> shapely.Polygon: return polygon +@pytest.fixture(scope="module") +def central_south_island_region_polygon() -> shapely.Polygon: + polygon = shapely.Polygon( + [ + (169.07901872038428, -42.23259539489639), + (169.07901872038428, -44.678482592421624), + (174.25821453531285, -44.678482592421624), + (174.25821453531285, -42.23259539489639), + (169.07901872038428, -42.23259539489639), + ] + ) + return polygon + + def test_region_spacing_config_region_file( chch_region_polygon: shapely.Polygon, ) -> None: @@ -128,6 +153,17 @@ def test_region_spacing_config_region_file( assert shapely.equals_exact(region_spacing_config.region, chch_region_polygon) +def test_region_spacing_config_no_region_specified() -> None: + # Create region spacing config with no region specified + config_dict = {"name": "chch", "spacing": 1000} + + with pytest.raises( + ValueError, + match="Either 'geojson_ffp' or 'region' must be provided in the config dictionary.", + ): + site_gen.RegionSpacingConfig.from_config(config_dict) + + def test_region_spacing_config_region_metadata( chch_region_polygon: shapely.Polygon, ) -> None: @@ -145,6 +181,16 @@ def test_region_spacing_config_region_metadata( assert shapely.equals_exact(region_spacing_config.region, chch_region_polygon) +def test_custom_grid_config_no_vel_model_version() -> None: + config_dict = {"basin_spacing": 2500} + + with pytest.raises( + ValueError, + match="vel_model_version must be provided if basin_spacing is set.", + ): + site_gen.CustomGridConfig.from_config(config_dict) + + def test_custom_grid_config(chch_region_polygon: shapely.Polygon) -> None: # Create the custom grid config dict with tempfile.TemporaryDirectory() as tmpdir: @@ -217,6 +263,19 @@ def test_custom_grid_config(chch_region_polygon: shapely.Polygon) -> None: ) +def test_custom_grid_init_without_general_grid( + general_grid: site_gen.GeneralGrid, +) -> None: + with patch.object( + site_gen.GeneralGrid, + "load", + return_value=general_grid, + ) as mock_load: + custom_grid = site_gen.CustomGrid() + mock_load.assert_called_once() + assert custom_grid.general_grid == general_grid + + def test_custom_grid_land_region_filtering( general_grid: site_gen.GeneralGrid, land_region_polygon: shapely.Polygon ) -> None: @@ -320,9 +379,115 @@ def test_basin_spacing( assert np.all(min_distances < uniform_spacing) +def test_per_basin_spacing( + general_grid: site_gen.GeneralGrid, + central_south_island_region_polygon: shapely.Polygon, +) -> None: + uniform_spacing = general_grid.spacing * 2 + config = site_gen.CustomGridConfig( + region=central_south_island_region_polygon, + uniform_spacing=uniform_spacing, + per_basin_spacing={"Canterbury_v25p9": general_grid.spacing}, + vel_model_version="2.09", + ) + + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + site_df = custom_grid.get_site_df() + + cant_basin_sites = site_df[site_df.basin == "Canterbury_v25p9"] + distances = sdist.squareform( + sdist.pdist(cant_basin_sites[["nztm_x", "nztm_y"]].values) + ) + np.fill_diagonal(distances, np.inf) + min_distances = distances.min(axis=1) + assert np.allclose(min_distances, general_grid.spacing, rtol=0.12) + + other_sites = site_df[site_df.basin != "Canterbury_v25p9"] + distances = sdist.squareform(sdist.pdist(other_sites[["nztm_x", "nztm_y"]].values)) + np.fill_diagonal(distances, np.inf) + min_distances = distances.min(axis=1) + assert np.allclose(min_distances, uniform_spacing, rtol=0.12) + + +def test_custom_grid_metadata(general_grid: site_gen.GeneralGrid) -> None: + config = site_gen.CustomGridConfig( + uniform_spacing=general_grid.spacing, + ) + + custom_grid = site_gen.CustomGrid(general_grid) + custom_grid.config = config + + # Create fake site dataframe + site_df = pd.DataFrame(data=["real", "virtual"], columns=["source"]) + site_df["basin"] = ["basin_test", None] + + metadata = custom_grid.get_metadata(site_df) + site_metadata = metadata["site_metadata"] + config = metadata["config"] + assert site_metadata["num_sites"] == 2 + assert site_metadata["num_real_sites"] == 1 + assert site_metadata["num_virtual_sites"] == 1 + assert site_metadata["num_basin_sites"] == 1 + assert site_metadata["sites_per_basin"]["basin_test"] == 1 + assert config["uniform_spacing"] == general_grid.spacing + + +def test_per_region_spacing( + general_grid: site_gen.GeneralGrid, + central_south_island_region_polygon: shapely.Polygon, + canterbury_region_polygon: shapely.Polygon, +) -> None: + uniform_spacing = general_grid.spacing * 2 + config = site_gen.CustomGridConfig( + region=central_south_island_region_polygon, + uniform_spacing=uniform_spacing, + per_region_spacing=[ + site_gen.RegionSpacingConfig( + name="Canterbury", + region=canterbury_region_polygon, + spacing=general_grid.spacing, + ) + ], + ) + + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + site_df = custom_grid.get_site_df() + + # Get sites within Christchurch region + canterbury_region_polygon_nztm = shapely.transform( + canterbury_region_polygon, + lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1])[:, ::-1], + ) + geo_site_df = gpd.GeoDataFrame( + site_df, + geometry=gpd.points_from_xy(site_df.nztm_x, site_df.nztm_y), + crs=site_gen.NZTM_CRS, + ) + canterbury_sites = geo_site_df[ + geo_site_df.geometry.within(canterbury_region_polygon_nztm) + ] + + # Check spacing within Canterbury region + distances = sdist.squareform( + sdist.pdist(canterbury_sites[["nztm_x", "nztm_y"]].values) + ) + np.fill_diagonal(distances, np.inf) + min_distances = distances.min(axis=1) + assert np.allclose(min_distances, general_grid.spacing, rtol=0.12) + + # Check spacing outside Canterbury region + non_canterbury_sites = geo_site_df.drop(canterbury_sites.index) + distances = sdist.squareform( + sdist.pdist(non_canterbury_sites[["nztm_x", "nztm_y"]].values) + ) + np.fill_diagonal(distances, np.inf) + min_distances = distances.min(axis=1) + assert np.allclose(min_distances, uniform_spacing, rtol=0.12) + + @pytest.mark.parametrize("uniform_spacing", [5_000, 10_000]) def test_small_uniform_spacing_error( - general_grid: site_gen.GeneralGrid, uniform_spacing: float + general_grid: site_gen.GeneralGrid, uniform_spacing: int ) -> None: config = site_gen.CustomGridConfig(uniform_spacing=uniform_spacing) @@ -338,7 +503,7 @@ def test_multiple_error_uniform_spacing( general_grid: site_gen.GeneralGrid, factor: float ) -> None: config = site_gen.CustomGridConfig( - uniform_spacing=general_grid.spacing * factor, + uniform_spacing=int(general_grid.spacing * factor), ) with pytest.raises( @@ -388,7 +553,7 @@ def test_site_dataframe(general_grid: site_gen.GeneralGrid) -> None: config = site_gen.CustomGridConfig( uniform_spacing=general_grid.spacing * 2, vel_model_version="2.09", - nzgmdb_version="v4.3", + nzgmdb_version=site_gen.NZGMDBVersion.V4p3, ) custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) @@ -406,7 +571,7 @@ def test_site_dataframe_basin( region=canterbury_region_polygon, uniform_spacing=general_grid.spacing * 2, vel_model_version="2.09", - nzgmdb_version="v4.3", + nzgmdb_version=site_gen.NZGMDBVersion.V4p3, ) custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) diff --git a/workflow/site_gen.py b/workflow/site_gen.py index a20a1497..474e1e99 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -1,4 +1,5 @@ """Module for generating simulation site grids.""" + import enum import json import logging @@ -284,7 +285,9 @@ class CustomGridConfig: def __post_init__(self) -> None: """Post-initialization checks.""" - if self.basin_spacing is not None and self.vel_model_version is None: + if ( + self.basin_spacing is not None or self.per_basin_spacing is not None + ) and self.vel_model_version is None: raise ValueError( "vel_model_version must be provided if basin_spacing is set." ) @@ -375,7 +378,7 @@ class CustomGrid: default location. """ - def __init__(self, general_grid: GeneralGrid = None) -> None: + def __init__(self, general_grid: GeneralGrid | None = None) -> None: """ Initialize CustomGrid. @@ -432,30 +435,33 @@ def apply_config(self, config: CustomGridConfig) -> "CustomGrid": self._reset() self.config = config + # Land filter if self.config.land_only: self._add_land_only_filter() + # Region filter if self.config.region is not None: self._add_region_filter(self.config.region) + # Uniform spacing if self.config.uniform_spacing is not None: self._add_uniform_spacing_filter(self.config.uniform_spacing) - if ( - self.config.vel_model_version is not None - and self.config.basin_spacing is not None - ): - self._add_basin_spacing_filter( - self.config.vel_model_version, self.config.basin_spacing - ) - if self.config.per_basin_spacing is not None: - # Group by spacing - basin_spacing_series = pd.Series(self.config.per_basin_spacing) - spacing_groups = basin_spacing_series.groupby(basin_spacing_series) - - for spacing, basins in spacing_groups.groups.items(): + # Basin spacing + if self.config.vel_model_version is not None: + if self.config.basin_spacing is not None: self._add_basin_spacing_filter( - self.config.vel_model_version, - spacing, - basins.tolist(), + self.config.vel_model_version, self.config.basin_spacing ) + if self.config.per_basin_spacing is not None: + # Group by spacing + basin_spacing_series = pd.Series(self.config.per_basin_spacing) + spacing_groups = basin_spacing_series.groupby(basin_spacing_series) + + for spacing, basins in spacing_groups.groups.items(): + self._add_basin_spacing_filter( + self.config.vel_model_version, + spacing, + basins.tolist(), + ) + # Per-region spacing if self.config.per_region_spacing is not None: for region_spacing_config in self.config.per_region_spacing: self._add_region_spacing_filter(region_spacing_config) @@ -659,7 +665,7 @@ def get_metadata(self, site_df: pd.DataFrame = None) -> dict: site_df.groupby("basin").size().to_dict() ) - metadata = {"metadata": site_metadata, "config": self.config.as_dict()} + metadata = {"site_metadata": site_metadata, "config": self.config.as_dict()} return metadata def get_site_df( @@ -691,6 +697,9 @@ def get_site_df( Basin membership and Z-values are only added if vel_model_version is set in the configuration. """ + if self.config is None: + raise ValueError("CustomGridConfig must be applied before getting site df.") + if self.mask.sum() == 0: logger.warning("Custom grid has no sites selected.") return pd.DataFrame() From 5731846185828b777ccfcc097c563811951bc687 Mon Sep 17 00:00:00 2001 From: Claudio Date: Wed, 12 Nov 2025 13:12:14 +1300 Subject: [PATCH 20/21] typing fixes --- tests/test_site_gen.py | 13 ++++++++----- workflow/site_gen.py | 27 +++++++++++++++------------ 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/tests/test_site_gen.py b/tests/test_site_gen.py index bb54ab2d..9951fdff 100644 --- a/tests/test_site_gen.py +++ b/tests/test_site_gen.py @@ -9,6 +9,7 @@ import pandas as pd import pytest import shapely +from shapely import geometry import xarray as xr from hypothesis import assume, given from hypothesis import strategies as st @@ -131,7 +132,7 @@ def test_region_spacing_config_region_file( ) -> None: # Save region as geojson dict with tempfile.TemporaryDirectory() as tmpdir: - geojson_dict = shapely.geometry.mapping(chch_region_polygon) + geojson_dict = geometry.mapping(chch_region_polygon) geojson_ffp = f"{tmpdir}/chch_region.geojson" with open(geojson_ffp, "w") as f: @@ -170,7 +171,7 @@ def test_region_spacing_config_region_metadata( # Create region spacing config from metadata dict metadata_dict = { "name": "chch", - "region": shapely.geometry.mapping(chch_region_polygon), + "region": geometry.mapping(chch_region_polygon), "spacing": 1000, } @@ -194,7 +195,7 @@ def test_custom_grid_config_no_vel_model_version() -> None: def test_custom_grid_config(chch_region_polygon: shapely.Polygon) -> None: # Create the custom grid config dict with tempfile.TemporaryDirectory() as tmpdir: - geojson_dict = shapely.geometry.mapping(chch_region_polygon) + geojson_dict = geometry.mapping(chch_region_polygon) geojson_ffp = f"{tmpdir}/chch_region.geojson" with open(geojson_ffp, "w") as f: @@ -229,6 +230,7 @@ def test_custom_grid_config(chch_region_polygon: shapely.Polygon) -> None: assert custom_grid_config_1.vel_model_version == "2.09" assert custom_grid_config_1.basin_spacing == 2500 assert custom_grid_config_1.per_basin_spacing == {"Hanmer_v25p3": 1250} + assert custom_grid_config_1.per_region_spacing is not None assert len(custom_grid_config_1.per_region_spacing) == 1 per_region_config_1 = custom_grid_config_1.per_region_spacing[0] assert per_region_config_1.name == "Christchurch" @@ -254,6 +256,7 @@ def test_custom_grid_config(chch_region_polygon: shapely.Polygon) -> None: custom_grid_config_1.per_basin_spacing == custom_grid_config_2.per_basin_spacing ) + assert custom_grid_config_2.per_region_spacing is not None assert len(custom_grid_config_2.per_region_spacing) == 1 per_region_config_2 = custom_grid_config_2.per_region_spacing[0] assert per_region_config_1.name == per_region_config_2.name @@ -515,7 +518,7 @@ def test_multiple_error_uniform_spacing( @pytest.mark.parametrize("basin_spacing", [5_000, 10_000]) def test_small_basin_spacing_error( - general_grid: site_gen.GeneralGrid, basin_spacing: float + general_grid: site_gen.GeneralGrid, basin_spacing: int ) -> None: uniform_spacing = general_grid.spacing * 2 config = site_gen.CustomGridConfig( @@ -536,7 +539,7 @@ def test_multiple_error_basin_spacing( general_grid: site_gen.GeneralGrid, factor: float ) -> None: uniform_spacing = general_grid.spacing * 2 - basin_spacing = general_grid.spacing * factor + basin_spacing = int(general_grid.spacing * factor) config = site_gen.CustomGridConfig( uniform_spacing=uniform_spacing, basin_spacing=basin_spacing, diff --git a/workflow/site_gen.py b/workflow/site_gen.py index 474e1e99..6cb68f75 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -15,6 +15,7 @@ import pygmt import shapely import xarray as xr +from shapely import geometry from qcore import coordinates from velocity_modelling.constants import get_data_root @@ -111,7 +112,7 @@ def shape(self) -> tuple[int, int]: tuple[int, int] The shape of the grid. """ - return self.land_mask_grid.shape + return self.land_mask_grid.shape # type: ignore @classmethod def load(cls, land_mask_grid_ffp: Path) -> "GeneralGrid": @@ -214,7 +215,7 @@ def as_dict(self) -> dict: """ return { "name": self.name, - "region": shapely.geometry.mapping(self.region), + "region": geometry.mapping(self.region), "spacing": self.spacing, } @@ -242,11 +243,9 @@ def from_config(cls, config_dict: dict) -> "RegionSpacingConfig": if "geojson_ffp" in config_dict: with open(config_dict["geojson_ffp"], "r") as f: geojson_obj = json.load(f) - region_polygon = shapely.geometry.shape( - geojson_obj["features"][0]["geometry"] - ) + region_polygon = geometry.shape(geojson_obj["features"][0]["geometry"]) else: - region_polygon = shapely.geometry.shape(config_dict["region"]) + region_polygon = geometry.shape(config_dict["region"]) return RegionSpacingConfig( name=config_dict["name"], @@ -310,7 +309,7 @@ def from_config(cls, config_dict: dict) -> "CustomGridConfig": # Get the region polygon region = config_dict.get("region") if region is not None and isinstance(region, dict): - region = shapely.geometry.shape(region) + region = geometry.shape(region) # Get the per-region spacing per_region_spacing = None @@ -347,9 +346,7 @@ def as_dict(self) -> dict: return { "land_only": self.land_only, "region": ( - shapely.geometry.mapping(self.region) - if self.region is not None - else None + geometry.mapping(self.region) if self.region is not None else None ), "uniform_spacing": self.uniform_spacing, "vel_model_version": self.vel_model_version, @@ -458,7 +455,7 @@ def apply_config(self, config: CustomGridConfig) -> "CustomGrid": for spacing, basins in spacing_groups.groups.items(): self._add_basin_spacing_filter( self.config.vel_model_version, - spacing, + spacing, # type: ignore basins.tolist(), ) # Per-region spacing @@ -636,7 +633,7 @@ def _add_basin_spacing_filter( f"Basin spacing filter added in {time.time() - start_time} seconds." ) - def get_metadata(self, site_df: pd.DataFrame = None) -> dict: + def get_metadata(self, site_df: pd.DataFrame | None = None) -> dict: """ Gets the metadata dictionary for the custom grid. @@ -650,6 +647,11 @@ def get_metadata(self, site_df: pd.DataFrame = None) -> dict: dict Dictionary containing metadata and configuration information. """ + if self.config is None: + raise ValueError( + "CustomGridConfig must be applied before getting metadata." + ) + site_metadata = {} if site_df is not None: site_metadata = { @@ -690,6 +692,7 @@ def get_site_df( - basin: The basin the site is in (if any). - Z1.0: The Z1.0 value for the site (if vel_model_version is provided). - Z2.5: The Z2.5 value for the site (if vel_model_version is provided). + - Vs30: Time-averaged shear-wave velocity in the top 30 meters (m/s) (for real sites only). - site_code: The site code. Notes From 3e9af6df7b1fc6fdfa58af4e70a1dcd19c71b6ff Mon Sep 17 00:00:00 2001 From: Claudio Date: Wed, 12 Nov 2025 13:21:58 +1300 Subject: [PATCH 21/21] PR check fixes --- tests/test_site_gen.py | 2 +- workflow/site_gen.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_site_gen.py b/tests/test_site_gen.py index 9951fdff..f9edb1e6 100644 --- a/tests/test_site_gen.py +++ b/tests/test_site_gen.py @@ -9,11 +9,11 @@ import pandas as pd import pytest import shapely -from shapely import geometry import xarray as xr from hypothesis import assume, given from hypothesis import strategies as st from scipy.spatial import distance as sdist +from shapely import geometry from qcore import coordinates from workflow import site_gen diff --git a/workflow/site_gen.py b/workflow/site_gen.py index 6cb68f75..b0055028 100644 --- a/workflow/site_gen.py +++ b/workflow/site_gen.py @@ -871,7 +871,7 @@ def get_basin_boundaries(vel_model_version: str) -> dict[str, shapely.Polygon]: Dictionary mapping basin names to their boundary polygons. """ cvm_registry = CVMRegistry(vel_model_version, get_data_root()) - basin_data = cvm_registry.load_basin_data(cvm_registry.global_params["basins"]) + basin_data = cvm_registry.load_basin_data(cvm_registry.global_params["basins"]) # type: ignore basin_boundaries = {} for cur_basin_data in basin_data: