diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index e8a39e89..a40d385b 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -16,6 +16,9 @@ jobs: **/pyproject.toml - run: uv venv - run: uv pip install -e ".[test,dev]" + - name: Download velocity data + shell: bash -l -e {0} + run: nzcvm-data-helper ensure --no-full - name: Run tests run: uv run pytest --cov=workflow --cov-report=html tests - name: Upload coverage data diff --git a/pyproject.toml b/pyproject.toml index 6a3b3134..83a0eda3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,11 @@ dependencies = [ "tqdm", "typer", + # Plotting + "plotly", + "pygmt", + "pooch", + # Misc. "requests", # For gcmt-to-realisation "schema", # For loading realisations @@ -66,7 +71,6 @@ generate-rupture-propagation = "workflow.scripts.generate_rupture_propagation:ap copy-domain-parameters = "workflow.scripts.copy_velocity_model_parameters:app" create-e3d-par = "workflow.scripts.create_e3d_par:app" generate-stoch = "workflow.scripts.generate_stoch:app" -merge-ts = "workflow.scripts.merge_ts:app" hf-sim = "workflow.scripts.hf_sim:app" bb-sim = "workflow.scripts.bb_sim:app" im-calc = "workflow.scripts.im_calc:app" @@ -76,6 +80,7 @@ plan-workflow = "workflow.scripts.plan_workflow:app" gcmt-auto-simulate = "workflow.scripts.gcmt_auto_simulate:app" import-realisation = "workflow.scripts.import_realisation:app" lf-to-xarray = "workflow.scripts.lf_to_xarray:app" +merge-ts = "workflow.scripts.merge_ts:app" [tool.setuptools.package-dir] workflow = "workflow" diff --git a/setup.py b/setup.py deleted file mode 100644 index 9a458991..00000000 --- a/setup.py +++ /dev/null @@ -1,14 +0,0 @@ -from Cython.Build import cythonize -from setuptools import Extension, setup - -extensions = [ - Extension( - "workflow.scripts.merge_ts_loop", - ["workflow/scripts/merge_ts_loop.pyx"], - ), -] - -setup( - name="workflow", - ext_modules=cythonize(extensions), -) diff --git a/tests/test_site_gen.py b/tests/test_site_gen.py new file mode 100644 index 00000000..381b2b0a --- /dev/null +++ b/tests/test_site_gen.py @@ -0,0 +1,481 @@ +import json +import string +import tempfile + +import numpy as np +import pytest +import shapely +import xarray as xr +from hypothesis import assume, given +from hypothesis import strategies as st +from scipy.spatial import distance as sdist + +from workflow import site_gen + + +@pytest.fixture(scope="module", params=[50_000, 25_000]) +def land_mask_grid(request: pytest.FixtureRequest) -> tuple[xr.DataArray, int]: + spacing = request.param + return site_gen.gen_general_land_mask_grid(spacing), spacing + + +@pytest.fixture +def general_grid(land_mask_grid: tuple[xr.DataArray, int]) -> site_gen.GeneralGrid: + return site_gen.GeneralGrid(land_mask_grid[0], land_mask_grid[1]) + + +def test_general_grid(land_mask_grid: tuple[xr.DataArray, int]) -> None: + land_mask_grid, spacing = land_mask_grid + with tempfile.TemporaryDirectory() as tmpdir: + ffp = f"{tmpdir}/general_grid.nc" + land_mask_grid.to_netcdf(ffp) + loaded_grid = site_gen.GeneralGrid.load(ffp) + + assert loaded_grid.land_mask_grid.spacing == spacing + assert loaded_grid.shape == land_mask_grid.shape + assert np.array_equal(loaded_grid.land_mask_grid.values, land_mask_grid.values) + assert np.array_equal( + loaded_grid.land_mask_grid.lat.values, land_mask_grid.lat.values + ) + assert np.array_equal( + loaded_grid.land_mask_grid.lon.values, land_mask_grid.lon.values + ) + + +@pytest.fixture(scope="module") +def chch_region_polygon() -> shapely.Polygon: + polygon = shapely.Polygon( + [ + (172.60691125905976, -43.50338730757493), + (172.55414963918173, -43.52695730676956), + (172.64273442256302, -43.583742177757706), + (172.68415242515607, -43.5513507340181), + (172.6440052010451, -43.526126938652155), + (172.60691125905976, -43.50338730757493), + ] + ) + return polygon + + +@pytest.fixture(scope="module") +def land_region_polygon() -> shapely.Polygon: + polygon = shapely.Polygon( + [ + (170.78376767024793, -42.97481408776882), + (170.78376767024793, -43.68210554829009), + (172.66566136205932, -43.68210554829009), + (172.66566136205932, -42.97481408776882), + (170.78376767024793, -42.97481408776882), + ] + ) + return polygon + + +@pytest.fixture(scope="module") +def canterbury_region_polygon() -> shapely.Polygon: + polygon = shapely.Polygon( + [ + (172.86435627311448, -43.338682922734954), + (172.2989207589734, -43.342255579287965), + (171.53771742001976, -43.782575480660896), + (171.7830393264249, -44.18964091597699), + (173.3801704840081, -43.85958260231439), + (173.1293617392618, -43.60655002794223), + (172.86435627311448, -43.338682922734954), + ] + ) + return polygon + + +@pytest.fixture(scope="module") +def water_region_polygon() -> shapely.Polygon: + polygon = shapely.Polygon( + [ + (172.95789409381717, -39.97031515444409), + (172.95789409381717, -40.67305495760141), + (174.87451261095003, -40.67305495760141), + (174.87451261095003, -39.97031515444409), + (172.95789409381717, -39.97031515444409), + ] + ) + return polygon + + +def test_region_spacing_config_region_file( + chch_region_polygon: shapely.Polygon, +) -> None: + # Save region as geojson dict + with tempfile.TemporaryDirectory() as tmpdir: + geojson_dict = shapely.geometry.mapping(chch_region_polygon) + geojson_ffp = f"{tmpdir}/chch_region.geojson" + + with open(geojson_ffp, "w") as f: + json.dump( + { + "type": "FeatureCollection", + "features": [ + {"type": "Feature", "properties": {}, "geometry": geojson_dict} + ], + }, + f, + ) + + config_dict = {"name": "chch", "geojson_ffp": geojson_ffp, "spacing": 1000} + region_spacing_config = site_gen.RegionSpacingConfig.from_config(config_dict) + + assert region_spacing_config.name == "chch" + assert region_spacing_config.spacing == 1000 + assert shapely.equals_exact(region_spacing_config.region, chch_region_polygon) + + +def test_region_spacing_config_region_metadata( + chch_region_polygon: shapely.Polygon, +) -> None: + # Create region spacing config from metadata dict + metadata_dict = { + "name": "chch", + "region": shapely.geometry.mapping(chch_region_polygon), + "spacing": 1000, + } + + region_spacing_config = site_gen.RegionSpacingConfig.from_config(metadata_dict) + + assert region_spacing_config.name == "chch" + assert region_spacing_config.spacing == 1000 + assert shapely.equals_exact(region_spacing_config.region, chch_region_polygon) + + +def test_custom_grid_config(chch_region_polygon: shapely.Polygon) -> None: + # Create the custom grid config dict + with tempfile.TemporaryDirectory() as tmpdir: + geojson_dict = shapely.geometry.mapping(chch_region_polygon) + geojson_ffp = f"{tmpdir}/chch_region.geojson" + + with open(geojson_ffp, "w") as f: + json.dump( + { + "type": "FeatureCollection", + "features": [ + {"type": "Feature", "properties": {}, "geometry": geojson_dict} + ], + }, + f, + ) + + config_dict = { + "land_only": True, + "region": chch_region_polygon, + "uniform_spacing": 5000, + "vel_model_version": "2.09", + "basin_spacing": 2500, + "per_basin_spacing": {"Hanmer_v25p3": 1250}, + "per_region_spacing": [ + {"name": "Christchurch", "geojson_ffp": geojson_ffp, "spacing": 1250} + ], + "nzgmdb_version": "v4.3", + } + + custom_grid_config_1 = site_gen.CustomGridConfig.from_config(config_dict) + + assert custom_grid_config_1.land_only is True + assert shapely.equals_exact(custom_grid_config_1.region, chch_region_polygon) + assert custom_grid_config_1.uniform_spacing == 5000 + assert custom_grid_config_1.vel_model_version == "2.09" + assert custom_grid_config_1.basin_spacing == 2500 + assert custom_grid_config_1.per_basin_spacing == {"Hanmer_v25p3": 1250} + assert len(custom_grid_config_1.per_region_spacing) == 1 + per_region_config_1 = custom_grid_config_1.per_region_spacing[0] + assert per_region_config_1.name == "Christchurch" + assert per_region_config_1.spacing == 1250 + assert shapely.equals_exact(per_region_config_1.region, chch_region_polygon) + + config_dict = custom_grid_config_1.as_dict() + custom_grid_config_2 = site_gen.CustomGridConfig.from_config(config_dict) + + assert custom_grid_config_1.land_only == custom_grid_config_2.land_only + assert shapely.equals_exact( + custom_grid_config_1.region, custom_grid_config_2.region + ) + assert ( + custom_grid_config_1.uniform_spacing == custom_grid_config_2.uniform_spacing + ) + assert ( + custom_grid_config_1.vel_model_version + == custom_grid_config_2.vel_model_version + ) + assert custom_grid_config_1.basin_spacing == custom_grid_config_2.basin_spacing + assert ( + custom_grid_config_1.per_basin_spacing + == custom_grid_config_2.per_basin_spacing + ) + assert len(custom_grid_config_2.per_region_spacing) == 1 + per_region_config_2 = custom_grid_config_2.per_region_spacing[0] + assert per_region_config_1.name == per_region_config_2.name + assert per_region_config_1.spacing == per_region_config_2.spacing + assert shapely.equals_exact( + per_region_config_1.region, per_region_config_2.region + ) + + +def test_custom_grid_land_region_filtering( + general_grid: site_gen.GeneralGrid, land_region_polygon: shapely.Polygon +) -> None: + land_only_grid_config = site_gen.CustomGridConfig( + land_only=True, + region=land_region_polygon, + uniform_spacing=general_grid.spacing, + ) + land_only_grid = site_gen.CustomGrid(general_grid).apply_config( + land_only_grid_config + ) + + # All points in the custom grid should be land points + # and therefore all should be included + # Check via neighbour distances + site_df = land_only_grid.get_site_df() + distances = sdist.squareform(sdist.pdist(site_df[["nztm_x", "nztm_y"]].values)) + np.fill_diagonal(distances, np.inf) + min_distances = distances.min(axis=1) + assert np.allclose(min_distances, general_grid.spacing, rtol=0.05) + + region_config = site_gen.CustomGridConfig( + region=land_region_polygon, + uniform_spacing=general_grid.spacing, + ) + region_grid = site_gen.CustomGrid(general_grid).apply_config(region_config) + + assert site_df.equals(region_grid.get_site_df()) + + +def test_custom_grid_water_region_filtering( + general_grid: site_gen.GeneralGrid, water_region_polygon: shapely.Polygon +) -> None: + land_only_grid_config = site_gen.CustomGridConfig( + land_only=True, + region=water_region_polygon, + uniform_spacing=general_grid.spacing, + ) + land_only_grid = site_gen.CustomGrid(general_grid).apply_config( + land_only_grid_config + ) + + site_df = land_only_grid.get_site_df() + assert site_df.shape[0] == 0 + + +def test_uniform_too_small_spacing_error( + general_grid: site_gen.GeneralGrid, +) -> None: + custom_grid = site_gen.CustomGrid(general_grid) + spacing = general_grid.spacing - 1000 + + with pytest.raises(ValueError): + custom_grid._add_uniform_spacing_filter(spacing) + + +def test_uniform_non_multiple_spacing_error( + general_grid: site_gen.GeneralGrid, +) -> None: + custom_grid = site_gen.CustomGrid(general_grid) + spacing = general_grid.spacing + 3000 + + with pytest.raises(ValueError): + custom_grid._add_uniform_spacing_filter(spacing) + + +def test_uniform_valid_spacing( + general_grid: site_gen.GeneralGrid, +) -> None: + config = site_gen.CustomGridConfig( + uniform_spacing=general_grid.spacing * 2, + ) + + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + site_df = custom_grid.get_site_df() + + distances = sdist.squareform(sdist.pdist(site_df[["nztm_x", "nztm_y"]].values)) + np.fill_diagonal(distances, np.inf) + min_distances = distances.min(axis=1) + assert np.allclose(min_distances, general_grid.spacing * 2, rtol=0.12) + + +def test_basin_spacing( + general_grid: site_gen.GeneralGrid, canterbury_region_polygon: shapely.Polygon +) -> None: + uniform_spacing = general_grid.spacing * 2 + config = site_gen.CustomGridConfig( + region=canterbury_region_polygon, + uniform_spacing=uniform_spacing, + basin_spacing=general_grid.spacing, + vel_model_version="2.09", + ) + + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + site_df = custom_grid.get_site_df() + + distances = sdist.squareform(sdist.pdist(site_df[["nztm_x", "nztm_y"]].values)) + np.fill_diagonal(distances, np.inf) + min_distances = distances.min(axis=1) + assert np.allclose(min_distances, general_grid.spacing, rtol=0.12) + assert np.all(min_distances < uniform_spacing) + + +@pytest.mark.parametrize("uniform_spacing", [5_000, 10_000]) +def test_small_uniform_spacing_error( + general_grid: site_gen.GeneralGrid, uniform_spacing: float +) -> None: + config = site_gen.CustomGridConfig(uniform_spacing=uniform_spacing) + + with pytest.raises( + ValueError, + match="Uniform spacing must be greater than or equal to the general grid spacing", + ): + site_gen.CustomGrid(general_grid).apply_config(config) + +@pytest.mark.parametrize("factor", [1.2, 1.5]) +def test_multiple_error_uniform_spacing( + general_grid: site_gen.GeneralGrid, factor: float +) -> None: + config = site_gen.CustomGridConfig( + uniform_spacing=general_grid.spacing * factor, + ) + + with pytest.raises( + ValueError, + match="Uniform spacing must be a multiple of the general grid spacing", + ): + site_gen.CustomGrid(general_grid).apply_config(config) + +@pytest.mark.parametrize("basin_spacing", [5_000, 10_000]) +def test_small_basin_spacing_error( + general_grid: site_gen.GeneralGrid, basin_spacing: float +) -> None: + uniform_spacing = general_grid.spacing * 2 + config = site_gen.CustomGridConfig( + uniform_spacing=uniform_spacing, + basin_spacing=basin_spacing, + vel_model_version="2.09", + ) + + with pytest.raises( + ValueError, + match="Basin spacing must be greater than or equal to the general grid spacing.", + ): + site_gen.CustomGrid(general_grid).apply_config(config) + + +@pytest.mark.parametrize("factor", [1.2, 1.5]) +def test_multiple_error_basin_spacing( + general_grid: site_gen.GeneralGrid, factor: float +) -> None: + uniform_spacing = general_grid.spacing * 2 + basin_spacing = general_grid.spacing * factor + config = site_gen.CustomGridConfig( + uniform_spacing=uniform_spacing, + basin_spacing=basin_spacing, + vel_model_version="2.09", + ) + + with pytest.raises( + ValueError, match="Basin spacing must be a multiple of the general grid spacing" + ): + site_gen.CustomGrid(general_grid).apply_config(config) + +def test_site_dataframe( + general_grid: site_gen.GeneralGrid +) -> None: + config = site_gen.CustomGridConfig( + uniform_spacing=general_grid.spacing * 2, + vel_model_version="2.09", + nzgmdb_version="v4.3", + ) + + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + site_df = custom_grid.get_site_df() + + assert (~site_df.region_name.isnull()).all() + assert (~site_df.region_code.isnull()).all() + assert site_df.source.isin(["virtual", "real"]).all() + + +def test_site_dataframe_basin( + general_grid: site_gen.GeneralGrid, canterbury_region_polygon: shapely.Polygon +) -> None: + config = site_gen.CustomGridConfig( + region=canterbury_region_polygon, + uniform_spacing=general_grid.spacing * 2, + vel_model_version="2.09", + nzgmdb_version="v4.3", + ) + + custom_grid = site_gen.CustomGrid(general_grid).apply_config(config) + site_df = custom_grid.get_site_df() + + assert (site_df.basin == "Canterbury_v25p9").all() + + + + + +@given( + nums=st.lists(st.integers(min_value=0, max_value=62**4 - 1), min_size=2, max_size=500), + length=st.integers(min_value=3, max_value=5), +) +def test_encode_base62_combined_properties(nums: list[int], length: int) -> None: + """Property: encoded strings should have fixed length, be unique, and use valid base62 characters.""" + nums_array = np.array(nums) + assume(np.all(nums_array < 62**length)) # Ensure all inputs fit in the length + assume(np.unique(nums_array).size == len(nums_array)) # Ensure all inputs are unique + + result = site_gen.encode_base62_fixed_array(nums_array, length=length) + + # The number of output codes should match the number of input numbers + assert len(result) == len(nums) + + # All encoded strings should have the specified length + assert all(len(code) == length for code in result) + + # Different numbers should produce different codes (all inputs are unique) + assert len(np.unique(result)) == len(nums) + + # Encoded strings should only contain valid base62 characters + alphabet = set(string.digits + string.ascii_letters) + for code in result: + assert all(c in alphabet for c in code) + + +@given( + length=st.integers(min_value=1, max_value=6), +) +def test_encode_base62_boundary_values_property(length: int) -> None: + """Property: test boundary values (0 and max) for a given length.""" + max_val = 62**length - 1 + nums = np.array([0, max_val]) + + result = site_gen.encode_base62_fixed_array(nums, length=length) + + assert len(result) == 2 + assert all(len(code) == length for code in result) + assert result[0] == "0" * length + +@given( + too_large_num=st.integers(min_value=62**3, max_value=62**4), + length=st.just(3), +) +def test_encode_base62_too_large_error_property(too_large_num: int, length: int) -> None: + """Property: numbers too large for the specified length should raise ValueError.""" + nums = np.array([too_large_num]) + + with pytest.raises(ValueError, match="too large"): + site_gen.encode_base62_fixed_array(nums, length=length) + +@given( + num=st.integers(min_value=0, max_value=62**3 - 1), +) +def test_encode_base62_same_input_same_output_property(num: int) -> None: + """Property: encoding the same number multiple times should give the same result.""" + nums = np.array([num, num, num]) + + result = site_gen.encode_base62_fixed_array(nums, length=3) + + assert result[0] == result[1] == result[2] diff --git a/wiki/Site-Generation.md b/wiki/Site-Generation.md new file mode 100644 index 00000000..66586378 --- /dev/null +++ b/wiki/Site-Generation.md @@ -0,0 +1,174 @@ +## Virtual Simulation Sites + +### Background + +As every set of simulation has different goals and priorities, a single set of virtual stations that meets all requirements is not feasible. On the other hand we want to prevent every simulation set developing their own custom grid of virtual stations, as this would make comparison between simulation sets difficult. + +Instead a high density uniform "general" grid has been developed, and the code implemented here allows the generation of custom virtual station grid for each simulation set by sub-sampling this "general" grid as needed. This "general" grid ensures consistency across simulation sets while also allowing for customization. + +### Custom Grid Generation + +A custom grid can be generated using the `site_gen_cmds.py` script using the `gen-custom-grid` command. +For details/help on the command run `python site_gen_cmds.py gen-custom-grid --help` +There are two options to generate the custom grid via the `gen-custom-grid` either specify the spacing via the command line arguments, or pass a configuration yaml file. The configuration allows for more customization, such as specifying spacing per basin or custom defined region. + +An example configuration file might look something like this +```yaml +land_only: true +uniform_spacing: 5000 + +rel_ffp: null + +vel_model_version: "2.09" +basin_spacing: 2500 +per_basin_spacing: + Hanmer_v25p3: 1250 +per_region_spacing: + - name: "Christchurch" + geojson_ffp: "/home/claudy/dev/tmp_share/virt_sites/chch.json" + spacing: 1250 + +nzgmdb_version: v4.3 +``` +This configuration will select NZ land sites, with a default spacing of 5km, in basin spacing of 2.5km. +Additionally, 1250m spacing is applied in Hanmer basin, and the custom Christchurch region defined via the geojson file. The geojson needs the following format: +```json +{ + "type": "FeatureCollection", + "features": [ + { + "type": "Feature", + "properties": {}, + "geometry": { + "coordinates": [ + [ + [ + 172.64635288571822, + -43.46746584851593 + ], + [ + 172.5764348336035, + -43.46875527988916 + ], + [ + 172.48512720365642, + -43.510780173947616 + ], + [ + 172.48164483335898, + -43.581684944518635 + ], + [ + 172.51943709438905, + -43.605574705482184 + ], + [ + 172.60676846169588, + -43.61039335808902 + ], + [ + 172.74171895701926, + -43.575568637023046 + ], + [ + 172.811825205922, + -43.54170866114695 + ], + [ + 172.7825735380037, + -43.479415396924466 + ], + [ + 172.75289631792998, + -43.449932811038906 + ], + [ + 172.68739428988397, + -43.45477584526233 + ], + [ + 172.6620601942388, + -43.46177619980023 + ], + [ + 172.64635288571822, + -43.46746584851593 + ] + ] + ], + "type": "Polygon" + } + } + ] +} +``` + +**Note that you will need about 20Gb of free memory to generate custom grids.** + + +Running the custom grid generation will produce a parquet file with the following columns: + +| Column | Description | Notes | +|--------|-------------|-------| +| `site_id` | Unique identifier for each virtual site | `{5 character base 62 encoding of integer site id}{2 character region code}` | +| `lon` | Longitude coordinate in decimal degrees | | +| `lat` | Latitude coordinate in decimal degrees | | +| `nztm_x` | X coordinate in New Zealand Transverse Mercator (NZTM) projection | | +| `nztm_y` | Y coordinate in New Zealand Transverse Mercator (NZTM) projection | | +| `source` | Source of the site | Either `virtual` or `real` | +| `basin` | Basin name | None if not in a basin | +| `Z1.0` | Depth to 1.0 km/s shear-wave velocity (km) | | +| `Z2.5` | Depth to 2.5 km/s shear-wave velocity (km) | | +| `vs30` | Time-averaged shear-wave velocity in the top 30 meters (m/s) | | + +Region code mapping: + +| Region | Code | +|--------|------| +| North Auckland | NA | +| South Auckland | SA | +| Hawkes Bay | HB | +| Gisborne | GI | +| Taranaki | TA | +| Wellington | WE | +| Nelson | NE | +| Marlborough | MA | +| Westland | WL | +| Canterbury | CA | +| Otago | OT | +| Southland | SL | +| No Region | NR | + + +Additionally, it will also produce a metadata file that contains the setting used to generate the custom grid: + +| Setting | Type | Description | +|---------|------|-------------| +| `land_only` | bool | Whether to include only land-based sites | +| `region` | geojson | Geographic region for the grid | +| `uniform_spacing` | int | Spacing between grid points | +| `vel_model_version` | str | Version of the velocity model used | + +in addition to some additional information such as: + +| Field | Description | +|----------------|-------------| +| `num_sites` | Total number of sites in the grid | +| `num_virtual_sites` | Number of virtual sites | +| `num_real_sites` | Number of real sites | +| `num_basin_sites` | Number of sites located within basins | +| `sites_per_basin` | Number of sites per basin | + + +### Plot Generation + +Visualisation of the custom grid can be done using the 'gen-plot' command. +For details/help run `python site_gen_cmds.py gen-plot --help`. + +Running this will produce a html file that can be viewed in any browser. +Note that for large domains and high density grids this might be slow to load. + + + + + diff --git a/wiki/Stages.md b/wiki/Stages.md index af14ecaf..839c3b54 100644 --- a/wiki/Stages.md +++ b/wiki/Stages.md @@ -227,11 +227,12 @@ Merge the output timeslice files of EMOD3D. ### Outputs 1. A merged output timeslice file. ### Environment -Can be run in the cybershake container. Can also be run from your own computer using the `merge-ts` command which is installed after running `pip install workflow@git+https://github.com/ucgmsim/workflow`. +Can be run in the cybershake container. Can also be run from your own computer using the `merge-ts-hdf5` command which is installed after running `pip install workflow@git+https://github.com/ucgmsim/workflow`. ### Usage -`merge_ts XYTS_DIRECTORY XYTS_DIRECTORY/output.e3d` +`merge-ts-hdf5 XYTS_DIRECTORY XYTS_DIRECTORY/output.hdf5` ### For More Help -See the output of `merge-ts --help` or [merge_ts.py](https://github.com/ucgmsim/workflow/blob/pegasus/merge_ts/merge_ts.py). +See the output of `merge-ts-hdf5 --help` or [merge_ts.py](https://github.com/ucgmsim/workflow/blob/pegasus/merge_ts/merge_ts.py). + ## Create Simulation Video ### Description diff --git a/workflow/__init__.py b/workflow/__init__.py index 26c75bf3..74451c40 100644 --- a/workflow/__init__.py +++ b/workflow/__init__.py @@ -70,7 +70,7 @@ | EMOD3D | See below. | | High Frequency Simulation | `workflow.scripts.hf_sim` | | Broadband Simulation | `workflow.scripts.bb_sim` | -| Merge Timeslices | `merge_ts.merge_ts` | +| Merge Timeslices | `workflow.scripts.merge_ts` | | Create Simulation Video | `workflow.scripts.plot_ts` | | Intensity Measure Calculation | `workflow.scripts.im_calc` | diff --git a/workflow/scripts/merge_ts.py b/workflow/scripts/merge_ts.py index cc25d41e..aa82424e 100644 --- a/workflow/scripts/merge_ts.py +++ b/workflow/scripts/merge_ts.py @@ -26,7 +26,6 @@ See the output of `merge-ts --help`. """ -import os from pathlib import Path from typing import Annotated @@ -36,108 +35,9 @@ import xarray as xr from qcore import cli, coordinates, xyts -from workflow.scripts import merge_ts_loop app = typer.Typer() - -def merge_ts_xyts( - component_xyts_directory: Annotated[ - Path, - typer.Argument( - dir_okay=True, - file_okay=False, - exists=True, - readable=True, - ), - ], - output: Annotated[ - Path, - typer.Argument(dir_okay=False, writable=True), - ], - glob_pattern: str = "*xyts-*.e3d", -) -> None: - """Merge XYTS files. - - Parameters - ---------- - component_xyts_directory : Path - The input xyts directory containing files to merge. - output : Path - The output xyts file. - glob_pattern : str, optional - Set a custom glob pattern for merging the xyts files, by default "*xyts-*.e3d". - """ - component_xyts_files = sorted( - [ - xyts.XYTSFile( - xyts_file_path, proc_local_file=True, meta_only=True, round_dt=False - ) - for xyts_file_path in component_xyts_directory.glob(glob_pattern) - ], - key=lambda xyts_file: (xyts_file.y0, xyts_file.x0), - ) - top_left = component_xyts_files[0] - merged_ny = top_left.ny - merged_nt = top_left.nt - - xyts_proc_header_size = 72 - - xyts_file_descriptors: list[int] = [] - for xyts_file in component_xyts_files: - xyts_file_descriptor = os.open(xyts_file.xyts_path, os.O_RDONLY) - # Skip the header for each file descriptor - head_skip = os.lseek(xyts_file_descriptor, xyts_proc_header_size, os.SEEK_SET) - if head_skip != xyts_proc_header_size: - raise ValueError( - f"Failed to skip header for {xyts_file.xyts_path} at {head_skip}" - ) - xyts_file_descriptors.append(xyts_file_descriptor) - - # If output doesn't exist when we os.open it, we'll get an error. - output.touch() - merged_fd = os.open(output, os.O_WRONLY) - - xyts_header: bytes = ( - top_left.x0.tobytes() - + top_left.y0.tobytes() - + top_left.z0.tobytes() - + top_left.t0.tobytes() - + top_left.nx.tobytes() - + top_left.ny.tobytes() - + top_left.nz.tobytes() - + top_left.nt.tobytes() - + top_left.dx.tobytes() - + top_left.dy.tobytes() - + top_left.hh.tobytes() - + top_left.dt.tobytes() - + top_left.mrot.tobytes() - + top_left.mlat.tobytes() - + top_left.mlon.tobytes() - ) - - written = os.write(merged_fd, xyts_header) - if written != len(xyts_header): - raise ValueError( - f"Failed to write header for {output} at {written} bytes written" - ) - - merge_ts_loop.merge_fds( - merged_fd, - xyts_file_descriptors, - merged_nt, - merged_ny, - [f.local_nx for f in component_xyts_files], - [f.local_ny for f in component_xyts_files], - [f.y0 for f in component_xyts_files], - ) - - for xyts_file_descriptor in xyts_file_descriptors: - os.close(xyts_file_descriptor) - - os.close(merged_fd) - - @cli.from_docstring(app, name="hdf5") def merge_ts_hdf5( component_xyts_directory: Annotated[ diff --git a/workflow/scripts/merge_ts_loop.pyx b/workflow/scripts/merge_ts_loop.pyx deleted file mode 100644 index d83a57c3..00000000 --- a/workflow/scripts/merge_ts_loop.pyx +++ /dev/null @@ -1,79 +0,0 @@ -import os - -import cython - -from libc.stdlib cimport free, malloc - -cdef extern from "sys/types.h": - ctypedef long off_t -cdef extern from "sys/sendfile.h": - ssize_t sendfile(int out_fd, int in_fd, off_t * offset, size_t count) - -def merge_fds( - int merged_fd, _component_xyts_files, int merged_nt, int merged_ny, _local_nxs, _local_nys, _y0s -): - ''' Merge a list of XYTS files by copying their values in order. - - To merge the files we copy the velocity values in each component, for each - timestep, in the order they would fit in the merged domain. That is if four - files tile the domain like so: - - ┌───────────┬──────────┐ - │***********│##########│ - │!!!!!!!!!!!│++++++++++│ - │ f1 │ f2 │ - │ │ │ - ├───────────┼──────────┤ - │$$$$$$$$$$$│%%%%%%%%%%│ - │ │ │ - │ f3 │ f4 │ - │ │ │ - │ │ │ - └───────────┴──────────┘ - - Then they are concatenated in the output domain as: - - ***********##########!!!!!!!!!!!++++++++++ ... $$$$$$$$$$$%%%%%%%%%% - - It is assumed _component_xyts_files is a list of xyts files sorted by their - top left corner. - ''' - cdef int float_size, cur_timestep, cur_component, cur_y, i, y0, local_ny, local_nx, xyts_fd, n_files - cdef int *component_xyts_files, *local_nxs, *local_nys, *y0s - - n_files = len(_component_xyts_files) - - # The lists _component_xyts_files, ... are CPython lists. - # If we access these inside our copying loop everything becomes very slow - # because we need to go to the Python interpreter. So we first copy each - # list into an equivalent C list. - component_xyts_files = malloc(len(_component_xyts_files) * cython.sizeof(int)) - local_nxs = malloc(len(_local_nxs) * cython.sizeof(int)) - local_nys = malloc(len(_local_nys) * cython.sizeof(int)) - y0s = malloc(len(_y0s) * cython.sizeof(int)) - # NOTE: we cannot use memcpy() here because CPython lists are not continuous - # chunks of memory as they are in C. - for i in range(len(_component_xyts_files)): - component_xyts_files[i] = _component_xyts_files[i] - local_nxs[i] = _local_nxs[i] - local_nys[i] = _local_nys[i] - y0s[i] = _y0s[i] - for cur_timestep in range(merged_nt): - for cur_component in range(3): # for each component - for cur_y in range(merged_ny): - for i in range(n_files): - y0 = y0s[i] - local_ny = local_nys[i] - local_nx = local_nxs[i] - xyts_fd = component_xyts_files[i] - if y0 > cur_y: - break - if cur_y >= y0 + local_ny: - continue - # By passing NULL as the offset, sendfile() will read from - # the current position in xyts_fd - sendfile(merged_fd, xyts_fd, NULL, local_nx * 4) - free(component_xyts_files) - free(local_nxs) - free(local_nys) - free(y0s) diff --git a/workflow/scripts/site_gen_cmds.py b/workflow/scripts/site_gen_cmds.py new file mode 100644 index 00000000..33c047a2 --- /dev/null +++ b/workflow/scripts/site_gen_cmds.py @@ -0,0 +1,304 @@ +"""Commands for generating simulation site grids.""" +import logging +from pathlib import Path +from typing import Annotated + +import numpy as np +import pandas as pd +import plotly.graph_objects as go +import shapely +import typer +import yaml + +from qcore import cli, coordinates +from workflow import site_gen +from workflow.realisations import DomainParameters, SourceConfig + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(funcName)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler()], + force=True, +) +logger = logging.getLogger(__name__) + +app = typer.Typer() + + +@cli.from_docstring(app) +def gen_general_grid(grid_spacing: int, output_ffp: Path) -> None: + """ + Generate the general grid. + + Parameters + ---------- + grid_spacing : int + The grid spacing in metres. + output_ffp : Path + The path to save the output netCDF file. + """ + land_mask_grid = site_gen.gen_general_land_mask_grid( + grid_spacing, + ) + + logger.info(f"Saving to {output_ffp}...") + land_mask_grid.to_netcdf(output_ffp) + + +@cli.from_docstring(app) +def gen_custom_grid( + output_ffp: Annotated[Path, typer.Argument()], + config_ffp: Annotated[Path | None, typer.Option()] = None, + uniform_grid_spacing: Annotated[int | None, typer.Option()] = None, + basin_spacing: Annotated[int | None, typer.Option()] = None, + vel_model_version: Annotated[str | None, typer.Option()] = None, + nzgmdb_version: Annotated[site_gen.NZGMDBVersion | None, typer.Option()] = None, + rel_ffp: Annotated[Path | None, typer.Option()] = None, +) -> None: + """ + Generate a NZ-wide custom grid. + + Parameters + ---------- + output_ffp : Path + The path to save the output parquet file. + config_ffp : Path, optional + The path to the custom grid config file. + If provided, all other config options will be ignored. + uniform_grid_spacing : int, optional + The uniform grid spacing in metres. + This must be a multiple of the general grid spacing. + Ignored if a config file is provided. + basin_spacing : int, optional + The grid spacing in metres to use within basins. + This must be a multiple of the general grid spacing. + If not provided, no basin spacing will be applied. + Ignored if a config file is provided. + vel_model_version : str, optional + The velocity model version to use for basin spacing. + This must be provided if basin_spacing is specified. + If basin_spacing is not provided, and this is provided, + then this velocity model version will be used to set + basin membership in the output site dataframe. + nzgmdb_version : NZGMDBVersion, optional + The NZGMDB version to use for site parameters. + Ignored if a config file is provided. + rel_ffp : Path, optional + The path to the realisation config. + If provided, the domain will be used to filter the grid. + Ignored if a config file is provided. + """ + if config_ffp is not None: + logger.info(f"Reading custom grid config from {config_ffp}...") + config_dict = yaml.safe_load(config_ffp.open("r")) + grid_config = site_gen.CustomGridConfig.from_config(config_dict) + rel_ffp = config_dict.get("rel_ffp") + else: + logger.info("Generating custom grid config from command line options...") + grid_config = site_gen.CustomGridConfig( + land_only=True, + uniform_spacing=uniform_grid_spacing, + vel_model_version=vel_model_version, + basin_spacing=basin_spacing, + nzgmdb_version=nzgmdb_version, + ) + + # If a realisation file is provided, use its domain to filter the grid + if rel_ffp is not None: + domain_config = DomainParameters.read_from_realisation(rel_ffp) + region = shapely.Polygon(domain_config.domain.corners[:, ::-1]) + grid_config.region = region + + custom_grid = site_gen.CustomGrid().apply_config(grid_config) + + # Generate site dataframe and save + site_df = custom_grid.get_site_df() + site_df.to_parquet(output_ffp) + + # Save metadata + metadata = custom_grid.get_metadata(site_df) + with (output_ffp.parent / f"{output_ffp.stem}_metadata.yaml").open("w") as meta_ffp: + yaml.dump(metadata, meta_ffp) + + +@cli.from_docstring(app) +def gen_plot( + site_df_ffp: Annotated[Path, typer.Argument()], + output_ffp: Annotated[Path, typer.Argument()], + rel_ffp: Annotated[Path | None, typer.Option()] = None, +) -> None: + """ + Generate a plot of the custom grid. + + Parameters + ---------- + site_df_ffp : Path + The path to the custom grid site dataframe file (parquet). + output_ffp : Path + The path to save the output HTML file. + rel_ffp : Path, optional + The path to the realisation config. + If provided, the domain and source will be plotted. + """ + site_df = pd.read_parquet(site_df_ffp) + + site_df_meta_ffp = site_df_ffp.parent / f"{site_df_ffp.stem}_metadata.yaml" + with site_df_meta_ffp.open("r") as f: + config = site_gen.CustomGridConfig.from_config( + yaml.load(f, Loader=yaml.FullLoader)["config"] + ) + + fig = go.Figure() + + virt_sites_mask = site_df.loc[:, "source"] == "virtual" + fig.add_trace( + go.Scattermap( + lon=site_df.loc[virt_sites_mask, "lon"], + lat=site_df.loc[virt_sites_mask, "lat"], + mode="markers", + marker=dict(color="blue", size=4, symbol="circle"), + hoverinfo="skip", + hovertemplate=( + "Site ID: %{customdata[0]}
" + "Lat: %{lat:.6f}
" + "Lon: %{lon:.6f}
" + "Basin: %{customdata[4]}
" + "Z1.0: %{customdata[1]:.3f} km
" + "Z2.5: %{customdata[2]:.3f} km
" + "Vs30: %{customdata[3]:.1f} m/s
" + "" + ), + customdata=np.asarray( + [ + site_df.loc[virt_sites_mask].index.values.astype(str), + site_df.loc[virt_sites_mask, "Z1.0"].values, + site_df.loc[virt_sites_mask, "Z2.5"].values, + site_df.loc[virt_sites_mask, "Vs30"].values, + site_df.loc[virt_sites_mask, "basin"].values, + ] + ).T, + ) + ) + + if "real" in site_df.source.unique(): + real_sites_mask = site_df.loc[:, "source"] == "real" + fig.add_trace( + go.Scattermap( + lon=site_df.loc[real_sites_mask, "lon"], + lat=site_df.loc[real_sites_mask, "lat"], + mode="markers", + marker=dict(color="darkgreen", size=6, symbol="circle"), + hovertemplate=( + "Site ID: %{customdata[0]}
" + "Lat: %{lat:.6f}
" + "Lon: %{lon:.6f}
" + "Basin: %{customdata[4]}
" + "Z1.0: %{customdata[1]:.3f} km
" + "Z2.5: %{customdata[2]:.3f} km
" + "Vs30: %{customdata[3]:.1f} m/s
" + "" + ), + customdata=np.asarray( + [ + site_df.loc[real_sites_mask].index.values.astype(str), + site_df.loc[real_sites_mask, "Z1.0"].values, + site_df.loc[real_sites_mask, "Z2.5"].values, + site_df.loc[real_sites_mask, "Vs30"].values, + site_df.loc[real_sites_mask, "basin"].values, + ] + ).T, + ) + ) + + if config.vel_model_version is not None: + # Plot the basin boundaries + basin_boundaries = site_gen.get_basin_boundaries(config.vel_model_version) + basin_line_properties = dict(color="red", width=1) + basin_fill_color = "rgba(255,0,0,0.05)" + + for cur_basin_boundary in basin_boundaries.values(): + if isinstance(cur_basin_boundary, shapely.MultiPolygon): + for poly in cur_basin_boundary.geoms: + fig.add_trace( + go.Scattermap( + lon=np.array(poly.exterior.xy[0]), + lat=np.array(poly.exterior.xy[1]), + mode="lines", + fill="toself", + fillcolor=basin_fill_color, + line=basin_line_properties, + hoverinfo="skip", + ) + ) + else: + fig.add_trace( + go.Scattermap( + lon=np.array(cur_basin_boundary.exterior.xy[0]), + lat=np.array(cur_basin_boundary.exterior.xy[1]), + mode="lines", + fill="toself", + fillcolor=basin_fill_color, + line=basin_line_properties, + hoverinfo="skip", + ) + ) + + if config.per_region_spacing is not None: + # Plot the regions with different spacing + for region_spacing in config.per_region_spacing: + fig.add_trace( + go.Scattermap( + lon=np.array(region_spacing.region.exterior.xy[0]), + lat=np.array(region_spacing.region.exterior.xy[1]), + mode="lines", + line=dict(color="orange", width=1), + hoverinfo="skip", + ) + ) + + if rel_ffp is not None: + # Plot the domain + domain_corners = DomainParameters.read_from_realisation(rel_ffp).domain.corners + domain_corners = shapely.Polygon(domain_corners[:, ::-1]) + fig.add_trace( + go.Scattermap( + lon=np.array(domain_corners.exterior.xy[0]), + lat=np.array(domain_corners.exterior.xy[1]), + mode="lines", + line=dict(color="black"), + hoverinfo="skip", + ) + ) + + # Plot the source + source_config = SourceConfig.read_from_realisation(rel_ffp) + for _, fault in source_config.source_geometries.items(): + for cur_plane in fault.planes: + cur_polygon = shapely.transform( + cur_plane.geometry, + lambda x: coordinates.nztm_to_wgs_depth(x)[:, ::-1], + ) + fig.add_trace( + go.Scattermap( + lon=np.array(cur_polygon.exterior.xy[0]), + lat=np.array(cur_polygon.exterior.xy[1]), + mode="lines", + line=dict(color="black"), + hoverinfo="skip", + ) + ) + + fig.update_layout( + map=dict(zoom=6, center=dict(lat=site_df.lat.mean(), lon=site_df.lon.mean())), + showlegend=False, + title=( + f"Sites Grid - Velocity Model {config.vel_model_version} - " + f"Total Sites: {len(site_df)}" + ), + ) + fig.write_html(output_ffp) + + +if __name__ == "__main__": + app() diff --git a/workflow/site_gen.py b/workflow/site_gen.py new file mode 100644 index 00000000..a2d37b29 --- /dev/null +++ b/workflow/site_gen.py @@ -0,0 +1,892 @@ +"""Module for generating simulation site grids.""" +import enum +import json +import logging +import string +import time +from dataclasses import dataclass +from pathlib import Path + +import geopandas as gpd +import numpy as np +import pandas as pd +import pooch +import pygmt +import shapely +import xarray as xr + +from qcore import coordinates +from velocity_modelling.constants import get_data_root +from velocity_modelling.registry import CVMRegistry +from velocity_modelling.threshold import compute_station_thresholds + +logger = logging.getLogger(__name__) + +NZTM_CRS = "EPSG:2193" +WGS84_CRS = "EPSG:4326" + + +class NZGMDBVersion(enum.StrEnum): + """NZGMDB Version Enum.""" + + V4p3 = "v4.3" + + +NZGMDB_VERSION_TO_TABLE_NAME = { + NZGMDBVersion.V4p3: "nzgmdb_4p3_site_table.csv", +} + +GRID_DATA = pooch.create( + path=pooch.os_cache("virt_sites"), + base_url="", + registry={ + "general_grid.nc": "sha256:06d675c60d90545f29573ab505db17e7662f7958270093d4a8808d6fc379a984", + "nz_coastline.parquet": "sha256:9a965326690c560d372720b4b64b57480ede10e479b2fe62ba75760707bc521c", + "nzgmdb_4p3_site_table.csv": "sha256:0144c6e066a461b63441e9c1a1608741c5f056fd3e2926f2cb16748e24296317", + "nz_regions.parquet": "sha256:c08b250332052126d4e68ef0894d72416791c4184b5bfc23da93adf39582f2f2", + "nz_territory.parquet": "sha256:afe5a70572d6d42cbcbe4b26291983b03619bd0b567eb6f6cafa20306d03b0f7", + }, + urls={ + "general_grid.nc": "https://www.dropbox.com/scl/fi/4x987t01kvbz8bbyeid81/general_grid.nc?rlkey=rpztofwihh839500jn2bkw255&st=rpazzciz&dl=1", + "nz_coastline.parquet": "https://www.dropbox.com/scl/fi/nx7hm82ern7s2b4v7u6zq/nz_coastline.parquet?rlkey=xzvh7gumefaigzhbgyjtuxu2p&st=e4gwa9a9&dl=1", + "nzgmdb_4p3_site_table.csv": "https://www.dropbox.com/scl/fi/m3l559m2nmjgufipyq03s/nzgmdb_4p3_site_table.csv?rlkey=gg1hn2mm38zxew9fn45ac8p0v&st=iq0kagtv&dl=1", + "nz_regions.parquet": "https://www.dropbox.com/scl/fi/gvvjf16vpe69zqbglh8ns/nz_regions.parquet?rlkey=14y2exb6wr48h7d0pj7dcvxwa&st=t23pxjsv&dl=1", + "nz_territory.parquet": "https://www.dropbox.com/scl/fi/3ywi83r8pucp5ajs26do7/nz_territory.parquet?rlkey=viemibogajrtomm0n9xypwq1v&st=ph5pkg2z&dl=1", + }, +) + + +class GeneralGrid: + """ + Represents the general base grid of virtual sites. + Any custom grid is built off this grid. + + Parameters + ---------- + land_mask_grid : xr.DataArray + The land mask grid as an xarray DataArray. + spacing : int + The grid spacing in metres. + """ + + def __init__(self, land_mask_grid: xr.DataArray, spacing: int) -> None: + """Initialize GeneralGrid.""" + self.land_mask_grid = land_mask_grid + self.site_ids = np.arange(land_mask_grid.size, dtype=np.uint32).reshape( + land_mask_grid.shape + ) + self.spacing = spacing + + self.grid_lat, self.grid_lon = xr.broadcast( + self.land_mask_grid.lat, self.land_mask_grid.lon + ) + self.grid_lat, self.grid_lon = ( + self.grid_lat.values, + self.grid_lon.values, + ) + + grid_nztm = coordinates.wgs_depth_to_nztm( + np.vstack((self.grid_lat.ravel(), self.grid_lon.ravel())).T + ) + self.grid_nztm_x = grid_nztm[:, 1].reshape(self.land_mask_grid.shape) + self.grid_nztm_y = grid_nztm[:, 0].reshape(self.land_mask_grid.shape) + + @property + def shape(self) -> tuple[int, int]: + """ + Shape of the grid. + + Returns + ------- + tuple[int, int] + The shape of the grid. + """ + return self.land_mask_grid.shape + + @classmethod + def load(cls, land_mask_grid_ffp: Path) -> "GeneralGrid": + """ + Load GeneralGrid from file. + + Parameters + ---------- + land_mask_grid_ffp : Path + The file path to the general grid file. + + Returns + ------- + GeneralGrid + The loaded GeneralGrid instance. + """ + logger.info(f"Loading general grid from {land_mask_grid_ffp}...") + da = xr.load_dataarray(land_mask_grid_ffp, engine="h5netcdf") + return cls(da, int(da.attrs["spacing"])) + + +def gen_general_land_mask_grid(spacing: int) -> xr.DataArray: + """ + Generates the general land mask grid with given spacing. + + Parameters + ---------- + spacing : int + Grid spacing in metres + + Returns + ------- + xr.DataArray + The land mask grid DataArray, with values of 1 for land + and 0 for ocean. + + Notes + ----- + Common grid spacing formats: + - To specify grid spacing of `x` units: `"{x}{unit}/{x}{unit}"`, + where `unit` can be metres (`e`) or kilometres (`k`). + """ + land_df = gpd.read_parquet(GRID_DATA.fetch("nz_coastline.parquet")) + # Combine into a single polygon + land_polygon = shapely.coverage_union_all(land_df.geometry) + land_polygon = shapely.transform( + land_polygon, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]) + ) + + # Generate grid + logger.info("Generating grid...") + land_mask_grid = pygmt.grdlandmask( + region="NZ", spacing=f"{spacing}e/{spacing}e" + ).astype(bool) + land_mask_grid[:] = False + land_mask_grid.attrs = {"spacing": spacing} + + # Use float32 for coords + land_mask_grid = land_mask_grid.assign_coords( + lat=land_mask_grid.lat.astype(np.float32), + lon=land_mask_grid.lon.astype(np.float32), + ) + + grid_lat, grid_lon = xr.broadcast(land_mask_grid.lat, land_mask_grid.lon) + grid_nztm = coordinates.wgs_depth_to_nztm( + np.vstack((grid_lat.values.ravel(), grid_lon.values.ravel())).T + ) + + # Apply land masking + logger.info("Applying land mask...") + mask = shapely.contains_xy(land_polygon, grid_nztm[:, 0], grid_nztm[:, 1]).reshape( + land_mask_grid.shape + ) + land_mask_grid.values[mask] = 1 + + return land_mask_grid + + +@dataclass +class RegionSpacingConfig: + """Configuration for region specific spacing.""" + + name: str + """Name of the region.""" + + region: shapely.Polygon + """Region polygon in WGS84 coordinates (lon/lat).""" + + spacing: int + """Grid spacing in metres within the region.""" + + def as_dict(self) -> dict: + """ + Convert RegionSpacingConfig to dictionary. + + Returns + ------- + dict + Dictionary representation of the configuration. + """ + return { + "name": self.name, + "region": shapely.geometry.mapping(self.region), + "spacing": self.spacing, + } + + @classmethod + def from_config(cls, config_dict: dict) -> "RegionSpacingConfig": + """ + Create RegionSpacingConfig from configuration dictionary. + + Parameters + ---------- + config_dict : dict + Configuration dictionary with keys 'name', 'spacing', + and either 'region' or 'geojson_ffp'. + + Returns + ------- + RegionSpacingConfig + The created RegionSpacingConfig instance. + """ + if "geojson_ffp" not in config_dict and "region" not in config_dict: + raise ValueError( + "Either 'geojson_ffp' or 'region' must be provided in the config dictionary." + ) + + if "geojson_ffp" in config_dict: + with open(config_dict["geojson_ffp"], "r") as f: + geojson_obj = json.load(f) + region_polygon = shapely.geometry.shape( + geojson_obj["features"][0]["geometry"] + ) + else: + region_polygon = shapely.geometry.shape(config_dict["region"]) + + return RegionSpacingConfig( + name=config_dict["name"], + region=region_polygon, + spacing=config_dict["spacing"], + ) + + +@dataclass +class CustomGridConfig: + """Configuration for CustomGrid.""" + + land_only: bool | None = None + """Whether to only include land sites.""" + + region: shapely.Polygon | None = None + """Region polygon in WGS84 coordinates (lon/lat).""" + + uniform_spacing: int | None = None + """Uniform grid spacing in metres.""" + + vel_model_version: str | None = None + """Velocity model version for basin spacing.""" + + basin_spacing: int | None = None + """Grid spacing in metres within basins.""" + + per_basin_spacing: dict[str, int] | None = None + """Per-basin specific spacing in metres.""" + + per_region_spacing: list[RegionSpacingConfig] | None = None + """Per-region specific spacing in metres.""" + + nzgmdb_version: NZGMDBVersion | None = None + """NZGMDB version for real stations.""" + + def __post_init__(self) -> None: + """Post-initialization checks.""" + if self.basin_spacing is not None and self.vel_model_version is None: + raise ValueError( + "vel_model_version must be provided if basin_spacing is set." + ) + + @classmethod + def from_config(cls, config_dict: dict) -> "CustomGridConfig": + """ + Create CustomGridConfig from dictionary. + + Parameters + ---------- + config_dict : dict + Configuration dictionary containing CustomGridConfig settings. + + Returns + ------- + CustomGridConfig + The created CustomGridConfig instance. + """ + # Get the region polygon + region = config_dict.get("region") + if region is not None and isinstance(region, dict): + region = shapely.geometry.shape(region) + + # Get the per-region spacing + per_region_spacing = None + if (prs_configs := config_dict.get("per_region_spacing")) is not None: + per_region_spacing = [ + RegionSpacingConfig.from_config(config) for config in prs_configs + ] + + # Get the NZGMDB version + nzgmdb_version = config_dict.get("nzgmdb_version") + if nzgmdb_version is not None: + nzgmdb_version = NZGMDBVersion(nzgmdb_version) + + return cls( + land_only=config_dict.get("land_only"), + region=region, + uniform_spacing=config_dict.get("uniform_spacing"), + vel_model_version=config_dict.get("vel_model_version"), + basin_spacing=config_dict.get("basin_spacing"), + per_basin_spacing=config_dict.get("per_basin_spacing"), + per_region_spacing=per_region_spacing, + nzgmdb_version=nzgmdb_version, + ) + + def as_dict(self) -> dict: + """ + Convert CustomGridConfig to dictionary. + + Returns + ------- + dict + Dictionary representation of the configuration. + """ + return { + "land_only": self.land_only, + "region": ( + shapely.geometry.mapping(self.region) + if self.region is not None + else None + ), + "uniform_spacing": self.uniform_spacing, + "vel_model_version": self.vel_model_version, + "basin_spacing": self.basin_spacing, + "per_basin_spacing": self.per_basin_spacing, + "per_region_spacing": ( + [prs.as_dict() for prs in self.per_region_spacing] + if self.per_region_spacing is not None + else None + ), + "nzgmdb_version": ( + self.nzgmdb_version.value if self.nzgmdb_version is not None else None + ), + } + + +class CustomGrid: + """ + Represents a custom grid of virtual sites + built from the general grid. + + Parameters + ---------- + general_grid : GeneralGrid, optional + The general grid to use. If None, it will be loaded from the + default location. + """ + + def __init__(self, general_grid: GeneralGrid = None) -> None: + """Initialize CustomGrid.""" + if general_grid is None: + logger.info("Loading general grid...") + self.general_grid = GeneralGrid.load(GRID_DATA.fetch("general_grid.nc")) + else: + self.general_grid = general_grid + + self._reset() + + def _reset(self) -> None: + """ + Resets the custom grid. + """ + self._and_mask = np.ones(self.general_grid.shape, dtype=bool) + self._or_mask = np.zeros(self.general_grid.shape, dtype=bool) + self.config = None + + @property + def mask(self) -> np.ndarray: + """ + Site mask that gives the custom grid + when applied to the the general grid. + + Returns + ------- + np.ndarray + The custom grid mask. + """ + return self._and_mask & self._or_mask + + def apply_config(self, config: CustomGridConfig) -> "CustomGrid": + """ + Applies a CustomGridConfig to the CustomGrid. + Resets any previously applied filters. + + Parameters + ---------- + config : CustomGridConfig + The configuration to apply. + + Returns + ------- + CustomGrid + Returns self for method chaining. + """ + self._reset() + self.config = config + + if self.config.land_only: + self._add_land_only_filter() + if self.config.region is not None: + self._add_region_filter(self.config.region) + if self.config.uniform_spacing is not None: + self._add_uniform_spacing_filter(self.config.uniform_spacing) + if ( + self.config.vel_model_version is not None + and self.config.basin_spacing is not None + ): + self._add_basin_spacing_filter( + self.config.vel_model_version, self.config.basin_spacing + ) + if self.config.per_basin_spacing is not None: + # Group by spacing + basin_spacing_series = pd.Series(self.config.per_basin_spacing) + spacing_groups = basin_spacing_series.groupby(basin_spacing_series) + + for spacing, basins in spacing_groups.groups.items(): + self._add_basin_spacing_filter( + self.config.vel_model_version, + spacing, + basins.tolist(), + ) + if self.config.per_region_spacing is not None: + for region_spacing_config in self.config.per_region_spacing: + self._add_region_spacing_filter(region_spacing_config) + + return self + + def _add_region_filter(self, region: shapely.Polygon) -> None: + """ + Adds a region filter. Only sites within the region are kept. + This is an AND filter, i.e., it removes sites outside the region. + Can only be applied once. + + Parameters + ---------- + region : shapely.Polygon + The region polygon in WGS84 coordinates (lon/lat). + """ + logger.info("Adding region filter to custom grid...") + start_time = time.time() + + # Convert to NZTM + region_nztm = shapely.transform( + region, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]) + ) + + region_mask = shapely.contains_xy( + region_nztm, + self.general_grid.grid_nztm_y.ravel(), + self.general_grid.grid_nztm_x.ravel(), + ).reshape(self.general_grid.shape) + self._and_mask &= region_mask + logger.info(f"Region filter added in {time.time() - start_time} seconds.") + + def _add_uniform_spacing_filter(self, spacing: int) -> None: + """ + Adds a uniform spacing filter. + This is an OR filter, i.e., it only adds sites. + Can only be applied once. + + Parameters + ---------- + spacing : int + The uniform grid spacing in metres. + """ + logger.info("Adding uniform spacing filter...") + start_time = time.time() + if spacing < self.general_grid.spacing: + error_msg = ( + "Uniform spacing must be greater than or" + " equal to the general grid spacing." + ) + logger.error(error_msg) + raise ValueError(error_msg) + if spacing % self.general_grid.spacing != 0: + error_msg = ( + f"Uniform spacing must be a multiple of" + f" the general grid spacing {self.general_grid.spacing}." + ) + logger.error(error_msg) + raise ValueError(error_msg) + + idx_interval = spacing // self.general_grid.spacing + spacing_mask = np.zeros(self.general_grid.shape, dtype=bool) + spacing_mask[::idx_interval, ::idx_interval] = True + self._or_mask |= spacing_mask + + logger.info( + f"Uniform spacing filter added in {time.time() - start_time} seconds." + ) + + def _add_land_only_filter(self) -> None: + """Adds a land only filter, only land sites are kept.""" + logger.info("Adding land only filter...") + start_time = time.time() + self._and_mask &= self.general_grid.land_mask_grid.values.astype(bool) + logger.info(f"Land only filter added in {time.time() - start_time} seconds.") + + def _add_region_spacing_filter( + self, region_spacing_config: RegionSpacingConfig + ) -> None: + """ + Adds a region specific spacing filter. + Note that this is an OR filter, i.e., it only adds sites. + + Parameters + ---------- + region_spacing_config : RegionSpacingConfig + The region spacing configuration. + """ + logger.info( + f"Adding region {region_spacing_config.spacing} spacing filter for {region_spacing_config.name}..." + ) + start_time = time.time() + if region_spacing_config.spacing < self.general_grid.spacing: + raise ValueError( + "Region spacing must be greater than or equal to the general grid spacing." + ) + if region_spacing_config.spacing % self.general_grid.spacing != 0: + raise ValueError( + f"Region spacing must be a multiple of the general grid spacing {self.general_grid.spacing}." + ) + + region_nztm = shapely.transform( + region_spacing_config.region, + lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]), + ) + in_region_mask = shapely.contains_xy( + region_nztm, + self.general_grid.grid_nztm_y.ravel(), + self.general_grid.grid_nztm_x.ravel(), + ).reshape(self.general_grid.shape) + + idx_interval = region_spacing_config.spacing // self.general_grid.spacing + spacing_mask = np.zeros(self.general_grid.shape, dtype=bool) + spacing_mask[::idx_interval, ::idx_interval] = True + + self._or_mask |= in_region_mask & spacing_mask + logger.info( + f"Region spacing filter added in {time.time() - start_time} seconds." + ) + + def _add_basin_spacing_filter( + self, vel_model_version: str, spacing: int, basins: list[str] | None = None + ) -> None: + """ + Adds a basin spacing filter. + Note that this is an OR filter, i.e., it only adds sites. + + Parameters + ---------- + vel_model_version : str + The velocity model version to use for basin spacing. + spacing : int + The grid spacing in metres to use within basins. + basins : list[str] | None, optional + The specific basins to apply the spacing to. If None, applies to all basins. + """ + logger.info( + f"Adding basin {spacing} spacing filter for {basins if basins is not None else 'all'} basins..." + ) + start_time = time.time() + if spacing < self.general_grid.spacing: + raise ValueError( + "Basin spacing must be greater than or equal to the general grid spacing." + ) + if spacing % self.general_grid.spacing != 0: + raise ValueError( + f"Basin spacing must be a multiple of the general grid spacing {self.general_grid.spacing}." + ) + + basin_boundaries = get_basin_boundaries(vel_model_version) + if basins is not None: + basin_boundaries = { + k: v for k, v in basin_boundaries.items() if k in basins + } + comb_basin_boundaries = shapely.union_all(list(basin_boundaries.values())) + comb_basin_boundaries = shapely.transform( + comb_basin_boundaries, lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]) + ) + in_basin_mask = shapely.contains_xy( + comb_basin_boundaries, + self.general_grid.grid_nztm_y.ravel(), + self.general_grid.grid_nztm_x.ravel(), + ).reshape(self.general_grid.shape) + + idx_interval = spacing // self.general_grid.spacing + spacing_mask = np.zeros(self.general_grid.shape, dtype=bool) + spacing_mask[::idx_interval, ::idx_interval] = True + + self._or_mask |= in_basin_mask & spacing_mask + logger.info( + f"Basin spacing filter added in {time.time() - start_time} seconds." + ) + + def get_metadata(self, site_df: pd.DataFrame = None) -> dict: + """ + Gets the metadata dictionary for the custom grid. + + Parameters + ---------- + site_df : pd.DataFrame, optional + The site dataframe to generate metadata from. + + Returns + ------- + dict + Dictionary containing metadata and configuration information. + """ + site_metadata = {} + if site_df is not None: + site_metadata = { + "num_sites": len(site_df), + "num_virtual_sites": len(site_df[site_df.source == "virtual"]), + "num_real_sites": len(site_df[site_df.source == "real"]), + } + + # Number of sites per basin + if "basin" in site_df.columns: + site_metadata["num_basin_sites"] = len(site_df[site_df.basin.notna()]) + site_metadata["sites_per_basin"] = ( + site_df.groupby("basin").size().to_dict() + ) + + metadata = {"metadata": site_metadata, "config": self.config.as_dict()} + return metadata + + def get_site_df( + self, + ) -> pd.DataFrame: + """ + Gets the site dataframe for the custom grid. + + Returns + ------- + pd.DataFrame + The site dataframe with columns: + - general_site_id: The site ID from the general grid. + - lon: The site longitude. + - lat: The site latitude. + - nztm_x: The site NZTM X coordinate. + - nztm_y: The site NZTM Y coordinate. + - source: "virtual" for virtual sites, "real" for NZGMDB sites. + - region_name: The name of the region the site is in. + - region_code: The code of the region the site is in. + - territory_name: The name of the territory the site is in. + - basin: The basin the site is in (if any). + - Z1.0: The Z1.0 value for the site (if vel_model_version is provided). + - Z2.5: The Z2.5 value for the site (if vel_model_version is provided). + - site_code: The site code. + + Notes + ----- + Basin membership and Z-values are only added if vel_model_version + is set in the configuration. + """ + if self.mask.sum() == 0: + logger.warning("Custom grid has no sites selected.") + return pd.DataFrame() + + logger.info("Getting site dataframe...") + site_df = pd.DataFrame( + { + "general_site_id": self.general_grid.site_ids[self.mask], + "lon": self.general_grid.grid_lon[self.mask], + "lat": self.general_grid.grid_lat[self.mask], + "nztm_x": self.general_grid.grid_nztm_x[self.mask], + "nztm_y": self.general_grid.grid_nztm_y[self.mask], + "source": "virtual", + } + ).set_index("general_site_id") + + # Add NZGMDB sites + if self.config.nzgmdb_version is not None: + start = time.time() + logger.info("Adding NZGMDB sites...") + nzgmdb_site_df = pd.read_csv( + GRID_DATA.fetch( + NZGMDB_VERSION_TO_TABLE_NAME[self.config.nzgmdb_version] + ), + index_col="sta", + usecols=["sta", "lat", "lon", "Vs30", "Z1.0", "Z2.5"], + ) + nzgmdb_site_df["source"] = "real" + nzgmdb_nztm_values = coordinates.wgs_depth_to_nztm( + nzgmdb_site_df[["lat", "lon"]].values + ) + nzgmdb_site_df["nztm_x"] = nzgmdb_nztm_values[:, 1] + nzgmdb_site_df["nztm_y"] = nzgmdb_nztm_values[:, 0] + # Convert Z1.0 to km + nzgmdb_site_df["Z1.0"] /= 1000 + + if self.config.region is not None: + region_nztm = shapely.transform( + self.config.region, + lambda x: coordinates.wgs_depth_to_nztm(x[:, ::-1]), + ) + region_mask = shapely.contains_xy( + region_nztm, + nzgmdb_site_df["nztm_y"].values, + nzgmdb_site_df["nztm_x"].values, + ) + nzgmdb_site_df = nzgmdb_site_df[region_mask] + + site_df = pd.concat([site_df, nzgmdb_site_df], axis=0) + logger.info(f"Took: {time.time() - start} to add NZGMDB sites") + + # Add region + logger.info("Adding region information...") + start = time.time() + region_df = ( + gpd.read_parquet(GRID_DATA.fetch("nz_regions.parquet")) + .to_crs(NZTM_CRS) + .astype({"name": "category", "code": "category"}) + ) + + site_points_df = gpd.GeoDataFrame( + site_df[["nztm_x", "nztm_y"]], + geometry=gpd.points_from_xy(site_df.nztm_x, site_df.nztm_y), + crs=NZTM_CRS, + ) + joined = gpd.sjoin(site_points_df, region_df, how="left", predicate="within") + + # Deal with sites that are not in a region + joined["name"] = joined["name"].cat.add_categories("No Region") + joined["name"] = joined["name"].fillna("No Region") + joined["code"] = joined["code"].cat.add_categories("NR") + joined["code"] = joined["code"].fillna("NR") + + # Keep first region if in multiple regions + joined = joined.groupby(level=0).first() + site_df["region_name"] = joined["name"] + site_df["region_code"] = joined["code"] + logger.info(f"Took: {time.time() - start} ") + + # Add territory + logger.info("Adding territory information...") + start = time.time() + territory_df = ( + gpd.read_parquet(GRID_DATA.fetch("nz_territory.parquet")) + .to_crs(NZTM_CRS) + .astype({"name": "category"}) + ) + joined = gpd.sjoin( + site_points_df, + territory_df, + how="left", + predicate="within", + ) + site_df["territory_name"] = joined["name"] + # Keep first territory if in multiple territories + site_df["territory_name"] = site_df["territory_name"].groupby(level=0).first() + logger.info(f"Took: {time.time() - start} ") + + # Add basin membership & Z-values + if self.config.vel_model_version: + start = time.time() + # Basin membership + logger.info("Adding basin membership information...") + basin_boundaries = get_basin_boundaries(self.config.vel_model_version) + basin_df = gpd.GeoDataFrame( + {"basin": list(basin_boundaries.keys())}, + geometry=list(basin_boundaries.values()), + crs=WGS84_CRS, + ).to_crs(NZTM_CRS) + joined = gpd.sjoin( + site_points_df, + basin_df, + how="left", + predicate="within", + ) + # Keep first basin if in multiple basins + joined = joined.groupby(level=0).first() + site_df["basin"] = joined["basin"] + logger.info(f"Took: {time.time() - start} to add basin membership") + + # Get Z-values + start = time.time() + logger.info("Adding Z-values...") + logging.getLogger("nzcvm.threshold").setLevel(logging.WARNING) + z_values = compute_station_thresholds( + site_df, + model_version=self.config.vel_model_version, + show_progress=False, + include_sigma=False, + logger=logger, + ) + site_df["Z1.0"] = z_values["Z_1.0(km)"] + site_df["Z2.5"] = z_values["Z_2.5(km)"] + logger.info(f"Took: {time.time() - start} to add Z-values") + + # Add site ids + logger.info("Adding site code...") + virt_mask = site_df.source == "virtual" + site_df.loc[virt_mask, "site_code"] = np.char.add( + encode_base62_fixed_array( + site_df.loc[virt_mask].index.values.astype(int), length=5 + ), + site_df.loc[virt_mask].region_code.values.astype(str), + ) + site_df.loc[~virt_mask, "site_code"] = site_df.loc[~virt_mask].index.values + site_df = site_df.set_index("site_code") + site_df = site_df.astype({"source": "category"}) + + # Sanity check + assert site_df.index.is_unique, "Site IDs are not unique!" + return site_df + + +def get_basin_boundaries(vel_model_version: str) -> dict[str, shapely.Polygon]: + """ + Gets the basin boundaries for a given velocity model version. + + Parameters + ---------- + vel_model_version : str + The velocity model version to load basin data for. + + Returns + ------- + dict[str, shapely.Polygon] + Dictionary mapping basin names to their boundary polygons. + """ + cvm_registry = CVMRegistry(vel_model_version, get_data_root()) + basin_data = cvm_registry.load_basin_data(cvm_registry.global_params["basins"]) + + basin_boundaries = {} + for cur_basin_data in basin_data: + if len(cur_basin_data.boundaries) == 1: + basin_boundaries[cur_basin_data.name] = shapely.Polygon( + cur_basin_data.boundaries[0] + ) + else: + basin_boundaries[cur_basin_data.name] = shapely.union_all( + [shapely.Polygon(b) for b in cur_basin_data.boundaries] + ) + + return basin_boundaries + + +def encode_base62_fixed_array(nums: np.ndarray, length: int) -> np.ndarray: + """ + Vectorized Base62 encoder for many integers. + + Parameters + ---------- + nums : np.ndarray + Array of integers to encode. + length : int + The fixed length for each encoded string. + + Returns + ------- + np.ndarray + Array of Base62 encoded strings of fixed length. + """ + alphabet = np.array(list(string.digits + string.ascii_letters)) + alphabet_size = len(alphabet) + + if np.any(nums >= alphabet_size**length): + raise ValueError(f"Some numbers too large for {length}-char Base62 ID.") + + # Create empty char array: shape (N, MAX_LEN) + out = np.empty((nums.size, length), dtype="