Creating representative datasets from ncdump output #10773
-
| TLDR: Jump to the bottom. I'm interested in Unidata metadata and conventions.
 def from_xarray_dataset_dict(d) -> xr.Dataset:
     """Build a zero-filled dataset from ``xarray.Dataset.to_dict(data=False)`` output.

     Handy in issues when helping users debug their dataset metadata: the
     schema and its associated metadata can be shared without sharing the
     data itself.

     Parameters
     ----------
     d : dict
         Dictionary as produced by ``xarray.Dataset.to_dict(data=False)``.
         It is deep-copied before being processed, so the caller's object is
         left unmodified.

     Returns
     -------
     xr.Dataset
         Dataset rebuilt with ``xr.Dataset.from_dict`` after every
         ``dtype``/``shape`` entry has been replaced by an array of zeros.

     Examples
     --------
     >>> import xarray as xr
     >>> from parcels._datasets.structured.generic import datasets
     >>> ds = datasets['ds_2d_left']
     >>> d = ds.to_dict(data=False)
     >>> ds2 = from_xarray_dataset_dict(d)
     """
     # Work on a private copy: the fill step below rewrites the dict in place.
     schema = copy.deepcopy(d)
     populated = _fill_with_dummy_data(schema)
     return xr.Dataset.from_dict(populated)
def _fill_with_dummy_data(d: dict[str, dict]):
    """Replace ``dtype``/``shape`` schema entries with zero-filled arrays, in place.

    Walks the nested dictionary ``d``. Every dict that carries both a
    ``"dtype"`` and a ``"shape"`` key is treated as a variable description:
    it receives a ``"data"`` entry holding ``np.zeros(shape, dtype=dtype)``
    and the two schema keys are removed.

    Parameters
    ----------
    d : dict
        Nested dictionary to process. It is modified in place.

    Returns
    -------
    dict
        The same object that was passed in.

    Notes
    -----
    A dict that only has ``"dtype"`` (for example an ``encoding`` mapping) is
    not a variable description and is left untouched. The ``"attrs"`` and
    ``"encoding"`` mappings of a recognised variable are never descended
    into, so metadata that happens to use the names ``dtype``/``shape`` is
    preserved as-is.
    """
    assert isinstance(d, dict)

    # Both keys are needed to build the array; "dtype" alone also shows up in
    # plain metadata dicts and must not trigger a fill.
    is_variable = "dtype" in d and "shape" in d
    if is_variable:
        d["data"] = np.zeros(d["shape"], dtype=d["dtype"])
        del d["dtype"]
        del d["shape"]

    for key, value in d.items():
        # Metadata of a variable holds no nested variables; skip it so its
        # content is never mistaken for a schema entry.
        if is_variable and key in ("attrs", "encoding"):
            continue
        if isinstance(value, dict):
            # The recursive call mutates ``value`` in place, so no
            # re-assignment into ``d`` is required.
            _fill_with_dummy_data(value)
    return d


# If a user then does
def load_dataset() -> xr.Dataset:
    """Return a small example dataset with a grid-topology variable.

    The dataset holds a scalar ``grid`` variable whose attributes describe the
    topology (``cf_role="grid_topology"``; presumably following the SGRID
    conventions this discussion refers to) and a 2x2 variable ``u`` of ones
    on the dimensions ``("xi_u", "eta_u")`` that points at it through its
    ``grid`` attribute.
    """
    sgrid_roms = xr.Dataset()
    sgrid_roms["grid"] = xr.DataArray(
        0,
        attrs=dict(
            cf_role="grid_topology",
            topology_dimension=2,
            node_dimensions="xi_psi eta_psi",
            face_dimensions="xi_rho: xi_psi (padding: both) eta_rho: eta_psi (padding: both)",
            edge1_dimensions="xi_u: xi_psi eta_u: eta_psi (padding: both)",
            edge2_dimensions="xi_v: xi_psi (padding: both) eta_v: eta_psi",
            node_coordinates="lon_psi lat_psi",
            face_coordinates="lon_rho lat_rho",
            edge1_coordinates="lon_u lat_u",
            edge2_coordinates="lon_v lat_v",
            vertical_dimensions="s_rho: s_w (padding: none)",
        ),
    )
    sgrid_roms["u"] = (("xi_u", "eta_u"), np.ones((2, 2)), {"grid": "grid"})
    return sgrid_roms
load_dataset().to_dict(data=False)

Then, copy/pasting the (jsonified if you must) output into a GitHub issue:

{'coords': {}, 'attrs': {}, 'dims': {'xi_u': 2, 'eta_u': 2}, 'data_vars': {'grid': {'dims': (), 'attrs': {'cf_role': 'grid_topology', 'topology_dimension': 2, 'node_dimensions': 'xi_psi eta_psi', 'face_dimensions': 'xi_rho: xi_psi (padding: both) eta_rho: eta_psi (padding: both)', 'edge1_dimensions': 'xi_u: xi_psi eta_u: eta_psi (padding: both)', 'edge2_dimensions': 'xi_v: xi_psi (padding: both) eta_v: eta_psi', 'node_coordinates': 'lon_psi lat_psi', 'face_coordinates': 'lon_rho lat_rho', 'edge1_coordinates': 'lon_u lat_u', 'edge2_coordinates': 'lon_v lat_v', 'vertical_dimensions': 's_rho: s_w (padding: none)'}, 'dtype': 'int64', 'shape': ()}, 'u': {'dims': ('xi_u', 'eta_u'), 'attrs': {'grid': 'grid'}, 'dtype': 'float64', 'shape': (2, 2)}}}

We can do:

d = ...
ds = from_xarray_dataset_dict(d)

This is great, but it requires the user to have access to the original dataset. Is there tooling that I can use to ingest ncdump output, such as the examples listed on the UGRID / SGRID websites, into a representative xarray dataset? |
Beta Was this translation helpful? Give feedback.
Replies: 1 comment
-
Beta Was this translation helpful? Give feedback.
ncgen can be used to parse ncdump (i.e., CDL) output