diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..7678b0d8 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,38 @@ +{ + "name": "MTUQ Development", + "dockerComposeFile": "../docker-compose.yml", + "service": "dev", + "workspaceFolder": "/workspace", + + // Configure tool-specific properties + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-python.vscode-pylance", + "ms-python.black-formatter", + "ms-python.isort", + "ms-python.flake8", + "ms-toolsai.jupyter", + "ms-vscode.vscode-json" + ], + "settings": { + "python.defaultInterpreterPath": "/usr/local/bin/python", + "python.testing.pytestEnabled": true, + "python.testing.unittestEnabled": false, + "editor.formatOnSave": true, + "files.eol": "\n" + } + } + }, + + // Use 'forwardPorts' to make a list of ports inside the container available locally + // "forwardPorts": [8000, 8888], + + // Use 'postCreateCommand' to run commands after the container is created + "postCreateCommand": "pip install -e .", + + // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root + "remoteUser": "developer" +} + diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..69d39805 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,65 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +venv/ +env/ +ENV/ +.venv + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Git +.git/ +.gitignore + +# Documentation builds +docs/_build/ +docs/build/ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ + +# Jupyter +.ipynb_checkpoints/ +*.ipynb + +# OS +.DS_Store +Thumbs.db + +# Docker +Dockerfile +docker-compose.yml +.dockerignore + +# Data files (optional - uncomment the next line to exclude data from the image) +# data/ + diff --git a/.gitignore b/.gitignore index 8e27fa22..a3c75f5b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ tmp* *.eps *.png *.ps +*.nc +*_solution.json build/ __pycache__/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..823f1ff6 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,48 @@ +# Development Dockerfile for mtuq +FROM python:3.11-slim + +# Set working directory +WORKDIR /workspace + +# Install system dependencies including OpenMPI for parallel execution +RUN apt-get update && apt-get install -y \ + build-essential \ + gcc \ + g++ \ + gfortran \ + libnetcdf-dev \ + libhdf5-dev \ + pkg-config \ + git \ + vim \ + curl \ + openmpi-bin \ + libopenmpi-dev \ + && rm -rf /var/lib/apt/lists/* + +# Set environment variables +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + OMPI_ALLOW_RUN_AS_ROOT=1 \ + OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \ + OMPI_MCA_btl_vader_single_copy_mechanism=none + +# Copy the full source to install dependencies from pyproject.toml +COPY . 
/workspace + +# Install Python dependencies from pyproject.toml (single source of truth) +# Use [dev] extras to include development dependencies like mpi4py +RUN pip install --upgrade pip setuptools wheel && \ + pip install -e ".[dev]" + +# Create a non-root user for development +RUN useradd -m -s /bin/bash developer && \ + chown -R developer:developer /workspace + +USER developer + +# Default command +CMD ["/bin/bash"] + diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..f3cd7788 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,23 @@ +services: + dev: + build: + context: . + dockerfile: Dockerfile + container_name: mtuq-dev + volumes: + - .:/workspace + - mtuq-data:/workspace/data + working_dir: /workspace + stdin_open: true + tty: true + environment: + - PYTHONUNBUFFERED=1 + - PYTHONDONTWRITEBYTECODE=1 + # Uncomment the following lines if you need to expose ports + # ports: + # - "8000:8000" + # - "8888:8888" + +volumes: + mtuq-data: + diff --git a/mtuq/grid_search.py b/mtuq/grid_search.py index e9e08658..25603d1d 100644 --- a/mtuq/grid_search.py +++ b/mtuq/grid_search.py @@ -1,4 +1,3 @@ - import h5py import netCDF4 import numpy as np @@ -8,8 +7,16 @@ from collections.abc import Iterable from mtuq.event import Origin from mtuq.grid import DataFrame, DataArray, Grid, UnstructuredGrid -from mtuq.util import gather2, iterable, timer, remove_list, warn,\ - ProgressCallback, dataarray_idxmin, dataarray_idxmax +from mtuq.util import ( + gather2, + iterable, + timer, + remove_list, + warn, + ProgressCallback, + dataarray_idxmin, + dataarray_idxmax, +) from os.path import splitext from xarray.core.formatting import unindexed_dims_repr @@ -17,18 +24,26 @@ xarray.set_options(keep_attrs=True) -def grid_search(data, greens, misfit, origins, sources, - msg_interval=25, timed=True, verbose=1, gather=True): - - """ Evaluates misfit over grids +def grid_search( + data, + greens, + misfit, + origins, + sources, + msg_interval=25, + timed=True, + verbose=1, + gather=True, +): + """Evaluates misfit over grids .. rubric :: Usage - Carries out a grid search by evaluating + Carries out a grid search by evaluating `misfit(data, greens.select(origin), source)` over all origins and sources. If `origins` and `sources` are regularly-spaced, returns an `MTUQDataArray` - containing misfit values and corresponding grid points. Otherwise, + containing misfit values and corresponding grid points. Otherwise, an `MTUQDataFrame` is returned. @@ -55,7 +70,7 @@ def grid_search(data, greens, misfit, origins, sources, ``msg_interval`` (`int`): - How frequently, as a percentage of total evaluations, should progress + How frequently, as a percentage of total evaluations, should progress messages be displayed? 
(`int` between 0 and 100) @@ -85,38 +100,35 @@ if type(sources) not in (Grid, UnstructuredGrid): raise TypeError - size = len(origins)*sources.size - + size = len(origins) * sources.size if _is_mpi_env(): from mpi4py import MPI + comm = MPI.COMM_WORLD iproc, nproc = comm.rank, comm.size if nproc > sources.size: - raise Exception('Number of CPU cores exceeds size of grid') - + raise Exception("Number of CPU cores exceeds size of grid") # print debugging information - if verbose>0 and _is_mpi_env() and iproc==0: + if verbose > 0 and _is_mpi_env() and iproc == 0: try: print(misfit.description()) except: pass - print(' Number of misfit evaluations: {:,}\n'.format(size)) - print(' Number of MPI processes: {:,}'.format(nproc)) - print(' Number of evaluations per process: {:,}\n'.format(size//nproc)) - + print(" Number of misfit evaluations: {:,}\n".format(size)) + print(" Number of MPI processes: {:,}".format(nproc)) + print(" Number of evaluations per process: {:,}\n".format(size // nproc)) - elif verbose>0 and not _is_mpi_env(): + elif verbose > 0 and not _is_mpi_env(): try: print(misfit.description()) except: pass - print(' Number of misfit evaluations: {:,}\n'.format(size)) - + print(" Number of misfit evaluations: {:,}\n".format(size)) if _is_mpi_env(): # @@ -132,14 +144,12 @@ def grid_search(data, greens, misfit, origins, sources, timed = False msg_interval = 0 - # # evaluate misfit over grids # values = _grid_search_serial( - data, greens, misfit, origins, sources, timed=timed, - msg_interval=msg_interval) - + data, greens, misfit, origins, sources, timed=timed, msg_interval=msg_interval + ) # # collect results @@ -149,7 +159,7 @@ def grid_search(data, greens, misfit, origins, sources, values = gather2(comm, values) sources = _all - if iproc!=0: + if iproc != 0: return # convert from NumPy array to DataArray or DataFrame @@ -160,11 +170,11 @@ def grid_search(data, greens, misfit, origins, sources, return _to_dataframe(origins, sources, values) - @timer -def _grid_search_serial(data, greens, misfit, origins, sources, - timed=True, msg_interval=25): - """ Evaluates misfit over origin and source grids +def _grid_search_serial( + data, greens, misfit, origins, sources, timed=True, msg_interval=25 +): + """Evaluates misfit over origin and source grids (serial implementation) """ ni = len(origins) @@ -173,63 +183,62 @@ def _grid_search_serial(data, greens, misfit, origins, sources, values = [] for _i, origin in enumerate(origins): - msg_handle = ProgressCallback( - start=_i*nj, stop=ni*nj, percent=msg_interval) + msg_handle = ProgressCallback(start=_i * nj, stop=ni * nj, percent=msg_interval) # evaluate misfit function - values += [misfit( - data, greens.select(origin), sources, msg_handle)] + values += [misfit(data, greens.select(origin), sources, msg_handle)] - # returns NumPy array of shape `(len(sources), len(origins))` + # returns NumPy array of shape `(len(sources), len(origins))` return np.concatenate(values, axis=1) - class MTUQDataArray(xarray.DataArray): - """ Data structure for storing values on regularly-spaced grids + """Data structure for storing values on regularly-spaced grids .. note:: Besides the methods below, `MTUQDataArray` includes many useful methods - inherited from ``xarray.DataArray``. See - `xarray documentation <https://docs.xarray.dev/en/stable/>`_ + inherited from ``xarray.DataArray``. See + `xarray documentation <https://docs.xarray.dev/en/stable/>`_ for more information. 
""" def origin_idxmin(self): - """ Returns `origins` index corresponding to minimum misfit - """ - return int(dataarray_idxmin(self)['origin_idx']) + """Returns `origins` index corresponding to minimum misfit""" + return int(dataarray_idxmin(self)["origin_idx"]) def source_idxmin(self): - """ Returns `sources` index corresponding to minimum misfit - """ + """Returns `sources` index corresponding to minimum misfit""" shape = self._get_shape() return np.unravel_index(self.argmin(), shape)[0] def _get_shape(self): - """ Private helper method - """ - nn = len(self.coords['origin_idx']) - return (int(self.size/nn), nn) + """Private helper method""" + nn = len(self.coords["origin_idx"]) + return (int(self.size / nn), nn) def save(self, filename, *args, **kwargs): - """ Saves grid search results to NetCDF file - """ - print(' saving NetCDF file: %s' % filename) - self.to_netcdf(filename) + """Saves grid search results to NetCDF file""" + import os + + # Remove existing file to avoid HDF5/NetCDF conflicts + if os.path.exists(filename): + os.remove(filename) + print(" saving NetCDF file: %s" % filename) + # Use scipy engine for better compatibility in MPI environments + self.to_netcdf(filename, engine="scipy") def __repr__(self): summary = [ - 'Summary:', - ' grid shape: %s' % self.shape.__repr__(), - ' grid size: %d' % self.size, - ' mean: %.3e' % np.mean(self.values), - ' std: %.3e' % np.std(self.values), - ' min: %.3e' % self.values.min(), - ' max: %.3e' % self.values.max(), - '', + "Summary:", + " grid shape: %s" % self.shape.__repr__(), + " grid size: %d" % self.size, + " mean: %.3e" % np.mean(self.values), + " std: %.3e" % np.std(self.values), + " min: %.3e" % self.values.min(), + " max: %.3e" % self.values.max(), + "", ] if hasattr(self, "coords"): @@ -240,39 +249,41 @@ def __repr__(self): if unindexed_dims_str: summary.append(unindexed_dims_str) - return "\n".join(summary+['']) - + return "\n".join(summary + [""]) class MTUQDataFrame(pandas.DataFrame): - """ Data structure for storing values on irregularly-spaced grids + """Data structure for storing values on irregularly-spaced grids .. note:: Besides the methods below, `MTUQDataFrame` includes many useful methods - inherited from ``pandas.DataFrame``. See `pandas documentation + inherited from ``pandas.DataFrame``. See `pandas documentation <https://pandas.pydata.org/docs/>`_ for more information. 
""" + def origin_idxmin(self): - """ Returns coordinates corresponding to minimum misfit - """ + """Returns coordinates corresponding to minimum misfit""" df = self.reset_index() - return df['origin_idx'][df[0].idxmin()] + return df["origin_idx"][df[0].idxmin()] def source_idxmin(self): - """ Returns coordinates corresponding to minimum misfit - """ + """Returns coordinates corresponding to minimum misfit""" df = self.reset_index() - return df['source_idx'][df[0].idxmin()] + return df["source_idx"][df[0].idxmin()] def save(self, filename, *args, **kwargs): - """ Saves grid search results to HDF5 file - """ - print(' saving HDF5 file: %s' % filename) + """Saves grid search results to HDF5 file""" + import os + + # Remove existing file to avoid HDF5 conflicts + if os.path.exists(filename): + os.remove(filename) + print(" saving HDF5 file: %s" % filename) df = pandas.DataFrame(self.values, index=self.index) - df.to_hdf(filename, key='df', mode='w') + df.to_hdf(filename, key="df", mode="w") @property def _constructor(self): @@ -283,6 +294,7 @@ def _constructor(self): # utility functions # + def _is_mpi_env(): try: import mpi4py @@ -294,16 +306,15 @@ def _is_mpi_env(): except ImportError: return False - if mpi4py.MPI.COMM_WORLD.Get_size()>1: + if mpi4py.MPI.COMM_WORLD.Get_size() > 1: return True else: return False def _to_dataarray(origins, sources, values): - """ Converts grid_search inputs to DataArray - """ - origin_dims = ('origin_idx',) + """Converts grid_search inputs to DataArray""" + origin_dims = ("origin_idx",) origin_coords = [np.arange(len(origins))] origin_shape = (len(origins),) @@ -311,23 +322,25 @@ def _to_dataarray(origins, sources, values): source_coords = sources.coords source_shape = sources.shape - return MTUQDataArray(**{ - 'data': np.reshape(values, source_shape + origin_shape), - 'coords': source_coords + origin_coords, - 'dims': source_dims + origin_dims, - }) + return MTUQDataArray( + **{ + "data": np.reshape(values, source_shape + origin_shape), + "coords": source_coords + origin_coords, + "dims": source_dims + origin_dims, + } + ) def _to_dataframe(origins, sources, values, index_type=2): - """ Converts grid_search inputs to DataFrame - """ - if len(origins)*len(sources) > 1.e7: - print(" pandas indexing becomes very slow with >10 million rows\n" - " consider using index_type=1 in mtuq.grid_search._to_dataframe\n" - ) + """Converts grid_search inputs to DataFrame""" + if len(origins) * len(sources) > 1.0e7: + print( + " pandas indexing becomes very slow with >10 million rows\n" + " consider using index_type=1 in mtuq.grid_search._to_dataframe\n" + ) - origin_idx = np.arange(len(origins), dtype='int') - source_idx = np.arange(len(sources), dtype='int') + origin_idx = np.arange(len(origins), dtype="int") + source_idx = np.arange(len(sources), dtype="int") # Cartesian products origin_idx = list(np.repeat(origin_idx, len(sources))) @@ -338,8 +351,8 @@ def _to_dataframe(origins, sources, values, index_type=2): # assemble coordinates coords = [origin_idx, source_idx] - dims = ('origin_idx', 'source_idx') - if index_type==2: + dims = ("origin_idx", "source_idx") + if index_type == 2: coords += source_coords dims += sources.dims @@ -355,8 +368,9 @@ def _to_dataframe(origins, sources, values, index_type=2): # I/O functions # + def open_ds(filename, format=None): - """ Reads grid search results from disk + """Reads grid search results from disk .. 
rubric :: Parameters @@ -370,34 +384,31 @@ if not format: # try to determine file format, if not given if h5py.is_hdf5(filename): - format = 'HDF' + format = "HDF" else: try: netCDF4.Dataset(filename, "r") - format = 'NETCDF4' + format = "NETCDF4" except: - raise Exception('File format not recognized: %s' % filename) + raise Exception("File format not recognized: %s" % filename) - if format.upper() in ['H5', 'HDF','HDF5']: + if format.upper() in ["H5", "HDF", "HDF5"]: return open_df(filename) - elif format.upper() in ['NC', 'NC4', 'NETCDF', 'NETCDF4']: + elif format.upper() in ["NC", "NC4", "NETCDF", "NETCDF4"]: return open_da(filename) else: - raise Exception('File format not supported: %s' % filename) + raise Exception("File format not supported: %s" % filename) def open_da(filename): - """ Reads MTUQDataArray from NetCDF file - """ + """Reads MTUQDataArray from NetCDF file""" da = xarray.open_dataarray(filename) return MTUQDataArray(data=da.values, coords=da.coords, dims=da.dims) def open_df(filename): - """ Reads MTUQDataFrame from HDF5 file - """ + """Reads MTUQDataFrame from HDF5 file""" df = pandas.read_hdf(filename) return MTUQDataFrame(df.values, index=df.index) - diff --git a/pyproject.toml b/pyproject.toml index 9743dd2a..1b164d9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,11 @@ dependencies = [ "seisclient", ] +[project.optional-dependencies] +dev = [ + "mpi4py", +] + [project.entry-points."readers"] SAC = "mtuq.io.readers.SAC:read"
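
A brief sketch of how the pieces above fit together: with OpenMPI installed by the Dockerfile and the `mpi4py` dev extra from pyproject.toml, the MPI code path in `grid_search` can be exercised as follows. Here `data`, `greens`, `misfit`, `origins`, and `sources` are placeholders for objects assembled through the usual MTUQ workflow, and `run_search.py` is a hypothetical script name.

    # run_search.py -- launch with, e.g.: mpirun -n 4 python run_search.py
    from mtuq.grid_search import grid_search

    # Each MPI rank evaluates its share of the grid; with gather=True (the
    # default) the partial results are collected onto rank 0.
    results = grid_search(data, greens, misfit, origins, sources)

    # Only rank 0 returns a results object; every other rank returns None.
    if results is not None:
        results.save("misfit.nc")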
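
The revised save methods pair with `open_ds` for reading results back: `MTUQDataArray.save` now writes NetCDF through the scipy engine, `MTUQDataFrame.save` writes HDF5, and `open_ds` detects either format automatically. A minimal round-trip sketch, assuming `misfit.nc` and `misfit.h5` were produced by earlier `save` calls (the filenames are illustrative):

    from mtuq.grid_search import open_ds

    da = open_ds("misfit.nc")   # detected as NetCDF, returns MTUQDataArray
    df = open_ds("misfit.h5")   # detected as HDF5, returns MTUQDataFrame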