From cee676dd97f660881de5ff109eb3016b5c2c49e8 Mon Sep 17 00:00:00 2001 From: Filipe Fernandes Date: Fri, 31 May 2024 19:58:19 +0200 Subject: [PATCH] ruff all and fix lints --- .pre-commit-config.yaml | 2 +- ctd/__init__.py | 5 +- ctd/extras.py | 151 ++++++++------- ctd/plotting.py | 25 ++- ctd/processing.py | 127 +++++++------ ctd/read.py | 282 ++++++++++++++++------------- docs/source/conf.py | 6 +- notebooks/00-reading-data.ipynb | 3 - notebooks/quick_intro.ipynb | 37 ++-- pyproject.toml | 4 +- requirements-dev.txt | 18 +- requirements.txt | 2 +- ruff.toml | 53 ++++++ tests/test_plotting.py | 27 ++- tests/test_processing.py | 45 +++-- tests/test_processing_real_data.py | 28 ++- tests/test_read.py | 107 ++++++----- 17 files changed, 522 insertions(+), 400 deletions(-) create mode 100644 ruff.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c3e6905..a967a2b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,7 +39,7 @@ repos: - id: add-trailing-comma - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.7 + rev: v0.5.0 hooks: - id: ruff args: ["--fix", "--show-fixes"] diff --git a/ctd/__init__.py b/ctd/__init__.py index e4adab3..8da35d1 100644 --- a/ctd/__init__.py +++ b/ctd/__init__.py @@ -1,6 +1,5 @@ -""" -Tools to load hydrographic data as pandas DataFrame with some handy methods for -data pre-processing and analysis. +"""Tools to load hydrographic data as pandas DataFrame with some handy methods +for data pre-processing and analysis. """ from .plotting import plot_cast diff --git a/ctd/extras.py b/ctd/extras.py index 62f920a..34c79e7 100644 --- a/ctd/extras.py +++ b/ctd/extras.py @@ -1,16 +1,13 @@ -""" -Extra functionality for plotting and post-processing. -""" +"""Extra functionality for plotting and post-processing.""" import matplotlib.pyplot as plt import numpy as np -import numpy.ma as ma -from pandas import Series +import pandas as pd +from numpy import ma def _extrap1d(interpolator): - """ - How to make scipy.interpolate return an extrapolated result beyond the + """How to make scipy.interpolate return an extrapolated result beyond the input range. This is usually bad interpolation! But sometimes useful for pretty pictures, @@ -25,10 +22,9 @@ def pointwise(x): """Pointwise interpolation.""" if x < xs[0]: return ys[0] + (x - xs[0]) * (ys[1] - ys[0]) / (xs[1] - xs[0]) - elif x > xs[-1]: + if x > xs[-1]: return ys[-1] + (x - xs[-1]) * (ys[-1] - ys[-2]) / (xs[-1] - xs[-2]) - else: - return interpolator(x) + return interpolator(x) def ufunclike(xs): """Return an interpolation ufunc.""" @@ -39,30 +35,34 @@ def ufunclike(xs): def get_maxdepth(self): """Return the maximum depth/pressure of a cast.""" - valid_last_depth = self.apply(Series.notnull).values.T - return np.float_(self.index.values * valid_last_depth).max(axis=1) - - -def extrap_sec(data, dist, depth, w1=1.0, w2=0): - """ - Extrapolates `data` to zones where the shallow stations are shadowed by + valid_last_depth = self.apply(pd.Series.notnull).to_numpy().T + return np.float64(self.index.to_numpy() * valid_last_depth).max(axis=1) + + +def extrap_sec( + data: np.ndarray, + dist: np.ndarray, + depth: np.ndarray, + w1: float = 1.0, + w2: float = 0, +) -> np.ndarray: + """Extrapolate `data` to zones where the shallow stations are shadowed by the deep stations. The shadow region usually cannot be extrapolates via linear interpolation. The extrapolation is applied using the gradients of the `data` at a certain level. 
- Parameters - ---------- - data : array_like - Data to be extrapolated - dist : array_like - Stations distance - fd : float - Decay factor [0-1] + Inputs + ------ + data : Data to be extrapolated + dist : Stations distance + depth : Depth of the profile + w1 : weights [0-1] + w2 : weights [0-1] - Returns + Outputs ------- Sec_extrap : array_like Extrapolated variable @@ -72,39 +72,45 @@ def extrap_sec(data, dist, depth, w1=1.0, w2=0): new_data1 = [] for row in data: + new_row = row.copy() mask = ~np.isnan(row) if mask.any(): y = row[mask] if y.size == 1: - row = np.repeat(y, len(mask)) + new_row = np.repeat(y, len(mask)) else: x = dist[mask] f_i = interp1d(x, y) f_x = _extrap1d(f_i) - row = f_x(dist) - new_data1.append(row) + new_row = f_x(dist) + new_data1.append(new_row) new_data2 = [] for col in data.T: + new_col = col.copy() mask = ~np.isnan(col) if mask.any(): y = col[mask] if y.size == 1: - col = np.repeat(y, len(mask)) + new_col = np.repeat(y, len(mask)) else: z = depth[mask] f_i = interp1d(z, y) f_z = _extrap1d(f_i) - col = f_z(depth) - new_data2.append(col) + new_col = f_z(depth) + new_data2.append(new_col) - new_data = np.array(new_data1) * w1 + np.array(new_data2).T * w2 - return new_data + return np.array(new_data1) * w1 + np.array(new_data2).T * w2 -def gen_topomask(h, lon, lat, dx=1.0, kind="linear", plot=False): - """ - Generates a topography mask from an oceanographic transect taking the +def gen_topomask( + h: np.ndarray, + lon: np.ndarray, + lat: np.ndarray, + dx: float = 1.0, + kind: str = "linear", +) -> tuple: + """Generate a topography mask from an oceanographic transect taking the deepest CTD scan as the depth of each station. Inputs @@ -119,8 +125,6 @@ def gen_topomask(h, lon, lat, dx=1.0, kind="linear", plot=False): kind : string, optional Type of the interpolation to be performed. See scipy.interpolate.interp1d documentation for details. - plot : bool - Whether to plot mask for visualization. Outputs ------- @@ -134,7 +138,6 @@ def gen_topomask(h, lon, lat, dx=1.0, kind="linear", plot=False): André Palóczy Filho (paloczy@gmail.com) -- October/2012 """ - import gsw from scipy.interpolate import interp1d @@ -142,18 +145,26 @@ def gen_topomask(h, lon, lat, dx=1.0, kind="linear", plot=False): # Distance in km. x = np.append(0, np.cumsum(gsw.distance(lon, lat)[0] / 1e3)) h = -gsw.z_from_p(h, lat.mean()) - Ih = interp1d(x, h, kind=kind, bounds_error=False, fill_value=h[-1]) + ih = interp1d(x, h, kind=kind, bounds_error=False, fill_value=h[-1]) xm = np.arange(0, x.max() + dx, dx) - hm = Ih(xm) + hm = ih(xm) return xm, hm -def plot_section(self, reverse=False, filled=False, **kw): +def plot_section( # noqa: PLR0915 + self: pd.DataFrame, + *, + reverse: bool = False, + filled: bool = False, + **kw: dict, +) -> tuple: """Plot a sequence of CTD casts as a section.""" import gsw - lon, lat, data = list(map(np.asanyarray, (self.lon, self.lat, self.values))) + lon, lat, data = list( + map(np.asanyarray, (self.lon, self.lat, self.to_numpy())), + ) data = ma.masked_invalid(data) h = self.get_maxdepth() if reverse: @@ -163,7 +174,7 @@ def plot_section(self, reverse=False, filled=False, **kw): h = h[::-1] lon, lat = map(np.atleast_2d, (lon, lat)) x = np.append(0, np.cumsum(gsw.distance(lon, lat)[0] / 1e3)) - z = self.index.values.astype(float) + z = self.index.to_numpy().astype(float) if filled: # CAVEAT: this method cause discontinuities. 
data = data.filled(fill_value=np.nan) @@ -248,51 +259,53 @@ def plot_section(self, reverse=False, filled=False, **kw): return fig, ax, cb -def cell_thermal_mass(temperature, conductivity): - """ - Sample interval is measured in seconds. +def cell_thermal_mass( + temperature: pd.Series, + conductivity: pd.Series, +) -> pd.Series: + """Sample interval is measured in seconds. Temperature in degrees. CTM is calculated in S/m. """ - alpha = 0.03 # Thermal anomaly amplitude. beta = 1.0 / 7 # Thermal anomaly time constant (1/beta). sample_interval = 1 / 15.0 a = 2 * alpha / (sample_interval * beta + 2) b = 1 - (2 * a / alpha) - dCodT = 0.1 * (1 + 0.006 * [temperature - 20]) - dT = np.diff(temperature) - ctm = -1.0 * b * conductivity + a * (dCodT) * dT # [S/m] - return ctm + dc_o_dt = 0.1 * (1 + 0.006 * [temperature - 20]) + dt = np.diff(temperature) + return -1.0 * b * conductivity + a * (dc_o_dt) * dt # [S/m] -def mixed_layer_depth(CT, method="half degree"): +def mixed_layer_depth(ct: pd.Series, method: str = "half degree") -> pd.Series: """Return the mixed layer depth based on the "half degree" criteria.""" - if method == "half degree": - mask = CT[0] - CT < 0.5 - else: - mask = np.zeros_like(CT) - return Series(mask, index=CT.index, name="MLD") + half_degree = 0.5 + mask = ( + ct[0] - ct < half_degree + if method == "half degree" + else np.zeros_like(ct) + ) + return pd.Series(mask, index=ct.index, name="MLD") -def barrier_layer_thickness(SA, CT): - """ - Compute the thickness of water separating the mixed surface layer from the - thermocline. A more precise definition would be the difference between - mixed layer depth (MLD) calculated from temperature minus the mixed layer - depth calculated using density. +def barrier_layer_thickness(sa: pd.Series, ct: pd.Series) -> pd.Series: + """Compute the thickness of water separating the mixed surface layer from + the thermocline. + A more precise definition would be the difference between mixed layer depth + (MLD) calculated from temperature minus the mixed layer depth calculated + using density. """ import gsw - sigma_theta = gsw.sigma0(SA, CT) - mask = mixed_layer_depth(CT) + sigma_theta = gsw.sigma0(sa, ct) + mask = mixed_layer_depth(ct) mld = np.where(mask)[0][-1] sig_surface = sigma_theta[0] - sig_bottom_mld = gsw.sigma0(SA[0], CT[mld]) + sig_bottom_mld = gsw.sigma0(sa[0], ct[mld]) d_sig_t = sig_surface - sig_bottom_mld d_sig = sigma_theta - sig_bottom_mld mask = d_sig < d_sig_t # Barrier layer. - return Series(mask, index=SA.index, name="BLT") + return pd.Series(mask, index=sa.index, name="BLT") diff --git a/ctd/plotting.py b/ctd/plotting.py index 260f80f..0558e1e 100644 --- a/ctd/plotting.py +++ b/ctd/plotting.py @@ -1,20 +1,25 @@ -""" -Plotting module -""" +"""Plotting module.""" + +from __future__ import annotations import matplotlib.pyplot as plt import pandas as pd from pandas_flavor import register_dataframe_method, register_series_method +cast = pd.DataFrame | pd.Series + @register_series_method @register_dataframe_method -def plot_cast(df, secondary_y=False, label=None, ax=None, *args, **kwargs): - """ - Plot a CTD variable with the index in the y-axis instead of x-axis. 
- - """ - +def plot_cast( + df: cast, + *, + secondary_y: bool = False, + label: str | None = None, + ax: plt.Axes | None = None, + **kwargs: dict, +) -> cast: + """Plot a CTD variable with the index in the y-axis instead of x-axis.""" fignums = plt.get_fignums() if ax is None and not fignums: ax = plt.axes() @@ -44,7 +49,7 @@ def plot_cast(df, secondary_y=False, label=None, ax=None, *args, **kwargs): ax.plot(series, series.index, label=labels[k]) elif isinstance(df, pd.Series): label = label if label else str(df.name) - ax.plot(df.values, df.index, *args, label=label, **kwargs) + ax.plot(df.values, df.index, label=label, **kwargs) ax.set_ylabel(ylabel) ax.set_xlabel(xlabel) diff --git a/ctd/processing.py b/ctd/processing.py index 8aecc11..cdecfb9 100644 --- a/ctd/processing.py +++ b/ctd/processing.py @@ -1,37 +1,35 @@ -""" -Processing module -""" +"""Processing module.""" import numpy as np -import numpy.ma as ma import pandas as pd +from numpy import ma from pandas_flavor import register_dataframe_method, register_series_method +cast = pd.DataFrame | pd.Series -def _rolling_window(data, block): - """ - http://stackoverflow.com/questions/4936620/ + +def _rolling_window(data: np.ndarray, block: int) -> np.ndarray: + """http://stackoverflow.com/questions/4936620/ Using strides for an efficient moving average filter. """ shape = data.shape[:-1] + (data.shape[-1] - block + 1, block) - strides = data.strides + (data.strides[-1],) + strides = (*data.strides, data.strides[-1]) return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides) @register_series_method @register_dataframe_method -def remove_above_water(df): +def remove_above_water(df: cast) -> cast: """Remove all data above the water line.""" return remove_up_to(df, idx=0) @register_series_method @register_dataframe_method -def remove_up_to(df, idx): - """ - Remove all the data above a certain index value where index can be pressure or depth. - +def remove_up_to(df: cast, idx: int) -> cast: + """Remove all the data above a certain index value where index can be + pressure or depth. """ new_df = df.copy() return new_df[new_df.index >= idx] @@ -39,8 +37,8 @@ def remove_up_to(df, idx): @register_series_method @register_dataframe_method -def split(df): - """Returns a tuple with down/up-cast.""" +def split(df: cast) -> cast: + """Return a tuple with down/up-cast.""" idx = df.index.argmax() + 1 down = df.iloc[:idx] # Reverse index to orient it as a CTD cast. @@ -50,9 +48,12 @@ def split(df): @register_series_method @register_dataframe_method -def lp_filter(df, sample_rate=24.0, time_constant=0.15): - """ - Filter a series with `time_constant` (use 0.15 s for pressure), and for +def lp_filter( + df: cast, + sample_rate: float = 24.0, + time_constant: float = 0.15, +) -> cast: + """Filter a series with `time_constant` (use 0.15 s for pressure), and for a signal of `sample_rate` in Hertz (24 Hz for 911+). NOTE: 911+ systems do not require filter for temperature nor salinity. 
@@ -65,9 +66,9 @@ def lp_filter(df, sample_rate=24.0, time_constant=0.15): >>> raw = ctd.from_cnv(data_path.joinpath("CTD-spiked-unfiltered.cnv.bz2")) >>> prc = ctd.from_cnv(data_path.joinpath("CTD-spiked-filtered.cnv.bz2")) >>> kw = {"sample_rate": 24.0, "time_constant": 0.15} - >>> original = prc.index.values - >>> unfiltered = raw.index.values - >>> filtered = raw.lp_filter(**kw).index.values + >>> original = prc.index.to_numpy() + >>> unfiltered = raw.index.to_numpy() + >>> filtered = raw.lp_filter(**kw).index.to_numpy() >>> fig, ax = plt.subplots() >>> (l1,) = ax.plot(original, "k", label="original") >>> (l2,) = ax.plot(unfiltered, "r", label="unfiltered") @@ -79,26 +80,22 @@ def lp_filter(df, sample_rate=24.0, time_constant=0.15): https://scipy-cookbook.readthedocs.io/items/FIRFilter.html """ - from scipy import signal # Butter is closer to what SBE is doing with their cosine filter. - Wn = (1.0 / time_constant) / (sample_rate * 2.0) - b, a = signal.butter(2, Wn, "low") + wn = (1.0 / time_constant) / (sample_rate * 2.0) + b, a = signal.butter(2, wn, "low") new_df = df.copy() - new_df.index = signal.filtfilt(b, a, df.index.values) + new_df.index = signal.filtfilt(b, a, df.index.to_numpy()) return new_df @register_series_method @register_dataframe_method -def press_check(df): - """ - Remove pressure reversals from the index. - - """ +def press_check(df: cast) -> cast: + """Remove pressure reversals from the index.""" new_df = df.copy() - press = new_df.copy().index.values + press = new_df.copy().index.to_numpy() ref = press[0] inversions = np.diff(np.r_[press, press[-1]]) < 0 @@ -108,11 +105,11 @@ def press_check(df): ref = press[k] cut = press[k + 1 :] < ref mask[k + 1 :][cut] = True - new_df[mask] = np.NaN + new_df[mask] = np.nan return new_df -def _bindata(series, delta, method): +def _bindata(series: pd.Series, delta: int, method: str) -> pd.Series: """Average the data into bins of the size `delta`.""" start = np.ceil(series.index[0]) stop = np.floor(series.index[-1]) @@ -125,17 +122,17 @@ def _bindata(series, delta, method): data = np.interp(new_index, series.index, series) return pd.Series(data, index=new_index, name=series.name) else: + msg = f"Expected method `average` or `interpolate`, but got {method}." raise ValueError( - f"Expected method `average` or `interpolate`, but got {method}.", + msg, ) return new_series @register_series_method @register_dataframe_method -def bindata(df, delta=1.0, method="average"): - """ - Bin average the index (usually pressure) to a given interval (default +def bindata(df: cast, delta: float = 1.0, method: str = "average") -> cast: + """Bin average the index (usually pressure) to a given interval (default delta = 1). """ @@ -146,14 +143,12 @@ def bindata(df, delta=1.0, method="average"): return new_df -def _despike(series, n1, n2, block, keep): - """ - Wild Edit Seabird-like function. Passes with Standard deviation +def _despike(series: pd.Series, n1: int, n2: int, block: int) -> pd.Series: + """Wild Edit Seabird-like function. Passes with Standard deviation `n1` and `n2` with window size `block`. """ - - data = series.values.astype(float).copy() + data = series.to_numpy().astype(float).copy() roll = _rolling_window(data, block) roll = ma.masked_invalid(roll) std = n1 * roll.std(axis=1) @@ -161,8 +156,10 @@ def _despike(series, n1, n2, block, keep): # Use the last value to fill-up. 
std = np.r_[std, np.tile(std[-1], block - 1)] mean = np.r_[mean, np.tile(mean[-1], block - 1)] - mask = np.abs(data - mean.filled(fill_value=np.NaN)) > std.filled(fill_value=np.NaN) - data[mask] = np.NaN + mask = np.abs(data - mean.filled(fill_value=np.nan)) > std.filled( + fill_value=np.nan, + ) + data[mask] = np.nan # Pass two recompute the mean and std without the flagged values from pass # one and removed the flagged data. @@ -173,34 +170,32 @@ def _despike(series, n1, n2, block, keep): # Use the last value to fill-up. std = np.r_[std, np.tile(std[-1], block - 1)] mean = np.r_[mean, np.tile(mean[-1], block - 1)] - values = series.values.astype(float) - mask = np.abs(values - mean.filled(fill_value=np.NaN)) > std.filled( - fill_value=np.NaN, + values = series.to_numpy().astype(float) + mask = np.abs(values - mean.filled(fill_value=np.nan)) > std.filled( + fill_value=np.nan, ) clean = series.astype(float).copy() - clean[mask] = np.NaN + clean[mask] = np.nan return clean @register_series_method @register_dataframe_method -def despike(df, n1=2, n2=20, block=100, keep=0): - """ - Wild Edit Seabird-like function. Passes with Standard deviation +def despike(df: cast, n1: int = 2, n2: int = 20, block: int = 100) -> cast: + """Wild Edit Seabird-like function. Passes with Standard deviation `n1` and `n2` with window size `block`. """ if isinstance(df, pd.Series): - new_df = _despike(df, n1=n1, n2=n2, block=block, keep=keep) + new_df = _despike(df, n1=n1, n2=n2, block=block) else: - new_df = df.apply(_despike, n1=n1, n2=n2, block=block, keep=keep) + new_df = df.apply(_despike, n1=n1, n2=n2, block=block) return new_df -def _smooth(series, window_len, window): +def _smooth(series: pd.Series, window_len: int, window: str) -> pd.Series: """Smooth the data using a window with requested size.""" - windows = { "flat": np.ones, "hanning": np.hanning, @@ -208,15 +203,17 @@ def _smooth(series, window_len, window): "bartlett": np.bartlett, "blackman": np.blackman, } - data = series.values.copy() + data = series.to_numpy().copy() - if window_len < 3: + min_window_length = 3 + if window_len < min_window_length: return pd.Series(data, index=series.index, name=series.name) if window not in list(windows.keys()): + msg = """window must be one of 'flat', 'hanning', + 'hamming', 'bartlett', 'blackman'""" raise ValueError( - """window must be one of 'flat', 'hanning', - 'hamming', 'bartlett', 'blackman'""", + msg, ) s = np.r_[ @@ -234,7 +231,7 @@ def _smooth(series, window_len, window): @register_series_method @register_dataframe_method -def smooth(df, window_len=11, window="hanning"): +def smooth(df: cast, window_len: int = 11, window: str = "hanning") -> cast: """Smooth the data using a window with requested size.""" if isinstance(df, pd.Series): new_df = _smooth(df, window_len=window_len, window=window) @@ -243,23 +240,21 @@ def smooth(df, window_len=11, window="hanning"): return new_df -def _movingaverage(series, window_size=48): - """Moving average function on a pandas series.""" +def _movingaverage(series: pd.Series, window_size: int = 48) -> pd.Series: + """Perform Moving Average function on a pandas series.""" window = np.ones(int(window_size)) / float(window_size) return pd.Series(np.convolve(series, window, "same"), index=series.index) @register_series_method @register_dataframe_method -def movingaverage(df, window_size=48): - """ - Moving average on a data frame or series. +def movingaverage(df: cast, window_size: int = 48) -> cast: + """Perform Moving Average on a DataFrame or Series. 
Inputs: windows_size : integer """ - if isinstance(df, pd.Series): new_df = _movingaverage(df, window_size=window_size) else: diff --git a/ctd/read.py b/ctd/read.py index 24fbfbe..99dbea9 100644 --- a/ctd/read.py +++ b/ctd/read.py @@ -1,15 +1,15 @@ -""" -Read module -""" +"""Read module.""" + +from __future__ import annotations import bz2 import collections +import datetime import gzip import linecache import re import warnings import zipfile -from datetime import datetime from io import StringIO from pathlib import Path @@ -19,7 +19,7 @@ import pandas as pd -def _basename(fname): +def _basename(fname: str | Path) -> (str, str, str): """Return file name without path.""" if not isinstance(fname, Path): fname = Path(fname) @@ -27,14 +27,13 @@ def _basename(fname): return path, name, ext -def _normalize_names(name): +def _normalize_names(name: str) -> str: """Normalize column names.""" name = name.strip() - name = name.strip("*") - return name + return name.strip("*") -def _open_compressed(fname): +def _open_compressed(fname: Path) -> str: """Open compressed gzip, gz, zip or bz2 files.""" extension = fname.suffix.casefold() if extension in [".gzip", ".gz"]: @@ -50,15 +49,19 @@ def _open_compressed(fname): name = zfile.namelist()[0] cfile = zfile.open(name) else: + msg = ( + "Unrecognized file extension. " + f"Expected .gzip, .bz2, or .zip, got {extension}" + ) raise ValueError( - f"Unrecognized file extension. Expected .gzip, .bz2, or .zip, got {extension}", + msg, ) contents = cfile.read() cfile.close() return contents -def _read_file(fname): +def _read_file(fname: str | Path | StringIO) -> StringIO: """Read file contents, or read from StringIO object.""" if isinstance(fname, StringIO): fname.seek(0) @@ -71,11 +74,15 @@ def _read_file(fname): extension = fname.suffix.casefold() if extension in [".gzip", ".gz", ".bz2", ".zip"]: contents = _open_compressed(fname) - elif extension in [".cnv", ".edf", ".txt", ".ros", ".btl"]: + elif extension in [".cnv", ".edf", ".txt", ".ros", ".btl", ".bl", ".csv"]: contents = fname.read_bytes() else: + msg = ( + "Unrecognized file extension. " + f"Expected .cnv, .edf, .txt, .ros, or .btl got {extension}" + ) raise ValueError( - f"Unrecognized file extension. Expected .cnv, .edf, .txt, .ros, or .btl got {extension}", + msg, ) # Read as bytes but we need to return strings for the parsers. encoding = chardet.detect(contents)["encoding"] @@ -83,27 +90,34 @@ def _read_file(fname): return StringIO(text) -def _remane_duplicate_columns(names): +def _remane_duplicate_columns(names: str) -> str: """Rename a column when it is duplicated.""" items = collections.Counter(names).items() dup = [] for item, count in items: - if count > 2: + if count > 2: # noqa: PLR2004 + msg = ( + "Cannot handle more than two duplicated columns. " + f"Found {count} for {item}." + ) raise ValueError( - f"Cannot handle more than two duplicated columns. Found {count} for {item}.", + msg, ) if count > 1: dup.append(item) - # since we can assume there are only two instances of a word in the list, how about we find the last - # index of an instance, which will be the second occurrence of the item - second_occurrences = [len(names) - names[::-1].index(item) - 1 for item in dup] + # We can assume there are only two instances of a word in the list, + # we find the last index of an instance, + # which will be the second occurrence of the item. 
+ second_occurrences = [ + len(names) - names[::-1].index(item) - 1 for item in dup + ] for idx in second_occurrences: names[idx] = f"{names[idx]}_" return names -def _parse_seabird(lines, ftype): +def _parse_seabird(lines: list, ftype: str) -> dict: # noqa: C901, PLR0912, PLR0915 """Parse searbird formats.""" # Initialize variables. lon = lat = time = None, None, None @@ -112,15 +126,15 @@ def _parse_seabird(lines, ftype): metadata = {} header, config, names = [], [], [] - for k, line in enumerate(lines): - line = line.strip() + for k, raw_line in enumerate(lines): + line = raw_line.strip() - # Only cnv has columns names, for bottle files we will use the variable row. - if ftype == "cnv": - if "# name" in line: - name, unit = line.split("=")[1].split(":") - name, unit = list(map(_normalize_names, (name, unit))) - names.append(name) + # Only cnv has columns names, + # for bottle files we will use the variable row. + if ftype == "cnv" and "# name" in line: + name, unit = line.split("=")[1].split(":") + name, unit = list(map(_normalize_names, (name, unit))) + names.append(name) # Seabird headers starts with *. if line.startswith("*"): @@ -137,27 +151,32 @@ def _parse_seabird(lines, ftype): if "NMEA Latitude" in line: hemisphere = line[-1] lat = line.strip(hemisphere).split("=")[1].strip() - lat = np.float_(lat.split()) + lat = np.float64(lat.split()) if hemisphere == "S": lat = -(lat[0] + lat[1] / 60.0) elif hemisphere == "N": lat = lat[0] + lat[1] / 60.0 else: - raise ValueError("Latitude not recognized.") + msg = "Latitude not recognized." + raise ValueError(msg) if "NMEA Longitude" in line: hemisphere = line[-1] lon = line.strip(hemisphere).split("=")[1].strip() - lon = np.float_(lon.split()) + lon = np.float64(lon.split()) if hemisphere == "W": lon = -(lon[0] + lon[1] / 60.0) elif hemisphere == "E": lon = lon[0] + lon[1] / 60.0 else: - raise ValueError("Latitude not recognized.") + msg = "Latitude not recognized." + raise ValueError(msg) if "NMEA UTC (Time)" in line: time = line.split("=")[-1].strip() # Should use some fuzzy datetime parser to make this more robust. - time = datetime.strptime(time, "%b %d %Y %H:%M:%S") + time = datetime.datetime.strptime( + time, + "%b %d %Y %H:%M:%S", + ).astimezone(datetime.UTC) # cnv file header ends with *END* while if ftype == "cnv": @@ -174,7 +193,7 @@ def _parse_seabird(lines, ftype): # Fix commonly occurring problem when Sbeox.* exists in the file # the name is concatenated to previous parameter # example: - # CStarAt0Sbeox0Mm/Kg to CStarAt0 Sbeox0Mm/Kg (really two different params) + # CStarAt0Sbeox0Mm/Kg to CStarAt0 Sbeox0Mm/Kg line = re.sub(r"(\S)Sbeox", "\\1 Sbeox", line) names = line.split() @@ -198,10 +217,10 @@ def _parse_seabird(lines, ftype): return metadata -def from_bl(fname): - """Read Seabird bottle-trip (bl) file +def from_bl(fname: str | Path) -> pd.DataFrame: + """Read Seabird bottle-trip (bl) file. - Example + Example: ------- >>> from pathlib import Path >>> import ctd @@ -211,24 +230,24 @@ def from_bl(fname): datetime.datetime(2018, 6, 25, 20, 8, 55) """ - df = pd.read_csv( - fname, + f = _read_file(fname) + cast = pd.read_csv( + f, skiprows=2, parse_dates=[1], index_col=0, names=["bottle_number", "time", "startscan", "endscan"], ) - df._metadata = { + cast._metadata = { # noqa: SLF001 "time_of_reset": pd.to_datetime( linecache.getline(str(fname), 2)[6:-1], ).to_pydatetime(), } - return df + return cast -def from_btl(fname): - """ - DataFrame constructor to open Seabird CTD BTL-ASCII format. 
+def from_btl(fname: str | Path) -> pd.DataFrame: + """DataFrame constructor to open Seabird CTD BTL-ASCII format. Examples -------- @@ -243,7 +262,7 @@ def from_btl(fname): f.seek(0) - df = pd.read_fwf( + cast = pd.read_fwf( f, header=None, index_col=False, @@ -258,22 +277,24 @@ def from_btl(fname): # Also needs date,time,and bottle number to be converted to one per line. # Get row types, see what you have: avg, std, min, max or just avg, std. - rowtypes = df[df.columns[-1]].unique() + rowtypes = cast[cast.columns[-1]].unique() # Get times and dates which occur on second line of each bottle. date_idx = metadata["names"].index("Date") - dates = df.iloc[:: len(rowtypes), date_idx].reset_index(drop=True) - times = df.iloc[1 :: len(rowtypes), date_idx].reset_index(drop=True) + dates = cast.iloc[:: len(rowtypes), date_idx].reset_index(drop=True) + times = cast.iloc[1 :: len(rowtypes), date_idx].reset_index(drop=True) datetimes = dates + " " + times # Fill the Date column with datetimes. - df.loc[:: len(rowtypes), "Date"] = datetimes.values - df.loc[1 :: len(rowtypes), "Date"] = datetimes.values + cast.loc[:: len(rowtypes), "Date"] = datetimes.to_numpy() + cast.loc[1 :: len(rowtypes), "Date"] = datetimes.to_numpy() # Fill missing rows. - df["Bottle"] = df["Bottle"].fillna(method="ffill") - df["Date"] = df["Date"].fillna(method="ffill") + cast["Bottle"] = cast["Bottle"].ffill() + cast["Date"] = cast["Date"].ffill() - df["Statistic"] = df["Statistic"].str.lstrip("(").str.rstrip(")") # (avg) to avg + cast["Statistic"] = ( + cast["Statistic"].str.lstrip("(").str.rstrip(")") + ) # (avg) to avg if "name" not in metadata: name = _basename(fname)[1] @@ -288,26 +309,25 @@ def from_btl(fname): "Statistic": str, "Date": str, } - for column in df.columns: + for column in cast.columns: if column in dtypes: - df[column] = df[column].astype(dtypes[column]) + cast[column] = cast[column].astype(dtypes[column]) else: try: - df[column] = df[column].astype(float) + cast[column] = cast[column].astype(float) except ValueError: warnings.warn( f"Could not convert {column} to float.", stacklevel=2, ) - df["Date"] = pd.to_datetime(df["Date"]) - df._metadata = metadata - return df + cast["Date"] = pd.to_datetime(cast["Date"]) + cast._metadata = metadata # noqa: SLF001 + return cast -def from_edf(fname): - """ - DataFrame constructor to open XBT EDF ASCII format. +def from_edf(fname: str | Path) -> pd.DataFrame: # noqa: C901, PLR0912 + """DataFrame constructor to open XBT EDF ASCII format. 
Examples -------- @@ -320,15 +340,15 @@ def from_edf(fname): """ f = _read_file(fname) header, names = [], [] - for k, line in enumerate(f.readlines()): - line = line.strip() + for k, raw_line in enumerate(f.readlines()): + line = raw_line.strip() if line.startswith("Serial Number"): serial = line.strip().split(":")[1].strip() elif line.startswith("Latitude"): try: hemisphere = line[-1] lat = line.strip(hemisphere).split(":")[1].strip() - lat = np.float_(lat.split()) + lat = np.float64(lat.split()) if hemisphere == "S": lat = -(lat[0] + lat[1] / 60.0) elif hemisphere == "N": @@ -339,7 +359,7 @@ def from_edf(fname): try: hemisphere = line[-1] lon = line.strip(hemisphere).split(":")[1].strip() - lon = np.float_(lon.split()) + lon = np.float64(lon.split()) if hemisphere == "W": lon = -(lon[0] + lon[1] / 60.0) elif hemisphere == "E": @@ -356,18 +376,18 @@ def from_edf(fname): break f.seek(0) - df = pd.read_csv( + cast = pd.read_csv( f, header=None, index_col=None, names=names, skiprows=skiprows, - delim_whitespace=True, + sep=r"\s+", ) f.close() - df.set_index("depth", drop=True, inplace=True) - df.index.name = "Depth [m]" + cast = cast.set_index("depth", drop=True) + cast.index.name = "Depth [m]" name = _basename(fname)[1] metadata = { @@ -377,13 +397,12 @@ def from_edf(fname): "header": "\n".join(header), "serial": serial, } - df._metadata = metadata - return df + cast._metadata = metadata # noqa: SLF001 + return cast -def from_cnv(fname): - """ - DataFrame constructor to open Seabird CTD CNV-ASCII format. +def from_cnv(fname: str | Path) -> pd.DataFrame: + """DataFrame constructor to open Seabird CTD CNV-ASCII format. Examples -------- @@ -399,13 +418,13 @@ def from_cnv(fname): metadata = _parse_seabird(f.readlines(), ftype="cnv") f.seek(0) - df = pd.read_fwf( + cast = pd.read_fwf( f, header=None, index_col=None, names=metadata["names"], skiprows=metadata["skiprows"], - delim_whitespace=True, + sep=r"\s+", widths=[11] * len(metadata["names"]), ) f.close() @@ -422,56 +441,60 @@ def from_cnv(fname): "depSM", "prDE", ] - df.columns = df.columns.str.strip() - prkey = [key for key in prkeys if key in df.columns] + cast.columns = cast.columns.str.strip() + prkey = [key for key in prkeys if key in cast.columns] if len(prkey) == 0: - raise ValueError("Expected one pressure/depth column, didn't receive any") - elif len(prkey) > 1: - # if multiple keys present then keep the first one + msg = "Expected one pressure/depth column, didn't receive any" + raise ValueError( + msg, + ) + if len(prkey) > 1: + # If multiple keys present then keep the first one. prkey = prkey[0] - df.set_index(prkey, drop=True, inplace=True) - df.index.name = "Pressure [dbar]" + cast = cast.set_index(prkey, drop=True) + cast.index.name = "Pressure [dbar]" if prkey == "depSM": lat = metadata.get("lat", None) if lat is not None: - df.index = gsw.p_from_z( - df.index, + cast.index = gsw.p_from_z( + cast.index, lat, geo_strf_dyn_height=0, sea_surface_geopotential=0, ) else: - warnings.war( - f"Missing latitude information. Cannot compute pressure! Your index is {prkey}, " - "please compute pressure manually with `gsw.p_from_z` and overwrite your index.", + msg = ( + "Missing latitude information. Cannot compute pressure! " + f"Your index is {prkey}, please compute pressure manually " + "with `gsw.p_from_z` and overwrite your index." 
) - df.index.name = prkey + warnings.war(msg) + cast.index.name = prkey if "name" not in metadata: name = _basename(fname)[1] metadata["name"] = str(name) dtypes = {"bpos": int, "pumps": bool, "flag": bool} - for column in df.columns: + for column in cast.columns: if column in dtypes: - df[column] = df[column].astype(dtypes[column]) + cast[column] = cast[column].astype(dtypes[column]) else: try: - df[column] = df[column].astype(float) + cast[column] = cast[column].astype(float) except ValueError: warnings.warn( f"Could not convert {column} to float.", stacklevel=2, ) - df._metadata = metadata - return df + cast._metadata = metadata # noqa: SLF001 + return cast -def from_fsi(fname, skiprows=9): - """ - DataFrame constructor to open Falmouth Scientific, Inc. (FSI) CTD +def from_fsi(fname: str | Path, skiprows: int = 9) -> pd.DataFrame: + """DataFrame constructor to open Falmouth Scientific, Inc. (FSI) CTD ASCII format. Examples @@ -485,26 +508,25 @@ def from_fsi(fname, skiprows=9): """ f = _read_file(fname) - df = pd.read_csv( + fsi = pd.read_csv( f, header="infer", index_col=None, skiprows=skiprows, dtype=float, - delim_whitespace=True, + sep=r"\s+", ) f.close() - df.set_index("PRES", drop=True, inplace=True) - df.index.name = "Pressure [dbar]" + fsi = fsi.set_index("PRES", drop=True) + fsi.index.name = "Pressure [dbar]" metadata = {"name": str(fname)} - df._metadata = metadata - return df + fsi._metadata = metadata # noqa: SLF001 + return fsi -def rosette_summary(fname): - """ - Make a BTL (bottle) file from a ROS (bottle log) file. +def rosette_summary(fname: str | Path) -> pd.DataFrame: + """Make a BTL (bottle) file from a ROS (bottle log) file. More control for the averaging process and at which step we want to perform this averaging eliminating the need to read the data into SBE @@ -519,23 +541,24 @@ def rosette_summary(fname): >>> fname = data_path.joinpath("CTD/g01l01s01.ros") >>> ros = ctd.rosette_summary(fname) >>> ros = ros.groupby(ros.index).mean() - >>> ros.pressure.values.astype(int) + >>> ros.pressure.to_numpy().astype(int) array([835, 806, 705, 604, 503, 404, 303, 201, 151, 100, 51, 1]) """ ros = from_cnv(fname) - ros["pressure"] = ros.index.values.astype(float) + ros["pressure"] = ros.index.to_numpy().astype(float) ros["nbf"] = ros["nbf"].astype(int) - ros.set_index("nbf", drop=True, inplace=True, verify_integrity=False) + metadata = ros._metadata # noqa: SLF001 + ros = ros.set_index("nbf", drop=True, verify_integrity=False) + ros._metadata = metadata # noqa: SLF001 return ros -def from_castaway_csv(fname): - """ - DataFrame constructor to open CastAway CSV format. +def from_castaway_csv(fname: str | Path) -> pd.DataFrame: + """DataFrame constructor to open CastAway CSV format. 
- Example - -------- + Example: + ------- >>> import ctd >>> cast = ctd.from_castaway_csv("tests/data/castaway_data.csv") >>> cast.columns @@ -544,27 +567,28 @@ def from_castaway_csv(fname): dtype='object') """ - with open(fname) as file: - f = file.readlines() + f = _read_file(fname) + lines = f.readlines() # Strip newline characters - f = [s.strip() for s in f] + lines = [s.strip() for s in lines] # Separate meta data and CTD profile - meta = [s for s in f if s[0] == "%"][0:-1] - data = [s.split(",") for s in f if s[0] != "%"] - df = pd.DataFrame(data[1:-1], columns=data[0]) + meta = [s for s in lines if s[0] == "%"][0:-1] + data = [s.split(",") for s in lines if s[0] != "%"] + cast = pd.DataFrame(data[1:-1], columns=data[0]) # Convert to numeric - for col in df.columns: - df[col] = pd.to_numeric(df[col]) + for col in cast.columns: + cast[col] = pd.to_numeric(cast[col]) # Normalise column names and extract units - units = [s[s.find("(") + 1 : s.find(")")] for s in df.columns] - df.columns = [ - _normalize_names(s.split("(")[0]).lower().replace(" ", "_") for s in df.columns + units = [s[s.find("(") + 1 : s.find(")")] for s in cast.columns] + cast.columns = [ + _normalize_names(s.split("(")[0]).lower().replace(" ", "_") + for s in cast.columns ] - df.set_index("pressure", drop=True, inplace=True, verify_integrity=False) + cast = cast.set_index("pressure", drop=True, verify_integrity=False) # Add metadata meta = [s.replace("%", "").strip().split(",") for s in meta] @@ -572,6 +596,6 @@ def from_castaway_csv(fname): for line in meta: metadata[line[0]] = line[1] metadata["units"] = units - df._metadata = metadata + cast._metadata = metadata # noqa: SLF001 - return df + return cast diff --git a/docs/source/conf.py b/docs/source/conf.py index 01cbf0f..70a6a47 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -58,11 +58,9 @@ # |version| and |release|, also used in various other places throughout the # built documents. # -from ctd import __version__ as VERSION +from ctd import __version__ -version = VERSION -# The full version, including alpha/beta/rc tags. -release = VERSION +version = release = __version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/notebooks/00-reading-data.ipynb b/notebooks/00-reading-data.ipynb index 43167d3..87e5774 100644 --- a/notebooks/00-reading-data.ipynb +++ b/notebooks/00-reading-data.ipynb @@ -8,10 +8,8 @@ "source": [ "from pathlib import Path\n", "\n", - "import pandas as pd\n", "import ctd\n", "\n", - "\n", "path = Path(\"..\", \"tests\", \"data\")" ] }, @@ -58,7 +56,6 @@ "source": [ "from ctd import rosette_summary\n", "\n", - "\n", "ros = rosette_summary(path.joinpath(\"CTD\", \"g01l01s01.ros\"))\n", "ros = ros.groupby(ros.index).mean()\n", "\n", diff --git a/notebooks/quick_intro.ipynb b/notebooks/quick_intro.ipynb index 0e02e5b..91a6e9b 100644 --- a/notebooks/quick_intro.ipynb +++ b/notebooks/quick_intro.ipynb @@ -17,25 +17,15 @@ }, "outputs": [], "source": [ - "import io\n", - "import requests\n", - "from pathlib import Path\n", + "import pooch\n", "\n", + "test_data = \"CTD-spiked-unfiltered.cnv.bz2\"\n", + "url = f\"https://github.com/pyoceans/python-ctd/raw/main/tests/data/{test_data}\"\n", "\n", - "def download_demo_file(url):\n", - " if not Path(\"CTD-spiked-unfiltered.cnv.bz2\").exists():\n", - " response = requests.get(url, allow_redirects=True)\n", - " try:\n", - " response.raise_for_status()\n", - " except requests.exceptions.HTTPError as err:\n", - " raise requests.exceptions.HTTPError(f\"{response.content.decode()}\") from err\n", - " data = io.BytesIO(response.content)\n", - " data.seek(0)\n", - " Path(\"CTD-spiked-unfiltered.cnv.bz2\").write_bytes(data.read())\n", - "\n", - "\n", - "url = \"https://github.com/pyoceans/python-ctd/raw/main/tests/data/CTD-spiked-unfiltered.cnv.bz2\"\n", - "download_demo_file(url)" + "fname = pooch.retrieve(\n", + " url=url,\n", + " known_hash=\"sha256:1de4b7ce665d5cece925c5feb4552c13bbc19cef3e229bc87dfd77acb1a730d3\",\n", + ")" ] }, { @@ -46,8 +36,7 @@ "source": [ "import ctd\n", "\n", - "\n", - "cast = ctd.from_cnv(\"CTD-spiked-unfiltered.cnv.bz2\")\n", + "cast = ctd.from_cnv(fname)\n", "down, up = cast.split()\n", "\n", "down.head()" @@ -84,7 +73,6 @@ "source": [ "from matplotlib import style\n", "\n", - "\n", "style.use(\"seaborn-v0_8-whitegrid\")\n", "\n", "down[\"t090C\"].plot_cast()\n", @@ -169,7 +157,6 @@ "source": [ "import matplotlib.pyplot as plt\n", "\n", - "\n", "fig, ax = plt.subplots()\n", "ax.plot(down.index, label=\"unfiltered\")\n", "ax.plot(down.lp_filter().index, label=\"filtered\")\n", @@ -233,7 +220,6 @@ "source": [ "import gsw\n", "\n", - "\n", "p = proc.index\n", "\n", "SP = gsw.SP_from_C(proc[\"c0S/m\"].to_numpy() * 10.0, proc[\"t090C\"].to_numpy(), p)\n", @@ -318,7 +304,10 @@ "ax1.set_xlabel(\"Absolute Salinity (g kg$^{-1}$)\")\n", "\n", "(l2,) = ax2.plot(\n", - " proc[\"sigma0_CT\"], proc.index, color=colors[2], label=r\"$\\sigma_{0\\_CT}$\"\n", + " proc[\"sigma0_CT\"],\n", + " proc.index,\n", + " color=colors[2],\n", + " label=r\"$\\sigma_{0\\_CT}$\",\n", ")\n", "ax2.set_xlabel(r\"$\\sigma_{0\\_CT}$ (kg m$^{-3}$)\")\n", "\n", @@ -360,7 +349,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.12.3" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index c2a9344..c3b453c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,11 +13,9 @@ license = { text = "BSD-3-Clause" } authors = [ { name = "Filipe Fernandes", email = "ocefpaf+ctd@gmail.com" }, ] -requires-python = ">=3.9" +requires-python = ">=3.11" classifiers = [ "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", - "Programming Language :: 
Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", ] diff --git a/requirements-dev.txt b/requirements-dev.txt index 736a9ad..1a84fe8 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,26 +1,12 @@ -black check-manifest -doctr -flake8 -flake8-builtins -flake8-comprehensions -flake8-mutable -flake8-print -ipykernel -isort -jupyter -jupyter_client mypy +nbclassic nbconvert nbsphinx +pooch pre-commit -pycodestyle -pylint pytest pytest-cov -pytest-flake8 -pytest-xdist setuptools_scm sphinx twine -wheel diff --git a/requirements.txt b/requirements.txt index e722ff8..a8cf98b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ chardet gsw>=3.3.0 matplotlib -numpy +numpy>=2 pandas>=0.24.0 pandas-flavor>=0.1.2 scipy diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..cf3061a --- /dev/null +++ b/ruff.toml @@ -0,0 +1,53 @@ +line-length = 80 + +lint.select = ["ALL"] + +lint.ignore = [ + "D203", # 1 blank line required before class docstring + "D205", # 1 blank line required between summary line and description + "D213", # incompatible. Ignoring `multi-line-summary-second-line` + "TRY003", # Avoid specifying long messages outside the exception class +] + +[lint.extend-per-file-ignores] +"docs/source/conf.py" = [ + "A001", # builtin-variable-shadowing + "D100", # Missing docstring in public module + "E402", # Module level import not at top of file + "ERA001", # Found commented-out code + "ERA001", # Found commented-out code + "EXE001", # Shebang is present but file is not executable +] +"test_*.py" = [ + "ANN001", # Missing type annotation for function argument + "ANN201", # Missing return type annotation for public function + "ANN202", # Missing return type annotation for private function + "INP001", # File is part of an implicit namespace package + "PD901", # Avoid using the generic variable name `df` for DataFrames + "S101", # Use of assert detected + "ANN002", # Missing type annotation for `*args` + "ANN003", # Missing type annotation for `**kwargs` +] +"ctd/extras.py" = [ + "ANN001", # Missing type annotation for function argument + "ANN201", # Missing return type annotation for public function + "ANN202", # Missing return type annotation for private function +] +# nbqa-ruff acts on converted .py so we cannot glob .ipynb :-/ +# https://github.com/nbQA-dev/nbQA/issues/823 +"notebooks/*" = [ + "ANN001", # Missing type annotation for function argument + "ANN201", # Missing return type annotation for public function + "B018", # Found useless expression. 
Either assign it to a variable or remove it + "D100", # Missing docstring in public module + "D103", # Missing docstring in public function + "E402", # Module level import not at top of file + "FBT003", # Boolean positional value in function call + "INP001", # File is part of an implicit namespace package + "N816", # Variable in global scope should not be mixedCase + "PD901", # Avoid using the generic variable name `df` for DataFrames + "SLF001", # Private member accessed + "T201", # `print` found" +] +[lint.pycodestyle] +max-doc-length = 180 diff --git a/tests/test_plotting.py b/tests/test_plotting.py index 89edafa..d1b5230 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -1,13 +1,15 @@ +"""Test plotting.""" + from pathlib import Path -import matplotlib +import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np import pytest import ctd -matplotlib.use("Agg") +mpl.use("Agg") data_path = Path(__file__).parent.joinpath("data") @@ -24,7 +26,8 @@ def _assert_is_valid_plot_return_object(objs): else: assert isinstance(objs, (plt.Artist, tuple, dict)), ( "objs is neither an ndarray of Artist instances nor a " - f'single Artist instance, tuple, or dict, "objs" is a {objs.__class__.__name__!r} ' + "single Artist instance, tuple, or dict, " + f'"objs" is a {objs.__class__.__name__!r} ' "" ) @@ -36,28 +39,38 @@ def _check_plot_works(f, *args, **kwargs): plt.close() -# BasicPlotting. -@pytest.fixture +# Basic Plotting. +@pytest.fixture() def xbt(): + """Load XBT.""" yield ctd.from_edf(data_path.joinpath("XBT.EDF.zip")) plt.close("all") -@pytest.fixture +@pytest.fixture() def fsi(): + """Load FSI.""" yield ctd.from_fsi(data_path.joinpath("FSI.txt.gz"), skiprows=9) plt.close("all") -@pytest.fixture +@pytest.fixture() def cnv(): + """Load CNV.""" yield ctd.from_cnv(data_path.joinpath("small.cnv.bz2")) plt.close("all") def test_xbt_plot(xbt): + """Test plotting XBT.""" _check_plot_works(xbt["temperature"].plot_cast) +def test_fsi_temperature(fsi): + """Test plotting CNV.""" + _check_plot_works(fsi["TEMP"].plot_cast) + + def test_cnv_temperature(cnv): + """Test plotting CNV.""" _check_plot_works(cnv["t090C"].plot_cast) diff --git a/tests/test_processing.py b/tests/test_processing.py index 2d9b651..2e2f781 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -1,45 +1,52 @@ +"""Test processing methods.""" + import numpy as np import pandas as pd import pytest -import ctd # noqa - -@pytest.fixture +@pytest.fixture() def series(): + """Load data series.""" index = np.r_[np.linspace(-5, 10, 20), np.linspace(10, -5, 20)] - yield pd.Series(data=np.arange(len(index)), index=index) + return pd.Series(data=np.arange(len(index)), index=index) -@pytest.fixture +@pytest.fixture() def df(): + """Load data frame.""" index = np.r_[np.linspace(-5, 10, 20), np.linspace(10, -5, 20)] - yield pd.DataFrame(data=np.arange(len(index)), index=index) + return pd.DataFrame(data=np.arange(len(index)), index=index) def test_remove_above_water_series(series): + """Test remove above water series.""" assert any(series.index < 0) assert not any(series.remove_above_water().index < 0) def test_remove_above_water_df(df): + """Test remove above water dataframe.""" assert any(df.index < 0) assert not any(df.remove_above_water().index < 0) def test_remove_up_to_series(series): + """Test remove up to series.""" idx = 10 assert any(series.index < idx) assert not any(series.remove_up_to(idx=idx).index < idx) def test_remove_up_to_df(df): + """Test remove up dataframe.""" idx = 10 assert 
any(df.index < idx) assert not any(df.remove_up_to(idx=idx).index < idx) def test_split_series(series): + """Test split series.""" split = series.split() down, up = split assert isinstance(split, tuple) @@ -47,6 +54,7 @@ def test_split_series(series): def test_split_df(df): + """Test split dataframe.""" split = df.split() down, up = split assert isinstance(split, tuple) @@ -54,18 +62,26 @@ def test_split_df(df): def test_press_check_series(series): - # reverse 7th and 9th and confirm they are removed after the `press_check`. + """Test pressure check series. + + Reverse 7th and 9th and confirm they are removed after the `press_check`. + """ index = [0, 1, 2, 3, 4, 5, 7, 6, 9, 8, 10] - series = pd.Series(data=np.random.randn(len(index)), index=index) + rng = np.random.default_rng() + series = pd.Series(data=rng.standard_normal(len(index)), index=index) series = series.press_check() assert np.isnan(series.iloc[7]) assert np.isnan(series.iloc[9]) def test_press_check_df(df): - # reverse 7th and 9th and confirm they are removed after the `press_check`. + """Test pressure check dataframe. + + Reverse 7th and 9th and confirm they are removed after the `press_check`. + """ index = [0, 1, 2, 3, 4, 5, 7, 6, 9, 8, 10] - arr = np.random.randn(len(index)) + rng = np.random.default_rng() + arr = rng.standard_normal(len(index)) df = pd.DataFrame(data=np.c_[arr, arr], index=index) df = df.press_check() assert np.isnan(df.iloc[7]).all() @@ -73,12 +89,13 @@ def test_press_check_df(df): def test_bindata_average(series): + """Test bin data.""" delta = 1.0 index = series.remove_above_water().split()[0].bindata(delta=delta).index - assert all(index.values == np.arange(1, 9, delta) + delta / 2) - assert np.unique(np.diff(index.values)) == delta + assert all(index.to_numpy() == np.arange(1, 9, delta) + delta / 2) + assert np.unique(np.diff(index.to_numpy())) == delta delta = 2 index = series.remove_above_water().split()[0].bindata(delta=delta).index - assert all(index.values == np.arange(1, 9, delta) + delta / 2) - assert np.unique(np.diff(index.values)) == delta + assert all(index.to_numpy() == np.arange(1, 9, delta) + delta / 2) + assert np.unique(np.diff(index.to_numpy())) == delta diff --git a/tests/test_processing_real_data.py b/tests/test_processing_real_data.py index 72f4043..4fd2835 100644 --- a/tests/test_processing_real_data.py +++ b/tests/test_processing_real_data.py @@ -1,3 +1,5 @@ +"""Test processing real data.""" + from pathlib import Path import numpy as np @@ -8,42 +10,50 @@ data_path = Path(__file__).parent.joinpath("data") -@pytest.fixture +@pytest.fixture() def spiked_ctd(): - yield ctd.from_cnv(data_path.joinpath("CTD-spiked-unfiltered.cnv.bz2")) + """Load spiked CTD.""" + return ctd.from_cnv(data_path.joinpath("CTD-spiked-unfiltered.cnv.bz2")) -@pytest.fixture +@pytest.fixture() def filtered_ctd(): - yield ctd.from_cnv(data_path.joinpath("CTD-spiked-filtered.cnv.bz2")) + """Load spiked-filtered CTD.""" + return ctd.from_cnv(data_path.joinpath("CTD-spiked-filtered.cnv.bz2")) def test_despike_real_data(filtered_ctd): + """Test despike.""" # Looking at downcast only. 
dirty = filtered_ctd["c0S/m"].split()[0] clean = dirty.despike(n1=2, n2=20, block=500) - spikes = clean.isnull() + spikes = clean.isna() equal = (dirty[~spikes] == clean[~spikes]).all() - assert spikes.any() and equal + assert spikes.any() + assert equal def test_lp_filter_real_data(spiked_ctd, filtered_ctd): + """Test low pass filter.""" kw = {"sample_rate": 24.0, "time_constant": 0.15} - expected = filtered_ctd.index.values + expected = filtered_ctd.index.to_numpy() filtered = spiked_ctd.lp_filter(**kw).index # Caveat: Not really a good test... np.testing.assert_almost_equal(filtered, expected, decimal=1) def test_press_check_real_data(spiked_ctd): + """Test pressure check.""" unchecked = spiked_ctd["t090C"] press_checked = unchecked.press_check() - reversals = press_checked.isnull() + reversals = press_checked.isna() equal = (unchecked[~reversals] == press_checked[~reversals]).all() - assert reversals.any() and equal + assert reversals.any() + assert equal def test_processing_chain_spiked_ctd(spiked_ctd): + """Test all processing steps chained.""" down, up = spiked_ctd.remove_above_water().split() temp = down["t090C"] # despike is a series only method temp = ( diff --git a/tests/test_read.py b/tests/test_read.py index f81d51a..d0e3bbb 100644 --- a/tests/test_read.py +++ b/tests/test_read.py @@ -1,4 +1,6 @@ -from io import StringIO +"""Test reading functionality.""" + +import io from pathlib import Path import numpy as np @@ -13,118 +15,142 @@ # Test `_read_file` and `_open_compressed`. def test_zip(): + """Test reading from zip.""" cfile = _read_file(data_path.joinpath("XBT.EDF.zip")) - assert isinstance(cfile, StringIO) + assert isinstance(cfile, io.StringIO) def test_gzip(): + """Test reading from gzip.""" cfile = _read_file(data_path.joinpath("XBT.EDF.gz")) - assert isinstance(cfile, StringIO) + assert isinstance(cfile, io.StringIO) def test_bz2(): + """Test reading from bzip2.""" cfile = _read_file(data_path.joinpath("XBT.EDF.bz2")) - assert isinstance(cfile, StringIO) + assert isinstance(cfile, io.StringIO) def test_uncompresed(): + """Test reading from uncompressed file.""" cfile = _read_file(data_path.joinpath("XBT.EDF")) - assert isinstance(cfile, StringIO) + assert isinstance(cfile, io.StringIO) # Test ctd DataFrame. 
-@pytest.fixture +@pytest.fixture() def xbt(): - yield ctd.from_edf(data_path.joinpath("XBT.EDF.zip")) + """Load zip EDF file.""" + return ctd.from_edf(data_path.joinpath("XBT.EDF.zip")) -@pytest.fixture +@pytest.fixture() def fsi(): - yield ctd.from_fsi(data_path.joinpath("FSI.txt.gz"), skiprows=9) + """Load gzip FSI file.""" + return ctd.from_fsi(data_path.joinpath("FSI.txt.gz"), skiprows=9) -@pytest.fixture +@pytest.fixture() def cnv(): - yield ctd.from_cnv(data_path.joinpath("small.cnv.bz2")) + """Load bzip2 CNV file.""" + return ctd.from_cnv(data_path.joinpath("small.cnv.bz2")) -@pytest.fixture +@pytest.fixture() def btl(): - yield ctd.from_btl(data_path.joinpath("btl", "bottletest.btl")) + """Load uncompressed BTL file.""" + return ctd.from_btl(data_path.joinpath("btl", "bottletest.btl")) -@pytest.fixture +@pytest.fixture() def btl_as_stream(): - file = open(mode="rb", file=data_path.joinpath("btl", "alt_bottletest.BTL")) - stream = StringIO(file.read().decode("cp1252")) - yield ctd.from_btl(stream) + """Load stream BTL data.""" + with Path.open( + data_path.joinpath("btl", "alt_bottletest.BTL"), + mode="rb", + ) as f: + stream = io.StringIO(f.read().decode("cp1252")) + return ctd.from_btl(stream) -@pytest.fixture +@pytest.fixture() def ros(): - yield ctd.rosette_summary(data_path.joinpath("CTD", "g01l03s01m-m2.ros")) + """Load uncompressed ROS file.""" + return ctd.rosette_summary(data_path.joinpath("CTD", "g01l03s01m-m2.ros")) def test_xbt_is_dataframe(xbt): + """Test XBT.""" assert isinstance(xbt, pd.DataFrame) assert not xbt.empty def test_fsi_is_dataframe(fsi): + """Test FSI.""" assert isinstance(fsi, pd.DataFrame) assert not fsi.empty def test_cnv_is_dataframe(cnv): + """Test CNV.""" assert isinstance(cnv, pd.DataFrame) assert not cnv.empty def test_btl_is_dataframe(btl): + """Test BTL.""" assert isinstance(btl, pd.DataFrame) assert not btl.empty def test_btl_with_dup_cols(btl_as_stream): + """Test BTL with duplicated columns.""" assert all(col in btl_as_stream.columns for col in ["Bottle", "Bottle_"]) def test_btl_as_stringio(btl_as_stream): + """Test BTL from stream.""" assert isinstance(btl_as_stream, pd.DataFrame) assert not btl_as_stream.empty def test_ros_is_dataframe(ros): + """Test ROS.""" assert isinstance(ros, pd.DataFrame) assert not ros.empty -# if missing the 'File Name' in the header the dataframe._metadata['name'] should be set to 'unknown' -def test_ros_no_file_name(ros): - file = open(mode="rb", file=data_path.joinpath("CTD", "fixstation_hl_02.ros")) - stream = StringIO(file.read().decode("cp1252")) +def test_ros_no_file_name(): + """Test is if missing the 'File Name' is set to 'unknown'.""" + with Path.open( + data_path.joinpath("CTD", "fixstation_hl_02.ros"), + mode="rb", + ) as f: + stream = io.StringIO(f.read().decode("cp1252")) data = ctd.rosette_summary(stream) - assert data._metadata["name"] == "unknown" + assert data._metadata["name"] == "unknown" # noqa: SLF001 -# HeaderTest. def test_header_parse(): - # file with missing positions + """Test header parsing.""" + # File with missing positions. xbt = ctd.from_edf(data_path.joinpath("C3_00005.edf")) - assert xbt._metadata["lon"] is None - assert xbt._metadata["lat"] is None + assert xbt._metadata["lon"] is None # noqa: SLF001 + assert xbt._metadata["lat"] is None # noqa: SLF001 - # file with valid positions + # File with valid positions. 
xbt = ctd.from_edf(data_path.joinpath("XBT.EDF")) - np.testing.assert_almost_equal(xbt._metadata["lon"], -39.8790283) - np.testing.assert_almost_equal(xbt._metadata["lat"], -19.7174805) + np.testing.assert_almost_equal(xbt._metadata["lon"], -39.8790283) # noqa: SLF001 + np.testing.assert_almost_equal(xbt._metadata["lat"], -19.7174805) # noqa: SLF001 def test_header_parse_blank_line(): - # check that a BTL file can still be loaded if the header section contains blank lines + """Check if file is loaded when the header section contains blank lines. - # if the blank line in the header causes the reader to exit before reading the file - # the line looking for the Date in the ctd.from_btl() will throw a ValueError. + If the blank line in the header causes to exit before reading it, + the line looking for the Date in the `from_btl` will throw a ValueError. + """ btl = ctd.from_btl( data_path.joinpath( "btl", @@ -132,17 +158,16 @@ def test_header_parse_blank_line(): ), ) - # if a value error wasn't thrown, ensure the names array for the _metadata was set - assert btl._metadata["names"].index("Date") + assert btl._metadata["names"].index("Date") # noqa: SLF001 def test_pressure_field_labels(): - """ - Support different pressure field labels encountered in Sea-Bird cnv files. - - """ + """Support different pressure field labels encountered in CNV files.""" for fname in sorted(data_path.glob("press-pass*.cnv")): ctd.from_cnv(fname) for fname in sorted(data_path.glob("press-fails*.cnv")): - with pytest.raises(ValueError): + with pytest.raises( + ValueError, + match="Expected one pressure/depth column, didn't receive any", + ): ctd.from_cnv(fname)
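
For context, a minimal sketch of how the refactored API reads end to end after this change. It assumes the snippet is run from the repository root so the bundled file under tests/data/ resolves; the column name and processing parameters simply repeat values already exercised in the tests and notebook above, and the chaining order is illustrative rather than prescriptive.

from pathlib import Path

import ctd

# Bundled test file shipped with the repository (path is an assumption;
# adjust if running outside the repo root).
fname = Path("tests", "data", "CTD-spiked-unfiltered.cnv.bz2")

cast = ctd.from_cnv(fname)

# Drop the above-water scans and keep only the downcast.
down, up = cast.remove_above_water().split()

# Series-level cleanup using the defaults shown in the diff:
# despike (wild edit), low-pass filter the pressure index,
# remove pressure reversals, then bin-average to 1 dbar.
temperature = (
    down["t090C"]
    .despike(n1=2, n2=20, block=100)
    .lp_filter(sample_rate=24.0, time_constant=0.15)
    .press_check()
    .bindata(delta=1.0)
)

# plot_cast puts the pressure index on the y-axis and returns the axes.
ax = temperature.plot_cast()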