# Patch summary (text before the first "diff --git" header is ignored by patch tools).
#
# Purpose: let numerai_tools.scoring take the normal-distribution object `norm`
# from numba_stats, falling back to scipy.stats.
#
# Files touched:
#   .github/workflows/test-and-deploy.yml
#       The Test step installs the package with the "dev" extra (`pip install .[dev]`).
#   numerai_tools/scoring.py
#       Imports `norm` from numba_stats inside a try block and falls back to
#       `scipy.stats.norm` on ImportError. An ImportError is raised deliberately
#       when the DISABLE_NUMBA_STATS environment variable is set to a non-empty
#       value, forcing the fallback.
#       gaussian() now applies functools.partial(norm.ppf, loc=0, scale=1)
#       to each column instead of a lambda around stats.norm.ppf.
#   setup.py
#       Adds extras_require={"dev": ["numba-stats>=1.7.0"]}.
#   tests/test_scoring.py
#       Moves the shared `s` / `df` fixture into setUp() and adds a test that
#       patches numerai_tools.scoring.norm with the scipy and the numba_stats
#       implementation in turn and compares the numerai_corr results.
#
# NOTE(review): tests/test_scoring.py imports numba_stats unconditionally, so
#   the test module presumably cannot be imported without the "dev" extra
#   installed -- confirm this is intended.
# NOTE(review): os.environ.get("DISABLE_NUMBA_STATS") is truthy for any
#   non-empty string, so a value such as "0" also disables numba_stats --
#   confirm this is the intended contract.
# NOTE(review): the leading indentation of context lines and the position of
#   blank context lines below were reconstructed from a whitespace-flattened
#   copy of this patch -- verify against the real files before applying.
diff --git a/.github/workflows/test-and-deploy.yml b/.github/workflows/test-and-deploy.yml
index 4b5612f..7bd8eea 100644
--- a/.github/workflows/test-and-deploy.yml
+++ b/.github/workflows/test-and-deploy.yml
@@ -24,7 +24,7 @@ jobs:
       - name: Test
         id: test
         run: |
-          pip install .
+          pip install .[dev]
           python -m unittest discover tests/
 
       - name: Deploy
diff --git a/numerai_tools/scoring.py b/numerai_tools/scoring.py
index f97d14b..e7d8c6d 100644
--- a/numerai_tools/scoring.py
+++ b/numerai_tools/scoring.py
@@ -1,8 +1,17 @@
+import os
+import functools
 from typing import List, Tuple, Union, Optional
 
 import numpy as np
 import pandas as pd
-from scipy import stats
+
+try:
+    if os.environ.get("DISABLE_NUMBA_STATS"):
+        raise ImportError
+    from numba_stats import norm
+except ImportError:
+    from scipy.stats import norm
+
 from sklearn.preprocessing import OneHotEncoder
 
 
@@ -171,7 +180,7 @@ def gaussian(df: pd.DataFrame) -> pd.DataFrame:
         pd.DataFrame - the gaussianized data
     """
     assert np.array_equal(df.index.sort_values(), df.index)
-    return df.apply(lambda series: stats.norm.ppf(series))
+    return df.apply(functools.partial(norm.ppf, loc=0, scale=1))
 
 
 def orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray:
diff --git a/setup.py b/setup.py
index eba44f0..835a92d 100644
--- a/setup.py
+++ b/setup.py
@@ -41,4 +41,7 @@ def load(path):
         "scipy~=1.11.4",
         "scikit-learn>=1.3.0",
     ],
+    extras_require={
+        "dev": ["numba-stats>=1.7.0"]
+    },
 )
diff --git a/tests/test_scoring.py b/tests/test_scoring.py
index 8cd9aa9..71c21d0 100644
--- a/tests/test_scoring.py
+++ b/tests/test_scoring.py
@@ -1,4 +1,5 @@
 import unittest
+from unittest.mock import patch
 
 import numpy as np
 import pandas as pd
@@ -21,6 +22,9 @@
     stake_weight,
 )
 
+from numba_stats import norm as numba_stats_norm
+from scipy.stats import norm as scipy_stats_norm
+
 
 class TestScoring(unittest.TestCase):
     def setUp(self):
@@ -34,6 +38,11 @@ def setUp(self):
         self.pos_neg = pd.Series([0, -0, 0.5, -0.5, 1.0, -1.0, 2.0, -2.0]).rename(
             "pos_neg"
         )
+        self.s = [x/4 for x in range(5)]
+        self.df = pd.DataFrame({
+            "target": self.s,
+            "prediction": reversed(self.s)
+        })
 
     def test_correlation(self):
         assert np.isclose(correlation(self.up, self.up), 1)
@@ -201,10 +210,14 @@ def test_neutralize(self):
         ).all()
 
     def test_numerai_corr_doesnt_clobber_targets(self):
-        s = [x/4 for x in range(5)]
-        df = pd.DataFrame({
-            "target": s,
-            "prediction": reversed(s)
-        })
-        numerai_corr(df[["prediction"]], df["target"])
-        assert pd.Series(s).equals(df["target"]), f"{s} != {list(df['target'].values)}"
+        numerai_corr(self.df[["prediction"]], self.df["target"])
+        assert pd.Series(self.s).equals(self.df["target"]), \
+            f"{self.s} != {list(self.df['target'].values)}"
+
+    def test_numerai_corr_is_same_with_scipy_and_numba(self):
+        with patch("numerai_tools.scoring.norm", new=scipy_stats_norm):
+            corr1 = numerai_corr(
+                self.df[["prediction"]], self.df["target"])
+        with patch("numerai_tools.scoring.norm", new=numba_stats_norm):
+            corr2 = numerai_corr(self.df[["prediction"]], self.df["target"])
+        assert np.isclose(corr1, corr2)