numerai · jeethu · Mar 21, 2024 · Mar 21, 2024 · May 23, 2024 · May 23, 2024
diff --git a/.github/workflows/test-and-deploy.yml b/.github/workflows/test-and-deploy.yml
@@ -24,7 +24,7 @@ jobs:
       - name: Test
         id: test
         run: |
-          pip install .
+          pip install ".[dev]"  # quoted so the shell cannot glob-expand [dev]
           python -m unittest discover tests/
 
       - name: Deploy

diff --git a/numerai_tools/scoring.py b/numerai_tools/scoring.py
@@ -1,8 +1,17 @@
+import os
+import functools
 from typing import List, Tuple, Union, Optional
 
 import numpy as np
 import pandas as pd
-from scipy import stats
+
+try:
+    if os.environ.get("DISABLE_NUMBA_STATS"):  # any non-empty value (even "0") opts out
+        raise ImportError  # reuse the scipy fallback branch below
+    from numba_stats import norm
+except ImportError:  # numba_stats not installed, or disabled via the env var
+    from scipy.stats import norm
+
 from sklearn.preprocessing import OneHotEncoder
 
 
@@ -171,7 +180,7 @@ def gaussian(df: pd.DataFrame) -> pd.DataFrame:
         pd.DataFrame - the gaussianized data
     """
     assert np.array_equal(df.index.sort_values(), df.index)
-    return df.apply(lambda series: stats.norm.ppf(series))
+    return df.apply(functools.partial(norm.ppf, loc=0, scale=1))
 
 
 def orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray:

diff --git a/setup.py b/setup.py
@@ -41,4 +41,7 @@ def load(path):
             "scipy~=1.11.4",
             "scikit-learn>=1.3.0",
         ],
+        extras_require={
+            "dev": ["numba-stats>=1.7.0"]
+        },
     )
diff --git a/tests/test_scoring.py b/tests/test_scoring.py
@@ -1,4 +1,5 @@
 import unittest
+from unittest.mock import patch
 
 import numpy as np
 import pandas as pd
@@ -21,6 +22,9 @@
     stake_weight,
 )
 
+from numba_stats import norm as numba_stats_norm
+from scipy.stats import norm as scipy_stats_norm
+
 
 class TestScoring(unittest.TestCase):
     def setUp(self):
@@ -34,6 +38,11 @@ def setUp(self):
         self.pos_neg = pd.Series([0, -0, 0.5, -0.5, 1.0, -1.0, 2.0, -2.0]).rename(
             "pos_neg"
         )
+        self.s = [x/4 for x in range(5)]
+        self.df = pd.DataFrame({
+            "target": self.s,
+            "prediction": reversed(self.s)
+        })
 
     def test_correlation(self):
         assert np.isclose(correlation(self.up, self.up), 1)
@@ -201,10 +210,14 @@ def test_neutralize(self):
         ).all()
 
     def test_numerai_corr_doesnt_clobber_targets(self):
-        s = [x/4 for x in range(5)]
-        df = pd.DataFrame({
-            "target": s,
-            "prediction": reversed(s)
-        })
-        numerai_corr(df[["prediction"]], df["target"])
-        assert pd.Series(s).equals(df["target"]), f"{s} != {list(df['target'].values)}"
+        numerai_corr(self.df[["prediction"]], self.df["target"])  # result unused
+        targets_intact = pd.Series(self.s).equals(self.df["target"])
+        assert targets_intact, f"{self.s} != {list(self.df['target'].values)}"
+
+    def test_numerai_corr_is_same_with_scipy_and_numba(self):
+        """numerai_corr must agree under the scipy and numba_stats `norm`."""
+        with patch("numerai_tools.scoring.norm", new=scipy_stats_norm):
+            corr1 = numerai_corr(self.df[["prediction"]], self.df["target"])
+        with patch("numerai_tools.scoring.norm", new=numba_stats_norm):
+            corr2 = numerai_corr(self.df[["prediction"]], self.df["target"])
+        assert np.isclose(corr1, corr2).all()  # collapse element-wise result