diff --git a/diffxpy/pkg_constants.py b/diffxpy/pkg_constants.py
index 12dfea2..48cabe0 100644
--- a/diffxpy/pkg_constants.py
+++ b/diffxpy/pkg_constants.py
@@ -15,4 +15,4 @@
 
 BATCHGLM_BACKEND = "tf1"
 BATCHGLM_FEATUREWISE = True
-BATCHGLM_AUTOGRAD = True
+BATCHGLM_AUTOGRAD = False
diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py
index c378494..a8c9445 100644
--- a/diffxpy/testing/tests.py
+++ b/diffxpy/testing/tests.py
@@ -194,6 +194,8 @@ def _fit(
     constructor_args = {}
     if quick_scale is not None:
         constructor_args["quick_scale"] = quick_scale
+    if batch_size is not None and backend != "tf2":
+        constructor_args["batch_size"] = batch_size
     # Backend-specific constructor arguments:
     if backend.lower() in ["tf1"]:
         constructor_args['provide_optimizers'] = {
diff --git a/diffxpy/unit_test/test_compare_numpy_tf2.py b/diffxpy/unit_test/test_compare_numpy_tf2.py
new file mode 100644
index 0000000..8ddb4f1
--- /dev/null
+++ b/diffxpy/unit_test/test_compare_numpy_tf2.py
@@ -0,0 +1,81 @@
+import unittest
+import logging
+import diffxpy.api as de
+import numpy as np
+
+
+class TestBackendsNb(unittest.TestCase):
+
+    """
+    Negative binomial noise model unit tests that test whether the wald test results
+    in the same logfoldchange, p- and q-values and the coefficents are the same after
+    fitting when using the same simulated data.
+    """
+
+    def __init__(self, *args, **kwargs):
+
+        super(TestBackendsNb, self).__init__(*args, **kwargs)
+
+        from batchglm.api.models.numpy.glm_nb import Simulator
+        self.sim = Simulator(num_observations=10000, num_features=200)
+        self.sim.generate_sample_description(num_batches=0, num_conditions=4)
+        self.sim.generate_params()
+        self.sim.generate_data()
+
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        self.numpy_results = de.test.wald(
+            data=self.sim.input_data,
+            sample_description=self.sim.sample_description,
+            factor_loc_totest="condition",
+            formula_loc="~ 1 + condition + batch",
+            noise_model='nb',
+            backend='numpy'
+        )
+        _ = self.numpy_results.summary()
+
+        self.tf2_results = de.test.wald(
+            data=self.sim.input_data,
+            sample_description=self.sim.sample_description,
+            factor_loc_totest="condition",
+            formula_loc="~ 1 + condition + batch",
+            noise_model='nb',
+            backend='tf2'
+        )
+        _ = self.tf2_results.summary()
+
+    """
+    Test with numpy:
+    """
+
+    def test_coeff_similarity(self):
+
+        a_var_max_diff = np.max(np.abs(self.tf2_results.model_estim.a_var-self.numpy_results.model_estim.a_var))
+        b_var_max_diff = np.max(np.abs(self.tf2_results.model_estim.b_var-self.numpy_results.model_estim.b_var))
+        assert a_var_max_diff < 1e-6 and b_var_max_diff < 1e-6, \
+            ("a_var_max_diff: %f, b_var_max_diff: %f", a_var_max_diff, b_var_max_diff)
+
+        return True
+
+    def test_logfoldchange_similarity(self):
+        max_diff = np.max(np.abs(self.tf2_results.summary()['log2fc'].values-self.numpy_results.summary()['log2fc'].values))
+        assert max_diff < 1e-12, ("log_fold_change difference: %f > 1e-12", max_diff)
+
+        return True
+
+    def test_pval_similarity(self):
+        max_diff = np.max(np.abs(self.tf2_results.pval-self.numpy_results.pval))
+        assert max_diff < 1e-12, ("p-val difference: %f > 1e-12", max_diff)
+
+        return True
+
+    def test_qval_similarity(self):
+        max_diff = np.max(np.abs(self.tf2_results.summary()['qval'].values-self.numpy_results.summary()['qval'].values))
+        assert max_diff < 1e-12, ("q-val difference: %f > 1e-12", max_diff)
+
+        return True
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/diffxpy/unit_test/test_fit.py b/diffxpy/unit_test/test_fit.py
index 99c5430..39d3bce 100644
--- a/diffxpy/unit_test/test_fit.py
+++ b/diffxpy/unit_test/test_fit.py
@@ -17,7 +17,7 @@ def _test_model_fit(
         """
         Test if de.wald() generates a uniform p-value distribution
         if it is given data simulated based on the null model. Returns the p-value
-        of the two-side Kolmgorov-Smirnov test for equality of the observed 
+        of the two-side Kolmgorov-Smirnov test for equality of the observed
         p-value distribution and a uniform distribution.
 
         :param n_cells: Number of cells to simulate (number of observations per test).
@@ -34,7 +34,7 @@ def _test_model_fit(
             raise ValueError("noise model %s not recognized" % noise_model)
 
         sim = Simulator(num_observations=n_cells, num_features=n_genes)
-        sim.generate_sample_description(num_batches=0, num_conditions=0)
+        sim.generate_sample_description(num_batches=1, num_conditions=1)
         sim.generate_params(rand_fn_scale=rand_fn_scale)
         sim.generate_data()
 
@@ -77,7 +77,7 @@ def _test_model_fit_partition(
             raise ValueError("noise model %s not recognized" % noise_model)
 
         sim = Simulator(num_observations=n_cells, num_features=n_genes)
-        sim.generate_sample_description(num_batches=0, num_conditions=0)
+        sim.generate_sample_description(num_batches=1, num_conditions=1)
         sim.generate_params(rand_fn_scale=rand_fn_scale)
         sim.generate_data()
 
@@ -121,7 +121,7 @@ def _test_residuals_fit(
             raise ValueError("noise model %s not recognized" % noise_model)
 
         sim = Simulator(num_observations=n_cells, num_features=n_genes)
-        sim.generate_sample_description(num_batches=0, num_conditions=0)
+        sim.generate_sample_description(num_batches=1, num_conditions=1)
         sim.generate()
 
         random_sample_description = pd.DataFrame({
@@ -209,23 +209,23 @@ def test_residuals_fit(
             noise_model="nb"
         )
 
-
+"""
 class TestFitNorm(_TestFit, unittest.TestCase):
-    """
+
     Normal noise model unit tests that tests whether model fit relay works.
-    """
+
 
     def test_model_fit(
             self,
             n_cells: int = 2000,
             n_genes: int = 2
     ):
-        """
+
         Test if model fit for "norm" noise model works.
 
         :param n_cells: Number of cells to simulate (number of observations per test).
         :param n_genes: Number of genes to simulate (number of tests).
-        """
+
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
@@ -242,12 +242,12 @@ def test_model_fit_partition(
             n_cells: int = 2000,
             n_genes: int = 2
     ):
-        """
+
         Test if partitioned model fit for "norm" noise model works.
 
         :param n_cells: Number of cells to simulate (number of observations per test).
         :param n_genes: Number of genes to simulate (number of tests).
-        """
+
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
@@ -264,12 +264,12 @@ def test_residuals_fit(
             n_cells: int = 2000,
             n_genes: int = 2
     ):
-        """
+
         Test if residual fit for "norm" noise model works.
 
         :param n_cells: Number of cells to simulate (number of observations per test).
         :param n_genes: Number of genes to simulate (number of tests).
-        """
+
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
@@ -280,7 +280,7 @@ def test_residuals_fit(
             n_genes=n_genes,
             noise_model="norm"
         )
-
+"""
 
 if __name__ == '__main__':
     unittest.main()