Commit 4fc9ba6

Enable config tests, fix new dataset format, add tests for it (#145)
fix: update token loader due to new datasets version.
1 parent 1af73f4 commit 4fc9ba6

File tree

3 files changed: +114 / -57 lines


delphi/utils.py

Lines changed: 25 additions & 0 deletions
@@ -1,7 +1,9 @@
 from typing import Any, TypeVar, cast
 
+import datasets
 import numpy as np
 import torch
+from datasets.table import table_iter
 from torch import Tensor
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
@@ -14,10 +16,23 @@ def load_tokenized_data(
     dataset_name: str = "",
     column_name: str = "text",
     seed: int = 22,
+    convert_to_tensor_chunk_size: int = 2**18,
 ):
     """
     Load a huggingface dataset, tokenize it, and shuffle.
     Using this function ensures we are using the same tokens everywhere.
+
+    Args:
+        ctx_len: The context length of the tokens.
+        tokenizer: The tokenizer to use.
+        dataset_repo: The repository of the dataset.
+        dataset_split: The split of the dataset.
+        dataset_name: The name of the dataset.
+        column_name: The name of the column to tokenize.
+        seed: The seed to use for shuffling the dataset.
+        convert_to_tensor_chunk_size: The chunk size to use when converting the dataset
+            from Huggingface's Table format to a tensor. Values around 2**17-2**18 seem to
+            be the fastest.
     """
     from datasets import load_dataset
     from sparsify.data import chunk_and_tokenize
@@ -33,6 +48,16 @@ def load_tokenized_data(
 
     tokens = tokens_ds["input_ids"]
 
+    if isinstance(tokens, datasets.Column):
+        tokens = torch.cat(
+            [
+                torch.from_numpy(np.stack(table_chunk["input_ids"].to_numpy(), axis=0))
+                for table_chunk in table_iter(
+                    tokens.source._data, convert_to_tensor_chunk_size
+                )
+            ]
+        )
+
     return tokens
 
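For reference, the new isinstance(tokens, datasets.Column) branch can be illustrated with a minimal, self-contained sketch on a toy pyarrow table. The toy column, the tiny chunk size, and pa_table (a stand-in for the Arrow table behind tokens.source._data) are made up for illustration and are not part of the commit:

    # Hypothetical sketch of the chunked Arrow-to-tensor conversion used above.
    import numpy as np
    import pyarrow as pa
    import torch
    from datasets.table import table_iter

    # Toy stand-in: 10 rows of "input_ids", each a fixed-length list of 4 token ids.
    pa_table = pa.table({"input_ids": [list(range(i, i + 4)) for i in range(10)]})

    chunk_size = 4  # the real default is 2**18; tiny here so the chunking is visible
    tokens = torch.cat(
        [
            # each chunk is a pyarrow.Table; stack its list column into a 2D int array
            torch.from_numpy(np.stack(chunk["input_ids"].to_numpy(), axis=0))
            for chunk in table_iter(pa_table, chunk_size)
        ]
    )
    assert tokens.shape == (10, 4)

The chunked iteration keeps each numpy/torch conversion step bounded; per the docstring, chunk sizes around 2**17-2**18 seem to be the fastest.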
tests/conftest.py

Lines changed: 11 additions & 57 deletions
@@ -1,3 +1,4 @@
+import shutil
 from pathlib import Path
 from typing import cast
 
@@ -79,13 +80,17 @@ def cache_setup(tmp_path_factory, mock_dataset: torch.Tensor, model: PreTrainedM
     hookpoint_to_sparse_encode, _ = load_hooks_sparse_coders(model, run_cfg_gemma)
     # Define cache config and initialize cache
     log_path = Path.cwd() / "results" / "test" / "log"
+    shutil.rmtree(log_path, ignore_errors=True)
     log_path.mkdir(parents=True, exist_ok=True)
 
-    cache = LatentCache(
-        model,
-        hookpoint_to_sparse_encode,
-        batch_size=cache_cfg.batch_size,
-        log_path=log_path,
+    cache, empty_cache = (
+        LatentCache(
+            model,
+            hookpoint_to_sparse_encode,
+            batch_size=cache_cfg.batch_size,
+            log_path=log_path,
+        )
+        for _ in range(2)
     )
 
     # Generate mock tokens and run the cache
@@ -104,60 +109,9 @@ def cache_setup(tmp_path_factory, mock_dataset: torch.Tensor, model: PreTrainedM
     )
     return {
         "cache": cache,
+        "empty_cache": empty_cache,
         "tokens": tokens,
         "cache_cfg": cache_cfg,
         "temp_dir": temp_dir,
         "firing_counts": hookpoint_firing_counts,
     }
-
-
-def test_hookpoint_firing_counts_initialization(cache_setup):
-    """
-    Ensure that hookpoint_firing_counts is initialized as an empty dictionary.
-    """
-    cache = cache_setup["cache"]
-    assert isinstance(cache.hookpoint_firing_counts, dict)
-    assert len(cache.hookpoint_firing_counts) == 0  # Should be empty before run()
-
-
-def test_hookpoint_firing_counts_updates(cache_setup):
-    """
-    Ensure that hookpoint_firing_counts is properly updated after running the cache.
-    """
-    cache = cache_setup["cache"]
-    tokens = cache_setup["tokens"]
-    cache.run(cache_setup["cache_cfg"].n_tokens, tokens)
-
-    assert (
-        len(cache.hookpoint_firing_counts) > 0
-    ), "hookpoint_firing_counts should not be empty after run()"
-    for hookpoint, counts in cache.hookpoint_firing_counts.items():
-        assert isinstance(
-            counts, torch.Tensor
-        ), f"Counts for {hookpoint} should be a torch.Tensor"
-        assert counts.ndim == 1, f"Counts for {hookpoint} should be a 1D tensor"
-        assert (counts >= 0).all(), f"Counts for {hookpoint} should be non-negative"
-
-
-def test_hookpoint_firing_counts_persistence(cache_setup):
-    """
-    Ensure that hookpoint_firing_counts are correctly saved and loaded.
-    """
-    cache = cache_setup["cache"]
-    cache.save_firing_counts()
-
-    firing_counts_path = Path.cwd() / "results" / "log" / "hookpoint_firing_counts.pt"
-    assert firing_counts_path.exists(), "Firing counts file should exist after saving"
-
-    loaded_counts = torch.load(firing_counts_path, weights_only=True)
-    assert isinstance(
-        loaded_counts, dict
-    ), "Loaded firing counts should be a dictionary"
-    assert (
-        loaded_counts.keys() == cache.hookpoint_firing_counts.keys()
-    ), "Loaded firing counts keys should match saved keys"
-
-    for hookpoint, counts in loaded_counts.items():
-        assert torch.equal(
-            counts, cache.hookpoint_firing_counts[hookpoint]
-        ), f"Mismatch in firing counts for {hookpoint}"
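The fixture change above builds two independent LatentCache instances by unpacking a two-element generator expression: cache is run against the mock tokens later in the fixture, while empty_cache is returned untouched for the firing-count tests. A hypothetical stand-in sketch of that pattern (the _Stub class is made up for illustration and is not part of the delphi codebase):

    # Illustration only: _Stub is a placeholder for LatentCache.
    class _Stub:
        def __init__(self, batch_size: int):
            self.batch_size = batch_size

    # Unpacking a 2-element generator constructs two distinct objects with
    # identical arguments, so mutating one never affects the other.
    ran, empty = (_Stub(batch_size=8) for _ in range(2))
    assert ran is not empty
    assert ran.batch_size == empty.batch_size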

tests/test_latents/test_config.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+from pathlib import Path
+
+import torch
+from transformers import AutoTokenizer
+
+from delphi.utils import load_tokenized_data
+
+
+def test_dataset_is_array():
+    tokens = load_tokenized_data(
+        ctx_len=16,
+        tokenizer=AutoTokenizer.from_pretrained("EleutherAI/pythia-70m"),
+        dataset_repo="NeelNanda/pile-10k",
+        dataset_split="train",
+        dataset_name="",
+        column_name="text",
+        seed=42,
+    )
+    assert isinstance(tokens, torch.Tensor)
+    assert tokens.ndim == 2
+    assert tokens.shape[1] == 16
+    assert tokens.dtype in (torch.int64, torch.int32)
+    assert tokens.min() >= 0
+    assert tokens.max() < 50304
+
+
+def test_hookpoint_firing_counts_initialization(cache_setup):
+    """
+    Ensure that hookpoint_firing_counts is initialized as an empty dictionary.
+    """
+    cache = cache_setup["empty_cache"]
+    assert isinstance(cache.hookpoint_firing_counts, dict)
+    assert len(cache.hookpoint_firing_counts) == 0  # Should be empty before run()
+
+
+def test_hookpoint_firing_counts_updates(cache_setup):
+    """
+    Ensure that hookpoint_firing_counts is properly updated after running the cache.
+    """
+    cache = cache_setup["empty_cache"]
+    tokens = cache_setup["tokens"]
+    cache.run(cache_setup["cache_cfg"].n_tokens, tokens)
+
+    assert (
+        len(cache.hookpoint_firing_counts) > 0
+    ), "hookpoint_firing_counts should not be empty after run()"
+    for hookpoint, counts in cache.hookpoint_firing_counts.items():
+        assert isinstance(
+            counts, torch.Tensor
+        ), f"Counts for {hookpoint} should be a torch.Tensor"
+        assert counts.ndim == 1, f"Counts for {hookpoint} should be a 1D tensor"
+        assert (counts >= 0).all(), f"Counts for {hookpoint} should be non-negative"
+
+
+def test_hookpoint_firing_counts_persistence(cache_setup):
+    """
+    Ensure that hookpoint_firing_counts are correctly saved and loaded.
+    """
+    cache = cache_setup["empty_cache"]
+    cache.save_firing_counts()
+
+    firing_counts_path = (
+        Path.cwd() / "results" / "test" / "log" / "hookpoint_firing_counts.pt"
+    )
+    assert firing_counts_path.exists(), "Firing counts file should exist after saving"
+
+    loaded_counts = torch.load(firing_counts_path, weights_only=True)
+    assert isinstance(
+        loaded_counts, dict
+    ), "Loaded firing counts should be a dictionary"
+    assert (
+        loaded_counts.keys() == cache.hookpoint_firing_counts.keys()
+    ), "Loaded firing counts keys should match saved keys"
+
+    for hookpoint, counts in loaded_counts.items():
+        assert torch.equal(
+            counts, cache.hookpoint_firing_counts[hookpoint]
+        ), f"Mismatch in firing counts for {hookpoint}"
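These firing-count tests are the ones removed from tests/conftest.py above; they now read cache_setup["empty_cache"], so the initialization check runs on a cache that has not been run yet, and the persistence test looks for the firing-counts file under results/test/log, matching the fixture's log_path. Assuming the repository's standard pytest setup, the new module can also be run on its own; a minimal sketch using pytest's Python entry point (equivalent to the usual CLI form pytest tests/test_latents/test_config.py):

    # Assumed invocation sketch; pytest.main returns the exit code.
    # Note: test_dataset_is_array downloads the EleutherAI/pythia-70m tokenizer and
    # the NeelNanda/pile-10k dataset, so it needs network access on first run.
    import pytest

    raise SystemExit(pytest.main(["tests/test_latents/test_config.py", "-q"]))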
