diff --git a/.gitignore b/.gitignore index f93136374..fe608cde3 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,10 @@ __pycache__/ # C extensions *.so +*.pyd + +# Cython generated +ethereum/pow/ethash_cy.c # qkchash binaries qkchash/qkchash diff --git a/README.md b/README.md index 45586d9a1..802a9c9f3 100644 --- a/README.md +++ b/README.md @@ -71,9 +71,12 @@ To install the required modules for the project. Under `pyquarkchain` dir where # you may want to set the following if cryptography complains about header files: (https://github.com/pyca/cryptography/issues/3489) # export CPPFLAGS=-I/usr/local/opt/openssl/include # export LDFLAGS=-L/usr/local/opt/openssl/lib -pip install -e . +pip install -r requirements.txt +python setup.py build_ext --inplace ``` +The second command builds the optional Cython extension (`ethash_cy`) that speeds up ethash `calc_dataset_item` by ~20x. It requires a C compiler. If the build is skipped, the pure-Python fallback is used automatically. + Once all the modules are installed, try running all the unit tests under `pyquarkchain` ``` diff --git a/ethereum/pow/ethash.py b/ethereum/pow/ethash.py index 7e7a0848a..7237cd1fc 100644 --- a/ethereum/pow/ethash.py +++ b/ethereum/pow/ethash.py @@ -1,94 +1,156 @@ -import copy +import os +import numpy as np from functools import lru_cache from typing import Callable, Dict, List -from ethereum.pow.ethash_utils import * +from ethereum.pow.ethash_utils import ( + ethash_sha3_512, ethash_sha3_256, + FNV_PRIME, HASH_BYTES, WORD_BYTES, MIX_BYTES, + DATASET_PARENTS, CACHE_ROUNDS, ACCESSES, EPOCH_LENGTH, +) + +# uint32 overflow is intentional in FNV arithmetic +np.seterr(over="ignore") + +_FNV_PRIME = np.uint32(FNV_PRIME) + +# Optional Cython inner loop for calc_dataset_item. Falls back to pure numpy +# when the compiled extension isn't built (e.g. source checkouts without a +# C compiler). 
+try: + from ethereum.pow.ethash_cy import mix_parents as _cy_mix_parents +except ImportError: # pragma: no cover + _cy_mix_parents = None cache_seeds = [b"\x00" * 32] # type: List[bytes] -def mkcache(cache_size: int, block_number) -> List[List[int]]: + +# --------------------------------------------------------------------------- +# ETHASH_LIB selects the implementation used for non-test PoW verification. +# "ethash" — pure-Python + numpy (always available) +# "ethash_cy" — Cython + C keccak (requires python setup.py build_ext) +# Default: auto-detect best available (ethash_cy → ethash) +# --------------------------------------------------------------------------- +ETHASH_LIB = os.environ.get("ETHASH_LIB", "auto") + +if ETHASH_LIB == "auto": + try: + from ethereum.pow.ethash_cy import cy_hashimoto_light # noqa: F401 + ETHASH_LIB = "ethash_cy" + except ImportError: + ETHASH_LIB = "ethash" + +if ETHASH_LIB == "ethash": + @lru_cache(2) + def _get_cache(seed: bytes, n: int) -> np.ndarray: + """Returns cache as uint32 ndarray of shape (n, 16).""" + o = np.empty((n, 16), dtype=np.uint32) + o[0] = ethash_sha3_512(seed) + for i in range(1, n): + o[i] = ethash_sha3_512(o[i - 1]) + for _ in range(CACHE_ROUNDS): + for i in range(n): + v = int(o[i, 0]) % n + xored = o[(i - 1 + n) % n] ^ o[v] + o[i] = ethash_sha3_512(xored) + return o + + def hashimoto_light( + full_size: int, cache: np.ndarray, header: bytes, nonce: bytes + ) -> Dict: + return hashimoto(header, nonce, full_size, lambda x: calc_dataset_item(cache, x)) + +elif ETHASH_LIB == "ethash_cy": + from ethereum.pow.ethash_cy import cy_hashimoto_light, cy_mkcache + + @lru_cache(2) + def _get_cache(seed: bytes, n: int): + return cy_mkcache(np.frombuffer(seed, dtype=np.uint8), n) + + def hashimoto_light( + full_size: int, cache: np.ndarray, header: bytes, nonce: bytes + ) -> Dict: + return cy_hashimoto_light( + full_size, cache, + np.frombuffer(header, dtype=np.uint8), + np.frombuffer(nonce, dtype=np.uint8), + ) + +else: + 
raise ValueError(f"Unknown ETHASH_LIB={ETHASH_LIB!r}. " + f"Use 'ethash', 'ethash_cy', or 'auto'.") + + +def mkcache(cache_size: int, block_number) -> np.ndarray: while len(cache_seeds) <= block_number // EPOCH_LENGTH: - new_seed = serialize_hash(ethash_sha3_256(cache_seeds[-1])) + new_seed = ethash_sha3_256(cache_seeds[-1]).tobytes() cache_seeds.append(new_seed) seed = cache_seeds[block_number // EPOCH_LENGTH] return _get_cache(seed, cache_size // HASH_BYTES) -@lru_cache(10) -def _get_cache(seed, n) -> List[List[int]]: - # Sequentially produce the initial dataset - o = [ethash_sha3_512(seed)] - for i in range(1, n): - o.append(ethash_sha3_512(o[-1])) - - # Use a low-round version of randmemohash - for _ in range(CACHE_ROUNDS): - for i in range(n): - v = o[i][0] % n - o[i] = ethash_sha3_512(list(map(xor, o[(i - 1 + n) % n], o[v]))) - - return o - - -def calc_dataset_item(cache: List[List[int]], i: int) -> List[int]: +def calc_dataset_item(cache: np.ndarray, i: int) -> np.ndarray: n = len(cache) - r = HASH_BYTES // WORD_BYTES - # initialize the mix - mix = copy.copy(cache[i % n]) # type: List[int] - mix[0] ^= i + mix = cache[i % n].copy() + mix[0] ^= i # numpy auto-converts int, no explicit np.uint32() boxing mix = ethash_sha3_512(mix) - # fnv it with a lot of random cache nodes based on i - for j in range(DATASET_PARENTS): - cache_index = fnv(i ^ j, mix[j % r]) - mix = list(map(fnv, mix, cache[cache_index % n])) + if _cy_mix_parents is not None: + # mix is already C-contiguous uint32[16] (it's a fresh ndarray from + # ethash_sha3_512). cache rows are also contiguous uint32[16]. 
+ _cy_mix_parents(mix, cache, i) + else: + r = HASH_BYTES // WORD_BYTES # 16 + for j in range(DATASET_PARENTS): + cache_index = ((i ^ j) * FNV_PRIME ^ int(mix[j % r])) & 0xFFFFFFFF + mix *= _FNV_PRIME # in-place: no temp array allocation + mix ^= cache[cache_index % n] # in-place: no temp array allocation return ethash_sha3_512(mix) -def calc_dataset(full_size, cache) -> List[List[int]]: - o = [] - for i in range(full_size // HASH_BYTES): - o.append(calc_dataset_item(cache, i)) - return o +def calc_dataset(full_size, cache: np.ndarray) -> np.ndarray: + rows = full_size // HASH_BYTES + out = np.empty((rows, 16), dtype=np.uint32) + for i in range(rows): + out[i] = calc_dataset_item(cache, i) + return out def hashimoto( header: bytes, nonce: bytes, full_size: int, - dataset_lookup: Callable[[int], List[int]], + dataset_lookup: Callable[[int], np.ndarray], ) -> Dict: n = full_size // HASH_BYTES w = MIX_BYTES // WORD_BYTES mixhashes = MIX_BYTES // HASH_BYTES - # combine header+nonce into a 64 byte seed - s = ethash_sha3_512(header + nonce[::-1]) - mix = [] - for _ in range(MIX_BYTES // HASH_BYTES): - mix.extend(s) - # mix in random dataset nodes + + s = ethash_sha3_512(header + nonce[::-1]) # (16,) uint32 + mix = np.tile(s, mixhashes) # (32,) uint32 + s0 = int(s[0]) # hoist constant, avoid repeated unboxing + newdata = np.empty(w, dtype=np.uint32) # pre-allocate, reused every iteration + for i in range(ACCESSES): - p = fnv(i ^ s[0], mix[i % w]) % (n // mixhashes) * mixhashes - newdata = [] - for j in range(mixhashes): - newdata.extend(dataset_lookup(p + j)) - mix = list(map(fnv, mix, newdata)) - # compress mix - cmix = [] - for i in range(0, len(mix), 4): - cmix.append(fnv(fnv(fnv(mix[i], mix[i + 1]), mix[i + 2]), mix[i + 3])) + p = ((i ^ s0) * FNV_PRIME ^ int(mix[i % w])) & 0xFFFFFFFF + p = p % (n // mixhashes) * mixhashes + for j in range(mixhashes): # avoid np.concatenate alloc+copy + newdata[j * 16:(j + 1) * 16] = dataset_lookup(p + j) + mix *= _FNV_PRIME # 
in-place: no temp array + mix ^= newdata # in-place: no temp array + + mix_r = mix.reshape(-1, 4) + cmix = mix_r[:, 0] * _FNV_PRIME ^ mix_r[:, 1] + cmix = cmix * _FNV_PRIME ^ mix_r[:, 2] + cmix = cmix * _FNV_PRIME ^ mix_r[:, 3] + + s_cmix = np.concatenate([s, cmix]) return { - b"mix digest": serialize_hash(cmix), - b"result": serialize_hash(ethash_sha3_256(s + cmix)), + b"mix digest": cmix.tobytes(), + b"result": ethash_sha3_256(s_cmix).tobytes(), } -def hashimoto_light( - full_size: int, cache: List[List[int]], header: bytes, nonce: bytes -) -> Dict: - return hashimoto(header, nonce, full_size, lambda x: calc_dataset_item(cache, x)) - - -def hashimoto_full(dataset: List[List[int]], header: bytes, nonce: bytes) -> Dict: +def hashimoto_full(dataset: np.ndarray, header: bytes, nonce: bytes) -> Dict: return hashimoto(header, nonce, len(dataset) * HASH_BYTES, lambda x: dataset[x]) diff --git a/ethereum/pow/ethash_cy.pyx b/ethereum/pow/ethash_cy.pyx new file mode 100644 index 000000000..39d2c4585 --- /dev/null +++ b/ethereum/pow/ethash_cy.pyx @@ -0,0 +1,229 @@ +# cython: language_level=3 +# cython: boundscheck=False +# cython: wraparound=False +# cython: cdivision=True +# cython: initializedcheck=False +""" +Cython-accelerated Ethash routines. + +R3 — ``mix_parents``: inner loop of calc_dataset_item (256-iter FNV mixing). +R4 — ``cy_calc_dataset_item``, ``cy_hashimoto_light``: full functions that + call C keccak directly, eliminating all Python-layer overhead for hashing. 
+""" + +import numpy as np +cimport numpy as cnp +cimport cython +from libc.stdint cimport uint8_t, uint32_t, uint64_t +from libc.string cimport memcpy, memset + +cnp.import_array() + +# ---------- C keccak (keccak_tiny.c, linked at build time) ---------- +cdef extern from "keccak_tiny.h": + int keccak_256(uint8_t* out, size_t outlen, + const uint8_t* inp, size_t inlen) nogil + int keccak_512(uint8_t* out, size_t outlen, + const uint8_t* inp, size_t inlen) nogil + +# ---------- Ethash constants ---------- +cdef uint32_t FNV_PRIME = 0x01000193u + +cdef enum: + _DATASET_PARENTS = 256 + _R = 16 # HASH_BYTES / WORD_BYTES + _HASH_BYTES = 64 + _MIX_BYTES = 128 + _ACCESSES = 64 + _MIX_WORDS = 32 # MIX_BYTES / WORD_BYTES + _MIX_HASHES = 2 # MIX_BYTES / HASH_BYTES + _CACHE_ROUNDS = 3 + +# ---------- Inline C helpers ---------- + +cdef inline void _keccak_512_u32(uint32_t* out, const uint32_t* inp) noexcept nogil: + """keccak-512: 64 bytes in → 16 uint32 out.""" + keccak_512(out, 64, inp, 64) + +cdef inline void _keccak_512_bytes(uint32_t* out, + const uint8_t* inp, + size_t inlen) noexcept nogil: + """keccak-512: arbitrary bytes in → 16 uint32 out.""" + keccak_512(out, 64, inp, inlen) + +cdef inline void _keccak_256_u32(uint32_t* out, + const uint32_t* inp, + size_t n_u32) noexcept nogil: + """keccak-256: n_u32 uint32 words in → 8 uint32 out.""" + keccak_256(out, 32, inp, n_u32 * 4) + + +# ===================================================================== +# cy_mkcache — build ethash cache using C keccak +# ===================================================================== + +def cy_mkcache(const uint8_t[::1] seed, Py_ssize_t n): + """Build ethash cache: n rows of 16 uint32, using C keccak-512. + + Parameters + ---------- + seed : bytes (as uint8 array) + 32-byte seed for this epoch. + n : int + Number of cache rows (cache_size // HASH_BYTES). + + Returns + ------- + numpy ndarray of shape (n, 16), dtype uint32. 
+ """ + result = np.empty((n, 16), dtype=np.uint32) + cdef uint32_t[:, ::1] o = result + cdef uint32_t* ptr = &o[0, 0] + cdef Py_ssize_t i, rnd + cdef uint32_t v + cdef uint32_t xored[16] + + # o[0] = keccak_512(seed) + keccak_512(ptr, 64, &seed[0], seed.shape[0]) + + # o[i] = keccak_512(o[i-1]) + for i in range(1, n): + _keccak_512_u32(&ptr[i * _R], &ptr[(i - 1) * _R]) + + # CACHE_ROUNDS of RandMemoHash + for rnd in range(_CACHE_ROUNDS): + for i in range(n): + v = ptr[i * _R] % n + # xored = o[(i-1+n) % n] ^ o[v] + for k in range(_R): + xored[k] = ptr[(((i - 1 + n) % n) * _R) + k] ^ ptr[(v * _R) + k] + _keccak_512_u32(&ptr[i * _R], xored) + + return result + + +# ===================================================================== +# R3 — mix_parents (kept for backward compatibility) +# ===================================================================== + +@cython.boundscheck(False) +@cython.wraparound(False) +def mix_parents(uint32_t[::1] mix, + const uint32_t[:, ::1] cache, + uint64_t i): + """In-place parent mixing for one dataset item (R3 API).""" + cdef Py_ssize_t n = cache.shape[0] + cdef Py_ssize_t j, k + cdef uint32_t cache_index, mix_word + cdef uint32_t i32 = i + + for j in range(_DATASET_PARENTS): + mix_word = mix[j % _R] + cache_index = ((i32 ^ j) * FNV_PRIME) ^ mix_word + cache_index = cache_index % n + for k in range(_R): + mix[k] = (mix[k] * FNV_PRIME) ^ cache[cache_index, k] + + +# ===================================================================== +# R4 — full calc_dataset_item + hashimoto_light in C/Cython +# ===================================================================== + +cdef inline void _calc_dataset_item(uint32_t* out, + const uint32_t* cache, + Py_ssize_t n, + uint32_t idx) noexcept nogil: + """Pure C calc_dataset_item. 
Writes 16 uint32 to *out*.""" + cdef uint32_t mix[16] + cdef Py_ssize_t j, k + cdef uint32_t cache_index, mix_word + + # mix = cache[idx % n]; mix[0] ^= idx + memcpy(mix, &cache[(idx % n) * _R], 64) + mix[0] ^= idx + # mix = keccak_512(mix) + _keccak_512_u32(mix, mix) + # parent mixing + for j in range(_DATASET_PARENTS): + mix_word = mix[j % _R] + cache_index = ((idx ^ j) * FNV_PRIME) ^ mix_word + cache_index = cache_index % n + for k in range(_R): + mix[k] = (mix[k] * FNV_PRIME) ^ cache[(cache_index * _R) + k] + # mix = keccak_512(mix) + _keccak_512_u32(out, mix) + + +def cy_calc_dataset_item(const uint32_t[:, ::1] cache, uint32_t i): + """Python-callable calc_dataset_item (R4). Returns ndarray uint32[16].""" + cdef Py_ssize_t n = cache.shape[0] + result = np.empty(16, dtype=np.uint32) + cdef uint32_t[::1] result_view = result + _calc_dataset_item(&result_view[0], &cache[0, 0], n, i) + return result + + +def cy_hashimoto_light(Py_ssize_t full_size, + const uint32_t[:, ::1] cache, + const uint8_t[::1] header, + const uint8_t[::1] nonce): + """Full hashimoto_light in Cython+C (R4). 
+ + Returns dict identical to the Python version: + {b"mix digest": bytes(32), b"result": bytes(32)} + """ + cdef Py_ssize_t n = full_size // _HASH_BYTES + cdef Py_ssize_t i, j, k, p + cdef uint32_t s0 + cdef uint32_t s[16] + cdef uint32_t mix[_MIX_WORDS] # 32 uint32 + cdef uint32_t newdata[_MIX_WORDS] + cdef uint32_t cmix[8] + cdef uint32_t s_cmix[24] # s(16) + cmix(8) + cdef uint32_t result_hash[8] + cdef Py_ssize_t cache_n = cache.shape[0] + cdef const uint32_t* cache_ptr = &cache[0, 0] + + # nonce_rev = nonce[::-1] + cdef Py_ssize_t header_len = header.shape[0] + cdef Py_ssize_t nonce_len = nonce.shape[0] + cdef uint8_t seed_buf[128] # header (up to ~80) + nonce (8) + memcpy(seed_buf, &header[0], header_len) + # reverse nonce + for i in range(nonce_len): + seed_buf[header_len + i] = nonce[nonce_len - 1 - i] + + # s = keccak_512(header + nonce[::-1]) + _keccak_512_bytes(s, seed_buf, header_len + nonce_len) + + # mix = tile(s, 2) + memcpy(mix, s, 64) + memcpy(&mix[16], s, 64) + + s0 = s[0] + + for i in range(_ACCESSES): + p = (((i ^ s0) * FNV_PRIME) ^ mix[i % _MIX_WORDS]) + p = (p % (n // _MIX_HASHES)) * _MIX_HASHES + for j in range(_MIX_HASHES): + _calc_dataset_item(&newdata[j * _R], cache_ptr, cache_n, (p + j)) + for k in range(_MIX_WORDS): + mix[k] = (mix[k] * FNV_PRIME) ^ newdata[k] + + # compress mix → cmix (8 uint32) + for i in range(8): + cmix[i] = mix[i * 4] + cmix[i] = (cmix[i] * FNV_PRIME) ^ mix[i * 4 + 1] + cmix[i] = (cmix[i] * FNV_PRIME) ^ mix[i * 4 + 2] + cmix[i] = (cmix[i] * FNV_PRIME) ^ mix[i * 4 + 3] + + # result = keccak_256(s + cmix) + memcpy(s_cmix, s, 64) + memcpy(&s_cmix[16], cmix, 32) + _keccak_256_u32(result_hash, s_cmix, 24) + + # Return as Python dict with bytes values + return { + b"mix digest": (cmix)[:32], + b"result": (result_hash)[:32], + } diff --git a/ethereum/pow/ethash_utils.py b/ethereum/pow/ethash_utils.py index fa6dd0dac..b482b14e6 100644 --- a/ethereum/pow/ethash_utils.py +++ b/ethereum/pow/ethash_utils.py @@ -1,29 +1,16 @@ 
-from typing import List, Union +from typing import Union +from Crypto.Hash import keccak -from eth_utils import encode_hex, decode_hex +import numpy as np -try: - from Crypto.Hash import keccak - def _sha3_256(x): - return keccak.new(digest_bits=256, data=x).digest() +def _sha3_256(x): + return keccak.new(digest_bits=256, data=x).digest() - def _sha3_512(x): - return keccak.new(digest_bits=512, data=x).digest() +def _sha3_512(x): + return keccak.new(digest_bits=512, data=x).digest() -except Exception: - import sha3 as _sha3 - - def _sha3_256(x): - return _sha3.sha3_256(x).digest() - - def _sha3_512(x): - return _sha3.sha3_512(x).digest() - - -import sys - WORD_BYTES = 4 # bytes in word DATASET_BYTES_INIT = 2 ** 30 # bytes in dataset at genesis DATASET_BYTES_GROWTH = 2 ** 23 # growth per epoch (~7 GB per year) @@ -39,73 +26,18 @@ def _sha3_512(x): FNV_PRIME = 0x01000193 -def fnv(v1, v2): - return (v1 * FNV_PRIME ^ v2) % 2 ** 32 - - -# Assumes little endian bit ordering (same as Intel architectures) -def decode_int(s): - return int(encode_hex(s[::-1]), 16) if s else 0 - - -def encode_int(s): - a = "%x" % s - return b"" if s == 0 else decode_hex("0" * (len(a) % 2) + a)[::-1] - - -def zpad(s, length): - return s + b"\x00" * max(0, length - len(s)) - - -def serialize_hash(h: List[int]) -> bytes: - return b"".join([zpad(encode_int(x), 4) for x in h]) - - -def deserialize_hash(h: bytes) -> List[int]: - return [decode_int(h[i : i + WORD_BYTES]) for i in range(0, len(h), WORD_BYTES)] - - -def hash_words(h, sz, x) -> List[int]: - if isinstance(x, list): - x = serialize_hash(x) - y = h(x) - return deserialize_hash(y) - - -def to_bytes(x): - if sys.version_info.major > 2 and isinstance(x, str): - x = bytes(x, "utf-8") - return x - - -def xor(a, b): - return a ^ b - - -# sha3 hash function, outputs 64 bytes -def ethash_sha3_512(x: Union[bytes, List[int]]) -> List[int]: - return hash_words(lambda v: _sha3_512(to_bytes(v)), 64, x) - - -def ethash_sha3_256(x: Union[bytes, 
List[int]]) -> List[int]: - return hash_words(lambda v: _sha3_256(to_bytes(v)), 32, x) - - -# Works for dataset and cache -def serialize_cache(ds): - return b"".join([serialize_hash(h) for h in ds]) - - -serialize_dataset = serialize_cache - - -def deserialize_cache(ds): - return [ - deserialize_hash(ds[i : i + HASH_BYTES]) for i in range(0, len(ds), HASH_BYTES) - ] +def ethash_sha3_512(x: Union[bytes, np.ndarray]) -> np.ndarray: + """sha3-512: bytes or ndarray in, uint32 ndarray (16,) out.""" + if isinstance(x, np.ndarray): + x = x.tobytes() + return np.frombuffer(_sha3_512(x), dtype=np.uint32).copy() -deserialize_dataset = deserialize_cache +def ethash_sha3_256(x: Union[bytes, np.ndarray]) -> np.ndarray: + """sha3-256: bytes or ndarray in, uint32 ndarray (8,) out.""" + if isinstance(x, np.ndarray): + x = x.tobytes() + return np.frombuffer(_sha3_256(x), dtype=np.uint32).copy() def isprime(x): diff --git a/ethereum/pow/ethpow.py b/ethereum/pow/ethpow.py index 5493d53d9..afb74359f 100644 --- a/ethereum/pow/ethpow.py +++ b/ethereum/pow/ethpow.py @@ -1,64 +1,24 @@ -import warnings +from typing import Tuple, Optional from functools import lru_cache -from typing import Tuple, Optional, List, Union - -from eth_utils import big_endian_to_int from ethereum.pow import ethash from ethereum.pow.ethash_utils import get_full_size, get_cache_size, EPOCH_LENGTH -try: - import pyethash - - ETHASH_LIB = "pyethash" # the C++ based implementation -except ImportError: - ETHASH_LIB = "ethash" - warnings.warn("using pure python implementation", ImportWarning) - -# always have python implementation declared -def get_cache_slow(cache_size: int, block_number: int) -> List[List[int]]: +def get_cache(cache_size: int, block_number: int): return ethash.mkcache(cache_size, block_number) -def hashimoto_slow( +def hashimoto( block_number: int, full_size: int, - cache: Union[List[List[int]], bytes], + cache, mining_hash: bytes, bin_nonce: bytes, ): return ethash.hashimoto_light(full_size, cache, 
mining_hash, bin_nonce) -if ETHASH_LIB == "ethash": - get_cache = get_cache_slow - hashimoto = hashimoto_slow -elif ETHASH_LIB == "pyethash": - - @lru_cache(10) - def calculate_cache(n): - return pyethash.mkcache_bytes(n * EPOCH_LENGTH) - - def get_cache(cache_size: int, block_number: int): - return calculate_cache(block_number // EPOCH_LENGTH) - - def hashimoto( - block_number: int, - full_size: int, - cache: Union[List[List[int]], bytes], - mining_hash: bytes, - bin_nonce: bytes, - ): - return pyethash.hashimoto_light( - block_number, cache, mining_hash, big_endian_to_int(bin_nonce) - ) - - -else: - raise Exception("invalid ethash library set") - - @lru_cache(maxsize=32) def check_pow( block_number, header_hash, mixhash, nonce, difficulty, is_test=False @@ -67,20 +27,16 @@ def check_pow( if len(mixhash) != 32 or len(header_hash) != 32 or len(nonce) != 8: return False - cache_gen, mining_gen = get_cache, hashimoto if is_test: cache_size, full_size = 1024, 32 * 1024 - # use python implementation to allow overriding cache & dataset size - cache_gen = get_cache_slow - mining_gen = hashimoto_slow else: cache_size, full_size = ( get_cache_size(block_number), get_full_size(block_number), ) - cache = cache_gen(cache_size, block_number) - mining_output = mining_gen(block_number, full_size, cache, header_hash, nonce) + cache = get_cache(cache_size, block_number) + mining_output = hashimoto(block_number, full_size, cache, header_hash, nonce) if mining_output[b"mix digest"] != mixhash: return False result = int.from_bytes(mining_output[b"result"], byteorder="big") @@ -125,25 +81,20 @@ def mine( rounds: int = 1000, is_test: bool = False, ) -> Tuple[Optional[bytes], Optional[bytes]]: - cache_gen, mining_gen = get_cache, hashimoto if is_test: cache_size, full_size = 1024, 32 * 1024 - # use python implementation to allow overriding cache & dataset size - cache_gen = get_cache_slow - mining_gen = hashimoto_slow else: cache_size, full_size = ( get_cache_size(block_number), 
get_full_size(block_number), ) - cache = cache_gen(cache_size, block_number) + cache = get_cache(cache_size, block_number) nonce = start_nonce target = (2 ** 256 // (difficulty or 1) - 1).to_bytes(32, byteorder="big") for i in range(1, rounds + 1): - # hashimoto expected big-indian byte representation bin_nonce = (nonce + i).to_bytes(8, byteorder="big") - o = mining_gen(block_number, full_size, cache, mining_hash, bin_nonce) + o = hashimoto(block_number, full_size, cache, mining_hash, bin_nonce) if o[b"result"] <= target: assert len(bin_nonce) == 8 assert len(o[b"mix digest"]) == 32 diff --git a/ethereum/pow/keccak_tiny.c b/ethereum/pow/keccak_tiny.c new file mode 100644 index 000000000..a0f56a4cb --- /dev/null +++ b/ethereum/pow/keccak_tiny.c @@ -0,0 +1,175 @@ +/** libkeccak-tiny + * + * A single-file implementation of SHA-3 and SHAKE. + * Copied from https://github.com/coruus/keccak-tiny + * + * Implementor: David Leon Gil + * License: CC0, attribution kindly requested. Blame taken too, + * but not liability. + * + * Local modifications for Ethash: + * 1. Added keccak_256/keccak_512 (original Keccak padding 0x01, + * as required by Ethash — distinct from NIST SHA-3 padding 0x06). + * 2. Replaced memset_s with portable secure_zero (volatile memset) + * for MSVC/GCC/Clang compatibility. + */ +#include "keccak_tiny.h" + +#include +#include + +/******** Portable secure memset ********/ +static void secure_zero(void *p, size_t len) { + volatile uint8_t *v = (volatile uint8_t *)p; + while (len--) *v++ = 0; +} + +/******** The Keccak-f[1600] permutation ********/ + +/*** Constants. 
***/ +static const uint8_t rho[24] = + { 1, 3, 6, 10, 15, 21, + 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, + 62, 18, 39, 61, 20, 44}; +static const uint8_t pi[24] = + {10, 7, 11, 17, 18, 3, + 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, + 20, 14, 22, 9, 6, 1}; +static const uint64_t RC[24] = + {1ULL, 0x8082ULL, 0x800000000000808aULL, 0x8000000080008000ULL, + 0x808bULL, 0x80000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL, + 0x8aULL, 0x88ULL, 0x80008009ULL, 0x8000000aULL, + 0x8000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, 0x8000000000008003ULL, + 0x8000000000008002ULL, 0x8000000000000080ULL, 0x800aULL, 0x800000008000000aULL, + 0x8000000080008081ULL, 0x8000000000008080ULL, 0x80000001ULL, 0x8000000080008008ULL}; + +/*** Helper macros to unroll the permutation. ***/ +#define rol(x, s) (((x) << s) | ((x) >> (64 - s))) +#define REPEAT6(e) e e e e e e +#define REPEAT24(e) REPEAT6(e e e e) +#define REPEAT5(e) e e e e e +#define FOR5(v, s, e) \ + v = 0; \ + REPEAT5(e; v += s;) + +/*** Keccak-f[1600] ***/ +static inline void keccakf(void* state) { + uint64_t* a = (uint64_t*)state; + uint64_t b[5] = {0}; + uint64_t t = 0; + uint8_t x, y; + + for (int i = 0; i < 24; i++) { + // Theta + FOR5(x, 1, + b[x] = 0; + FOR5(y, 5, + b[x] ^= a[x + y]; )) + FOR5(x, 1, + FOR5(y, 5, + a[y + x] ^= b[(x + 4) % 5] ^ rol(b[(x + 1) % 5], 1); )) + // Rho and pi + t = a[1]; + x = 0; + REPEAT24(b[0] = a[pi[x]]; + a[pi[x]] = rol(t, rho[x]); + t = b[0]; + x++; ) + // Chi + FOR5(y, + 5, + FOR5(x, 1, + b[x] = a[y + x];) + FOR5(x, 1, + a[y + x] = b[x] ^ ((~b[(x + 1) % 5]) & b[(x + 2) % 5]); )) + // Iota + a[0] ^= RC[i]; + } +} + +/******** The FIPS202-defined functions. ********/ + +/*** Some helper macros. 
***/ + +#define _(S) do { S } while (0) +#define FOR(i, ST, L, S) \ + _(for (size_t i = 0; i < L; i += ST) { S; }) +#define mkapply_ds(NAME, S) \ + static inline void NAME(uint8_t* dst, \ + const uint8_t* src, \ + size_t len) { \ + FOR(i, 1, len, S); \ + } +#define mkapply_sd(NAME, S) \ + static inline void NAME(const uint8_t* src, \ + uint8_t* dst, \ + size_t len) { \ + FOR(i, 1, len, S); \ + } + +mkapply_ds(xorin, dst[i] ^= src[i]) // xorin +mkapply_sd(setout, dst[i] = src[i]) // setout + +#define P keccakf +#define Plen 200 + +// Fold P*F over the full blocks of an input. +#define foldP(I, L, F) \ + while (L >= rate) { \ + F(a, I, rate); \ + P(a); \ + I += rate; \ + L -= rate; \ + } + +/** The sponge-based hash construction. **/ +static inline int hash(uint8_t* out, size_t outlen, + const uint8_t* in, size_t inlen, + size_t rate, uint8_t delim) { + if ((out == NULL) || ((in == NULL) && inlen != 0) || (rate >= Plen)) { + return -1; + } + uint8_t a[Plen] = {0}; + // Absorb input. + foldP(in, inlen, xorin); + // Xor in the DS and pad frame. + a[inlen] ^= delim; + a[rate - 1] ^= 0x80; + // Xor in the last block. + xorin(a, in, inlen); + // Apply P + P(a); + // Squeeze output. 
+ foldP(out, outlen, setout); + setout(a, out, outlen); + secure_zero(a, 200); + return 0; +} + +/* Original Keccak (padding 0x01) — used by Ethash */ +int keccak_256(uint8_t* out, size_t outlen, + const uint8_t* in, size_t inlen) { + if (outlen > 32) return -1; + return hash(out, outlen, in, inlen, 200 - (256 / 4), 0x01); +} + +int keccak_512(uint8_t* out, size_t outlen, + const uint8_t* in, size_t inlen) { + if (outlen > 64) return -1; + return hash(out, outlen, in, inlen, 200 - (512 / 4), 0x01); +} + +/* FIPS-202 SHA-3 (padding 0x06) */ +int sha3_256(uint8_t* out, size_t outlen, + const uint8_t* in, size_t inlen) { + if (outlen > 32) return -1; + return hash(out, outlen, in, inlen, 200 - (256 / 4), 0x06); +} + +int sha3_512(uint8_t* out, size_t outlen, + const uint8_t* in, size_t inlen) { + if (outlen > 64) return -1; + return hash(out, outlen, in, inlen, 200 - (512 / 4), 0x06); +} diff --git a/ethereum/pow/keccak_tiny.h b/ethereum/pow/keccak_tiny.h new file mode 100644 index 000000000..cc5d1628b --- /dev/null +++ b/ethereum/pow/keccak_tiny.h @@ -0,0 +1,24 @@ +/** libkeccak-tiny + * + * Copied from https://github.com/coruus/keccak-tiny + * See keccak_tiny.c for local modifications. 
+ */ +#ifndef KECCAK_TINY_H +#define KECCAK_TINY_H + +#include +#include + +/* Original Keccak (padding 0x01) — used by Ethash */ +int keccak_256(uint8_t* out, size_t outlen, + const uint8_t* in, size_t inlen); +int keccak_512(uint8_t* out, size_t outlen, + const uint8_t* in, size_t inlen); + +/* FIPS-202 SHA-3 (padding 0x06) */ +int sha3_256(uint8_t* out, size_t outlen, + const uint8_t* in, size_t inlen); +int sha3_512(uint8_t* out, size_t outlen, + const uint8_t* in, size_t inlen); + +#endif /* KECCAK_TINY_H */ diff --git a/ethereum/pow/tests/bench_hashimoto.py b/ethereum/pow/tests/bench_hashimoto.py new file mode 100644 index 000000000..23f72ecff --- /dev/null +++ b/ethereum/pow/tests/bench_hashimoto.py @@ -0,0 +1,61 @@ +"""Benchmark: pyethash.hashimoto_light (C++) vs ethash.hashimoto_light (pure Python). + +Run with: + python -m ethereum.pow.tests.bench_hashimoto +or: + python ethereum/pow/tests/bench_hashimoto.py + +Uses real mainnet parameters (block 0: cache ~16MB, full_size ~1GB) so that +pure Python and pyethash C++ are compared under identical conditions. +Note: pyethash may not be supported on Python 3.13, so this benchmark may fail to run on that version. 
+See https://github.com/QuarkChain/pyquarkchain/issues/976 +""" + +import timeit + +from eth_utils import big_endian_to_int + +from ethereum.pow.ethash import mkcache, hashimoto_light as py_hashimoto_light +from ethereum.pow.ethash_utils import get_cache_size, get_full_size + +BLOCK_NUMBER = 0 +HEADER = bytes(32) +NONCE = (0).to_bytes(8, byteorder="big") +ROUNDS = 100 + +# Use real mainnet parameters to match pyethash internal sizes +CACHE_SIZE = get_cache_size(BLOCK_NUMBER) +FULL_SIZE = get_full_size(BLOCK_NUMBER) + +cache = mkcache(CACHE_SIZE, BLOCK_NUMBER) + + +def bench_python(): + py_hashimoto_light(FULL_SIZE, cache, HEADER, NONCE) + + +results = {} + +elapsed = timeit.timeit(bench_python, number=ROUNDS) +results["pure Python"] = elapsed +print(f"Cache size: {CACHE_SIZE} bytes, Full size: {FULL_SIZE} bytes") +print(f"pure Python : {elapsed:.3f}s for {ROUNDS} calls ({elapsed/ROUNDS*1000:.1f} ms/call)") + +try: + import pyethash + + cpp_cache = pyethash.mkcache_bytes(BLOCK_NUMBER) + nonce_int = big_endian_to_int(NONCE) + print(f"pyethash cache size: {len(cpp_cache)} bytes") + + def bench_cpp(): + pyethash.hashimoto_light(BLOCK_NUMBER, cpp_cache, HEADER, nonce_int) + + elapsed_cpp = timeit.timeit(bench_cpp, number=ROUNDS) + results["pyethash (C++)"] = elapsed_cpp + print(f"pyethash C++: {elapsed_cpp:.3f}s for {ROUNDS} calls ({elapsed_cpp/ROUNDS*1000:.1f} ms/call)") + print(f"Speedup : {results['pure Python'] / elapsed_cpp:.1f}x") + +except ImportError: + print("pyethash not installed — skipping C++ comparison.") + print("Install with: pip install pyethash") diff --git a/ethereum/pow/tests/bench_hashimoto_compare.py b/ethereum/pow/tests/bench_hashimoto_compare.py new file mode 100644 index 000000000..25f070451 --- /dev/null +++ b/ethereum/pow/tests/bench_hashimoto_compare.py @@ -0,0 +1,421 @@ +""" +Benchmark suite: old (hex-based) vs R1 (struct+list) vs R2 (numpy) vs R3 (numpy+Cython) vs R4 (full Cython). 
+ +- old: original hex-based implementation +- R1: struct.pack/unpack + Python list +- R2: struct.pack/unpack + numpy ndarray +- R3: R2 + Cython inner loop for calc_dataset_item (256-iter FNV mixing) +- R4: full Cython + C keccak (no Python overhead in hot path) + +Sections: + 1. mkcache build time + 2. Correctness assertions + 3. Primitive micro-benchmarks (serialize/fnv/sha3) + 4. calc_dataset_item throughput + 5. hashimoto_light throughput + 6. check_pow end-to-end + +Uses is_test=True sizes (cache=1024B, dataset=32KB) for fast iteration. +Run with: + PYTHONPATH=. python -m ethereum.pow.tests.bench_hashimoto_compare +""" +import copy +import struct +import time + +import numpy as np +from Crypto.Hash import keccak + +from . import old_ethash + +# --------------------------------------------------------------------------- +# Shared keccak +# --------------------------------------------------------------------------- +def _sha3_256(x): return keccak.new(digest_bits=256, data=x).digest() +def _sha3_512(x): return keccak.new(digest_bits=512, data=x).digest() + +WORD_BYTES = 4 +HASH_BYTES = 64 +MIX_BYTES = 128 +ACCESSES = 64 +DATASET_PARENTS = 256 +CACHE_ROUNDS = 3 +FNV_PRIME = 0x01000193 + +# =========================================================================== +# OLD — hex-based (imported from old_ethash.py, which is a copy of the original ethash.py) +# =========================================================================== +old_mkcache = old_ethash.mkcache +old_calc_dataset_item = old_ethash.calc_dataset_item +old_hashimoto_light = old_ethash.hashimoto_light +_old_fnv = old_ethash.fnv +_old_serialize_hash = old_ethash.serialize_hash +_old_deserialize_hash = old_ethash.deserialize_hash +_old_sha3_512 = old_ethash.sha3_512 +_old_sha3_256 = old_ethash.sha3_256 + +# =========================================================================== +# Round 1 — struct+list +# =========================================================================== +_FMT_16I = 
struct.Struct("<16I")
_FMT_8I = struct.Struct("<8I")
_FMT_32I = struct.Struct("<32I")
_MID_FNV_PRIME = FNV_PRIME

def _r1_serialize(h):
    # Word list -> little-endian bytes; the three common widths (16/8/32
    # words) hit pre-compiled Struct fast paths.
    n = len(h)
    if n == 16: return _FMT_16I.pack(*h)
    if n == 8: return _FMT_8I.pack(*h)
    if n == 32: return _FMT_32I.pack(*h)
    return struct.pack("<%dI" % n, *h)

def _r1_deserialize(h):
    # Inverse of _r1_serialize: bytes -> list of uint32 (little-endian).
    n = len(h)
    if n == 64: return list(_FMT_16I.unpack(h))
    if n == 32: return list(_FMT_8I.unpack(h))
    if n == 128: return list(_FMT_32I.unpack(h))
    return list(struct.unpack("<%dI" % (n // 4), h))

def _r1_sha3_512(x):
    # keccak-512 over bytes (word lists are serialized first); returns 16 words.
    if isinstance(x, list): x = _r1_serialize(x)
    return list(_FMT_16I.unpack(_sha3_512(x)))

def _r1_sha3_256(x):
    # keccak-256; returns 8 words.
    if isinstance(x, list): x = _r1_serialize(x)
    return list(_FMT_8I.unpack(_sha3_256(x)))

def _r1_fnv(v1, v2):
    # FNV-style mix with explicit uint32 wraparound.
    return (v1 * _MID_FNV_PRIME ^ v2) & 0xFFFFFFFF

def r1_mkcache(cache_size, seed):
    # Ethash cache: sequential keccak-512 chain over the seed, then
    # CACHE_ROUNDS passes of xor-and-rehash over pseudo-random row pairs.
    n = cache_size // HASH_BYTES
    o = [_r1_sha3_512(seed)]
    for i in range(1, n):
        o.append(_r1_sha3_512(o[-1]))
    for _ in range(CACHE_ROUNDS):
        for i in range(n):
            v = o[i][0] % n
            o[i] = _r1_sha3_512([a ^ b for a, b in zip(o[(i - 1 + n) % n], o[v])])
    return o

def r1_calc_dataset_item(cache, i):
    # One 64-byte dataset row: seed from cache row i, then fold in
    # DATASET_PARENTS pseudo-randomly chosen cache rows via FNV.
    n = len(cache)
    r = HASH_BYTES // WORD_BYTES
    mix = copy.copy(cache[i % n])
    mix[0] ^= i
    mix = _r1_sha3_512(mix)
    for j in range(DATASET_PARENTS):
        cache_index = _r1_fnv(i ^ j, mix[j % r])
        mix = list(map(_r1_fnv, mix, cache[cache_index % n]))
    return _r1_sha3_512(mix)

def r1_hashimoto_light(full_size, cache, header, nonce):
    # Hashimoto with dataset rows computed on the fly from the cache.
    # Returns {b"mix digest": bytes, b"result": bytes}.
    n = full_size // HASH_BYTES
    w = MIX_BYTES // WORD_BYTES
    mixhashes = MIX_BYTES // HASH_BYTES
    s = _r1_sha3_512(header + nonce[::-1])
    mix = list(s) * mixhashes
    for i in range(ACCESSES):
        p = _r1_fnv(i ^ s[0], mix[i % w]) % (n // mixhashes) * mixhashes
        newdata = []
        for j in range(mixhashes):
            newdata.extend(r1_calc_dataset_item(cache, p + j))
        mix = list(map(_r1_fnv, mix, newdata))
    # Compress the 32-word mix 4:1 into the 8-word mix digest.
    cmix = []
    for i in range(0, len(mix), 4):
        cmix.append(_r1_fnv(_r1_fnv(_r1_fnv(mix[i], mix[i+1]), mix[i+2]), mix[i+3]))
    return {
        b"mix digest": _r1_serialize(cmix),
        b"result": _r1_serialize(_r1_sha3_256(s + cmix)),
    }

# ===========================================================================
# Round 2 — numpy ndarray (current implementation)
# ===========================================================================
# uint32 overflow in the FNV multiply is intentional; silence the warning.
np.seterr(over="ignore")

from ethereum.pow.ethash import (
    mkcache as r2_mkcache,
    hashimoto as _r2_hashimoto,
)
from ethereum.pow.ethash_utils import ethash_sha3_512 as _r2_sha3_512

_R2_FNV_PRIME = np.uint32(FNV_PRIME)


def r2_calc_dataset_item(cache: np.ndarray, i: int) -> np.ndarray:
    """R2: pure-Python numpy calc_dataset_item."""
    n = len(cache)
    r = HASH_BYTES // WORD_BYTES  # 16
    mix = cache[i % n].copy()
    mix[0] ^= i
    mix = _r2_sha3_512(mix)
    for j in range(DATASET_PARENTS):
        # int(...) lifts the numpy scalar to a Python int before the multiply;
        # the mask re-applies uint32 wraparound.
        cache_index = ((i ^ j) * FNV_PRIME ^ int(mix[j % r])) & 0xFFFFFFFF
        # Vectorized FNV over all 16 words at once (wraps modulo 2**32).
        mix *= _R2_FNV_PRIME
        mix ^= cache[cache_index % n]
    return _r2_sha3_512(mix)

def r2_hashimoto_light(full_size, cache, header, nonce):
    """R2: pure-Python hashimoto_light (numpy + pycryptodome keccak)."""
    return _r2_hashimoto(header, nonce, full_size, lambda x: r2_calc_dataset_item(cache, x))


# ===========================================================================
# Round 3 — numpy + Cython mix_parents
# ===========================================================================
try:
    from ethereum.pow.ethash_cy import mix_parents as _cy_mix_parents
    _has_cython = True
except ImportError:
    _has_cython = False


def r3_calc_dataset_item(cache, i):
    """R3: Cython inner loop (mix_parents only, Python sha3)."""
    n = len(cache)
    mix = cache[i % n].copy()
    mix[0] ^= i
    mix = _r2_sha3_512(mix)
    # Relies on mix_parents updating `mix` in place — its return value is
    # unused here.
    _cy_mix_parents(mix, cache, i)
    return _r2_sha3_512(mix)


def r3_hashimoto_light(full_size, cache, header, nonce):
    """R3: hashimoto using r3_calc_dataset_item (Cython mix_parents + 
Python sha3).""" + return _r2_hashimoto(header, nonce, full_size, lambda x: r3_calc_dataset_item(cache, x)) + + +# =========================================================================== +# Round 4 — full Cython + C keccak (no Python overhead in hot path) +# =========================================================================== +try: + from ethereum.pow.ethash_cy import ( + cy_calc_dataset_item as r4_calc_dataset_item, + cy_hashimoto_light as _r4_hashimoto_light_raw, + ) + _has_r4 = True +except ImportError: + _has_r4 = False + + +def r4_hashimoto_light(full_size, cache, header, nonce): + """R4: full Cython hashimoto_light. Adapts bytes args to uint8 arrays.""" + return _r4_hashimoto_light_raw( + full_size, cache, + np.frombuffer(header, dtype=np.uint8), + np.frombuffer(nonce, dtype=np.uint8), + ) + +# =========================================================================== +# Micro-benchmark helpers +# =========================================================================== +def _bench(func, args, rounds=200_000): + for _ in range(1000): + func(*args) + t0 = time.perf_counter() + for _ in range(rounds): + func(*args) + return time.perf_counter() - t0 + +def _row3(label, fns_and_args, N): + times = [_bench(fn, args, N) for fn, args in fns_and_args] + t0 = times[0] + cols = "".join(f"{t:>10.4f}" for t in times) + ratios = "".join(f"{t0/t:>8.1f}x" for t in times[1:]) + print(f"{label:<30}{cols}{ratios}") + +def _row_partial(label, fns_and_args, N): + times = [_bench(fn, args, N) if fn is not None else None + for fn, args in fns_and_args] + t0 = times[0] + cols = "".join(f"{t:>10.4f}" if t is not None else f"{'N/A':>10}" for t in times) + ratios = "".join( + f"{t0/t:>8.1f}x" if t is not None else f"{'N/A':>8}" + for t in times[1:] + ) + print(f"{label:<30}{cols}{ratios}") + +# =========================================================================== +# Main +# =========================================================================== +if __name__ == 
"__main__":
    # Test-sized parameters (cache=1KB, dataset=32KB) for fast iteration.
    CACHE_SIZE = 1024
    FULL_SIZE = 32 * 1024
    SEED = b"\x00" * 32
    HEADER = bytes.fromhex("c9149cc0386e689d789a1c2f3d5d169a61a6218ed30e74414dc736e442ef3d1f")
    NONCE = (0).to_bytes(8, byteorder="big")

    # ---- build caches ----
    print("Building caches...")
    t0 = time.perf_counter(); old_cache = old_mkcache(CACHE_SIZE, SEED); t_oc = time.perf_counter() - t0
    t0 = time.perf_counter(); r1_cache = r1_mkcache(CACHE_SIZE, SEED); t_mc = time.perf_counter() - t0
    # r2_mkcache takes a block number (0), not a seed — it derives the seed itself.
    t0 = time.perf_counter(); r2_cache = r2_mkcache(CACHE_SIZE, 0); t_nc = time.perf_counter() - t0
    print(f" mkcache old={t_oc*1000:.1f}ms R1={t_mc*1000:.1f}ms R2={t_nc*1000:.1f}ms "
          f"old/R1={t_oc/t_mc:.1f}x old/R2={t_oc/t_nc:.1f}x")

    # ---- correctness ----
    # All implementations must agree before any timing is reported; a
    # mismatch aborts the run via AssertionError.
    old_r = old_hashimoto_light(FULL_SIZE, old_cache, HEADER, NONCE)
    mid_r = r1_hashimoto_light(FULL_SIZE, r1_cache, HEADER, NONCE)
    new_r = r2_hashimoto_light(FULL_SIZE, r2_cache, HEADER, NONCE)
    assert old_r == mid_r, "old/R1 MISMATCH"
    assert old_r == new_r, "old/R2 MISMATCH"

    if _has_cython:
        for i in range(16):
            r2_item = r2_calc_dataset_item(r2_cache, i)
            r3_item = r3_calc_dataset_item(r2_cache, i)
            assert np.array_equal(r2_item, r3_item), f"R2/R3 mismatch at item {i}"
    if _has_r4:
        for i in range(16):
            r2_item = r2_calc_dataset_item(r2_cache, i)
            r4_item = r4_calc_dataset_item(r2_cache, i)
            assert np.array_equal(r2_item, r4_item), f"R2/R4 mismatch at item {i}"
        r4_r = r4_hashimoto_light(FULL_SIZE, r2_cache, HEADER, NONCE)
        assert old_r == r4_r, "old/R4 hashimoto MISMATCH"
    cy_tag = "OK" if _has_cython else "SKIP"
    r4_tag = "OK" if _has_r4 else "SKIP"
    print(f" result match=OK R3={cy_tag} R4={r4_tag} mix={old_r[b'mix digest'].hex()[:16]}...\n")

    # ---- calc_dataset_item breakdown ----
    N2 = 300
    print(f"calc_dataset_item x{N2} calls")

    t0 = time.perf_counter()
    for i in range(N2): old_calc_dataset_item(old_cache, i)
    t_old_i = time.perf_counter() - t0

    t0 = time.perf_counter()
    for i in range(N2): 
r1_calc_dataset_item(r1_cache, i) + t_mid_i = time.perf_counter() - t0 + + t0 = time.perf_counter() + for i in range(N2): r2_calc_dataset_item(r2_cache, i) + t_r2_i = time.perf_counter() - t0 + + print(f" old {t_old_i:.3f}s {t_old_i/N2*1000:.2f}ms/call") + print(f" R1 {t_mid_i:.3f}s {t_mid_i/N2*1000:.2f}ms/call old/R1={t_old_i/t_mid_i:.2f}x") + print(f" R2 {t_r2_i:.3f}s {t_r2_i/N2*1000:.2f}ms/call old/R2={t_old_i/t_r2_i:.2f}x", end="") + if _has_cython: + t0 = time.perf_counter() + for i in range(N2): r3_calc_dataset_item(r2_cache, i) + t_r3_i = time.perf_counter() - t0 + print(f"\n R3 {t_r3_i:.3f}s {t_r3_i/N2*1000:.2f}ms/call old/R3={t_old_i/t_r3_i:.2f}x R2/R3={t_r2_i/t_r3_i:.1f}x", end="") + else: + print("\n R3 (skipped — Cython extension not built)", end="") + if _has_r4: + t0 = time.perf_counter() + for i in range(N2): r4_calc_dataset_item(r2_cache, i) + t_r4_i = time.perf_counter() - t0 + print(f"\n R4 {t_r4_i:.3f}s {t_r4_i/N2*1000:.2f}ms/call old/R4={t_old_i/t_r4_i:.2f}x R3/R4={t_r3_i/t_r4_i:.1f}x") + else: + print("\n R4 (skipped — Cython R4 not built)") + + # ---- hashimoto_light benchmark ---- + N = 30 + print(f"\nhashimoto_light x{N} calls (cache=1KB, dataset=32KB)") + + for _ in range(2): + old_hashimoto_light(FULL_SIZE, old_cache, HEADER, NONCE) + r1_hashimoto_light(FULL_SIZE, r1_cache, HEADER, NONCE) + r2_hashimoto_light(FULL_SIZE, r2_cache, HEADER, NONCE) + + t0 = time.perf_counter() + for i in range(N): old_hashimoto_light(FULL_SIZE, old_cache, HEADER, i.to_bytes(8, "big")) + t_old = time.perf_counter() - t0 + + t0 = time.perf_counter() + for i in range(N): r1_hashimoto_light(FULL_SIZE, r1_cache, HEADER, i.to_bytes(8, "big")) + t_mid = time.perf_counter() - t0 + + # R2: pure Python hashimoto_light (always the _slow variant) + for _ in range(2): + r2_hashimoto_light(FULL_SIZE, r2_cache, HEADER, NONCE) + t0 = time.perf_counter() + for i in range(N): r2_hashimoto_light(FULL_SIZE, r2_cache, HEADER, i.to_bytes(8, "big")) + t_r2 = time.perf_counter() - t0 
+ + print(f" old {t_old:.3f}s {t_old/N*1000:.1f}ms/call") + print(f" R1 {t_mid:.3f}s {t_mid/N*1000:.1f}ms/call old/R1={t_old/t_mid:.2f}x") + print(f" R2 {t_r2:.3f}s {t_r2/N*1000:.1f}ms/call old/R2={t_old/t_r2:.2f}x", end="") + if _has_cython: + # R3: hashimoto_light with Cython mix_parents + Python sha3 + for _ in range(2): + r3_hashimoto_light(FULL_SIZE, r2_cache, HEADER, NONCE) + t0 = time.perf_counter() + for i in range(N): r3_hashimoto_light(FULL_SIZE, r2_cache, HEADER, i.to_bytes(8, "big")) + t_r3 = time.perf_counter() - t0 + print(f"\n R3 {t_r3:.3f}s {t_r3/N*1000:.1f}ms/call old/R3={t_old/t_r3:.2f}x R2/R3={t_r2/t_r3:.1f}x", end="") + else: + print("\n R3 (skipped — Cython extension not built)", end="") + if _has_r4: + # R4: full Cython + C keccak + for _ in range(2): + r4_hashimoto_light(FULL_SIZE, r2_cache, HEADER, NONCE) + t0 = time.perf_counter() + for i in range(N): r4_hashimoto_light(FULL_SIZE, r2_cache, HEADER, i.to_bytes(8, "big")) + t_r4 = time.perf_counter() - t0 + print(f"\n R4 {t_r4:.3f}s {t_r4/N*1000:.1f}ms/call old/R4={t_old/t_r4:.2f}x R3/R4={t_r3/t_r4:.1f}x") + else: + print("\n R4 (skipped — Cython R4 not built)") + + # ---- primitive micro-benchmarks (old vs R1 vs R2) ---- + NM = 200_000 + hash_list_16 = [i * 1000003 & 0xFFFFFFFF for i in range(16)] + hash_list_8 = [i * 1000003 & 0xFFFFFFFF for i in range(8)] + hash_bytes_64 = _old_serialize_hash(hash_list_16) + hash_bytes_32 = _old_serialize_hash(hash_list_8) + + def _r2_sha3_512_list(x): + if isinstance(x, list): + x = _FMT_16I.pack(*x) + return _r2_sha3_512(x) + + print(f"\nprimitive micro-benchmarks x{NM:,} rounds") + print(f"{'Function':<30} {'Old (s)':>10} {'R1 (s)':>10} {'R2 (s)':>10} {'old/R1':>8} {'old/R2':>8}") + print("-" * 82) + _row_partial("serialize_hash (16 ints)", + [(_old_serialize_hash, (hash_list_16,)), + (_r1_serialize, (hash_list_16,)), + (None, None)], NM) + _row_partial("serialize_hash (8 ints)", + [(_old_serialize_hash, (hash_list_8,)), + (_r1_serialize, 
(hash_list_8,)),
                  (None, None)], NM)
    _row_partial("deserialize_hash (64B)",
                 [(_old_deserialize_hash, (hash_bytes_64,)),
                  (_r1_deserialize, (hash_bytes_64,)),
                  (None, None)], NM)
    _row_partial("deserialize_hash (32B)",
                 [(_old_deserialize_hash, (hash_bytes_32,)),
                  (_r1_deserialize, (hash_bytes_32,)),
                  (None, None)], NM)
    _row_partial("fnv",
                 [(_old_fnv, (0xDEADBEEF, 0xCAFEBABE)),
                  (_r1_fnv, (0xDEADBEEF, 0xCAFEBABE)),
                  (None, None)], NM)
    _row3("ethash_sha3_512 (bytes)",
          [(_old_sha3_512, (hash_bytes_64,)),
           (_r1_sha3_512, (hash_bytes_64,)),
           (_r2_sha3_512, (hash_bytes_64,))], NM)
    _row3("ethash_sha3_512 (list)",
          [(_old_sha3_512, (hash_list_16,)),
           (_r1_sha3_512, (hash_list_16,)),
           (_r2_sha3_512_list, (hash_list_16,))], NM)

    # ---- check_pow end-to-end ----
    print("\ncheck_pow end-to-end (is_test=True)")
    from ethereum.pow.ethpow import check_pow
    _cp_header = b"\xca/\xf0l\xaa\xe7\xc9M\xc9h\xbe}v\xd0\xfb\xf6\r\xd2\xe1\x98\x9e\xe9\xbf\rY1\xe4\x85d\xd5\x14;"
    _cp_nonce = (44).to_bytes(8, byteorder="big")
    _cp_mix = bytes.fromhex("5dd318d2dff0aac95a3af5617db0bfb07eee8b0ab4a42f01d6161336be758106")
    N3 = 20
    # cache_clear() before each call defeats check_pow's lru-style cache so
    # every iteration pays the full verification cost.
    check_pow.cache_clear()
    check_pow(1, _cp_header, _cp_mix, _cp_nonce, 100, is_test=True)
    check_pow.cache_clear()
    t0 = time.perf_counter()
    for _ in range(N3):
        check_pow.cache_clear()
        check_pow(1, _cp_header, _cp_mix, _cp_nonce, 100, is_test=True)
    t_cp = time.perf_counter() - t0
    print(f" x{N3}: {t_cp:.4f}s ({t_cp/N3*1000:.1f}ms/call)")
diff --git a/ethereum/pow/tests/old_ethash.py b/ethereum/pow/tests/old_ethash.py
new file mode 100644
index 000000000..b77cb464b
--- /dev/null
+++ b/ethereum/pow/tests/old_ethash.py
@@ -0,0 +1,101 @@
"""
Original hex-based ethash implementation, preserved as a reference baseline
for tests and benchmarks.
"""

import copy

from eth_utils import encode_hex, decode_hex
from Crypto.Hash import keccak

# Ethash algorithm parameters.
WORD_BYTES = 4
HASH_BYTES = 64
MIX_BYTES = 128
ACCESSES = 64
DATASET_PARENTS = 256
CACHE_ROUNDS = 3
FNV_PRIME = 0x01000193


# Raw Keccak digests (ethash's "sha3" is pre-NIST Keccak, hence the keccak module).
def _sha3_256_raw(x): return keccak.new(digest_bits=256, data=x).digest()
def _sha3_512_raw(x): return keccak.new(digest_bits=512, data=x).digest()


def decode_int(s):
    # Little-endian bytes -> int via a hex round-trip (the "hex-based" style
    # this baseline preserves); empty input decodes to 0.
    return int(encode_hex(s[::-1]), 16) if s else 0


def encode_int(s):
    # int -> little-endian bytes; 0 encodes as b"".
    a = "%x" % s
    return b"" if s == 0 else decode_hex("0" * (len(a) % 2) + a)[::-1]


def serialize_hash(h):
    # Word list -> bytes, each word padded to 4 little-endian bytes.
    return b"".join([encode_int(x).ljust(4, b"\x00") for x in h])


def deserialize_hash(h):
    # Bytes -> list of uint32 words (little-endian).
    return [decode_int(h[i:i + WORD_BYTES]) for i in range(0, len(h), WORD_BYTES)]


def sha3_512(x):
    # keccak-512 over bytes or a word list; returns 16 words.
    if isinstance(x, list):
        x = serialize_hash(x)
    return deserialize_hash(_sha3_512_raw(x))


def sha3_256(x):
    # keccak-256; returns 8 words.
    if isinstance(x, list):
        x = serialize_hash(x)
    return deserialize_hash(_sha3_256_raw(x))


def fnv(v1, v2):
    # FNV-style mix, wrapped to 32 bits.
    return (v1 * FNV_PRIME ^ v2) % 2 ** 32


def mkcache(cache_size, seed):
    # Sequential keccak-512 chain over the seed, then CACHE_ROUNDS passes of
    # xor-and-rehash over pseudo-random row pairs.
    n = cache_size // HASH_BYTES
    o = [sha3_512(seed)]
    for i in range(1, n):
        o.append(sha3_512(o[-1]))
    for _ in range(CACHE_ROUNDS):
        for i in range(n):
            v = o[i][0] % n
            o[i] = sha3_512([a ^ b for a, b in zip(o[(i - 1 + n) % n], o[v])])
    return o


def calc_dataset_item(cache, i):
    # Dataset row i: seed from cache row i, fold in DATASET_PARENTS parent
    # rows chosen by FNV, then re-hash.
    n = len(cache)
    r = HASH_BYTES // WORD_BYTES
    mix = copy.copy(cache[i % n])
    mix[0] ^= i
    mix = sha3_512(mix)
    for j in range(DATASET_PARENTS):
        cache_index = fnv(i ^ j, mix[j % r])
        mix = list(map(fnv, mix, cache[cache_index % n]))
    return sha3_512(mix)


def hashimoto_light(full_size, cache, header, nonce):
    # Full hashimoto loop with dataset rows derived on demand from the cache.
    n = full_size // HASH_BYTES
    w = MIX_BYTES // WORD_BYTES
    mixhashes = MIX_BYTES // HASH_BYTES
    s = sha3_512(header + nonce[::-1])
    mix = []
    for _ in range(mixhashes):
        mix.extend(s)
    for i in range(ACCESSES):
        p = fnv(i ^ s[0], mix[i % w]) % (n // mixhashes) * 
mixhashes + newdata = [] + for j in range(mixhashes): + newdata.extend(calc_dataset_item(cache, p + j)) + mix = list(map(fnv, mix, newdata)) + cmix = [] + for i in range(0, len(mix), 4): + cmix.append(fnv(fnv(fnv(mix[i], mix[i+1]), mix[i+2]), mix[i+3])) + return { + b"mix digest": serialize_hash(cmix), + b"result": serialize_hash(sha3_256(s + cmix)), + } diff --git a/ethereum/pow/tests/test_ethash.py b/ethereum/pow/tests/test_ethash.py index d81c3bb69..e3a64dfd2 100644 --- a/ethereum/pow/tests/test_ethash.py +++ b/ethereum/pow/tests/test_ethash.py @@ -1,10 +1,23 @@ import unittest +import numpy as np + from ethereum.pow.ethash import mkcache, calc_dataset, hashimoto_light, hashimoto_full -from ethereum.pow.ethash_utils import EPOCH_LENGTH, HASH_BYTES, serialize_hash +from ethereum.pow.ethash_utils import EPOCH_LENGTH, HASH_BYTES from ethereum.pow.ethpow import EthashMiner, check_pow +class TestEthashUtils(unittest.TestCase): + """Test correctness of ethash_utils functions.""" + + def test_ethash_sha3_512_known_vector(self): + """ethash_sha3_512 with seed zero is stable across runs.""" + from ethereum.pow.ethash_utils import ethash_sha3_512 + seed = b"\x00" * 32 + result = ethash_sha3_512(seed) + self.assertEqual(ethash_sha3_512(seed).tobytes(), result.tobytes()) + + class TestEthash(unittest.TestCase): """Same test cases in go-ethereum.""" @@ -28,7 +41,7 @@ def test_cache_generation(self): for cache_size, epoch, expected_cache in testcases: block_number = epoch * EPOCH_LENGTH cache = mkcache(cache_size, block_number) - cache_hex = "".join(serialize_hash(ls).hex() for ls in cache) + cache_hex = "".join(row.tobytes().hex() for row in cache) self.assertEqual(cache_hex, expected_cache[2:]) def test_dataset_gen(self): @@ -45,7 +58,7 @@ def test_dataset_gen(self): block_number = epoch * EPOCH_LENGTH cache = mkcache(cache_size, block_number) dataset = calc_dataset(dataset_size, cache) - dataset_hex = "".join(serialize_hash(ls).hex() for ls in dataset) + dataset_hex = 
"".join(row.tobytes().hex() for row in dataset) self.assertEqual(dataset_hex, expected_dataset[2:]) def test_hashimoto(self): @@ -117,6 +130,44 @@ def test_ethash_mining(self): ) self.assertTrue(validity) + def test_cython_matches_python_fallback(self): + """numpy and Cython implementations both match the original hex-based baseline.""" + try: + from ethereum.pow.ethash_cy import cy_calc_dataset_item, cy_hashimoto_light + except ImportError: + self.skipTest("Cython extension not built") + + from ethereum.pow.ethash import calc_dataset_item, hashimoto + from ethereum.pow.tests import old_ethash + + old_cache = old_ethash.mkcache(1024, b"\x00" * 32) + new_cache = mkcache(1024, 0) + + # calc_dataset_item and cy_calc_dataset_item vs old baseline + for i in range(16): + baseline = old_ethash.serialize_hash(old_ethash.calc_dataset_item(old_cache, i)) + self.assertEqual( + calc_dataset_item(new_cache, i).tobytes(), baseline, + f"calc_dataset_item mismatch vs old at item {i}", + ) + self.assertEqual( + cy_calc_dataset_item(new_cache, i).tobytes(), baseline, + f"cy_calc_dataset_item mismatch vs old at item {i}", + ) + + # hashimoto_light: Python hashimoto vs Cython cy_hashimoto_light + header = bytes(32) + nonce = (0).to_bytes(8, byteorder="big") + full_size = 32 * 1024 + py_r = hashimoto(header, nonce, full_size, lambda x: calc_dataset_item(new_cache, x)) + cy_r = cy_hashimoto_light( + full_size, new_cache, + np.frombuffer(header, dtype=np.uint8), + np.frombuffer(nonce, dtype=np.uint8), + ) + self.assertEqual(py_r[b"mix digest"], cy_r[b"mix digest"]) + self.assertEqual(py_r[b"result"], cy_r[b"result"]) + def test_pyethash(self): header_hash = b"\xca/\xf0l\xaa\xe7\xc9M\xc9h\xbe}v\xd0\xfb\xf6\r\xd2\xe1\x98\x9e\xe9\xbf\rY1\xe4\x85d\xd5\x14;" for diff, expected_nonce in ((100, 34), (500, 78)): diff --git a/setup.py b/setup.py index 314502a2f..ae0742f67 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,31 @@ import os -from setuptools import setup +from setuptools import setup, 
Extension from setuptools.command.develop import develop +# Optional Cython extension: native inner loop for ethash calc_dataset_item. +# Built only if both Cython and numpy are importable at setup time. The +# Python implementation in ethereum/pow/ethash.py falls back transparently +# when the compiled module is not present. +ext_modules = [] +try: + from Cython.Build import cythonize + import numpy as _np + + ext_modules = cythonize( + [ + Extension( + "ethereum.pow.ethash_cy", + sources=["ethereum/pow/ethash_cy.pyx", "ethereum/pow/keccak_tiny.c"], + include_dirs=[_np.get_include(), "ethereum/pow"], + define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], + ) + ], + language_level=3, + ) +except ImportError: + pass + install_requires = set(x.strip() for x in open("requirements.txt")) install_requires_replacements = {} install_requires = [install_requires_replacements.get(r, r) for r in install_requires] @@ -46,4 +69,5 @@ def read(fname): install_requires=install_requires, python_requires=">=3.5", cmdclass={"develop": custom_develop}, + ext_modules=ext_modules, )