@@ -4,7 +4,7 @@ jupytext:
4
4
extension : .md
5
5
format_name : myst
6
6
format_version : 0.13
7
- jupytext_version : 1.17.2
7
+ jupytext_version : 1.17.1
8
8
kernelspec :
9
9
display_name : Python 3 (ipykernel)
10
10
language : python
@@ -57,6 +57,7 @@ from scipy.optimize import brentq, minimize_scalar
57
57
from scipy.stats import beta as beta_dist
58
58
import pandas as pd
59
59
from IPython.display import display, Math
60
+ import quantecon as qe
60
61
```
61
62
62
63
## Likelihood Ratio Process
@@ -1764,6 +1765,260 @@ From the figure above, we can see:
1764
1765
1765
1766
**Remark:** Think about how laws of large numbers are applied to compute error probabilities for the model selection problem and the classification problem.
1766
1767
1768
+ ## Special case: Markov chain models
1769
+
1770
+ Consider two $n$-state irreducible and aperiodic Markov chain models on the same state space $\{1, 2, \ldots, n\}$ with positive transition matrices $P^{(f)}$, $P^{(g)}$ and initial distributions $\pi_0^{(f)}$, $\pi_0^{(g)}$.
1771
+
1772
+ In this section, we assume nature chooses $f$.
1773
+
1774
+ For a sample path $(x_0, x_1, \ldots, x_T)$, let $N_{ij}$ count transitions from state $i$ to $j$.
1775
+
1776
+ The likelihood under model $m \in \{f, g\}$ is
1777
+
1778
+ $$
1779
+ L_T^{(m)} = \pi_ {0,x_0}^{(m)} \prod_ {i=1}^n \prod_ {j=1}^n \left(P_ {ij}^{(m)}\right)^{N_ {ij}}
1780
+ $$
1781
+
1782
+ Hence,
1783
+
1784
+ $$
1785
+ \log L_T^{(m)} =\log\pi_ {0,x_0}^{(m)} +\sum_ {i,j}N_ {ij}\log P_ {ij}^{(m)}
1786
+ $$
1787
+
1788
+ The log-likelihood ratio is
1789
+
1790
+ $$
1791
+ \log \frac{L_T^{(f)}}{L_T^{(g)}} = \log \frac{\pi_ {0,x_0}^{(f)}}{\pi_ {0,x_0}^{(g)}} + \sum_ {i,j}N_ {ij}\log \frac{P_ {ij}^{(f)}}{P_ {ij}^{(g)}}
1792
+ $$ (eq:llr_markov)
1793
+
1794
+ ### KL divergence rate
1795
+
1796
+ By the ergodic theorem for irreducible, aperiodic Markov chains, we have
1797
+
1798
+ $$
1799
+ \frac{N_ {ij}}{T} \xrightarrow{a.s.} \pi_i^{(f)}P_ {ij}^{(f)} \quad \text{as } T \to \infty
1800
+ $$
1801
+
1802
+ where $\boldsymbol{\pi}^{(f)}$ is the stationary distribution satisfying $\boldsymbol{\pi}^{(f)} = \boldsymbol{\pi}^{(f)} P^{(f)}$.
1803
+
1804
+ Therefore,
1805
+
1806
+ $$
1807
+ \frac{1}{T}\log \frac{L_T^{(f)}}{L_T^{(g)}} = \frac{1}{T}\log \frac{\pi_ {0,x_0}^{(f)}}{\pi_ {0,x_0}^{(g)}} + \frac{1}{T}\sum_ {i,j}N_ {ij}\log \frac{P_ {ij}^{(f)}}{P_ {ij}^{(g)}}
1808
+ $$
1809
+
1810
+ Taking the limit as $T \to \infty$, we have
1811
+ - The first term: $\frac{1}{T}\log \frac{\pi_{0,x_0}^{(f)}}{\pi_{0,x_0}^{(g)}} \to 0$
1812
+ - The second term: $\frac{1}{T}\sum_{i,j}N_{ij}\log \frac{P_{ij}^{(f)}}{P_{ij}^{(g)}} \xrightarrow{a.s.} \sum_{i,j}\pi_i^{(f)}P_{ij}^{(f)}\log \frac{P_{ij}^{(f)}}{P_{ij}^{(g)}}$
1813
+
1814
+ Define the **KL divergence rate** as
1815
+
1816
+ $$
1817
+ h_ {KL}(f, g) = \sum_ {i=1}^n \pi_i^{(f)} \underbrace{\sum_ {j=1}^n P_ {ij}^{(f)} \log \frac{P_ {ij}^{(f)}}{P_ {ij}^{(g)}}}_ {=: KL(P_ {i\cdot}^{(f)}, P_ {i\cdot}^{(g)})}
1818
+ $$
1819
+
1820
+ where $KL(P_{i\cdot}^{(f)}, P_{i\cdot}^{(g)})$ is the row-wise KL divergence.
1821
+
1822
+
1823
+ By the ergodic theorem, we have
1824
+
1825
+ $$
1826
+ \frac{1}{T}\log \frac{L_T^{(f)}}{L_T^{(g)}} \xrightarrow{a.s.} h_ {KL}(f, g) \quad \text{as } T \to \infty
1827
+ $$
1828
+
1829
+ Taking expectations and using the dominated convergence theorem, we obtain
1830
+
1831
+ $$
1832
+ \frac{1}{T}E_f\left[ \log \frac{L_T^{(f)}}{L_T^{(g)}}\right] \to h_ {KL}(f, g) \quad \text{as } T \to \infty
1833
+ $$
1834
+
1835
+ Here we invite readers to pause and compare this result with {eq}`eq:kl_likelihood_link`.
1836
+
1837
+ Let's confirm this in the simulation below.
1838
+
1839
+ ### Simulations
1840
+
1841
+ Let's implement simulations to illustrate these concepts with a three-state Markov chain.
1842
+
1843
+ We start by writing functions that compute the stationary distribution and the KL divergence rate for Markov chain models
1844
+
1845
+ ```{code-cell} ipython3
1846
def compute_stationary_dist(P):
    """
    Compute the stationary distribution of a transition matrix.

    Parameters
    ----------
    P : np.ndarray
        An (n, n) row-stochastic transition matrix.

    Returns
    -------
    np.ndarray
        The (n,) stationary distribution π satisfying π = π P,
        normalized to sum to 1.
    """

    eigenvalues, eigenvectors = np.linalg.eig(P.T)
    # A stochastic matrix always has eigenvalue 1; pick the eigenvalue
    # closest to 1 rather than the one of largest modulus, which can
    # select a complex or negative eigenvalue of modulus 1 (e.g. for a
    # periodic chain) and then yield a non-stationary eigenvector.
    idx = np.argmin(np.abs(eigenvalues - 1))
    stationary = np.real(eigenvectors[:, idx])
    # eig returns eigenvectors up to scale and sign; dividing by the sum
    # fixes both so the result is a probability distribution.
    return stationary / stationary.sum()
1855
+
1856
def markov_kl_divergence(P_f, P_g, pi_f):
    """
    Compute the KL divergence rate h_KL(f, g) between two Markov chains.

    Parameters
    ----------
    P_f, P_g : np.ndarray
        (n, n) transition matrices of models f and g.
    pi_f : np.ndarray
        (n,) stationary distribution of P_f.

    Returns
    -------
    float
        Sum over states i of pi_f[i] * KL(P_f[i, :], P_g[i, :]);
        np.inf when f allows a transition that g rules out.
    """
    # If f puts positive probability on a transition that g forbids,
    # the divergence rate is infinite.
    if np.any((P_f > 0) & (P_g == 0)):
        return np.inf

    n = P_f.shape[0]
    row_divergences = np.zeros(n)
    for i in range(n):
        # Row-wise KL divergence, restricted to transitions that both
        # models assign positive probability (0 * log 0 terms vanish).
        support = (P_f[i] > 0) & (P_g[i] > 0)
        row_divergences[i] = np.sum(
            P_f[i, support] * np.log(P_f[i, support] / P_g[i, support]))

    # Average the row-wise divergences under the stationary distribution
    return pi_f @ row_divergences
1872
+
1873
def simulate_markov_chain(P, pi_0, T, N_paths=1000):
    """
    Simulate N_paths sample paths of length T+1 from a Markov chain.

    Parameters
    ----------
    P : np.ndarray
        (n, n) transition matrix.
    pi_0 : np.ndarray
        (n,) initial distribution from which x_0 is drawn.
    T : int
        Number of transitions; each path contains T+1 states.
    N_paths : int, optional
        Number of independent sample paths.

    Returns
    -------
    np.ndarray
        (N_paths, T+1) integer array of state indices.
    """
    # state_values defaults to state indices, so no need to pass None
    mc = qe.MarkovChain(P)

    # Draw all initial states at once from pi_0
    initial_states = np.random.choice(len(P), size=N_paths, p=pi_0)

    # qe.MarkovChain.simulate accepts an array of initial states and
    # returns one path per entry, so the per-path Python loop and the
    # manual preallocation of `paths` are unnecessary.
    paths = mc.simulate(T+1, init=initial_states)

    return paths
1888
+
1889
+
1890
def compute_likelihood_ratio_markov(paths, P_f, P_g, π_0_f, π_0_g):
    """
    Compute the likelihood ratio process L_t = L_t^{(f)} / L_t^{(g)}
    along Markov chain sample paths.

    Parameters
    ----------
    paths : np.ndarray
        (N_paths, T+1) integer array of state indices.
    P_f, P_g : np.ndarray
        (n, n) transition matrices of models f and g.
    π_0_f, π_0_g : np.ndarray
        (n,) initial distributions of models f and g.

    Returns
    -------
    np.ndarray
        (N_paths, T+1) array whose column t holds L_t for each path.
    """
    n_paths, n_obs = paths.shape

    # Per-period ratio factors: the initial-state ratio at t = 0,
    # then one transition-probability ratio for each step t >= 1.
    step_ratios = np.empty((n_paths, n_obs))
    step_ratios[:, 0] = π_0_f[paths[:, 0]] / π_0_g[paths[:, 0]]

    prev_states = paths[:, :-1]
    next_states = paths[:, 1:]
    step_ratios[:, 1:] = (P_f[prev_states, next_states]
                          / P_g[prev_states, next_states])

    # The likelihood ratio process is the running product of the factors
    # (cumprod multiplies left to right, matching sequential updating).
    return np.cumprod(step_ratios, axis=1)
1909
+ ```
1910
+
1911
+ Now let's create an example with two different 3-state Markov chains
1912
+
1913
+ ```{code-cell} ipython3
1914
# Transition matrix of model f
P_f = np.array([[0.7, 0.2, 0.1],
                [0.3, 0.5, 0.2],
                [0.1, 0.3, 0.6]])

# Transition matrix of model g
P_g = np.array([[0.5, 0.3, 0.2],
                [0.2, 0.6, 0.2],
                [0.2, 0.2, 0.6]])

# Compute stationary distributions
π_f = compute_stationary_dist(P_f)
π_g = compute_stationary_dist(P_g)

print(f"Stationary distribution (f): {π_f}")
print(f"Stationary distribution (g): {π_g}")

# Compute KL divergence rate in each direction
# (KL divergence is not symmetric, so h(f, g) != h(g, f) in general)
kl_rate_fg = markov_kl_divergence(P_f, P_g, π_f)
kl_rate_gf = markov_kl_divergence(P_g, P_f, π_g)

print(f"\nKL divergence rate h(f, g): {kl_rate_fg:.4f}")
print(f"KL divergence rate h(g, f): {kl_rate_gf:.4f}")
1935
+ ```
1936
+
1937
+ We are now ready to simulate paths and visualize how likelihood ratios evolve.
1938
+
1939
+ We'll verify that $\frac{1}{T}E_f\left[\log \frac{L_T^{(f)}}{L_T^{(g)}}\right]$ converges to $h_{KL}(f, g)$ starting from the stationary distribution by plotting both the empirical average and the line predicted by the theory
1940
+
1941
+ ```{code-cell} ipython3
1942
# Simulate under nature = f, starting from f's stationary distribution
T = 500
N_paths = 1000
paths_from_f = simulate_markov_chain(P_f, π_f, T, N_paths)

# Likelihood ratio process L_t along each simulated path
L_ratios_f = compute_likelihood_ratio_markov(paths_from_f,
                                             P_f, P_g,
                                             π_f, π_g)

plt.figure(figsize=(10, 6))

# Plot individual log-likelihood-ratio paths
n_show = 50
for i in range(n_show):
    plt.plot(np.log(L_ratios_f[i, :]),
             alpha=0.3, color='blue', lw=0.8)

# Compute theoretical expectation: E_f[log L_T] grows linearly at
# slope h_KL(f, g) by the ergodic theorem
theory_line = kl_rate_fg * np.arange(T+1)
plt.plot(theory_line, 'k--', linewidth=2.5,
         label=r'$T \times h_{KL}(f,g)$')

# Compute empirical mean of log L_t across the simulated paths
avg_log_L = np.mean(np.log(L_ratios_f), axis=0)
plt.plot(avg_log_L, 'r-', linewidth=2.5,
         label='empirical average', alpha=0.5)

plt.axhline(y=0, color='gray',
            linestyle='--', alpha=0.5)
plt.xlabel(r'$T$')
plt.ylabel(r'$\log L_T$')
plt.title('nature = $f$')
plt.legend()
plt.show()
1975
+ ```
1976
+
1977
+ Let's examine how the model selection error probability depends on the sample size, using the same simulation strategy as in the previous section
1978
+
1979
+ ```{code-cell} ipython3
1980
def compute_selection_error(T_values, P_f, P_g, π_0_f, π_0_g, N_sim=1000):
    """
    Estimate the model-selection error probability for each sample size.

    The decision rule selects f when L_T >= 1 and g otherwise; with equal
    prior weight on the two models, the error probability is
    0.5 * P_f(choose g) + 0.5 * P_g(choose f).

    Parameters
    ----------
    T_values : iterable of int
        Sample sizes at which to estimate the error probability.
    P_f, P_g : np.ndarray
        (n, n) transition matrices of models f and g.
    π_0_f, π_0_g : np.ndarray
        (n,) initial distributions of models f and g.
    N_sim : int, optional
        Total number of simulated paths (split evenly across models).

    Returns
    -------
    np.ndarray
        Estimated error probability for each entry of T_values.
    """
    error_rates = []
    half = N_sim // 2

    for T in T_values:
        # Simulate half of the paths under each model
        paths_f = simulate_markov_chain(P_f, π_0_f, T, half)
        paths_g = simulate_markov_chain(P_g, π_0_g, T, half)

        # Likelihood ratio processes under each data-generating model
        L_f = compute_likelihood_ratio_markov(paths_f,
                                              P_f, P_g,
                                              π_0_f, π_0_g)
        L_g = compute_likelihood_ratio_markov(paths_g,
                                              P_f, P_g,
                                              π_0_f, π_0_g)

        # Type I error: reject f when f is true;
        # Type II error: select f when g is true
        type_1 = np.mean(L_f[:, -1] < 1)
        type_2 = np.mean(L_g[:, -1] >= 1)

        error_rates.append(0.5 * (type_1 + type_2))

    return np.array(error_rates)
2007
+
2008
# Compute error probabilities, starting each chain from its
# stationary distribution
T_values = np.arange(10, 201, 10)
errors = compute_selection_error(T_values,
                                 P_f, P_g,
                                 π_f, π_g)

# Plot results: error probability as a function of sample size T
plt.figure(figsize=(10, 6))
plt.plot(T_values, errors, linewidth=2)
plt.xlabel('$T$')
plt.ylabel('error probability')
plt.show()
2020
+ ```
2021
+
1767
2022
## Measuring discrepancies between distributions
1768
2023
1769
2024
A plausible guess is that the ability of a likelihood ratio to distinguish distributions $f$ and $g$ depends on how "different" they are.
2405
2660
2406
2661
```{solution-end}
2407
2662
```
2408
-
2409
-
2410
-
0 commit comments