# libraries
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns
from pymc.distributions.dist_math import binomln, logpow
'figure.dpi'] = 600
plt.rcParams['fivethirtyeight')
plt.style.use('axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams[= sns.color_palette("Set2")
pal
sns.set_palette(pal)
# hyperparameters
= 808
SEED = np.random.default_rng(SEED)
RNG
def augment_history(history):
'''Augment a capture history with all-zero histories.'''
= history.shape
animals_captured, T
# create M - n all zero histories
= M - animals_captured
zero_history_count = np.zeros((zero_history_count, T))
zero_history
# tack those on to the capture history
= np.row_stack((history, zero_history))
augmented
return augmented
def get_behavior_covariate(history):
# note the occasion when each individual was first seen
= (history != 0).argmax(axis=1)
first_seen
# create the covariate for the behavior effect
= np.zeros_like(history)
behavior_covariate for i, f in enumerate(first_seen):
+ 1):] = 1
behavior_covariate[i, (f
return behavior_covariate
def get_occasion_covariate(history):
= history.shape
_, T = []
l for t in range(T):
= np.zeros_like(history)
oc = 1
oc[:, t]
l.append(oc)
return np.stack(l, axis=2)
def sim_N(idata):
= az.extract(idata).psi.values
psi_samps = az.extract(idata).p.values
p_samps = (1 - p_samps)
not_p
if p_samps.ndim == 1:
= psi_samps * (not_p) ** T
p_included = RNG.binomial(M - n, p_included)
number_undetected
elif p_samps.ndim == 3:
= psi_samps * not_p.prod(axis=1)
p_included = RNG.binomial(1, p_included).sum(axis=0)
number_undetected
= n + number_undetected
N return N
# convert the dolphin capture history from '1001001' to array
= np.loadtxt('firth.txt', dtype=str)
dolphin = np.array([list(map(int, d)) for d in dolphin])
dolphin
# augment the capture history with all zero histories
= dolphin.shape
n, T = 500
M = augment_history(dolphin)
dolphin_augmented
# covariates for t and b
= get_occasion_covariate(dolphin_augmented)
occasion_covariate = get_behavior_covariate(dolphin_augmented) behavior_covariate
Model comparison
In this notebook, I demonstrate an approach to model selection in PyMC. To do so, follow the lead of King and Brooks (2008), although not nearly as elegantly. They demonstrate an approach to model selection for a typical suite of closed capture-recapture models. These include the effects of behavior \(b\), time \(t,\) and individual heterogeneity \(h\) on capture probabilities \(p\). The eight models considered here are combinations of the three: \(M_{0},\) \(M_{t},\) \(M_{b},\) \(M_{tb},\) \(M_{h},\) \(M_{th},\) \(M_{bh}\). The full model, \(M_{tbh}\), is
\[
\begin{equation}
\text{logit} \; p_{it} = \mu + \alpha_t + \beta x_{it} + \gamma_i,
\end{equation}
\] where \(\mu\) is the average catchability, \(\alpha_t\) is the effect of each occasion on catchability, \(\beta\) is the behavioral effect, \(x_{it}\) indicates whether the individual has been previously caught, and \(\gamma_i\) is the individual random effect such that \(\gamma_i \sim \text{Normal}(0,\sigma)\). Formulating the model this way makes the other models nested subsets of the full model.
Like King and Brooks (2008), I use the the Moray Firth bottlenose dolphin data as a motivating example. Wilson, Hammond, and Thompson (1999) detected \(n=56\) dolphins over the course of \(T=17\) boat surveys between May and September 1992. They generated the capture-recapture histories by way of photo-identification, which is near and dear to my heart (and my dissertation).
The discovery curve, the number of unique dolphins encountered as a function of the total number of dolphins encountered, may be flattening. This suggests that, at this point in the study, Wilson, Hammond, and Thompson (1999) may have encountered many of the unique individuals in the population.
# how many dolphins have been seen?
= dolphin.sum(axis=0).cumsum()
total_seen
# how many new dolphins have been seen?
= (dolphin != 0).argmax(axis=1)
first_seen = [sum(first_seen == t) for t in range(T)]
newbies = np.cumsum(newbies)
total_newbies
= plt.subplots(figsize=(5, 3.5))
fig, ax
ax.plot(total_seen, total_newbies)=0.2)
ax.fill_between(total_seen, total_newbies, alpha'Discovery curve')
ax.set_title('Total dolphins')
ax.set_xlabel('Unique dolphins')
ax.set_ylabel( plt.show()
Training each model
This notebook looks messier than the others, in that I train several models with little commentary along the way. In practice, it would probably be better to wrap these up into a function or a class. To complete the model, I used the following priors, \[
\begin{align}
\psi &\sim \text{Uniform}(0, 1)\\
\mu &\sim \text{Logistic}(0, 1) \\
\alpha_t &\sim \text{Normal}(0, \sigma_{\alpha}) \\
\beta &\sim \text{Normal}(0, \sigma_{\beta}) \\
\gamma_i &\sim \text{Normal}(0, \sigma_{\gamma}) \\
\sigma_{\alpha} &\sim \text{InverseGamma}(4, 3) \\
\sigma_{\beta} &\sim \text{InverseGamma}(4, 3) \\
\sigma_{\gamma} &\sim \text{InverseGamma}(4, 3),
\end{align}
\] which were also used by King and Brooks (2008). Although note that I used an informative \(\text{Beta}(1, 5)\) prior for \(\psi\) in the full model (see below). I use the same logp
seen in the occupancy and closed capture-recapture notebooks, which accounts for row-level zero-inflation. Unlike other notebooks, I did not look at the summaries or the trace plots unless the sampler indicated that it had issues during training.
Throughout the notebook, I use the nutpie sampler within PyMC. Nutpie is a NUTS sampler written in Rust, and is often faster than PyMC. Also, I have tweaked the sampling keyword arguments for each model, since they are a little finicky.
def logp(value, n, p, psi):
= binomln(n, value) + logpow(p, value) + logpow(1 - p, n - value)
binom = pm.math.sum(binom, axis=1)
bin_sum = pm.math.exp(bin_sum)
bin_exp
= pm.math.switch(
res sum(axis=1) > 0,
value.* psi,
bin_exp * psi + (1 - psi)
bin_exp
)
return pm.math.log(res)
with pm.Model() as m0:
# Priors
# inclusion
= pm.Uniform('psi', 0, 1)
psi
# mean catchability
= pm.Logistic('mu', 0, 1)
mu
# Linear model
= (np.ones((T, M)) * mu).T
mu_matrix = pm.Deterministic('p', pm.math.invlogit(mu_matrix))
p
# Likelihood
pm.CustomDist('y',
1,
p,
psi,=logp,
logp=dolphin_augmented
observed
)
pm.model_to_graphviz(m0)
with m0:
= pm.sample(nuts_sampler='nutpie') m0_idata
Sampler Progress
Total Chains: 4
Active Chains: 0
Finished Chains: 4
Sampling for now
Estimated Time to Completion: now
Progress | Draws | Divergences | Step Size | Gradients/Draw |
---|---|---|---|---|
2000 | 0 | 1.11 | 3 | |
2000 | 0 | 1.13 | 3 | |
2000 | 0 | 1.14 | 3 | |
2000 | 0 | 1.13 | 7 |
with pm.Model() as mt:
# Priors
# inclusion
= pm.Uniform('psi', 0, 1)
psi
# mean catchability
= pm.Logistic('mu', 0, 1)
mu
# time effect
= pm.InverseGamma('sigma_alpha', 4, 3)
sigma_alpha = pm.Normal('alpha', 0, tau=1/sigma_alpha, shape=T)
alpha
# Linear model
= mu + pm.math.dot(occasion_covariate, alpha)
nu = pm.Deterministic('p', pm.math.invlogit(nu))
p
# Likelihood
pm.CustomDist('y',
1,
p,
psi,=logp,
logp=dolphin_augmented
observed
)
pm.model_to_graphviz(mt)
with mt:
= pm.sample(nuts_sampler='nutpie') mt_idata
Sampler Progress
Total Chains: 4
Active Chains: 0
Finished Chains: 4
Sampling for 30 seconds
Estimated Time to Completion: now
Progress | Draws | Divergences | Step Size | Gradients/Draw |
---|---|---|---|---|
2000 | 0 | 0.63 | 7 | |
2000 | 0 | 0.63 | 7 | |
2000 | 0 | 0.65 | 7 | |
2000 | 0 | 0.61 | 7 |
with pm.Model() as mb:
# Priors
# inclusion
= pm.Uniform('psi', 0, 1)
psi
# mean catchability
= pm.Logistic('mu', 0, 1)
mu
# behavior effect
= pm.InverseGamma('sigma_beta', 4, 3)
sigma_beta = pm.Normal('beta', 0, tau=1/sigma_beta)
beta
# Linear model
= mu + behavior_covariate * beta
nu = pm.Deterministic('p', pm.math.invlogit(nu))
p
# Likelihood
pm.CustomDist('y',
1,
p,
psi,=logp,
logp=dolphin_augmented
observed
)
pm.model_to_graphviz(mb)
with mb:
= pm.sample(nuts_sampler='nutpie') mb_idata
Sampler Progress
Total Chains: 4
Active Chains: 0
Finished Chains: 4
Sampling for now
Estimated Time to Completion: now
Progress | Draws | Divergences | Step Size | Gradients/Draw |
---|---|---|---|---|
2000 | 0 | 0.80 | 3 | |
2000 | 0 | 0.84 | 3 | |
2000 | 0 | 0.79 | 7 | |
2000 | 0 | 0.83 | 3 |
with pm.Model() as mtb:
# Priors
# inclusion
= pm.Uniform('psi', 0, 1)
psi
# mean catchability
= pm.Logistic('mu', 0, 1)
mu
# time effect
= pm.InverseGamma('sigma_alpha', 4, 3)
sigma_alpha = pm.Normal('alpha', 0, tau=1/sigma_alpha, shape=T)
alpha
# behavior effect
= pm.InverseGamma('sigma_beta', 4, 3)
sigma_beta = pm.Normal('beta', 0, tau=1/sigma_beta)
beta
# Linear model
= mu + pm.math.dot(occasion_covariate, alpha) + behavior_covariate * beta
nu = pm.Deterministic('p', pm.math.invlogit(nu))
p
# Likelihood
pm.CustomDist('y',
1,
p,
psi,=logp,
logp=dolphin_augmented
observed
)
pm.model_to_graphviz(mtb)
with mtb:
= pm.sample(nuts_sampler='nutpie') mtb_idata
Sampler Progress
Total Chains: 4
Active Chains: 0
Finished Chains: 4
Sampling for 37 seconds
Estimated Time to Completion: now
Progress | Draws | Divergences | Step Size | Gradients/Draw |
---|---|---|---|---|
2000 | 0 | 0.56 | 7 | |
2000 | 0 | 0.55 | 7 | |
2000 | 0 | 0.57 | 7 | |
2000 | 0 | 0.54 | 7 |
with pm.Model() as mh:
# Priors
# inclusion
= pm.Uniform('psi', 0, 1)
psi
# mean catchability
= pm.Logistic('mu', 0, 1)
mu
# individual effect
= pm.InverseGamma('sigma_gamma', 4, 3)
sigma_gamma = pm.Normal('gamma', 0, tau=1/sigma_gamma, shape=M)
gamma
# Linear model
= (np.ones((T, M)) * gamma).T
individual_effect = mu + individual_effect
nu = pm.Deterministic('p', pm.math.invlogit(nu))
p
# Likelihood
pm.CustomDist('y',
1,
p,
psi,=logp,
logp=dolphin_augmented
observed
)
pm.model_to_graphviz(mh)
with mh:
= pm.sample(3000, target_accept=0.99, nuts_sampler='nutpie') mh_idata
Sampler Progress
Total Chains: 4
Active Chains: 0
Finished Chains: 4
Sampling for 34 seconds
Estimated Time to Completion: now
Progress | Draws | Divergences | Step Size | Gradients/Draw |
---|---|---|---|---|
4000 | 0 | 0.08 | 63 | |
4000 | 0 | 0.07 | 31 | |
4000 | 0 | 0.09 | 31 | |
4000 | 0 | 0.08 | 63 |
=['psi', 'mu', 'sigma_gamma']) az.summary(mh_idata, var_names
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
psi | 0.185 | 0.039 | 0.122 | 0.257 | 0.002 | 0.001 | 589.0 | 912.0 | 1.01 |
mu | -2.788 | 0.329 | -3.432 | -2.241 | 0.018 | 0.013 | 376.0 | 572.0 | 1.01 |
sigma_gamma | 0.775 | 0.351 | 0.291 | 1.453 | 0.024 | 0.017 | 213.0 | 419.0 | 1.03 |
=(8, 6), var_names=['psi', 'mu', 'sigma_gamma']); az.plot_trace(mh_idata, figsize
with pm.Model() as mth:
# Priors
# inclusion
= pm.Beta('psi', 1, 1)
psi
# mean catchability
= pm.Logistic('mu', 0, 1)
mu
# time effect
= pm.InverseGamma('sigma_alpha', 4, 3)
sigma_alpha = pm.Normal('alpha', 0, tau=1/sigma_alpha, shape=T)
alpha
# individual effect
= pm.InverseGamma('sigma_gamma', 4, 3)
sigma_gamma = pm.Normal('gamma', 0, tau=1/sigma_gamma, shape=M)
gamma
# Linear model
= (np.ones((T, M)) * gamma).T
individual_effect = mu + pm.math.dot(occasion_covariate, alpha) + individual_effect
nu = pm.Deterministic('p', pm.math.invlogit(nu))
p
# Likelihood
pm.CustomDist('y',
1,
p,
psi,=logp,
logp=dolphin_augmented
observed
)
pm.model_to_graphviz(mth)
with mth:
= pm.sample(draws=3000, target_accept=0.95, nuts_sampler='nutpie') mth_idata
Sampler Progress
Total Chains: 4
Active Chains: 0
Finished Chains: 4
Sampling for 3 minutes
Estimated Time to Completion: now
Progress | Draws | Divergences | Step Size | Gradients/Draw |
---|---|---|---|---|
4000 | 0 | 0.16 | 31 | |
4000 | 0 | 0.17 | 31 | |
4000 | 0 | 0.17 | 31 | |
4000 | 0 | 0.14 | 15 |
=['psi', 'mu', 'sigma_alpha', 'sigma_gamma', 'alpha']) az.summary(mth_idata, var_names
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
psi | 0.182 | 0.041 | 0.120 | 0.251 | 0.002 | 0.001 | 543.0 | 1020.0 | 1.01 |
mu | -3.058 | 0.437 | -3.873 | -2.308 | 0.021 | 0.015 | 450.0 | 951.0 | 1.00 |
sigma_alpha | 0.923 | 0.356 | 0.381 | 1.577 | 0.004 | 0.003 | 7989.0 | 7805.0 | 1.00 |
sigma_gamma | 0.899 | 0.425 | 0.293 | 1.644 | 0.028 | 0.020 | 212.0 | 512.0 | 1.01 |
alpha[0] | -1.095 | 0.601 | -2.230 | 0.033 | 0.006 | 0.005 | 10515.0 | 8318.0 | 1.00 |
alpha[1] | 0.570 | 0.403 | -0.178 | 1.336 | 0.005 | 0.004 | 5931.0 | 7290.0 | 1.00 |
alpha[2] | -0.802 | 0.562 | -1.829 | 0.296 | 0.006 | 0.005 | 10454.0 | 7723.0 | 1.00 |
alpha[3] | 0.570 | 0.409 | -0.199 | 1.342 | 0.005 | 0.004 | 5787.0 | 7480.0 | 1.00 |
alpha[4] | 0.448 | 0.411 | -0.325 | 1.207 | 0.006 | 0.004 | 5561.0 | 7430.0 | 1.00 |
alpha[5] | 0.782 | 0.403 | 0.026 | 1.536 | 0.006 | 0.004 | 5355.0 | 6868.0 | 1.00 |
alpha[6] | 0.177 | 0.441 | -0.664 | 0.980 | 0.005 | 0.004 | 6445.0 | 8379.0 | 1.00 |
alpha[7] | 0.024 | 0.456 | -0.845 | 0.870 | 0.006 | 0.004 | 5890.0 | 6913.0 | 1.00 |
alpha[8] | -0.804 | 0.557 | -1.832 | 0.235 | 0.006 | 0.005 | 8067.0 | 7704.0 | 1.00 |
alpha[9] | 0.025 | 0.457 | -0.841 | 0.860 | 0.005 | 0.004 | 7612.0 | 7826.0 | 1.00 |
alpha[10] | 1.153 | 0.377 | 0.429 | 1.842 | 0.006 | 0.004 | 4699.0 | 6330.0 | 1.00 |
alpha[11] | -0.332 | 0.494 | -1.230 | 0.608 | 0.006 | 0.004 | 7986.0 | 7561.0 | 1.00 |
alpha[12] | -1.109 | 0.609 | -2.270 | 0.000 | 0.007 | 0.006 | 9351.0 | 6756.0 | 1.00 |
alpha[13] | -0.147 | 0.470 | -1.047 | 0.713 | 0.006 | 0.004 | 7172.0 | 7546.0 | 1.00 |
alpha[14] | -1.106 | 0.608 | -2.253 | -0.003 | 0.006 | 0.005 | 9200.0 | 8052.0 | 1.00 |
alpha[15] | 1.608 | 0.360 | 0.960 | 2.326 | 0.005 | 0.004 | 4561.0 | 7334.0 | 1.00 |
alpha[16] | -0.817 | 0.561 | -1.884 | 0.225 | 0.006 | 0.005 | 9633.0 | 6540.0 | 1.00 |
=(8, 10),
az.plot_trace(mth_idata, figsize=['psi', 'mu', 'sigma_alpha', 'sigma_gamma', 'alpha']); var_names
with pm.Model() as mbh:
# Priors
# inclusion
= pm.Beta('psi', 1, 1)
psi
# mean catchability
= pm.Logistic('mu', 0, 1)
mu
# behavior effect
= pm.InverseGamma('sigma_beta', 4, 3)
sigma_beta = pm.Normal('beta', 0, tau=1/sigma_beta)
beta
# individual effect
= pm.InverseGamma('sigma_gamma', 4, 3)
sigma_gamma = pm.Normal('gamma', 0, tau=1/sigma_gamma, shape=M)
gamma
# Linear model
= (np.ones((T, M)) * gamma).T
individual_effect = mu + behavior_covariate * beta + individual_effect
nu = pm.Deterministic('p', pm.math.invlogit(nu))
p
# Likelihood
pm.CustomDist('y',
1,
p,
psi,=logp,
logp=dolphin_augmented
observed
)
pm.model_to_graphviz(mbh)
with mbh:
= pm.sample(draws=3000, target_accept=0.95, nuts_sampler='nutpie') mbh_idata
Sampler Progress
Total Chains: 4
Active Chains: 0
Finished Chains: 4
Sampling for 2 minutes
Estimated Time to Completion: now
Progress | Draws | Divergences | Step Size | Gradients/Draw |
---|---|---|---|---|
4000 | 0 | 0.15 | 31 | |
4000 | 0 | 0.15 | 31 | |
4000 | 0 | 0.15 | 31 | |
4000 | 0 | 0.15 | 31 |
=['psi', 'mu', 'beta', 'sigma_gamma']) az.summary(mbh_idata, var_names
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
psi | 0.591 | 0.188 | 0.293 | 0.958 | 0.009 | 0.006 | 454.0 | 1087.0 | 1.01 |
mu | -3.628 | 0.560 | -4.679 | -2.624 | 0.031 | 0.022 | 333.0 | 766.0 | 1.01 |
beta | -1.560 | 0.252 | -2.015 | -1.061 | 0.005 | 0.003 | 2754.0 | 4288.0 | 1.00 |
sigma_gamma | 2.300 | 0.740 | 1.052 | 3.723 | 0.043 | 0.030 | 291.0 | 612.0 | 1.01 |
=(8, 10),
az.plot_trace(mbh_idata, figsize=['psi', 'mu', 'beta', 'sigma_beta', 'sigma_gamma']); var_names
with pm.Model() as mtbh:
# Priors
# inclusion
= pm.Beta('psi', 1, 5)
psi
# mean catchability
= pm.Logistic('mu', 0, 1)
mu
# time effect
= pm.InverseGamma('sigma_alpha', 4, 3)
sigma_alpha = pm.Normal('alpha', 0, tau=1/sigma_alpha, shape=T)
alpha
# behavior effect
= pm.InverseGamma('sigma_beta', 4, 3)
sigma_beta = pm.Normal('beta', 0, tau=1/sigma_beta)
beta
# individual effect
= pm.InverseGamma('sigma_gamma', 4, 3)
sigma_gamma = pm.Normal('gamma', 0, tau=1/sigma_gamma, shape=M)
gamma
# Linear model
= (np.ones((T, M)) * gamma).T
individual_effect = mu + pm.math.dot(occasion_covariate, alpha) + behavior_covariate * beta + individual_effect
nu = pm.Deterministic('p', pm.math.invlogit(nu))
p
# Likelihood
pm.CustomDist('y',
1,
p,
psi,=logp,
logp=dolphin_augmented
observed
)
pm.model_to_graphviz(mtbh)
with mtbh:
= pm.sample(draws=2000, nuts_sampler='nutpie') mtbh_idata
Sampler Progress
Total Chains: 4
Active Chains: 0
Finished Chains: 4
Sampling for a minute
Estimated Time to Completion: now
Progress | Draws | Divergences | Step Size | Gradients/Draw |
---|---|---|---|---|
3000 | 0 | 0.30 | 15 | |
3000 | 0 | 0.29 | 15 | |
3000 | 0 | 0.29 | 15 | |
3000 | 0 | 0.29 | 15 |
az.summary(mtbh_idata, =['psi', 'mu', 'alpha', 'beta', 'sigma_alpha', 'sigma_beta', 'sigma_gamma']) var_names
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
psi | 0.712 | 0.100 | 0.528 | 0.897 | 0.002 | 0.001 | 2553.0 | 4215.0 | 1.00 |
mu | -3.060 | 0.479 | -3.931 | -2.133 | 0.019 | 0.013 | 645.0 | 1313.0 | 1.00 |
alpha[0] | -3.133 | 0.729 | -4.568 | -1.855 | 0.015 | 0.010 | 2591.0 | 3674.0 | 1.00 |
alpha[1] | -0.675 | 0.525 | -1.640 | 0.313 | 0.012 | 0.008 | 1992.0 | 3155.0 | 1.00 |
alpha[2] | -1.701 | 0.658 | -2.897 | -0.440 | 0.012 | 0.008 | 3312.0 | 4018.0 | 1.00 |
alpha[3] | -0.055 | 0.491 | -0.991 | 0.865 | 0.010 | 0.007 | 2333.0 | 4005.0 | 1.00 |
alpha[4] | 0.211 | 0.483 | -0.710 | 1.095 | 0.009 | 0.006 | 2818.0 | 4354.0 | 1.00 |
alpha[5] | 0.834 | 0.459 | -0.045 | 1.663 | 0.009 | 0.006 | 2529.0 | 3670.0 | 1.00 |
alpha[6] | 0.376 | 0.505 | -0.551 | 1.353 | 0.009 | 0.006 | 3085.0 | 3796.0 | 1.00 |
alpha[7] | 0.372 | 0.530 | -0.659 | 1.338 | 0.009 | 0.006 | 3420.0 | 4718.0 | 1.00 |
alpha[8] | -0.563 | 0.664 | -1.796 | 0.674 | 0.009 | 0.007 | 5469.0 | 4714.0 | 1.00 |
alpha[9] | 0.467 | 0.520 | -0.552 | 1.402 | 0.009 | 0.006 | 3226.0 | 4124.0 | 1.00 |
alpha[10] | 1.657 | 0.435 | 0.887 | 2.508 | 0.010 | 0.007 | 2000.0 | 3439.0 | 1.00 |
alpha[11] | 0.137 | 0.580 | -0.996 | 1.179 | 0.009 | 0.007 | 4033.0 | 4856.0 | 1.00 |
alpha[12] | -0.892 | 0.758 | -2.323 | 0.551 | 0.010 | 0.009 | 6313.0 | 4271.0 | 1.00 |
alpha[13] | 0.384 | 0.539 | -0.639 | 1.384 | 0.009 | 0.006 | 3577.0 | 4661.0 | 1.00 |
alpha[14] | -0.874 | 0.760 | -2.373 | 0.476 | 0.010 | 0.009 | 6301.0 | 4217.0 | 1.00 |
alpha[15] | 2.237 | 0.419 | 1.483 | 3.042 | 0.010 | 0.007 | 1762.0 | 3599.0 | 1.00 |
alpha[16] | -0.264 | 0.675 | -1.542 | 0.994 | 0.009 | 0.008 | 5795.0 | 5445.0 | 1.00 |
beta | -3.293 | 0.393 | -4.026 | -2.550 | 0.009 | 0.007 | 1739.0 | 3061.0 | 1.00 |
sigma_alpha | 1.567 | 0.601 | 0.668 | 2.669 | 0.009 | 0.007 | 4221.0 | 4797.0 | 1.00 |
sigma_beta | 2.418 | 1.549 | 0.654 | 5.047 | 0.021 | 0.017 | 6836.0 | 5326.0 | 1.00 |
sigma_gamma | 2.707 | 0.679 | 1.540 | 3.949 | 0.041 | 0.029 | 270.0 | 674.0 | 1.02 |
=(8,14),
az.plot_trace(mtbh_idata, figsize=['psi', 'mu', 'alpha', 'beta', 'sigma_alpha', 'sigma_beta', 'sigma_gamma']); var_names
The trace plots and summary statistics show convergence issues for many of the individual heterogeneity models. The variance parameter, \(\sigma_{\gamma},\) seems to sample poorly. Further, models with both behavioral and individual effects lead to extremely large estimates of \(\psi\). This appears to happen regardless of the size of the data augmentation \(M.\)
Note that I upped the target_accept
value for some models. This slows the sampler, but lowers the risk of divergence.
Model comparison
Next, I select a model for inference using an approximation of leave-one-out (loo) cross-validation (Vehtari, Gelman, and Gabry 2017). This approximation can be calculated using PyMC. To do so, I calculate the log-likelihood for each model, which is added to the InferenceData
object. This makes it possible to compare the models using loo and az.compare
.
with m0:
pm.compute_log_likelihood(m0_idata)
with mt:
pm.compute_log_likelihood(mt_idata)
with mb:
pm.compute_log_likelihood(mb_idata)
with mtb:
pm.compute_log_likelihood(mtb_idata)
with mh:
pm.compute_log_likelihood(mh_idata)
with mth:
pm.compute_log_likelihood(mth_idata)
with mbh:
pm.compute_log_likelihood(mbh_idata)
with mtbh:
pm.compute_log_likelihood(mtbh_idata)
= {"m0": m0_idata, "mt": mt_idata, "mb": mb_idata,
model_dict "mtb": mtb_idata, "mh": mh_idata, "mth": mth_idata,
"mbh": mbh_idata, "mtbh": mtbh_idata}
= az.compare(model_dict) comparison
/Users/philtpatton/miniforge3/envs/pymc/lib/python3.12/site-packages/arviz/stats/stats.py:795: UserWarning: Estimated shape parameter of Pareto distribution is greater than 0.70 for one or more samples. You should consider using a more robust model, this is because importance sampling is less likely to work well if the marginal posterior and LOO posterior are very different. This is more likely to happen with a non-robust model and highly influential observations.
warnings.warn(
/Users/philtpatton/miniforge3/envs/pymc/lib/python3.12/site-packages/arviz/stats/stats.py:795: UserWarning: Estimated shape parameter of Pareto distribution is greater than 0.70 for one or more samples. You should consider using a more robust model, this is because importance sampling is less likely to work well if the marginal posterior and LOO posterior are very different. This is more likely to happen with a non-robust model and highly influential observations.
warnings.warn(
/Users/philtpatton/miniforge3/envs/pymc/lib/python3.12/site-packages/arviz/stats/stats.py:795: UserWarning: Estimated shape parameter of Pareto distribution is greater than 0.70 for one or more samples. You should consider using a more robust model, this is because importance sampling is less likely to work well if the marginal posterior and LOO posterior are very different. This is more likely to happen with a non-robust model and highly influential observations.
warnings.warn(
/Users/philtpatton/miniforge3/envs/pymc/lib/python3.12/site-packages/arviz/stats/stats.py:795: UserWarning: Estimated shape parameter of Pareto distribution is greater than 0.70 for one or more samples. You should consider using a more robust model, this is because importance sampling is less likely to work well if the marginal posterior and LOO posterior are very different. This is more likely to happen with a non-robust model and highly influential observations.
warnings.warn(
/Users/philtpatton/miniforge3/envs/pymc/lib/python3.12/site-packages/arviz/stats/stats.py:795: UserWarning: Estimated shape parameter of Pareto distribution is greater than 0.70 for one or more samples. You should consider using a more robust model, this is because importance sampling is less likely to work well if the marginal posterior and LOO posterior are very different. This is more likely to happen with a non-robust model and highly influential observations.
warnings.warn(
The comparison tools notes issues with several of the models, suggesting a lack of robustness. Inspection of the comparison table shows that the struggling models all include the individual effect \(h.\) A more thorough analysis would consider reparameterizing the model, e.g., through the non-centered parameterization. In lieu of that, I simply discard the models that fail this test and re-do the comparison with the passing models.
round(2) comparison.
rank | elpd_loo | p_loo | elpd_diff | weight | se | dse | warning | scale | |
---|---|---|---|---|---|---|---|---|---|
mtbh | 0 | -431.31 | 87.25 | 0.00 | 0.98 | 57.35 | 0.00 | True | log |
mth | 1 | -489.44 | 39.53 | 58.13 | 0.00 | 58.98 | 10.98 | True | log |
mtb | 2 | -493.57 | 22.01 | 62.27 | 0.00 | 62.09 | 10.93 | True | log |
mt | 3 | -493.85 | 14.33 | 62.54 | 0.00 | 59.55 | 12.28 | False | log |
mbh | 4 | -494.30 | 71.50 | 63.00 | 0.00 | 60.74 | 11.12 | True | log |
mh | 5 | -518.38 | 25.46 | 87.07 | 0.02 | 61.46 | 15.14 | True | log |
mb | 6 | -521.40 | 5.22 | 90.09 | 0.00 | 63.10 | 15.03 | False | log |
m0 | 7 | -522.51 | 2.45 | 91.20 | 0.00 | 62.09 | 15.92 | False | log |
= {"m0": m0_idata, "mt": mt_idata, "mb": mb_idata, "mtb": mtb_idata}
good_dict = az.compare(good_dict)
good_comparison round(2) good_comparison.
/Users/philtpatton/miniforge3/envs/pymc/lib/python3.12/site-packages/arviz/stats/stats.py:795: UserWarning: Estimated shape parameter of Pareto distribution is greater than 0.70 for one or more samples. You should consider using a more robust model, this is because importance sampling is less likely to work well if the marginal posterior and LOO posterior are very different. This is more likely to happen with a non-robust model and highly influential observations.
warnings.warn(
rank | elpd_loo | p_loo | elpd_diff | weight | se | dse | warning | scale | |
---|---|---|---|---|---|---|---|---|---|
mtb | 0 | -493.57 | 22.01 | 0.00 | 0.59 | 62.09 | 0.00 | True | log |
mt | 1 | -493.85 | 14.33 | 0.28 | 0.41 | 59.55 | 6.67 | False | log |
mb | 2 | -521.40 | 5.22 | 27.83 | 0.00 | 63.10 | 8.54 | False | log |
m0 | 3 | -522.51 | 2.45 | 28.93 | 0.00 | 62.09 | 10.26 | False | log |
=(5, 4)); az.plot_compare(good_comparison, figsize
The comparison shows that all of the model weight belongs to two models: \(M_t\) and \(M_{tb}.\)
Model averaged predictions
Finally, we can use the model weights to simulate a weighted posterior of \(N.\) To do so, I take a weighted sample of each of the posteriors of \(N,\) with the weight dictated by the comparison tool.
= [sim_N(good_dict[model]) for model in good_dict]
posteriors = [good_comparison.loc[model].weight for model in good_dict]
weights = len(posteriors[0])
sample_count
= []
l for w, p in zip(weights, posteriors):
= RNG.choice(p, size=int(w * sample_count))
weighted_sample
l.append(weighted_sample)
= np.concatenate(l)
weighted_posterior
= plt.subplots(2, 1, figsize=(7, 6), sharex=True, sharey=True, tight_layout=True)
fig, (ax0, ax1)
= sns.color_palette("Set2")
pal
# labs = [k for k in good_dict.keys()]
= [r'$M_{0}$', r'$M_{t}$', r'$M_{b}$', r'$M_{tb}$']
labs for i, p in enumerate(posteriors):
=pal[i], edgecolor='white', bins=60, alpha=0.6, label=labs[i])
ax0.hist(p, color
r'Posteriors of $N$')
ax0.set_title(# ax1.set_title(r'Weighted posterior')
53, 150))
ax0.set_xlim((
ax0.legend()
'Number of samples')
ax0.set_ylabel('Number of samples')
ax1.set_ylabel(
='white', bins=60, alpha=0.9, color=pal[6], label='Weighted')
ax1.hist(weighted_posterior, edgecolor
ax1.legend()
plt.show()
We can also look at the posterior densities of \(p\) from Model \(M_t,\) the second most weighted model.
= az.extract(mt_idata).p.mean(axis=0)
p_samps
= plt.subplots(figsize=(6, 4))
fig, ax
= 0.4
a # ax[0].set_title("Poisson")
= sns.color_palette('viridis', T)
pal for t in range(T):
= t % 2
label_idx if label_idx == 0:
=ax, color=pal[t], label=f'$t_{{{t}}}$',
az.plot_dist(p_samps[t], ax={'linewidth':3, 'alpha': a})
plot_kwargselse:
=ax, color=pal[t],
az.plot_dist(p_samps[t], ax={'linewidth':3, 'alpha': a})
plot_kwargs
r'Posterior densities of $p$ from $M_t$')
ax.set_title(r'$p$')
ax.set_xlabel(
plt.show()
This notebook demonstrates a simple way to compare models using leave one out cross-validation (loo) and a classic example from capture-recapture. This is just one way, however, to perform model comparison using PyMC. Perhaps a more effective solution for this problem would be placing a shrinkage prior on the \(\sigma\) parameters.
%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Mon Nov 25 2024
Python implementation: CPython
Python version : 3.12.7
IPython version : 8.29.0
numpy : 1.26.4
pandas : 2.2.3
seaborn : 0.13.2
pymc : 5.18.2
matplotlib: 3.9.2
arviz : 0.20.0
Watermark: 2.5.0