from pymc.distributions.dist_math import factln
from scipy.linalg import circulant
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import arviz as az
import pymc as pm
import pytensor.tensor as pt
'figure.dpi'] = 600
plt.rcParams['fivethirtyeight')
plt.style.use('axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.spines.left'] = False
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False
plt.rcParams['axes.spines.bottom'] = False
plt.rcParams["tab10")
sns.set_palette(
def create_recapture_array(history):
"""Create the recapture array from a capture history."""
= history.shape
_, occasion_count = occasion_count - 1
interval_count
= np.zeros((interval_count, interval_count), int)
recapture_array for occasion in range(occasion_count - 1):
# which individuals, captured at t, were later recaptured?
= history[:, occasion] == 1
captured_this_time = (history[:, (occasion + 1):] > 0).any(axis=1)
captured_later = captured_this_time & captured_later
now_and_later
# when were they next recaptured?
= history[now_and_later, (occasion + 1):]
remaining_history = (remaining_history.argmax(axis=1)) + occasion
next_capture_occasion
# how many of them were there?
= np.unique(next_capture_occasion, return_counts=True)
ind, count = count
recapture_array[occasion, ind]
return recapture_array.astype(int)
def create_m_array(history):
'''Create the m-array from a capture history.'''
# leftmost column of the m-array
= history.sum(axis=0)
number_released
# core of the m-array
= create_recapture_array(history)
recapture_array = recapture_array.sum(axis=1)
number_recaptured
# no animals that were released on the last occasion are recaptured
= np.append(number_recaptured, 0)
number_recaptured = number_released - number_recaptured
never_recaptured
# add a dummy row at the end to make everything stack
= np.zeros(recapture_array.shape[1])
zeros = np.row_stack((recapture_array, zeros))
recapture_array
# stack the relevant values into the m-array
= np.column_stack((number_released, recapture_array, never_recaptured))
m_array
return m_array.astype(int)
def fill_lower_diag_ones(x):
'''Fill the lower diagonal of a matrix with ones.'''
return pt.triu(x) + pt.tril(pt.ones_like(x), k=-1)
Cormack-Jolly-Seber
In this notebook, I explore the Cormack-Jolly-Seber (CJS) model for estimating survival using capture-recapture data in PyMC. I have drawn considerable inspiration from Austin Rochford’s notebook on capture-recapture in PyMC, the second chapter of my dissertation (a work in progress), and McCrea and Morgan (2014).
Cormack-Jolly-Seber
The goal of the CJS model is to estimate survival, accounting for capture probabilities being less than one. There are many methods for estimating parameters in the model, including state-space formulations that explicitly model the latent alive/dead state \(z.\) Following the theme of the previous notebooks, I instead marginalize this variable out of the model by using the so-called \(M\)-array. This is an array that contains the sufficient statistics for the CJS models. For example, \(m_{1,2}\) is the number of individuals that were released on at \(t=1\) and were first recaptured on \(t=2.\)
Number Released | Number recaptured | Never recaptured | |||
---|---|---|---|---|---|
1982 | 1983 | 1984 | |||
1981 | \(R_1\) | \(m_{1,2}\) | \(m_{1,3}\) | \(m_{1,4}\) | \(R_1-m_{1\cdot}\) |
1982 | \(R_2\) | \(m_{2,3}\) | \(m_{2,3}\) | \(R_2-m_{2\cdot}\) | |
1983 | \(R_3\) | \(m_{3,4}\) | \(R_3-m_{3\cdot}\) |
As an example, I use the cormorant data from McCrea and Morgan (2014), Table 4.6. These data come from an eleven year capture-recapture study between 1982 and 1993. These were breeding cormorants of unknown age. The data is summarized in the \(M\)-array below. The last column is the number that were never recapured. The number released can be calculated from the array.
= np.array([
cormorant 10, 4, 2, 2, 0, 0, 0, 0, 0, 0, 12],
[ 0, 42, 12, 16, 1, 0, 1, 1, 1, 0, 83],
[ 0, 0, 85, 22, 5, 5, 2, 1, 0, 1, 53],
[ 0, 0, 0, 139, 39, 10, 10, 4, 2, 0, 94],
[ 0, 0, 0, 0, 175, 60, 22, 8, 4, 2, 199],
[ 0, 0, 0, 0, 0, 159, 46, 16, 5, 2, 193],
[ 0, 0, 0, 0, 0, 0, 191, 39, 4, 8, 171],
[ 0, 0, 0, 0, 0, 0, 0, 188, 19, 23, 284],
[ 0, 0, 0, 0, 0, 0, 0, 0, 101, 55, 274],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 97]]) [
= cormorant.shape
interval_count, T
= cormorant[:,:-1]
number_recaptured = cormorant[:,-1]
never_recaputured = number_recaptured.sum(axis=1) + never_recaputured number_released
This PyMC model will look different than the ones in previous notebooks, simply because it requires many tricks to get the probabilities in the correct format for the \(m\)-array, then modeling the \(m\)-array as a multinomial with the associated cell probabilities. These probabilities correspond to the situations in the \(m\)-array, such as the probability that an animal survived and was not recaptured until a later date. In this example, I model survival as time-varying, i.e., \(\phi(t).\)
# utility vectors for creating arrays and array indices
= np.arange(interval_count)
intervals = np.reshape(intervals, (interval_count, 1))
row_indices = np.reshape(intervals, (1, interval_count))
col_indices
# matrix indicating the number of intervals between sampling occassions
= np.clip(col_indices - row_indices, 0, np.inf)
intervals_between
with pm.Model() as phit:
# priors for catchability and survival
= pm.Uniform('p', 0, 1)
p = pm.Uniform('phi', 0, 1, shape=interval_count)
phi
# broadcast phi into a matrix
= pt.ones_like(number_recaptured) * phi
phi_mat = fill_lower_diag_ones(phi_mat) # fill irrelevant values
phi_mat
# probability of surviving between i and j in the m-array
= pt.cumprod(phi_mat, axis=1)
p_alive = pt.triu(p_alive) # select relevant (upper triangle) values
p_alive
# probability of not being captured between i and j
= pt.triu((1 - p) ** intervals_between)
p_not_cap
# probabilities associated with each cell in the m-array
= p_alive * p_not_cap * p
nu
# probability for the animals that were never recaptured
= 1 - nu.sum(axis=1)
chi
# combine the probabilities into a matrix
= pt.reshape(chi, (interval_count, 1))
chi = pt.horizontal_stack(nu, chi)
marr_probs
# distribution of the m-array
= pm.Multinomial(
marr 'M-array',
=number_released,
n=marr_probs,
p=cormorant
observed
)
pm.model_to_graphviz(phit)
with phit:
= pm.sample() cjs_idata
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [p, phi]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 3 seconds.
= [0.51]
mccrea_p = [0.79, 0.56, 0.83, 0.86, 0.73, 0.70, 0.81, 0.64, 0.46, 0.95]
mccrea_phi
az.plot_trace(
cjs_idata, =(10, 4),
figsize=[("phi", {}, [mccrea_phi]), ("p", {}, [mccrea_p])]
lines; )
The model samples fairly quickly in this parameterization. The traceplots above include comparisons to the estimates from McCrea and Morgan (2014). While a bit messy, the plots show a high level of agreement between their estimates and the ones here. To clean things up a bit, I plot the estimates for \(\phi\) over time, along with the 94% credible intervals
= plt.subplots(figsize=(6,4))
fig, ax
= np.arange(1983, 1993)
t
= az.extract(cjs_idata, var_names='phi').values.T
phi_samps = np.median(phi_samps, axis=0)
phi_median
='dotted', color='lightgray', linewidth=2)
ax.plot(t, phi_median, linestyle=True, showextrema=False)
ax.violinplot(phi_samps, t, showmedians
0,1))
ax.set_ylim((
r'Apparent survival $\phi$')
ax.set_ylabel(r'Cormorant CJS')
ax.set_title(
plt.show()
%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Mon Nov 25 2024
Python implementation: CPython
Python version : 3.12.7
IPython version : 8.29.0
pytensor : 2.26.3
scipy : 1.14.1
pymc : 5.18.2
arviz : 0.20.0
seaborn : 0.13.2
numpy : 1.26.4
matplotlib: 3.9.2
Watermark: 2.5.0