import seaborn as sns
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
= 808
SEED = np.random.default_rng(SEED)
RNG
'fivethirtyeight')
plt.style.use('axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.spines.left'] = False
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False
plt.rcParams['axes.spines.bottom'] = False
plt.rcParams["tab10")
sns.set_palette(
def invlogit(x):
return 1 / (1 + np.exp(-x))
# read in the detection data
= pd.read_csv('detectionFreq.NH17.csv')
nh17 = nh17.to_numpy()
Y = Y.shape
n, J = Y.max()
K
# convert the species names to ints
= nh17.index.factorize() # lookup[int] returns the actual name
species_idx, lookup
# plot the detection frequencies
= plt.subplots(figsize=(4, 6))
fig, ax = ax.imshow(Y[np.argsort(Y.sum(axis=1))], aspect='auto')
im 'Species')
ax.set_ylabel('Site')
ax.set_xlabel(
# add a legend
= np.unique(Y.ravel())[1::2]
values = [ im.cmap(im.norm(value)) for value in values]
colors = [ Patch(color=colors[i], label=f'{v}') for i, v in enumerate(values) ]
patches ='Detections', handles=patches, bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.)
plt.legend(titleFalse)
ax.grid( plt.show()
Community occupancy
In this notebook, I explore fitting community occupancy models in PyMC. Community occupancy models are a multi-species extension of standard occupancy models. The benefit of these models is that, by treating each species as a random effect, they can estimate occupancy and detection more precisely than single species models. Further, through data augmentation, they can estimate the richness of the supercommunity, that is, the total number of species that use the study area during the surveys.
US Breeding Bird Survey
As a motivating example, I use the breeding bird survey (BBS) data used by Dorazio and Royle (2005) and Royle and Dorazio (2008), Chapter 12. This is a \((n, J)\) matrix with the number of times each species was detected over \(K\) surveys, where \(n\) is the number of detected species and \(J\) is the number surveyed sites. In this example, \(n=99\) species were detected at the \(J=50\) sites over the \(K=11\) surveys in New Hampshire. The BBS occurs on routes across the US. This dataset represents one route.
Dorazio and Royle (2005) draw each species-level effect from a multivariate normal distribution, \[ {\alpha_i \choose \beta_i} \sim \text{Normal} \left( {\mu_{\,\text{detection}} \choose \mu_{\, \text{occupancy}}}, \; \mathbf{\Sigma} \right), \] where \(\alpha_i\) is the logit-scale probability of detection for species \(i=1,\dots,n\), \(\beta_i\) is the logit-scale probability of occurrence, \(\mu\) is the community-level average, and \(\mathbf{\Sigma}\) is the covariance matrix. We assume that there will be a positive correlation between occupancy and the probability of detection, since abundance is positively correlated with both.
Known \(N\)
First, I fit the the known \(N\) version of the model. The goal of this version is to estimate occurrence and detection for each species, without estimating species richness.
This notebook makes extensive use of the coords
feature in PyMC. Coords
makes it easier to incorporate the species-level effects via the multivariate normal. I use a \(\text{Normal}(0, 2)\) prior for both \(\mu\) parameters, and a LKJ Cholesky covariance prior for \(\mathbf{\Sigma}.\)
= {'process': ['detection', 'occurrence'],
coords 'process_bis': ['detection', 'occurrence'],
'species': lookup}
with pm.Model(coords=coords) as known:
# priors for community-level means for detection and occurrence
= pm.Normal('mu', 0, 2, dims='process')
mu
# prior for covariance matrix for occurrence and detection
= pm.LKJCholeskyCov(
chol, corr, stds "chol", n=2, eta=2.0, sd_dist=pm.Exponential.dist(1.0, shape=2)
)= pm.Deterministic("cov", chol.dot(chol.T), dims=("process", "process_bis"))
cov
# species-level occurrence and detection probabilities on logit-scale
= pm.MvNormal("ab", mu, chol=chol, dims=("species", "process"))
ab
# probability of detection. newaxis allows for broadcasting
= ab[:, 0][:, np.newaxis]
a = pm.Deterministic("p", pm.math.invlogit(a))
p
# probability of detection. newaxis allows for broadcasting
= ab[:, 1][:, np.newaxis]
b = pm.Deterministic("psi", pm.math.invlogit(b))
psi
# likelihood
'Y', p=p, psi=psi, n=K, observed=Y)
pm.ZeroInflatedBinomial(
pm.model_to_graphviz(known)
with known:
= pm.sample() known_idata
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [mu, chol, ab]
/Users/philtpatton/miniforge3/envs/pymc/lib/python3.12/site-packages/pytensor/compile/function/types.py:959: RuntimeWarning: invalid value encountered in accumulate
self.vm()
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 47 seconds.
= [-1.11, -1.7]
mu_hat_royle =['mu'], figsize=(8,2),
az.plot_trace(known_idata, var_names=[("mu", {}, [mu_hat_royle])]); lines
= az.extract(known_idata, var_names='ab')
samps = samps.mean(axis=2)
ab_mean
= plt.subplots(figsize=(5,4))
fig, ax 1]), invlogit(ab_mean[:, 0]), alpha=0.5)
ax.scatter(invlogit(ab_mean[:, 0, 1))
ax.set_xlim(('Occupancy probability')
ax.set_xlabel(0, 0.8))
ax.set_ylim(('Detection probability')
ax.set_ylabel( plt.show()
The estimates of the community-level means is quite close to the estimates from Royle and Dorazio (2008). We can visualize the species-level probabilities of detection and occupancy. Compare with Figure 12.3 in Royle and Dorazio (2008).
Unknown \(N\)
Next, I train the unknown \(N\) version of the model. Like many other notebooks in this series, it relies on augmenting the detection histories with all-zero histories. These represent the detection histories for species that may use the study site, but were not detected over the \(K=11\) surveys. I also augment the species names in the coords
dict, such that we can still use the dims
argument in the multivariate normal. Mirroring Royle and Dorazio (2008), I augment the history \(M - n\) all-zero histories, where \(M=250\) and \(n\) is the number of species detected during the survey.
Similar to the occupancy notebook, I use a CustomDist
to model the augmented history. This accounts for the “row-level” zero-inflation, whereby we know that the species is included in the super community if it was detected along the BBS route. The only difference with this logp
is that it uses a ZeroInflatedBinomial
distribution under the hood, rather than a Bernoulli, and uses the parameter \(\Omega\) to account for the row-level inflation.
= 250
M = np.zeros((M - n, J))
all_zero_history = np.row_stack((Y, all_zero_history))
Y_augmented
= [f'aug{i}' for i in np.arange(M - n)]
aug_names = np.concatenate((lookup, aug_names))
spp_aug
= {'process': ['detection', 'occurrence'],
coords 'process_bis': ['detection', 'occurrence'],
'species_aug': spp_aug}
def logp(x, psi, n, p, omega):
= pm.ZeroInflatedBinomial.dist(psi=psi, n=n, p=p)
rv = pm.logp(rv, x)
lp = lp.sum(axis=1)
lp_sum = pm.math.exp(lp_sum)
lp_exp
= pm.math.switch(
res sum(axis=1) > 0,
x.* omega,
lp_exp * omega + (1 - omega)
lp_exp
)
return pm.math.log(res)
with pm.Model(coords=coords) as unknown:
# priors for inclusion
= pm.Beta('omega', 0.001, 1)
omega
# priors for community-level means for detection and occurrence
= pm.Normal('mu', 0, 2, dims='process')
mu
# prior for covariance matrix for occurrence and detection
= pm.LKJCholeskyCov(
chol, corr, stds "chol", n=2, eta=2.0, sd_dist=pm.Exponential.dist(1.0, shape=2)
)= pm.Deterministic("cov", chol.dot(chol.T), dims=("process", "process_bis"))
cov
# species-level occurrence and detection probabilities on logit-scale
= pm.MvNormal("ab", mu, chol=chol, dims=("species_aug", "process"))
ab
# probability of detection
= ab[:, 0]
alpha = pm.Deterministic("p", pm.math.invlogit(alpha))
p
# probability of occurrence
= ab[:, 1]
beta = pm.Deterministic("psi", pm.math.invlogit(beta))
psi
# likelihood
pm.CustomDist('Y',
psi[:, np.newaxis],
K,
p[:, np.newaxis],
omega,=logp,
logp=Y_augmented
observed
)
pm.model_to_graphviz(unknown)
with unknown:
= pm.sample() unknown_idata
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [omega, mu, chol, ab]
/Users/philtpatton/miniforge3/envs/pymc/lib/python3.12/site-packages/pytensor/compile/function/types.py:959: RuntimeWarning: invalid value encountered in accumulate
self.vm()
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 176 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters. A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details
We see some warnings about the effective sample size and the \(\hat{R}\) statistic. Some of these warnings may just relate to the individual random effects.
=['omega', 'cov', 'mu']) az.summary(unknown_idata, var_names
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
omega | 0.526 | 0.069 | 0.410 | 0.655 | 0.004 | 0.003 | 400.0 | 563.0 | 1.01 |
cov[detection, detection] | 1.260 | 0.335 | 0.734 | 1.905 | 0.016 | 0.011 | 458.0 | 706.0 | 1.01 |
cov[detection, occurrence] | 1.372 | 0.485 | 0.531 | 2.298 | 0.024 | 0.017 | 408.0 | 660.0 | 1.00 |
cov[occurrence, detection] | 1.372 | 0.485 | 0.531 | 2.298 | 0.024 | 0.017 | 408.0 | 660.0 | 1.00 |
cov[occurrence, occurrence] | 5.057 | 1.363 | 2.807 | 7.688 | 0.072 | 0.051 | 361.0 | 598.0 | 1.01 |
mu[detection] | -2.002 | 0.203 | -2.378 | -1.623 | 0.010 | 0.007 | 438.0 | 603.0 | 1.01 |
mu[occurrence] | -2.039 | 0.458 | -2.847 | -1.179 | 0.025 | 0.018 | 349.0 | 505.0 | 1.01 |
= [0.55]
omega_hat_royle =['omega'], figsize=(8,2),
az.plot_trace(unknown_idata, var_names=[("omega", {}, [omega_hat_royle])]); lines
I can plot the posterior distribution of species richness \(N.\) This is slightly more complicated than before sinc there is an additional level of zero-inflation (included and never detected or not-included) in this model compared to the occupancy model (present and never detection or not present).
# relevant posterior samples
= az.extract(unknown_idata)
post = post.omega.to_numpy()
o_samps = post.psi.to_numpy()[n:, :]
psi_samps = post.p.to_numpy()[n:, :]
p_samps
# probability that the animal was never detected during the survey if present
= (1 - p_samps) ** K
p_not_detected
# probability of a zero detection history
= psi_samps * p_not_detected + (1 - psi_samps)
p_zero_hist
# probability that the species was included in the given the all-zero history
= (o_samps * p_zero_hist ** J) / (o_samps * p_zero_hist ** J + (1 - o_samps))
p_included
# posterior samples of N
= RNG.binomial(1, p_included).sum(axis=0)
number_undetected = n + number_undetected
N_samps
# posterior distribution
= 138
N_hat_royle = plt.subplots(figsize=(6, 4))
fig, ax ='white', bins=25)
ax.hist(N_samps, edgecolor'Species richness $N$')
ax.set_xlabel('Posterior samples')
ax.set_ylabel(='--', color='C1')
ax.axvline(N_hat_royle, linestyle='--', color='C2')
ax.axvline(N_samps.mean(), linestyle plt.show()