import seaborn as sns
import pymc as pm
import pytensor.tensor as pt
import matplotlib.pyplot as plt
import arviz as az
import numpy as np
# plotting defaults
'figure.dpi'] = 600
plt.rcParams['fivethirtyeight')
plt.style.use('axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.spines.left'] = False
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False
plt.rcParams['axes.spines.bottom'] = False
plt.rcParams["tab10")
sns.set_palette(
# hyper parameters
= 500
M = 400
U_X = 400
U_SIGMA
# burnham impala dataset with distances in m
= np.array(
x_observed 71.933980, 26.047227, 58.474341, 92.349221, 163.830409, 84.523652
[163.830409, 157.330098, 22.267696, 72.105330, 86.986979, 50.795047
,0.000000, 73.135370, 0.000000, 128.557522, 163.830409, 71.845104
,30.467336, 71.073909, 150.960702, 68.829172, 90.000000, 64.983827
,165.690874, 38.008322, 378.207430, 78.146226, 42.127052, 0.000000
,400.000000, 175.386612, 30.467336, 35.069692, 86.036465, 31.686029
,200.000000, 271.892336, 26.047227, 76.604444, 41.042417, 200.000000
,86.036465, 0.000000, 93.969262, 55.127471, 10.458689, 84.523652
,0.000000, 77.645714, 0.000000, 96.418141, 0.000000, 64.278761
,187.938524, 0.000000, 160.696902, 150.453756, 63.603607, 193.185165
,106.066017, 114.906666, 143.394109, 128.557522, 245.745613, 123.127252
,123.127252, 153.208889, 143.394109, 34.202014, 96.418141, 259.807621
,8.715574]
,
)
# plot the distances
= plt.subplots(figsize=(4,4))
fig, ax
='white')
ax.hist(x_observed, edgecolor
'Hemingway Impala Data')
ax.set_title('Number of detections')
ax.set_ylabel('Distance (m)')
ax.set_xlabel(
plt.show()
Distance sampling
In this notebook, I explore how to fit distance sampling models for estimating the size of a closed population. Similar to the occupancy and closed capture-recapture notebooks, I use parameter-expanded data-augmentation (PX-DA) and the zero-inflated binomial model in this notebook.
The idea with distance sampling, also known as line-transect sampling, is that a surveyer traverses a transect, typically in a boat or a plane. As they survey, they note when they detect an individual, or a group, from the species of interest, and further note the distance from the transect to the animal. Further, they note the angle to the animal(s), such that they can calculate the perpendicular distance from the animal to the transect. We assume that probability of detecting an animal \(p\) decreases monotonically as the distance from the transect grows, e.g., \(p=\exp(-x^2/\sigma^2),\) where \(x\) is the distance and \(\sigma\) is a scale parameter to be estimated. These simple assumptions permit the estimation of the population size \(N\) as well as density \(D.\)
Following Hooten and Hefley (2019), Chapter 24 and Royle and Dorazio (2008), Chapter 7, I use the impala data from Burnham, Anderson, and Laake (1980), who credits P. Hemingway with the dataset. In this dataset, 73 impalas were observed along a 60km transect. The distance values below are the perpendicular distances, in meters, from the transect.
Again, we treat this as a zero-inflated binomial model using PX-DA. The trick for doing so is to create a binary vector of length \(M\), \(y,\) that represents whether the individual was detected during the study. Then, combine the indicator with the distance vector \(x\) to create a the full dataset \((x,y).\)
= len(x_observed)
n = M - n
unobserved_count = np.zeros(unobserved_count)
zeros
= np.ones(n)
y = np.concatenate((y, zeros)) y_augmented
The issue is that \(x\) is unobserved for the undetected individuals. To work around this, we put a uniform prior on the unobserved \(x,\) i.e., \(x \sim \text{Uniform}(0, U_x).\) With this “complete” \(x,\) we can construct the detection function \(p\) for the unobserved individuals.
with pm.Model() as distance:
= pm.Beta('psi', 0.001, 1)
psi = pm.Uniform('sigma', 0, U_SIGMA)
sigma
= pm.Uniform('x_unobserved', 0, U_X, shape=unobserved_count)
x_unobserved = pt.concatenate((x_observed, x_unobserved))
x_complete
= pm.Deterministic('p', pm.math.exp(- x_complete ** 2 / sigma ** 2))
p
'y', p=p, psi=psi, n=1, observed=y_augmented)
pm.ZeroInflatedBinomial(
pm.model_to_graphviz(distance)
with distance:
= pm.sample() distance_idata
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [psi, sigma, x_unobserved]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 14 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
az.plot_trace(
distance_idata, =(10,4),
figsize=['psi', 'sigma']
var_names; )
This model samples slower than the models in the other notebooks, presumably because of the unobserved \(x.\) As in the closed capture-recapture notebook, we will have to simulate the posterior for \(N\) using the posterior distribution of \(\psi\) and \(M.\)
= np.random.default_rng()
RNG
= az.extract(distance_idata)
posterior = posterior.psi.to_numpy()
psi_samples = posterior.p.to_numpy()
p_samples
= (1 - p_samples[n:])
not_p = (not_p * psi_samples) / (not_p * psi_samples + (1 - psi_samples))
p_included = RNG.binomial(1, p_included).sum(axis=0)
n_undetected = n + n_undetected N_samples
= posterior.sigma.to_numpy()
sigma_samples
# plot the results
= plt.subplots(1, 2, sharey=True, figsize=(8,4))
fig, (ax0, ax1)
# histograms of the posteriors
='white', bins=30)
ax0.hist(N_samples, edgecolor='white', bins=30)
ax1.hist(sigma_samples, edgecolor
# show the abundance dist in terms of M
# ax0.set_xlim((100, M))
# axes labels
r'Abundance $N$')
ax0.set_xlabel('Number of samples')
ax0.set_ylabel(r'Detection range $\sigma$')
ax1.set_xlabel(
# add the point estimates
= N_samples.mean()
N_hat = sigma_samples.mean()
sigma_hat 200, 350, rf'$\hat{{N}}$={N_hat:.1f}', ha='left', va='center')
ax0.text(205, 350, rf'$\hat{{\sigma}}$={sigma_hat:.1f}', ha='left', va='center')
ax1.text(
# the results from royle and dorazio (2008) for comparison
= 179.9
N_hat_royle = 187
sigma_hat_royle
='--', linewidth=3, color='C1')
ax0.axvline(N_hat_royle, linestyle='--', linewidth=3, color='C1')
ax1.axvline(sigma_hat_royle, linestyle
plt.show()
The model shows a high level of agreement with Royle and Dorazio (2008), Chapter 7, although note that they reported \(\sigma\) in terms of 100m units. It is also possible to plot the posterior distribution of the detection function.
= np.arange(400)
xx
def det_func(x, s):
return np.exp(- (x ** 2) / (s ** 2))
= np.array([det_func(xx, s) for s in sigma_samples])
p_samps
= p_samps.mean(axis=0)
p_mean = np.quantile(p_samps, 0.02, axis=0)
p_low = np.quantile(p_samps, 0.98, axis=0)
p_high
= plt.subplots(figsize=(5,4))
fig, ax
'-')
ax.plot(xx, p_mean, =0.2)
ax.fill_between(xx, p_low, p_high, alpha
'Detection function')
ax.set_title(r'$p$')
ax.set_ylabel(r'Distance (m)')
ax.set_xlabel(
plt.show()
%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Mon Nov 25 2024
Python implementation: CPython
Python version : 3.12.7
IPython version : 8.29.0
matplotlib: 3.9.2
pymc : 5.18.2
pytensor : 2.26.3
numpy : 1.26.4
seaborn : 0.13.2
arviz : 0.20.0
Watermark: 2.5.0