In this example, we are going to perform inference for the logistic growth model, commonly used in population biology to describe resource-limited growth. The population density $y$ of a given bacterial species is assumed to follow:
\[ \frac{dy}{dt} = r y \left(1 - \frac{y}{\kappa}\right) \]
where we assume $y(0)>0$; $r > 0$ is the initial (exponential) growth rate when resources are abundant; $\kappa > 0$ is the carrying capacity denoting the steady-state population density.
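For reference, this ODE has a closed-form solution, which is useful for sanity-checking the numerical integration below:
\[ y(t) = \frac{\kappa}{1 + \left(\frac{\kappa - y(0)}{y(0)}\right) e^{-r t}}. \]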
Using SciPy's integrator, write a function which numerically solves the logistic equation. Plot the solution for $y(0)=5, r=0.2, \kappa=20$ from $t=0$ to $t=40$.
import numpy as np
from scipy.integrate import odeint
from plotnine import *
import pandas as pd

def logistic_rhs(y, t, r, k):
    # Right-hand side of the logistic ODE: dy/dt = r * y * (1 - y / k)
    return r * y * (1 - y / k)

def logistic_solve(times, r, k, y0):
    # Numerically integrate the logistic ODE and return a tidy data frame
    sol = odeint(logistic_rhs, [y0], times, args=(r, k))
    df = pd.DataFrame({'t': times, 'y': sol[:, 0]})
    return df

df = logistic_solve(np.linspace(0, 40, 100), 0.2, 20, 5)
(ggplot(df, aes(x='t', y='y')) +
 geom_line())
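As an optional sanity check, we can compare the numerical solution against the closed-form solution given earlier, using the same parameters ($y(0)=5$, $r=0.2$, $\kappa=20$); the maximum absolute difference should be close to zero:

# Compare odeint output with the analytic logistic solution
y_exact = 20 / (1 + ((20 - 5) / 5) * np.exp(-0.2 * df['t']))
print(np.max(np.abs(df['y'] - y_exact)))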
Assume that the observed data differs from the logistic model solution according to the measurement equation:
\[ \tilde y(t) \sim \mathcal{N}(y(t), \sigma), \]
where $\sigma > 0$ is the measurement standard deviation. Generate 'observed' data at $t = 0, 5, 10, 15, 20, 25, 30, 35, 40$ assuming $y(0)=5$, $r=0.2$, $\kappa=20$, $\sigma=2$. Plot these data overlaid on the true solution.
times = np.arange(0, 45, 5)
df_short = logistic_solve(times, 0.2, 20, 5)

def logistic_observations(y, sigma):
    # Add iid normal measurement noise to the ODE solution
    return np.random.normal(y, sigma, len(y))

df_short['y'] = logistic_observations(df_short['y'], 2)
df_short['type'] = 'actual'
df['type'] = 'simulated'
df_both = pd.concat([df, df_short])
(ggplot(df_both[df_both['type'] == 'simulated'], aes(x='t', y='y', colour='type')) +
 geom_line() +
 geom_point(data=df_both[df_both['type'] == 'actual']))
The likelihood for this model is given by:
\[ L = \prod_{i=1}^{N} \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(\tilde y(t_i) - y(t_i))^2}{2\sigma^2}\right) \]
where $N$ is the number of observations and $y(t) = y(t|r,\kappa, y(0))$ is the ODE solution at time $t$. Write a function which calculates the log-likelihood for given $(r, \kappa, y(0), \sigma)$.
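Since the observations are independent, taking logs turns the product into a sum of normal log-densities, which is what the function below accumulates:
\[ \log L = -\frac{N}{2}\log(2\pi\sigma^2) - \frac{1}{2\sigma^2}\sum_{i=1}^{N} \left(\tilde y(t_i) - y(t_i)\right)^2. \]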
import scipy.stats

def logistic_log_likelihood(times, y_tilde, r, k, y0, sigma):
    # Solve the ODE, then sum the normal log-densities of the observations
    df = logistic_solve(times, r, k, y0)
    log_p = 0.0
    for i in range(len(times)):
        log_p += scipy.stats.norm.logpdf(y_tilde[i], df['y'][i], sigma)
    return log_p
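The loop above can equivalently be vectorised; a minimal sketch (the name `logistic_log_likelihood_vec` is introduced here for illustration):

def logistic_log_likelihood_vec(times, y_tilde, r, k, y0, sigma):
    # Sum all normal log-densities in a single vectorised call
    df = logistic_solve(times, r, k, y0)
    return np.sum(scipy.stats.norm.logpdf(np.asarray(y_tilde), df['y'].values, sigma))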
Using the simulated data you previously generated, plot the log-likelihood as a function of $r$ holding all other parameters at their true values.
r = np.linspace(0, 1, 100)
log_like = np.zeros(len(r))
for i in range(len(r)):
    log_like[i] = logistic_log_likelihood(times, df_short['y'], r[i], 20, 5, 2)
df_log_like = pd.DataFrame({'r': r, 'log_like': log_like})
(ggplot(df_log_like, aes(x='r', y='log_like')) +
 geom_line())
Plot the likelihood (not the log-likelihood, whose contours are harder to distinguish visually) contour surface for $(r, \kappa)$, holding all other parameters at their true values.
import matplotlib.pyplot as plt

r = np.linspace(0.1, 0.5, 50)
k = np.linspace(0, 50, 50)
X, Y = np.meshgrid(r, k)
Z = np.zeros((len(k), len(r)))
for i in range(len(r)):
    for j in range(len(k)):
        # contourf expects Z indexed as [row = k index, column = r index]
        Z[j, i] = logistic_log_likelihood(times, df_short['y'], r[i], k[j], 5, 2)

fig, ax = plt.subplots(1, 1)
cp = ax.contourf(X, Y, np.exp(Z))
fig.colorbar(cp)
ax.set_xlabel('r')
ax.set_ylabel('k')
plt.show()
Assume we have priors on parameters: $r\sim \mathcal{N}(0.2, 0.02), \kappa\sim \mathcal{N}(20, 4)$, and we fix $y(0)=5$. Generate 100 prior predictive simulations of the ODE solution $y$ and plot these. Remember, a single prior predictive simulation is obtained by sampling each parameter from its prior and (in this case) solving the ODE for this parameter set.
n = 100
sims = []
for i in range(n):
    r = np.random.normal(0.2, 0.02)
    k = np.random.normal(20, 4)
    temp = logistic_solve(np.linspace(0, 40, 100), r, k, 5)
    temp['simulation'] = str(i)
    sims.append(temp)
df_big = pd.concat(sims)
(ggplot(df_big, aes(x='t', y='y', group='simulation')) +
 geom_line(alpha=0.5))
Now also allow $y(0)\sim \mathcal{N}(5, 1)$. How does the prior predictive distribution change?
sims = []
for i in range(n):
    r = np.random.normal(0.2, 0.02)
    k = np.random.normal(20, 4)
    y0 = np.random.normal(5, 1)
    temp = logistic_solve(np.linspace(0, 40, 100), r, k, y0)
    temp['simulation'] = str(i)
    sims.append(temp)
df_big = pd.concat(sims)
(ggplot(df_big, aes(x='t', y='y', group='simulation')) +
 geom_line(alpha=0.5))
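One convenient way to summarise how the prior predictive distribution widens is to plot pointwise quantile bands instead of individual trajectories; a sketch, assuming the `df_big` from the cell above (the column names `q05`, `q50`, `q95` are introduced here):

# Pointwise 5%/50%/95% quantiles across simulations at each time point
df_bands = df_big.groupby('t')['y'].quantile([0.05, 0.5, 0.95]).unstack().reset_index()
df_bands.columns = ['t', 'q05', 'q50', 'q95']
(ggplot(df_bands, aes(x='t')) +
 geom_ribbon(aes(ymin='q05', ymax='q95'), alpha=0.3) +
 geom_line(aes(y='q50')))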
We are now going to write a random walk Metropolis sampler. The first step is to write a function which takes as input $(r,\kappa, \sigma)$ and proposes new values using univariate normal distributions centered at the current values. So, for example,
\[ r'\sim\mathcal{N}(r,\tau_r) \]
where $\tau_r$ is a hyperparameter of the method. Write such a function.
def propose(r, k, sigma, tau_r, tau_k, tau_sigma):
    # Symmetric random walk proposals, one per parameter
    r = np.random.normal(r, tau_r)
    k = np.random.normal(k, tau_k)
    sigma = np.random.normal(sigma, tau_sigma)
    return r, k, sigma
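Because each proposal is a normal distribution centered at the current value, the proposal density is symmetric, so the plain Metropolis acceptance ratio used below needs no Hastings correction. A quick call (output is random):

propose(0.2, 20, 2, 0.01, 1, 0.1)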
Assume priors: $r\sim \mathcal{N}(0.2, 0.02), \kappa\sim \mathcal{N}(20, 4), \sigma \sim \mathcal{N}(2, 0.2)$ and fix $y(0)=5$. Write a function that accepts as input $(r,\kappa, \sigma)$ and outputs the log-prior density $\text{log-}p(r,\kappa, \sigma)$.
def prior(r, k, sigma):
    # Log-prior density: sum of independent normal log-densities
    lp = (scipy.stats.norm.logpdf(r, 0.2, 0.02) +
          scipy.stats.norm.logpdf(k, 20, 4) +
          scipy.stats.norm.logpdf(sigma, 2, 0.2))
    return lp
Write a function which calculates the unnormalised log-posterior (i.e. the sum of the log-prior and log-likelihood), $\text{log-}p(r,\kappa,\sigma|\text{data})$, for a given parameter set.
def posterior(r, k, sigma, y0, times, y_tilde):
    # Unnormalised log-posterior: log-likelihood + log-prior
    ll = logistic_log_likelihood(times, y_tilde, r, k, y0, sigma)
    lp = prior(r, k, sigma)
    return ll + lp

posterior(0.4, 20, 2, 5, times, df_short['y'])
The next step is to write a function called 'step_accept' which takes as input $(r,\kappa, \sigma)$ and proposes new values $(r',\kappa', \sigma')$, then calculates the ratio:
\[ t = \exp(\text{log-}p(r',\kappa', \sigma'|\text{data}) - \text{log-}p(r,\kappa, \sigma|\text{data})) \]
Then generates $u\sim U(0,1)$ and does the following: if $t \geq u$, return $(r',\kappa',\sigma')$; else return $(r,\kappa,\sigma)$.
def step_accept(r, k, sigma, y0, times, y_tilde, tau_r, tau_k, tau_sigma):
    r_prime, k_prime, sigma_prime = propose(r, k, sigma, tau_r, tau_k, tau_sigma)
    logp_current = posterior(r, k, sigma, y0, times, y_tilde)
    logp_proposed = posterior(r_prime, k_prime, sigma_prime, y0, times, y_tilde)
    t = np.exp(logp_proposed - logp_current)
    u = np.random.uniform()
    if t >= u:
        return r_prime, k_prime, sigma_prime
    else:
        return r, k, sigma

step_accept(0.2, 18, 1.5, 5, times, df_short['y'], 0.01, 1, 0.1)
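When the log-posterior difference is large, `np.exp` can overflow; a numerically safer variant compares in log space instead. A sketch (the name `step_accept_log` is introduced here; it is equivalent in distribution to the version above):

def step_accept_log(r, k, sigma, y0, times, y_tilde, tau_r, tau_k, tau_sigma):
    # Accept if log(u) < log-posterior difference, avoiding exp overflow
    r_p, k_p, s_p = propose(r, k, sigma, tau_r, tau_k, tau_sigma)
    log_t = (posterior(r_p, k_p, s_p, y0, times, y_tilde) -
             posterior(r, k, sigma, y0, times, y_tilde))
    if np.log(np.random.uniform()) < log_t:
        return r_p, k_p, s_p
    return r, k, sigma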
Write a function which iterates 'step_accept' generating a chain of MCMC samples of $(r,\kappa,\sigma)$. Initialise $(r,\kappa,\sigma)$ using samples from the prior.
def MCMC(numsamples, r, k, sigma, y0, times, y_tilde, tau_r, tau_k, tau_sigma):
    # Initial values are drawn from the prior (overriding the r, k, sigma arguments)
    r = np.random.normal(0.2, 0.02)
    k = np.random.normal(20, 4)
    sigma = np.random.normal(2, 0.2)
    m_samples = np.zeros((numsamples, 3))
    m_samples[0, :] = [r, k, sigma]
    for i in range(1, numsamples):
        r, k, sigma = step_accept(r, k, sigma, y0, times, y_tilde, tau_r, tau_k, tau_sigma)
        m_samples[i, :] = [r, k, sigma]
    df = pd.DataFrame({'r': m_samples[:, 0], 'k': m_samples[:, 1],
                       'sigma': m_samples[:, 2], 'iter': np.arange(0, numsamples)})
    return df
Assuming step sizes of $(\tau_r=0.01, \tau_\kappa=1, \tau_{\sigma} = 0.1)$, generate an MCMC sample of 1000 draws. Visualise the sampled values of $r$ over time.
df = MCMC(1000, 0.2, 18, 1.5, 5, times, df_short['y'], 0.01, 1, 0.1)
plt.plot(df['r'])
plt.xlabel('iteration')
plt.ylabel('r')
plt.show()
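For a random walk Metropolis sampler, an acceptance rate of very roughly 20-40% is often targeted when tuning step sizes. A rough empirical estimate simply counts how often the chain moved:

# Fraction of iterations where the sampled r changed from the previous iteration
np.mean(df['r'].values[1:] != df['r'].values[:-1])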
Plot the pairwise distribution of $(r,\kappa)$. How do the sampled values compare to the true parameters?
(ggplot(df, aes(x='r', y='k')) +
geom_point(colour='blue') +
geom_vline(xintercept=0.2) +
geom_hline(yintercept=20))
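As a quick numerical complement to the plot, one can compare posterior means against the true values (a sketch, assuming a burn-in of roughly 100 iterations):

# Posterior means after discarding burn-in; compare with r=0.2, kappa=20, sigma=2
df.query('iter > 100')[['r', 'k', 'sigma']].mean()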
Modify your MCMC routine to use the following half-normal distributions to sample initial points for your chains. Run four independent chains for 1000 iterations each and plot the $\kappa$ samples over time. How long does it appear your chains must be run before they reach the stationary distribution?
$r \sim \text{half-N}(0.2, 0.1)$, $\kappa \sim \text{half-N}(20, 10)$, $\sigma \sim \text{half-N}(2, 1)$.
Note, to do so, you can use the following function:
def truncated_normal(mu, sd):
    # Sample from a normal distribution truncated below at 0
    myclip_a = 0
    myclip_b = 1000000
    a, b = (myclip_a - mu) / sd, (myclip_b - mu) / sd
    r = scipy.stats.truncnorm.rvs(a, b, loc=mu, scale=sd, size=1)
    return r[0]
def MCMC(numsamples, r, k, sigma, y0, times, y_tilde, tau_r, tau_k, tau_sigma):
    # Initial values are now drawn from the half-normal distributions above
    r = truncated_normal(0.2, 0.1)
    k = truncated_normal(20, 10)
    sigma = truncated_normal(2, 1)
    m_samples = np.zeros((numsamples, 3))
    m_samples[0, :] = [r, k, sigma]
    for i in range(1, numsamples):
        r, k, sigma = step_accept(r, k, sigma, y0, times, y_tilde, tau_r, tau_k, tau_sigma)
        m_samples[i, :] = [r, k, sigma]
    df = pd.DataFrame({'r': m_samples[:, 0], 'k': m_samples[:, 1],
                       'sigma': m_samples[:, 2], 'iter': np.arange(0, numsamples)})
    return df
chains = []
for c in range(4):
    df_c = MCMC(1000, 0.2, 18, 1.5, 5, times, df_short['y'], 0.01, 1, 0.1)
    df_c['chain'] = str(c + 1)
    chains.append(df_c)
df_all = pd.concat(chains)
(ggplot(df_all, aes(x='iter', y='k', colour='chain')) +
 geom_line())
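A standard convergence diagnostic for multiple chains is the Gelman-Rubin statistic $\hat R$; values close to 1 suggest the chains have mixed. A minimal sketch (the name `r_hat` is introduced here; the default burn-in of 250 iterations matches the cut-off used in the next step):

def r_hat(df_all, param, burn_in=250):
    # Split samples by chain and discard burn-in
    chains = [g[param].values[burn_in:] for _, g in df_all.groupby('chain')]
    n = len(chains[0])
    chain_means = np.array([c.mean() for c in chains])
    w = np.mean([c.var(ddof=1) for c in chains])   # within-chain variance
    b = n * chain_means.var(ddof=1)                # between-chain variance
    var_hat = (n - 1) / n * w + b / n              # pooled variance estimate
    return np.sqrt(var_hat / w)

r_hat(df_all, 'k')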
Using a random subset of 100 samples from all four chains taken from after they appear to have reached the stationary distribution, draw from the posterior predictive distribution of the logistic equation solution and overlay the observations.
df_all_after = df_all.query('iter > 250')
nsamples = 100
df_all_after = df_all_after.sample(n=nsamples)
sims = []
for i in range(nsamples):
    r = df_all_after['r'].iloc[i]
    k = df_all_after['k'].iloc[i]
    temp = logistic_solve(np.linspace(0, 40, 100), r, k, 5)
    temp['simulation'] = str(i)
    sims.append(temp)
df_big = pd.concat(sims)
df_big['type'] = 'simulation'
df_combined = pd.concat([df_big, df_short])
(ggplot(df_combined.query('type == "simulation"'), aes(x='t', y='y', group='simulation')) +
 geom_line(alpha=0.5) +
 geom_point(data=df_combined.query('type != "simulation"'),
            mapping=aes(x='t', y='y'), colour="orange", inherit_aes=False))
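The draws above show the posterior predictive distribution of the ODE solution only. If one instead wanted full posterior predictive observations (including measurement noise), each draw's sampled $\sigma$ can be used to add noise; a sketch using the same names as above:

# Posterior predictive observations at the measurement times
obs_sims = []
for i in range(nsamples):
    row = df_all_after.iloc[i]
    temp = logistic_solve(times, row['r'], row['k'], 5)
    temp['y'] = np.random.normal(temp['y'], row['sigma'])
    temp['simulation'] = str(i)
    obs_sims.append(temp)
df_ppc_obs = pd.concat(obs_sims)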