\documentclass[a4paper]{article}
%\VignetteIndexEntry{Coalescent Models}
%\VignettePackage{coalescentMCMC}
\usepackage[utf8]{inputenc}
\usepackage{fancyvrb}
\usepackage{color,natbib}

\newcommand\tmrca{T_{\mathsf{MRCA}}}
\newcommand{\code}{\texttt}
\newcommand{\pkg}{\textsf}
\newcommand{\coalmcmc}{\pkg{coalescentMCMC}}
\newcommand{\pegas}{\pkg{pegas}}

\author{Emmanuel Paradis}
\title{Likelihood Functions of Time-Dependent Coalescent Models}

\begin{document}
\maketitle

<<echo=false,quiet=true>>=
options(width=60)
@

\noindent Coalescent models describe the distribution of ancestry in a
population under some assumptions on the variation in the parameter
$\Theta = 2N\mu$, with $N$ being the number of alleles in the population and
$\mu$ the neutral mutation rate. The present document gives the
likelihood functions and some computational details for several
models with $\Theta$ varying through time. These models are available
in \coalmcmc\ as R functions (see below).

The general mathematical framework is given by Griffiths \&
Tavaré \cite{Griffiths1994}.  If $\Theta$ is constant, the probability
of observing the coalescent times $t_1, \dots, t_n$ is:

\begin{displaymath}
\prod_{i=1}^{n-1} {n-i+1 \choose 2}\frac{1}{\Theta}
\exp\left[-{n-i+1 \choose 2}\frac{t_{i+1} - t_i}{\Theta}\right]
\end{displaymath}
where $t_1=0$ is the present time
($t_1<t_2<\dots<t_n$). Note that $t_{i+1}-t_i$ is the $i$th
coalescent interval ($i=1, \dots, n-1$). The general formula for
$\Theta(t)$ varying through time is:

\begin{equation}\label{eq:timecoal}
\prod_{i=1}^{n-1} {n-i+1 \choose 2}\frac{1}{\Theta(t_{i+1})}
\exp\left[-{n-i+1 \choose 2}\int_{t_i}^{t_{i+1}}\frac{1}{\Theta(u)}\mathrm{d}u\right]
\end{equation}

Four specific temporal models are considered below. We denote the time to the
most recent ancestor as $\tmrca$ ($=t_n$).

\section{Models}

The \emph{exponential growth model} is $\Theta(t) = \Theta_0
e^{\rho t}$, where $\Theta_0$ is the value of $\Theta$ at present and
$\rho$ is the population growth rate \cite{Kuhner1998}. The
\emph{linear model} is formulated as $\Theta(t) =
\Theta_0 + t(\Theta_{\tmrca} - \Theta_0)/{\tmrca}$. This model, like
the previous one, has two parameters: $\Theta_0$ and
$\Theta_{\tmrca}$.

The third model (\emph{step model}) assumes two constant values of
$\Theta$ before and after a point in time denoted as $\tau$:

\begin{displaymath}
\Theta(t) = \left\{
\begin{array}{lll}
& \Theta_0 &\qquad t \le \tau\\
& \Theta_1 &\qquad t > \tau
\end{array} \right.
\end{displaymath}

The last model (\emph{double exponential growth model}) assumes that
the population experienced two different phases of exponential growth:

\begin{displaymath}
\Theta(t) = \left\{
\begin{array}{lll}
& \Theta_0 e^{\rho_1 t} &\qquad t \le \tau\\
& \Theta(\tau) e^{\rho_2(t - \tau)} = \Theta_0 e^{\rho_2 t + (\rho_1 - \rho_2)\tau}&\qquad t > \tau
\end{array} \right.
\end{displaymath}
which reduces to the first model if $\rho_1 = \rho_2$. These two last
models have three parameters.

\subsection{Constant-$\Theta$ Model}

The log-likelihood is:

\begin{displaymath}
\ln L=\sum_{i=1}^{n-1} \ln{n-i+1 \choose 2}-\ln\Theta-{n-i+1 \choose 2}\frac{t_{i+1}-t_i}{\Theta}.
\end{displaymath}
Its partial derivative with respect to $\Theta$ is:

\begin{displaymath}
\frac{\partial\ln L}{\partial\Theta}=\sum_{i=1}^{n-1} -\frac{1}{\Theta}
+{n-i+1 \choose 2}\frac{t_{i+1}-t_i}{\Theta^2},
\end{displaymath}
which, after setting $\partial\ln L/\partial\Theta = 0$ can be solved
to find the maximum likelihood estimator (MLE):

\begin{displaymath}
\widehat{\Theta}=\frac{1}{n-1}\sum_{i=1}^{n-1}{n-i+1 \choose 2}(t_{i+1}-t_i).
\end{displaymath}

Under the normal approximation of the likelihood function, the
variance of $\hat{\Theta}$ is calculated through the second derivative
of $\ln L$:

\begin{displaymath}
\frac{\partial^2\ln L}{\partial\Theta^2}=\sum_{i=1}^{n-1} \frac{1}{\Theta^2}
- 2\times{n-i+1 \choose 2}\frac{t_{i+1}-t_i}{\Theta^3},
\end{displaymath}
and:

\begin{displaymath}
\widehat{\mathrm{var}}(\widehat{\Theta})=
-\left[\frac{n-1}{\widehat{\Theta}^2}-\frac{2}{\widehat{\Theta}^3}
\sum_{i=1}^{n-1}{n-i+1 \choose 2}(t_{i+1}-t_i)\right]^{-1}.
\end{displaymath}
This estimator is implemented in \pegas\ with the function \code{theta.tree}.

\subsection{Exponential Growth Model}

The integral in equation~(\ref{eq:timecoal}) is:

\begin{displaymath}
\int_{t_{i}}^{t_{i+1}}\frac{1}{\Theta(u)}\mathrm{d}u = -\frac{1}{\rho\Theta_0}
(e^{-\rho t_{i+1}}-e^{-\rho t_{i}}),
\end{displaymath}
leading to the log-likelihood:

\begin{displaymath}
\ln L=\sum_{i=1}^{n-1} \ln{n-i+1 \choose 2}- \ln\Theta_0 - \rho t_{i+1}+{n-i+1 \choose 2}\frac{1}{\rho\Theta_0}(e^{-\rho t_{i+1}} - e^{-\rho t_{i}}),
\end{displaymath}
with its first partial derivatives being:

\begin{eqnarray*}
\frac{\partial\ln L}{\partial\Theta_0} &=& \sum_{i=1}^{n-1} -\frac{1}{\Theta_0}-{n-i+1 \choose 2}\frac{1}{\rho\Theta_0^2}(e^{-\rho t_{i+1}} - e^{-\rho t_{i}}),\\
\frac{\partial\ln L}{\partial\rho} &=& \sum_{i=2}^{n-1} -t_{i+1} + {n-i+1 \choose
  2}\frac{1}{\Theta_0}\left[-\frac{1}{\rho^2}(e^{-\rho t_{i+1}} - e^{-\rho
    t_{i}}) + \frac{1}{\rho}(-t_{i+1} e^{-\rho t_{i+1}} + t_{i}e^{-\rho t_{i}})\right].
\end{eqnarray*}

\subsection{Linear Growth Model}

We define $\kappa = (\Theta_{\tmrca} - \Theta_0)/\tmrca$, so $\Theta(t) =
\Theta_0 + \kappa t$. The integral in equation~(\ref{eq:timecoal}) is:

\begin{eqnarray*}
\displaystyle\int_{t_i}^{t_{i+1}}\frac{1}{\Theta(u)}\mathrm{d}u &=&
\displaystyle\frac{\ln(\Theta_0+\kappa t_{i+1})}{\kappa}-\frac{\ln(\Theta_0+\kappa t_i)}{\kappa}\\
&=& \displaystyle\frac{1}{\kappa}\ln\frac{\Theta_0+\kappa t_{i+1}}{\Theta_0+\kappa t_i}.
\end{eqnarray*}
The log-likelihood is thus:

\begin{displaymath}
\ln L = \sum_{i=1}^{n-1} \ln{n-i+1 \choose 2}-
\ln(\Theta_0+\kappa t_{i+1}) - {n-i+1 \choose 2}\frac{1}{\kappa}
\ln\frac{\Theta_0+\kappa t_{i+1}}{\Theta_0+\kappa t_i}.
\end{displaymath}

\subsection{Step Model}

It is easier to calculate the integral in equation~\ref{eq:timecoal}
with the difference:

\begin{equation}
\int_{t_i}^{t_{i+1}}\frac{1}{\Theta(u)}\mathrm{d}u =
\int_0^{t_{i+1}}\frac{1}{\Theta(u)}\mathrm{d}u -
\int_0^{t_i}\frac{1}{\Theta(u)}\mathrm{d}u.\label{eq:intpart}
\end{equation}
The integral from the origin is:

\begin{displaymath}
\int_0^t\frac{1}{\Theta(u)}\mathrm{d}u = \left\{
\begin{array}{lll}
&\displaystyle\frac{t}{\Theta_0} &\qquad t \le \tau\\
&\displaystyle\frac{\tau}{\Theta_0} + \frac{t - \tau}{\Theta_1}
&\qquad t > \tau.
\end{array} \right.
\end{displaymath}
This is then plugged into equation~\ref{eq:timecoal} with a simple
Dirac delta function.
%So the integral in equation~\ref{eq:timecoal} is:
%
%\begin{displaymath}
%\int_{t_{i-1}}^{t_i}\frac{1}{\Theta(u)}\mathrm{d}u = \left\{
%\begin{array}{lll}
%&\displaystyle\frac{t_i - t_{i-1}}{\Theta_0} &\qquad t_{i-1} < t_i \le \tau\\
%&\displaystyle\frac{\tau - t_{i-1}}{\Theta_0} + \frac{t_i - \tau}{\Theta_1}
%&\qquad t_{i-1} \le \tau < t_i\\
%&\displaystyle\frac{t_i - t_{i-1}}{\Theta_1} &\qquad \tau < t_{i-1} < t_i.
%\end{array} \right.
%\end{displaymath}

\subsection{Double Exponential Growth Model}

In this model the inverse of $\Theta(t)$ is:

\begin{displaymath}
\frac{1}{\Theta(t)} = \left\{
\begin{array}{lll}
&\displaystyle\frac{e^{-\rho_1 t}}{\Theta_0}&\qquad t \le \tau\\
&\displaystyle\frac{e^{-\rho_2 t - (\rho_1 - \rho_2)\tau}}{\Theta_0}&\qquad t > \tau
\end{array} \right.
\end{displaymath}
Again, it is easier to calculate the integral in equation~(\ref{eq:timecoal})
with equation~(\ref{eq:intpart}). The integral from the origin is:

\begin{displaymath}
\int_0^t\frac{1}{\Theta(u)}\mathrm{d}u = \left\{
\begin{array}{lll}
&\displaystyle-\frac{1}{\rho_1\Theta_0}(e^{-\rho_1 t} - 1) &\quad t \le \tau\\
&\displaystyle-\frac{1}{\rho_1\Theta_0}(e^{-\rho_1\tau} - 1) -
\frac{1}{\rho_2\Theta_0}[e^{-\rho_2 t - (\rho_1 - \rho_2)\tau} - e^{-\rho_1\tau}]
&\quad t\ge\tau
\end{array} \right.
\end{displaymath}
This is then plugged into equation~(\ref{eq:timecoal}) with a simple
Dirac delta function.

\section{Simulation of Coalescent Times}

It is possible to simulate coalescent times from a
time-dependent model by rescaling a set of coalescent times simulated
with constant $\Theta$, denoted as $t$, with:

\begin{displaymath}
t' = \frac{\displaystyle\int_0^t\Theta(u)\mathrm{d}u}{\Theta(0)}.
\end{displaymath}
This gives for the exponential growth model \cite{Kuhner1998}:

\begin{displaymath}
t' = \frac{e^{\rho t} - 1}{\rho},
\end{displaymath}
for the linear growth model:

\begin{displaymath}
t' = t + t^2(\Theta_{\tmrca}/\Theta_0 - 1)/{\tmrca},
\end{displaymath}
for the step model:

\begin{displaymath}
t' = \tau + (t - \tau)\Theta_1/\Theta_0\qquad \mathrm{if}\ t > \tau,
\end{displaymath}
and for the exponential double growth model:

\begin{displaymath}
t' = \left\{
\begin{array}{lll}
&\displaystyle\frac{e^{\rho_1 t} - 1}{\rho_1} &\qquad t \le \tau\\
&\displaystyle\frac{e^{\rho_1 \tau} - 1}{\rho_1}+\frac{e^{\rho_2 t + (\rho_1 - \rho_2)\tau} - e^{\rho_1\tau}}{\rho_2}
&\qquad t \ge \tau
\end{array} \right.
\end{displaymath}

\section{Implementation in \coalmcmc}

Five functions are available in \coalmcmc\ which compute the
likelihood of the constant-$\Theta$ model as well as the four above
ones:

\begin{Verbatim}[formatcom=\color{blue}]
dcoal(bt, theta, log = FALSE)
dcoal.time(bt, theta0, rho, log = FALSE)
dcoal.linear(bt, theta0, thetaT, TMRCA, log = FALSE)
dcoal.step(bt, theta0, theta1, tau, log = FALSE)
dcoal.time2(bt, theta0, rho1, rho2, tau, log = FALSE)
\end{Verbatim}
The two arguments common to all functions are:

\begin{description}
\item[\code{bt}:] a vector of branching times;
\item[\code{log}:] a logical value, if \code{TRUE} the values are
  returned log-transformed which is recommended for computing
  log-likelihoods.
\end{description}
The other arguments are the parameters of the models.

\bibliographystyle{plain}
\bibliography{coalescentMCMC}

\end{document}