\documentclass[a4paper]{article} %\VignetteIndexEntry{Coalescent Models} %\VignettePackage{coalescentMCMC} \usepackage[utf8]{inputenc} \usepackage{fancyvrb} \usepackage{color,natbib} \newcommand\tmrca{T_{\mathsf{MRCA}}} \newcommand{\code}{\texttt} \newcommand{\pkg}{\textsf} \newcommand{\coalmcmc}{\pkg{coalescentMCMC}} \newcommand{\pegas}{\pkg{pegas}} \author{Emmanuel Paradis} \title{Likelihood Functions of Time-Dependent Coalescent Models} \begin{document} \maketitle <>= options(width=60) @ \noindent Coalescent models describe the distribution of ancestry in a population under some assumptions on the variation in the parameter $\Theta = 2N\mu$, with $N$ being the number of alleles in the population and $\mu$ the neutral mutation rate. The present document gives the likelihood functions and some computational details for several models with $\Theta$ varying through time. These models are available in \coalmcmc\ as R functions (see below). The general mathematical framework is given by Griffiths \& Tavaré \cite{Griffiths1994}. If $\Theta$ is constant, the probability of observing the coalescent times $t_1, \dots, t_n$ is: \begin{displaymath} \prod_{i=1}^{n-1} {n-i+1 \choose 2}\frac{1}{\Theta} \exp\left[-{n-i+1 \choose 2}\frac{t_{i+1} - t_i}{\Theta}\right] \end{displaymath} where $t_1=0$ is the present time ($t_1 \tau \end{array} \right. \end{displaymath} The last model (\emph{double exponential growth model}) assumes that the population experienced two different phases of exponential growth: \begin{displaymath} \Theta(t) = \left\{ \begin{array}{lll} & \Theta_0 e^{\rho_1 t} &\qquad t \le \tau\\ & \Theta(\tau) e^{\rho_2(t - \tau)} = \Theta_0 e^{\rho_2 t + (\rho_1 - \rho_2)\tau}&\qquad t > \tau \end{array} \right. \end{displaymath} which reduces to the first model if $\rho_1 = \rho_2$. These two last models have three parameters. \subsection{Constant-$\Theta$ Model} The log-likelihood is: \begin{displaymath} \ln L=\sum_{i=1}^{n-1} \ln{n-i+1 \choose 2}-\ln\Theta-{n-i+1 \choose 2}\frac{t_{i+1}-t_i}{\Theta}. \end{displaymath} Its partial derivative with respect to $\Theta$ is: \begin{displaymath} \frac{\partial\ln L}{\partial\Theta}=\sum_{i=1}^{n-1} -\frac{1}{\Theta} +{n-i+1 \choose 2}\frac{t_{i+1}-t_i}{\Theta^2}, \end{displaymath} which, after setting $\partial\ln L/\partial\Theta = 0$ can be solved to find the maximum likelihood estimator (MLE): \begin{displaymath} \widehat{\Theta}=\frac{1}{n-1}\sum_{i=1}^{n-1}{n-i+1 \choose 2}(t_{i+1}-t_i). \end{displaymath} Under the normal approximation of the likelihood function, the variance of $\hat{\Theta}$ is calculated through the second derivative of $\ln L$: \begin{displaymath} \frac{\partial^2\ln L}{\partial\Theta^2}=\sum_{i=1}^{n-1} \frac{1}{\Theta^2} - 2\times{n-i+1 \choose 2}\frac{t_{i+1}-t_i}{\Theta^3}, \end{displaymath} and: \begin{displaymath} \widehat{\mathrm{var}}(\widehat{\Theta})= -\left[\frac{n-1}{\widehat{\Theta}^2}-\frac{2}{\widehat{\Theta}^3} \sum_{i=1}^{n-1}{n-i+1 \choose 2}(t_{i+1}-t_i)\right]^{-1}. \end{displaymath} This estimator is implemented in \pegas\ with the function \code{theta.tree}. \subsection{Exponential Growth Model} The integral in equation~(\ref{eq:timecoal}) is: \begin{displaymath} \int_{t_{i}}^{t_{i+1}}\frac{1}{\Theta(u)}\mathrm{d}u = -\frac{1}{\rho\Theta_0} (e^{-\rho t_{i+1}}-e^{-\rho t_{i}}), \end{displaymath} leading to the log-likelihood: \begin{displaymath} \ln L=\sum_{i=1}^{n-1} \ln{n-i+1 \choose 2}- \ln\Theta_0 - \rho t_{i+1}+{n-i+1 \choose 2}\frac{1}{\rho\Theta_0}(e^{-\rho t_{i+1}} - e^{-\rho t_{i}}), \end{displaymath} with its first partial derivatives being: \begin{eqnarray*} \frac{\partial\ln L}{\partial\Theta_0} &=& \sum_{i=1}^{n-1} -\frac{1}{\Theta_0}-{n-i+1 \choose 2}\frac{1}{\rho\Theta_0^2}(e^{-\rho t_{i+1}} - e^{-\rho t_{i}}),\\ \frac{\partial\ln L}{\partial\rho} &=& \sum_{i=2}^{n-1} -t_{i+1} + {n-i+1 \choose 2}\frac{1}{\Theta_0}\left[-\frac{1}{\rho^2}(e^{-\rho t_{i+1}} - e^{-\rho t_{i}}) + \frac{1}{\rho}(-t_{i+1} e^{-\rho t_{i+1}} + t_{i}e^{-\rho t_{i}})\right]. \end{eqnarray*} \subsection{Linear Growth Model} We define $\kappa = (\Theta_{\tmrca} - \Theta_0)/\tmrca$, so $\Theta(t) = \Theta_0 + \kappa t$. The integral in equation~(\ref{eq:timecoal}) is: \begin{eqnarray*} \displaystyle\int_{t_i}^{t_{i+1}}\frac{1}{\Theta(u)}\mathrm{d}u &=& \displaystyle\frac{\ln(\Theta_0+\kappa t_{i+1})}{\kappa}-\frac{\ln(\Theta_0+\kappa t_i)}{\kappa}\\ &=& \displaystyle\frac{1}{\kappa}\ln\frac{\Theta_0+\kappa t_{i+1}}{\Theta_0+\kappa t_i}. \end{eqnarray*} The log-likelihood is thus: \begin{displaymath} \ln L = \sum_{i=1}^{n-1} \ln{n-i+1 \choose 2}- \ln(\Theta_0+\kappa t_{i+1}) - {n-i+1 \choose 2}\frac{1}{\kappa} \ln\frac{\Theta_0+\kappa t_{i+1}}{\Theta_0+\kappa t_i}. \end{displaymath} \subsection{Step Model} It is easier to calculate the integral in equation~\ref{eq:timecoal} with the difference: \begin{equation} \int_{t_i}^{t_{i+1}}\frac{1}{\Theta(u)}\mathrm{d}u = \int_0^{t_{i+1}}\frac{1}{\Theta(u)}\mathrm{d}u - \int_0^{t_i}\frac{1}{\Theta(u)}\mathrm{d}u.\label{eq:intpart} \end{equation} The integral from the origin is: \begin{displaymath} \int_0^t\frac{1}{\Theta(u)}\mathrm{d}u = \left\{ \begin{array}{lll} &\displaystyle\frac{t}{\Theta_0} &\qquad t \le \tau\\ &\displaystyle\frac{\tau}{\Theta_0} + \frac{t - \tau}{\Theta_1} &\qquad t > \tau. \end{array} \right. \end{displaymath} This is then plugged into equation~\ref{eq:timecoal} with a simple Dirac delta function. %So the integral in equation~\ref{eq:timecoal} is: % %\begin{displaymath} %\int_{t_{i-1}}^{t_i}\frac{1}{\Theta(u)}\mathrm{d}u = \left\{ %\begin{array}{lll} %&\displaystyle\frac{t_i - t_{i-1}}{\Theta_0} &\qquad t_{i-1} < t_i \le \tau\\ %&\displaystyle\frac{\tau - t_{i-1}}{\Theta_0} + \frac{t_i - \tau}{\Theta_1} %&\qquad t_{i-1} \le \tau < t_i\\ %&\displaystyle\frac{t_i - t_{i-1}}{\Theta_1} &\qquad \tau < t_{i-1} < t_i. %\end{array} \right. %\end{displaymath} \subsection{Double Exponential Growth Model} In this model the inverse of $\Theta(t)$ is: \begin{displaymath} \frac{1}{\Theta(t)} = \left\{ \begin{array}{lll} &\displaystyle\frac{e^{-\rho_1 t}}{\Theta_0}&\qquad t \le \tau\\ &\displaystyle\frac{e^{-\rho_2 t - (\rho_1 - \rho_2)\tau}}{\Theta_0}&\qquad t > \tau \end{array} \right. \end{displaymath} Again, it is easier to calculate the integral in equation~(\ref{eq:timecoal}) with equation~(\ref{eq:intpart}). The integral from the origin is: \begin{displaymath} \int_0^t\frac{1}{\Theta(u)}\mathrm{d}u = \left\{ \begin{array}{lll} &\displaystyle-\frac{1}{\rho_1\Theta_0}(e^{-\rho_1 t} - 1) &\quad t \le \tau\\ &\displaystyle-\frac{1}{\rho_1\Theta_0}(e^{-\rho_1\tau} - 1) - \frac{1}{\rho_2\Theta_0}[e^{-\rho_2 t - (\rho_1 - \rho_2)\tau} - e^{-\rho_1\tau}] &\quad t\ge\tau \end{array} \right. \end{displaymath} This is then plugged into equation~(\ref{eq:timecoal}) with a simple Dirac delta function. \section{Simulation of Coalescent Times} It is possible to simulate coalescent times from a time-dependent model by rescaling a set of coalescent times simulated with constant $\Theta$, denoted as $t$, with: \begin{displaymath} t' = \frac{\displaystyle\int_0^t\Theta(u)\mathrm{d}u}{\Theta(0)}. \end{displaymath} This gives for the exponential growth model \cite{Kuhner1998}: \begin{displaymath} t' = \frac{e^{\rho t} - 1}{\rho}, \end{displaymath} for the linear growth model: \begin{displaymath} t' = t + t^2(\Theta_{\tmrca}/\Theta_0 - 1)/{\tmrca}, \end{displaymath} for the step model: \begin{displaymath} t' = \tau + (t - \tau)\Theta_1/\Theta_0\qquad \mathrm{if}\ t > \tau, \end{displaymath} and for the exponential double growth model: \begin{displaymath} t' = \left\{ \begin{array}{lll} &\displaystyle\frac{e^{\rho_1 t} - 1}{\rho_1} &\qquad t \le \tau\\ &\displaystyle\frac{e^{\rho_1 \tau} - 1}{\rho_1}+\frac{e^{\rho_2 t + (\rho_1 - \rho_2)\tau} - e^{\rho_1\tau}}{\rho_2} &\qquad t \ge \tau \end{array} \right. \end{displaymath} \section{Implementation in \coalmcmc} Five functions are available in \coalmcmc\ which compute the likelihood of the constant-$\Theta$ model as well as the four above ones: \begin{Verbatim}[formatcom=\color{blue}] dcoal(bt, theta, log = FALSE) dcoal.time(bt, theta0, rho, log = FALSE) dcoal.linear(bt, theta0, thetaT, TMRCA, log = FALSE) dcoal.step(bt, theta0, theta1, tau, log = FALSE) dcoal.time2(bt, theta0, rho1, rho2, tau, log = FALSE) \end{Verbatim} The two arguments common to all functions are: \begin{description} \item[\code{bt}:] a vector of branching times; \item[\code{log}:] a logical value, if \code{TRUE} the values are returned log-transformed which is recommended for computing log-likelihoods. \end{description} The other arguments are the parameters of the models. \bibliographystyle{plain} \bibliography{coalescentMCMC} \end{document}