\documentclass[a4paper]{article}
\usepackage[round]{natbib}
\usepackage{hyperref, amsmath, amssymb}
\usepackage{Sweave}
%\VignetteIndexEntry{KKT Conditions for Zero-Inflated Regression}
\hypersetup{%
pdftitle = {KKT Conditions for Zero-Inflated Regression},
pdfsubject = {package vignette},
pdfauthor = {Zhu Wang},
%% change colorlinks to false for pretty printing
colorlinks = {true},
linkcolor = {blue},
citecolor = {blue},
urlcolor = {red},
hyperindex = {true},
linktocpage = {true},
}
\author{Zhu Wang \\ University of Tennessee Health Science Center\\ zwang145@uthsc.edu}
\title{KKT Conditions for Zero-Inflated Regression}
\begin{document}
\maketitle
This technical report details the calculation of the solution paths in the \textbf{\textsf{R}} package \textbf{mpath} for penalized zero-inflated regression; it is supplementary to the publications \cite{wang2014zip, wang2014zinb}.
\section{Zero-inflated Poisson regression}
Assume the response variable $Y$ has a zero-inflated Poisson (ZIP) distribution, and let $G_i$ and $B_i$ denote $(d_1+1)$- and $(d_2+1)$-dimensional covariate vectors, respectively. The parameters $\mu=(\mu_{1}, \mu_{2}, \ldots, \mu_{n})^\intercal$ and $\pi_i$ are modeled by $\log(\mu_i)=B_i^\intercal\beta$ and $\text{logit}(\pi_i)=\log(\pi_i/(1-\pi_i))=G_i^\intercal\zeta$ for covariate matrices $B$ and $G$, which may differ. To include an intercept, let $B_{i0}=G_{i0}=1$. Define $\Phi=(\zeta^\intercal,\beta^\intercal)^\intercal$ with length $d=d_1+d_2+2$. The log-likelihood function $\ell_{ZIP}(\Phi;y)$ is given by
\begin{equation}\label{eqn:llik}
\begin{aligned}
\ell_{ZIP}(\Phi;y)&=\sum_{y_i=0} \log(\exp(G_i^\intercal\zeta) + \exp(-\exp(B_i^\intercal\beta))) + \sum_{y_i > 0}(y_iB_i^\intercal\beta - \exp(B_i^\intercal\beta)) \\
&- \sum_{i=1}^{n}\log(1+\exp(G_i^\intercal\zeta)) - \sum_{y_i > 0}\log(y_i !).
\end{aligned}
\end{equation}
We minimize a penalized negative log-likelihood function for the ZIP model:
\begin{equation}\label{eqn:plik}
p\ell_{ZIP}(\Phi;y)=-\ell_{ZIP}(\Phi;y) + p(\zeta, \beta),
\end{equation}
where
\begin{equation}\label{eqn:plik1}
p(\zeta, \beta)=n\sum_{j=1}^{d_1}\left(\alpha_1\lambda_1|\zeta_j|+\frac{\lambda_1(1-\alpha_1)}{2}\zeta_j^2\right) + n\sum_{k=1}^{d_2}\left(\alpha_2\lambda_2|\beta_k|+\frac{\lambda_2(1-\alpha_2)}{2}\beta_k^2\right).
\end{equation}
In the \textbf{\textsf{R}} package \textbf{mpath}, $\alpha_1, \alpha_2$ are labeled as \texttt{alpha.zero, alpha.count}, respectively; $\lambda_1, \lambda_2$ are labeled as \texttt{lambda.zero, lambda.count}, respectively. A point $\hat\Phi$ is a minimizer of $p\ell_{ZIP}(\Phi; y)$ if and only if $p\ell_{ZIP}(\Phi; y)$ is subdifferentiable at $\hat\Phi$ and the zero vector is a subgradient of $p\ell_{ZIP}(\Phi; y)$ at $\hat\Phi$. Take derivatives:
\begin{equation}\label{eqn:deri1}
\begin{aligned}
\frac{\partial\ell_{ZIP}(\Phi; y)}{\partial\zeta_j}=&\sum_{y_i=0}\frac{\exp(G_i^\intercal\zeta)G_{ij}}{\exp(G_i^\intercal\zeta)+\exp(-\exp(B_i^\intercal\beta))} - \sum_{i=1}^n\frac{\exp(G_i^\intercal\zeta)G_{ij}}{1+\exp(G_i^\intercal\zeta)},\\
\frac{\partial\ell_{ZIP}(\Phi; y)}{\partial\beta_k}=&\sum_{y_i=0}\frac{\exp(-\exp(B_i^\intercal\beta))(-\exp(B_i^\intercal\beta))B_{ik}}{\exp(G_i^\intercal\zeta)+\exp(-\exp(B_i^\intercal\beta))}\\
&+\sum_{y_i>0}(y_iB_{ik}-\exp(B_i^\intercal\beta)B_{ik}).
\end{aligned}
\end{equation}
Throughout this document, $j=1, \ldots, d_1$ and $k=1, \ldots, d_2$ unless otherwise specified.
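As an illustration of (\ref{eqn:plik}) and (\ref{eqn:deri1}), the following \textbf{\textsf{R}} code is a minimal sketch, not the \textbf{mpath} implementation, that evaluates the penalized negative log-likelihood and the score vector for given coefficients; the inputs \texttt{y}, \texttt{B}, \texttt{G}, \texttt{zeta}, \texttt{beta} and the tuning parameters are hypothetical, and the first columns of \texttt{B} and \texttt{G} are assumed to be the (unpenalized) intercepts.
\begin{verbatim}
## ZIP penalized negative log-likelihood (eqn:plik) -- illustrative sketch
zip_negloglik_pen <- function(y, B, G, zeta, beta,
                              lambda1, lambda2, alpha1, alpha2) {
  n   <- length(y)
  eta <- drop(G %*% zeta)               # G_i' zeta
  xb  <- drop(B %*% beta)               # B_i' beta
  mu  <- exp(xb)
  i0  <- y == 0
  ll  <- sum(log(exp(eta[i0]) + exp(-mu[i0]))) +
         sum(y[!i0] * xb[!i0] - mu[!i0]) -
         sum(log1p(exp(eta))) - sum(lfactorial(y[!i0]))
  ## elastic net penalty (eqn:plik1); intercepts are not penalized
  pen <- n * sum(alpha1 * lambda1 * abs(zeta[-1]) +
                 lambda1 * (1 - alpha1) / 2 * zeta[-1]^2) +
         n * sum(alpha2 * lambda2 * abs(beta[-1]) +
                 lambda2 * (1 - alpha2) / 2 * beta[-1]^2)
  -ll + pen
}

## ZIP score vector (eqn:deri1) -- illustrative sketch
zip_score <- function(y, B, G, zeta, beta) {
  eta   <- drop(G %*% zeta)
  mu    <- exp(drop(B %*% beta))
  i0    <- y == 0
  denom <- exp(eta) + exp(-mu)          # only contributes where y_i = 0
  dzeta <- colSums((i0 * exp(eta) / denom) * G) -
           colSums((exp(eta) / (1 + exp(eta))) * G)
  dbeta <- colSums((i0 * exp(-mu) * (-mu) / denom) * B) +
           colSums((!i0) * (y - mu) * B)
  list(zeta = dzeta, beta = dbeta)
}
\end{verbatim}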
It is clear that $n\alpha_1\lambda_1\text{sign}(\hat\zeta_j)+n\lambda_1(1-\alpha_1)\hat\zeta_j$ is a subgradient of $p(\zeta, \beta)$ with respect to $\zeta_j$ at $\hat\zeta_j\neq0$, $n\alpha_2\lambda_2\text{sign}(\hat\beta_k)+n\lambda_2(1-\alpha_2)\hat\beta_k$ is a subgradient with respect to $\beta_k$ at $\hat\beta_k\neq0$, $n\alpha_1\lambda_1e_1$ for $e_1\in [-1,1]$ is a subgradient at $\hat\zeta_j=0$, and $n\alpha_2\lambda_2e_2$ for $e_2\in [-1,1]$ is a subgradient at $\hat\beta_k=0$. If $\hat\Phi$ is a minimizer of $p\ell_{ZIP}(\Phi; y)$, then $0 \in \partial p\ell_{ZIP}(\hat\Phi; y)$, the subdifferential of $p\ell_{ZIP}(\Phi; y)$ at $\hat\Phi$, which leads to the KKT conditions:
\begin{equation}\label{eqn:kktp}
\begin{aligned}
\text{if } \hat\zeta_j&\neq0: -\frac{\partial\ell_{ZIP}(\hat\Phi; y)}{\partial\zeta_j}+n\alpha_1\lambda_1\text{sign}(\hat\zeta_j)+n\lambda_1(1-\alpha_1)\hat\zeta_j=0,\\
\text{if } \hat\zeta_j&=0: -\frac{\partial\ell_{ZIP}(\hat\Phi; y)}{\partial\zeta_j}+n\alpha_1\lambda_1e_1=0, \\
\text{if } \hat\beta_k&\neq0: -\frac{\partial\ell_{ZIP}(\hat\Phi; y)}{\partial\beta_k}+n\alpha_2\lambda_2\text{sign}(\hat\beta_k)+n\lambda_2(1-\alpha_2)\hat\beta_k=0,\\
\text{if } \hat\beta_k&=0: -\frac{\partial\ell_{ZIP}(\hat\Phi; y)}{\partial\beta_k}+n\alpha_2\lambda_2e_2=0.
\end{aligned}
\end{equation}
Therefore, if $\hat\zeta_j=0$ and $\hat\beta_k=0$, it must hold that
\begin{equation}\label{eqn:lmax}
\left|\frac{\partial\ell_{ZIP}(\hat\Phi; y)}{\partial\zeta_j}\right| \leq n\alpha_1\lambda_1, \qquad
\left|\frac{\partial\ell_{ZIP}(\hat\Phi; y)}{\partial\beta_k}\right| \leq n\alpha_2\lambda_2.
\end{equation}
Denote by $\lambda_{1, \max}$ and $\lambda_{2, \max}$ the smallest values of $\lambda_1$ and $\lambda_2$, respectively, such that $\hat\zeta_j=0$ and $\hat\beta_k=0$ for all $j$ and $k$. Then $(\lambda_{1, \max}, \lambda_{2, \max})$ can be determined from (\ref{eqn:lmax}) and the following quantities, in which $\zeta_0$ and $\beta_0$ are the estimates from an intercept-only ZIP model:
\begin{equation}
\begin{aligned}
\frac{\partial\ell_{ZIP}(\Phi; y)}{\partial\zeta_j}=&\sum_{y_i=0}\frac{\exp(\zeta_0)G_{ij}}{\exp(\zeta_0)+\exp(-\exp(\beta_0))} - \sum_{i=1}^n\frac{\exp(\zeta_0)G_{ij}}{1+\exp(\zeta_0)},\\
\frac{\partial\ell_{ZIP}(\Phi; y)}{\partial\beta_k}=&\sum_{y_i=0}\frac{\exp(-\exp(\beta_0))(-\exp(\beta_0))B_{ik}}{\exp(\zeta_0)+\exp(-\exp(\beta_0))}\\
&+\sum_{y_i>0}(y_iB_{ik}-\exp(\beta_0)B_{ik}).
\end{aligned}
\end{equation}
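This direct construction of $(\lambda_{1, \max}, \lambda_{2, \max})$ can be sketched in \textbf{\textsf{R}} as follows. This is an illustrative sketch rather than the \textbf{mpath} code: \texttt{zeta0} and \texttt{beta0} are assumed to be the estimates from an intercept-only ZIP fit, and \texttt{y}, \texttt{B}, \texttt{G}, \texttt{alpha1}, \texttt{alpha2} are hypothetical inputs, with the first columns of \texttt{B} and \texttt{G} equal to 1.
\begin{verbatim}
## Smallest (lambda1, lambda2) with all non-intercept coefficients zero,
## obtained from (eqn:lmax) evaluated at intercept-only estimates
zip_lambda_max <- function(y, B, G, zeta0, beta0, alpha1, alpha2) {
  n     <- length(y)
  i0    <- y == 0
  denom <- exp(zeta0) + exp(-exp(beta0))
  ## dl/dzeta_j, j = 1, ..., d1, at the intercept-only solution
  dzeta <- colSums(G[i0, -1, drop = FALSE]) * exp(zeta0) / denom -
           colSums(G[, -1, drop = FALSE]) * exp(zeta0) / (1 + exp(zeta0))
  ## dl/dbeta_k, k = 1, ..., d2, at the intercept-only solution
  dbeta <- colSums(B[i0, -1, drop = FALSE]) *
             exp(-exp(beta0)) * (-exp(beta0)) / denom +
           colSums((!i0) * (y - exp(beta0)) * B[, -1, drop = FALSE])
  c(lambda1.max = max(abs(dzeta)) / (n * alpha1),
    lambda2.max = max(abs(dbeta)) / (n * alpha2))
}
\end{verbatim}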
There is an alternative approach to construct $(\lambda_{1, \max}, \lambda_{2, \max})$, as in \citet{wang2014zip}. In mixture models, the EM algorithm is set up by introducing missing data into the problem. Suppose we could observe which zeros came from the zero state and which came from the Poisson state; i.e., suppose we knew $z_i=1$ when $y_i$ is from the zero state and $z_i=0$ when $y_i$ is from the Poisson state. Denote $z=(z_1, z_2, \ldots, z_n)^\intercal$. The complete data $(y, z)$ log-likelihood function can be written as
\begin{equation}\label{eqn:cllik}
\begin{aligned}
\ell_{ZIP}^c(\Phi;y,z)=&\sum_{i=1}^n \left\{z_iG_i^\intercal\zeta - \log(1+\exp(G_i^\intercal\zeta))\right\}\\
&+ \sum_{i=1}^n(1-z_i)\left\{y_iB_i^\intercal\beta-\exp(B_i^\intercal\beta)-\log(y_i!)\right\}.
\end{aligned}
\end{equation}
The complete data penalized negative log-likelihood function is then given by
\begin{equation}\label{eqn:plzip}
p\ell_{ZIP}^c(\Phi;y,z)=-\ell_{ZIP}^c(\Phi;y,z)+p(\zeta, \beta).
\end{equation}
Taking derivatives of (\ref{eqn:cllik}), we obtain
\begin{equation}\label{eqn:deri2}
\begin{aligned}
\frac{\partial\ell_{ZIP}^c(\Phi;y,z)}{\partial \zeta_j}&=\sum_{i=1}^n\left\{z_iG_{ij}-\frac{\exp(G_i^\intercal\zeta)G_{ij}}{1+\exp(G_i^\intercal\zeta)}\right\},\\
\frac{\partial\ell_{ZIP}^c(\Phi;y,z)}{\partial \beta_k}&=\sum_{i=1}^n(1-z_i)\left\{y_iB_{ik}-\exp(B_i^\intercal\beta)B_{ik}\right\}.
\end{aligned}
\end{equation}
The KKT conditions of a minimizer $\hat\Phi$ of $p\ell_{ZIP}^c(\Phi;y,z)$ are given by:
\begin{equation}\label{eqn:kktpc}
\begin{aligned}
\text{if } \hat\zeta_j&\neq0: -\frac{\partial\ell_{ZIP}^c(\hat\Phi; y,z)}{\partial\zeta_j}+n\alpha_1\lambda_1\text{sign}(\hat\zeta_j)+n\lambda_1(1-\alpha_1)\hat\zeta_j=0,\\
\text{if } \hat\zeta_j&=0: -\frac{\partial\ell_{ZIP}^c(\hat\Phi; y,z)}{\partial\zeta_j}+n\alpha_1\lambda_1e_1=0, \\
\text{if } \hat\beta_k&\neq0: -\frac{\partial\ell_{ZIP}^c(\hat\Phi; y,z)}{\partial\beta_k}+n\alpha_2\lambda_2\text{sign}(\hat\beta_k)+n\lambda_2(1-\alpha_2)\hat\beta_k=0,\\
\text{if } \hat\beta_k&=0: -\frac{\partial\ell_{ZIP}^c(\hat\Phi; y,z)}{\partial\beta_k}+n\alpha_2\lambda_2e_2=0.
\end{aligned}
\end{equation}
Therefore, if $\hat\zeta_j=0$ and $\hat\beta_k=0$, it must hold that
\begin{equation}\label{eqn:lmax1}
\left|\frac{\partial\ell_{ZIP}^c(\hat\Phi; y,z)}{\partial\zeta_j}\right| \leq n\alpha_1\lambda_1, \qquad
\left|\frac{\partial\ell_{ZIP}^c(\hat\Phi; y,z)}{\partial\beta_k}\right| \leq n\alpha_2\lambda_2.
\end{equation}
The EM algorithm estimates $z_i$ at iteration $m$ by its conditional mean $z_i^{(m)}$ given below:
\begin{equation}\label{eqn:z}
\begin{aligned}
z_i^{(m)} &=
\begin{cases}
\left[1+\exp(-G_i^\intercal\zeta^{(m)} - \exp(B_i^\intercal \beta^{(m)}))\right]^{-1},&\textrm{ if } y_i=0,\\
0,&\textrm{ if } y_i=1, 2, \ldots.
\end{cases}
\end{aligned}
\end{equation}
Let $\zeta^{(m)}=\zeta$ and $\beta^{(m)}=\beta$; then (\ref{eqn:z}) becomes
\begin{equation}\label{eqn:z1}
\begin{aligned}
z_i &=
\begin{cases}
\left[1+\exp(-G_i^\intercal\zeta - \exp(B_i^\intercal \beta))\right]^{-1},&\textrm{ if } y_i=0,\\
0,&\textrm{ if } y_i=1, 2, \ldots.
\end{cases}
\end{aligned}
\end{equation}
It is a simple exercise to show that the right-hand side of (\ref{eqn:deri2}) is the same as that of (\ref{eqn:deri1}) once (\ref{eqn:z1}) is plugged into (\ref{eqn:deri2}). Hence, the KKT conditions (\ref{eqn:kktpc}) are the same as (\ref{eqn:kktp}) once (\ref{eqn:z1}) is plugged into (\ref{eqn:deri2}). These connections offer a different method to derive $(\hat\lambda_{1, \max}, \hat\lambda_{2, \max})$ such that $\hat\zeta_j=0$ and $\hat\beta_k=0$ hold \citep{wang2014zip}. We first estimate $\zeta_0$ and $\beta_0$ from an intercept-only ZIP model; then (\ref{eqn:z1}) reduces to
\begin{equation}\label{eqn:z2}
\begin{aligned}
z_i &=
\begin{cases}
\left[1+\exp(-\zeta_0 - \exp(\beta_0))\right]^{-1},&\textrm{ if } y_i=0,\\
0,&\textrm{ if } y_i=1, 2, \ldots.
\end{cases}
\end{aligned}
\end{equation}
Plugging in (\ref{eqn:z2}), $(\hat\lambda_{1, \max}, \hat\lambda_{2, \max})$ are computed based on (\ref{eqn:lmax1}). Furthermore, we have shown that $(\lambda_{1, \max}, \lambda_{2, \max})=(\hat\lambda_{1, \max}, \hat\lambda_{2, \max})$ holds, a special case of a more general result \citep{wang2016robust3}.
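The equivalence between (\ref{eqn:deri2}) with (\ref{eqn:z1}) plugged in and (\ref{eqn:deri1}) can also be checked numerically. The following \textbf{\textsf{R}} sketch does so on simulated data with arbitrary coefficient values (all quantities are hypothetical), using \texttt{zip\_score} from the earlier sketch.
\begin{verbatim}
set.seed(1)
n <- 200
B <- cbind(1, rnorm(n)); G <- cbind(1, rnorm(n))
beta <- c(0.5, 0.3); zeta <- c(-1, 0.2)
mu <- exp(drop(B %*% beta)); pzero <- plogis(drop(G %*% zeta))
y <- ifelse(runif(n) < pzero, 0, rpois(n, mu))

## z_i from (eqn:z1)
z <- ifelse(y == 0, 1 / (1 + exp(-drop(G %*% zeta) - mu)), 0)

## complete-data score (eqn:deri2) with z plugged in
dzeta_c <- colSums((z - plogis(drop(G %*% zeta))) * G)
dbeta_c <- colSums((1 - z) * (y - mu) * B)

## observed-data score (eqn:deri1)
sc <- zip_score(y, B, G, zeta, beta)
all.equal(unname(sc$zeta), unname(dzeta_c))  # TRUE up to numerical error
all.equal(unname(sc$beta), unname(dbeta_c))  # TRUE up to numerical error
\end{verbatim}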
\section{Zero-inflated negative binomial regression}
Assume the response variable $Y$ has a zero-inflated negative binomial (ZINB) distribution, and let $G_i$ and $B_i$ denote $(d_1+1)$- and $(d_2+1)$-dimensional covariate vectors, respectively. As before, the first entry of these vectors is 1.
In ZINB regression, assume $\log(\mu_i)=B_i^\intercal\beta$ and $\log(\frac{p_i}{1-p_i})=G_i^\intercal\zeta$, where $\zeta=(\zeta_0, \zeta_1, \ldots, \zeta_{d_1})^\intercal$ and $\beta=(\beta_0, \beta_1, \ldots, \beta_{d_2})^\intercal$ are unknown parameters. Here $\zeta_0$ and $\beta_0$ are intercepts. For $n$ independent random samples, denote $\Phi=(\zeta^\intercal, \beta^\intercal, \theta)^\intercal$; the log-likelihood function is then given by
\begin{equation*}\label{eqn:ll}
\begin{aligned}
\ell_{ZINB}(\Phi; y)&=\sum_{y_i=0} \log\left[p_i+(1-p_i)\left(\frac{\theta}{\mu_i+\theta}\right)^\theta\right]\\
&+\sum_{y_i>0}\log \left[(1-p_i)\frac{\Gamma(\theta+y_i)}{\Gamma(y_i+1)\Gamma(\theta)}\left(\frac{\mu_i}{\mu_i+\theta}\right)^{y_i}\left(\frac{\theta}{\mu_i+\theta}\right)^\theta\right],
\end{aligned}
\end{equation*}
where $\mu_i=\exp(B_i^\intercal\beta)$ and $p_i=\frac{\exp(G_i^\intercal\zeta)}{1+\exp(G_i^\intercal\zeta)}$. The derivatives are given by:
\begin{equation}\label{eqn:deri3}
\begin{aligned}
\frac{\partial\ell_{ZINB}(\Phi; y)}{\partial\zeta_j}&=\sum_{y_i=0}\frac{\frac{\partial p_i}{\partial \zeta_j}-\frac{\partial p_i}{\partial \zeta_j} \left(\frac{\theta}{\mu_i+\theta}\right)^\theta} {p_i+(1-p_i)\left(\frac{\theta}{\mu_i+\theta}\right)^\theta} -\sum_{y_i>0}\frac{\partial p_i}{\partial \zeta_j}\frac{1}{1-p_i},\\
\frac{\partial\ell_{ZINB}(\Phi; y)}{\partial\beta_k}&=\sum_{y_i=0}\frac{-(1-p_i) \frac{\partial \mu_i}{\partial \beta_k} \frac{\theta\left(\frac{\theta}{\mu_i+\theta}\right)^{\theta}}{\mu_i+\theta}} {p_i+(1-p_i)\left(\frac{\theta}{\mu_i+\theta}\right)^\theta} +\sum_{y_i>0} \frac{\partial \mu_i}{\partial \beta_k} \left(\frac{y_i}{\mu_i}-\frac{y_i+\theta}{\mu_i+\theta}\right),
\end{aligned}
\end{equation}
where
\begin{equation*}
\begin{aligned}
\frac{\partial p_i}{\partial \zeta_j}&=\frac{G_{ij}\exp(G_i^\intercal\zeta)}{(1+\exp(G_i^\intercal \zeta))^2},\\
\frac{\partial \mu_i}{\partial \beta_k}&=B_{ik}\exp(B_i^\intercal\beta).
\end{aligned}
\end{equation*}
For variable selection, consider minimizing a penalized negative log-likelihood function:
\begin{equation}\label{eqn:pll}
p\ell_{ZINB}(\Phi; y)=-\ell_{ZINB}(\Phi; y)+p(\zeta, \beta),
\end{equation}
where $p(\zeta, \beta)$ is given by (\ref{eqn:plik1}). The KKT conditions for a minimizer $\hat\Phi$ of $p\ell_{ZINB}(\Phi; y)$ can be derived as in (\ref{eqn:kktp}). Therefore, if $\hat\zeta_j=0$ and $\hat\beta_k=0$, it must hold that
\begin{equation}\label{eqn:lmaxnb}
\left|\frac{\partial\ell_{ZINB}(\hat\Phi; y)}{\partial\zeta_j}\right| \leq n\alpha_1\lambda_1, \qquad
\left|\frac{\partial\ell_{ZINB}(\hat\Phi; y)}{\partial\beta_k}\right| \leq n\alpha_2\lambda_2.
\end{equation}
Denote by $\lambda_{1, \max}$ and $\lambda_{2, \max}$ the smallest values of $\lambda_1$ and $\lambda_2$, respectively, such that $\hat\zeta_j=0$ and $\hat\beta_k=0$ for all $j$ and $k$. Then $(\lambda_{1, \max}, \lambda_{2, \max})$ can be determined from (\ref{eqn:deri3}), (\ref{eqn:lmaxnb}) and the following quantities, in which $\zeta_0$, $\beta_0$ and $\theta$ are estimated from an intercept-only ZINB model:
\begin{equation}
p_i=\frac{\exp(\zeta_0)}{1+\exp(\zeta_0)}, \quad \frac{\partial p_i}{\partial \zeta_j}=\frac{G_{ij}\exp(\zeta_0)}{(1+\exp(\zeta_0))^2}, \quad \mu_i=\exp(\beta_0), \quad \frac{\partial \mu_i}{\partial \beta_k}=B_{ik}\exp(\beta_0).
\end{equation}
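Analogously to the ZIP case, this construction of $(\lambda_{1, \max}, \lambda_{2, \max})$ for the ZINB model can be sketched in \textbf{\textsf{R}} as below. Again, this is an illustrative sketch rather than the \textbf{mpath} code: \texttt{zeta0}, \texttt{beta0} and \texttt{theta} are assumed to come from an intercept-only ZINB fit, and the remaining inputs are hypothetical, with the first columns of \texttt{B} and \texttt{G} equal to 1.
\begin{verbatim}
## Smallest (lambda1, lambda2) with all non-intercept coefficients zero,
## obtained from (eqn:deri3) and (eqn:lmaxnb) at intercept-only estimates
zinb_lambda_max <- function(y, B, G, zeta0, beta0, theta, alpha1, alpha2) {
  n   <- length(y)
  i0  <- y == 0
  p   <- exp(zeta0) / (1 + exp(zeta0))
  mu  <- exp(beta0)
  k   <- (theta / (mu + theta))^theta      # (theta/(mu+theta))^theta
  den <- p + (1 - p) * k                   # P(Y = 0) under the ZINB model
  dp  <- exp(zeta0) / (1 + exp(zeta0))^2   # dp_i/dzeta_j without G_{ij}
  dmu <- mu                                # dmu_i/dbeta_k without B_{ik}
  dzeta <- colSums(G[i0, -1, drop = FALSE]) * dp * (1 - k) / den -
           colSums(G[!i0, -1, drop = FALSE]) * dp / (1 - p)
  dbeta <- colSums(B[i0, -1, drop = FALSE]) *
             (-(1 - p) * dmu * theta * k / (mu + theta)) / den +
           colSums((!i0) * (y / mu - (y + theta) / (mu + theta)) *
                   B[, -1, drop = FALSE]) * dmu
  c(lambda1.max = max(abs(dzeta)) / (n * alpha1),
    lambda2.max = max(abs(dbeta)) / (n * alpha2))
}
\end{verbatim}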
Consider an EM algorithm to optimize (\ref{eqn:pll}). Let $z_i=1$ if $y_i$ is from the zero state and $z_i=0$ if $y_i$ is from the negative binomial state. Since $z=(z_1, \ldots, z_n)^\intercal$ is not observable, it is treated as missing data, and the EM algorithm is particularly attractive for missing data problems. If the complete data $(y, z)$ were available, the complete data log-likelihood function would be given by
\begin{equation}\label{eqn:zinblik}
\ell^c_{ZINB}(\Phi; y, z)=\sum_{i=1}^n \left\{z_iG_i^\intercal\zeta-\log(1+\exp(G_i^\intercal\zeta))+(1-z_i)\log f(y_i;\beta, \theta)\right\},
\end{equation}
and the complete data penalized negative log-likelihood function is given by
\begin{equation*}
p\ell^c_{ZINB}(\Phi; y, z)=-\ell^c_{ZINB}(\Phi; y, z)+p(\zeta, \beta),
\end{equation*}
where $f(y_i;\beta, \theta)= \frac{\Gamma(\theta+y_i)}{\Gamma(y_i+1)\Gamma(\theta)}\left(\frac{\mu_i}{\mu_i+\theta}\right)^{y_i}\left(\frac{\theta}{\mu_i+\theta}\right)^\theta$ and $\mu_i=\exp(B_i^\intercal\beta)$. Taking derivatives of (\ref{eqn:zinblik}), we obtain
\begin{equation}\label{eqn:deri4}
\begin{aligned}
\frac{\partial\ell_{ZINB}^c(\Phi;y,z)}{\partial \zeta_j}&=\sum_{i=1}^n\left\{z_iG_{ij}-\frac{\exp(G_i^\intercal\zeta)G_{ij}}{1+\exp(G_i^\intercal\zeta)}\right\},\\
\frac{\partial\ell_{ZINB}^c(\Phi;y,z)}{\partial \beta_k}&=\sum_{i=1}^n (1-z_i) \frac{\partial \mu_i}{\partial \beta_k} \left(\frac{y_i}{\mu_i}-\frac{y_i+\theta}{\mu_i+\theta}\right).
\end{aligned}
\end{equation}
The KKT conditions of a minimizer $\hat\Phi$ of $p\ell_{ZINB}^c(\Phi;y,z)$ can be derived as in (\ref{eqn:kktpc}). Therefore, if $\hat\zeta_j=0$ and $\hat\beta_k=0$, it must hold that
\begin{equation}\label{eqn:lmaxnb1}
\left|\frac{\partial\ell_{ZINB}^c(\hat\Phi; y, z)}{\partial\zeta_j}\right| \leq n\alpha_1\lambda_1, \qquad
\left|\frac{\partial\ell_{ZINB}^c(\hat\Phi; y, z)}{\partial\beta_k}\right| \leq n\alpha_2\lambda_2.
\end{equation}
The conditional expectation of $z_i$ at iteration $m$ is provided by
\begin{equation}\label{eqn:znb}
\begin{aligned}
z_i^{(m)}&=\begin{cases}
\left(1+\exp(-G_i^\intercal\zeta^{(m)})\left[\frac{\theta}{\exp(B_i^\intercal\beta^{(m)})+\theta}\right]^\theta\right)^{-1}, &\textrm{ if } y_i=0,\\
0, &\textrm{ if } y_i > 0.
\end{cases}
\end{aligned}
\end{equation}
Let $\zeta^{(m)}=\zeta$ and $\beta^{(m)}=\beta$; then (\ref{eqn:znb}) becomes
\begin{equation}\label{eqn:znb1}
\begin{aligned}
z_i&=\begin{cases}
\left(1+\exp(-G_i^\intercal\zeta)\left[\frac{\theta}{\exp(B_i^\intercal\beta)+\theta}\right]^\theta\right)^{-1}, &\textrm{ if } y_i=0,\\
0, &\textrm{ if } y_i > 0.
\end{cases}
\end{aligned}
\end{equation}
It is simple to show that the right-hand side of (\ref{eqn:deri4}) is the same as that of (\ref{eqn:deri3}) once (\ref{eqn:znb1}) is plugged into (\ref{eqn:deri4}). Hence, the conditions (\ref{eqn:lmaxnb1}) are the same as (\ref{eqn:lmaxnb}) once (\ref{eqn:znb1}) is plugged into (\ref{eqn:deri4}). These connections offer a different method to derive $(\hat\lambda_{1, \max}, \hat\lambda_{2, \max})$ such that $\hat\zeta_j=0$ and $\hat\beta_k=0$ hold \citep{wang2014zinb}. We first estimate $\zeta_0$, $\beta_0$ and $\theta$ from an intercept-only ZINB model; then (\ref{eqn:znb1}) becomes
\begin{equation}\label{eqn:znb2}
\begin{aligned}
z_i&=\begin{cases}
\left(1+\exp(-\zeta_0)\left[\frac{\theta}{\exp(\beta_0)+\theta}\right]^\theta\right)^{-1}, &\textrm{ if } y_i=0,\\
0, &\textrm{ if } y_i > 0.
\end{cases}
\end{aligned}
\end{equation}
Plugging in (\ref{eqn:znb2}), $(\hat\lambda_{1,\max}, \hat\lambda_{2, \max})$ are computed based on (\ref{eqn:lmaxnb1}). Furthermore, we have shown that $(\lambda_{1,\max}, \lambda_{2, \max})=(\hat\lambda_{1,\max}, \hat\lambda_{2, \max})$ holds.
\bibliographystyle{plainnat}
\bibliography{mpath}
\end{document}