\documentclass[article,shortnames,nojss]{jss}
\usepackage{thumbpdf}
%% need no \usepackage{Sweave.sty}

%% Packages.
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{bm}
\usepackage{xspace}

%\VignetteIndexEntry{The VGAM Package for Capture--Recapture Data Using the Conditional Likelihood}
%\VignetteDepends{VGAM}
%\VignetteKeywords{closed population size estimation, conditional likelihood,mark--capture--recapture, vector generalized additive model, VGAM}
%\VignettePackage{VGAM}

%% new commands
%% Shortcut commands.
\newcommand{\logit}{\mbox{\rm logit}}
\newcommand{\bone}{{\bf 1}}
\newcommand{\bzero}{{\bf 0}}
\newcommand{\bid}{\mbox{$\bm{\mathcal{D}}$}}
\newcommand{\bib}{\mbox{$\bm{b}$}}
\newcommand{\bif}{\mbox{$\bm{f}$}}
\newcommand{\bix}{\mbox{$\bm{x}$}}
\newcommand{\biy}{\mbox{$\bm{y}$}}
\newcommand{\biz}{\mbox{$\bm{z}$}}
\newcommand{\bB}{\mbox{\rm \bf B}}
\newcommand{\bX}{\mbox{\rm \bf X}}
\newcommand{\bH}{\mbox{\rm \bf H}}
\newcommand{\bI}{\mbox{\rm \bf I}}
\newcommand{\bOO}{\mbox{\rm \bf O}}
\newcommand{\bW}{\mbox{\rm \bf W}}
\newcommand{\bY}{\mbox{\rm \bf Y}}
\newcommand{\bbeta}{\mbox{$\bm{\beta}$}}
\newcommand{\boldeta}{\mbox{$\bm{\eta}$}}
\newcommand{\btheta}{\mbox{$\bm{\theta}$}}
\newcommand{\calM}{\mbox{$\mathcal{M}$}}
\newcommand{\mytilde}{\mbox{\lower.80ex\hbox{\char`\~}\xspace}}

\author{Thomas W. Yee\\The University of Auckland \And
        Jakub Stoklosa\\The University of New South Wales \AND
        Richard M. Huggins\\The University of Melbourne}
\title{The \pkg{VGAM} Package for Capture--Recapture Data Using the Conditional Likelihood}

%% for pretty printing and a nice hypersummary also set:
\Plainauthor{Thomas W. Yee, Jakub Stoklosa, Richard M. Huggins} %% comma-separated
\Plaintitle{The VGAM Package for Capture--Recapture Data Using the Conditional Likelihood} %% without formatting
\Shorttitle{The VGAM Package for Capture--Recapture Data} %% a short title (if necessary)

%% an abstract and keywords
\Abstract{
  It is well known that using individual covariate information (such as body weight or gender) to model heterogeneity in capture--recapture (CR) experiments can greatly enhance inferences on the size of a closed population. Since individual covariates are only observable for captured individuals, complex conditional likelihood methods are usually required and these do not constitute a standard generalized linear model (GLM) family. Modern statistical techniques such as generalized additive models (GAMs), which allow a relaxing of the linearity assumptions on the covariates, are readily available for many standard GLM families. Fortunately, a natural statistical framework for maximizing conditional likelihoods is available in the Vector GLM and Vector GAM classes of models. We present several new \proglang{R}-functions (implemented within the \pkg{VGAM} package) specifically developed to allow the incorporation of individual covariates in the analysis of closed population CR data using a GLM/GAM-like approach and the conditional likelihood. As a result, a wide variety of practical tools are now readily available in the \pkg{VGAM} object oriented framework. We discuss and demonstrate their advantages, features and flexibility using the new \pkg{VGAM} CR functions on several examples.
}
\Keywords{closed population size estimation, conditional likelihood, mark--capture--recapture, vector generalized additive model, \pkg{VGAM}}
\Plainkeywords{closed population, conditional likelihood, mark--capture--recapture, vector generalized additive model, VGAM R package}

\Address{
  Thomas W.
Yee \\ Department of Statistics \\ University of Auckland, Private Bag 92019 \\ Auckland Mail Centre \\ Auckland 1142, New Zealand \\ E-mail: \email{t.yee@auckland.ac.nz}\\ URL: \url{http://www.stat.auckland.ac.nz/~yee/}
}

\begin{document}

<<echo = FALSE>>=
library("VGAM")
library("VGAMdata")
ps.options(pointsize = 12)
options(width = 72, digits = 4)
options(SweaveHooks = list(fig = function() par(las = 1)))
options(prompt = "R> ", continue = "+")
@

%*********************************************************************
\section[Introduction]{Introduction}
%% Note: If there is markup in \(sub)section, then it has to be escaped as above.
\label{sec:intro}

Note: this vignette is essentially \cite{yee:stok:hugg:2015}.

\bigskip

Capture--recapture (CR) surveys are widely used in ecology and epidemiology to estimate population sizes. In essence, they are sampling schemes that allow the estimation of both $n$ and $p$ in a Binomial($n$, $p$) experiment \citep{hugg:hwan:2011}. The simplest CR sampling design consists of units or individuals in some population that are captured or tagged across several sampling occasions, e.g., trapping a nocturnal mammal species on seven consecutive nights. In these experiments, when an individual is captured for the first time it is marked or tagged so that it can be identified upon subsequent recapture. On each occasion, recaptures of individuals which have been previously marked are also noted. Thus each observed individual has a capture history: a vector of 1s and 0s denoting capture/recapture and noncapture respectively. The unknown population size is then estimated using the observed capture histories and any other additional information collected on captured individuals, such as weight or sex, along with environmental information such as rainfall or temperature.

We consider closed populations, where there are no births, deaths, emigration or immigration throughout the sampling period \citep{amst:mcdo:manl:2005}. Such an assumption is often reasonable when the overall time period is relatively short. \citet{otis:etal:1978} provided eight specific closed population CR models \citep[see also][]{pollock:1991}, which permit the individual capture probabilities to depend on time and behavioural response, and to be heterogeneous between individuals.

The use of covariate information (or explanatory variables) to explain heterogeneous capture probabilities in CR experiments has received considerable attention over the last 30 years \citep{pollock:2002}. Population size estimates that ignore this heterogeneity are typically biased \citep{amst:mcdo:manl:2005}. A recent book on CR experiments as a whole is \cite{mccr:morg:2014}. Since individual covariate information (such as gender or body weight) can only be collected on observed individuals, conditional likelihood models are employed \citep{pollock:1984,hugg:1989,alho:1990,lebreton:1992}. That is, one conditions on the individuals seen at least once throughout the experiment, which allows individual covariates to be included in the analysis. The capture probabilities are typically modelled as logistic functions of the covariates, and parameters are estimated using maximum likelihood. Importantly, these CR models are generalized linear models \citep[GLMs;][]{mccull:1989,hugg:hwan:2011}. Here, we maximize the conditional likelihood (or, more formally, the positive-Bernoulli distribution) models of \citet{hugg:1989}.
This approach has become standard practice for carrying out inference when individual covariates are considered, and several software packages currently use this methodology, including \proglang{MARK} \citep{cooch:white:2012}, \proglang{CARE-2} \citep{hwang:chao:2003}, and the \proglang{R} \citep{R:2014} packages \pkg{mra} \citep{mcdonald:2010}, \pkg{RMark} \citep{laake:2013} and \pkg{Rcapture} \citep{rcapturepackage:2012,Baillargeon:Rivest:2007}; the latter package uses a log-linear approach, which can be shown to be equivalent to the conditional likelihood \citep{cormak:1989,hugg:hwan:2011}. These programs are quite user friendly and, in particular, allow capture probabilities to be modelled as linear functions of the covariates. An obvious question, then, is why develop yet another implementation for closed population CR modelling?

Firstly, nonlinearity arises quite naturally in many ecological applications \citep{schluter1988,yee:mitc:1991,craw:1993,gimenez:2006,bolk:2008}. In the CR context, capture probabilities may depend nonlinearly on individual covariates, e.g., mountain pygmy possums with lighter or heavier body weights may have lower capture probabilities compared with those having mid-ranged body weights \citep[e.g.,][]{hugg:hwan:2007,stok:hugg:2012}. However, in our experience, the vast majority of CR software does not handle nonlinearity well, with regard to both estimation and the plotting of the smooth functions. Since GAMs \citep{hastie:1990,wood:2006} were developed in the mid-1980s they have become a standard tool for data analysis in regression. The nonlinear relationship between the response and covariate is flexibly modelled, and few assumptions are made on the functional relationship. The drawback in applying these models to CR data has been the difficult programming required to implement the approach.

Secondly, we have found several implementations of the conditional likelihood approach slow, and in some instances unreliable and difficult to use. We believe our implementation has superior capabilities, good speed and reliability. The results of Section \ref{sec:poz:posbernoulli.eg.timingtests} contrast our software with some others. Moreover, the incorporation of these methods in a general, maintained statistical package will result in them being updated as the package is updated.

Standard GLM and GAM methodologies are unable to cope with the CR models considered in this article because they are largely restricted to one linear/additive predictor $\eta$. Fortunately, however, a natural extension in the form of the vector generalized linear and additive model (VGLM/VGAM) classes does allow for multiple $\eta$s. VGLMs and VGAMs are described in \citet{yee:wild:1996} and \citet{yee:hast:2003}. Their implementation in the \pkg{VGAM} package \citep{yee:2008,yee:2010,yee:VGAM:2013-093} has become increasingly popular and practical over the last few years, due to the large number of exponential families available for discrete/multinomial response data.
In addition to the flexible modelling offered by both VGLMs and VGAMs, a wide range of useful features is also available:
\begin{itemize}

\item smoothing capabilities;

\item model selection using, e.g., AIC or BIC \citep{burnham:anderson:1999};

\item regression diagnostics and goodness--of--fit tools;

\item reduced-rank regression \citep{yee:hast:2003} for dimension reduction;

\item computational speed and robustness;

\item choice of link functions;

\item offsets and prior weights; and

\item (specifically) when using \proglang{R}: generic functions based on object oriented programming, e.g., \code{fitted()}, \code{coef()}, \code{vcov()}, \code{summary()}, \code{predict()}, \code{AIC()}, etc.

\end{itemize}

Our goal is to provide users with an easy-to-use object-oriented \pkg{VGAM} structure, where four \code{family}-type functions based on the conditional likelihood are available to fit the eight models of \citet{otis:etal:1978}. We aim to give the user additional tools and features, such as those listed above, to carry out a more informative and broader analysis of CR data, particularly when considering more than one covariate. Finally, this article primarily focuses on the technical aspects of the proposed package, and less so on the biological interpretation for CR experiments. The latter will be presented elsewhere.

An outline of this article is as follows. In Section \ref{sec:cr} we present the conditional likelihood for CR models and a description of the eight \citet{otis:etal:1978} models. Section \ref{sec:vgam} summarizes pertinent details of VGLMs and VGAMs. Their connection to the CR models is made in Section \ref{sec:meth}. Software details are given in Section \ref{sec:software}, and examples on real and simulated data using the new software are demonstrated in Section \ref{sec:body:exam}. Some final remarks are given in Section \ref{sec:discussion}. The two appendices give some technical details relating to the first and second derivatives of the conditional log-likelihood, and the means.

\begin{table}[tt]
\centering
\begin{tabular}{cl}
\hline
\ \ \ Symbol \ \ \ & Explanation \\
\hline
% --------------------------------------
$N$ & (Closed) population size to be estimated \\
% --------------------------------------
$n$ & Total number of distinct individuals caught in the trapping experiment \\
% --------------------------------------
$\tau$ & Number of sampling occasions, where $\tau \geq 2$ \\
% --------------------------------------
$\biy_i$ & Vector of capture histories for individual $i$ $(i=1,\ldots,n)$ with observed values\\
 & 1 (captured) and 0 (noncaptured).
Each $\biy_i$ has at least one observed 1 \\
% --------------------------------------
``$h$'' & Model $\calM$ subscript, for heterogeneity \\
% --------------------------------------
``$b$'' & Model $\calM$ subscript, for behavioural effects \\
% --------------------------------------
``$t$'' & Model $\calM$ subscript, for temporal effects \\
% --------------------------------------
$p_{ij}$ & Probability that individual $i$ is captured at sampling occasion $j$ $(j=1,\ldots,\tau)$ \\
% --------------------------------------
$z_{ij}$ & $= 1$ if individual $i$ has been captured before occasion $j$, else $= 0$ \\
% --------------------------------------
$\btheta^{}$ & Vector of regression coefficients to be estimated related to $p_{ij}$ \\
% --------------------------------------
$\boldeta$ & Vector of linear predictors (see Table \ref{tab2} for further details) \\
% --------------------------------------
$g$ & Link function applied to, e.g., $p_{ij}$. Logit by default \\
% --------------------------------------
\hline
\end{tabular}
\caption{
Short summary of the notation used for the positive-Bernoulli distribution for capture--recapture (CR) experiments. Additional details are in the text.
\label{tab0}
}
\end{table}

%*********************************************************************
\section[Capture--recapture models]{Capture--recapture models}
\label{sec:cr}

In this section we give an outline of closed population CR models under the conditional likelihood/GLM approach. For further details we recommend \citet{hugg:1991} and \citet{hugg:hwan:2011}. The notation of Table \ref{tab0} is used throughout this article.

% ---------------------------------------------------------------
\subsection{Conditional likelihood}
\label{sec:condlik}

Suppose we have a closed population of $N$ individuals, labelled $i=1,\ldots,N$, and $\tau$ capture occasions labelled $j=1,\ldots,\tau$. We make the usual assumptions that individuals in the population behave independently of each other, individuals do not lose their tags, and tags are recorded correctly. Let $y_{ij}=1$ if the $i$th individual was caught on the $j$th occasion and $y_{ij}=0$ otherwise, and let $n$ be the number of distinct individuals captured. Let $p_{ij}$ denote the probability of capturing individual $i$ on occasion $j$. As noted in Section \ref{sec:intro}, \citet{otis:etal:1978} describe eight models for the capture probabilities; see Section \ref{sec:8models} for further details.

Label the individuals captured in the experiment by $i=1,\ldots,n$ and those never captured by $i=n+1,\ldots,N$. The full likelihood is given by
\begin{eqnarray}
L_{f} & = & K \prod_{i=1}^{N}\prod_{j=1}^{\tau} p_{ij}^{y_{ij}} (1-p_{ij})^{1- y_{ij}} \nonumber \\
& = & K \left\{\prod_{i=1}^{n}\prod_{j=1}^{\tau}p_{ij}^{y_{ij}} (1-p_{ij})^{1 - y_{ij}}\right\}\cdot \left\{\prod_{i=n+1}^{N} \prod_{j=1}^{\tau} (1-p_{ij})\right\}
\label{eq:posbern.uncondlikelihood}
\end{eqnarray}
where $K$ is independent of the $p_{ij}$ but may depend on $N$. The RHS of (\ref{eq:posbern.uncondlikelihood}) requires knowledge of the uncaptured individuals and in general cannot be computed. Consequently, no MLE of $N$ will be available unless some homogeneity assumption is made about the noncaptured individuals.
Instead, a conditional likelihood function based only on the $n$ individuals observed at least once,
\begin{eqnarray}
\label{eq:posbern.condlikelihood}
L_{c} & \propto & \prod_{i=1}^{n} \frac{\prod_{j=1}^{\tau} p_{ij}^{y_{ij}} (1-p_{ij})^{1 - y_{ij}}} {1-\prod_{s=1}^{\tau}(1-p_{is}^{\dagger})},
\end{eqnarray}
is used. Here the $p_{is}^{\dagger}$ are the $p_{is}$ computed as if the individual had not been captured prior to occasion $s$, so that the denominator is the probability that individual $i$ is captured at least once. This conditional likelihood (\ref{eq:posbern.condlikelihood}) is a modified version of the likelihood corresponding to a positive-Bernoulli distribution \citep{patil:1962}.

\renewcommand{\arraystretch}{1.2}
\begin{table}[tt]
\begin{center}
\begin{tabular}{|c||c|c|c|c|}
\hline
Capture & \multicolumn{4}{c|}{Joint probability}\\
\cline{2-5}
history & \multicolumn{1}{c|}{$\calM_0$/$\calM_h$} & \multicolumn{1}{c|}{$\calM_b$/$\calM_{bh}$} & \multicolumn{1}{c|}{$\calM_t$/$\calM_{th}$} & \multicolumn{1}{c|}{$\calM_{tb}$/$\calM_{tbh}$} \\
\hline
01 & $(1-p) p$ & $(1-p_{c}) \, p_{c}$ & $(1-p_1) p_2$ & $(1-p_{c1}) \, p_{c2}$ \\
10 & $p(1-p)$ & $p_{c} (1-p_{r})$ & $p_1 (1-p_2)$ & $p_{c1} (1-p_{r2})$ \\
11 & $p^2$ & $p_{c} \, p_{r}$ & $p_1 \, p_2$ & $p_{c1} \, p_{r2}$ \\
\hline
00 & $(1-p)^2$ & $(1-p_{c})^2$ & $(1-p_1)(1-p_2)$ & $(1-p_{c1})(1-p_{c2})$ \\
\hline
\hline
$ M \equiv \dim(\boldeta)$ & 1 & 2 & 2 $(=\tau)$ & 3 $(=2 \tau - 1)$ \\
\hline
\end{tabular}
\end{center}
\caption{Capture history sample space and corresponding probabilities for the eight models of \citet{otis:etal:1978}, with $\tau=2$ capture occasions in a closed population CR experiment. Here, $p_{cj}=$ capture probability for unmarked individuals at sampling period $j$, $p_{rj}=$ recapture probability for marked individuals at sampling period $j$, and $p=$ constant capture probability across $\tau=2$. Note that the ``00'' row is never realized in sample data.}
\label{tab1}
\end{table}
\renewcommand{\arraystretch}{1.0}

% ---------------------------------------------------------------
\subsection{The eight models}
\label{sec:8models}

Models which allow capture probabilities to depend on one or a combination of time, heterogeneity or behavioural effects are defined using appropriate subscripts, e.g., $\calM_{th}$ depends on time and heterogeneity. These eight models have a nested structure of which $\calM_{tbh}$ is the most general. The homogeneous model $\calM_0$ is the simplest (but most unrealistic) and has equal capture probabilities for each individual, $H_0: p_{ij}=p$, regardless of the sampling occasion. All eight models are GLMs, since the conditional likelihood (\ref{eq:posbern.condlikelihood}) belongs to the exponential family \citep{hugg:hwan:2011}.

To illustrate the approach, we use the following toy example throughout: consider a CR experiment with two occasions---morning and evening (i.e., $\tau=2$), with capture probabilities varying between the two occasions. Furthermore, suppose we have collected some individual covariates---weight and gender. The joint probabilities of all the eight models are listed in Table \ref{tab1}. It can be seen that all but the positive-Binomial model ($\calM_{0}/\calM_{h}$) require more than one probability and hence more than one linear predictor, so that the original \cite{neld:wedd:1972} GLM framework is inadequate.
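To make the form of (\ref{eq:posbern.condlikelihood}) concrete, the following minimal sketch evaluates this conditional log-likelihood for the simplest model $\calM_0$ in plain \proglang{R}. It is not \pkg{VGAM} code, and the objects \code{Y} and \code{theta} are hypothetical: \code{Y} is an $n\times\tau$ matrix of capture histories and \code{theta} is the single capture probability on the logit scale.

<<eval = FALSE>>=
## Minimal sketch (model M_0 only; not VGAM code): `Y` is an n x tau matrix of
## 0s and 1s in which every row contains at least one 1, and `theta` is the
## logit of the common capture probability p.
condloglik.M0 <- function(theta, Y) {
  p   <- plogis(theta)                   # constant capture probability
  tau <- ncol(Y)
  num <- sum(Y) * log(p) + sum(1 - Y) * log(1 - p)   # Bernoulli terms
  den <- nrow(Y) * log(1 - (1 - p)^tau)  # each individual is seen at least once
  num - den                              # conditional log-likelihood
}
## e.g., maximize over `theta` with
## optimize(condloglik.M0, c(-10, 10), Y = Y, maximum = TRUE)
@

For the other models of Table \ref{tab1} the numerator and denominator involve several distinct probabilities $p_{ij}$ and $p_{is}^{\dagger}$, which is precisely where the multiple linear predictors of Section \ref{sec:vgam} are needed.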
Further, there are two noteworthy points from Table \ref{tab1} which apply for any $\tau\ge 2$:
\begin{itemize}

\item firstly, for $\calM_{t}$-type models, as $\tau$ increases so will the number of linear predictors and hence the potential number of parameters;

\item secondly, it is evident that there are four main categories consisting of non-heterogeneity models ($\calM_{0}$, $\calM_{b}$, $\calM_{t}$ and $\calM_{tb}$), which are paired with a heterogeneity sub-model (respectively $\calM_{h}$, $\calM_{bh}$, $\calM_{th}$ and $\calM_{tbh}$).

\end{itemize}
The four heterogeneity models allow each individual to have their own probability of capture/recapture. In our toy example, the capture probabilities are dependent on an individual's weight and gender. We discuss these models further in Section \ref{sec:vgam.basics}.

It is natural to consider individual covariates such as weight and gender as linear/additive predictors. Let $x_{i}$ denote a covariate (either continuous or discrete) for the $i$th individual, which is constant across the capture occasions $j=1,\ldots,\tau$, e.g., for continuous covariates one could use the first observed value or the mean across all $j$. If there are $d-1$ covariates, we write $\bix_i=(x_{i1},\ldots,x_{id})^{\top}$ with $x_{i1}=1$ if there is an intercept. Also, let $g^{-1}(\eta)={\exp(\eta)}/\{{1+\exp(\eta)}\}$ be the inverse \logit{} function. For model $\mathcal{M}_{tbh}$, the capture/recapture probabilities are given by [notation follows Section \ref{sec:VGAMs.basics}]
\begin{eqnarray*}
p_{ij}^{\dagger} & = & g^{-1} \! \left(\qquad \quad \, \beta^*_{(j+1)1} + \bix_{i[-1]}^{\top} \, \bbeta_{1[-1]}^{} \right), \qquad j=1,\ldots,\tau, \\
p_{ij} & = & g^{-1} \!\left(\beta^*_{(1)1} + \beta^*_{(j+1)1} + \bix_{i[-1]}^{\top} \,\bbeta_{1[-1]}^{} \right),\qquad j=2,\ldots,\tau,
\end{eqnarray*}
where $\beta^*_{(1)1}$ is the behavioural effect of prior capture, $\beta^*_{(j+1)1}$ for $j=1,\ldots,\tau$ are time effects, and $\bbeta_{1[-1]}$ are the remaining regression parameters associated with the covariates. Computationally, the conditional likelihood (\ref{eq:posbern.condlikelihood}) is maximized with respect to all the parameters (denoted by $\btheta$) by the Fisher scoring algorithm using the derivatives given in Appendix A.

% ---------------------------------------------------------------
\subsection[Estimation of N]{Estimation of $N$}
\label{sec:Nhat}

In the above linear models, to estimate $N$ let $\pi_{i}(\btheta)=1-\prod_{s=1}^{\tau}(1-p_{is}^{\dagger})$ be the probability that individual $i$ is captured at least once in the course of the study. Then, if $\btheta$ is known, the Horvitz--Thompson \citep[HT;][]{horv:thom:1952} estimator
\begin{eqnarray}
\label{eq:HT}
\widehat{N}(\btheta) &=& \sum_{i=1}^{n} \; {\pi}_{i}(\btheta)^{-1}
\end{eqnarray}
is unbiased for the population size $N$, and an associated estimate of the variance of $\widehat{N}(\btheta)$ is $s^2(\btheta) = \sum_{i=1}^{n} \; {\pi}_{i}(\btheta)^{-2} \, \left[1-{\pi}_{i}(\btheta)\right]$.
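Before describing the adjustment needed when $\btheta$ must itself be estimated, we note that (\ref{eq:HT}) and $s^2(\btheta)$ are straightforward to compute once the $p_{is}^{\dagger}$ are available. The following minimal sketch does this in plain \proglang{R}; it is not \pkg{VGAM} code, and the matrix \code{pdagger} is hypothetical.

<<eval = FALSE>>=
## Minimal sketch (not VGAM code): `pdagger` is an n x tau matrix whose (i, s)
## element is the fitted p_{is}^dagger for captured individual i, occasion s.
HT.estimate <- function(pdagger) {
  pii  <- 1 - apply(1 - pdagger, 1, prod)  # pi_i(theta): caught at least once
  Nhat <- sum(1 / pii)                     # Horvitz--Thompson estimator of N
  s2   <- sum((1 - pii) / pii^2)           # variance estimate for known theta
  c(Nhat = Nhat, s2 = s2)
}
@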
If $\btheta$ is estimated by $\widehat{\btheta}$ then one can use
\begin{eqnarray}
\label{eq:est.varNhat2}
\VAR\left(\widehat{N}(\widehat{\btheta}) \right) & \approx & s^2(\widehat{\btheta}) + \widehat{\bid}^{\top} \widehat{\VAR}(\widehat{\btheta}) \,\widehat{\bid}
\end{eqnarray}
where, following from a Taylor series expansion of $\widehat{N}(\widehat{\btheta})$ about $\widehat{N}(\btheta)$,
\begin{eqnarray*}
\bid\, = \, \frac{d \widehat{N}(\btheta)}{d \btheta} & = & - \sum_{i=1}^n \; {\pi}_{i}(\btheta)^{-2} \; \, \frac{d {\pi}_{i}(\btheta)}{d \btheta} \\
& = & \sum_{i=1}^n \; \frac{-1}{{\pi}_{i}(\btheta)^{2}} \; \sum_{s=1}^{\tau} \; \left[\prod_{t=1,\ t \neq s}^{\tau} \left( 1 - p_{it}^{\dagger}\right)\right] \frac{\partial p_{is}^{\dagger}}{\partial \btheta}.
\end{eqnarray*}

%*********************************************************************
\section[Vector generalized linear and additive models]{Vector generalized linear and additive models}
\label{sec:vgam}

To extend the above linear models, we use VGLMs and VGAMs, which we briefly describe in this section. These models fit within a large statistical regression framework which will be described in \citet{yee:2015}. The details here are purposely terse; readers are directed to \citet{yee:2008,yee:2010} for accessible overviews and examples, and to \citet{yee:wild:1996} and \citet{yee:hast:2003} for technical details.

% ---------------------------------------------------------------
\subsection[Basics]{Basics}
\label{sec:vgam.basics}

Consider observations on independent pairs $(\bix_i,\biy_i)$, $i=1,\ldots,n$. We use ``$[-1]$'' to delete the first element, e.g., $\bix_{i[-1]} =(x_{i2},\ldots,x_{id})^{\top}$. For simplicity, we will occasionally drop the subscript $i$ and simply write $\bix =(x_{1},\ldots,x_{d})^{\top}$. Consider a single observation where \biy{} is a $Q$-dimensional vector. For the CR models of this paper, $Q=\tau$ when the response is entered as a matrix of 0s and 1s. The only exception is the $\calM_0$/$\calM_h$ models, where aggregated counts may be inputted; see Section \ref{sec:M0Mh}.

VGLMs are defined through the model for the conditional density
\[
f(\biy | \bix ; \bB) = f(\biy,\eta_1,\ldots,\eta_M)
\]
for some known function $f(\cdot)$, where $\bB =(\bbeta_1 \,\bbeta_2 \,\cdots \,\bbeta_M)$ is a $d\times M$ matrix of regression coefficients to be estimated. We may also write $\bB^{\top} = (\bbeta_{(1)} \,\bbeta_{(2)}\,\cdots\, \bbeta_{(d)})$ so that $\bbeta_j$ is the $j$th column of $\bB$ and $\bbeta_{(k)}$ is the $k$th row. The $j$th linear predictor is then
\begin{equation}
\eta_j = \bbeta_j^{\top} \bix = \sum_{k=1}^d \beta_{(j)k} \, x_{k}, \qquad j=1, \ldots, M,
\label{gammod2}
\end{equation}
where $\beta_{(j)k}$ is the $k$th component of $\bbeta_j$. In the CR context, we remind the reader that, as in Table \ref{tab1}, we have $M=2$ for $\calM_{bh}$, $M=\tau$ for $\calM_{th}$ and $M=2\tau-1$ for $\calM_{tbh}$.

In GLMs the linear predictors are used to model the means. The $\eta_j$ of VGLMs model the parameters of a model. In general, for a parameter $\theta_j$ we take
\[
\eta_j = g_j(\theta_j), \qquad j=1,\ldots,M,
\]
and we say $g_j$ is a parameter link function. Write
\begin{equation}
\boldeta_i =
\left(\begin{array}{c}\eta_1(\bix_{i})\\ \vdots \\ \eta_M(\bix_{i})\end{array}\right) =
\bB^{\top} \bix_{i} =
\left(\begin{array}{c}\bbeta_1^{\top} \bix_{i} \\ \vdots \\ \bbeta_M^{\top} \bix_{i}\end{array} \right).
\label{eq:lin.pred}
\end{equation}

In practice we may wish to constrain the effect of a covariate to be the same for some of the $\eta_j$ and to have no effect for others. In our toy example, model $\calM_{th}$ with $\tau=M=2$ and $d=3$, we have
\begin{eqnarray*}
\eta_1(\bix_i) & = & \beta_{(1)1} + \beta_{(1)2} \, x_{i2} + \beta_{(1)3} \, x_{i3}, \\
\eta_2(\bix_i) & = & \beta_{(2)1} + \beta_{(2)2} \, x_{i2} + \beta_{(2)3} \, x_{i3},
\end{eqnarray*}
where $x_{i2}$ is the individual's weight and $x_{i3}$ is an indicator of gender, say. Constraining the effect of each covariate to be the same on both occasions gives the constraints $\beta_{(1)2}\equiv\beta_{(2)2}$ and $\beta_{(1)3}\equiv\beta_{(2)3}$. Then, with ``${}^*$'' denoting the parameters that are estimated,
\begin{eqnarray*}
\eta_1(\bix_i) & = & \beta^*_{(1)1} + \beta^*_{(1)2} \, x_{i2} + \beta^*_{(1)3} \, x_{i3}, \\
\eta_2(\bix_i) & = & \beta^*_{(2)1} + \beta^*_{(1)2} \, x_{i2} + \beta^*_{(1)3} \, x_{i3},
\end{eqnarray*}
and we may write
\begin{eqnarray*}
\boldeta(\bix_i) = \begin{pmatrix}\eta_1(\bix_i)\\ \eta_2(\bix_i)\end{pmatrix} & = & \sum_{k=1}^3 \, \bbeta_{(k)} \, x_{ik}\\
& = & \begin{pmatrix}\beta_{(1)1} & \beta_{(1)2} & \beta_{(1)3}\\ \beta_{(2)1} & \beta_{(2)2} & \beta_{(2)3} \end{pmatrix} \begin{pmatrix}x_{i1}\\ x_{i2}\\ x_{i3} \end{pmatrix}\\
& = & \begin{pmatrix} \beta^*_{(1)1} & \beta^*_{(1)2} & \beta^*_{(1)3}\\ \beta^*_{(2)1} & \beta^*_{(1)2} & \beta^*_{(1)3} \end{pmatrix} \begin{pmatrix}x_{i1}\\ x_{i2}\\ x_{i3}\end{pmatrix}\\
& = & \begin{pmatrix}1 & 0\\ 0 & 1\end{pmatrix} \begin{pmatrix} \beta^*_{(1)1}\\ \beta^*_{(2)1} \end{pmatrix} x_{i1}+ \begin{pmatrix}1\\1\end{pmatrix} \beta^*_{(1)2} \, x_{i2}+ \begin{pmatrix} 1\\ 1\end{pmatrix} \beta^*_{(1)3} \, x_{i3}\\
& = & \sum_{k=1}^3 \, \bH_k \, \bbeta^*_{(k)} \, x_{ik}.
\end{eqnarray*}
We can also write this as (noting that $x_{i1}=1$)
\begin{eqnarray*}
\boldeta(\bix_i) & = & \begin{pmatrix}x_{i1} & 0 \\ 0 & x_{i1} \end{pmatrix} \begin{pmatrix} 1 & 0 \\ 0 & 1 \end{pmatrix} \begin{pmatrix} \beta^*_{(1)1}\\ \beta^*_{(2)1} \end{pmatrix} + \begin{pmatrix} x_{i2} & 0 \\ 0 & x_{i2} \end{pmatrix} \begin{pmatrix} 1 \\ 1\end{pmatrix} \beta^*_{(1)2} + \begin{pmatrix} x_{i3} & 0 \\ 0 & x_{i3} \end{pmatrix} \begin{pmatrix} 1 \\ 1 \end{pmatrix} \beta^*_{(1)3}\\
& = & \sum_{k=1}^3 \, \mathrm{diag}(x_{ik},x_{ik}) \, \bH_k \,\bbeta_{(k)}^{*}.
\end{eqnarray*}

In general, for VGLMs, we represent the models as
\begin{eqnarray}
\boldeta(\bix_i) & = & \sum_{k=1}^d \, \bbeta_{(k)} \, x_{ik} \nonumber \\
& = & \sum_{k=1}^d \, \bH_k \, \bbeta_{(k)}^{*} \, x_{ik} \label{eq:constraints.VGLM}\\
& = & \sum_{k=1}^d \, \mathrm{diag}(x_{ik},\ldots,x_{ik}) \, \bH_k \, \bbeta_{(k)}^{*} \nonumber
\end{eqnarray}
where $\bH_1,\bH_2,\ldots,\bH_d$ are known constraint matrices of full column-rank (i.e., rank \code{ncol}($\bH_k$)), and $\bbeta_{(k)}^*$ is a vector containing a possibly reduced set of regression coefficients. Then we may write
\begin{equation}
\label{eq:lin.coefs4}
{\bB}^{\top} = \left( \bH_1 \bbeta_{(1)}^* \; \; \; \bH_2 \bbeta_{(2)}^* \;\;\; \cdots \;\;\; \bH_d \bbeta_{(d)}^* \right)
\end{equation}
as an expression of (\ref{eq:lin.pred}) concentrating on columns rather than rows. Note that with no constraints at all, all $\bH_k = \bI_M$ and $\bbeta_{(k)}^*=\bbeta_{(k)}$. We need both (\ref{eq:lin.pred}) and (\ref{eq:lin.coefs4}) since at times we focus on the $\eta_j$ and at other times on the variables $x_{k}$.
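As a small numerical illustration of (\ref{eq:constraints.VGLM}) and (\ref{eq:lin.coefs4}), the following base \proglang{R} sketch builds the constraint matrices of the toy $\calM_{th}$ example above and recovers $\bB^{\top}$ and $\boldeta(\bix_i)$. The coefficient and covariate values are made up purely for illustration.

<<eval = FALSE>>=
## Toy M_th example with M = 2 and d = 3: unconstrained intercepts, and weight
## and gender effects shared by both linear predictors.  All numerical values
## are hypothetical and chosen only to illustrate the matrix algebra.
H1 <- diag(2)               # intercepts: one coefficient per eta_j
H2 <- matrix(1, 2, 1)       # weight: common effect for eta_1 and eta_2
H3 <- matrix(1, 2, 1)       # gender: common effect for eta_1 and eta_2
beta1.star <- c(-0.5, 0.3)  # (beta*_(1)1, beta*_(2)1)
beta2.star <- 0.02          # beta*_(1)2
beta3.star <- 0.70          # beta*_(1)3
Bt  <- cbind(H1 %*% beta1.star, H2 %*% beta2.star, H3 %*% beta3.star)  # B^T
xi  <- c(1, 60, 1)          # (x_i1, x_i2, x_i3): intercept, weight, gender
eta <- Bt %*% xi            # eta_1(x_i) and eta_2(x_i)
@

Within \pkg{VGAM} itself such matrices are rarely constructed by hand, as described next.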
The constraint matrices for common models are pre-programmed in \pkg{VGAM} and can be set up by using arguments such as \code{parallel} and \code{zero} found in \pkg{VGAM} family functions. Alternatively, there is the argument \code{constraints} where they may be explicitly inputted. Using \code{constraints} is less convenient but provides full generality.

% ---------------------------------------------------------------
\subsection[Handling time-varying covariates]{Handling time-varying covariates}
\label{sec:xij}

Often the covariates may be time-varying; e.g., when temperature is used as a covariate, a different value is observed and measured on each occasion $j=1,\dots,\tau$. Again, using our toy example with $M=2$, $d=3$, and $\tau=2$, suppose we have time-dependent covariates $\bix_{ij}$, $j=1,2$. We may have the model
\begin{eqnarray*}
\eta_1(\bix_{i1}) & = & \beta^*_{(1)1} + \beta^*_{(1)2} \, x_{i21} + \beta^*_{(1)3}\, x_{i31},\\
\eta_2(\bix_{i2}) & = & \beta^*_{(2)1} + \beta^*_{(1)2} \, x_{i22} + \beta^*_{(1)3}\, x_{i32},
\end{eqnarray*}
for the linear predictors on the two occasions. Here, $x_{ikt}$ denotes the value of the $k$th explanatory variable for the $i$th animal at the $t$th time. We write this model as
\begin{eqnarray*}
\boldeta(\bix_{ij}) & = & \begin{pmatrix} x_{i11} & 0\\ 0 & x_{i12} \end{pmatrix} \begin{pmatrix} 1 & 0\\ 0 & 1\end{pmatrix} \begin{pmatrix} \beta^*_{(1)1}\\ \beta^*_{(2)1} \end{pmatrix} + \begin{pmatrix} x_{i21} & 0\\ 0 & x_{i22}\end{pmatrix} \begin{pmatrix} 1\\ 1\end{pmatrix} \beta^*_{(1)2} + \begin{pmatrix} x_{i31} & 0\\ 0 & x_{i32}\end{pmatrix} \begin{pmatrix} 1 \\ 1\end{pmatrix} \beta^*_{(1)3}\\
& = & \sum_{k=1}^3 \, \mathrm{diag}(x_{ik1},x_{ik2}) \, \bH_k\,\bbeta_{(k)}^{*}.
\end{eqnarray*}
Thus to handle time-varying covariates one needs the \code{xij} facility of \pkg{VGAM} (e.g., see Section \ref{sec:poz:posbernoulli.eg.hugg:1991}), which allows a covariate to have different values for different $\eta_{j}$ through the general formula
\begin{eqnarray}
\boldeta(\bix_{ij}) & = & \sum_{k=1}^{d}\, \mathrm{diag}(x_{ik1},\ldots,x_{ikM})\, \bH_k \,\bbeta_{(k)}^{*}=\sum_{k=1}^d \, \bX^{\#}_{(ik)}\bH_k \,\bbeta_{(k)}^{*}
\label{eq:vglimo:xij.vector.diag}
\end{eqnarray}
where $x_{ikj}$ is the value of variable $x_{k}$ for unit $i$ for $\eta_{j}$. The derivation of (\ref{eq:vglimo:xij.vector.diag}), followed by some examples, is given in \cite{yee:2010}. Implementing this model requires specification of the diagonal elements of the matrices $\bX^{\#}_{(ik)}$, and we see its use in Section \ref{sec:poz:posbernoulli.eg.hugg:1991}. Clearly, a model may include a mix of time-dependent and time-independent covariates. The model is then specified through the constraint matrices $\bH_k$ and the covariate matrices $\bX^{\#}_{(ik)}$. Typically in CR experiments, the time-varying covariates will be environmental effects. Fitting time-varying individual covariates requires some interpolation when an individual is not captured and is beyond the scope of the present work.

% ---------------------------------------------------------------
\subsection[VGAMs]{VGAMs}
\label{sec:VGAMs.basics}

VGAMs replace the linear functions in (\ref{eq:constraints.VGLM}) by smoothers such as splines.
Hence, the central formula is
\begin{equation}
\boldeta_i = \sum_{k=1}^d \; \bH_k \, \bif_k^*(x_{ik})
\label{eq:vgam}
\end{equation}
where $\bif_k^*(x_k) = (f_{k(1)}^*(x_k),\ldots,f_{k(M_k)}^*(x_k))^{\top}$ is a vector of $M_k$ smooth functions of $x_k$, and $M_k=\mathtt{ncol}(\bH_k)$ is the rank of the constraint matrix for $x_k$. Note that standard error bands are available upon plotting the estimated component functions (see \cite{yee:wild:1996} for details), e.g., see Figure \ref{fig:poz:deermice}.

%*********************************************************************
\section[VGLMs and VGAMs applied to CR data]{VGLMs and VGAMs applied to CR data}
\label{sec:meth}

In this section we merge the results of Sections \ref{sec:cr} and \ref{sec:vgam} to show how the eight models of \citet{otis:etal:1978} can be fitted naturally within the VGLM/VGAM framework.

% ---------------------------------------------------------------
\subsection[Linear predictors and constraint matrices]{Linear predictors and constraint matrices}
\label{sec:constraints}

As in Section \ref{sec:vgam.basics}, we now write $\biy_i$ as the capture history vector for individual $i$. Written technically, $\biy_i \in (\{0,1\})^{\tau} \backslash\{\bzero_\tau\}$ so that there is at least one 1 (capture). For simplicity let $p_c$ and $p_r$ be the capture and recapture probabilities. Recall that the value of $M$ depends on the CR model type and the number of capture occasions considered in the experiment; for example, for model $\calM_b$ in Table \ref{tab1}, $(\eta_1,\eta_2)=(g(p_c),g(p_r))$ for some link function $g$, so that $M=2$. The upper half of Table \ref{tab2} gives these for the eight \citet{otis:etal:1978} models.

The lower half of Table \ref{tab2} gives the names of the \pkg{VGAM} family functions that fit those models. They work very similarly to the \code{family} argument of \code{glm()}, e.g., <