/* ----- PROGRAM c:\racd\racd3p3.gau  July 1998 ------

  This GAUSS program does Poisson PMLE for data in Chapter 3 of
   A.C. Cameron and Pravin K. Trivedi (1998)
   REGRESSION ANALYSIS OF COUNT DATA,
   Econometric Society Monograph No.30, Cambridge University Press.

  A title such as Table 3.3, col.3, p.69 means Table 3.2 column 3 page 69

  For more details on the data see file racd3.1st

  The data set racd3.asc is the same data as originally used in
  (1) A.C. Cameron and P.K. Trivedi (1986), "Econometric Models Based on
  Count Data: Comparisons and Applications of  Some Estimators and Tests",
  Journal of Applied Econometrics, Vol. 1, pp. 29-54.
  and in Table 4
  (2) A.C. Cameron, P.K. Trivedi, F. Milne and J. Piggott (1988),
  "A Microeconometric Model of the Demand for Health Care and Health
  Insurance in Australia", Review of Economic Studies, Vol.55, pp. 85-106.
  though in places this paper used more variables than those here
  and in
  (3) A.C. Cameron, P.K. Trivedi (1993),
  "Tests of Independence in Parametric Models with Applications and
   Illustrations", Journal of Business & Economics Statistics,
   Vol.11, pp.29-43.
  and in a number of other subsequent papers.

  This data is not a representative sample of Australians as it oversamples
  young and old. In particular, use of health services may be overstated.
  This is because while the original sample of 40,650 individuals
  from the 1977-78 Australian Health Survey is representative,
  the sample used here is restricted to single people over 18 years of age.

  See the R.E.Stud. (1988, pp.85-106) section 3 for more detailed
  discussion of the data than that given in the RACD book.

  The regressors are defined in RACD on page 68
  Socioeconomic:
   SEX       1 if female, 0 if male
   AGE       Age in years divided by 100
             (measured as mid-point of 10 age groups from 15-19 years to
              65-69 with 70 or more coded treated as 72)
   AGESQ     AGE squared
   INCOME    Annual income in Australian dollars divided by 1000
             (measured as mid-point of coded ranges Nil, <200, 200-1000,
              1001-, 2001-, 3001-, 4001-, 5001-, 6001-, 7001-, 8001-10000,
              10001-12000, 12001-14000, with 14001- treated as 15000
  Health insurance:
   LEVYPLUS  1 if covered by private health insurance fund for private
             patient in public hospital (with doctor of choice), 0 otherwis
   FREEPOOR  1 if covered by government because low income, recent immigran
             unemployed, 0 otherwise
   FREEREPA  1 if covered free by government because of old-age or disabili
             pension, or because invalid veteran or family of deceased
             veteran, 0 otherwise
   (Omitted category LEVY is 1 if covered by Medibank health insurance)
  Health status:
   ILLNESS   Number of illnesses in past 2 weeks with 5 or more coded as 5
   ACTDAYS   Number of days of reduced activity in past two weeks due to
             illness or injury
   HSCORE    General health questionnaire score using Goldberg's method.
             High score indicates bad health.
   CHCOND1   1 if chronic condition(s) but not limited in activity, 0 other
   CHCOND2   1 if chronic condition(s) and limited in activity, 0 otherwise

  Note that the R.E.Stud. (1988) article uses different names:
        FREEREPA was called FREEOTHER
        HSCORE   was called GHQ
        CHCOND1  was called LIMCHRON
        CHCOND2  was called NONLIMCHRON

  The count variables are defined in R.E.Stud. (1988, pp.91)
   DVISITS  Number of consultations with a doctor or specialist
             (same as DOCTORCON in R.E.Stud. (1988)
                 and NOCNSLT in J.A.E.(1986))
   NONDOCCO Number of consultations with non-doctor health professionals
            (chemist, optician, physiotherapist, social worker,
             district community nurse, chiropodist or chiropractor)
             in the past 2 weeks
    HOSPADMI Number of admissions to a hospital, psychiatric hospital,
            nursing or convalescent home in the past 12 months
            (up to 5 or more admissions which is coded as 5)
   HOSPDAYS Number of nights in a hospital, etc. during most recent
            admission: taken, where appropriate, as the mid-point of the
            intervals 1, 2, 3, 4, 5, 6, 7, 8-14, 15-30, 31-60, 61-79
            with 80 or more admissions coded as 80. If no admission
            in past 12 months then equals zero.
   MEDICINE Total number of prescribed and nonprescribed medications used
            in past 2 days
   PRESCRIB Total number of prescribed medications used in past 2 days
   NONPRESC Total number of nonprescribed medications used in past 2 days

  This program performs Poisson PMLE regression of
     DVISITS
  on ten regressors including the intercept:
     intercept,SEX,AGE,AGESQ,INCOME,LEVYPLUS,FREEPOOR,FREEREPA,
     ILLNESS,ACTDAYS,HSCORE,CHCOND1,CHCOND2
*/

/******* Initial setup: libraries used etc. and output file ********/

New;                              @ clear workspace @
outfile = "c:\\racd\\racd3p3.out";
output file = ^outfile reset;
print "File racd3p3.out is output from Gauss program racd3p3.gau";
print "Date is:";
print date;

/******* Read in the data  ********/

data ="c:\\racd\\racd3";
print;
call dstat(data,0);           @ check that data okay @
print;
let YVAR = DVISITS;
let XVAR = CONSTANT,SEX,AGE,AGESQ,INCOME,LEVYPLUS,FREEPOOR,FREEREPA,
              ILLNESS,ACTDAYS,HSCORE,CHCOND1,CHCOND2;

/* Set data size dimensions and some other variables
   Will need to change nobs in different examples   */

m = 1;           @ Number of dependent variables @
k = rows(xvar);  @ Number of regressors @
nobs = 5190;     @ Maximum number of rows to read in dataset @

/* Call opendat, rad, readdat in seldata.src
   This gives
        x       - nobs*k matrix of explanatory variables
        y       - nobs*m matrix of dependent variables
        indy    - Index for the dependent variables
        indx    - Index for the exogenous variables
        f1      - File handle for the open data set
        n       - Number of observations in the data set
        nror    - number of times data set needs to be read to be complete
    rest    - number of obervations in the time data is read
   The data are read into memory one time.
   Nobs must be equal to number of observations in the data set
*/

{f1,n,indy,indx} = opendat(data,yvar,xvar,&indices,m);
{nror, rest }    = rad(nobs,n);
{ y,x }          = Readdat(f1,1,nobs,indy,indx);


/******** Do the statistical analysis    *********/

/* Obtain starting values from ols regression of lny on x */
   b = invpd(x'x)*x'(ln(y+(y.==0)*0.1));

/* Estimate parameters using Newton-Raphson algorithm */

/* For the Poisson log-density
     ln L = Sum_i {-mu_i + y_i*ln(mu_i) - y_i!}
   where
     mu_i = exp(x_i'b)

   The first-order conditions are
     Sum_i (y_i - exp(x_i'b))x_i = 0

   and the second-derivative matrix is
     Sum_i -exp(x_i'b))x_i*x_i' = 0          */


/* In the code below sco is the first derivative vector
                                  divided by n (for numerical stability)
   So  sco = (x'(y-mu))/n
           = [ Sum_i (y_i - exp(x_i'b))x_i ] / n
   and hes is the second-derivative matrix
                                  divided by n (for numerical stability)
   So  hes = -x'(x.*mu)/n
           = [ Sum_i (-exp(x_i'b))x_i*x_i' ] / n                       */

/* Begin newton-raphson iterations  */
    cha = 1;
    n = nobs;
    do until cha<1e-16;
        mu = exp(x*b);
        sco = (x'(y-mu))/n;
        hes = -x'(x.*mu)/n;
        bo = b;
        b = bo + invpd(-hes)*sco;
        cha = (bo-b)'(bo-b)/b'b;
    endo;

/* Obtain mle standard errors and t-statistics
   Several ways are given:
   (1) Usual mle using minus inverse of hessian
   (2) Robust sandwich or Eicker-White or Huber
   (3) Corrected mle assuming variance multiple of mean
   This program then uses (3)
*/

covusual = invpd(-n*hes);  @ (1) usual ml estimate of covariance @
opg = (x.*(y-mu))'(x.*(y-mu)); @ outer product of the gradient @
covrobus = covusual*opg*covusual; @ (2) robust sandwich/  Huber/ Eicker-White @
alphanb1 = (1/(n-k))*(y-mu)'((y-mu)./mu);
covnb1 = alphanb1*covusual; @ (3) nb1 estimate using nb1 variance function @

bmle = b;
covmle = covnb1;    @ Change this line for alternative variance estimators @
varmle = diag(covmle);
semle = sqrt(varmle);
tmle = bmle./semle;

/* print results */
print;
print "Cameron & Trivedi REGRESSION ANALYSIS OF COUNT DATA Table 3.3 p.69";
print;
print "Poisson PML regression";
print "Standard errors and t-statistics assume NB1 variance function";
print;
print $yvar "is the dependent variable";
print;

print "Variable      Poisson PMLE   Standard errors    t-statistics";
results=ones(k,4);
results[.,1]=xvar;
results[.,2:4]=bmle~semle~tmle;
let resmask[1,4]= 0 1 1 1;  @ character numeric numeric numeric @
call printfmt(results,resmask);

/* End of program with procedures following below */


/* Procedures opendat, rad, readdat   **********************************

  Following procedures written 4/16/97 based on Per's fsa2.src.
  In turn I think this is based on Frank Windmeijer's code.
  Basically given a Gauss data set with names
  Calling opendat, rad, readdat in seldata.src gives
        x       - nobs*k matrix of explanatory variables
        y       - nobs*m matrix of dependent variables
        indy    - Index for the dependent variables
        indx    - Index for the exogenous variables
        f1      - File handle for the open data set
        n       - Number of observations in the data set
        nror    - number of times data set needs to be read to be complete
        rest    - number of obervations in the time data is read

-----------------------------------------------------------------------
 Open the dataset and get the indices of the explanatory
 and dependent variables

{ f1,n,indy,indx } = opendat(data,yvar,xvar,&indices,m);

INPUT:  data      - String data set to be used
        yvar      - Character vecor of names of dependent variables
        xvar      - Charcter vector of names of explantory variables
        &indices  - Proc Indices.src, should be included in the main
                program
        m         - The Number of dependent variables

OUTPUT: f1      - File handle for the open data set
        n       - Number of observations in the data set
        indy    - Index for the dependent variables
        indx    - Index for the exogenous variables

-------------------------------------------------------------------
{ nror, rest } = rad(nobs,n);

INPUT:  n       - number of observations in the data set
    nobs    - number of observations to read

OUTPUT: nror    - number of times data set needs to be read
                  to be complete
    y       - number of obervations in the time data is read

-----------------------------------------------------------------------

Reads the nobs observations of the data set

{ x,y } = Readdat(f1,r,nobs,indy,indx);

INPUT:  f1      - File handle for the open data set
        r       - the row where to start the reading
        nobs    - number of observations to read
        indy    - Index for the dependent variables
        indx    - Index for the exogenous variables
              program

OUTPUT: x       - nobs*k matrix of explanatory variables
        y       - nobs*m matrix of dependent variables

-----------------------------------------------------------------------

*/

proc (4) = opendat(data,yvar,xvar,&index,m);
local f1,n,p,var;
local depvar,indy,ind,indx;
local index:proc;
p = rows(yvar|xvar);
  { var,ind } = index(data,yvar|xvar);
indy = ind[1:m];
indx = ind[m+1:p];
open f1 = ^data;
n = rowsf(f1);
retp(f1,n,indy,indx);
endp;

proc (2) = rad(nobs,n);
local a,nror,rest;
if n > nobs;
a = n/nobs;
nror = ceil(a);
rest = nobs*(a- (nror-1));
else;
nror = 1;
rest = n;
endif;
retp(nror,rest);
endp;

proc (2) = Readdat(f1,r,nobs,indy,indx);
local y,z,x;
y = seekr(f1,r);
    z = readr(f1,nobs);
    x = z[.,indx];
    y = z[.,indy];
retp(y,x);
endp;