/* This program shows how to construct comparable coefficients of
   determination (R2) in order to decide which functional form of a
   regression to choose.  */

/* Linear-Linear Monte Carlo Data */

/* Simulate 100 observations from y = 10 + 1*x + 3*e, where x is the
   observation index (1..100) and e is a standard normal draw. */
data in;
  seed = 12345;               /* fixed seed so the draws are reproducible */

  do obs = 1 to 100;
    shock = rannor(seed);     /* N(0,1) disturbance */
    x = obs;
    y = 10 + 1*x + 3*shock;
    output;                   /* write one record per loop iteration */
  end;

  keep y x;                   /* only the regression variables are stored */
run;

/* Append the natural logs of y and x (lny, lnx) to data set IN so the
   log-form models can be estimated. */
data in;
  set in;
  array level{2}  y   x;
  array logged{2} lny lnx;
  do k = 1 to dim(level);
    logged{k} = log(level{k});
  end;
  drop k;                     /* loop index is not part of the data */
run;

/* Plot the simulated y series against the observation index x. */
title 'Monte Carlo Linear-Linear Regression data';

/* Plot symbol: black dots of height 0.8, joined by straight lines. */
symbol1 value=dot color=black interpol=join height=.8;

/* Horizontal axis: 0 to 100 in steps of 10. */
axis1 label=(font=duplex 'Obs')
      order=(0 to 100 by 10);

/* Vertical axis: 0 to 120 in steps of 20. */
axis2 label=(font=duplex 'Y Series')
      order=(0 to 120 by 20);

proc gplot data=in;
  plot y*x / vaxis=axis2 haxis=axis1;
run;

/*  We are going to compare the fits of the following models:

    (1) y = beta1 + beta2*x + e  (linear-linear)
    (2) y = beta1 + beta2*lnx + e (linear-log)
    (3) lny = beta1 + beta2*x + e (log-linear)
    (4) lny = beta1 + beta2*lnx + e (log-log).

   The first two models are easy to compare.  They have the same dependent
   variable y and the same number of explanatory variables, so you
   would choose the model with the higher R2 (coefficient of determination).
   However, comparing the last two models to the first two models is problematic
   because they don't have the same dependent variables.  So what we are going
   to do is get the fitted values of the dependent variables for the
   third and fourth models, say lny(hat), and transform them to fitted
   values of y, yhat, by the following formula:

  		yhat = exp(lny(hat) + 0.5*SE(lny - lny(hat))^2)

  where SE(lny - lny(hat)) is the standard error of the forecast error in the lny 
  equation. Then we will calculate an R2 for y for models (3) and (4) using

  		SSE = sum[(y - yhat)^2]

  and then calculating R2 = 1 - (SSE/TSS)

  where, as usual, TSS = sum[(y - ymean)^2].

  Now we have R2's that are comparable across models.  We should choose the model
  that has the largest R2 among the comparable R2's.  */

/* Here we get the comparable R2's for the first two models. */

/* Model (1): regress y on x (levels on levels).  Its R2 is already
   expressed in terms of y. */
title 'Linear-Linear Model';
proc reg data=in;
  model y = x;
run;

/* Model (2): regress y on ln(x).  The dependent variable is still y, so
   its R2 is directly comparable with model (1). */
title 'Linear-Log Model';
proc reg data=in;
  model y = lnx;
run;

/* Model (3): regress ln(y) on x.  For every observation, save the fitted
   value of ln(y) (lnyhat) and the standard error of the individual
   prediction (stdi) in data set RESULT1. */
title 'Log-Linear model';
proc reg data=in;
  model lny = x;
  output out=result1 predicted=lnyhat stdi=stdi;
run;

/* Transform the fitted logs back to fitted levels of y and build the
   squared terms used for the comparable R2. */
data result1 (keep=y2 y yhat ey2 lnyhat stdi);
  set result1;
  halfvar = 0.5*stdi*stdi;          /* half the squared forecast std. error */
  yhat    = exp(lnyhat + halfvar);  /* fitted y implied by the log model    */
  ey2     = (y - yhat)**2;          /* squared residual in levels of y      */
  y2      = y**2;
run;

/* Comparable R2 for the log-linear model, measured in levels of y:

       R2 = 1 - SSE/TSS,
       SSE = sum[(y - yhat)^2],   TSS = sum[(y - ymean)^2].

   PROC MEANS supplies both sums directly: SUM(ey2) is SSE and CSS(y),
   the corrected sum of squares of y, is TSS.  Taking TSS from CSS avoids
   forming it as num*y2mean - num*ymean**2, which subtracts two large,
   nearly equal numbers and loses precision when the mean of y is large
   relative to its spread.  The means and the count are still written to
   CALC1 so its existing columns remain available. */
proc means data = result1 noprint;
  var y2 y ey2;
  output out = calc1
         mean     = y2mean ymean ey2mean
         n(y)     = num
         css(y)   = sst
         sum(ey2) = sse;
run;

data calc1;
  set calc1;
  /* R2 is undefined when y has no variation; leave it missing rather
     than dividing by zero. */
  if sst > 0 then R2 = 1 - (sse/sst);
  else R2 = .;
run;

title 'Comparable R2 for the Log-Linear Model';

proc print data = calc1;
  var R2 sse sst;
run;

/* Model (4): regress ln(y) on ln(x).  For every observation, save the
   fitted value of ln(y) (lnyhat) and the standard error of the individual
   prediction (stdi) in data set RESULT2. */
title 'Log-Log Model';
proc reg data=in;
  model lny = lnx;
  output out=result2 predicted=lnyhat stdi=stdi;
run;

/* Transform the fitted logs back to fitted levels of y and build the
   squared terms used for the comparable R2. */
data result2 (keep=y2 y yhat ey2 lnyhat stdi);
  set result2;
  halfvar = 0.5*stdi*stdi;          /* half the squared forecast std. error */
  yhat    = exp(lnyhat + halfvar);  /* fitted y implied by the log model    */
  ey2     = (y - yhat)**2;          /* squared residual in levels of y      */
  y2      = y**2;
run;

/* Comparable R2 for the log-log model, measured in levels of y:

       R2 = 1 - SSE/TSS,
       SSE = sum[(y - yhat)^2],   TSS = sum[(y - ymean)^2].

   PROC MEANS supplies both sums directly: SUM(ey2) is SSE and CSS(y),
   the corrected sum of squares of y, is TSS.  Taking TSS from CSS avoids
   forming it as num*y2mean - num*ymean**2, which subtracts two large,
   nearly equal numbers and loses precision when the mean of y is large
   relative to its spread.  The means and the count are still written to
   CALC2 so its existing columns remain available. */
proc means data = result2 noprint;
  var y2 y ey2;
  output out = calc2
         mean     = y2mean ymean ey2mean
         n(y)     = num
         css(y)   = sst
         sum(ey2) = sse;
run;

data calc2;
  set calc2;
  /* R2 is undefined when y has no variation; leave it missing rather
     than dividing by zero. */
  if sst > 0 then R2 = 1 - (sse/sst);
  else R2 = .;
run;

title 'Comparable R2 for the Log-Log Model';

proc print data = calc2;
  var R2 sse sst;
run;