/*  This program generates several situations where the wrong and right
transformations are used in modeling a Monte Carlo data set.  In the first instance
we generate a Random Walk with Drift data set where simple differencing would be appropriate
and log differencing would not.  Secondly we generate an exponential growth data set 
where a log difference transformation would be appropriate and a simple differencing would not.
In both cases we use in-sample MSEs and MAEs to verify which transformation is the correct one. */ 

/*  This segment of the program generates a Random Walk with Drift  */

/* Simulate a random walk with drift: each new value is the previous value
   plus 1 plus 2 times a RANNOR draw.  The loop runs for t = 0..100 and the
   t = 0 value is not written out, so 100 observations are produced.  Only
   x2 is kept. */
data mc1 (keep=x2);
  iseed = 445;   /* seed passed to RANNOR */
  x1    = 5;     /* starting level of the series */

  do t = 0 to 100;
    a  = rannor(iseed);
    x2 = 1.0 + 1.0*x1 + 2*a;

    /* skip the t = 0 value */
    if t >= 1 then output;

    /* the current value becomes the lagged value of the next iteration */
    x1 = x2;
  end;
run;

/* Build the analysis variables from the simulated series:
     time    - observation counter (_N_)
     y       - the simulated level (copy of x2)
     logy    - natural log of y
     ydif    - first difference of y
     diflogy - first difference of log(y)                                  */
data mc1;
  set mc1;
  time = _N_;
  y = x2;
  /* Nothing in the simulation forces y to stay positive.  LOG of a value
     <= 0 is an invalid argument, so leave logy missing in that case
     instead of calling LOG on it. */
  if y > 0 then logy = log(y);
  ydif = dif(y);
  /* DIF is executed on every observation so its lag queue stays aligned
     with the rows; a missing logy simply gives a missing difference. */
  diflogy = dif(logy);
  drop x2;
run;

  /* The next plots are (a) a plot of a Random Walk with Drift, (b) a differenced
  random walk with drift (the right transformation!), (c) a log difference plot
  of the Random Walk with drift data (the wrong transformation!), and (d) a plot of the
  residuals of a deterministic trend fit to the Random Walk with Drift Data (again
  the wrong transformation because the residuals of the trend regression are slow
  turning and not stationary).  */

/* Plot y against time with a fitted regression line (REG statement);
   the automatic legend is suppressed.  The title is cleared afterwards. */
title 'Random Walk with Drift Data';
proc sgplot data=mc1 noautolegend;
  reg x=time y=y /
      lineattrs=(color=black)
      markerattrs=(size=5.0);
run;

title;

/* Plot the first difference of y against time as joined dots, with a
   horizontal reference line at 1.  Titles are cleared afterwards. */
proc gplot data=mc1;
  title1 'Differenced RW With Drift';
  title2 'X=Time Y=Diff. RW Series';
  symbol1 value=dot color=black interpol=join height=.8;
  axis1 order=(0 to 100 by 10) label=(font=duplex 'Obs');
  axis2 order=(-10 to 10 by 1.0) label=(font=duplex 'Dif Series');
  plot ydif*time / haxis=axis1 vaxis=axis2 overlay vref=1;
run;

title;

/* Plot the difference of logs of y against time as joined dots, with a
   horizontal reference line at 0.  Titles are cleared afterwards. */
proc gplot data=mc1;
  title1 'Difference of Logs when RW With Drift';
  title2 'X=Time Y=Diff. Log Series';
  symbol1 value=dot color=black interpol=join height=.8;
  axis1 order=(0 to 100 by 10) label=(font=duplex 'Obs');
  axis2 order=(-2 to 2 by 1.0) label=(font=duplex 'Dif Log Series');
  plot diflogy*time / haxis=axis1 vaxis=axis2 overlay vref=0;
run;

title;

/* Regress y on time (a linear deterministic trend) and write the
   residuals to data set RESID in a variable named resid. */
proc reg data=mc1;
  model y = time;
  output out=resid residual=resid;
run;

/* Plot the deterministic-trend residuals against time as joined dots,
   with a horizontal reference line at 0. */
proc gplot data=resid;
  symbol v=dot c=black i=join h=.8;
  title1 'Deterministic Trend Residuals for RW with Drift Data';
  title2 'X = Time Y = Residuals from Deterministic Trend';
  axis1 order=(0 to 100 by 10)
  label=(f=duplex 'OBS');
  axis2 order=(-40 to 40 by 10)
  label=(f=duplex 'Residuals');
  plot resid*time / haxis=axis1 vaxis=axis2 overlay vref = 0;
  /* RUN was missing here.  Without it the PLOT request was still pending
     when the TITLE statement below executed, so the titles could be
     cleared before the graph was drawn. */
  run;

  title;

  /* The two data sets below MSE1 and MSE2 contain the within-sample errors of the 
  correct transformation and the incorrect transformation, respectively.
  After comparing the MSEs and MAEs of the two competing transformations
  we see that the simple difference applied to the Random Walk with Drift
  data has the smallest MSE and MAE thus indicating that it is the best transformation
  of the data (as compared to the log difference transformation).  */

/* Model the once-differenced y with no AR terms (p = 0).  LEAD=0 asks for
   no forecasts past the end of the sample; the OUT= data set RESULT1 is
   used below for its y and forecast columns. */
proc arima data=mc1 out=result1;
  identify var=y(1);
  estimate p=0;
  forecast lead=0;
run;

/* Within-sample forecast errors: squared error (e12) and absolute
   error (abse1).  Only these two columns are kept. */
data MSE1 (keep=e12 abse1);
  set result1;
  e1    = y - forecast;
  e12   = e1**2;
  abse1 = abs(e1);
run;

/* The performance of the simple difference transformation */
title 'The performance of simple difference transformation applied to RW with Drift data';
proc means data=MSE1;
  var e12 abse1;
run;

title;

   /* Model the once-differenced logy with no AR terms (p = 0); LEAD=0
      asks for no forecasts past the end of the sample. */
   proc arima data=mc1 out=result2;
     identify var=logy(1);
     estimate p=0;
     forecast lead=0;
   run;

   /* Pair each row of mc1 with the same-numbered row of result2 (there is
      no BY statement, so rows are matched by position).  The log-scale
      forecast is converted back to the level of y using
      exp(forecast + 0.5*std*std), then squared (e22) and absolute (abse2)
      errors are computed. */
   data MSE2;
     merge mc1 result2;
     yhat  = exp(forecast + 0.5*std*std);
     e2    = y - yhat;
     e22   = e2**2;
     abse2 = abs(e2);
   run;

   /*  The performance of the difference of logs transformation */
   title 'Performance of difference of logs transformation applied to RW with Drift data';
   proc means data=MSE2;
     var e22 abse2;
   run;

   title;

	  /* The exponential growth data is generated here. */

/* Simulate the log of the growth series: each new value is the previous
   value plus .03 plus .03 times a RANNOR draw.  The loop runs for
   t = 0..100 and the t = 0 value is not written out.  t and x2 are kept. */
data mc2 (keep=t x2);
  iseed = 333;   /* seed passed to RANNOR */
  x1    = 0;     /* starting value */

  do t = 0 to 100;
    a  = rannor(iseed);
    x2 = .03 + 1.0*x1 + a*(.03);

    /* skip the t = 0 value */
    if t >= 1 then output;

    /* the current value becomes the lagged value of the next iteration */
    x1 = x2;
  end;
run;

/* Exponentiate the simulated series to obtain the growth series y and
   add an observation counter. */
data mc2;
  set mc2;
  y    = exp(x2);
  time = _N_;
run;

/* The next plots are (a) a plot of an exponential growth series, (b) the differenced
  growth series (the wrong transformation!),
  (c) a plot of the logged series, which appears linear in its trend, and (d) the differenced log
  series.  (The logged series is a random walk with drift, so this is the right transformation!)  */

/* Plot the growth series y against time as joined dots. */
proc gplot data=mc2;
  title1 'Monte Carlo Growth Data';
  title2 'X=Time Y=Growth Series';
  symbol1 value=dot color=black interpol=join height=.8;
  axis1 order=(0 to 100 by 10) label=(font=duplex 'Obs');
  axis2 order=(0 to 25 by 5) label=(font=duplex 'Growth Series');
  plot y*time / haxis=axis1 vaxis=axis2;
run;

/* Add the first difference of y, the log of y (taken directly from x2,
   since y = exp(x2)), and the first difference of that log. */
data mc2;
  set mc2;
  ydif    = dif(y);
  logy    = x2;
  logydif = dif(logy);
run;

/* Plot the first difference of the growth series against time as joined
   dots, with a horizontal reference line at 0.  Titles are cleared
   afterwards. */
proc gplot data=mc2;
  title1 'Difference of Monte Carlo Growth Data';
  title2 'X=Time Y=Diff. of Growth Series';
  symbol1 value=dot color=black interpol=join height=.8;
  axis1 order=(0 to 100 by 10) label=(font=duplex 'Obs');
  axis2 order=(-2.0 to 2.0 by .2) label=(font=duplex 'Delta Y');
  plot ydif*time / haxis=axis1 vaxis=axis2 vref=0;
run;

title;

/* Plot the logged series against time as joined dots.  Titles are
   cleared afterwards. */
proc gplot data=mc2;
  title1 'Log of Series';
  title2 'X=Time Y=Log of Series';
  symbol1 value=dot color=black interpol=join height=.8;
  axis1 order=(0 to 100 by 10) label=(font=duplex 'Obs');
  axis2 order=(-1 to 4 by 1) label=(font=duplex 'Log Y');
  plot logy*time / haxis=axis1 vaxis=axis2;
run;

title;

/* Plot the differenced log series against time as joined dots, with a
   horizontal reference line at 0.03.  Titles are cleared afterwards. */
proc gplot data=mc2;
  title1 'Differenced Log of Growth Series = Percentage Change';
  title2 'X=Time Y = Diff. Log Growth Series = Percentage Change';
  symbol1 value=dot color=black interpol=join height=.8;
  axis1 order=(0 to 100 by 10) label=(font=duplex 'Obs');
  axis2 order=(-.15 to .20 by .05) label=(font=duplex 'Perc. Change');
  plot logydif*time / haxis=axis1 vaxis=axis2 overlay vref=0.03;
run;

title;

/* The two data sets below MSE3 and MSE4 contain the within-sample errors of the 
  incorrect and correct transformations, respectively.
  After comparing the MSEs and MAEs of the two competing transformations
  we see that the difference of logs transformation of the exponential growth data
  has the smallest MSE and MAE thus indicating that it is the best transformation of
  the data (as compared to the simple difference transformation).  */

/* Model the once-differenced y with no AR terms (p = 0).  LEAD=0 asks for
   no forecasts past the end of the sample; the OUT= data set RESULT3 is
   used below for its y and forecast columns. */
proc arima data=mc2 out=result3;
  identify var=y(1);
  estimate p=0;
  forecast lead=0;
run;

/* Within-sample forecast errors: squared error (e32) and absolute
   error (abse3).  Only these two columns are kept. */
data MSE3 (keep=e32 abse3);
  set result3;
  e3    = y - forecast;
  e32   = e3**2;
  abse3 = abs(e3);
run;

/*  Performance of simple difference transformation applied to exponential growth data */
title 'Performance of simple difference transformation applied to exponential growth data';
proc means data=MSE3;
  var e32 abse3;
run;

title;

 /* Model the once-differenced logy with no AR terms (p = 0); LEAD=0 asks
    for no forecasts past the end of the sample. */
 proc arima data=mc2 out=result4;
   identify var=logy(1);
   estimate p=0;
   forecast lead=0;
 run;

 /* List the ARIMA output data set. */
 proc print data=result4;
 run;

 /* Pair each row of mc2 with the same-numbered row of result4 (there is
    no BY statement, so rows are matched by position).  The log-scale
    forecast is converted back to the level of y using
    exp(forecast + 0.5*std*std), then squared (e42) and absolute (abse4)
    errors are computed. */
 data MSE4;
   merge mc2 result4;
   yhat  = exp(forecast + 0.5*std*std);
   e4    = y - yhat;
   e42   = e4**2;
   abse4 = abs(e4);
 run;

 /* Performance of difference of logs transformation applied to exponential growth data */
 title 'Performance of difference of logs transformation applied to exponential growth data';
 proc means data=MSE4;
   var e42 abse4;
 run;

 title;

/* Performance of naive log transformation without the standard deviation correction */

     /* Repeat the error calculation on MSE4, this time converting the
        log-scale forecast with exp(forecast) alone, i.e. without the
        0.5*std*std term.  Squared (e4n2) and absolute (abse4n) errors
        are computed. */
     data MSE5;
       set MSE4;
       yhat_alt = exp(forecast);
       e4n      = y - yhat_alt;
       e4n2     = e4n**2;
       abse4n   = abs(e4n);
     run;

     title 'Performance of naive log transformation without the standard deviation correction'; 
     proc means data=MSE5;
       var e4n2 abse4n;
     run;