/* mlr.c Multiple Linear Regression */ // Version 4. Added "r^2" and "Sy.x" to printout #include #include #include #define MAXVARS 25 // #define MAXOBS 300 #define MAXOBS 1000 #define MAXNAMLEN 8 /* Length of variable names */ #define MAXTITLE 70 /* Max title length */ #define FALSE 0 #define TRUE !FALSE #define OK TRUE double b[MAXVARS+1][MAXVARS+1]; double b1[MAXVARS+1][MAXVARS+1]; int byvariable = FALSE; /* Option: read data by variables? */ double det; /* Determinant */ double dw; /* Durbin-Watson statistic */ int echo = FALSE; /* Option: echo input data? */ double epsd = 1e-50; /* Zero toler in determinant calc */ double epsp = 1e-50; /* Zero toler in invert */ double fcal; char *filename = "flurp.dat "; FILE *fn, *fopen(); int k; /* Number of variables */ double l[MAXVARS+1]; double msreg; double msres; double mx[MAXVARS+1]; int n; /* Number of observations */ double r[MAXVARS+1][2*(MAXVARS+1)]; /* Correlation matrix */ double r1[MAXVARS+1][MAXVARS+1]; /* Is this used? */ int residuals = FALSE; /* Option: print residuals report? */ double rhi; /* Largest | residual | */ double rsq; /* R-sq statistic */ double s[MAXVARS+1]; double see; /* Std err of estimate */ double ssreg; double ssres; double sst; char title[80]; /* Title for problem */ char vname[MAXVARS+1][MAXNAMLEN+1]; /* Names for variables */ double x[MAXOBS+1][MAXVARS+1]; double x1[MAXOBS+1][MAXVARS+2]; main(argc,argv) int argc; char *argv[]; { if (!init_read(argc,argv)) return; if (echo) showdat(); descriptive(); corr(); prt_corr(); if(!deter()) { printf("Singular correlation matrix. Cannot perform regression\n"); return; } summary(); anova(); if(residuals) resid(); forecast(); } init_read(argc,argv) int argc; char *argv[]; { register int i,j; double q; char buf[20], *ch, *tempp; if (argc == 1) { help(); return !OK; } for(i=1; i MAXVARS || n > MAXOBS) { printf("Maximum of %d variables and %d observations allowed\n",MAXVARS,MAXOBS); return !OK; } for (i=1; i<=k; ++i) { mx[i] = s[i] = 0.0; fscanf(fn," %s ",buf); tempp = strncpy(&vname[i][0],buf,MAXNAMLEN+1); vname[i][MAXNAMLEN] = '\0'; } printf("and dependent variable %s\n\n",vname[k]); if (!byvariable) { /* Read data by observations */ for (i=1; i<=n; ++i) for (j=1; j<=k; ++j) { fscanf(fn,"%lf",&q); x[i][j] = q; x1[i][j] = q; mx[j] += q; s[j] += q*q; } } else { /* Read data variable-at-a-time */ for (j=1; j<=k; ++j) for (i=1; i<=n; ++i) { fscanf(fn,"%lf",&q); x[i][j] = q; x1[i][j] = q; mx[j] += q; s[j] += q*q; } } return OK; } showdat() { register int i,j; printf(" *** INPUT DATA ***\n\n "); for(j=1; j<=k; ++j) printf(" %9s",vname[j]); cr(); for(i=1; i<=n; ++i) { printf("%3d. ",i); for (j=1; j<=k; ++j) printf(" %9.2lf",x[i][j]); cr(); } cr(); } cr() /* Carriage return */ { printf("\n"); } descriptive() { register int i,j; char *fh="%-12s%15s%15s%15s\n"; char *fd="%-12s%15.4lf%15.4lf%15.4lf\n"; double mean,var; printf(" *** DESCRIPTIVE STATISTICS ***\n\n"); printf(fh,"Variable","Mean","Variance","StdDev"); printf(fh,"--------","----","--------","------"); for (j=1; j<=k; ++j) { mean = mx[j]/n; for (var=0.0, i=1; i<=n; ++i) var += (x[i][j]-mean)*(x[i][j]-mean); var /= (n-1); printf(fd,vname[j],mean,var,sqrt(var)); } printf(fh,"--------","----","--------","------"); } corr() /* Compute correlation matrix */ { register int j,i,ir,ic; for (j=1; j<=k; ++j) { s[j] -= mx[j]*mx[j]/n; mx[j] /= n; } for (i=1; i<=n; ++i) { for (j=1; j<=k; ++j) x[i][j] = (x[i][j]-mx[j])/sqrt(s[j]); } --k; /* Note: K is redefined here */ for (ir=1; ir<=k+1; ++ir) { for (ic=ir+1; ic<=k+1; ++ic) { r[ir][ic] = 0.0; for (i=1; i<=n; ++i) r[ir][ic] += x[i][ir]*x[i][ic]; r[ic][ir] = r[ir][ic]; } r[ir][ir] = 1.0; } sst = s[k+1]; } prt_corr() { register int i,j; printf("\n *** CORRELATION MATRIX ***\n\n "); for (i=1; i<=k+1; ++i) printf(" %8s",vname[i]); cr(); for (i=1; i<=k+1; ++i) { printf(" %-8s",vname[i]); for (j=1; j<=k+1; ++j) printf(" %6.3lf",r[i][j]); cr(); } cr(); } deter() /* Calculate determinant */ { int i,j,k1; double adj, e, sav, sx, rs[MAXOBS+1], r2[MAXVARS+1][MAXVARS+1]; double res, yc, ycalc(); int kk; for(i=1; i<=k; ++i) for (j=1; j<=k+1; ++j) r1[i][j] = r[i][j]; kk = 0; adj = 1.0; for (k1=k; k1>=2; --k1) { ++kk; i = j = 1; while (fabs(r1[i][j]) < epsd) { if (i == k1) return !OK; else ++i; } if (i != 1) { for (j=1; j<=k1; ++j) { sav = r1[1][j]; r1[1][j]= r1[i][j]; r1[i][j]= sav; } adj = -adj; } rs[kk] = r1[1][1]; for (j=1; j<=k1; ++j) r1[1][j] /= rs[kk]; for (i=1; i<=k1-1; ++i) for (j=1; j<=k1-1; ++j) r2[i][j] = r1[i+1][j+1]-r1[1][j+1]*r1[i+1][1]; for (i=1; i<=k1-1; ++i) for (j=1; j<=k1-1; ++j) r1[i][j] = r2[i][j]; } det = r1[1][1]*adj; for (i=1; i<=k-1; ++i) det *= rs[i]; printf("Determinant of dependent variables' submatrix = %f\n",(float)det); if (fabs(det) < epsd) return !OK; for(i=1; i<=k; ++i) for(j=1; j<=k; ++j) r1[i][j] = r[i][j]; invert(); /* Compute coefficients */ for (i=1; i<=k; ++i) for (l[i]=0.0, j=1; j<=k; ++j) l[i] += r[i][j]*r1[j][k+1]; for (rsq = 0.0, i=1; i<=k; ++i) rsq += l[i]*r1[i][k+1]; ssreg = rsq*sst; ssres = sst-ssreg; msres = ssres/(n-k-1); msreg = ssreg/k; fcal = ssreg/(msres*k); for (sx=mx[k+1], i=1; i<=k; ++i) { b1[0][i] = l[i]; b[0][i] = sqrt(s[k+1]/s[i])*l[i]; sx -= b[0][i]*mx[i]; } b[0][0] = sx; for (i=1; i<=n; ++i) { for (e = 0.0, j=1; j<=k; ++j) e += b[0][j]*x1[i][j]; rs[i] = x1[i][k+1]-sx-e; } for (e=0.0, i=2; i<=n; ++i) { sx = rs[i]-rs[i-1]; e += sx*sx; } dw = e/ssres; for (see=0.0,i=1; i<=n; ++i) /* Compute Y-estimated */ { yc = ycalc(&x1[i][0]); x1[i][0] = yc; res = fabs(x1[i][k+1]-yc); if(res>rhi) rhi = res; see += res*res; } see = sqrt(see/(n-k-1)); return OK; } invert() { register int i,ir,ic,p; double imax, max, xmlt; int l[MAXVARS+1], pvt, itemp; for(ir=1; ir <=k; ++ir) { for (ic=k+1; ic <= k+k; ++ic) r[ir][ic] = 0.0; r[ir][ir+k] = 1.0; l[ir] = ir; } for (p=1; p<=k; ++p) { max = 0.0; pvt = p; for (i=p; i<=k; ++i) { imax = fabs(r[l[i]][p]); if (max < imax) { max = imax; pvt = i; } } if (pvt != p) {itemp = l[p]; l[p]=l[pvt]; l[pvt]=itemp;} for (ic=p+1; ic <= k+k; ++ic) r[l[p]][ic] /= r[l[p]][p]; r[l[p]][p] = 1.0; for (ir=1; ir<=k; ++ir) { if (ir == p) continue; xmlt = r[l[ir]][p]; if (fabs(xmlt) < epsp) continue; for (ic=p; ic<= k+k; ++ic) r[l[ir]][ic] -= xmlt*r[l[p]][ic]; } } for (ir=1; ir<=k; ++ir) for (ic=1; ic<=k; ++ic) r[ir][ic] = r[l[ir]][ic+k]; } anova() /* Print analysis of variance */ { char *fh = "%-15s%7s%15s%15s%15s%10s\n"; char *fd5 = "%-15s%7d%15.4lf%15.4lf%15.4lf%10.4lf\n"; char *fd4 = "%-15s%7d%15.4lf%15.4lf\n"; char *fdx = "%-37s%15.4lf\n"; double fprob(); printf("\n *** ANALYSIS OF VARIANCE ***\n\n"); printf(fh,"Source","DegF","SumSq","MeanSumSq","F-Ratio","P(>F)"); printf(fh,"------","----","-----","---------","-------","-----"); printf(fd5,"Regular",k,ssreg,msreg,fcal,fprob(k,n-k-1,fcal)); printf(fd4,"Residuals",n-k-1,ssres,msres); printf(fh,"------","----","-----","---------","------","-----"); printf(fd4,"Total",n-1,sst,sst/(n-1)); cr(); printf(fdx,"Coefficient of determination (r^2)",rsq); printf(fdx,"Standard error of the estimate (Sy.x)",see); printf(fdx,"Durbin-Watson statistic",dw); } summary() /* Print summary report */ { double sb, t, tprob(); register int i,j; static char *fh="%-8s%15s%15s%15s%12s%10s\n"; static char *fd="%-8s%15.4lf%15.4lf%15.4lf%12.4lf%10.4lf\n"; static char *fd2="%-8s%15.4lf\n"; printf("\n *** REGRESSION EQUATION ***\n\n"); printf(fh,"Variable","Coeff","Beta","StdErr","t-Stat","P(>|t|)"); printf(fh,"--------","-----","----","------","------","-------"); printf(fd2,"Constant",b[0][0]); for (i=1; i<=k; ++i) { sb = sqrt(msres*r[i][i]/s[i]); t = b[0][i]/sb; printf(fd,vname[i],b[0][i],l[i],sb,t,tprob(t,n-k-1)); } printf(fh,"--------","-----","----","------","------","-------"); printf("\n%s = %f",vname[k+1],b[0][0]); for (j=1; j<=k; ++j) printf(" %+f*%s",b[0][j],vname[j]); cr(); } resid() { register int i,j; double yc,res, ycalc(); static char *fhd = "%4s %12s%12s%12s %s\n"; static char buf[] = "| | |"; static char *heading = "+---------------+---------------+"; static char isave; printf("\n\t\t\t*** RESIDUAL SUMMARY ***\n\n"); printf(fhd,"Obs","Y","Y-Calc","Residual",heading); for (i=1; i<=n; ++i) { yc = x1[i][0]; res = x1[i][k+1]-yc; printf("%3d. %12.2lf%12.2lf%12.2f", i,x1[i][k+1], yc, res); j = 1+15*(1.0+res/rhi); isave = buf[j]; buf[j] = '*'; printf(" %s\n",buf); buf[j] = isave; } printf(fhd,"","","","",heading); } double ycalc(xx) double xx[]; { register int j; double yc; for (yc=b[0][0], j=1; j<=k; ++j) yc += b[0][j]*xx[j]; return yc; } forecast() { register int i,j; int prthead = FALSE; /* Has heading been printed? */ double xx[MAXVARS+1], ycalc(); while (TRUE) { for (i=1; i<=k; ++i) if (EOF == fscanf(fn," %lf",&xx[i])) return; if (!prthead) { prthead = TRUE; printf("\n\t\t*** FORECASTING WITH MLR MODEL ***\n\n"); for (j=1; j<=k; ++j) printf("%10s ",vname[j]); printf("%12s\n","Forecast"); } for (j=1; j<=k; ++j) printf("%10.2lf ",xx[j]); printf("%12.3lf\n", ycalc(xx)); } } help() { static char *msg =" Multiple Linear Regression, (c) 1989-2007 Richard S. Barr.\n\ Program usage: mlr filename\n\ where 'filename' is a file containing the following data for analysis:\n\ 1. k, the number of independent (X) and dependent (Y) variables (max %d)\n\ 2. n, the number of observations of the k variables (max %d).\n\ 3. A list of k names for the variables, each up to 8 characters long,\n\ separated by blanks. >> The dependent variable is listed LAST.\n\ 4. n sets of observations for the k variables separated by blanks.\n\ The form is:\n\ X(1,1) X(1,2) ...X(1,k-1) Y(1)\n\ X(2,1) X(2,2) ...X(2,k-1) Y(2)\n\ ...\n\ X(n,1) X(n,2) ... X(n,k-1) Y(N)\n\ 5. Any additional observations to be forecast with the regression\n\ equation computed from the preceeding data.\n\ \n\ User options can be invoked using the form: mlr -options filename\n\ where 'options' is a string of one or more of the following characters\n\ e = Echo input data\n\ v = read data (in step 5 above) by Variables, instead of by \n\ observation, with X variable data first, then Y\n\ r = display Residuals summary report\n\ MLR is for Dr. Barr's students and friends. Please do not distribute to others.\n\ "; printf(msg,MAXVARS,MAXOBS); } double tprob(t,df) double t; int df; { double betai(); return betai(df/2.0, 0.5, df/(df+t*t) ); } double fprob(df1,df2,f) double f; int df1, df2; { double betai(); return betai(df2/2.0, df1/2.0, df2/(df2+df1*f) ); } double betai(a,b,x) double a,b,x; { double bt; double gammln(),betacf(); if (x < 0.0 || x > 1.0) return -1.0; if (x == 0.0 || x == 1.0) bt=0.0; else bt=exp(gammln(a+b)-gammln(a)-gammln(b)+a*log(x)+b*log(1.0-x)); if (x < (a+1.0)/(a+b+2.0)) return bt*betacf(a,b,x)/a; else return 1.0-bt*betacf(b,a,1.0-x)/b; } double gammln(xx) double xx; { double x,tmp,ser; static double cof[6]={76.18009173,-86.50532033,24.01409822, -1.231739516,0.120858003e-2,-0.536382e-5}; int j; x=xx-1.0; tmp=x+5.5; tmp -= (x+0.5)*log(tmp); ser=1.0; for (j=0;j<=5;j++) { x += 1.0; ser += cof[j]/x; } return -tmp+log(2.50662827465*ser); } #define ITMAX 100 #define EPS 3.0e-7 double betacf(a,b,x) double a,b,x; { double qap,qam,qab,em,tem,d; double bz,bm=1.0,bp,bpp; double az=1.0,am=1.0,ap,app,aold; int m; qab=a+b; qap=a+1.0; qam=a-1.0; bz=1.0-qab*x/qap; for (m=1;m<=ITMAX;m++) { em=(double) m; tem=em+em; d=em*(b-em)*x/((qam+tem)*(a+tem)); ap=az+d*am; bp=bz+d*bm; d = -(a+em)*(qab+em)*x/((qap+tem)*(a+tem)); app=ap+d*az; bpp=bp+d*bz; aold=az; am=ap/bpp; bm=bp/bpp; az=app/bpp; bz=1.0; if (fabs(az-aold) < (EPS*fabs(az))) return az; } return az; } #undef ITMAX #undef EPS