STATISTICA







STATISTICA BASIC Program BoxCox.stb

{ This program will compute the maximum-likelihood estimate of lambda, for the Box-Cox transformation of the dependent variable in a multiple regression.
[Results Screenshot] The maximum number of variables is 10; to increase program limits change the respective constant MAXV at the beginning of the program.

For additional information, see: Box, G. E. P. & Cox, D. R. (1964). An analysis of transformations. Journal of the Royal Statistical Society, B, 26, 211-234; see also Maddala, G. S. (1977). Econometrics. New York: McGraw-Hill. (page 315-317); or Mason, R., L., Gunst, R., F., & Hess. J., L. (1989). Statistical design and analysis of experiments with applications to engineering and science. New York: Wiley.

The program will minimize SSE(lambda), which is computed from the transformed dependent variable values as:
   y' = (y^l - 1)/(l*h^(l-1)    for l<>0
        ln(y)*h                for l=0
  where h is the geometric mean of the dependent variable.

Finally, residuals will be computed and plotted for the model with lambda=1 (no transformation), and for the final lambda, using 26.8 in Mason, Gunst, Hess:
   y' = y^l                   for l<>0
        ln(y)     for l=0
  
Program written, modified, or edited at StatSoft, Inc.
}

randomaccess;
NoDataFileVariableNames;

	delta:=1.e-10; { tolerance check of lambda against 0}
	toler:=.001;   { tolerance check for convergence }
	maxtable:=30;  { maximum number of rows in Scrollsheet}
	redim table(maxtable,6);
	maxc:=ncases;
	maxv:=10;	   { max is 10 variables; change if necessary}
	maxiter:=40;   { max no. of iterations is 40; change if necessary}

	Sub BCTransform(ygeometricmean,yhat,nc,lam)
{Box-Cox transformation}
	begin
	 dim yhat(nc);
	 for i:=1 to nc do begin
 	  if abs(lam)<delta then
	   yhat(i):=log(yhat(i))*ygeometricmean
	  else
	   yhat(i):=(yhat(i)^lam-1)/(lam*ygeometricmean^(lam-1));
	  end;
	end;

	Function GetBracketsForSearch(l)
	begin
	 dim l(2);
	 l(1):=-2;
	 l(2):=2;
	 iret:=0;
	 if DisplayNumericInputBox ('Range of lambda',
	  'Lower limit:|Upper limit:', l)=0 then begin
	   iret:=1;
 	   goto thatsit;
	 end;
	 if l(1)>-.05 then begin
 	  DisplayMessageBox (MB_ICONSTOP, 'Invalid lower lambda',
	   'Lower lambda value must < -.05.');
	  iret:=1;
	 end;
	 if l(2)<.05 then begin
 	  DisplayMessageBox (MB_ICONSTOP, 'Invalid upper lambda',
	   'Upper lambda value must > .05.');
	  iret:=1;
	 end;
	 if (abs(l(1))>10) or (abs(l(2))>10) then begin
 	  DisplayMessageBox (MB_ICONSTOP, 'Invalid lambda values',
	   'User-specified lambda values too large.');
	  iret:=1;
 	 end;
thatsit:;
	 GetBracketsForSearch:=iret;
	end;

	Function MissingDataorYminltzero(nc,var,n,dv)
	begin
	 dim var(n);
	 MissingDataorYminltzero:=0;
	 iflag:=0;
	 for i:=1 to n do begin
	  j:=ValCount (data(1,var(i)), 1, ncases);
	  if j<ncases then iflag:=1;
	 end;
	 j:=ValCount (data(1,dv), 1, ncases);
	 if j<ncases then iflag:=1;
	 if iflag=1 then begin
	  DisplayMessageBox(MB_ICONSTOP, 'Missing Data',
	   'Some cases have missing data; this program requires that all cases are'+
	   ' complete; use the missing data replacement facilities to remove missing'+
	   ' data values.');
	  MissingDataorYminltzero:=1;
	  goto thatsit;
	 end;
	 ValMin (data(1,dv), 1, ncases, ymin);
	 if ymin<=0 then begin
	    DisplayMessageBox(MB_ICONSTOP, 'Invalid Y Values',
	     'Some y<=0; all dependent variable values must be greater than 0.');
	   MissingDataorYminltzero:=1;
	  end;
thatsit:;
	end;

	Sub Regression(x,nc,maxv,var,n,yhat,xx,dv,lam,delta,ygeometricmean)
	begin
	 dim yhat(nc);
	 dim x(nc,maxv);
	 dim xx(maxv,maxv);
	 dim var(maxv);
{ set first column in X to 1 for intercept}
	 MatrixFill (1,x,1,1, nc, 1);
{ copy data to x}
	 for i:=1 to n do MatrixCopy (data, 1, var(i), nc, 1, x, 1, i+1);
{ copy y}
	 MatrixCopy (data, 1, dv, ncases, 1, yhat, 1, 1);
{Box-Cox transformation}
	 BCTransform(ygeometricmean,yhat,ncases,lam);
	 MatrixCopy (yhat, 1, 1, nc, 1, x, 1, n+1+1);
{compute X'X}
	 MatrixCrossProductOfDev (x, 0, xx);
{ sweep on xx}
	 MatrixSweep (xx, 1, n+1, 1);
	end;

	Sub GridSearch(maxiter,x,maxv,var,n,yhat,xx,l,byref lam,
            byref ygeometricmean, dv,delta,iprint,
	      xplot1,xplot2,yplot1,yplot2)
	begin
	 dim xplot1(20);
	 dim yplot1(20);
	 dim xplot2(2);
	 dim yplot2(2);
	 dim yhat(ncases);
	 dim x(ncases,maxv);
	 dim xx(maxv,maxv);
	 dim var(maxv);
	 dim l(2);
{manual/visual optimization of lambda}
{compute geometric mean}
        MatrixCopy (data, 1, dv, ncases, 1, yhat, 1, 1);
	  ygeometricmean:=0;
	  for i:=1 to ncases do ygeometricmean:=ygeometricmean+log(yhat(i));
	  ygeometricmean:=exp(ygeometricmean/ncases);
{create Scrollsheet to report progress of search }
	  line01$:='Searching for|smallest lambda';
	  colnam$:='Lambda| SSE ';

{start iterations}
	  bestss:=1e+30;
	  bestlam:=0;
	  for iter:=1 to 20 do begin
	   lam:=l(1)+(iter-1)*(l(2)-l(1))/19;
	   Regression(x,ncases,maxv,var,n,yhat,xx,dv,lam,delta,ygeometricmean);
	   sigma:=xx(n+2,n+2);
	   xplot1(iter):=lam;
	   yplot1(iter):=sigma;
	   table(iter,1):=lam;
	   table(iter,2):=sigma;
	   if sigma<bestss then begin
	    bestss:=sigma;
	    bestlam:=lam;
	   end;
	   if iter=1 then begin
	    shandle:=NewScrollsheet (iter, 2, table, line01$, ?rownam$, colnam$);
	   end else begin
	    DeleteScrollsheet (shandle);
	    shandle:=NewScrollsheet (iter, 2, table, line01$, ?rownam$, colnam$);
	   end;
	  end;
{print Scrollsheet if printing was requested}
	  if iprint=1 then PrintScrollsheet (shandle);
{graph of lambda versus SSE(lambda)}
	  Graph:=Newgraph (LINEPLOT, 'Lambda versus SSE(lambda)', 'SSE(lambda)',
	   'Lambda', 20, xplot1, yplot1);
	  GraphSetPlotPointStyle (Graph, 1, ON, ?Pattern, ?Size, ?Color);
	  GraphSetTitle (Graph, TITLE_TOP2, 'Dependent variable: '+VarName(dv));
	  line01$:='Indep.: ';
	  for i:=1 to min(n,6) do line01$:=line01$+VarName(var(i))+' ';
	  if n>6 then line01$:=line01$+'...';
	  GraphSetTitle (Graph, TITLE_TOP3, line01$);
	  GraphSetTitle (Graph, TITLE_TOP4,
	   'The intersection of the 95% C.I line with the SSE line');
	  GraphSetTitle (Graph, TITLE_TOP5,
	   'marks the 95% confidence limits for the best lambda');

{determine confidence line for smallest SSE;
 see Mason, Gunst, Hess, page 557}
	  df:=ncases-n-1;
	  t:=VStudent (.95, df);
	  xplot2(1):=l(1);
	  yplot2(1):=bestss*(1+t*t/df);
	  xplot2(2):=l(2);
	  yplot2(2):=yplot2(1);
	  GraphAddPlot (Graph, LINEPLOT, '', 2, xplot2, yplot2);
	  GraphSetScaleTextLabels (Graph, AX_RY, 1, yplot2(1), '95% C.I.');
	  {GraphSetScaleValuesStyle (Graph, AX_RY, SCALEVALUES_NUMERIC,
	   ?NumWidth, ?NumDec, ?DateType, ?SkippedLabels);}
{display the graph}
	  DisplayGraph (Graph);
	  if iprint=1 then PrintGraph (Graph);
	  lam:=bestlam;
	end;

	Sub Goldensearch(maxiter,x,maxv,var,n,yhat,xx,l,byref lam,
            byref ygeometricmean, dv,delta,iprint)
	begin
	 dim yhat(ncases);
	 dim x(ncases,maxv);
	 dim xx(maxv,maxv);
	 dim var(maxv);
	 dim l(2);

{ begin iterative search (golden search of bracketed function) }
	 ii:=0;
	 r:=.61803399;c:=1.-r;
	 x0:=l(1);x3:=l(2);x2:=0;
	 if abs(l(2))>abs(l(1)) then begin
	  x1:=0;x2:=c*l(2);
	 end else begin
	  x2:=0;x1:=c*l(1);
	 end;
	 lamold:=0;
{compute geometric mean of y}
	 MatrixCopy (data, 1, dv, ncases, 1, yhat, 1, 1);
	 ygeometricmean:=0;
	 for i:=1 to ncases do ygeometricmean:=ygeometricmean+log(yhat(i));
	 ygeometricmean:=exp(ygeometricmean/ncases);
	 line01$:='Searching for|smallest lambda';
	 colnam$:='Lambda| SSE ';
{start iterations}
	 for iter:=1 to maxiter do begin
	  if iter=1 then lam:=x1;
	  if iter=2 then lam:=x2;
	  Regression(x,ncases,maxv,var,n,yhat,xx,dv,lam,delta,ygeometricmean);
{ compute f(l)}
	  sigma:=xx(n+2,n+2);
	  fl:=sigma;
{ display scrollsheet of iterations}
	  irow:=iter;
	  if iter>maxtable then begin
	   MatrixCopy (table, 2, 1, maxtable-1, 2, table, 1, 1);
	   rownam$:=SDelete (rownam$, 1, 4);
	   irow:=maxtable;
	  end;
	  table(irow,1):=lam;table(irow,2):=sigma;
	  if iter=1 then begin
	   shandle:=NewScrollsheet (iter, 2, table, line01$, ?rowname$, colnam$);
	  end else begin
	   DeleteScrollsheet (shandle);
	   shandle:=NewScrollsheet (irow, 2, table, line01$, ?rowname$, colnam$);
	  end;
{ first and second iteration:}
	  if iter=1 then f1:=fl;
	  if iter=2 then f2:=fl;
{DEBUG}
{	 writeln('lam:',lam,'sigma:',sigma,'iter:',iter);}
{ENDDEBUG}
{find new point}
	   if iter>2 then begin
	    if ii=1 then f2:=fl;
	    if ii=2 then f1:=fl;
	    if iprint=1 then begin
	     writeln('Iteration:',iter,
	             'lambda=',lam,
 			 'SSE=',sigma);
	    end;

	    if abs(x3-x0)>toler*(abs(x1)+abs(x2)) then begin
{ compute new lambda}
	     if f2<f1 then begin
	      x0:=x1;x1:=x2;x2:=r*x1+c*x3;
	      f0:=f1;f1:=f2;
	      lam:=x2;
	      ii:=1;
	     end else begin
	      x3:=x2;x2:=x1;x1:=r*x2+c*x0;
	      f3:=f2;
	      f2:=f1;
	      lam:=x1;
	      ii:=2;
	     end;
          end else begin
	     lam:=x2;
	     if f1<f2 then lam:=x1;
{stop iterations}
	     if iprint=1 then begin
	      writeln(' ');
	      writeln('Iterations converged after ',iter,' iterations');
	      writeln('Final value of lambda=',lam);
 		writeln('Final value of SSE=',sigma);
	      writeln(' ');
	     end;
	     goto thatsit;
	    end;
	   end;
	  end;
thatsit:;
	end;


{======================================================================================}
{                                   main program                                       }
{======================================================================================}

{ allocate memory for arrays}
{dimension global array table }

	redim l(2);
	redim xplot1(20);
	redim yplot1(20);
	redim xplot2(2);
	redim yplot2(2);
	redim b(maxv);
	redim yhat(ncases);
	redim ytransf(ncases);
	redim error(ncases);
	redim error0(ncases);
	redim x(ncases,maxv);
	redim xx(maxv,maxv);
	redim var(maxv);

{intro; and find out whether or not to print results}
	iret:=DisplayMessageBox (MB_YESNOCANCEL,
	  'Box-Cox Transformation',
	  'This program will estimate the maximum likelihood lambda for the '+
        'Cox-Box transformation of the dependent variable.  Do you want to '+
        'print all results?');
	 if iret=IDCANCEL then stop;
	 iprint:=0;
	 if iret=IDYES then iprint:=1;

{ select variables}
	if SelectVariables2 ('Select Variables',
	 1, 1, dv, i,'Dependent variable:',
	 1, maxv-1-1, var, n, 'Independent variables:')=0 then stop;

{check for missing data }
	if MissingDataorYminltzero(ncases,var,n,dv) then stop;

{ get values ("brackets") for lambda}
	if GetBracketsForSearch(l) then stop;
{ Give user the choice of either graphical method of optimization,
  or use iterative search (Golden Search)   }

	imethod:=DisplayButtonBox ('Select a Search Method',
	 'Manual/visual search (graph of lambda vs. SSE)|Iterative optimization '+
       '(golden search)');
	if imethod=0 then stop;
	if imethod=2 then begin
{ iterative search via golden search method}
	  Goldensearch(maxiter,x,maxv,var,n,yhat,xx,l,lam,
           ygeometricmean,dv,delta,iprint);
	end else begin
	  GridSearch(maxiter,x,maxv,var,n,yhat,xx,l, lam,
            ygeometricmean, dv,delta,iprint,
	      xplot1,xplot2,yplot1,yplot2);
	end;

{allow user to round lambda (enter user-specified value) }
	  if (DisplayNumericInputBox ('Specify lambda value',
	   'lambda', lam)=0) then stop;

{ recompute SSE(lambda) for lambda=1 (i.e., no transformation;
  simple linear relationship); also store error for later plot}
	   MatrixFill (1,x,1,1, ncases, 1);
	   for i:=1 to n do MatrixCopy (data, 1, var(i), ncases, 1, x, 1, i+1);
	   MatrixCopy(data,1,dv,ncases,1,x,1,n+1+1);
{compute X'X}
	   MatrixCrossProductOfDev (x, 0, xx);
{ sweep on xx}
	   MatrixSweep (xx, 1, n+1, 1);
	   sigma1:=xx(n+2,n+2);
{ compute errors for lambda=1 (observed-predicted)}
	   for i:=1 to ncases do begin
	    yhat(i):=0;
	    for j:=1 to n+1 do yhat(i):=yhat(i)+x(i,j)*xx(n+2,j);
	   end;
	   MatrixSubtract (data(1,dv), yhat, error0);

{ recompute sse(lambda) for final lambda}
	   MatrixFill (1,x,1,1, ncases, 1);
	   for i:=1 to n do MatrixCopy (data, 1, var(i), ncases, 1, x, 1, i+1);
	   MatrixCopy (data, 1, dv, ncases, 1, yhat, 1, 1);
{Box-Cox transformation}
	   BCTransform(ygeometricmean,yhat,ncases,lam);
	   MatrixCopy (yhat, 1, 1, ncases, 1, x, 1, n+1+1);
{compute X'X}
	   MatrixCrossProductOfDev (x, 0, xx);
{ sweep on xx}
	   MatrixSweep (xx, 1, n+1, 1);
	   sigma:=xx(n+2,n+2);

{ make final scrollsheet; note that the Chi-square test statistic pertains to the
  hypothesis that lambda=1  }
	   rownam$:='Final statistics';
	   colnam$:='lambda| SSE(l) |ChiČ(1)|  p  ';
	   table(1,1):=lam;
	   table(1,2):=sigma;
{see Maddala, p. 316 }
	   table(1,3):=-2*log((sigma/sigma1)^(ncases/2));
	   table(1,4):=1-IChi2 (table(1,3), 1);
	   table(1,5):=0;
	   table(1,6):=0;
	   line01$:='Summary Statistics for Box-Cox Lambda|';
	   line02$:='Dependent variable: '+varname(dv);
	   line02$:=line02$+'|Indep.:';
	   for i:=1 to min(n,5) do
	    line02$:=line02$+' '+varname(var(i));
	   if n>5 then line02$:=line02$+'...';
	   shandle:=NewScrollsheet (1, 4, table,
	    line01$+line02$,
	    rownam$, colnam$);
	   if iprint=1 then PrintScrollsheet (shandle);

{produce final summary graphs of observed vs. residuals for lambda=1,
 and best (userdefined) lambda}

{ recompute sse(lambda) for final lambda, using 26.8. in Mason, Gunst, Hess}
	   MatrixFill (1,x,1,1, ncases, 1);
	   for i:=1 to n do MatrixCopy (data, 1, var(i), ncases, 1, x, 1, i+1);
	   MatrixCopy (data, 1, dv, ncases, 1, yhat, 1, 1);
	   for i:=1 to ncases do begin
{Power transformation (see Mason, Gunst, Hess, 26.8}
	    if abs(lam)<delta then
	     yhat(i):=log(yhat(i))
	    else
	     yhat(i):=yhat(i)^lam;
	   end;
	   MatrixCopy (yhat, 1, 1, ncases, 1, x, 1, n+1+1);
	   MatrixDuplicate (yhat, ytransf);
{compute X'X}
	   MatrixCrossProductOfDev (x, 0, xx);
{ sweep on xx}
	   MatrixSweep (xx, 1, n+1, 1);
	   sigma:=xx(n+2,n+2);
	   for i:=1 to ncases do begin
	    yhat(i):=0;
	    for j:=1 to n+1 do yhat(i):=yhat(i)+x(i,j)*xx(n+2,j);
	   end;
	   MatrixSubtract (ytransf, yhat, error);

{first make blank graph
{note; residuals for lambda=1 are in error0; for lambda=lam in error; yhat has
 transformed dependent variable values; make embedded multiple graph}
	graph:=NewGraph (IGNOREDPLOT, 'Observed vs. Residual Values|'+line02$,
	  ?Left_Title$, ?Bottom_Title$,
  	   0, yhat, yhat);
	graph1:=NewGraph (SCATTERPLOT, 'Lambda=1 (No Transformation)',
          'Residuals', 'Observed Values', ncases, data(1,dv), error0);
	graph2:=NewGraph (SCATTERPLOT, 'Lambda='+Str (lam, 6, 4),
	  'Residuals', 'Observed Values, @btransformed@b', ncases,
	   ytransf, error);
	GraphSetPlotFitting (Graph1, 1, fit_linear);
	GraphSetPlotFitting (Graph2, 1, fit_linear);
	GraphSetDefaultFont (graph1, ?FontName$, 14, ?Color);
	GraphSetDefaultFont (graph2, ?FontName$, 14, ?Color);
	GraphEmbedGraph (Graph, graph1, false, false, ?MappingMode,
	  0, 0, 50, 80, false);
	GraphEmbedGraph (Graph, graph2, false, false, ?MappingMode,
	  50, 0, 100, 80, false);
	DisplayGraph (Graph);
	if iprint=1 then PrintGraph (Graph);

{Normal probability plot of residuals}
{first make blank graph}
	graph:=NewGraph (IGNOREDPLOT,
	  'Normal Probability Plots of Residuals|'+line02$,
	  ?Left_Title$, ?Bottom_Title$,
	 0, yhat, yhat);
	VectorSort (error0, SORT_ASCENDING);
	MatrixDuplicate (error0, yhat);
	VectorRank (yhat, SORT_ASCENDING, RANK_MEAN);
	for i:=1 to ncases do begin
	 yhat(i):=(3*yhat(i)-1)/(3*ncases+1);
	 yhat(i):=VNormal (yhat(i), 0, 1);
	end;

      line01$:='Lambda=1 (No Transformation)';
	graph1:=NewGraph (SCATTERPLOT, line01$,
          'z-Value','Residual', ncases, error0, yhat);
	GraphSetPlotFitting (graph1, 1, FIT_LINEAR);
	VectorSort (error, SORT_ASCENDING);
	MatrixDuplicate (error, yhat);
	VectorRank (yhat, SORT_ASCENDING, RANK_MEAN);
	for i:=1 to ncases do begin
	 yhat(i):=(3*yhat(i)-1)/(3*ncases+1);
	 yhat(i):=VNormal (yhat(i), 0, 1);
	end;
	line01$:='Lambda='+Str (lam, 6, 4);
	graph2:=NewGraph (SCATTERPLOT, line01$,
          'z-Value','Residual', ncases, error, yhat);
	GraphSetPlotFitting (graph2, 1, FIT_LINEAR);
	GraphSetDefaultFont (graph1, ?FontName$, 14, ?Color);
	GraphSetDefaultFont (graph2, ?FontName$, 14, ?Color);
	GraphEmbedGraph (Graph, graph1, false, false, ?MappingMode,
	  0, 0, 50, 80, false);
	GraphEmbedGraph (Graph, graph2, false, false, ?MappingMode,
	  50, 0, 100, 80, false);
	DisplayGraph (Graph);
	if iprint=1 then PrintGraph (Graph);
Back to List of Programs



[StatSoft]
2300 East 14th Street, Tulsa, OK 74104
Phone: (918) 749-1119; Fax: (918) 749-2217

[StatSoft]e-mail: info@statsoft.com

©Copyright StatSoft, Inc., 1984-2004.
StatSoft, StatSoft logo, STATISTICA, SEWSS, SEDAS, Data Miner, SEPATH and GTrees are trademarks of StatSoft, Inc.