STATISTICA







STATISTICA BASIC Program Outliers.stb

{This program lets the user pick a number of variables and define outliers in terms of the number of standard deviations from the mean. The program then searches the selected variables' values for outliers and reports the case numbers that contain them. The user can direct output to either scrollsheets or the Text/Output Window. If output is directed to scrollsheets, the data file is displayed as a scrollsheet in which the outlier values of the selected variables are highlighted in red, and a column called OUTLIER? is appended to the scrollsheet. This column codes cases with outliers (1=TRUE).

When the scrollsheet is saved as a data file, cases with outliers can be excluded from an analysis easily by case selection conditions on OUTLIER? (exclude if OUTLIER?=1).

Program written, modified, or edited at StatSoft, Inc.}


{----------------------------------------------------------------------
 Setup: gathers the user's choices and, for scrollsheet output, shows the
 data file as a scrollsheet with an extra OUTLIER? column.
 Values set here and read by the rest of the program:
   varlist, count - the selected variables and how many were selected
   coeff          - outlier threshold, in standard deviations
   output_choice  - 1 = Scrollsheets, 2 = Text/Output Window
   datahandle     - handle of the data scrollsheet (output_choice = 1 only)
----------------------------------------------------------------------}
randomaccess;

{user selects the variables to check for outliers; the program stops if the dialog returns 0}
redim varlist(NVars);
if (SelectVariables1 ("Outlier Analysis:  Please pick the variables", 1, NVars, VarList, count, ?Text1$))=0 then abort;

{user defines the outliers; 2 standard deviations is offered as the default}
coeff := 2;
if (DisplayNumericInputBox
('Please define outliers', 'Outliers are how many standard deviations from the mean?', coeff))=0 then abort;

{A negative entry would put the lower bound (mean - coeff*std) above the upper bound
(mean + coeff*std), so every case would be reported as an outlier and the scrollsheet
title would show a negative number of standard deviations. Only the magnitude of the
entry is meaningful, so it is used instead.}
if (coeff < 0) then coeff := 0 - coeff;

{lets the user specify where the output is headed; the program stops if the dialog returns 0}
output_choice := DisplayButtonBox ('Direct output to where?', 'Scrollsheets|Text/Output Window');
if (output_choice = 0) then abort;

{data file is displayed as a scrollsheet and a new column is added to code for cases with outliers}
if (output_choice = 1) then
	begin
	datatitle$ := 'Marked cases have a value > ' + str(coeff,4,2) + ' std devs from the mean';
	datahandle := MatrixDisplay (Data, datatitle$);
	{label every column of the scrollsheet with the name of the corresponding variable}
	for n:=1 to NVars do
		ScrollsheetSetColumnName (datahandle, n, VarName(n), ?Name2$);
	{the extra column NVars+1 is the OUTLIER? flag column}
	ScrollsheetAppendColumns (datahandle, 1);
	ScrollsheetSetColumnName (datahandle, Nvars+1, "OUTLIER?", ?title2$);
	end;

{Big loop so that the outlier analysis is performed for every selected variable.
Reads varlist, count, coeff and output_choice and, for scrollsheet output, datahandle.
For each variable it computes the mean and standard deviation, derives the range
mean -/+ coeff*std, collects the case numbers whose value falls outside that range,
and reports them either in scrollsheets (output_choice = 1) or in the
Text/Output Window (output_choice = 2).}
for numvars := 1 to count do
begin

	{read the values of the current variable into an array}
	redim varvalues (NCases);
	for counter :=1 to NCases do
		begin
		varvalues(counter) := Data(counter,varlist(numvars));
		end;

	{compute mean and standard deviation}
	ValMean (varvalues, 1, NCases,mean);
	ValStDeviation (varvalues, 1, NCases, std);

	{compute the valid range for the chosen variable given the user's selection of
	the number of standard deviations from the mean}
	low := (mean - (coeff*std));
	high := (mean + (coeff*std));

	{checking for outliers, first pass: only count them, so that the outliers array
	can be dimensioned once instead of being redimensioned for every outlier found.
	NOTE(review): values are compared as stored; if the data file contains missing-data
	codes they may be flagged as outliers - confirm how Data reports missing values.}
	number_outliers := 0;
	for counter2 := 1 to NCases do
		begin
		if ((Data (counter2, varlist(numvars)) < low) or (Data (counter2, varlist(numvars)) > high)) then
			number_outliers := number_outliers + 1;
		end;

	{second pass: store the case numbers of the outliers in outliers(1..number_outliers).
	When there are no outliers the array is left as it was.}
	if (number_outliers > 0) then
		begin
		redim outliers(number_outliers);
		found := 0;
		for counter2 := 1 to NCases do
			begin
			if ((Data (counter2, varlist(numvars)) < low) or (Data (counter2, varlist(numvars)) > high)) then
				begin
				found := found + 1;
				outliers(found) := counter2;
				end;
			end;
		end;

	{Scrollsheet output}
	if (output_choice = 1) then
	begin

		{lets the user know if there are no outliers for the specified variable}
		if (number_outliers = 0) then
			DisplayMessageBox (MB_IconExclamation, 'Outlier Analysis', 'No outliers were found for '+
				Varname(varlist(numvars)));

		{takes care of the initial variable: creates the descriptives scrollsheet and
		the scrollsheet listing the cases with outliers}
		if (numvars = 1) then
		begin
			{array holding the descriptive info about the variable, one value per row}
			redim var_info(4);
			var_info(1):=mean;
			var_info(2):=std;
			var_info(3):=high;
			var_info(4):=low;

			desc_title$ := 'Descriptives & Values within ' + str(coeff,4,2) + ' Standard Dev(s) of the Mean';
			desc_handle := NewScrollsheet (4, 1, var_info, desc_title$,
				?rownames$, Varname(varlist(numvars)));
			ScrollsheetSetRowNameWidth (desc_handle, 20);

			{row labels; the threshold is inserted in front of the * of each template}
			high_temp$ := 'Mean +*s.d.';
			high$ := sinsert(high_temp$,str(coeff,4,2),7);
			low_temp$ := 'Mean -*s.d.';
			low$ := sinsert(low_temp$, str(coeff,4,2),7);
			ScrollsheetSetRowName (desc_handle, 1, 'Mean');
			ScrollsheetSetRowName (desc_handle, 2, 'Standard Deviation');
			ScrollsheetSetRowName (desc_handle, 3, high$);
			ScrollsheetSetRowName (desc_handle, 4, low$);

			{NOTE(review): if the first variable has no outliers, number_outliers is 0 and
			outliers has never been dimensioned when NewScrollsheet is called here -
			confirm that NewScrollsheet accepts a request for zero rows.}
			outlier_title$ := "Cases with Outliers";
			outlier_handle := NewScrollsheet(number_outliers,1, outliers, outlier_title$,?RowNames$,
				Varname(varlist(numvars)));
			ScrollsheetSetColumnName (outlier_handle, numvars,Varname(varlist(numvars)),'Cases');
			ScrollsheetSetColumnFormat (outlier_handle, numvars, SCF_integer, 4);
		end;

		{takes care of subsequent variables: one more column in each of the two scrollsheets}
		if (not(numvars = 1)) then
		begin
			ScrollsheetAppendColumns (outlier_handle, 1);
			ScrollsheetSetColumnName (outlier_handle, numvars,Varname(varlist(numvars)),'Cases');
			ScrollsheetSetColumnFormat (outlier_handle, numvars, SCF_integer, 4);

			{grow the outlier scrollsheet when this variable has more outliers than it has rows}
			current_size := ScrollsheetGetNbRows (outlier_handle);
			if (number_outliers > current_size) then
				ScrollsheetAppendRows (outlier_handle, number_outliers - current_size);
			for c:=1 to number_outliers do
				begin
				ScrollsheetSetValue (outlier_handle, c, numvars, outliers(c));
				end;

			ScrollsheetAppendColumns (desc_handle, 1);
			ScrollsheetSetColumnName (desc_handle, numvars,?Name$ ,Varname(varlist(numvars)) );
			ScrollsheetSetValue (desc_handle, 1, numvars, mean);
			ScrollsheetSetValue (desc_handle, 2, numvars, std);
			ScrollsheetSetValue (desc_handle, 3, numvars, high);
			ScrollsheetSetValue (desc_handle, 4, numvars, low);
		end;

		{for every variable: highlight the outlier cells in the data scrollsheet and
		set the flag column NVars+1 to 1 for the affected cases}
		for d:=1 to number_outliers do
			begin
			ScrollsheetSetHilite (datahandle, outliers(d), varlist(numvars), 1);
			ScrollsheetSetValue (datahandle, outliers(d), nvars+1, 1);
			end;
	end;

	{Text/Output Window output}
	if (output_choice = 2) then
	begin
		Writeln("");
		Writeln ("OUTLIER ANALYSIS OUTPUT");
		Writeln("");
		Writeln("Variable: ",Varname(varlist(numvars)));
		{separating blanks keep the numbers from running into the text that follows them}
		Writeln ("Mean: ", mean, "  Standard deviation: ", std);
		Writeln ("Values within +/- ", coeff, " standard deviations are from ", low, " to ", high);
		Writeln("");

		{one line per outlier; number_outliers is already known from the search above,
		so this scan only prints}
		for counter2 := 1 to NCases do
			begin
			if ((Data (counter2, varlist(numvars)) < low) or (Data (counter2, varlist(numvars)) > high)) then
				writeln ("Case number: ", counter2, " (Case Name: ", CaseName(counter2),")", " is an outlier for ",
					varname(varlist(numvars)));
			end;
		if (number_outliers = 0) then
			writeln ("No outliers were found for ", varname(varlist(numvars)));
	end;
end;

{message for scrollsheet output; the text is assembled with + so that no string
literal is split across source lines}
if (output_choice = 1) then
DisplayMessageBox (MB_IconExclamation, 'Please note', 'If you selected more than 1 variable, you will need to resize the ' +
	'scrollsheets to view the output.');
Back to List of Programs



[StatSoft]
2300 East 14th Street, Tulsa, OK 74104
Phone: (918) 749-1119; Fax: (918) 749-2217

[StatSoft] e-mail: info@statsoft.com

©Copyright StatSoft, Inc., 1984-2004.
StatSoft, StatSoft logo, STATISTICA, SEWSS, SEDAS, Data Miner, SEPATH and GTrees are trademarks of StatSoft, Inc.