# Functions useful in Screening Data developed by: # Thomas D. Fletcher # Assistant Professor # Department of Psychology # University of Missouri-St. Louis # FletcherT@umsl.edu # # # The following functions are developed for use in R # They will likely work in S+ with some minor modifications ################################## # INDEX of functions # plot.normX - plots density of a variable with corresponding normal curve # plot.normX2 - same as plot.normX but useful for multiple variables # Kurt _ assesses kurtosis # Skew - assesses skewness # norm - assesses skewness & kurtosis # eda.uni - creates 4 graphs on 1 page for detection of univariate normality # mult.norm - multivariate skewness & kurtosis & mahalanobis D^2 # Make.Z - converts data to z scores # ### plot.normX and plot.normX2 # To simply call individual variables and plot single graphs you may want to use plot.normX plot.normX <- function (x) { plot ( density(x, na.rm=T), col=4, main = paste("Density of ", deparse(substitute(x))), sub = "With Corresponding Normal" ) lines ( density(qnorm (ppoints(length (x[!(is.na(x))])), mean(x, na.rm=T), sd(x, na.rm=T))), lty=2, col=2) } # To plot more than one variable (as in sending plots to postscript device) plot.normX2 <- function (x) { plot ( density(x[,i], na.rm=T), col=4, main = paste("Density of ", colnames(x)[i]), sub = "With Corresponding Normal" ) lines ( density(qnorm (ppoints(length(x[!(is.na(x))])), mean(x[,i],na.rm=T), sd(x[,i],na.rm=T))), lty=2, col=2) } # norm() relies on two other functions Kurt() and Skew() Kurt <- function(x) { n <- length (x[!(is.na(x))]) sd <- sqrt(var(x,na.rm=T)) m <- mean(x,na.rm=T) ku <-(((n*(n+1))/((n-1)*(n-2)*(n-3)))*(sum(((x-m)/sd)^4, na.rm=T)))-((3*(n-1)^2)/((n-2)*(n-3))) se.ku <- sqrt(24/n) t.ku <- ku/se.ku p.ku <- 1-pnorm(abs(t.ku)) #Note - evaluated against z instead of t return(ku,se.ku,t.ku,p.ku) } Skew <- function(x) { n <- length (x[!(is.na(x))]) sd <- sqrt(var(x,na.rm=T)) m <- mean(x,na.rm=T) sk <- (n/((n-1)*(n-2)))*sum(((x-m)/sd)^3, na.rm=T) se.sk <- sqrt(6/n) t.sk <- sk/se.sk p.sk <- 1-pnorm(abs(t.sk)) #Note - evaluated against z instead of t return(sk,se.sk,t.sk,p.sk) } norm <- function(x) { mat <- matrix(,nrow=2,ncol=4) mat <- data.frame(mat) names(mat)<- c("Statistic","SE","t-val","p") row.names(mat) <- c("Skewness","Kurtosis") sk <- Skew(x) ku <- Kurt(x) mat[1,] <- as.numeric(sk) mat[2,] <- as.numeric(ku) return(mat) } ### eda.uni() eda.uni <- function(x,title="") { par (mfrow = c(2,2)) hist(x,main=title) plot(density(x, na.rm=T),main="Smoothed Histogram") qqnorm(x); qqline(x) boxplot(x,horizontal=TRUE,main="BoxPlot") } ### mult.norm() ### TD Fletcher created R syntax for these formula based on ### USING Applied Multivariate Statistics (Khatree & Naik, 1999) ### From pg. 13 ### Original formula from Mardia (1970; 1974) & Mahalanobis (1936) ### Returns: ### Multivariate Skewness ### Multivariate Kurtosis ### Mahalanobis distance !! If NAs present, does not identify which mult.norm <- function (x,s=var(x)) { x <- as.matrix(x) x <- na.omit(x) n <- nrow(x) p <- ncol(x) dfchi <- p*(p+1)*(p+2)/6 # create 1 matrix one <- matrix(1,n) # create idendity matrix n*n id <- matrix(0,n,n) diag(id) <- 1 # create Q matrix ( I - 1/n * 1[n] * 1[n]' ) Q <- id - 1/n * one %*% t(one) # create g matrix g <- Q %*% x %*% solve(s) %*% t(x) %*% Q # mahalanobis distance = mahalanobis <- diag(g) # Extreme D^2 = ext <- qchisq(.995,p) # b1 & b2 used for skew & kurt measure b1p <- 1/(n^2) * sum(g^3) b2p <- sum(diag(g)^2)/n # Kappa measures k1 <- n * b1p / 6 k2 <- (b2p - p*(p+2))/(8*p*(p+2)/n)^.5 # p-values pskew <- 1 - pchisq(k1,dfchi) pkurt <- 2 * ( 1 - pnorm(abs(k2))) # create matrix to return mat <- matrix(,2,3) colnames(mat) <- c("Beta-hat", "kappa", "p-val") rownames(mat) <- c("Skewness","Kurtosis") mat[1,] <- c(as.numeric(b1p),as.numeric(k1),as.numeric(pskew)) mat[2,] <- c(as.numeric(b2p),as.numeric(k2),as.numeric(pkurt)) return(mat,mahalanobis, c("Extreme D^2 >= ", as.numeric(ext))) } ## Make.Z () Make.Z <- function(x) { x <- as.matrix(x) x <- sweep(x, 2, colMeans(x, na.rm=T)) zx <- sweep(x,2,sd(x, na.rm=T),"/") return(zx) } ############################################### ################ EXAMPLES #################### ############################################ # plot.normX data(USJudgeRatings) # data packaged with R plot.normX(USJudgeRatings$CONT) # creates a postcript file (to be converted to .pdf) # contains plots for all 12 variables in USJudgeRatings # plot.normX2 data(USJudgeRatings) postscript("C:/temp/Judge.ps") for (i in 1:12) { plot.normX2(USJudgeRatings) } dev.off() # Kurt # Skew # norm # create skewed data xc <- -rchisq(100,3) # negatively skewed with 100 observations norm(xc) # eda.uni eda.uni(xc) # mult.norm # assess the multivariate normality of variables 4,5,6 in USJudgeRatings data(USJudgeRatings) mult.norm(USJudgeRatings[,4:6]) # Make.Z zUSJR <- Make.Z(USJudgeRatings) # creates new object containg z scores dim(zUSJR) # shows that there are 43 observed z scores for 12 variables zUSJR[,1] # to look at only the first column of z scores