DATA MINING AND BUSINESS ANALYTICS WITH R
COPYRIGHT JOHANNES LEDOLTER, UNIVERSITY OF IOWA, WILEY 2013

CHAPTER 2: PROCESSING THE INFORMATION AND GETTING TO KNOW YOUR DATA

Example 1: 2006 Birth Data

## Install packages from CRAN; use any USA mirror
library(lattice)
library(nutshell)
data(births2006.smpl)
births2006.smpl[1:5,]
dim(births2006.smpl)
births.dow=table(births2006.smpl$DOB_WK)
births.dow
barchart(births.dow,ylab="Day of Week",col="black")
## for color, use col="red" or omit the color argument
dob.dm.tbl=table(WK=births2006.smpl$DOB_WK,MM=births2006.smpl$DMETH_REC)
dob.dm.tbl
dob.dm.tbl=dob.dm.tbl[,-2]
dob.dm.tbl
trellis.device()
barchart(dob.dm.tbl,ylab="Day of Week")
barchart(dob.dm.tbl,horizontal=FALSE,groups=FALSE,xlab="Day of Week",col="black")
## for color, omit the color argument
histogram(~DBWT|DPLURAL,data=births2006.smpl,layout=c(1,5),col="black")
histogram(~DBWT|DMETH_REC,data=births2006.smpl,layout=c(1,3),col="black")
densityplot(~DBWT|DPLURAL,data=births2006.smpl,layout=c(1,5),plot.points=FALSE,col="black")
densityplot(~DBWT,groups=DPLURAL,data=births2006.smpl,plot.points=FALSE)
dotplot(~DBWT|DPLURAL,data=births2006.smpl,layout=c(1,5),plot.points=FALSE,col="black")
xyplot(DBWT~DOB_WK,data=births2006.smpl,col="black")
xyplot(DBWT~DOB_WK|DPLURAL,data=births2006.smpl,layout=c(1,5),col="black")
xyplot(DBWT~WTGAIN,data=births2006.smpl,col="black")
xyplot(DBWT~WTGAIN|DPLURAL,data=births2006.smpl,layout=c(1,5),col="black")
smoothScatter(births2006.smpl$WTGAIN,births2006.smpl$DBWT)
## boxplot is the command for a box plot in the standard graphics package
boxplot(DBWT~APGAR5,data=births2006.smpl,ylab="DBWT",xlab="APGAR5")
boxplot(DBWT~DOB_WK,data=births2006.smpl,ylab="DBWT",xlab="Day of Week")
## bwplot is the command for a box plot in the lattice graphics package.
## There you need to declare the conditioning variables as factors
bwplot(DBWT~factor(APGAR5)|factor(SEX),data=births2006.smpl,xlab="APGAR5")
bwplot(DBWT~factor(DOB_WK),data=births2006.smpl,xlab="Day of Week")
fac=factor(births2006.smpl$DPLURAL)
res=births2006.smpl$DBWT
t4=tapply(res,fac,mean,na.rm=TRUE)
t4
t5=tapply(births2006.smpl$DBWT,INDEX=list(births2006.smpl$DPLURAL,births2006.smpl$SEX),FUN=mean,na.rm=TRUE)
t5
barplot(t4,ylab="DBWT")
barplot(t5,beside=TRUE,ylab="DBWT")
t5=table(births2006.smpl$ESTGEST)
t5
new=births2006.smpl[births2006.smpl$ESTGEST != 99,]
t51=table(new$ESTGEST)
t51
t6=tapply(new$DBWT,INDEX=list(cut(new$WTGAIN,breaks=10),cut(new$ESTGEST,breaks=10)),FUN=mean,na.rm=TRUE)
t6
levelplot(t6,scales = list(x = list(rot = 90)))
contourplot(t6,scales = list(x = list(rot = 90)))

Example 2: Alumni Donations

## Install packages from CRAN; use any USA mirror
library(lattice)
don <- read.csv("C:/DataMining/Data/contribution.csv")
don[1:5,]
table(don$Class.Year)
barchart(table(don$Class.Year),horizontal=FALSE,xlab="Class Year",col="black")
don$TGiving=don$FY00Giving+don$FY01Giving+don$FY02Giving+don$FY03Giving+don$FY04Giving
mean(don$TGiving)
sd(don$TGiving)
quantile(don$TGiving,probs=seq(0,1,0.05))
quantile(don$TGiving,probs=seq(0.95,1,0.01))
hist(don$TGiving)
hist(don$TGiving[don$TGiving!=0][don$TGiving[don$TGiving!=0]<=1000])
## or, if you want to build the above histogram more slowly in two steps
## ff1=don$TGiving[don$TGiving!=0]
## ff1
## ff2=ff1[ff1<=1000]
## ff2
## hist(ff2,main=paste("Histogram of TGivingTrunc"),xlab="TGivingTrunc")
boxplot(don$TGiving,horizontal=TRUE,xlab="Total Contribution")
boxplot(don$TGiving,outline=FALSE,horizontal=TRUE,xlab="Total Contribution")
ddd=don[don$TGiving>=30000,]
ddd
ddd1=ddd[,c(1:5,12)]
ddd1
ddd1[order(ddd1$TGiving,decreasing=TRUE),]
boxplot(TGiving~Class.Year,data=don,outline=FALSE)
boxplot(TGiving~Gender,data=don,outline=FALSE)
boxplot(TGiving~Marital.Status,data=don,outline=FALSE)
boxplot(TGiving~AttendenceEvent,data=don,outline=FALSE)
t4=tapply(don$TGiving,don$Major,mean,na.rm=TRUE)
t4
t5=table(don$Major)
t5
t6=cbind(t4,t5)
t7=t6[t6[,2]>10,]
t7[order(t7[,1],decreasing=TRUE),]
barchart(t7[,1],col="black")
t4=tapply(don$TGiving,don$Next.Degree,mean,na.rm=TRUE)
t4
t5=table(don$Next.Degree)
t5
t6=cbind(t4,t5)
t7=t6[t6[,2]>10,]
t7[order(t7[,1],decreasing=TRUE),]
barchart(t7[,1],col="black")
densityplot(~TGiving|factor(Class.Year),
  data=don[don$TGiving<=1000,][don[don$TGiving<=1000,]$TGiving>0,],
  plot.points=FALSE,col="black")
t11=tapply(don$TGiving,don$Class.Year,FUN=sum,na.rm=TRUE)
t11
barplot(t11,ylab="Total Donation")
barchart(tapply(don$FY04Giving,don$Class.Year,FUN=sum,na.rm=TRUE),horizontal=FALSE,ylim=c(0,225000),col="black")
barchart(tapply(don$FY03Giving,don$Class.Year,FUN=sum,na.rm=TRUE),horizontal=FALSE,ylim=c(0,225000),col="black")
barchart(tapply(don$FY02Giving,don$Class.Year,FUN=sum,na.rm=TRUE),horizontal=FALSE,ylim=c(0,225000),col="black")
barchart(tapply(don$FY01Giving,don$Class.Year,FUN=sum,na.rm=TRUE),horizontal=FALSE,ylim=c(0,225000),col="black")
barchart(tapply(don$FY00Giving,don$Class.Year,FUN=sum,na.rm=TRUE),horizontal=FALSE,ylim=c(0,225000),col="black")
don$TGivingIND=cut(don$TGiving,c(-1,0.5,10000000),labels=FALSE)-1
mean(don$TGivingIND)
t5=table(don$TGivingIND,don$Class.Year)
t5
barplot(t5,beside=TRUE)
mosaicplot(factor(don$Class.Year)~factor(don$TGivingIND))
t50=tapply(don$TGivingIND,don$Class.Year,FUN=mean,na.rm=TRUE)
t50
barchart(t50,horizontal=FALSE,col="black")
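The cut() call above turns total giving into a 0/1 indicator of whether an alumnus gave at all; as a quick cross-check (a small sketch added here, not part of the book's listing), the same indicator can be built with a logical comparison:

## equivalent 0/1 indicator for "gave at all"; assumes don$TGiving and don$TGivingIND from above
TGivingIND.check=as.numeric(don$TGiving>0)
table(TGivingIND.check,don$TGivingIND)   ## all counts should fall on the diagonal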
don$FY04GivingIND=cut(don$FY04Giving,c(-1,0.5,10000000),labels=FALSE)-1 t51=tapply(don$FY04GivingIND,don$Class.Year,FUN=mean,na.rm=TRUE) t51 barchart(t51,horizontal=FALSE,col="black") Data=data.frame(don$FY04Giving,don$FY03Giving,don$FY02Giving,don$FY01Giving,don$FY00Giving) correlation=cor(Data) correlation plot(Data) library(ellipse) plotcorr(correlation) mosaicplot(factor(don$Gender)~factor(don$TGivingIND)) mosaicplot(factor(don$Marital.Status)~factor(don$TGivingIND)) t2=table(factor(don$Marital.Status),factor(don$TGivingIND)) mosaicplot(t2) mosaicplot(factor(don$AttendenceEvent)~factor(don$TGivingIND)) t2=table(factor(don$Marital.Status),factor(don$TGivingIND),factor(don$AttendenceEvent)) t2 mosaicplot(t2[,,1]) mosaicplot(t2[,,2]) Example 3: Orange Juice ## Install packages from CRAN; use any USA mirror library(lattice) oj <- read.csv("C:/DataMining/Data/oj.csv") oj$store <- factor(oj$store) oj[1:2,] t1=tapply(oj$logmove,oj$brand,FUN=mean,na.rm=TRUE) t1 t2=tapply(oj$logmove,INDEX=list(oj$brand,oj$week),FUN=mean,na.rm=TRUE) t2 plot(t2[1,],type= "l",xlab="week",ylab="dominicks",ylim=c(7,12)) plot(t2[2,],type= "l",xlab="week",ylab="minute.maid",ylim=c(7,12)) plot(t2[3,],type= "l",xlab="week",ylab="tropicana",ylim=c(7,12)) logmove=c(t2[1,],t2[2,],t2[3,]) week1=c(40:160) week=c(week1,week1,week1) brand1=rep(1,121) brand2=rep(2,121) brand3=rep(3,121) brand=c(brand1,brand2,brand3) xyplot(logmove~week|factor(brand),type= "l",layout=c(1,3),col="black") boxplot(logmove~brand,data=oj) histogram(~logmove|brand,data=oj,layout=c(1,3)) densityplot(~logmove|brand,data=oj,layout=c(1,3),plot.points=FALSE) densityplot(~logmove,groups=brand,data=oj,plot.points=FALSE) xyplot(logmove~week,data=oj,col="black") xyplot(logmove~week|brand,data=oj,layout=c(1,3),col="black") xyplot(logmove~price,data=oj,col="black") xyplot(logmove~price|brand,data=oj,layout=c(1,3),col="black") smoothScatter(oj$price,oj$logmove) densityplot(~logmove,groups=feat, data=oj, plot.points=FALSE) xyplot(logmove~price,groups=feat, data=oj) oj1=oj[oj$store == 5,] xyplot(logmove~week|brand,data=oj1,type="l",layout=c(1,3),col="black") xyplot(logmove~price,data=oj1,col="black") xyplot(logmove~price|brand,data=oj1,layout=c(1,3),col="black") densityplot(~logmove|brand,groups=feat,data=oj1,plot.points=FALSE) xyplot(logmove~price|brand,groups=feat,data=oj1) t21=tapply(oj$INCOME,oj$store,FUN=mean,na.rm=TRUE) t21 t21[t21==max(t21)] t21[t21==min(t21)] oj1=oj[oj$store == 62,] oj2=oj[oj$store == 75,] oj3=rbind(oj1,oj2) xyplot(logmove~price|store,data=oj3) xyplot(logmove~price|store,groups=feat,data=oj3) ## store in the wealthiest neighborhood mhigh=lm(logmove~price,data=oj1) summary(mhigh) plot(logmove~price,data=oj1,xlim=c(0,4),ylim=c(0,13)) abline(mhigh) ## store in the poorest neighborhood mlow=lm(logmove~price,data=oj2) summary(mlow) plot(logmove~price,data=oj2,xlim=c(0,4),ylim=c(0,13)) abline(mlow) CHAPTER 3: STANDARD LINEAR REGRESSION Example 1: Fuel Efficiency of Automobiles ## first we read in the data FuelEff <- read.csv("C:/DataMining/Data/FuelEfficiency.csv") FuelEff plot(GPM~MPG,data=FuelEff) plot(GPM~WT,data=FuelEff) plot(GPM~DIS,data=FuelEff) plot(GPM~NC,data=FuelEff) plot(GPM~HP,data=FuelEff) plot(GPM~ACC,data=FuelEff) plot(GPM~ET,data=FuelEff) FuelEff=FuelEff[-1] FuelEff ## regression on all data m1=lm(GPM~.,data=FuelEff) summary(m1) cor(FuelEff) ## best subset regression in R library(leaps) X=FuelEff[,2:7] y=FuelEff[,1] out=summary(regsubsets(X,y,nbest=2,nvmax=ncol(X))) tab=cbind(out$which,out$rsq,out$adjr2,out$cp) tab 
m2=lm(GPM~WT,data=FuelEff) summary(m2) ## cross-validation (leave one out) for the model on all six regressors n=length(FuelEff$GPM) diff=dim(n) percdiff=dim(n) for (k in 1:n) { train1=c(1:n) train=train1[train1!=k] ## the R expression "train1[train1!=k]" picks from train1 those ## elements that are different from k and stores those elements in the ## object train. ## For k=1, train consists of elements that are different from 1; that ## is 2, 3, …, n. m1=lm(GPM~.,data=FuelEff[train,]) pred=predict(m1,newdat=FuelEff[-train,]) obs=FuelEff$GPM[-train] diff[k]=obs-pred percdiff[k]=abs(diff[k])/obs } me=mean(diff) rmse=sqrt(mean(diff**2)) mape=100*(mean(percdiff)) me # mean error rmse # root mean square error mape # mean absolute percent error ## cross-validation (leave one out) for the model on weight only n=length(FuelEff$GPM) diff=dim(n) percdiff=dim(n) for (k in 1:n) { train1=c(1:n) train=train1[train1!=k] m2=lm(GPM~WT,data=FuelEff[train,]) pred=predict(m2,newdat=FuelEff[-train,]) obs=FuelEff$GPM[-train] diff[k]=obs-pred percdiff[k]=abs(diff[k])/obs } me=mean(diff) rmse=sqrt(mean(diff**2)) mape=100*(mean(percdiff)) me # mean error rmse # root mean square error mape # mean absolute percent error Example 2: Toyota Used Car Prices toyota <- read.csv("C:/DataMining/Data/ToyotaCorolla.csv") toyota[1:3,] summary(toyota) hist(toyota$Price) ## next we create indicator variables for the categorical variable ## FuelType with its three nominal outcomes: CNG, Diesel, and Petrol v1=rep(1,length(toyota$FuelType)) v2=rep(0,length(toyota$FuelType)) toyota$FuelType1=ifelse(toyota$FuelType=="CNG",v1,v2) toyota$FuelType2=ifelse(toyota$FuelType=="Diesel",v1,v2) auto=toyota[-4] auto[1:3,] plot(Price~Age,data=auto) plot(Price~KM,data=auto) plot(Price~HP,data=auto) plot(Price~MetColor,data=auto) plot(Price~Automatic,data=auto) plot(Price~CC,data=auto) plot(Price~Doors,data=auto) plot(Price~Weight,data=auto) ## regression on all data m1=lm(Price~.,data=auto) summary(m1) set.seed(1) ## fixing the seed value for the random selection guarantees the ## same results in repeated runs n=length(auto$Price) n1=1000 n2=n-n1 train=sample(1:n,n1) ## regression on training set m1=lm(Price~.,data=auto[train,]) summary(m1) pred=predict(m1,newdat=auto[-train,]) obs=auto$Price[-train] diff=obs-pred percdiff=abs(diff)/obs me=mean(diff) rmse=sqrt(sum(diff**2)/n2) mape=100*(mean(percdiff)) me # mean error rmse # root mean square error mape # mean absolute percent error ## cross-validation (leave one out) n=length(auto$Price) diff=dim(n) percdiff=dim(n) for (k in 1:n) { train1=c(1:n) train=train1[train1!=k] m1=lm(Price~.,data=auto[train,]) pred=predict(m1,newdat=auto[-train,]) obs=auto$Price[-train] diff[k]=obs-pred percdiff[k]=abs(diff[k])/obs } me=mean(diff) rmse=sqrt(mean(diff**2)) mape=100*(mean(percdiff)) me # mean error rmse # root mean square error mape # mean absolute percent error ## cross-validation (leave one out): Model with just Age n=length(auto$Price) diff=dim(n) percdiff=dim(n) for (k in 1:n) { train1=c(1:n) train=train1[train1!=k] m1=lm(Price~Age,data=auto[train,]) pred=predict(m1,newdat=auto[-train,]) obs=auto$Price[-train] diff[k]=obs-pred percdiff[k]=abs(diff[k])/obs } me=mean(diff) rmse=sqrt(mean(diff**2)) mape=100*(mean(percdiff)) me # mean error rmse # root mean square error mape # mean absolute percent error ## Adding the squares of Age and KM to the model auto$Age2=auto$Age^2 auto$KM2=auto$KM^2 m11=lm(Price~Age+KM,data=auto) summary(m11) m12=lm(Price~Age+Age2+KM+KM2,data=auto) summary(m12) 
m13=lm(Price~Age+Age2+KM,data=auto) summary(m13) plot(m11$res~m11$fitted) hist(m11$res) plot(m12$res~m12$fitted) CHAPTER 4: LOCAL POLYNOMIAL REGRESSION: A NONPARAMETRIC REGRESSION APPROACH Example 1: Old Faithful library(locfit) ## first we read in the data OldFaithful <- read.csv("C:/DataMining/Data/OldFaithful.csv") OldFaithful[1:3,] ## density histograms and smoothed density histograms ## time of eruption hist(OldFaithful$TimeEruption,freq=FALSE) fit1 <- locfit(~lp(TimeEruption),data=OldFaithful) plot(fit1) ## waiting time to next eruption hist(OldFaithful$TimeWaiting,freq=FALSE) fit2 <- locfit(~lp(TimeWaiting),data=OldFaithful) plot(fit2) ## experiment with different smoothing constants fit2 <- locfit(~lp(TimeWaiting,nn=0.9,deg=2),data=OldFaithful) plot(fit2) fit2 <- locfit(~lp(TimeWaiting,nn=0.3,deg=2),data=OldFaithful) plot(fit2) ## cross-validation of smoothing constant ## for waiting time to next eruption alpha<-seq(0.20,1,by=0.01) n1=length(alpha) g=matrix(nrow=n1,ncol=4) for (k in 1:length(alpha)) { g[k,]<-gcv(~lp(TimeWaiting,nn=alpha[k]),data=OldFaithful) } g plot(g[,4]~g[,3],ylab="GCV",xlab="degrees of freedom") ## minimum at nn = 0.66 fit2 <- locfit(~lp(TimeWaiting,nn=0.66,deg=2),data=OldFaithful) plot(fit2) ## local polynomial regression of TimeEruption on TimeWaiting plot(TimeWaiting~TimeEruption,data=OldFaithful) # standard regression fit fitreg=lm(TimeWaiting~TimeEruption,data=OldFaithful) plot(TimeWaiting~TimeEruption,data=OldFaithful) abline(fitreg) # fit with nearest neighbor bandwidth fit3 <- locfit(TimeWaiting~lp(TimeEruption),data=OldFaithful) plot(fit3) fit3 <- locfit(TimeWaiting~lp(TimeEruption,deg=1),data=OldFaithful) plot(fit3) fit3 <- locfit(TimeWaiting~lp(TimeEruption,deg=0),data=OldFaithful) plot(fit3) Example 2: NOx Exhaust Emissions library(locfit) ## first we read in the data ethanol <- read.csv("C:/DataMining/Data/ethanol.csv") ethanol[1:3,] ## density histogram hist(ethanol$NOx,freq=FALSE) ## smoothed density histogram fit <- locfit(~lp(NOx),data=ethanol) plot(fit) ## experiment with the parameters of locfit fit <- locfit(~lp(NOx,deg=1),data=ethanol) plot(fit) fit <- locfit(~lp(NOx,nn=0.7,deg=1),data=ethanol) plot(fit) fit <- locfit(~lp(NOx,nn=0.5,deg=1),data=ethanol) plot(fit) fit <- locfit(~lp(NOx,deg=2),data=ethanol) plot(fit) fit <- locfit(~lp(NOx,nn=0.7,deg=2),data=ethanol) plot(fit) fit <- locfit(~lp(NOx,nn=0.5,deg=2),data=ethanol) plot(fit) fit <- locfit(~lp(NOx,deg=3),data=ethanol) plot(fit) fit <- locfit(~lp(NOx,nn=0.7,deg=3),data=ethanol) plot(fit) fit <- locfit(~lp(NOx,nn=0.5,deg=3),data=ethanol) plot(fit) ## standard regression of NOx on the equivalence ratio plot(NOx~EquivRatio,data=ethanol) fitreg=lm(NOx~EquivRatio,data=ethanol) plot(NOx~EquivRatio,data=ethanol) abline(fitreg) ## local polynomial regression of NOx on the equivalence ratio ## fit with a 50% nearest neighbor bandwidth. 
fit <- locfit(NOx~lp(EquivRatio,nn=0.5),data=ethanol) plot(fit) ## experiment with the parameters of locfit fit <- locfit(NOx~lp(EquivRatio,nn=0.2),data=ethanol) plot(fit) fit <- locfit(NOx~lp(EquivRatio,nn=0.8),data=ethanol) plot(fit) fit <- locfit(NOx~lp(EquivRatio,deg=1),data=ethanol) plot(fit) fit <- locfit(NOx~lp(EquivRatio,deg=2),data=ethanol) plot(fit) ## cross-validation alpha<-seq(0.20,1,by=0.01) n1=length(alpha) g=matrix(nrow=n1,ncol=4) for (k in 1:length(alpha)) { g[k,]<-gcv(NOx~lp(EquivRatio,nn=alpha[k]),data=ethanol) } g plot(g[,4]~g[,3],ylab="GCV",xlab="degrees of freedom") f1=locfit(NOx~lp(EquivRatio,nn=0.30),data=ethanol) f1 plot(f1) ## local polynomial regression on both E and C plot(NOx~EquivRatio,data=ethanol) plot(NOx~CompRatio,data=ethanol) fit <- locfit(NOx~lp(EquivRatio,CompRatio,scale=TRUE),data=ethanol) plot(fit) ## experiment with the parameters of locfit fit <- locfit(NOx~lp(EquivRatio,CompRatio,nn=0.5,scale=TRUE),data=ethanol) plot(fit) fit <- locfit(NOx~lp(EquivRatio,CompRatio,deg=0,scale=TRUE),data=ethanol) plot(fit) CHAPTER 5: IMPORTANCE OF PARSIMONY IN STATISTICAL MODELING Example 1 ## Example 1 ## We specify a seed to make the results reproducible. Omitting the ## set.seed statement would lead to a different set of random numbers ## and the results would vary somewhat set.seed(10) alpha=0.10 m=100 p=dim(m) index=dim(m) for (i in 1:5) { x=rnorm(25,1,1) t=-abs(mean(x)/(sd(x)/sqrt(25))) p[i]=2*pt(t,24) index[i]=i } for (i in 6:m) { x=rnorm(25) t=-abs(mean(x)/(sd(x)/sqrt(25))) p[i]=2*pt(t,24) index[i]=i } count=p<=0.05 table(count) ps=sort(p) logps=log(ps) logindex=log(index) y=log(index*alpha/m) plot(logps~logindex,xlab="log(j)",ylab="log(ProbValue(j))",main="False Discovery Rate") points(y~logindex,type="l") ps ps[6] Example 2 ## Example 2 set.seed(10) alpha=0.20 m=500 p=dim(m) index=dim(m) for (i in 1:5) { x=rnorm(25,1,1) t=-abs(mean(x)/(sd(x)/sqrt(25))) p[i]=2*pt(t,24) index[i]=i } for (i in 6:m) { x=rnorm(25) t=-abs(mean(x)/(sd(x)/sqrt(25))) p[i]=2*pt(t,24) index[i]=i } count=p<=0.05 table(count) ps=sort(p) logps=log(ps) logindex=log(index) y=log(index*alpha/m) plot(logps~logindex,xlab="log(j)",ylab="log(ProbValue(j))",main="False Discovery Rate") points(y~logindex,type="l") ps ps[7] CHAPTER 6: PENALTY-BASED VARIABLE SELECTION IN REGRESSION MODELS WITH MANY PARAMETERS (LASSO) Example 1: Prostate Cancer prostate <- read.csv("C:/DataMining/Data/prostate.csv") prostate[1:3,] m1=lm(lcavol~.,data=prostate) summary(m1) ## the model.matrix statement defines the model to be fitted x <- model.matrix(lcavol~age+lbph+lcp+gleason+lpsa,data=prostate) x=x[,-1] ## stripping off the column of 1s as LASSO includes the intercept ## automatically library(lars) ## lasso on all data lasso <- lars(x=x,y=prostate$lcavol,trace=TRUE) ## trace of lasso (standardized) coefficients for varying penalty plot(lasso) lasso coef(lasso,s=c(.25,.50,0.75,1.0),mode="fraction") ## cross-validation using 10 folds cv.lars(x=x,y=prostate$lcavol,K=10) ## another way to evaluate lasso’s out-of-sample prediction performance MSElasso25=dim(10) MSElasso50=dim(10) MSElasso75=dim(10) MSElasso100=dim(10) set.seed(1) for(i in 1:10){ train <- sample(1:nrow(prostate),80) lasso <- lars(x=x[train,],y=prostate$lcavol[train]) MSElasso25[i]= mean((predict(lasso,x[-train,],s=.25,mode="fraction")$fit-prostate$lcavol[-train])^2) MSElasso50[i]= mean((predict(lasso,x[-train,],s=.50,mode="fraction")$fit-prostate$lcavol[-train])^2) MSElasso75[i]= 
mean((predict(lasso,x[-train,],s=.75,mode="fraction")$fit-prostate$lcavol[-train])^2) MSElasso100[i]= mean((predict(lasso,x[-train,],s=1.00,mode="fraction")$fit-prostate$lcavol[-train])^2) } mean(MSElasso25) mean(MSElasso50) mean(MSElasso75) mean(MSElasso100) boxplot(MSElasso25,MSElasso50,MSElasso75,MSElasso100,ylab="MSE", sub="LASSO model",xlab="s=0.25 s=0.50 s=0.75 s=1.0(LS)") Example 2: Orange Juice oj <- read.csv("C:/DataMining/Data/oj.csv") oj$store <- factor(oj$store) oj[1:2,] x <- model.matrix(logmove ~ log(price)*(feat + brand + AGE60 + EDUC + ETHNIC + INCOME + HHLARGE + WORKWOM + HVAL150 + SSTRDIST + SSTRVOL + CPDIST5 + CPWVOL5)^2, data=oj) dim(x) ## First column of x consists of ones (the intercept) ## We strip the column of ones as intercept is included automatically x=x[,-1] ## We normalize the covariates as they are of very different magnitudes ## Each normalized covariate has mean 0 and standard deviation 1 for (j in 1:209) { x[,j]=(x[,j]-mean(x[,j]))/sd(x[,j]) } ## One could consider the standard regression model reg <- lm(oj$logmove~x) summary(reg) p0=predict(reg) ## Or, one could consider LASSO library(lars) lasso <- lars(x=x, y=oj$logmove, trace=TRUE) coef(lasso, s=c(.25,.50,0.75,1.00), mode="fraction") ## creates LASSO estimates as function of lambda ## gives you the estimates for four shrinkage coef ## Check that predictions in regression and lars (s=1) are the same p1=predict(lasso,x,s=1,mode="fraction") p1$fit pdiff=p1$fit-p0 pdiff ## zero differences ## out of sample prediction; estimate model on 20,000 rows MSElasso10=dim(10) MSElasso50=dim(10) MSElasso90=dim(10) MSElasso100=dim(10) set.seed(1) ## fixes seed to make random draws reproducible for(i in 1:10){ train <- sample(1:nrow(oj), 20000) lasso <- lars(x=x[train,], y=oj$logmove[train]) MSElasso10[i]= mean((predict(lasso,x[-train,], s=.10, mode="fraction") $fit - oj$logmove[-train])^2) MSElasso50[i]= mean((predict(lasso,x[-train,], s=.50, mode="fraction") $fit - oj$logmove[-train])^2) MSElasso90[i]= mean((predict(lasso,x[-train,], s=.90, mode="fraction") $fit - oj$logmove[-train])^2) MSElasso100[i]= mean((predict(lasso,x[-train,], s=1.0, mode="fraction") $fit - oj$logmove[-train])^2) } mean(MSElasso10) mean(MSElasso50) mean(MSElasso90) mean(MSElasso100) boxplot(MSElasso10,MSElasso50,MSElasso90,MSElasso100,ylab="MSE", sub="LASSO model",xlab="s=0.10 s=0.50 s=0.9 s=1.0(LS)") ## out of sample prediction; estimate model on 1,000 rows set.seed(1) ## fixes seed to make random draws reproducible for(i in 1:10){ train <- sample(1:nrow(oj), 1000) lasso <- lars(x=x[train,], y=oj$logmove[train]) MSElasso10[i]= mean((predict(lasso,x[-train,], s=.10, mode="fraction") $fit - oj$logmove[-train])^2) MSElasso50[i]= mean((predict(lasso,x[-train,], s=.50, mode="fraction") $fit - oj$logmove[-train])^2) MSElasso90[i]= mean((predict(lasso,x[-train,], s=.90, mode="fraction") $fit - oj$logmove[-train])^2) MSElasso100[i]= mean((predict(lasso,x[-train,], s=1.0, mode="fraction") $fit - oj$logmove[-train])^2) } mean(MSElasso10) mean(MSElasso50) mean(MSElasso90) mean(MSElasso100) boxplot(MSElasso10,MSElasso50,MSElasso90,MSElasso100,ylab="MSE", sub="LASSO model",xlab="s=0.10 s=0.50 s=0.9 s=1.0(LS)") CHAPTER 7: LOGISTIC REGRESSION Example 1: Death Penalty Data ## analyzing individual observations dpen <- read.csv("C:/DataMining/Data/DeathPenalty.csv") dpen[1:4,] dpen[359:362,] m1=glm(Death~VRace+Agg,family=binomial,data=dpen) m1 summary(m1) ## calculating logits exp(m1$coef[2]) exp(m1$coef[3]) ## plotting probability of getting death penalty as 
a function of aggravation ## separately for black (in black) and white (in red) victim fitBlack=dim(501) fitWhite=dim(501) ag=dim(501) for (i in 1:501) { ag[i]=(99+i)/100 fitBlack[i]=exp(m1$coef[1]+ag[i]*m1$coef[3])/(1+exp(m1$coef[1]+ag[i]*m1$coef[3])) fitWhite[i]=exp(m1$coef[1]+m1$coef[2]+ag[i]*m1$coef[3])/(1+exp(m1$coef[1]+m1$coef[2]+ag[i]*m1$coef[3])) } plot(fitBlack~ag,type="l",col="black",ylab="Prob[Death]",xlab="Aggravation",ylim=c(0,1),main="red line for white victim; black line for black victim") points(fitWhite~ag,type="l",col="red") ## analyzing summarized data dpenother <- read.csv("C:/DataMining/Data/DeathPenaltyOther.csv") dpenother m1=glm(Death~VRace+Agg,family=binomial,weights=Freq,data=dpenother) m1 summary(m1) exp(m1$coef[2]) exp(m1$coef[3]) Example 2: Delayed Airplanes library(car) ## needed to recode variables set.seed(1) ## read and print the data del <- read.csv("C:/DataMining/Data/FlightDelays.csv") del[1:3,] ## define hours of departure del$sched=factor(floor(del$schedtime/100)) table(del$sched) table(del$carrier) table(del$dest) table(del$origin) table(del$weather) table(del$dayweek) table(del$daymonth) table(del$delay) del$delay=recode(del$delay,"'delayed'=1;else=0") del$delay=as.numeric(levels(del$delay)[del$delay]) table(del$delay) ## Delay: 1=Monday; 2=Tuesday; 3=Wednesday; 4=Thursday; ## 5=Friday; 6=Saturday; 7=Sunday ## 7=Sunday and 1=Monday coded as 1 del$dayweek=recode(del$dayweek,"c(1,7)=1;else=0") table(del$dayweek) ## omit unused variables del=del[,c(-1,-3,-5,-6,-7,-11,-12)] del[1:3,] n=length(del$delay) n n1=floor(n*(0.6)) n1 n2=n-n1 n2 train=sample(1:n,n1) ## estimation of the logistic regression model ## explanatory variables: carrier, destination, origin, weather, day of week ## (weekday/weekend), scheduled hour of departure ## create design matrix; indicators for categorical variables (factors) Xdel <- model.matrix(delay~.,data=del)[,-1] Xdel[1:3,] xtrain <- Xdel[train,] xnew <- Xdel[-train,] ytrain <- del$delay[train] ynew <- del$delay[-train] m1=glm(delay~.,family=binomial,data=data.frame(delay=ytrain,xtrain)) summary(m1) ## prediction: predicted default probabilities for cases in test set ptest <- predict(m1,newdata=data.frame(xnew),type="response") data.frame(ynew,ptest)[1:10,] ## first column in list represents the case number of the test element plot(ynew~ptest) ## coding as 1 if probability 0.5 or larger gg1=floor(ptest+0.5) ## floor function; see help command ttt=table(ynew,gg1) ttt error=(ttt[1,2]+ttt[2,1])/n2 error ## coding as 1 if probability 0.3 or larger gg2=floor(ptest+0.7) ttt=table(ynew,gg2) ttt error=(ttt[1,2]+ttt[2,1])/n2 error bb=cbind(ptest,ynew) bb bb1=bb[order(ptest,decreasing=TRUE),] bb1 ## order cases in test set according to their success prob ## actual outcome shown next to it ## overall success (delay) prob in the evaluation data set xbar=mean(ynew) xbar ## calculating the lift ## cumulative 1’s sorted by predicted values ## cumulative 1’s using the average success prob from evaluation set axis=dim(n2) ax=dim(n2) ay=dim(n2) axis[1]=1 ax[1]=xbar ay[1]=bb1[1,2] for (i in 2:n2) { axis[i]=i ax[i]=xbar*i ay[i]=ay[i-1]+bb1[i,2] } aaa=cbind(bb1[,1],bb1[,2],ay,ax) aaa[1:100,] plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob") points(axis,ax,type="l") Example 3: Loan Acceptance library(car) ## needed to recode variables set.seed(1) loan <- read.csv("C:/DataMining/Data/UniversalBank.csv") loan[1:3,] ## familiarize yourself with the data hist(loan$Age) 
hist(loan$Experience) hist(loan$Income) hist(loan$Family) ## below we treat loan$Family as categorical hist(loan$CCAvg) hist(loan$Mortgage) hist(loan$SecuritiesAccount) hist(loan$CDAccount) hist(loan$Online) hist(loan$CreditCard) hist(loan$Education) ## below we treat loan$Education as categorical response=loan$PersonalLoan hist(response) MeanRes=mean(response) MeanRes ## creating indicator variables for loan$Family and loan$Education v1=rep(1,dim(loan)[1]) v2=rep(0,dim(loan)[1]) ## creating indicator variables for family size (4 groups: 1, 2, 3, 4) loan$FamSize2=ifelse(loan$Family==2,v1,v2) loan$FamSize3=ifelse(loan$Family==3,v1,v2) loan$FamSize4=ifelse(loan$Family==4,v1,v2) ## creating indicator variables for education level (3 groups: 1, 2, 3) loan$Educ2=ifelse(loan$Education==2,v1,v2) loan$Educ3=ifelse(loan$Education==3,v1,v2) xx=cbind(response,Age=loan$Age,Exp=loan$Experience,Inc=loan$Income,Fam2=loan$FamSize2,Fam3=loan$FamSize3,Fam4=loan$FamSize4,CCAve=loan$CCAvg,Mort=loan$Mortgage,SecAcc=loan$SecuritiesAccount,CD=loan$CDAccount,Online=loan$Online,CreditCard=loan$CreditCard,Educ2=loan$Educ2,Educ3=loan$Educ3) xx[1:3,] ## split the data set into training and test (evaluation) set n=dim(loan)[1] n n1=floor(n*(0.6)) n1 n2=n-n1 n2 train=sample(1:n,n1) ## model fitted on all data m1=glm(response~.,family=binomial,data=data.frame(xx)) summary(m1) xx=xx[,-1] xtrain <- xx[train,] xnew <- xx[-train,] ytrain <- response[train] ynew <- response[-train] ## model fitted on the training data set m2=glm(response~.,family=binomial,data=data.frame(response=ytrain,xtrain)) summary(m2) ## create predictions for the test (evaluation) data set ptest=predict(m2,newdata=data.frame(xnew),type="response") ## predicted probabilities hist(ptest) plot(ynew~ptest) ## coding as 1 if probability 0.5 or larger gg1=floor(ptest+0.5) ttt=table(ynew,gg1) ttt error=(ttt[1,2]+ttt[2,1])/n2 error ## coding as 1 if probability 0.3 or larger gg2=floor(ptest+0.7) ttt=table(ynew,gg2) ttt error=(ttt[1,2]+ttt[2,1])/n2 error bb=cbind(ptest,ynew) bb bb1=bb[order(ptest,decreasing=TRUE),] bb1 ## order cases in test set according to their success prob ## actual outcome shown next to it ## overall success probability in evaluation (test) data set xbar=mean(ynew) xbar ## calculating the lift ## cumulative 1’s sorted by predicted values ## cumulative 1’s using the average success prob from evaluation set axis=dim(n2) ax=dim(n2) ay=dim(n2) axis[1]=1 ax[1]=xbar ay[1]=bb1[1,2] for (i in 2:n2) { axis[i]=i ax[i]=xbar*i ay[i]=ay[i-1]+bb1[i,2] } aaa=cbind(bb1[,1],bb1[,2],ay,ax) aaa[1:20,] plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob") points(axis,ax,type="l") Example 4: German Credit Data #### ******* German Credit Data ******* #### #### ******* data on 1000 loans ******* #### ## read data and create relevant variables credit <- read.csv("C:/DataMining/Data/germancredit.csv") credit credit$Default <- factor(credit$Default) ## re-level the credit history and a few other variables credit$history = factor(credit$history, levels=c("A30","A31","A32","A33","A34")) levels(credit$history) = c("good","good","poor","poor","terrible") credit$foreign <- factor(credit$foreign, levels=c("A201","A202"), labels=c("foreign","german")) credit$rent <- factor(credit$housing=="A151") credit$purpose <- factor(credit$purpose, levels=c("A40","A41","A42","A43","A44","A45","A46","A47","A48","A49","A410")) levels(credit$purpose) <- 
c("newcar","usedcar",rep("goods/repair",4),"edu",NA,"edu","biz","biz") ## for demonstration, cut the dataset to these variables credit <- credit[,c("Default","duration","amount","installment","age", "history", "purpose","foreign","rent")] credit[1:3,] summary(credit) # check out the data ## create a design matrix ## factor variables are turned into indicator variables ## the first column of ones is omitted Xcred <- model.matrix(Default~.,data=credit)[,-1] Xcred[1:3,] ## creating training and prediction datasets ## select 900 rows for estimation and 100 for testing set.seed(1) train <- sample(1:1000,900) xtrain <- Xcred[train,] xnew <- Xcred[-train,] ytrain <- credit$Default[train] ynew <- credit$Default[-train] credglm=glm(Default~.,family=binomial,data=data.frame(Default=ytrain,xtrain)) summary(credglm) ## prediction: predicted default probabilities for cases in test set ptest <- predict(credglm,newdata=data.frame(xnew),type="response") data.frame(ynew,ptest) ## What are our misclassification rates on that training set? ## We use probability cutoff 1/6 ## coding as 1 (predicting default) if probability 1/6 or larger gg1=floor(ptest+(5/6)) ttt=table(ynew,gg1) ttt error=(ttt[1,2]+ttt[2,1])/100 error CHAPTER 8: BINARY CLASSIFICATION, PROBABILITIES AND EVALUATING CLASSIFICATION PERFORMANCE Example: German Credit Data #### ******* German Credit Data ******* #### #### ******* data on 1000 loans ******* #### ## read data and create some `interesting' variables credit <- read.csv("C:/DataMining/Data/germancredit.csv") credit credit$Default <- factor(credit$Default) ## re-level the credit history and a few other variables credit$history = factor(credit$history, levels=c("A30","A31","A32","A33","A34")) levels(credit$history) = c("good","good","poor","poor","terrible") credit$foreign <- factor(credit$foreign, levels=c("A201","A202"), labels=c("foreign","german")) credit$rent <- factor(credit$housing=="A151") credit$purpose <- factor(credit$purpose, levels=c("A40","A41","A42","A43","A44","A45","A46","A47","A48","A49","A410")) levels(credit$purpose) <- c("newcar","usedcar",rep("goods/repair",4),"edu",NA,"edu","biz","biz") ## for demonstration, cut the dataset to these variables credit <- credit[,c("Default","duration","amount","installment","age","history", "purpose","foreign","rent")] credit summary(credit) # check out the data ## create a design matrix ## factor variables are turned into indicator variables ## the first column of ones is omitted Xcred <- model.matrix(Default~.,data=credit)[,-1] Xcred[1:3,] ## creating training and prediction datasets ## select 900 rows for estimation and 100 for testing set.seed(1) train <- sample(1:1000,900) xtrain <- Xcred[train,] xnew <- Xcred[-train,] ytrain <- credit$Default[train] ynew <- credit$Default[-train] credglm=glm(Default~.,family=binomial,data=data.frame(Default=ytrain,xtrain)) summary(credglm) ## Now to prediction: what are the underlying default probabilities ## for cases in the test set ptest <- predict(credglm, newdata=data.frame(xnew),type="response") data.frame(ynew,ptest) ## What are our misclassification rates on that training set? 
## We use probability cutoff 1/6
## coding as 1 (predicting default) if probability 1/6 or larger
cut=1/6
gg1=floor(ptest+(1-cut))
ttt=table(ynew,gg1)
ttt
truepos <- ynew==1 & ptest>=cut
trueneg <- ynew==0 & ptest<cut
## sensitivity (predicting default when it does happen)
sum(truepos)/sum(ynew==1)
## specificity (predicting no default when it does not happen)
sum(trueneg)/sum(ynew==0)
## next, we use probability cutoff 1/2
cut=1/2
gg1=floor(ptest+(1-cut))
ttt=table(ynew,gg1)
ttt
truepos <- ynew==1 & ptest>=cut
trueneg <- ynew==0 & ptest<cut
## sensitivity and specificity at this cutoff
sum(truepos)/sum(ynew==1)
sum(trueneg)/sum(ynew==0)
## function to plot the ROC curve over a grid of 500 cutoffs
roc <- function(p,y){
  y <- factor(y)
  n <- length(p)
  p <- as.vector(p)
  Q <- p > matrix(rep(seq(0,1,length=500),n),ncol=500,byrow=TRUE)
  fp <- colSums((y==levels(y)[1])*Q)/sum(y==levels(y)[1])
  tp <- colSums((y==levels(y)[2])*Q)/sum(y==levels(y)[2])
  plot(fp, tp, xlab="1-Specificity", ylab="Sensitivity")
  abline(a=0,b=1,lty=2,col=8)
}
## ROC for hold-out period
roc(p=ptest,y=ynew)
## ROC for all cases (in-sample)
credglmall <- glm(credit$Default ~ Xcred,family=binomial)
roc(p=credglmall$fitted, y=credglmall$y)
## using the ROCR package to graph the ROC curves
library(ROCR)
## input is a data frame consisting of two columns
## predictions in first column and actual outcomes in the second
## ROC for hold-out period
predictions=ptest
labels=ynew
data=data.frame(predictions,labels)
data
## pred: function to create prediction objects
pred <- prediction(data$predictions,data$labels)
pred
## perf: creates the input to be plotted
## sensitivity and one minus specificity (the false positive rate)
perf <- performance(pred, "sens", "fpr")
perf
plot(perf)
## ROC for all cases (in-sample)
credglmall <- glm(credit$Default ~ Xcred,family=binomial)
predictions=credglmall$fitted
labels=credglmall$y
data=data.frame(predictions,labels)
pred <- prediction(data$predictions,data$labels)
perf <- performance(pred, "sens", "fpr")
plot(perf)

CHAPTER 9: CLASSIFICATION USING A NEAREST NEIGHBOR ANALYSIS

Example 1: Forensic Glass

#### ******* Forensic Glass ****** ####
library(textir)   ## needed to standardize the data
library(MASS)     ## a library of example datasets
data(fgl)         ## loads the data into R; see help(fgl)
fgl               ## data consists of 214 cases
## here are illustrative box plots of the features stratified by glass type
par(mfrow=c(3,3), mai=c(.3,.6,.1,.1))
plot(RI ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Al ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Na ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Mg ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Ba ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Si ~ type, data=fgl, col=c(grey(.2),2:6))
plot(K ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Ca ~ type, data=fgl, col=c(grey(.2),2:6))
plot(Fe ~ type, data=fgl, col=c(grey(.2),2:6))
## for illustration, consider the RIxAl plane
## use nt=200 training cases to find the nearest neighbors for
## the remaining 14 cases. These 14 cases become the evaluation
## (test, hold-out) cases
n=length(fgl$type)
nt=200
set.seed(1)   ## to make the calculations reproducible in repeated runs
train <- sample(1:n,nt)
## Standardization of the data is preferable, especially if
## units of the features are quite different
## could do this from scratch by calculating the mean and
## standard deviation of each feature, and use those to standardize.
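Base R's scale() function performs exactly this column-wise standardization (subtract the column mean, divide by the column standard deviation), so the "from scratch" calculation can also be written in one line; a small sketch, not part of the original listing:

## mean-zero, sd-one versions of Al and RI via scale(); returns a matrix
xs=scale(fgl[,c(4,1)])
xs[1:3,]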
## Even simpler, use the normalize function in the R-package textir; ## it converts data frame columns to mean-zero sd-one ## x <- normalize(fgl[,c(4,1)]) x=fgl[,c(4,1)] x[,1]=(x[,1]-mean(x[,1]))/sd(x[,1]) x[,2]=(x[,2]-mean(x[,2]))/sd(x[,2]) x[1:3,] library(class) nearest1 <- knn(train=x[train,],test=x[-train,],cl=fgl$type[train],k=1) nearest5 <- knn(train=x[train,],test=x[-train,],cl=fgl$type[train],k=5) data.frame(fgl$type[-train],nearest1,nearest5) ## plot them to see how it worked par(mfrow=c(1,2)) ## plot for k=1 (single) nearest neighbor plot(x[train,],col=fgl$type[train],cex=.8,main="1-nearest neighbor") points(x[-train,],bg=nearest1,pch=21,col=grey(.9),cex=1.25) ## plot for k=5 nearest neighbors plot(x[train,],col=fgl$type[train],cex=.8,main="5-nearest neighbors") points(x[-train,],bg=nearest5,pch=21,col=grey(.9),cex=1.25) legend("topright",legend=levels(fgl$type),fill=1:6,bty="n",cex=.75) ## calculate the proportion of correct classifications on this one ## training set pcorrn1=100*sum(fgl$type[-train]==nearest1)/(n-nt) pcorrn5=100*sum(fgl$type[-train]==nearest5)/(n-nt) pcorrn1 pcorrn5 ## cross-validation (leave one out) pcorr=dim(10) for (k in 1:10) { pred=knn.cv(x,fgl$type,k) pcorr[k]=100*sum(fgl$type==pred)/n } pcorr ## Note: Different runs may give you slightly different results as ties ## are broken at random ## using all nine dimensions (RI plus 8 chemical concentrations) ## x <- normalize(fgl[,c(1:9)]) x=fgl[,c(1:9)] for (j in 1:9) { x[,j]=(x[,j]-mean(x[,j]))/sd(x[,j]) } nearest1 <- knn(train=x[train,],test=x[-train,],cl=fgl$type[train],k=1) nearest5 <- knn(train=x[train,],test=x[-train,],cl=fgl$type[train],k=5) data.frame(fgl$type[-train],nearest1,nearest5) ## calculate the proportion of correct classifications pcorrn1=100*sum(fgl$type[-train]==nearest1)/(n-nt) pcorrn5=100*sum(fgl$type[-train]==nearest5)/(n-nt) pcorrn1 pcorrn5 ## cross-validation (leave one out) pcorr=dim(10) for (k in 1:10) { pred=knn.cv(x,fgl$type,k) pcorr[k]=100*sum(fgl$type==pred)/n } pcorr Example 2: German Credit Data #### ******* German Credit Data ******* #### #### ******* data on 1000 loans ******* #### library(textir) ## needed to standardize the data library(class) ## needed for knn ## read data and create some `interesting' variables credit <- read.csv("C:/DataMining/Data/germancredit.csv") credit credit$Default <- factor(credit$Default) ## re-level the credit history and a few other variables credit$history = factor(credit$history, levels=c("A30","A31","A32","A33","A34")) levels(credit$history) = c("good","good","poor","poor","terrible") credit$foreign <- factor(credit$foreign, levels=c("A201","A202"), labels=c("foreign","german")) credit$rent <- factor(credit$housing=="A151") credit$purpose <- factor(credit$purpose, levels=c("A40","A41","A42","A43","A44","A45","A46","A47","A48","A49","A410")) levels(credit$purpose) <- c("newcar","usedcar",rep("goods/repair",4),"edu",NA,"edu","biz","biz") ## for demonstration, cut the dataset to these variables credit <- credit[,c("Default","duration","amount","installment","age", "history", "purpose","foreign","rent")] credit[1:3,] summary(credit) # check out the data ## for illustration we consider just 3 loan characteristics: ## amount,duration,installment ## Standardization of the data is preferable, especially if ## units of the features are quite different ## We use the normalize function in the R-package textir; ## it converts data frame columns to mean-zero sd-one ## x <- normalize(credit[,c(2,3,4)]) x=credit[,c(2,3,4)] 
x[,1]=(x[,1]-mean(x[,1]))/sd(x[,1]) x[,2]=(x[,2]-mean(x[,2]))/sd(x[,2]) x[,3]=(x[,3]-mean(x[,3]))/sd(x[,3]) x[1:3,] ## training and prediction datasets ## training set of 900 borrowers; want to classify 100 new ones set.seed(1) train <- sample(1:1000,900) ## this is training set of 900 borrowers xtrain <- x[train,] xnew <- x[-train,] ytrain <- credit$Default[train] ynew <- credit$Default[-train] ## k-nearest neighbor method library(class) nearest1 <- knn(train=xtrain, test=xnew, cl=ytrain, k=1) nearest3 <- knn(train=xtrain, test=xnew, cl=ytrain, k=3) data.frame(ynew,nearest1,nearest3)[1:10,] ## calculate the proportion of correct classifications pcorrn1=100*sum(ynew==nearest1)/100 pcorrn3=100*sum(ynew==nearest3)/100 pcorrn1 pcorrn3 ## plot for 3nn plot(xtrain[,c("amount","duration")],col=c(4,3,6,2)[credit[train,"installment"]],pch=c(1,2)[as.numeric(ytrain)],main="Predicted default, by 3 nearest neighbors",cex.main=.95) points(xnew[,c("amount","duration")],bg=c(4,3,6,2)[credit[train,"installment"]],pch=c(21,24)[as.numeric(nearest3)],cex=1.2,col=grey(.7)) legend("bottomright",pch=c(1,16,2,17),bg=c(1,1,1,1),legend=c("data 0","pred 0","data 1","pred 1"),title="default",bty="n",cex=.8) legend("topleft",fill=c(4,3,6,2),legend=c(1,2,3,4),title="installment %",horiz=TRUE,bty="n",col=grey(.7),cex=.8) ## above was for just one training set ## cross-validation (leave one out) pcorr=dim(10) for (k in 1:10) { pred=knn.cv(x,cl=credit$Default,k) pcorr[k]=100*sum(credit$Default==pred)/1000 } pcorr CHAPTER 10: THE NAÏVE BAYESIAN ANALYSIS: A MODEL FOR PREDICTING A CATEGORICAL RESPONSE FROM MOSTLY CATEGORICAL PREDICTOR VARIABLES Example: Delayed Airplanes set.seed(1) library(car) #used to recode a variable ## reading the data delay <- read.csv("C:/DataMining/Data/FlightDelays.csv") delay del=data.frame(delay) del$schedf=factor(floor(del$schedtime/100)) del$delay=recode(del$delay,"'delayed'=1;else=0") response=as.numeric(levels(del$delay)[del$delay]) hist(response) mm=mean(response) mm ## determining test and evaluation data sets n=length(del$dayweek) n n1=floor(n*(0.6)) n1 n2=n-n1 n2 train=sample(1:n,n1) ## determining marginal probabilities tttt=cbind(del$schedf[train],del$carrier[train],del$dest[train],del$origin[train],del$weather[train],del$dayweek[train],response[train]) tttrain0=tttt[tttt[,7]<0.5,] tttrain1=tttt[tttt[,7]>0.5,] ## prior probabilities tdel=table(response[train]) tdel=tdel/sum(tdel) tdel ## scheduled time ts0=table(tttrain0[,1]) ts0=ts0/sum(ts0) ts0 ts1=table(tttrain1[,1]) ts1=ts1/sum(ts1) ts1 ## scheduled carrier tc0=table(tttrain0[,2]) tc0=tc0/sum(tc0) tc0 tc1=table(tttrain1[,2]) tc1=tc1/sum(tc1) tc1 ## scheduled destination td0=table(tttrain0[,3]) td0=td0/sum(td0) td0 td1=table(tttrain1[,3]) td1=td1/sum(td1) td1 ## scheduled origin to0=table(tttrain0[,4]) to0=to0/sum(to0) to0 to1=table(tttrain1[,4]) to1=to1/sum(to1) to1 ## weather tw0=table(tttrain0[,5]) tw0=tw0/sum(tw0) tw0 tw1=table(tttrain1[,5]) tw1=tw1/sum(tw1) tw1 ## bandaid as no observation in a cell tw0=tw1 tw0[1]=1 tw0[2]=0 ## scheduled day of week tdw0=table(tttrain0[,6]) tdw0=tdw0/sum(tdw0) tdw0 tdw1=table(tttrain1[,6]) tdw1=tdw1/sum(tdw1) tdw1 ## creating test data set tt=cbind(del$schedf[-train],del$carrier[-train],del$dest[-train],del$origin[-train],del$weather[-train],del$dayweek[-train],response[-train]) ## creating predictions, stored in gg p0=ts0[tt[,1]]*tc0[tt[,2]]*td0[tt[,3]]*to0[tt[,4]]*tw0[tt[,5]+1]*tdw0[tt[,6]] p1=ts1[tt[,1]]*tc1[tt[,2]]*td1[tt[,3]]*to1[tt[,4]]*tw1[tt[,5]+1]*tdw1[tt[,6]] 
gg=(p1*tdel[2])/(p1*tdel[2]+p0*tdel[1]) hist(gg) plot(response[-train]~gg) ## coding as 1 if probability 0.5 or larger gg1=floor(gg+0.5) ttt=table(response[-train],gg1) ttt error=(ttt[1,2]+ttt[2,1])/n2 error ## coding as 1 if probability 0.3 or larger gg2=floor(gg+0.7) ttt=table(response[-train],gg2) ttt error=(ttt[1,2]+ttt[2,1])/n2 error ## Here we calculate the lift (see Chapter 4) ## The output is not shown in the text bb=cbind(gg,response[-train]) bb bb1=bb[order(gg,decreasing=TRUE),] bb1 ## order cases in test set naccording to their success prob ## actual outcome shown next to it ## overall success (delay) prob in evaluation set xbar=mean(response[-train]) xbar ## calculating the lift ## cumulative 1’s sorted by predicted values ## cumulative 1’s using the average success prob from training set axis=dim(n2) ax=dim(n2) ay=dim(n2) axis[1]=1 ax[1]=xbar ay[1]=bb1[1,2] for (i in 2:n2) { axis[i]=i ax[i]=xbar*i ay[i]=ay[i-1]+bb1[i,2] } aaa=cbind(bb1[,1],bb1[,2],ay,ax) aaa[1:100,] plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob") points(axis,ax,type="l") CHAPTER 11: MULTINOMIAL LOGISTIC REGRESSION Example 1: Forensic Glass ## Program 1: Estimation on all 214 shards ## Forensic Glass library(VGAM) ## VGAM to estimate multinomial logistic regression library(textir) ## to standardize the features library(MASS) ## a library of example datasets data(fgl) ## loads the data into R; see help(fgl) fgl ## standardization, using the normalize function in the library textir covars <- normalize(fgl[,1:9],s=sdev(fgl[,1:9])) sd(covars) ## convince yourself that features are standardized dd=data.frame(cbind(type=fgl$type,covars)) gg <- vglm(type~ Na+Mg+Al,multinomial,data=dd) summary(gg) predict(gg) ## obtain log-odds relative to last group round(fitted(gg),2) ## probabilities cbind(round(fitted(gg),2),fgl$type) ## boxplots of estimated probabilities against true group dWinF=fgl$type=="WinF" dWinNF=fgl$type=="WinNF" dVeh=fgl$type=="Veh" dCon=fgl$type=="Con" dTable=fgl$type=="Tabl" dHead=fgl$type=="Head" yy1=c(fitted(gg)[dWinF,1],fitted(gg)[dWinNF,2],fitted(gg)[dVeh,3], fitted(gg)[dCon,4],fitted(gg)[dTable,5],fitted(gg)[dHead,6]) xx1=c(fgl$type[dWinF],fgl$type[dWinNF],fgl$type[dVeh],fgl$type[dCon],fgl$type[dTable],fgl$type[dHead]) boxplot(yy1~xx1,ylim=c(0,1),xlab="1=WinF,2=WinNF,3=Veh,4=Con,5=Table,6=Head") ## Program 2: Estimation on all 194 shards and predicting 20 new cases ## performance in predicting a single set of 20 new cases library(VGAM) library(textir) library(MASS) ## a library of example datasets data(fgl)     ## loads the data into R; see help(fgl) fgl covars <- normalize(fgl[,1:9],s=sdev(fgl[,1:9])) dd=data.frame(cbind(type=fgl$type,covars)) n=length(fgl$type) nt=n-20 set.seed(1) train <- sample(1:n,nt) ## predict gg <- vglm(type ~ Na+Mg+Al,multinomial,data=dd[train,]) p1=predict(gg,newdata=dd[-train,]) p1=exp(p1) ## we calculate the probabilities from the predicted logits sum=(1+p1[,1]+p1[,2]+p1[,3]+p1[,4]+p1[,5]) probWinF=round(p1[,1]/sum,2) ## WinF probWinNF=round(p1[,2]/sum,2) ## WinNF probVeh=round(p1[,3]/sum,2) ## Veh probCon=round(p1[,4]/sum,2) ## Con probTable=round(p1[,5]/sum,2) ## Table probHead=round(1/sum,2) ## Head ppp=data.frame(probWinF,probWinNF,probVeh,probCon,probTable,probHead,fgl$type[-train]) ppp ## Program 3: Estimation on all 194 shards and predicting 20 new cases; 100 reps ## performance from 100 replications predicting 20 new cases library(VGAM) library(textir) library(MASS) ## a library of 
example datasets data(fgl)     ## loads the data into R; see help(fgl) fgl covars <- normalize(fgl[,1:9],s=sdev(fgl[,1:9])) dd=data.frame(cbind(type=fgl$type,covars)) ## out-of-sample prediction set.seed(1) out=dim(20) proportion=dim(100) prob=matrix(nrow=20,ncol=6) n=length(fgl$type) nt=n-20 for (kkk in 1:100) { train <- sample(1:n,nt) ## predict gg <- vglm(type ~ Na+Mg+Al,multinomial,data=dd[train,]) p1=predict(gg,newdata=dd[-train,]) p1=exp(p1) ## we calculate the probabilities from the predicted logits sum=(1+p1[,1]+p1[,2]+p1[,3]+p1[,4]+p1[,5]) prob[,1]=p1[,1]/sum ## WinF prob[,2]=p1[,2]/sum ## WinNF prob[,3]=p1[,3]/sum ## Veh prob[,4]=p1[,4]/sum ## Con prob[,5]=p1[,5]/sum ## Table prob[,6]=1/sum ## Head for (k in 1:20) { pp=prob[k,] out[k]=max(pp)==pp[fgl$type[-train]][k] } proportion[kkk]=sum(out)/20 } ## proportion of correct classification proportion mean(proportion) boxplot(ylim=c(0,1),ylab="percent correct classification",proportion) Example 2: Forensic Glass Revisited ## Program 1: Cross-validation to determine the penalty in mnlm library(textir) set.seed(1) library(MASS) ## a library of example datasets data(fgl)     ## loads the data into R; see help(fgl) covars <- normalize(fgl[,1:9],s=sdev(fgl[,1:9])) n=length(fgl$type) prop=dim(30) pen=dim(30) out=dim(n) for (j in 1:30) { pen[j]=0.1*j for (k in 1:n) { train1=c(1:n) train=train1[train1!=k] glasslm <- mnlm(counts=fgl$type[train],penalty=pen[j],covars=covars[train,]) prob=predict(glasslm,covars[-train,]) prob=round(prob,3) out[k]=max(prob)==prob[fgl$type[-train]] } prop[j]=sum(out)/n } ## proportion of correct classifications using Laplace scale penalty output=cbind(pen,prop) round(output,3) ## Program 2: Detailed mnlm output for penalty = 1 library(textir) library(MASS) ## a library of example datasets data(fgl) ## loads the data into R; see help(fgl) fgl$type covars <- normalize(fgl[,1:9],s=sdev(fgl[,1:9])) glasslm <- mnlm(counts=fgl$type,penalty=1.0,covars=covars) glasslm$intercept glasslm$loadings round(as.matrix(glasslm$loadings)[,],2) fitted(glasslm) as.matrix(fitted(glasslm)[1,]) round(predict(glasslm,covars),2) plot(glasslm) ## Program 3: Estimation on all 194 shards and predicting 20 new cases ## multinomial logistic regression with linear logits library(textir) library(MASS) ## a library of example datasets data(fgl)     ## loads the data into R; see help(fgl) covars <- normalize(fgl[,1:9],s=sdev(fgl[,1:9])) sd(covars) set.seed(1) pp=dim(6) out=dim(20) proportion=dim(100) n=length(fgl$type) nt=n-20 for (kkk in 1:100) { train <- sample(1:n,nt) glasslm=mnlm(counts=fgl$type[train],penalty=1,covars=covars[train,]) prob=predict(glasslm,covars[-train,]) for (k in 1:20) { pp=prob[k,] out[k]=max(pp)==pp[fgl$type[-train]][k] } proportion[kkk]=sum(out)/20 } proportion mean(proportion) boxplot(proportion) ## Program 4: Estimation on all 194 shards and predicting 20 new cases ## multinomial logistic regression with linear and cross-products library(textir) library(MASS) ## a library of example datasets data(fgl)     ## loads the data into R; see help(fgl) X <- model.matrix(~.+.^2, data=fgl[,1:9])[,-1] X[1:3,] ## to see the contents ## -1 removes the intercept dim(X) ## X has 45 columns covars <- normalize(X,s=sdev(X)) sd(covars) set.seed(1) pp=dim(6) out=dim(20) proportion=dim(100) n=length(fgl$type) nt=n-20 for (kkk in 1:100) { train <- sample(1:n,nt) glasslm=mnlm(counts=fgl$type[train],penalty=1,covars=covars[train,]) prob=predict(glasslm,covars[-train,]) for (k in 1:20) { pp=prob[k,] out[k]=max(pp)==pp[fgl$type[-train]][k] } 
proportion[kkk]=sum(out)/20 } proportion mean(proportion) boxplot(proportion) Appendix: Specification of a Simple Triplet Matrix ## working with simple triplet matrices i=c(1,2,3,4,5,6) j=c(1,1,1,2,2,2) v=c(5,5,5,6,6,6) b=simple_triplet_matrix(i,j,v) b as.matrix(b)[,] v=c(11,12,22,33,44,55) b=simple_triplet_matrix(i,j,v) as.matrix(b)[,] i=c(1,2,3,4,5,6) j=c(1,2,3,4,5,6) v=c(5,5,5,6,6,6) b=simple_triplet_matrix(i,j,v) b as.matrix(b)[,] i=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16) j=c(1,1,2,3,3,4,4,4,4,5,5,6,6,6,6,6) v=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1) b=simple_triplet_matrix(i,j,v) b as.matrix(b)[,] CHAPTER 12: MORE ON CLASSIFICATION AND A DISCUSSION OF DISCRIMINANT ANALYSIS Example 1: German Credit Data #### ******* German Credit Data ******* #### #### ******* data on 1000 loans ******* #### library(MASS) ## includes lda and qda for discriminant analysis set.seed(1) ## read data and create some 'interesting' variables credit <- read.csv("C:/DataMining/Data/germancredit.csv") credit credit$Default <- factor(credit$Default) ## re-level the credit history and a few other variables credit$history = factor(credit$history, levels=c("A30","A31","A32","A33","A34")) levels(credit$history) = c("good","good","poor","poor","terrible") credit$foreign <- factor(credit$foreign, levels=c("A201","A202"), labels=c("foreign","german")) credit$rent <- factor(credit$housing=="A151") credit$purpose <- factor(credit$purpose, levels=c("A40","A41","A42","A43","A44","A45","A46","A47","A48","A49","A410")) levels(credit$purpose) <- c("newcar","usedcar",rep("goods/repair",4),"edu",NA,"edu","biz","biz") ## take the continuous variables duration, amount, installment, age ## with indicators the assumptions of a normal distribution would be ## tenuous at best; hence these variables are not considered here cred1=credit[,c("Default","duration","amount","installment","age")] cred1 summary(cred1) hist(cred1$duration) hist(cred1$amount) hist(cred1$installment) hist(cred1$age) cred1$Default cred1=data.frame(cred1) ## linear discriminant analysis ## class proportions of the training set used as prior probabilities zlin=lda(Default~.,cred1) predict(zlin,newdata=data.frame(duration=6,amount=1100,installment=4,age=67)) predict(zlin,newdata=data.frame(duration=6,amount=1100,installment=4,age=67))$class zqua=qda(Default~.,cred1) predict(zqua,newdata=data.frame(duration=6,amount=1100,installment=4,age=67)) predict(zqua,newdata=data.frame(duration=6,amount=1100,installment=4,age=67))$class n=1000 neval=1 errlin=dim(n) errqua=dim(n) ## leave one out evaluation for (k in 1:n) { train1=c(1:n) train=train1[train1!=k] ## linear discriminant analysis zlin=lda(Default~.,cred1[train,]) predict(zlin,cred1[-train,])$class tablin=table(cred1$Default[-train],predict(zlin,cred1[-train,])$class) errlin[k]=(neval-sum(diag(tablin)))/neval ## quadratic discriminant analysis zqua=qda(Default~.,cred1[train,]) predict(zqua,cred1[-train,])$class tablin=table(cred1$Default[-train],predict(zqua,cred1[-train,])$class) errqua[k]=(neval-sum(diag(tablin)))/neval } merrlin=mean(errlin) merrlin merrqua=mean(errqua) merrqua Example 2: Fisher Iris Data library(MASS) ## includes lda and qda for discriminant analysis set.seed(1) iris3 Iris=data.frame(rbind(iris3[,,1],iris3[,,2],iris3[,,3]),Sp=rep(c("s","c","v"),rep(50,3))) Iris ## linear discriminant analysis ## equal prior probabilities as same number from each species zlin=lda(Sp~.,Iris,prior=c(1,1,1)/3) predict(zlin,newdata=data.frame(Sepal.L.=5.1,Sepal.W.=3.5,Petal.L.=1.4, Petal.W.=0.2)) 
predict(zlin,newdata=data.frame(Sepal.L.=5.1,Sepal.W.=3.5,Petal.L.=1.4, Petal.W.=0.2))$class ## quadratic discriminant analysis zqua=qda(Sp~.,Iris,prior=c(1,1,1)/3) predict(zqua,newdata=data.frame(Sepal.L.=5.1,Sepal.W.=3.5,Petal.L.=1.4, Petal.W.=0.2)) predict(zqua,newdata=data.frame(Sepal.L.=5.1,Sepal.W.=3.5,Petal.L.=1.4, Petal.W.=0.2))$class n=150 nt=100 neval=n-nt rep=1000 errlin=dim(rep) errqua=dim(rep) for (k in 1:rep) { train=sample(1:n,nt) ## linear discriminant analysis m1=lda(Sp~.,Iris[train,],prior=c(1,1,1)/3) predict(m1,Iris[-train,])$class tablin=table(Iris$Sp[-train],predict(m1,Iris[-train,])$class) errlin[k]=(neval-sum(diag(tablin)))/neval ## quadratic discriminant analysis m2=qda(Sp~.,Iris[train,],prior=c(1,1,1)/3) predict(m2,Iris[-train,])$class tablin=table(Iris$Sp[-train],predict(m2,Iris[-train,])$class) errqua[k]=(neval-sum(diag(tablin)))/neval } merrlin=mean(errlin) merrlin merrqua=mean(errqua) merrqua Example 3: Forensic Glass Data library(MASS) ## includes lda and qda for discriminant analysis set.seed(1) data(fgl) glass=data.frame(fgl) glass ## linear discriminant analysis m1=lda(type~.,glass) m1 predict(m1,newdata=data.frame(RI=3.0,Na=13,Mg=4,Al=1,Si=70,K=0.06,Ca=9,Ba=0,Fe=0)) predict(m1,newdata=data.frame(RI=3.0,Na=13,Mg=4,Al=1,Si=70,K=0.06,Ca=9,Ba=0,Fe=0))$class ## quadratic discriminant analysis: Not enough data as each 9x9 ## covariance matrix includes (9)(8)/2 = 45 unknown coefficients n=length(fgl$type) nt=200 neval=n-nt rep=100 errlin=dim(rep) for (k in 1:rep) { train=sample(1:n,nt) glass[train,] ## linear discriminant analysis m1=lda(type~.,glass[train,]) predict(m1,glass[-train,])$class tablin=table(glass$type[-train],predict(m1,glass[-train,])$class) errlin[k]=(neval-sum(diag(tablin)))/neval } merrlin=mean(errlin) merrlin n=214 neval=1 errlin=dim(n) errqua=dim(n) for (k in 1:n) { train1=c(1:n) train=train1[train1!=k] ## linear discriminant analysis m1=lda(type~.,glass[train,]) predict(m1,glass[-train,])$class tablin=table(glass$type[-train],predict(m1,glass[-train,])$class) errlin[k]=(neval-sum(diag(tablin)))/neval } merrlin=mean(errlin) merrlin Example 4: MBA Admission Data library(MASS) set.seed(1) ## reading the data admit <- read.csv("C:/DataMining/Data/admission.csv") adm=data.frame(admit) adm plot(adm$GPA,adm$GMAT,col=adm$De) ## linear discriminant analysis m1=lda(De~.,adm) m1 predict(m1,newdata=data.frame(GPA=3.21,GMAT=497)) ## quadratic discriminant analysis m2=qda(De~.,adm) m2 predict(m2,newdata=data.frame(GPA=3.21,GMAT=497)) n=85 nt=60 neval=n-nt rep=100 errlin=dim(rep) for (k in 1:rep) { train=sample(1:n,nt) ## linear discriminant analysis m1=lda(De~.,adm[train,]) predict(m1,adm[-train,])$class tablin=table(adm$De[-train],predict(m1,adm[-train,])$class) errlin[k]=(neval-sum(diag(tablin)))/neval } merrlin=mean(errlin) merrlin CHAPTER 13: DECISION TREES Example 1: Prostate Cancer prostate <- read.csv("C:/DataMining/Data/prostate.csv") prostate library(tree) ## Construct the tree pstree <- tree(lcavol ~., data=prostate, mindev=0.1, mincut=1) pstree <- tree(lcavol ~., data=prostate, mincut=1) pstree plot(pstree, col=8) text(pstree, digits=2) pstcut <- prune.tree(pstree,k=1.7) plot(pstcut) pstcut pstcut <- prune.tree(pstree,k=2.05) plot(pstcut) pstcut pstcut <- prune.tree(pstree,k=3) plot(pstcut) pstcut pstcut <- prune.tree(pstree) pstcut plot(pstcut) pstcut <- prune.tree(pstree,best=3) pstcut plot(pstcut) ## Use cross-validation to prune the tree set.seed(2) cvpst <- cv.tree(pstree, K=10) cvpst$size cvpst$dev plot(cvpst, pch=21, bg=8, type="p", 
cex=1.5, ylim=c(65,100)) pstcut <- prune.tree(pstree, best=3) pstcut plot(pstcut, col=8) text(pstcut) ## Plot what we end up with plot(prostate[,c("lcp","lpsa")],cex=0.2*exp(prostate$lcavol)) abline(v=.261624, col=4, lwd=2) lines(x=c(-2,.261624), y=c(2.30257,2.30257), col=4, lwd=2) Example 2: Motorcycle Acceleration library(MASS) library(tree) data(mcycle) mcycle plot(accel~times,data=mcycle) mct <- tree(accel ~ times, data=mcycle) mct plot(mct, col=8) text(mct, cex=.75) ## we use different font size to avoid print overlap ## scatter plot of data with overlay of fitted function x=c(1:6000) x=x/100 y1=seq(-4.357,-4.357,length.out=1510) y2=seq(-39.120,-39.120,length.out=140) y3=seq(-86.31,-86.31,length.out=300) y4=seq(-114.7,-114.7,length.out=490) y5=seq(-42.49,-42.49,length.out=300) y6=seq(10.25,10.25,length.out=240) y7=seq(40.72,40.72,length.out=520) y8=seq(3.291,3.291,length.out=2500) y=c(y1,y2,y3,y4,y5,y6,y7,y8) plot(accel~times,data=mcycle) lines(y~x) Example 3: Fisher Iris Data Revisited library(MASS) library(tree) ## read in the iris data iris iristree <- tree(Species~.,data=iris) iristree plot(iristree) plot(iristree,col=8) text(iristree,digits=2) summary(iristree) irissnip=snip.tree(iristree,nodes=c(7,12)) irissnip plot(irissnip) text(irissnip) CHAPTER 14: FURTHER DISCUSSION ON REGRESSION AND CLASSIFICATION TREES, COMPUTER SOFTWARE, AND OTHER USEFUL CLASSIFICATION METHODS no programs CHAPTER 15: CLUSTERING Example 1: European Protein Consumption ### *** European Protein Consumption, in grams/person-day *** ### ## read in the data food <- read.csv("C:/DataMining/Data/protein.csv") food[1:3,] ## first, clustering on just Red and White meat (p=2) and k=3 clusters set.seed(1) ## to fix the random starting clusters grpMeat <- kmeans(food[,c("WhiteMeat","RedMeat")], centers=3, nstart=10) grpMeat ## list of cluster assignments o=order(grpMeat$cluster) data.frame(food$Country[o],grpMeat$cluster[o]) ## plotting cluster assignments on Red and White meat scatter plot plot(food$Red, food$White, type="n", xlim=c(3,19), xlab="Red Meat", ylab="White Meat") text(x=food$Red, y=food$White, labels=food$Country, col=grpMeat$cluster+1) ## same analysis, but now with clustering on all protein groups ## change the number of clusters to 7 set.seed(1) grpProtein <- kmeans(food[,-1], centers=7, nstart=10) o=order(grpProtein$cluster) data.frame(food$Country[o],grpProtein$cluster[o]) plot(food$Red, food$White, type="n", xlim=c(3,19), xlab="Red Meat", ylab="White Meat") text(x=food$Red, y=food$White, labels=food$Country, col=rainbow(7)[grpProtein$cluster]) Example 2: Monthly US Unemployment Rates ## read the data; series are stored column-wise with labels in first row raw <- read.csv("C:/DataMining/Data/unempstates.csv") raw[1:3,] ## time sequence plots of three series plot(raw[,5],type="l",ylim=c(0,12),xlab="month",ylab="unemployment rate") ## CA points(raw[,32],type="l", cex = .5, col = "dark red") ## New York points(raw[,15],type="l", cex = .5, col = "dark green") ## Iowa ## transpose the data ## then we have 50 rows (states) and 416 columns (time periods) rawt=matrix(nrow=50,ncol=416) rawt=t(raw) rawt[1:3,] ## k-means clustering in 416 dimensions set.seed(1) grpunemp2 <- kmeans(rawt, centers=2, nstart=10) sort(grpunemp2$cluster) grpunemp3 <- kmeans(rawt, centers=3, nstart=10) sort(grpunemp3$cluster) grpunemp4 <- kmeans(rawt, centers=4, nstart=10) sort(grpunemp4$cluster) grpunemp5 <- kmeans(rawt, centers=5, nstart=10) sort(grpunemp5$cluster) ## another analysis ## data set unemp.csv with means and standard 
deviations for each state ## k-means clustering on 2 dimensions (mean, stddev) unemp <- read.csv("C:/DataMining/Data/unemp.csv") unemp[1:3,] set.seed(1) grpunemp <- kmeans(unemp[,c("mean","stddev")], centers=3, nstart=10) ## list of cluster assignments o=order(grpunemp$cluster) data.frame(unemp$state[o],grpunemp$cluster[o]) plot(unemp$mean,unemp$stddev,type="n",xlab="mean", ylab="stddev") text(x=unemp$mean,y=unemp$stddev,labels=unemp$state, col=grpunemp$cluster+1) Example 3: European Protein Consumption Revisited (Mixture Model) library(mixtools) ## for a brief description of mvnormalmixEM ## mvnormalmixEM(x, lambda = NULL, mu = NULL, sigma = NULL, k = 2, ## arbmean = TRUE, arbvar = TRUE, epsilon = 1e-08, ## maxit = 10000, verb = FALSE) ## arbvar=FALSE same cov matrices ## arbvar=TRUE (default) different cov matrices ## arbmean=TRUE (default) different means ## k number of groups food <- read.csv("C:/DataMining/Data/protein.csv") ## Consider just Red and White meat clusters food[1:3,] X=cbind(food[,2],food[,3]) X[1:3,] set.seed(1) ## here we use an iterative procedure and the results in repeated runs may ## not be exactly the same ## set.seed(1) is used to obtain reproducible results ## mixtures of two normal distributions on the first 2 features ## we consider different variances out2<-mvnormalmixEM(X,arbvar=TRUE,k=2,epsilon=1e-02) out2 prob1=round(out2$posterior[,1],digits=3) prob2=round(out2$posterior[,2],digits=3) prob=round(out2$posterior[,1]) o=order(prob) data.frame(food$Country[o],prob1[o],prob2[o],prob[o]) plot(food$Red, food$White, type="n",xlab="Red Meat", ylab="White Meat") text(x=food$Red,y=food$White,labels=food$Country,col=prob+1) ## mixtures of two normal distributions on all 9 features ## we consider equal variances X1=cbind(food[,2],food[,3],food[,4],food[,5],food[,6],food[,7], food[,8],food[,9],food[,10]) X1[1:3,] set.seed(1) out2all<-mvnormalmixEM(X1,arbvar=FALSE,k=2,epsilon=1e-02) out2all prob1=round(out2all$posterior[,1],digits=3) prob2=round(out2all$posterior[,2],digits=3) prob=round(out2all$posterior[,1]) data.frame(food$Country,prob1,prob2,prob) o=order(prob) data.frame(food$Country[o],prob[o]) R program to create Figure 15.1 library(cluster) dis=matrix(nrow=5,ncol=5) dis[1,1]=0 dis[2,2]=0 dis[3,3]=0 dis[4,4]=0 dis[5,5]=0 dis[2,1]=9 dis[3,1]=3 dis[4,1]=6 dis[5,1]=11 dis[3,2]=7 dis[4,2]=5 dis[5,2]=10 dis[4,3]=9 dis[5,3]=2 dis[5,4]=8 dis[1,2]=dis[2,1] dis[1,3]=dis[3,1] dis[1,4]=dis[4,1] dis[1,5]=dis[5,1] dis[2,3]=dis[3,2] dis[2,4]=dis[4,2] dis[2,5]=dis[5,2] dis[3,4]=dis[4,3] dis[3,5]=dis[5,3] dis[4,5]=dis[5,4] plot(agnes(x=dis,diss=TRUE,metric="euclidean",method="single")) plot(agnes(x=dis,diss=TRUE,metric="euclidean",method="complete")) ## correction with dis[5,3]=9 dis=matrix(nrow=5,ncol=5) dis[1,1]=0 dis[2,2]=0 dis[3,3]=0 dis[4,4]=0 dis[5,5]=0 dis[2,1]=9 dis[3,1]=3 dis[4,1]=6 dis[5,1]=11 dis[3,2]=7 dis[4,2]=5 dis[5,2]=10 dis[4,3]=9 dis[5,3]=9 ## corrected dis[5,4]=8 dis[1,2]=dis[2,1] dis[1,3]=dis[3,1] dis[1,4]=dis[4,1] dis[1,5]=dis[5,1] dis[2,3]=dis[3,2] dis[2,4]=dis[4,2] dis[2,5]=dis[5,2] dis[3,4]=dis[4,3] dis[3,5]=dis[5,3] dis[4,5]=dis[5,4] plot(agnes(x=dis,diss=TRUE,metric="euclidean",method="single")) plot(agnes(x=dis,diss=TRUE,metric="euclidean",method="complete")) Example 4: European Protein Consumption Revisited (Agglomerative Clustering) library(cluster) ## Protein Data food <- read.csv("C:/DataMining/Data/protein.csv") food[1:3,] ## we use the program agnes in the package cluster ## argument diss=FALSE indicates that we use the dissimilarity ## matrix
that is being calculated from raw data. ## argument metric="euclidean" indicates that we use Euclidean distance ## no standardization is used as the default ## the default is "average" linkage ## first we consider just Red and White meat clusters food2=food[,c("WhiteMeat","RedMeat")] food2agg=agnes(food2,diss=FALSE,metric="euclidean") food2agg plot(food2agg) ## dendrogram food2agg$merge ## describes the sequential merge steps ## identical result obtained by first computing the distance matrix food2aggv=agnes(daisy(food2),metric="euclidean") plot(food2aggv) ## Using data on all nine variables (features) ## Euclidean distance and average linkage foodagg=agnes(food[,-1],diss=FALSE,metric="euclidean") plot(foodagg) ## dendrogram foodagg$merge ## describes the sequential merge steps ## Using data on all nine variables (features) ## Euclidean distance and single linkage foodaggsin=agnes(food[,-1],diss=FALSE,metric="euclidean",method="single") ## corrected plot(foodaggsin) ## dendrogram foodaggsin$merge ## describes the sequential merge steps ## Euclidean distance and complete linkage foodaggcomp=agnes(food[,-1],diss=FALSE,metric="euclidean",method="complete") ## corrected plot(foodaggcomp) ## dendrogram foodaggcomp$merge ## describes the sequential merge steps ## the same analysis with the default average linkage foodaggcomp=agnes(food[,-1],diss=FALSE,metric="euclidean") ## corrected plot(foodaggcomp) ## dendrogram foodaggcomp$merge ## describes the sequential merge steps ## try hclust. You need to calculate the distance matrix first. food food1=food[,-1] ## strips name d=dist(food1, method="euclidean") fit=hclust(d=d, method="average") ## "single", "complete","average", fit plot(fit, hang=-1) groups=cutree(fit, k=5) # "k=" defines the number of clusters you are using groups table(groups) rect.hclust(fit, k=5, border="red") # draw dendrogram with red borders around the 5 clusters Example 5: Monthly US Unemployment Rates (Agglomerative Clustering) library(cluster) ## US unemployment data raw <- read.csv("C:/DataMining/Data/unempstates.csv") raw[1:3,] rawt=matrix(nrow=50,ncol=416) rawt=t(raw) rawt[1:3,] ## transpose so that we have 50 rows (states) and 416 columns ## (time periods) ## Agglomerative clustering unemployment 50 states ### ## dissimilarity matrix calculated from the raw data.
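A small added check (using only the cluster package loaded above): the Euclidean dissimilarities that agnes forms from the raw data can be reproduced by hand, here for the first two states, and compared against the full dissimilarity matrix returned by daisy.
## Euclidean distance between the unemployment series of the first two states
sqrt(sum((rawt[1,]-rawt[2,])**2))
## the same value appears in the dissimilarity matrix computed by daisy (default metric is euclidean)
as.matrix(daisy(rawt))[1,2]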
## Euclidean distance and default "average" linkage outagg=agnes(rawt,diss=FALSE,metric="euclidean") plot(outagg) ## dendrogram outagg$merge ## describes the sequential merge steps ## we see about three clusters ## Cluster 1: AL, IL, OH, TN, KY, OR, WA, PA, IN, MO, WI, NC, NV, SC, ## AR, NM, ID, MT, TX, AZ, FL, GA, ME, NJ, NY, RI, CA ## Cluster 2: AK, LA, MS, WV, MI ## Cluster 3: CO, IA, MN, UT, KS, OK, WY, NE, SD, ND, CT, MA, DE, MD, ## VT, VA, NH, HI Example 6: Monthly US Unemployment Rates Revisited ## agglomerative clustering on the correlation between the series ## 2 versions: levels and differences library(cluster) raw <- read.csv("C:/DataMining/Data/unempstates.csv") raw[1:3,] ## Correlation on levels corlevel=cor(data.frame(raw)) disslevel=1-corlevel outcorlevel=agnes(disslevel,diss=TRUE,metric="euclidean",method="single") plot(outcorlevel) ## dendrogram; single linkage outcorlevel=agnes(disslevel,diss=TRUE,metric="euclidean",method="complete") plot(outcorlevel) ## dendrogram; complete linkage outcorlevel=agnes(disslevel,diss=TRUE,metric="euclidean") plot(outcorlevel) ## dendrogram; average linkage ## Correlation on differences X=matrix(nrow=415,ncol=50) for (j in 1:50) { for (i in 1:415) { X[i,j]=raw[i+1,j]-raw[i,j] } } colnames(X)=colnames(raw) cordiff=cor(data.frame(X)) dissdiff=1-cordiff outcordiff=agnes(dissdiff,diss=TRUE,metric="euclidean",method="single") plot(outcordiff) ## dendrogram; single linkage outcordiff=agnes(dissdiff,diss=TRUE,metric="euclidean",method="complete") plot(outcordiff) ## dendrogram; complete linkage outcordiff=agnes(dissdiff,diss=TRUE,metric="euclidean") plot(outcordiff) ## dendrogram; average linkage CHAPTER 16: MARKET BASKET ANALYSIS: ASSOCIATION RULES AND LIFT Example 1: Online Radio ### *** Play counts *** ### lastfm <- read.csv("C:/DataMining/Data/lastfm.csv") lastfm[1:19,] length(lastfm$user) ## 289,955 records in the file lastfm$user <- factor(lastfm$user) levels(lastfm$user) ## 15,000 users levels(lastfm$artist) ## 1,004 artists library(arules) ## a-rules package for association rules ## Computational environment for mining association rules and ## frequent item sets ## we need to manipulate the data a bit for arules playlist <- split(x=lastfm[,"artist"],f=lastfm$user) ## split into a list of users playlist <- lapply(playlist,unique) ## remove artist duplicates playlist[1:2] ## the first two listeners (1 and 3) listen to the following bands playlist <- as(playlist,"transactions") ## view this as a list of "transactions" ## transactions is a data class defined in arules itemFrequency(playlist) ## lists the support of the 1,004 bands ## number of times a band is listened to on the shopping trips of the 15,000 users ## computes the relative frequency with which each artist is mentioned by the 15,000 users itemFrequencyPlot(playlist,support=.08,cex.names=1.5) ## plots the item frequencies (only bands with more than 8% support) ## Finally, we build the association rules ## only rules with support > 0.01 and confidence > .50 ## so it can’t be a super rare band musicrules <- apriori(playlist,parameter=list(support=.01,confidence=.5)) inspect(musicrules) ## let's filter by lift > 5.
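A reminder of what lift measures (added note): lift(A => B) = support(A and B) / (support(A) x support(B)), so lift > 1 means the artists in a rule are listened to together more often than independence would predict, and lift > 5 is a strong association. The quality() function in arules lists support, confidence, and lift (and, depending on the arules version, further measures) for each rule:
quality(musicrules)[1:5,] ## support, confidence and lift of the first few rules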
## Among those associations with support > 0.01 and confidence > .50, ## only show those with lift > 5 inspect(subset(musicrules, subset=lift > 5)) ## lastly, order by confidence to make it easier to understand inspect(sort(subset(musicrules, subset=lift > 5), by="confidence")) Example 2: Predicting Income library(arules) data(AdultUCI) dim(AdultUCI) AdultUCI[1:3,] AdultUCI[["fnlwgt"]] <- NULL AdultUCI[["education-num"]] <- NULL AdultUCI[["age"]] <- ordered(cut(AdultUCI[["age"]], c(15, 25, 45, 65, 100)), labels = c("Young", "Middle-aged", "Senior", "Old")) AdultUCI[["hours-per-week"]] <- ordered(cut(AdultUCI[["hours-per-week"]], c(0, 25, 40, 60, 168)), labels = c("Part-time", "Full-time", "Over-time", "Workaholic")) AdultUCI[["capital-gain"]] <- ordered(cut(AdultUCI[["capital-gain"]], c(-Inf, 0, median(AdultUCI[["capital-gain"]][AdultUCI[["capital-gain"]] > 0]), Inf)), labels = c("None", "Low", "High")) AdultUCI[["capital-loss"]] <- ordered(cut(AdultUCI[["capital-loss"]], c(-Inf, 0, median(AdultUCI[["capital-loss"]][AdultUCI[["capital-loss"]] > 0]), Inf)), labels = c("none", "low", "high")) Adult <- as(AdultUCI, "transactions") Adult summary(Adult) aa=as(Adult,"matrix") # transforms transaction matrix into incidence matrix aa[1:2,] # print the first two rows of the incidence matrix itemFrequencyPlot(Adult[, itemFrequency(Adult) > 0.2], cex.names = 1) rules <- apriori(Adult, parameter = list(support = 0.01, confidence = 0.6)) rules summary(rules) rulesIncomeSmall <- subset(rules, subset = rhs %in% "income=small" & lift > 1.2) inspect(sort(rulesIncomeSmall, by = "confidence")[1:3]) rulesIncomeLarge <- subset(rules, subset = rhs %in% "income=large" & lift > 1.2) inspect(sort(rulesIncomeLarge, by = "confidence")[1:3]) CHAPTER 17: DIMENSION-REDUCTION: FACTOR MODELS AND PRINCIPAL COMPONENTS Example 1: European Protein Consumption food <- read.csv("C:/DataMining/Data/protein.csv") food ## correlation matrix cor(food[,-1]) pcafood <- prcomp(food[,-1], scale=TRUE) ## we strip the first column (country labels) from the data set ## scale = TRUE: variables are first standardized. Default is FALSE pcafood foodpc <- predict(pcafood) foodpc ## how many principal components do we need? plot(pcafood, main="") mtext(side=1, "European Protein Principal Components", line=1, font=2) ## how do the PCs look? 
par(mfrow=c(1,2)) plot(foodpc[,1:2], type="n", xlim=c(-4,5)) text(x=foodpc[,1], y=foodpc[,2], labels=food$Country) plot(foodpc[,3:4], type="n", xlim=c(-3,3)) text(x=foodpc[,3], y=foodpc[,4], labels=food$Country) pcafood$rotation[,2] Example 2: Monthly US Unemployment Rates library(cluster) ## needed for cluster analysis states=c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ", "NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT", "VA","WA","WV","WI","WY") states raw <- read.csv("C:/DataMining/Data/unempstates.csv") raw[1:3,] ## transpose so that we have 50 rows (states) and 416 columns rawt=matrix(nrow=50,ncol=416) rawt=t(raw) rawt[1:3,] pcaunemp <- prcomp(rawt,scale=FALSE) pcaunemp plot(pcaunemp, main="") mtext(side=1,"Unemployment: 50 states",line=1,font=2) pcaunemp$rotation[,1] pcaunemp$rotation[1:10,1] ## just the first 10 values ave=dim(416) for (j in 1:416) { ave[j]=mean(rawt[,j]) } par(mfrow=c(1,2)) plot(-pcaunemp$rotation[,1]) ## plot negative loadings for first principal comp ## plot monthly averages of unemployment rates plot(ave,type="l",ylim=c(3,10),xlab="month",ylab="ave unemployment rate") abs(cor(ave,pcaunemp$rotation[,1])) pcaunemp$rotation[,2] pcaunemp$rotation[,3] ## below we obtain the scores of the principal components ## the first 2-3 principal components do a good job unemppc <- predict(pcaunemp) unemppc ## below we construct a scatter plot of the first two principal components ## we assess whether an informal clustering on the first two principal components ## would have led to a clustering similar to the results of the ## k-means clustering approach applied to all 416 dimensions ## the graph indicates that it does set.seed(1) grpunemp3 <- kmeans(rawt,centers=3,nstart=10) par(mfrow=c(1,1)) plot(unemppc[,1:2],type="n") text(x=unemppc[,1],y=unemppc[,2],labels=states,col=rainbow(7)[grpunemp3$cluster]) CHAPTER 18: REDUCING THE DIMENSION IN REGRESSIONS WITH MULTICOLLINEAR INPUTS: PRINCIPAL COMPONENTS REGRESSION AND PARTIAL LEAST SQUARES Example 1: Generated Data ## PLS algorithm, following algorithm 3.3 in Hastie et al ## standardize X’s.
PLS depends on scale ## we can’t use too many partial least squares directions (nc), ## otherwise we run into numerical problems ## here we simulate observations set.seed(1) nrow=400 ## row dimension of X ncol=100 ## column dimension of X nc=2 ## number of PLS directions nc1=nc+1 Y1=dim(nrow) X=matrix(nrow=nrow,ncol=ncol) X1=matrix(nrow=nrow,ncol=ncol) Y=matrix(nrow=nrow,ncol=nc1) Z=matrix(nrow=nrow,ncol=nc) F=matrix(nrow=nrow,ncol=ncol) FN=matrix(nrow=nrow,ncol=ncol) me=dim(ncol) s=dim(ncol) ## enter data into matrix X1 and column Y1 ## data simulation for (jj in 1:ncol) { X1[,jj]=rnorm(nrow) } Y1=rnorm(nrow) Y1=2*Y1+6 ## standardization for (j in 1:ncol) { me[j]=mean(X1[,j]) s[j]=sd(X1[,j]) } for (j in 1:ncol) { for (i in 1:nrow) { X[i,j]=(X1[i,j]-me[j])/s[j] } } ## Algorithm 3.3 starts y=Y1 F=X Y[,1]=mean(y)*y/y ## initializes the fitted values at the mean of y for (k in 1:nc) { phi=t(F)%*%y Z[,k]=F%*%phi fra=(t(Z[,k])%*%y)/(t(Z[,k])%*%Z[,k]) Y[,k+1]=Y[,k]+fra*Z[,k] for (j in 1:ncol) { fru=(t(Z[,k])%*%F[,j])/(t(Z[,k])%*%Z[,k]) FN[,j]=F[,j]-fru*Z[,k] } F=FN } fp=Y[,nc+1] ## Algorithm 3.3 ends cor(y,fp)**2 ZZ=data.frame(Z[,1:nc]) m1=lm(y~.,data=ZZ) cor(y,m1$fitted)**2 XX=data.frame(X) mall=lm(y~.,data=XX) cor(y,mall$fitted)**2 ## even with few PLS directions, R**2 of largest model is ## approached very quickly ## comparison with library(mixOmics) library(mixOmics) mpls=pls(X1,Y1,ncomp=2,mode="classic") x1=mpls$variates$X[,1] x2=mpls$variates$X[,2] m3=lm(y~x1+x2) cor(y,m3$fitted)**2 fmpls=m3$fitted ## fmpls and fp (and m1$fitted) are all the same Example 2: Predicting Next Month’s Unemployment Rate of a Certain State from Past Unemployment Rates of all 50 States library(mixOmics) nrow=412 ## row dimension of X ncol=200 ## column dimension of X nstates=50 ## number of states X=matrix(nrow=nrow,ncol=ncol) Y=matrix(nrow=nrow,ncol=nstates) raw <- read.csv("C:/DataMining/Data/unempstates.csv") raw[1:3,] X=matrix(nrow=412,ncol=200) Y=matrix(nrow=412,ncol=50) for (j in 1:50) { for (i in 1:412) { Y[i,j]=raw[i+4,j] } } for (j in 1:50) { for (i in 1:412) { X[i,j]=raw[i+3,j] X[i,j+50]=raw[i+2,j] X[i,j+100]=raw[i+1,j] X[i,j+150]=raw[i,j] } } nc=1 ## number of PLS directions ## pls on nc components mpls=pls(X,Y[,1],ncomp=nc,mode="classic") m1=lm(Y[,1]~.,data.frame(mpls$variates$X)) summary(m1) cor(Y[,1],m1$fitted)**2 nc=2 ## number of PLS directions ## pls on nc components mpls=pls(X,Y[,1],ncomp=nc,mode="classic") m2=lm(Y[,1]~.,data.frame(mpls$variates$X)) summary(m2) cor(Y[,1],m2$fitted)**2 nc=3 ## number of PLS directions ## pls on nc components mpls=pls(X,Y[,1],ncomp=nc,mode="classic") m3=lm(Y[,1]~.,data.frame(mpls$variates$X)) summary(m3) cor(Y[,1],m3$fitted)**2 ## regression on all columns of X mreg=lm(Y[,1]~.,data.frame(X)) mreg cor(Y[,1],mreg$fitted)**2 Example 3: Predicting Next Month’s Unemployment Rate.
Comparing Several Methods in Terms of their Out-Of-Sample Prediction Performance R Code for predicting levels ## the program will run for some time library(mixOmics) library(lars) set.seed(1) nrow=412 ## row dimension of X ncol=200 ## column dimension of X nstates=50 ## number of states nc=10 ## number of PLS directions X=matrix(nrow=nrow,ncol=ncol) Y=matrix(nrow=nrow,ncol=nstates) raw <- read.csv("C:/DataMining/Data/unempstates.csv") raw[1:3,] X=matrix(nrow=412,ncol=200) Y=matrix(nrow=412,ncol=50) for (j in 1:50) { for (i in 1:412) { Y[i,j]=raw[i+4,j] } } for (j in 1:50) { for (i in 1:412) { X[i,j]=raw[i+3,j] X[i,j+50]=raw[i+2,j] X[i,j+100]=raw[i+1,j] X[i,j+150]=raw[i,j] } } KK=50 sSAR=dim(KK) ## univariate (single) AR(4) sVARL25=dim(KK) ## VAR(4) under various Lasso constraints sVARL50=dim(KK) sVARL75=dim(KK) sVARL100=dim(KK) sPC10=dim(KK) ## regression on 10 principal components sPC200=dim(KK) ## regression on 200 principal components sPLS10=dim(KK) ## partial least squares with 10 PLS directions sPLS100=dim(KK) ## partial least squares with 100 PLS directions predmpc=dim(25) ## We select 25 periods as the evaluation (holdout) sample ## We repeat this KK = 50 times ## We calculate the root mean square forecast error ## from the 25 periods and the 50 states for (jj in 1:KK) { eval=sample(1:412,25) ## Forecasting from individual (univariate) AR(4) models: s=0 for (i in 1:50) { y=Y[-eval,i] p1=X[-eval,i] p2=X[-eval,i+50] p3=X[-eval,i+100] p4=X[-eval,i+150] mSAR=lm(y~p1+p2+p3+p4) pr1=X[eval,i] pr2=X[eval,i+50] pr3=X[eval,i+100] pr4=X[eval,i+150] new=data.frame(p1=pr1,p2=pr2,p3=pr3,p4=pr4) predSAR=predict(mSAR,new) s=s+sum((Y[eval,i]-predSAR)**2) } sSAR[jj]=sqrt(s/(50*25)) ## Forecasting from VAR(4) models and various LASSO constraints s1=0 s2=0 s3=0 s4=0 for (i in 1:50) { lasso=lars(X[-eval,],Y[-eval,i]) predLASSO25=predict(lasso,X[eval,],s=.25,mode="fraction") s1=s1+sum((Y[eval,i]-predLASSO25$fit)**2) predLASSO50=predict(lasso,X[eval,],s=0.50,mode="fraction") s2=s2+sum((Y[eval,i]-predLASSO50$fit)**2) predLASSO75=predict(lasso,X[eval,],s=0.75,mode="fraction") s3=s3+sum((Y[eval,i]-predLASSO75$fit)**2) predLASSO100=predict(lasso,X[eval,],s=1.00,mode="fraction") s4=s4+sum((Y[eval,i]-predLASSO100$fit)**2) } sVARL25[jj]=sqrt(s1/(50*25)) sVARL50[jj]=sqrt(s2/(50*25)) sVARL75[jj]=sqrt(s3/(50*25)) sVARL100[jj]=sqrt(s4/(50*25)) ## Forecasting from regressions on first 10 principal components: pcaX <- prcomp(X[-eval,]) pred=predict(pcaX,data.frame(X[-eval,])) pred=data.frame(pred) names(pred)=paste("p", 1:200, sep="") pred1=predict(pcaX,data.frame(X[eval,])) s=0 for (i in 1:50) { mpc=lm(Y[-eval,i]~p1+p2+p3+p4+p5+p6+p7+p8+p9+p10,data.frame(pred)) g=mpc$coef for (j in 1:25) { h=pred1[j,1:10] h1=c(1,h) predmpc[j]=sum(h1*g) } s=s+sum((Y[eval,i]-predmpc)**2) } sPC10[jj]=sqrt(s/(50*25)) ## Forecasting from regressions on all 200 principal components s=0 for (i in 1:50) { mpc=lm(Y[-eval,i]~.,data.frame(pred)) g=mpc$coef for (j in 1:25) { h=pred1[j,] h1=c(1,h) predmpc[j]=sum(h1*g) } s=s+sum((Y[eval,i]-predmpc)**2) } sPC200[jj]=sqrt(s/(50*25)) ## Forecasting from regressions on first 10 PLS components: s=0 for (i in 1:50) { m1=pls(X[-eval,],Y[-eval,i],ncomp=10,mode="classic") p1=m1$variates$X[,1] p2=m1$variates$X[,2] p3=m1$variates$X[,3] p4=m1$variates$X[,4] p5=m1$variates$X[,5] p6=m1$variates$X[,6] p7=m1$variates$X[,7] p8=m1$variates$X[,8] p9=m1$variates$X[,9] p10=m1$variates$X[,10] mpc=lm(Y[-eval,i]~p1+p2+p3+p4+p5+p6+p7+p8+p9+p10) g=mpc$coef pre1=predict(m1,X[eval,]) for (j in 1:25) { h=pre1$variates[j,1:10] 
h1=c(1,h) predmpc[j]=sum(h1*g) } s=s+sum((Y[eval,i]-predmpc)**2) } sPLS10[jj]=sqrt(s/(50*25)) ## Forecasting from regressions on first 100 PLS components: s=0 for (i in 1:50) { m1=pls(X[-eval,],Y[-eval,i],ncomp=100,mode="classic") ppp=data.frame(m1$variates$X) mpc=lm(Y[-eval,i]~.,data=ppp) g=mpc$coef pre1=predict(m1,X[eval,]) for (j in 1:25) { h=pre1$variates[j,1:100] h1=c(1,h) predmpc[j]=sum(h1*g) } s=s+sum((Y[eval,i]-predmpc)**2) } sPLS100[jj]=sqrt(s/(50*25)) } ## Output sSAR sVARL25 sVARL50 sVARL75 sVARL100 sPC10 sPC200 sPLS10 sPLS100 mean(sSAR) mean(sVARL25) mean(sVARL50) mean(sVARL75) mean(sVARL100) mean(sPC10) mean(sPC200) mean(sPLS10) mean(sPLS100) boxplot(sSAR,sVARL25,sVARL50,sVARL75,sVARL100,sPC10,sPC200,sPLS10,sPLS100,ylab="RMSE",xlab=" AR(4) VAR(4) VAR(4) VAR(4) VAR(4) PCA10 PCA200 PLS10 PLS100",sub=" AR(4) s=0.25 s=0.50 s=0.75 s=1.00 PCA10 PCA200 PLS10 PLS100") R Code for predicting changes ## the program will run for some time library(mixOmics) library(lars) set.seed(1) nrow=412 ## row dimension of X ncol=200 ## column dimension of X nstates=50 ## number of states nc=10 ## number of PLS directions X=matrix(nrow=nrow,ncol=ncol) Y=matrix(nrow=nrow,ncol=nstates) raw <- read.csv("C:/DataMining/Data/unempstates.csv") raw[1:3,] X1=matrix(nrow=412,ncol=200) Y1=matrix(nrow=412,ncol=50) X=matrix(nrow=411,ncol=200) Y=matrix(nrow=411,ncol=50) ## defining the data matrices for (j in 1:50) { for (i in 1:412) { Y1[i,j]=raw[i+4,j] } } for (j in 1:50) { for (i in 1:412) { X1[i,j]=raw[i+3,j] X1[i,j+50]=raw[i+2,j] X1[i,j+100]=raw[i+1,j] X1[i,j+150]=raw[i,j] } } ## calculating differences for (j in 1:200) { for (i in 1:411) { X[i,j]=X1[i+1,j]-X1[i,j] } } for (j in 1:50) { for (i in 1:411) { Y[i,j]=Y1[i+1,j]-Y1[i,j] } } KK=50 sSAR=dim(KK) ## univariate (single) AR(4) sVARL25=dim(KK) ## VAR(4) under various Lasso constraints sVARL50=dim(KK) sVARL75=dim(KK) sVARL100=dim(KK) sPC10=dim(KK) ## regression on 10 principal components sPC200=dim(KK) ## regression on 200 principal components sPLS10=dim(KK) ## partial least squares with 10 PLS directions sPLS100=dim(KK) ## partial least squares with 100 PLS directions predmpc=dim(25) ## We select 25 periods as the evaluation (holdout) sample ## We repeat this KK = 50 times ## We calculate the root mean square forecast error ## from the 25 periods and the 50 states for (jj in 1:KK) { eval=sample(1:411,25) ## Forecasting from individual (univariate) AR(4) models: s=0 for (i in 1:50) { y=Y[-eval,i] p1=X[-eval,i] p2=X[-eval,i+50] p3=X[-eval,i+100] p4=X[-eval,i+150] mSAR=lm(y~p1+p2+p3+p4) pr1=X[eval,i] pr2=X[eval,i+50] pr3=X[eval,i+100] pr4=X[eval,i+150] new=data.frame(p1=pr1,p2=pr2,p3=pr3,p4=pr4) predSAR=predict(mSAR,new) s=s+sum((Y[eval,i]-predSAR)**2) } sSAR[jj]=sqrt(s/(50*25)) ## Forecasting from VAR(4) models and various LASSO constraints s1=0 s2=0 s3=0 s4=0 for (i in 1:50) { lasso=lars(X[-eval,],Y[-eval,i]) predLASSO25=predict(lasso,X[eval,],s=.25,mode="fraction") s1=s1+sum((Y[eval,i]-predLASSO25$fit)**2) predLASSO50=predict(lasso,X[eval,],s=0.50,mode="fraction") s2=s2+sum((Y[eval,i]-predLASSO50$fit)**2) predLASSO75=predict(lasso,X[eval,],s=0.75,mode="fraction") s3=s3+sum((Y[eval,i]-predLASSO75$fit)**2) predLASSO100=predict(lasso,X[eval,],s=1.00,mode="fraction") s4=s4+sum((Y[eval,i]-predLASSO100$fit)**2) } sVARL25[jj]=sqrt(s1/(50*25)) sVARL50[jj]=sqrt(s2/(50*25)) sVARL75[jj]=sqrt(s3/(50*25)) sVARL100[jj]=sqrt(s4/(50*25)) ## Forecasting from regressions on first 10 principal components: pcaX <- prcomp(X[-eval,]) pred=predict(pcaX,data.frame(X[-eval,])) 
pred=data.frame(pred) names(pred)=paste("p", 1:200, sep="") pred1=predict(pcaX,data.frame(X[eval,])) s=0 for (i in 1:50) { mpc=lm(Y[-eval,i]~p1+p2+p3+p4+p5+p6+p7+p8+p9+p10,data.frame(pred)) g=mpc$coef for (j in 1:25) { h=pred1[j,1:10] h1=c(1,h) predmpc[j]=sum(h1*g) } s=s+sum((Y[eval,i]-predmpc)**2) } sPC10[jj]=sqrt(s/(50*25)) ## Forecasting from regressions on all 200 principal components s=0 for (i in 1:50) { mpc=lm(Y[-eval,i]~.,data.frame(pred)) g=mpc$coef for (j in 1:25) { h=pred1[j,] h1=c(1,h) predmpc[j]=sum(h1*g) } s=s+sum((Y[eval,i]-predmpc)**2) } sPC200[jj]=sqrt(s/(50*25)) ## Forecasting from regressions on first 10 PLS components: s=0 for (i in 1:50) { m1=pls(X[-eval,],Y[-eval,i],ncomp=10,mode="classic") p1=m1$variates$X[,1] p2=m1$variates$X[,2] p3=m1$variates$X[,3] p4=m1$variates$X[,4] p5=m1$variates$X[,5] p6=m1$variates$X[,6] p7=m1$variates$X[,7] p8=m1$variates$X[,8] p9=m1$variates$X[,9] p10=m1$variates$X[,10] mpc=lm(Y[-eval,i]~p1+p2+p3+p4+p5+p6+p7+p8+p9+p10) g=mpc$coef pre1=predict(m1,X[eval,]) for (j in 1:25) { h=pre1$variates[j,1:10] h1=c(1,h) predmpc[j]=sum(h1*g) } s=s+sum((Y[eval,i]-predmpc)**2) } sPLS10[jj]=sqrt(s/(50*25)) ## Forecasting from regressions on first 100 PLS components: s=0 for (i in 1:50) { m1=pls(X[-eval,],Y[-eval,i],ncomp=100,mode="classic") ppp=data.frame(m1$variates$X) mpc=lm(Y[-eval,i]~.,data=ppp) g=mpc$coef pre1=predict(m1,X[eval,]) for (j in 1:25) { h=pre1$variates[j,1:100] h1=c(1,h) predmpc[j]=sum(h1*g) } s=s+sum((Y[eval,i]-predmpc)**2) } sPLS100[jj]=sqrt(s/(50*25)) } ## Output sSAR sVARL25 sVARL50 sVARL75 sVARL100 sPC10 sPC200 sPLS10 sPLS100 mean(sSAR) mean(sVARL25) mean(sVARL50) mean(sVARL75) mean(sVARL100) mean(sPC10) mean(sPC200) mean(sPLS10) mean(sPLS100) boxplot(sSAR,sVARL25,sVARL50,sVARL75,sVARL100,sPC10,sPC200,sPLS10,sPLS100,ylab="RMSE",xlab=" AR(4) VAR(4) VAR(4) VAR(4) VAR(4) PCA10 PCA200 PLS10 PLS100",sub=" AR(4) s=0.25 s=0.50 s=0.75 s=1.00 PCA10 PCA200 PLS10 PLS100") CHAPTER 19: TEXT AS DATA: TEXT MINING AND SENTIMENT ANALYSIS Example 1: Restaurant Reviews library(textir) data(we8there) ## 6166 reviews and 2640 bigrams dim(we8thereCounts) dimnames(we8thereCounts) dim(we8thereRatings) we8thereRatings[1:3,] ## ratings (restaurants ordered on overall rating from 5 to 1) as.matrix(we8thereCounts) as.matrix(we8thereCounts)[12,400] ## count for bigram 400 in review 12 ## get to know what’s in the matrix g1=min(as.matrix(we8thereCounts)[,]) ## min count over reviews/bigrams g2=max(as.matrix(we8thereCounts)[,]) ## max count over reviews/bigrams g1 g2 ## a certain bigram was mentioned in a certain review 13 times hh=as.matrix(we8thereCounts)[,1000] hh ## here we look at the frequencies of the bigram in column 1000 ## the data are extremely sparce overall=as.matrix(we8thereRatings[,5]) ## overall rating ## we determine frequencies of the 2640 different bigrams ## this will take some time nn=2640 cowords=dim(nn) for (i in 1:nn) { cowords[i]=sum(as.matrix(we8thereCounts)[,i]) } cowords cowords[7] plot(sort(cowords,decreasing=TRUE)) ## analysis per review ## we determine the frequencies of bigrams per review ## this will take some time nn=6166 coreview=dim(nn) for (i in 1:nn) { coreview[i]=sum(as.matrix(we8thereCounts)[i,]) } plot(sort(coreview,decreasing=TRUE)) ## Multinomial logistic regression and fitted reduction we8mnlm=mnlm(we8thereCounts,overall,bins=5) ## bins: for faster inference if covariates are factors ## covariate is a factor with 5 levels we8mnlm we8mnlm$intercept ## estimates of alphas we8mnlm$loadings ## estimates of betas 
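A short note on the model behind mnlm (the internal scaling details can differ across textir versions): the probability that a bigram token in review i is bigram j is modeled as proportional to exp(alpha_j + beta_j * overall_i), and the fitted values examined below are these probabilities multiplied by the total bigram count of the review. One consequence that can be checked directly:
## the fitted counts of a review should add up to its observed total bigram count
sum(as.matrix(fitted(we8mnlm))[1,]) ## fitted total for review 1
sum(as.matrix(we8thereCounts)[1,]) ## observed total for review 1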
fitted(we8mnlm) as.matrix(fitted(we8mnlm))[1,] ## fitted counts for first review ## following provides fitted multinomial probabilities pred=predict(we8mnlm,overall,type="response") pred[1,] ## predicted multinomial probs for review 1 sum(pred[1,]) ## must add to one ## the following computes the inverse prediction (fitted reduction) predinv=predict(we8mnlm,we8thereCounts,type="reduction") predinv[1:10] ## prints predicted ratings for first 10 reviews plot(predinv) plot(predinv~overall) corr(predinv,overall) boxplot(predinv~overall) ## procedure works. Predicted ratings increase with actual ratings ## question of cutoff. Which cutoff to use for excellent review? ## ROC curve for classification of y with p roc <- function(p,y){ y <- factor(y) n <- length(p) p <- as.vector(p) Q <- p > matrix(rep(seq(0,1,length=500),n),ncol=500,byrow=TRUE) fp <- colSums((y==levels(y)[1])*Q)/sum(y==levels(y)[1]) tp <- colSums((y==levels(y)[2])*Q)/sum(y==levels(y)[2]) plot(fp, tp, xlab="1-Specificity", ylab="Sensitivity") abline(a=0,b=1,lty=2,col=8) } c2=overall==4 c3=overall==5 c=c2+c3 min=min(predinv) max=max(predinv) pp=(predinv-min)/(max-min) ## plot of ROC curve roc(p=pp, y=c) cut <- 0 truepos <- c==1 & predinv>=cut trueneg <- c==0 & predinv<cut ## if the fitted reduction predinv is larger than the cutoff of 0, conclude overall quality rating 4 or 5 Example 2: Political Sentiment library(textir) data(congress109) ## 529 speakers 1000 trigrams dimnames(congress109Counts) as.matrix(congress109Counts)[1,] ## Chris Cannon’s counts as.matrix(congress109Counts)[,1] ## "gifted.talented.student" counts congress109Ideology as.matrix(congress109Ideology)[,1] repshare=as.matrix(congress109Ideology[,5]) repshare ## Republican vote share ## get to know what is in the matrix g1=min(as.matrix(congress109Counts)[,]) g2=max(as.matrix(congress109Counts)[,]) g1 g2 ## a certain trigram was mentioned by a certain speaker 631 times hh=as.matrix(congress109Counts)[,1000] hh ## here we look at the frequencies of the trigram in column 1000 ## Multinomial logistic regression and fitted reduction congmnlm=mnlm(congress109Counts,repshare) ## this may take some time congmnlm congmnlm$intercept ## estimates of alphas congmnlm$loadings ## estimates of betas fitted(congmnlm) as.matrix(fitted(congmnlm))[1,] ## fitted counts for first rep maxf=max(as.matrix(fitted(congmnlm))[1,]) maxf maxc=max(as.matrix(congress109Counts)[1,]) maxc ## following provides fitted multinomial probabilities pred=predict(congmnlm,repshare,type="response") pred[1,] ## predicted multinomial probs for first rep ## the following computes the inverse prediction (fitted reduction) predinv=predict(congmnlm,congress109Counts,type="reduction") predinv[1:10] ## prints predicted ratings for first 10 reps plot(predinv~repshare) plot(repshare~predinv) corr(predinv,repshare) model1=lm(repshare~predinv) model1 plot(repshare~predinv) abline(model1) Appendix: Relationship between the Gentzkow/Shapiro Estimate of “Slant” and Partial Least Squares library(textir) data(congress109) ## data from Gentzkow/Shapiro ## Gentzkow/Shapiro slant (unstandardized relative frequencies) a=dim(529) b=dim(529) d=dim(1000) hh=as.matrix(freq(congress109Counts)) x=congress109Ideology$repshare for (j in 1:1000) { m1=lm(hh[,j]~x) a[j]=m1$coef[1] b[j]=m1$coef[2] } for (i in 1:529) { d[i]=sum((hh[i,]-a)*b) } cor(d,x)**2 ## Gentzkow/Shapiro slant (standardized relative frequencies) hh=as.matrix(freq(congress109Counts)) for (j in 1:1000) { hh[,j]=(hh[,j]-mean(hh[,j]))/sd(hh[,j]) } x=congress109Ideology$repshare for (j in 1:1000) { m1=lm(hh[,j]~x) a[j]=m1$coef[1] b[j]=m1$coef[2] } for (i in
1:529) { d[i]=sum((hh[i,]-a)*b) } cor(d,x)**2 ## Using PLS (textir) on first partial least squares direction ## scaling FALSE means unstandardized relative frequencies are used library(textir) fit=pls(freq(congress109Counts),congress109Ideology$repshare,scale=FALSE,K=1) cor(congress109Ideology$repshare,fit$fitted)**2 ## Using PLS (textir) on first partial least squares direction ## scaling TRUE means standardized relative frequencies ## mean zero and variance 1 library(textir) fit=pls(freq(congress109Counts),congress109Ideology$repshare,scale=TRUE,K=1) cor(congress109Ideology$repshare,fit$fitted)**2 ## Using PLS (mixOmics) on first partial least squares direction ## standardized relative frequencies (mean zero and variance 1) library(mixOmics) mpls=pls(freq(congress109Counts),congress109Ideology$repshare,ncomp=1,mode="classic",freqCut=0.000001,uniqueCut=0.000001) x1=mpls$variates$X[,1] m1=lm(congress109Ideology$repshare~x1) fmpls=m1$fitted cor(x,m1$fitted)**2 CHAPTER 20: NETWORK DATA library(igraph) m=matrix(nrow=3,ncol=3) m[1,1]=0 m[1,2]=1 m[1,3]=1 m[2,1]=1 m[2,2]=0 m[2,3]=0 m[3,1]=0 m[3,2]=1 m[3,3]=0 m lab=c(1,2,3) object <- graph.adjacency(m,mode="directed") set.seed(1) plot(object,vertex.label=lab) Example 1: Marriage and Power in 15th Century Florence library(igraph) ## load the package ## read the data florence <- as.matrix(read.csv("C:/DataMining/Data/firenze.csv")) florence marriage <- graph.adjacency(florence,mode="undirected", diag=FALSE) ## use the help function to understand the options for the graph set.seed(1) plot(marriage,layout=layout.fruchterman.reingold,vertex.label=V(marriage)$name,vertex.color="red",vertex.label.color="black", vertex.frame.color=0,vertex.label.cex=1.5) data.frame(V(marriage)$name,degree(marriage)) ## calculate and plot the shortest paths V(marriage)$color <- 8 E(marriage)$color <- 8 PtoA <- get.shortest.paths(marriage, from="Peruzzi", to="Acciaiuoli") E(marriage, path=PtoA$vpath[[1]])$color <- "magenta" V(marriage)[PtoA$vpath[[1]] ]$color <- "magenta" GtoS <- get.shortest.paths(marriage, from="Ginori", to="Strozzi") E(marriage, path=GtoS$vpath[[1]])$color <- "green" V(marriage)[ GtoS$vpath[[1]] ]$color <- "green" V(marriage)[ "Medici" ]$color <- "cyan" set.seed(1) plot(marriage, layout=layout.fruchterman.reingold, vertex.label=V(marriage)$name,vertex.label.color="black", vertex.frame.color=0, vertex.label.cex=1.5) data.frame(V(marriage)$name, betweenness(marriage)) Example 2: Connections in a Friendship Network library(statnet) data(faux.mesa.high) ## load the network object summary(faux.mesa.high) ## summarize the data set lab=network.vertex.names(faux.mesa.high)=c(1:205) ## assigns numbers to nodes grd=faux.mesa.high%v%"Grade" sx=faux.mesa.high%v%"Sex" race=faux.mesa.high%v%"Race" ## we don’t look at race in this example vs=c(4,12)[match(sx,c("M","F"))] ## used for graph later on; boys by square (4 sides); girls by 12-sided col=c(6,5,3,7,4,2) ## used for graph later on as.sociomatrix(faux.mesa.high)## gives adjacency matrix faux.mesa.high[1,] faux.mesa.high[5,] faux.mesa.high[,3] m=faux.mesa.high[,] ## adjacency matrix network.density(faux.mesa.high) ## density of network = NuEdges/[nodes*(nodes-1)/2] deg=degree(faux.mesa.high)/2 ## degree of network nodes (number of connections) ## Statnet double-counts the connections in an undirected network ## Edge between nodes i and j in an undirected network is counted twice ## We divide by 2 in order to make the results consistent with our ## discussion in the text and the output from igraph (in Example 1) 
deg betw=betweenness(faux.mesa.high)/2 ## betweenness of network ## Statnet double-counts the betweenness in an undirected network ## We divide by 2 in order to make the results consistent with our ## discussion in the text and the output from igraph betw plot(deg) plot(betw) hist(deg,breaks= c(-0.5,0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5,12.5,13.5)) plot(deg,betw) boxplot(deg~grd) boxplot(deg~sx) ## faux.mesa.high is already a network object ## below we illustrate how to create an undirected network ## from the edge list ## first we obtain the edge list of a network object attributes(faux.mesa.high) vv=faux.mesa.high$mel edge=matrix(nrow=203,ncol=2) for (i in 1:203) { vvv=vv[[203+i]] edge[i,1]=vvv$inl edge[i,2]=vvv$outl } edge ## edge contains the edge list ## in an undirected network, edge information is stored in the ## second half of faux.mesa.high$mel faux1=network(edge,directed=FALSE,matrix.type="edgelist") faux1 faux1[,] deg=degree(faux1)/2 betw=betweenness(faux1)/2 plot(deg) plot(betw) plot(deg,betw) ## faux.mesa.high is already a network object ## below we illustrate how to create an undirected network ## from the adjacency matrix ## the adjacency matrix had been stored previously in m faux2=network(m,directed=FALSE,matrix.type="adjacency") faux2 faux2[,] deg=degree(faux2)/2 betw=betweenness(faux2)/2 plot(deg) plot(betw) plot(deg,betw) ## visual display of the network set.seed(654) ## to get reproducible graphs plot(faux.mesa.high) ## generic graph without labels/covariates set.seed(654) ## to get reproducible graphs plot(faux.mesa.high,label=lab) ## generic graph with labels set.seed(654) ## to get reproducible graphs plot(faux.mesa.high,vertex.sides=vs,vertex.rot=45,vertex.cex=2, vertex.col=col[grd-6],edge.lwd=2,cex.main=3,displayisolates=FALSE) legend("bottomright",legend=7:12,fill=col,cex=0.75) ## 45 rotates square ## isolates are not displayed ## density of interaction among students from the ## same grade (ignoring gender) m1=m[grd==7,grd==7] sum(m1)/(nrow(m1)*(ncol(m1)-1)) m1=m[grd==8,grd==8] sum(m1)/(nrow(m1)*(ncol(m1)-1)) m1=m[grd==9,grd==9] sum(m1)/(nrow(m1)*(ncol(m1)-1)) m1=m[grd==10,grd==10] sum(m1)/(nrow(m1)*(ncol(m1)-1)) m1=m[grd==11,grd==11] sum(m1)/(nrow(m1)*(ncol(m1)-1)) m1=m[grd==12,grd==12] sum(m1)/(nrow(m1)*(ncol(m1)-1)) ## density of interaction among students from a given grade ## with students from all grades (ignoring gender) ## matrix m1 shown below is not square; it has r rows and c columns ## the c columns include the r nodes that determine the rows of m1 ## the number of possible edges in m1 are r(r-1) + r(c-r) = r(c-1) m1=m[grd==7,] sum(m1)/(nrow(m1)*(ncol(m1)-1)) m1=m[grd==8,] sum(m1)/(nrow(m1)*(ncol(m1)-1)) m1=m[grd==9,] sum(m1)/(nrow(m1)*(ncol(m1)-1)) m1=m[grd==10,] sum(m1)/(nrow(m1)*(ncol(m1)-1)) m1=m[grd==11,] sum(m1)/(nrow(m1)*(ncol(m1)-1)) m1=m[grd==12,] sum(m1)/(nrow(m1)*(ncol(m1)-1)) ## density of interaction among students from the ## same gender group (ignoring grade) m1=m[sx=="F",sx=="F"] sum(m1)/(nrow(m1)*(ncol(m1)-1)) m1=m[sx=="M",sx=="M"] sum(m1)/(nrow(m1)*(ncol(m1)-1)) ## density of interaction among students from a given gender ## group with students of either gender (ignoring grade) m1=m[sx=="F",] sum(m1)/(nrow(m1)*(ncol(m1)-1)) m1=m[sx=="M",] sum(m1)/(nrow(m1)*(ncol(m1)-1)) ## density of interaction among students from the ## same grade, for given gender ## female seventh graders m1=m[sx=="F",sx=="F"] grd1=grd[sx=="F"] m2=m1[grd1==7,grd1==7] sum(m2)/(nrow(m2)*(ncol(m2)-1)) ## male seventh graders m1=m[sx=="M",sx=="M"] 
grd1=grd[sx=="M"] m2=m1[grd1==7,grd1==7] sum(m2)/(nrow(m2)*(ncol(m2)-1)) ## female twelfth graders m1=m[sx=="F",sx=="F"] grd1=grd[sx=="F"] m2=m1[grd1==12,grd1==12] sum(m2)/(nrow(m2)*(ncol(m2)-1)) ## male twelfth graders m1=m[sx=="M",sx=="M"] grd1=grd[sx=="M"] m2=m1[grd1==12,grd1==12] sum(m2)/(nrow(m2)*(ncol(m2)-1)) ## density of interaction among students from a given grade ## with students from all grades, for given gender ## female seventh graders m1=m[sx=="F",sx=="F"] grd1=grd[sx=="F"] m2=m1[grd1==7,] sum(m2)/(nrow(m2)*(ncol(m2)-1)) ## male seventh graders m1=m[sx=="M",sx=="M"] grd1=grd[sx=="M"] m2=m1[grd1==7,] sum(m2)/(nrow(m2)*(ncol(m2)-1)) ## female twelfth graders m1=m[sx=="F",sx=="F"] grd1=grd[sx=="F"] m2=m1[grd1==12,] sum(m2)/(nrow(m2)*(ncol(m2)-1)) ## male twelfth graders m1=m[sx=="M",sx=="M"] grd1=grd[sx=="M"] m2=m1[grd1==12,] sum(m2)/(nrow(m2)*(ncol(m2)-1)) ## Plotting options. Not that easy. Different layout algorithms will make the pictures look different ## Principles of Fruchterman/Reingold: ## Distribute vertices evenly in the frame ## Minimize the number of edge crossings ## Make edge lengths uniform ## Reflect inherent symmetry ## Conform to the frame set.seed(654) ## to get reproducible graphs plot(faux.mesa.high,mode="fruchtermanreingold",label=lab,vertex.sides=vs,vertex.rot=45,vertex.cex=2.5,vertex.col=col[grd-6],edge.lwd=2,cex.main=3,displayisolates=FALSE) legend("bottomright",legend=7:12,fill=col,cex=0.75) set.seed(654) ## to get reproducible graphs plot(faux.mesa.high,mode="kamadakawai",label=lab,vertex.sides=vs,vertex.rot=45,vertex.cex=2.5,vertex.col=col[grd-6],edge.lwd=2,cex.main=3,displayisolates=FALSE) legend("bottomright",legend=7:12,fill=col,cex=0.75) set.seed(654) ## to get reproducible graphs plot(faux.mesa.high,mode="circle",label=lab,vertex.sides=vs,vertex.rot=45,vertex.cex=2.5,vertex.col=col[grd-6],edge.lwd=2,cex.main=3,displayisolates=FALSE) legend("bottomright",legend=7:12,fill=col,cex=0.75) EXERCISES Exercises 2 (Wolfgang Jank: Business Analytics for Managers. Springer, 2011) hp <- read.csv("C:/DataMining/Data/HousePrices.csv") hp[1:3,] dm <- read.csv("C:/DataMining/Data/DirectMarketing.csv") dm[1:3,] gd <- read.csv("C:/DataMining/Data/GenderDiscrimination.csv") gd[1:3,] ld <- read.csv("C:/DataMining/Data/LoanData.csv") ld[1:3,] fi <- read.csv("C:/DataMining/Data/FinancialIndicators.csv") fi[1:3,] Exercises 3 (Graham Williams: Data Mining with Rattle and R.
Springer, 2011) weather <- read.csv("C:/DataMining/Data/weather.csv") weather[1:3,] weatherAUS <- read.csv("C:/DataMining/Data/weatherAUS.csv") weatherAUS[1:3,] audit <- read.csv("C:/DataMining/Data/audit.csv") audit[1:3,] Exercises 4 (1998/99 KDD Cup) ## read the data cup98LRN <- read.csv("C:/DataMining/Data/cup98LRN.csv") cup98LRN[1:3,] ## read the data cup98VAL <- read.csv("C:/DataMining/Data/cup98VAL.csv") cup98VAL[1:3,] ## read the data cup98VALtargt <- read.csv("C:/DataMining/Data/cup98VALtargt.csv") cup98VALtargt[1:3,] Exercise 5: Byssinosis ## read the data bys <- read.csv("C:/DataMining/Data/byssinosisWeights.csv") bys Exercise 6: Toxaemia ## read the data tox <- read.csv("C:/DataMining/Data/toxaemiaWeights.csv") tox Exercises 7 (8 examples) Example 1: Classification Tree for Identifying Soybean Disease library(ares) ## needed to determine the proportion of missing observations library(tree) ## classification trees ## reading the data soybean15 <- read.csv("C:/DataMining/Data/soybean15.csv") soybean15[1:3,] ## converting the attributes into factors (nominal scale) ## calculating the proportion of missing observations miss=dim(36) for (j in 1:36) { soybean15[,j]=factor(soybean15[,j]) miss[j]=count.na(soybean15[,j])$na/length(soybean15[,j]) } miss ## fifth attribute (presence/absence of hail) has 8.27% missing observations ## constructing the classification tree soytree <- tree(disease ~., data = soybean15, mincut=1) soytree summary(soytree) plot(soytree, col=8) text(soytree, digits=2) ## cross-validation to prune the tree set.seed(2) cvsoy <- cv.tree(soytree, K=10) cvsoy$size cvsoy$dev plot(cvsoy, pch=21, bg=8, type="p", cex=1.5, ylim=c(0,1400)) ## shows that the tree has many terminal nodes soycut <- prune.tree(soytree, best=17) soycut summary(soycut) plot(soycut, col=8) text(soycut) plot(soycut, col=8) ## below we have omitted the text as it is difficult to read ## terminal node under 31 is the one on the far right of the graph ## first split: C15ac (to left) and C15b (to the right) ## second split: C1abcd (to left) and C1efg (to right) ## third split: C19a (to left) and C19b (to right) ## fourth split: C28a (to left) and C28bcd (to right) Example 2: Classification Tree for Fitting Contact Lenses library(tree) ## read the data ContactLens <- read.csv("C:/DataMining/Data/ContactLens.csv") levels(ContactLens[,1]) ## age levels(ContactLens[,2]) ## spectacle prescription levels(ContactLens[,3]) ## astigmatism levels(ContactLens[,4]) ## tear production rate levels(ContactLens[,5]) ## contact lens ContactLens ## constructing the classification tree that fits the data perfectly cltree <- tree(ContactLens ~., data = ContactLens, mindev=0, minsize=1) cltree summary(cltree) plot(cltree, col=8) text(cltree, digits=2) ## pruning the tree to get a simpler tree clcut <- prune.tree(cltree, best=3) clcut summary(clcut) plot(clcut, col=8) text(clcut) Example 3: Determining the Credit Risk Using a Classification Tree library(tree) ## first we read in the data credit <- read.csv("C:/DataMining/Data/credit.csv") credit credit[,1] credit[,2] credit[,3] credit[,4] credit[,5] ## constructing the classification tree that fits the data perfectly credittree <- tree(Risk ~., data = credit, mindev=0, minsize=1) credittree summary(credittree) plot(credittree, col=8) text(credittree, digits=2) Example 4: Determining the Progression of Liver Disease Using a Classification Tree library(tree) ## data set from Witten ## missing data hepatitis <- read.csv("C:/DataMining/Data/hepatitis.csv") hepatitis ## calculating
YWD = (Age – YWOD) hepatitis[,20]=hepatitis[,18]-hepatitis[,17] colnames(hepatitis)[20]= "YWD" hepatitis[1:3,] ## cleaning up the data set hh=hepatitis[,c(-2:-4,-17)] hh[1:3,] ## create factors for the categorical variables for (j in 1:13) { hh[,j]=factor(hh[,j]) } hh[1:3,] levels(hh[,6]) levels(hh[,8]) levels(hh[,13]) ## constructing the classification tree heptree <- tree(Bx ~., data = hh) heptree summary(heptree) plot(heptree, col=8) text(heptree, digits=2) ## cross-validation to prune the tree set.seed(2) cvhep <- cv.tree(heptree, K=10) cvhep$size cvhep$dev plot(cvhep, pch=21, bg=8, type="p", cex=1.5, ylim=c(400,750)) hepcut <- prune.tree(heptree, best=6) hepcut summary(hepcut) plot(hepcut, col=8) text(hepcut) Example 5: Predicting the Outcome of Labor Negotiations Using a Classification Tree library(tree) ## read the data labor <- read.csv("C:/DataMining/Data/labor.csv") labor[1:3,] ## omit variables with lots of missing values ll=labor[,c(-3:-5,-7:-11,-13:-16)] ll[1:3,] levels(ll[,4]) ## vacation benefits levels(ll[,5]) ## response: overall contract quality ## constructing the classification tree labortree <- tree(Class ~., data = ll) labortree summary(labortree) plot(labortree, col=8) text(labortree, digits=2) p1=snip.tree(labortree,nodes=2) p1 summary(p1) plot(p1) text(p1) Example 6: Diabetes among Pima Indians ## read the data and create plots PimaIndians <- read.csv("C:/DataMining/Data/PimaIndians.csv") PimaIndians plot(PimaIndians) PI=data.frame(PimaIndians) ## logistic regression model ## mm1: model fitted to all data mm1=glm(Class~.,family=binomial,data=PI) mm1 summary(mm1) ## simplifying the model through backward elimination RPI=PI[,-4] ## dropping triceps skin fold thickness mm1=glm(Class~.,family=binomial,data=RPI) mm1 summary(mm1) RPI=RPI[,-7] ## dropping age mm1=glm(Class~.,family=binomial,data=RPI) mm1 summary(mm1) RPI=RPI[,-4] ## dropping serum insulin RPI[1:3,] mm1=glm(Class~.,family=binomial,data=RPI) mm1 summary(mm1) ## evaluation of the full model ## split the data set into a training (50%) and a test (evaluation) set (50%) set.seed(1) n=length(PI$Class) n n1=floor(n*(0.5)) n1 n2=n-n1 n2 train=sample(1:n,n1) PI1=data.frame(PI[train,]) PI2=data.frame(PI[-train,]) ## mm2: model fitted on the training data set mm2=glm(Class~.,family=binomial,data=PI1) mm2 summary(mm2) ## create predictions for the test (evaluation) data set gg=predict(mm2,newdata=PI2,type= "response") gg hist(gg) plot(PI$Class[-train]~gg) ## coding as 1 if probability 0.5 or larger gg1=floor(gg+0.5) ttt=table(PI$Class[-train],gg1) ttt error=(ttt[1,2]+ttt[2,1])/n2 error ## evaluation of the simplified model ## mm2: model fitted on the training data set mm2=glm(Class~NuPregnancy+Glucose+DiastolicBP+BodyMassIndex+DiabetesPedigree,family=binomial,data=PI1) mm2 summary(mm2) ## create predictions for the test (evaluation) data set gg=predict(mm2,newdata=PI2,type= "response") gg hist(gg) plot(PI$Class[-train]~gg) ## coding as 1 if probability 0.5 or larger gg1=floor(gg+0.5) ttt=table(PI$Class[-train],gg1) ttt error=(ttt[1,2]+ttt[2,1])/n2 error ## read the data PimaIndians <- read.csv("C:/DataMining/Data/PimaIndians.csv") PimaIndians ## CART analysis library(tree) PimaIndians$Class=factor(PimaIndians$Class) ## constructing the classification tree PItree <- tree(Class ~., data = PimaIndians,mindev=0.01) PItree summary(PItree) plot(PItree, col=8) text(PItree, digits=2) ## cross-validation to prune the tree set.seed(2) cvPI <- cv.tree(PItree, K=10) cvPI$size cvPI$dev plot(cvPI, pch=21, bg=8, type="p", cex=1.5, 
ylim=c(700,1000)) PIcut <- prune.tree(PItree, best=7) PIcut summary(PIcut) plot(PIcut, col=8) text(PIcut) P1=snip.tree(PIcut,nodes=c(2,7)) P1 summary(P1) plot(P1) text(P1) Example 7: Predicting the CPU Performance with Regression and Regression Trees ## read the data and create a matrix plot cpu <- read.csv("C:/DataMining/Data/cpu.csv") cpu xx=cpu[,c(-1,-9)] xx[1:3,] plot(xx) ## regression model regfit=lm(PRP~.,data=xx) regfit summary(regfit) ## cross-validation (leave one out): regression model on all six regressors n=length(cpu$PRP) diff=dim(n) percdiff=dim(n) for (k in 1:n) { train1=c(1:n) train=train1[train1!=k] m1=lm(PRP~.,data=xx[train,]) pred=predict(m1,newdata=xx[-train,]) obs=xx[-train,7] diff[k]=obs-pred percdiff[k]=abs(diff[k])/obs } me=mean(diff) rmse=sqrt(mean(diff**2)) mape=100*(mean(percdiff)) me # mean error rmse # root mean square error mape # mean absolute percent error library(tree) ## Construct the regression tree cputree <- tree(PRP ~., data=xx, mindev=0.1, mincut=1) cputree <- tree(PRP ~., data= xx, mincut=1) cputree summary(cputree) plot(cputree, col=8) text(cputree, digits=2) ## Use cross-validation to prune the regression tree set.seed(2) cvcpu <- cv.tree(cputree, K=10) cvcpu$size cvcpu$dev plot(cvcpu, pch=21, bg=8, type="p", cex=1.5, ylim=c(0,6000000)) cpucut <- prune.tree(cputree, best=7) cpucut summary(cpucut) plot(cpucut, col=8) text(cpucut) Example 8: Inferring the Cultivar of Wine Using Classification Trees, Discriminant Analysis and Multinomial Logistic Regression ## read the data and create plots wine <- read.csv("C:/DataMining/Data/wine.csv") wine[1:3,] plot(wine) ## CART library(tree) wine$Class=factor(wine$Class) ## constructing the classification tree Winetree <- tree(Class ~., data = wine) Winetree summary(Winetree) plot(Winetree, col=8) text(Winetree, digits=2) ## cross-validation to prune the tree set.seed(1) cvWine <- cv.tree(Winetree, K=10) cvWine$size cvWine$dev plot(cvWine, pch=21, bg=8, type="p", cex=1.5, ylim=c(100,400)) Winecut <- prune.tree(Winetree, best=4) Winecut summary(Winecut) plot(Winecut, col=8) text(Winecut) ## Clustering ## standardizing the attributes as the units are considerably different wines=matrix(nrow=length(wine[,1]),ncol=length(wine[1,])) for (j in 2:14) { wines[,j]=(wine[,j]-mean(wine[,j]))/sd(wine[,j]) } wines[,1]=wine[,1] winesr=wines[,-1] winesr[1:3,] ## kmeans clustering with 13 standardized attributes grpwines <- kmeans(winesr, centers=3, nstart=20) grpwines grpwines$cluster ## displaying clustering results wine$Class ## actual classes ## 6 mistakes made among 178 wines ## Discriminant analysis (linear/quadratic) library(MASS) ## linear discriminant analysis using the standardized attributes wines[1:3,] ws=data.frame(wines) ws[1:3,] zlin=lda(X1~.,ws,prior=c(1,1,1)/3) zlin ## quadratic discriminant analysis zqua=qda(X1~.,ws,prior=c(1,1,1)/3) zqua n=dim(ws)[1] errorlin=1-(sum(ws$X1==predict(zlin,ws)$class)/n) errorlin errorqua=1-(sum(ws$X1==predict(zqua,ws)$class)/n) errorqua neval=1 corlin=dim(n) corqua=dim(n) ## leave one out evaluation for (k in 1:n) { train1=c(1:n) train=train1[train1!=k] ## linear discriminant analysis zlin=lda(X1~.,ws[train,],prior=c(1,1,1)/3) corlin[k]=ws$X1[-train]==predict(zlin,ws[-train,])$class ## quadratic discriminant analysis zqua=qda(X1~.,ws[train,],prior=c(1,1,1)/3) corqua[k]=ws$X1[-train]==predict(zqua,ws[-train,])$class } merrlin=1-mean(corlin) merrlin merrqua=1-mean(corqua) merrqua ## Multinomial logistic regression ## using VGAM library(VGAM) ws=data.frame(wines) gg <- vglm(X1 ~
.,multinomial,data=ws) summary(gg) predict(gg) ## log-odds relative to last group round(fitted(gg),2) ## probabilities cbind(round(fitted(gg),2),ws$X1) ## perfect classification
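The classification above is in-sample only. A minimal leave-one-out sketch along the lines of the discriminant-analysis evaluation earlier in this example, assuming that predict() on a vglm fit accepts newdata and returns class probabilities with type="response" (check this for your VGAM version):
## leave-one-out evaluation of the multinomial logistic regression (added sketch)
## the model is refit n=178 times, so this will run for a while
n=dim(ws)[1]
cormlg=dim(n)
for (k in 1:n) {
train1=c(1:n)
train=train1[train1!=k]
gk <- vglm(X1~.,multinomial,data=ws[train,])
pk <- predict(gk,newdata=ws[-train,],type="response") ## class probabilities for the left-out wine
cormlg[k]=(ws$X1[-train]==which.max(pk)) ## correct if the most probable class is the true class
}
1-mean(cormlg) ## leave-one-out misclassification error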