R version 2.15.0 (2012-03-30) Copyright (C) 2012 The R Foundation for Statistical Computing ISBN 3-900051-07-0 Platform: x86_64-unknown-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > # > # A hard-core test of losses and priors > # Simple data set where I know what the answers must be > # > library(rpart) > aeq <- function(x,y, ...) all.equal(as.vector(x), as.vector(y), ...) > > dummy <- c(3,1,4,1,5,9,2,6,5,3,5,8,9,7,9)/5 > pdata <- data.frame(y=factor(rep(1:3, 5)), + x1 = 1:15, + x2 = c(1:6, 1:6, 1:3), + x3 = (rep(1:3, 5) + dummy)*10) > > pdata$x3[c(1,5,10)] <- NA > pdata$y[15] <- 1 # make things unbalanced > > pfit <- rpart(y ~ x1 + x2 + x3, pdata, + cp=0, xval=0, minsplit=5, maxdepth=1, + parms=list(prior=c(.2, .3, .5), + loss =matrix(c(0,2,2,2,0,6,1,1,0), 3,3,byrow=T))) > > # > # See section 12.1 of the report for these numbers > # > > ntot <- c(6,5,4) > phat <- c(6,5,4)/15 # observed class probabilities > prior <- c(.2, .3, .5) # priors > aprior <- c(4,12,5)/21 # altered priors > lmat <- matrix(c(0,1,2, 2,0,1, 2,6,0), ncol=3) #loss matrix > > gini <- function(p) 1-sum(p^2) > loss <- function(n, class) sum(n * lmat[,class]) > > phat <- function(n, ntot=c(6,5,4), prior=c(.2, .3, .5)) { + n*prior/ntot + } > > # Are the losses correct? > # Class counts for the two children are (4,4,0) and (2,1,4), when > # using surrogates > aeq(pfit$frame$dev/15, c(loss(prior,2), loss(phat(c(4,4,0)),2), + loss(phat(c(2,1,4)),3))) [1] TRUE > # Node probabilities? > aeq(pfit$frame$yval2[,8] , + c(1, sum(phat(c(4,4,0))), sum(phat(c(2,1,4))))) [1] TRUE > > aeq(pfit$frame$yval2[,5:7] , rbind(prior, + phat(c(4,4,0))/ sum(phat(c(4,4,0))), + phat(c(2,1,4))/ sum(phat(c(2,1,4))))) [1] TRUE > > # Now the node and class probs, under altered priors > phat2 <- function(n, ntot=c(6,5,4), prior=aprior) { + n*prior/ntot + } > > # Use these to create the gini losses, base data, and for the best > # splits on variables 1, 2, 3 > gfun <- function(n) { #The gini loss for a node, given the counts + temp <- phat2(n) + sum(temp) * gini(temp/sum(temp)) + } > > # These are in order x3, x2, x1 (best split to worst) > # Note that for x3, missing values cause the "parent" to be viewed as > # having 12 obs instead of 15. > # Each line is gini(parent) - gini(children) > aeq(pfit$splits[1:3, 3], + 15* c(gfun(c(4,4,4)) - (gfun(c(3,4,0)) + gfun(c(1,0,4))), + gfun(c(6,5,4)) - (gfun(c(6,5,2)) + gfun(c(0,0,2))), + gfun(c(6,5,4)) - (gfun(c(4,4,4)) + gfun(c(2,1,0))))) [1] TRUE > > proc.time() user system elapsed 0.211 0.059 0.315