#
# Test weights in a regression problem
#
# Each top-level all.equal() call below prints its result; every one of them
# is expected to print TRUE.
#
library(rpart)

mystate <- data.frame(state.x77, region = factor(state.region))
names(mystate) <- c(
  "population", "income", "illiteracy", "life",
  "murder", "hs.grad", "frost", "area", "region"
)

# The same model is used for every fit on the state data.
income_formula <- income ~ population + region + illiteracy + life + murder +
  hs.grad + frost

#
# Case 1: a constant weight.  Fit once unweighted and once with every
# weight equal to wt_const, then undo the scaling before comparing.
#
wt_const <- 3

# xval is passed as a vector with one entry per row, so both fits receive
# the same group labels (presumably the cross-validation folds; see the
# rpart.control documentation).
xgrp <- rep(1:10, length.out = nrow(mystate))
fit4 <- rpart(income_formula,
              data = mystate,
              control = rpart.control(minsplit = 10, xval = xgrp))

wts <- rep(wt_const, nrow(mystate))
fit4b <- rpart(income_formula,
               data = mystate,
               control = rpart.control(minsplit = 10, xval = xgrp),
               weights = wts)

# Remove the effect of the constant weight from the weighted fit.
fit4b$frame$wt <- fit4b$frame$wt / wt_const
fit4b$frame$dev <- fit4b$frame$dev / wt_const
# Column 5 of cptable is rescaled by sqrt(wt_const); NOTE(review): this
# should be the "xstd" column -- confirm against the rpart.object docs.
fit4b$cptable[, 5] <- fit4b$cptable[, 5] * sqrt(wt_const)

temp <- c("frame", "where", "splits", "csplit", "cptable")
all.equal(fit4[temp], fit4b[temp])

#
# Case 2: next is a very simple case, but worth keeping.
# Doubling every weight should exactly double each node's deviance.
#
dummy <- data.frame(
  y = 1:10,
  x1 = c(10:4, 1:3),
  x2 = c(1, 3, 5, 7, 9, 2, 4, 6, 8, 0)
)
dummy_dev <- c(82.5, 10, 2, 0.5, 10, 0.5, 2)

xx1 <- rpart(y ~ x1 + x2, dummy, minsplit = 4, xval = 0)
xx2 <- rpart(y ~ x1 + x2, dummy, weights = rep(2, 10), minsplit = 4,
             xval = 0)

all.equal(xx1$frame$dev, dummy_dev)
all.equal(xx2$frame$dev, dummy_dev * 2)

#
# Case 3: now for a set of non-equal weights.
#  We need to set maxcompete=3 because there just happens to be, in one
#  of the lower nodes, an exact tie between variables "life" and "murder".
#  Round off error causes fit5 to choose one and fit5b the other.
#  Later -- cut it back to maxdepth=3 for the same reason (a tie).
#
nn <- nrow(mystate)
wts <- rep(1:5, length.out = nn)
temp <- rep(seq_len(nn), wts)  # row replicates: row i appears wts[i] times
xgrp <- rep(1:10, length.out = nn)
xgrp2 <- rep(xgrp, wts)        # every replicate keeps its source row's group

# Direct: replicate rows in the data set, and use an unweighted fit.
tempc <- rpart.control(minsplit = 2, xval = xgrp2, maxsurrogate = 0,
                       maxcompete = 3, maxdepth = 3)
fit5 <- rpart(income_formula, data = mystate[temp, ], control = tempc)

# Weighted: the original rows, with the replication counts as weights.
tempc <- rpart.control(minsplit = 2, xval = xgrp, maxsurrogate = 0,
                       maxcompete = 3, maxdepth = 3)
fit5b <- rpart(income_formula, data = mystate, control = tempc,
               weights = wts)

# Column 2 of the frame is dropped: the "n" component won't match.
all.equal(fit5$frame[-2], fit5b$frame[-2])
all.equal(fit5$cptable, fit5b$cptable)
# The first column of splits is dropped before comparing as well.
all.equal(fit5$splits[, -1], fit5b$splits[, -1])
all.equal(fit5$csplit, fit5b$csplit)