R : Copyright 2002, The R Development Core Team Version 1.5.1 (2002-06-17) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type `license()' or `licence()' for distribution details. R is a collaborative project with many contributors. Type `contributors()' for more information. Type `demo()' for some demos, `help()' for on-line help, or `help.start()' for a HTML browser interface to help. Type `q()' to quit R. > invisible(options(echo = TRUE)) > library(rpart) > > target.lrn <- read.table("../lrn/num/472.dat",header=T,colClasses="numeric") > target.val <- read.table("../val/num/472.dat",header=T,colClasses="numeric") > target.tst <- read.table("../tst/num/472.dat",header=T,colClasses="numeric") > > > y.lrn <- target.lrn[,1] > y.val <- target.val[,1] > y.tst <- target.tst[,1] > y <- c(y.lrn,y.val,y.tst) > > target.lrn <- read.table("../lrn/num/471.dat",header=T,colClasses="numeric") > target.val <- read.table("../val/num/471.dat",header=T,colClasses="numeric") > target.tst <- read.table("../tst/num/471.dat",header=T,colClasses="numeric") > > b.lrn <- target.lrn[,1] > b.val <- target.val[,1] > b.tst <- target.tst[,1] > b <- c(b.lrn,b.val,b.tst) > > n.lrn <- length(y.lrn) > n.val <- length(y.val) > n.tst <- length(y.tst) > n <- length(y) > > rm(target.lrn,target.val,target.tst) > > wts <- mat.or.vec(n,1) ; for (i in 1:n.lrn) wts[i]=1 > idx.lrn <- 1:n.lrn > idx.val <- (n.lrn+1):(n.lrn+n.val) > idx.tst <- (n.lrn+n.val+1):n > > mod <- read.table("../cty_mod.txt", + header=F,colClasses="character",col.names=c("file","feature","type")) > > n.mod <- length(mod$file) > > first.time <- TRUE > > for (i in 1:n.mod) { + + fn.lrn <- paste("../lrn/",mod$type[i],"/",mod$file[i],".dat",sep="") + fn.val <- paste("../val/",mod$type[i],"/",mod$file[i],".dat",sep="") + fn.tst <- paste("../tst/",mod$type[i],"/",mod$file[i],".dat",sep="") + print(mod$feature[i]) + + if (mod$type[i]=="chr") { + + f.lrn <- read.table(fn.lrn, + header=T,colClasses="character",blank.lines.skip=F) + f.val <- read.table(fn.val, + header=T,colClasses="character",blank.lines.skip=F) + f.tst <- read.table(fn.tst, + header=T,colClasses="character",blank.lines.skip=F) + + f <- c(f.lrn[,1],f.val[,1],f.tst[,1]) + + if (mod$feature[i]=="STATE") { + f[f=="AS"|f=="DC"|f=="DE"|f=="MA"|f=="ME"|f=="NH"] <- "S1" + f[f=="OH"|f=="RI"|f=="VI"|f=="WV"] <- "S1" + f[f=="AA"|f=="AE"|f=="AP"|f=="CT"|f=="GU"|f=="MD"] <- "S2" + f[f=="NJ"|f=="NY"|f=="PA"|f=="VA"|f=="VT"|f=="WY"] <- "S2" + f[f=="AK"|f=="UT"|f=="MS"] <- "S3" + f[f=="NE"|f=="ND"] <- "S4" + f[f=="SD"|f=="SC"] <- "S5" + } + + f <- as.factor(f) + + n.lev <- nlevels(f) + f.name <- levels(f) + if(n.lev==2) f.name <- c(mod$feature[i],mod$feature[i]) + print(paste(" nlevels = ",n.lev)) + + f <- model.matrix(y ~ f - 1) # Note: Intercept removed. + f <- f[,2:n.lev] # Note: First dummy deleted. + f.name <- f.name[2:n.lev] # Note: First name deleted. + + } else { + + f.lrn<-read.table(fn.lrn, + header=T,colClasses="numeric",blank.lines.skip=F) + f.val<-read.table(fn.val, + header=T,colClasses="numeric",blank.lines.skip=F) + f.tst<-read.table(fn.tst, + header=T,colClasses="numeric",blank.lines.skip=F) + + f <- c(f.lrn[,1],f.val[,1],f.tst[,1]) + + f[is.na(f)] <- 0 + + f.name <- mod$feature[i] + + if (mod$feature[i]=="DOB") { + d <- f ; d[d>0] <- 1 # Note: Dummy for missing DOB + f <- cbind(d,f,f^2) # Note: Quadratic term added to DOB. + rm(d) + f.name <- c("DOB.0","DOB.1","DOB.2") + } + } + + if (first.time) { + X <- f + X.names <- f.name + first.time <- FALSE + } else { + X <- cbind(prev.X,f) + X.names <- c(prev.X.names,f.name) + } + + prev.X <- X + prev.X.names <- X.names + + } [1] "LASTGIFT" [1] "PEPSTRFL" [1] " nlevels = 2" [1] "STATE" [1] " nlevels = 33" [1] "RECP3" [1] " nlevels = 2" [1] "DOB" [1] "MAILCODE" [1] " nlevels = 2" [1] "MHUC2" [1] "LASTDATE" [1] "MINRAMNT" > > LAST.by.PEP <- X[,1]*X[,2] > print("LAST.by.PEP") [1] "LAST.by.PEP" > > X <- cbind(prev.X,LAST.by.PEP) > X.names <- c(prev.X.names,"LAST.by.PEP") > > rm(prev.X,prev.X.names) > rm(f.lrn,f.val,f) > > dimnames(X) <- list(NULL,X.names) > > y.fit <- lm(y~X,weights=wts) # Note: Validation sample not in fit. > > yhat <- y.fit$fitted.values > > ehat <- y - mean(y) > mse.nul <- mean(ehat[idx.lrn]^2) > > ehat <- y.fit$residuals > mse.lrn <- mean(ehat[idx.lrn]^2) > mse.val <- mean(ehat[idx.val]^2) > mse.tst <- mean(ehat[idx.tst]^2) > > print(paste(" mse.nul = ",mse.nul)) [1] " mse.nul = 20.0992375595439" > print(paste(" mse.lrn = ",mse.lrn)) [1] " mse.lrn = 19.9111807739089" > print(paste(" mse.val = ",mse.val)) [1] " mse.val = 18.6052588093824" > print(paste(" mse.tst = ",mse.tst)) [1] " mse.tst = 17.7951514079773" > > y0 <- y[y<=0] > X0 <- X[y<=0,] > yhat0 <- yhat[y<=0] > ehat0 <- ehat[y<=0] > > idx <- seq(1,length(yhat0),length=4000) > > y0 <- y0[idx] > X0 <- X0[idx,] > yhat0 <- yhat0[idx] > ehat0 <- ehat0[idx] > > y1 <- y[y>0] > X1 <- X[y>0,] > yhat1 <- yhat[y>0] > ehat1 <- ehat[y>0] > > source("psopts.r") > postscript(file="cty_ehat_yhat.eps") > > plot(x=c(yhat0,yhat1),y=c(ehat0,ehat1),ylab="ehat=y-yhat",xlab="yhat",type="n") > points(x=yhat0,y=ehat0,col="black") > points(x=yhat1,y=ehat1,col="red") > > dev.off() null device 1 > > p <- ncol(X) > > for (i in (1:p)) { + feature <- dimnames(X)[[2]][i] + filename <- paste("cty_target_",feature,".eps",sep="") + postscript(file=filename) + + plot(x=c(X0[,i],X1[,i]),y=c(y0,y1),ylab="target",xlab=feature,type="n") + # points(x=X0[,i],y=y0,col="black") + points(x=X1[,i],y=y1,col="red") + + dev.off() + } > > > > > proc.time() [1] 85.85 8.57 95.04 0.00 0.00 >