R : Copyright 2005, The R Foundation for Statistical Computing Version 2.1.1 (2005-06-20), ISBN 3-900051-07-0 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for a HTML browser interface to help. Type 'q()' to quit R. > invisible(options(echo = TRUE)) > library(rpart) > > target.lrn <- read.table("../lrn/num/472.dat",header=T,colClasses="numeric") > target.val <- read.table("../val/num/472.dat",header=T,colClasses="numeric") > target.tst <- read.table("../tst/num/472.dat",header=T,colClasses="numeric") > > > y.lrn <- target.lrn[,1] > y.val <- target.val[,1] > y.tst <- target.tst[,1] > y <- c(y.lrn,y.val,y.tst) > > n.lrn <- length(y.lrn) > n.val <- length(y.val) > n.tst <- length(y.tst) > n <- length(y) > > rm(target.lrn,target.val,target.tst) > > wts <- mat.or.vec(n,1) ; for (i in 1:n.lrn) wts[i]=1 > idx.lrn <- 1:n.lrn > idx.val <- (n.lrn+1):(n.lrn+n.val) > idx.tst <- (n.lrn+n.val+1):n > > mod <- read.table("cty_usa.txt", + header=F,colClasses="character",col.names=c("file","feature","type")) > > n.mod <- length(mod$file) > > first.time <- TRUE > > for (i in 1:n.mod) { + + fn.lrn <- paste("../lrn/",mod$type[i],"/",mod$file[i],".dat",sep="") + fn.val <- paste("../val/",mod$type[i],"/",mod$file[i],".dat",sep="") + fn.tst <- paste("../tst/",mod$type[i],"/",mod$file[i],".dat",sep="") + + print(mod$feature[i]) + + if (mod$type[i]=="chr") { + + f.lrn <- read.table(fn.lrn, + header=T,colClasses="character",blank.lines.skip=F) + f.val <- read.table(fn.val, + header=T,colClasses="character",blank.lines.skip=F) + f.tst <- read.table(fn.tst, + header=T,colClasses="character",blank.lines.skip=F) + + f <- c(f.lrn[,1],f.val[,1],f.tst[,1]) + + if (mod$feature[i]=="STATE") { + f[f=="AS"|f=="DC"|f=="DE"|f=="MA"|f=="ME"|f=="NH"] <- "S1" + f[f=="OH"|f=="RI"|f=="VI"|f=="WV"] <- "S1" + f[f=="AA"|f=="AE"|f=="AP"|f=="CT"|f=="GU"|f=="MD"] <- "S2" + f[f=="NJ"|f=="NY"|f=="PA"|f=="PA"|f=="VA"|f=="VT"] <- "S2" + f[f=="WY"] <- "S2" + f[f=="AK"|f=="UT"|f=="MS"] <- "S3" + f[f=="NE"|f=="ND"] <- "S4" + f[f=="SD"|f=="SC"] <- "S5" + } + + f <- as.factor(f) + + n.lev <- nlevels(f) + f.name <- levels(f) + if(n.lev==2) f.name <- c(mod$feature[i],mod$feature[i]) + print(paste(" nlevels = ",n.lev)) + + f <- model.matrix(y ~ f - 1) # Note: Intercept removed. + f <- f[,2:n.lev] # Note: First dummy deleted. + f.name <- f.name[2:n.lev] # Note: First name deleted. + + } else { + + f.lrn<-read.table(fn.lrn, + header=T,colClasses="numeric",blank.lines.skip=F) + f.val<-read.table(fn.val, + header=T,colClasses="numeric",blank.lines.skip=F) + f.tst<-read.table(fn.tst, + header=T,colClasses="numeric",blank.lines.skip=F) + + f <- c(f.lrn[,1],f.val[,1],f.tst[,1]) + + f[is.na(f)] <- 0 + + f.name <- mod$feature[i] + + if (mod$feature[i]=="DOB") { + d <- f ; d[d>0] <- 1 # Note: Dummy for missing DOB + f <- cbind(d,f,f^2) # Note: Quadratic term added to DOB. + rm(d) + f.name <- c("DOB_miss","DOB_linr","DOB_quad") + } + + } + + if (first.time) { + X <- f + X.names <- f.name + first.time <- FALSE + } else { + X <- cbind(prev.X,f) + X.names <- c(prev.X.names,f.name) + } + + prev.X <- X + prev.X.names <- X.names + + } [1] "LASTGIFT" [1] "PEPSTRFL" [1] " nlevels = 2" [1] "LON" [1] "LONSQR" [1] "LAT" [1] "LATSQR" [1] "LONLAT" [1] "HI" [1] "AK" [1] "TERRITORY" [1] "RECP3" [1] " nlevels = 2" [1] "DOB" [1] "MAILCODE" [1] " nlevels = 2" [1] "MHUC2" [1] "LASTDATE" [1] "MINRAMNT" > > LAST.by.PEP <- X[,1]*X[,2] > > X <- cbind(prev.X,LAST.by.PEP) > X.names <- c(prev.X.names,"LAST.by.PEP") > > rm(prev.X,prev.X.names) > rm(f.lrn,f.val,f) > > dimnames(X) <- list(NULL,X.names) > > fit <- lm(y~X,weights=wts) > aov <- anova.lm(fit) > > ehat <- fit$residuals > mse.lrn <- sum(ehat[idx.lrn]^2)/n.lrn > mse.val <- sum(ehat[idx.val]^2)/n.val > mse.tst <- sum(ehat[idx.tst]^2)/n.tst > > print(paste(" rank = ",fit$rank)) [1] " rank = 20" > print(paste(" P-value = ",aov[1,5])) [1] " P-value = 1.00783101919317e-115" > print(paste(" mse.lrn = ",mse.lrn)) [1] " mse.lrn = 19.9187451580094" > print(paste(" mse.val = ",mse.val)) [1] " mse.val = 18.6023779400238" > print(paste(" mse.tst = ",mse.tst)) [1] " mse.tst = 17.7922581965671" > > print(summary.lm(fit)) Call: lm(formula = y ~ X, weights = wts) Residuals: Min 1Q Median 3Q Max -21.9483 -0.8104 -0.5646 0.0000 199.2596 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -1.948e+01 3.419e+00 -5.697 1.22e-08 *** XLASTGIFT 7.585e-03 1.767e-03 4.293 1.76e-05 *** XPEPSTRFL -3.310e-01 5.560e-02 -5.954 2.63e-09 *** XLON 3.689e-02 1.479e-02 2.494 0.012643 * XLONSQR 2.273e-04 8.765e-05 2.594 0.009501 ** XLAT 8.273e-02 3.697e-02 2.238 0.025232 * XLATSQR -1.108e-03 6.815e-04 -1.626 0.103866 XLONLAT 5.561e-05 3.135e-04 0.177 0.859209 XHI 3.925e-01 4.744e-01 0.827 0.407984 XAK -4.635e-01 5.133e-01 -0.903 0.366626 XTERRITORY 6.006e-01 6.217e-01 0.966 0.334015 XRECP3 6.033e-01 1.224e-01 4.927 8.36e-07 *** XDOB_miss -2.166e-01 9.623e-02 -2.251 0.024361 * XDOB_linr 1.882e-04 5.198e-05 3.621 0.000293 *** XDOB_quad -2.552e-08 6.782e-09 -3.762 0.000169 *** XMAILCODE -4.341e-01 1.450e-01 -2.993 0.002762 ** XMHUC2 5.983e-02 2.036e-02 2.939 0.003299 ** XLASTDATE 2.084e-03 3.556e-04 5.862 4.60e-09 *** XMINRAMNT 3.760e-03 2.491e-03 1.509 0.131239 XLAST.by.PEP 3.474e-02 2.689e-03 12.920 < 2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 4.464 on 66880 degrees of freedom Multiple R-Squared: 0.008979, Adjusted R-squared: 0.008698 F-statistic: 31.89 on 19 and 66880 DF, p-value: < 2.2e-16 > > outvec <- fit$fitted.values[idx.lrn] > outlab <- "yhat.lrn" > outfil <- "yhat_lrn_usa.dat" > write(c(outlab,formatC(outvec,digits=17,width=26,format="e")),file=outfil) > > outvec <- fit$fitted.values[idx.val] > outlab <- "yhat.val" > outfil <- "yhat_val_usa.dat" > write(c(outlab,formatC(outvec,digits=17,width=26,format="e")),file=outfil) > > outvec <- fit$fitted.values[idx.tst] > outlab <- "yhat.tst" > outfil <- "yhat_tst_usa.dat" > write(c(outlab,formatC(outvec,digits=17,width=26,format="e")),file=outfil) > > rm(outvec,outlab,outfil) > > x0 <- c(0,n.lrn) > y0 <- c(0,sum(y.lrn-0.68)) > > x1 <- (1:n.lrn) > y1 <- fit$fitted.values[1:n.lrn] > y1 <- y1-0.68 > y1 <- sort(y1) > y1 <- y1[n.lrn:1] > y1 <- cumsum(y1) > > idx <- 1:n.lrn > > print(paste("maximum profit in learning sample is ", max(y1))) [1] "maximum profit in learning sample is 12055.5708077004" > print(paste("maximum occurs at ", idx[y1==max(y1)])) [1] "maximum occurs at 38159" > > idx <- seq(1,n.lrn,length=200) > > x1 <- x1[idx] > y1 <- y1[idx] > > source("psopts.r"); > postscript(file="cty_usa_lif.eps"); > > plot(x=c(x0,x1),y=c(y0,y1),ylab="dollars",xlab="size of mailing",type="n") > lines(x=x0,y=y0,col="green") > lines(x=x1,y=y1,col="red") > > dev.off() null device 1 > proc.time() [1] 39.57 4.59 50.93 0.00 0.01 >