# ---------------------------------------------------------------------------
# Setup: response vector, observation weights, candidate feature list and the
# baseline validation error used by the selection loop that follows.
# ---------------------------------------------------------------------------

# The response is the first column of file 472 in the learning ("lrn") and
# validation ("val") directories.
target.lrn <- read.table("../lrn/num/472.dat", header = TRUE,
                         colClasses = "numeric")
target.val <- read.table("../val/num/472.dat", header = TRUE,
                         colClasses = "numeric")

# Stack the two samples, learning rows first; every feature read later is
# stacked in the same order.
y.lrn <- target.lrn[, 1]
y.val <- target.val[, 1]
y     <- c(y.lrn, y.val)

n.lrn <- length(y.lrn)
n.val <- length(y.val)
n     <- length(y)

rm(target.lrn, target.val)

# Weight 1 on learning rows and 0 on validation rows. Built in one vectorised
# call: the former `for (i in 1:n.lrn)` loop would have set wts[1] to 1 even
# when there are no learning rows, because 1:0 counts down.
wts <- rep(c(1, 0), times = c(n.lrn, n.val))

# Candidate features: a fixed-width table with two header lines skipped and
# three character columns (file id, feature name, data type sub-directory).
sel <- read.fwf("cty_sel.txt", widths = c(5, 15, 5), skip = 2,
                strip.white = TRUE, colClasses = "character",
                col.names = c("file", "feature", "type"))

n.sel <- nrow(sel)

# Baseline: mean squared response on the validation rows, i.e. the validation
# MSE of always predicting zero. prev.best is set strictly higher so that the
# selection loop is entered at least once.
curr.best.mse.val <- sum(y.val^2 / n.val)
prev.best.mse.val <- curr.best.mse.val + 1

# TRUE until the first feature has been accepted into the model.
first.time <- TRUE

# ---------------------------------------------------------------------------
# Greedy forward selection.
#
# Each pass tries every remaining candidate feature in turn, adding it to the
# design matrix accepted so far (prev.best.X), refitting y ~ X with weights
# wts, and measuring the mean squared residual on the validation rows
# (rows n.lrn+1 .. n). The candidate with the lowest validation MSE is
# accepted if it beats the best MSE seen so far; otherwise selection stops.
#
# A feature is recorded in mod.* only when it actually improved the
# validation MSE, so the final report lists every accepted feature and
# nothing else (no stale index, no "drop the last entry" correction).
# ---------------------------------------------------------------------------

mod.file    <- character(0)
mod.feature <- character(0)
mod.type    <- character(0)

while (n.sel > 0 && curr.best.mse.val < prev.best.mse.val) {
  prev.best.mse.val <- curr.best.mse.val
  print("")

  for (i in seq_len(n.sel)) {

    fn.lrn <- paste0("../lrn/", sel$type[i], "/", sel$file[i], ".dat")
    fn.val <- paste0("../val/", sel$type[i], "/", sel$file[i], ".dat")
    print(sel$feature[i])

    if (sel$type[i] == "chr") {

      # Categorical feature: read as text, stack learning over validation.
      f.lrn <- read.table(fn.lrn, header = TRUE, colClasses = "character",
                          blank.lines.skip = FALSE)
      f.val <- read.table(fn.val, header = TRUE, colClasses = "character",
                          blank.lines.skip = FALSE)

      f <- c(f.lrn[, 1], f.val[, 1])

      if (sel$feature[i] == "STATE") {
        # Collapse listed state codes into five pooled levels; codes not
        # listed here keep their own level.
        f[f %in% c("AS", "DC", "DE", "MA", "ME", "NH",
                   "OH", "RI", "VI", "WV")]               <- "S1"
        f[f %in% c("AA", "AE", "AP", "CT", "GU", "MD",
                   "NJ", "NY", "PA", "VA", "VT", "WY")]   <- "S2"
        f[f %in% c("AK", "UT", "MS")]                     <- "S3"
        f[f %in% c("NE", "ND")]                           <- "S4"
        f[f %in% c("SD", "SC")]                           <- "S5"
      }

      f <- as.factor(f)

      n.lev <- nlevels(f)
      print(paste("  nlevels = ", n.lev))

      # One indicator column per level, then drop the first column so that
      # it acts as the reference level (lm adds its own intercept).
      f <- model.matrix(y ~ f - 1)
      f <- f[, -1]

    } else {

      # Numeric feature: stack learning over validation, missing values -> 0.
      f.lrn <- read.table(fn.lrn, header = TRUE, colClasses = "numeric",
                          blank.lines.skip = FALSE)
      f.val <- read.table(fn.val, header = TRUE, colClasses = "numeric",
                          blank.lines.skip = FALSE)

      f <- c(f.lrn[, 1], f.val[, 1])

      f[is.na(f)] <- 0

      if (sel$feature[i] == "DOB") {
        # DOB enters as three columns: a "value is positive" flag, the
        # value itself and its square.
        d <- f
        d[d > 0] <- 1
        f <- cbind(d, f, f^2)
        rm(d)
      }
    }

    # Candidate design = columns accepted so far + this feature.
    if (first.time) {
      X <- f
    } else {
      X <- cbind(prev.best.X, f)
    }

    rm(f.lrn, f.val, f)

    fit <- lm(y ~ X, weights = wts)

    # Residuals are in the stacked row order: learning rows, then validation.
    ehat    <- fit$residuals
    mse.lrn <- sum(ehat[seq_len(n.lrn)]^2) / n.lrn
    mse.val <- sum(ehat[n.lrn + seq_len(n.val)]^2) / n.val
    rm(ehat)

    # Row 1, column 5 of the anova table: the p-value of the X term.
    # Called through the generic so S3 dispatch picks the lm method.
    fit.pval <- anova(fit)[1, 5]

    print(paste("  rank    = ", fit$rank))
    print(paste("  P-value = ", fit.pval))
    print(paste("  mse.lrn = ", mse.lrn))
    print(paste("  mse.val = ", mse.val))

    if (mse.val < curr.best.mse.val) {
      curr.best.mse.val <- mse.val
      curr.best.i       <- i
      curr.best.X       <- X
      curr.best.rank    <- fit$rank
      curr.best.pval    <- fit.pval
      curr.best.mse.lrn <- mse.lrn
    }
    rm(fit, fit.pval)
  }

  # No candidate lowered the validation MSE on this pass: stop without
  # recording anything (curr.best.i would be stale, or undefined on the
  # first pass).
  if (!(curr.best.mse.val < prev.best.mse.val)) break

  print("")
  print(paste("best is", sel$feature[curr.best.i]))
  print(paste("  rank    = ", curr.best.rank))
  print(paste("  P-value = ", curr.best.pval))
  print(paste("  mse.lrn = ", curr.best.mse.lrn))
  print(paste("  mse.val = ", curr.best.mse.val))

  # Record the accepted feature and remove it from the candidate list.
  mod.file    <- c(mod.file, sel$file[curr.best.i])
  mod.feature <- c(mod.feature, sel$feature[curr.best.i])
  mod.type    <- c(mod.type, sel$type[curr.best.i])

  sel   <- sel[-curr.best.i, ]
  n.sel <- n.sel - 1

  prev.best.X <- curr.best.X

  first.time <- FALSE
}

# Final report: every accepted feature, in the order it was selected.
print("")
print("")
print("features selected are")
print("")
for (i in seq_along(mod.file)) {
  print(paste(mod.file[i], "  ", mod.feature[i], "  ", mod.type[i]))
}

