###################################################################################### #### garbage collector ###################################################################################### collectGarbage <- function() { while (gc()[2,4] != gc()[2,4]){} } ###################################################################################### # bind the column with name response.predict and contents vec as last column # to data frame d ###################################################################################### bind_response <- function(d,response.predict,vec) { # drop column response.predict if there, do nothing if not there d <- d[,setdiff(names(d),response.predict)] # bind column response.predict as last column to data frame d d <- cbind(d, prediction=vec) names(d)[names(d)=="prediction"] <- response.predict return(d) } ###################################################################################### # sorted_rf_importance: # return the list of input variables sorted increasingly by their RF-importance # (uses na.roughfix for missing value replacement) # IN # ... # LOAD_IMPORTANCE =F: train a random forest on the data and save the importance # result in /_imp1.Rdata # =T: load the previous results from /_imp1.Rdata # This switch is only to speed up calculation when the importance has # been calculated and saved previously. It is always safe to # set LOAD_IMPORTANCE=F (but it may take longer). # GRAPH2FILE =T: plot into file /_importance.png # =F: plot importance bar plot onto screen # OUT # s_input input.variables sorted by increasing importance ###################################################################################### sorted_rf_importance <- function(dir.rdata,filename,d_train,response.variable, input.variables,LOAD_IMPORTANCE,CLASSWT, GRAPH2FILE,graphic,graphicext) { formula <- formula(paste(response.variable, "~ .")) # use all possible input variables if (LOAD_IMPORTANCE==F) { cat(filename,": Train RF (importance) ...\n") to.model <- d_train[,c(response.variable,input.variables)] samp <- min(length(to.model[,1]),3000) res.rf <- randomForest( formula=formula, data=to.model, nodesize=1,classwt=CLASSWT, sampsize=samp, proximity=F, na.action=na.roughfix) imp1 <- res.rf$importance save(imp1,file=paste(dir.rdata, sub(".txt","",sub(".csv", "", filename)),"_imp1.Rdata", sep="")) } else { # load imp1 from previous run (nothing different, only faster) load(file=paste(dir.rdata, sub(".txt","",sub(".csv", "", filename)),"_imp1.Rdata", sep="")) } # s_input: input.variables sorted by increasing importance: s_input <- input.variables[order(imp1,decreasing=F)] # plot it: if (GRAPH2FILE == F){ X11() # synonym for win.graph(), works on Unix and Win } else { # write graphical output to file graphic(filename=paste(dir.graphics, sub(".csv", "", filename), "_", response.variable, "_importance", graphicext, sep="")) } par(mar=c(20.1,3.1,2.1,2.1)) barplot(t(imp1[order(imp1)]), names.arg=s_input, las=3, cex.lab=0.75, col=2, main=paste(filename, response.variable, sep=" : ")) if (GRAPH2FILE == T) dev.off() return (s_input) } ###################################################################################### # confmat: function to calculate confusion matrix & gain # # INPUT: # ------ # d data frame # colreal name of column in d which contains the real class # colpred name of column in d which contains the predicted class # costmat the cost matrix for each possible outcome, same size as cm$mat (see below); # costmat[R1,P2]: the cost associated with a record of real class R1 which we # predict as class P2. # # # OUTPUT: # ------- # cm, a list containing # mat matrix with real class levels as rows, predicted class levels columns. # mat[R1,P2]: number of records with real class R1 # predicted as class P2. # CAUTION: If colpred contains NA's, those cases are missing in mat (!) # (but the class errors are correct as long as there are no NA's in colreal) # cerr class error rates, vector of size nlevels(colreal)+1. # cerr[X] is the misclassification rate for real class X. # cerr["Total"] is the total classification error rate. # gain the total gain, given the cost matrix costmat # gain.vector gain.vector[X] is the gain coming from real class X. # gain.vector["Total"] is again the total gain # gainmax the maximum reachable gain, assuming perfect prediction # rgain ratio gain/gainmax in percent ###################################################################################### confmat <- function(d,colreal,colpred,costmat) { cmat = matrix(0,nrow=nlevels(d[,colreal]),ncol=nlevels(d[,colpred]), dimnames=list(levels(d[,colreal]),levels(d[,colpred]))) ccase = matrix(0,nrow=1,ncol=nlevels(d[,colreal])+1, dimnames=list("class cases:",c(levels(d[,colpred]),"Total"))) # confusion matrix for (rc in levels(d[,colreal])) { d1 <- d[d[,colreal]==rc, ] # all records with real class rc ccase[1,rc] <- dim(d1)[1] for (pc in levels(d[,colpred])) { cmat[rc,pc] <- length(which(d1[,colpred]==pc)) # each case with NA in colpred is in *none* of the which sets } } ccase[1,"Total"] <- sum(ccase) # all cases, including those with NA in colpred # class errors cerr = matrix(0,nrow=1,ncol=nlevels(d[,colpred])+1, dimnames=list("class errors:",c(levels(d[,colpred]),"Total"))) for (rc in levels(d[,colreal])) { cerr[1,rc] <- 1-cmat[rc,rc]/ccase[1,rc] # before: /sum(cmat[rc,]) } cerr[1,"Total"] <- 1-sum(diag(cmat))/ccase[1,"Total"] # /sum(cmat) gain = sum(costmat*cmat) gain.vector = matrix(0,nrow=1,ncol=nlevels(d[,colpred])+1, dimnames=list("gain.vector",c(levels(d[,colpred]),"Total"))) for (rc in levels(d[,colreal])) { gain.vector[1,rc] <- sum(costmat[rc,] * cmat[rc,]) } gain.vector[1,"Total"] <- sum(gain.vector) gainmax = sum(diag(costmat)*rowSums(cmat)) rgain = gain/gainmax*100 cm = list(mat=cmat,cerr=cerr,ccase=ccase,gain=gain,gain.vector=gain.vector, gainmax=gainmax,rgain=rgain) return(cm) }