######################################################################################
#### garbage collector
######################################################################################
collectGarbage <- function()
{
while (gc()[2,4] != gc()[2,4]){}
}
######################################################################################
# bind the column with name response.predict and contents vec as last column
# to data frame d
######################################################################################
bind_response <- function(d,response.predict,vec)
{
# drop column response.predict if there, do nothing if not there
d <- d[,setdiff(names(d),response.predict)]
# bind column response.predict as last column to data frame d
d <- cbind(d, prediction=vec)
names(d)[names(d)=="prediction"] <- response.predict
return(d)
}
######################################################################################
# sorted_rf_importance:
# return the list of input variables sorted increasingly by their RF-importance
# (uses na.roughfix for missing value replacement)
# IN
# ...
# LOAD_IMPORTANCE =F: train a random forest on the data and save the importance
# result in
/_imp1.Rdata
# =T: load the previous results from /_imp1.Rdata
# This switch is only to speed up calculation when the importance has
# been calculated and saved previously. It is always safe to
# set LOAD_IMPORTANCE=F (but it may take longer).
# GRAPH2FILE =T: plot into file /_importance.png
# =F: plot importance bar plot onto screen
# OUT
# s_input input.variables sorted by increasing importance
######################################################################################
sorted_rf_importance <- function(dir.rdata,filename,d_train,response.variable,
input.variables,LOAD_IMPORTANCE,CLASSWT,
GRAPH2FILE,graphic,graphicext)
{
formula <- formula(paste(response.variable, "~ .")) # use all possible input variables
if (LOAD_IMPORTANCE==F) {
cat(filename,": Train RF (importance) ...\n")
to.model <- d_train[,c(response.variable,input.variables)]
samp <- min(length(to.model[,1]),3000)
res.rf <- randomForest( formula=formula, data=to.model, nodesize=1,classwt=CLASSWT,
sampsize=samp, proximity=F, na.action=na.roughfix)
imp1 <- res.rf$importance
save(imp1,file=paste(dir.rdata, sub(".txt","",sub(".csv", "", filename)),"_imp1.Rdata", sep=""))
} else {
# load imp1 from previous run (nothing different, only faster)
load(file=paste(dir.rdata, sub(".txt","",sub(".csv", "", filename)),"_imp1.Rdata", sep=""))
}
# s_input: input.variables sorted by increasing importance:
s_input <- input.variables[order(imp1,decreasing=F)]
# plot it:
if (GRAPH2FILE == F){
X11() # synonym for win.graph(), works on Unix and Win
} else {
# write graphical output to file
graphic(filename=paste(dir.graphics, sub(".csv", "", filename), "_", response.variable, "_importance", graphicext, sep=""))
}
par(mar=c(20.1,3.1,2.1,2.1))
barplot(t(imp1[order(imp1)]), names.arg=s_input, las=3, cex.lab=0.75, col=2, main=paste(filename, response.variable, sep=" : "))
if (GRAPH2FILE == T) dev.off()
return (s_input)
}
######################################################################################
# confmat: function to calculate confusion matrix & gain
#
# INPUT:
# ------
# d data frame
# colreal name of column in d which contains the real class
# colpred name of column in d which contains the predicted class
# costmat the cost matrix for each possible outcome, same size as cm$mat (see below);
# costmat[R1,P2]: the cost associated with a record of real class R1 which we
# predict as class P2.
#
#
# OUTPUT:
# -------
# cm, a list containing
# mat matrix with real class levels as rows, predicted class levels columns.
# mat[R1,P2]: number of records with real class R1
# predicted as class P2.
# CAUTION: If colpred contains NA's, those cases are missing in mat (!)
# (but the class errors are correct as long as there are no NA's in colreal)
# cerr class error rates, vector of size nlevels(colreal)+1.
# cerr[X] is the misclassification rate for real class X.
# cerr["Total"] is the total classification error rate.
# gain the total gain, given the cost matrix costmat
# gain.vector gain.vector[X] is the gain coming from real class X.
# gain.vector["Total"] is again the total gain
# gainmax the maximum reachable gain, assuming perfect prediction
# rgain ratio gain/gainmax in percent
######################################################################################
confmat <- function(d,colreal,colpred,costmat)
{
cmat = matrix(0,nrow=nlevels(d[,colreal]),ncol=nlevels(d[,colpred]),
dimnames=list(levels(d[,colreal]),levels(d[,colpred])))
ccase = matrix(0,nrow=1,ncol=nlevels(d[,colreal])+1,
dimnames=list("class cases:",c(levels(d[,colpred]),"Total")))
# confusion matrix
for (rc in levels(d[,colreal])) {
d1 <- d[d[,colreal]==rc, ] # all records with real class rc
ccase[1,rc] <- dim(d1)[1]
for (pc in levels(d[,colpred])) {
cmat[rc,pc] <- length(which(d1[,colpred]==pc))
# each case with NA in colpred is in *none* of the which sets
}
}
ccase[1,"Total"] <- sum(ccase) # all cases, including those with NA in colpred
# class errors
cerr = matrix(0,nrow=1,ncol=nlevels(d[,colpred])+1,
dimnames=list("class errors:",c(levels(d[,colpred]),"Total")))
for (rc in levels(d[,colreal])) {
cerr[1,rc] <- 1-cmat[rc,rc]/ccase[1,rc] # before: /sum(cmat[rc,])
}
cerr[1,"Total"] <- 1-sum(diag(cmat))/ccase[1,"Total"] # /sum(cmat)
gain = sum(costmat*cmat)
gain.vector = matrix(0,nrow=1,ncol=nlevels(d[,colpred])+1,
dimnames=list("gain.vector",c(levels(d[,colpred]),"Total")))
for (rc in levels(d[,colreal])) {
gain.vector[1,rc] <- sum(costmat[rc,] * cmat[rc,])
}
gain.vector[1,"Total"] <- sum(gain.vector)
gainmax = sum(diag(costmat)*rowSums(cmat))
rgain = gain/gainmax*100
cm = list(mat=cmat,cerr=cerr,ccase=ccase,gain=gain,gain.vector=gain.vector,
gainmax=gainmax,rgain=rgain)
return(cm)
}