######################################################################################
##########     utils_DMC.r:  Some useful utility function      #######################
##########     Author: Wolfgang Konen, FHK, March-Oct 2009     #######################
######################################################################################

######################################################################################
# garbage collector
######################################################################################
collectGarbage <- function()
{
  	while (gc()[2,4] != gc()[2,4]){}
}

######################################################################################
# bind the column with name response.predict and contents vec as last column
# to data frame d
######################################################################################
bind_response <- function(d,response.predict,vec) 
{
    # drop column response.predict if there, do nothing if not there
    d <- d[,setdiff(names(d),response.predict)]
    # bind column response.predict as last column to data frame d
    d <- cbind(d, prediction=vec)
    names(d)[names(d)=="prediction"] <- response.predict

    return(d)
}

######################################################################################
# sorted_rf_importance:
#       return the list of input variables sorted increasingly by their RF-importance
#       (uses na.roughfix for missing value replacement)
# IN
#       ...
#       CLASSWT     class weight vector to use in random forest training
# OUT
#       s_input     input.variables sorted by increasing importance
######################################################################################
sorted_rf_importance <- function(filename,d_train,response.variable,
                                 input.variables,CLASSWT) 
{
    formula <- formula(paste(response.variable, "~ ."))   # use all possible input variables
    cat(filename,": Train RF (importance) ...\n")
    to.model <- d_train[,c(response.variable,input.variables)]
    samp <- min(length(to.model[,1]),3000)
    res.rf <- randomForest( formula=formula, data=to.model, nodesize=1,classwt=CLASSWT,
                            sampsize=samp, proximity=F, na.action=na.roughfix)
    imp1 <- res.rf$importance
    
    # s_input: input.variables sorted by increasing importance:
    s_input <- input.variables[order(imp1,decreasing=F)]

    # plot it:
    X11()         # synonym for win.graph(), works on Unix and Win
    par(mar=c(20.1,3.1,2.1,2.1))
    barplot(t(imp1[order(imp1)]), names.arg=s_input, las=3, cex.lab=0.75, col=2, main=paste(filename, response.variable, sep=" : "))
    
    return (s_input)
}

######################################################################################
# confmat: function to calculate confusion matrix & gain
#
# INPUT:
# ------
#   d 		      data frame
#   colreal     name of column in d which contains the real class
#   colpred 		name of column in d which contains the predicted class
#   costmat     the cost matrix for each possible outcome, same size as cm$mat (see below);
#               costmat[R1,P2]: the cost associated with a record of real class R1 which we 
#               predict as class P2.
#
#
# OUTPUT:
# -------
# cm, a list containing
#     mat       matrix with real class levels as rows, predicted class levels columns.
#               mat[R1,P2]: number of records with real class R1
#               predicted as class P2.
#               CAUTION: If colpred contains NA's, those cases are missing in mat (!)
#               (but the class errors are correct as long as there are no NA's in colreal)
#     cerr      class error rates, vector of size nlevels(colreal)+1.
#               cerr[X] is the misclassification rate for real class X.
#               cerr["Total"] is the total classification error rate.
#     gain      the total gain, given the cost matrix costmat
#     gain.vector  gain.vector[X] is the gain attributed to real class X.
#               gain.vector["Total"] is again the total gain
#     gainmax   the maximum achievable gain, assuming perfect prediction
#     rgain     ratio gain/gainmax in percent
######################################################################################
confmat <- function(d,colreal,colpred,costmat) 
{
    cmat = matrix(0,nrow=nlevels(d[,colreal]),ncol=nlevels(d[,colpred]),
                  dimnames=list(levels(d[,colreal]),levels(d[,colpred])))
    ccase = matrix(0,nrow=1,ncol=nlevels(d[,colreal])+1,
                  dimnames=list("class cases:",c(levels(d[,colpred]),"Total")))

    # confusion matrix
    for (rc in levels(d[,colreal])) {
        d1 <- d[d[,colreal]==rc, ]      # all records with real class rc
        ccase[1,rc] <- dim(d1)[1]
        for (pc in levels(d[,colpred])) {
            cmat[rc,pc] <- length(which(d1[,colpred]==pc))
                           # each case with NA in colpred is in *none* of the which sets  
        }
    }
    ccase[1,"Total"] <- sum(ccase)      # all cases, including those with NA in colpred
    
    # class errors
    cerr = matrix(0,nrow=1,ncol=nlevels(d[,colpred])+1,
                  dimnames=list("class errors:",c(levels(d[,colpred]),"Total")))
    for (rc in levels(d[,colreal])) {
        cerr[1,rc] <- 1-cmat[rc,rc]/ccase[1,rc]     # before: /sum(cmat[rc,])
    }
    cerr[1,"Total"] <- 1-sum(diag(cmat))/ccase[1,"Total"]   # /sum(cmat)
    
    gain = sum(costmat*cmat)
    gain.vector = matrix(0,nrow=1,ncol=nlevels(d[,colpred])+1,
                  dimnames=list("gain.vector",c(levels(d[,colpred]),"Total")))
    for (rc in levels(d[,colreal])) {
        gain.vector[1,rc] <- sum(costmat[rc,] * cmat[rc,])
    }
    gain.vector[1,"Total"] <- sum(gain.vector)
    
    gainmax = sum(diag(costmat)*rowSums(cmat))
    rgain = gain/gainmax*100

    cm = list(mat=cmat,cerr=cerr,ccase=ccase,gain=gain,gain.vector=gain.vector,gainmax=gainmax,rgain=rgain)
    
    return(cm)
}