#
# More advanced template for a data mining process
# (does the same as ../main_iris.R, but has more 'bells and whistles', e.g. 
# more switches to adjust, speed-up items for large datasets, 
# more methods for graphics and output)
#
# Author: Wolfgang Konen, FHK, March 2009
#
# --- Session setup ---------------------------------------------------------
rm(list = ls())             # wipe every variable and function from the workspace
graphics.off()              # shut all open graphics devices
library(randomForest)
source("utils_DMC_adv.R")   # project helpers used throughout this script
collectGarbage()            # presumably defined in utils_DMC_adv.R -- verify

# --- Switches --------------------------------------------------------------
# TRUE : read the data from .csv and cache it as .Rdata
# FALSE: load the cached .Rdata instead (faster)
READ_CSV <- TRUE
# TRUE : print a summary of the data
SHOW_SUMM <- TRUE
# TRUE : write graphical output to image files in the dir.graphics directory
# FALSE: plot to screen
GRAPH2FILE <- FALSE
# value handed to the 'predict.all' keyword of predict.randomForest
PRED_ALL <- FALSE
# TRUE : use Naive Bayes modeling instead of RF (requires package e1071)
NAIVE_BAYES <- FALSE
# class weights for A,B,C (the higher the weight, the more costly a
# misclassification of that real class). Use NULL for no weights.
CLASSWT <- c(10, 10, 10)
# TRUE : load importance from ..._imp.Rdata
# FALSE: calculate importance
LOAD_IMPORTANCE <- FALSE
# 0..N: how many variables (those with lowest importance) to drop
NDROP <- 0

#basedir <- "C:/user/datasets/Vorlesungen/DMC-Cup/IrisTest/"
basedir <- "./"

# All input and output lives below 'basedir' on every platform; a Windows
# drive path such as "D:/tmp/" is not usable on Unix-like systems, so the
# platform check only selects the graphics device and the file extension.
# .Platform$OS.type is used for the check because Sys.info() can be NULL on
# some platforms.
directory <- basedir
if (.Platform$OS.type == "windows") {
    graphic <- function(filename, ...) png(filename, ...)
    graphicext <- ".png"
} else {
    graphic <- function(filename, ...) jpeg(filename, ...)
    graphicext <- ".jpg"
}
dir.data <- directory                               # location of the .csv input
dir.graphics <- paste0(directory, "Graphics/")      # image files (GRAPH2FILE)
dir.rdata <- paste0(directory, "RData/")            # cached .Rdata files
dir.output <- paste0(directory, "Output/")          # log file and predictions

# Make sure the output directories exist, otherwise sink(), save() and
# write.table() further down fail on a fresh checkout. The trailing slash is
# stripped because some platforms reject it in directory names; already
# existing directories are silently left alone.
for (out_dir in c(dir.graphics, dir.rdata, dir.output)) {
    dir.create(sub("/+$", "", out_dir), showWarnings = FALSE, recursive = TRUE)
}

filenames <- c("iris.csv")

filename <- filenames[1]

    # Mirror everything that is printed into a log file as well.
    # append=FALSE starts a fresh log file; split=TRUE sends the output both
    # to the R console AND to the file.
    log_file <- paste0(dir.output, sub(".csv", "", filename), "_sink.txt")
    sink(log_file, append = FALSE, split = TRUE)

    #===============================================
    # PART 1: READ DATA
    #===============================================
    # cache file that holds 'raw.ds' between runs
    rdata_file <- paste0(dir.rdata, sub(".csv", ".Rdata", filename))
    if (READ_CSV) {
        cat(filename, ": Read data ...\n")
        # semicolon-separated file with "." as decimal mark; "-1" marks a
        # missing value
        raw.ds <- read.csv2(file = paste0(dir.data, filename), dec = ".",
                            na.strings = "-1", nrows = -1)
        # REMARK: for a large dataset (e.g. 100000 records) you may set
        # nrows=100 or so while you are still experimenting with / debugging
        # the script, to speed things up. For the real experiment set
        # nrows=-1 again to fetch all records.
        save(raw.ds, file = rdata_file)
    } else {
        # fast path for large data sets: restore 'raw.ds' from the cache
        cat(filename, ": Load data ...\n")
        system.time(load(file = rdata_file))
    }

    # which variables are response variables:
    response.variables <- "Species"

    # special for iris-dataset: force columns to be numeric (otherwise R thinks
    # that they are factors and rowSums below does not work)
    #    for (n in input.variables) {
    #        raw.ds[,n] <- as.numeric(raw.ds[,n])
    #    }

    #=============================================
    # PART 2a: SELECT + PREPOC DATA
    # --- this step can be skipped if you don't need additional variables   ---
    #=============================================
    cat(filename, ": Select + preproc data ...\n")
    # just as an example how to add new variables to training and test data set:
    # add a new column "sum_c" = sum of all columns 1:4
    # (bind_response is a short helper from the sourced utils file)
    raw.ds <- bind_response(raw.ds, "sum_c", rowSums(raw.ds[, 1:4]))

    #=============================================
    # PART 2b: DIVIDE INTO TRAINING SET / TEST SET
    #=============================================
    # the division is done by random sampling
    #set.seed(44)                    # if you want reproducibly the same sets
    L <- nrow(raw.ds)
    p <- sample.int(L)              # random permutation of indices 1:L
    # Use an integer test-set size. With the fractional bound L/3 the sequence
    # (L/3+1):L would stop short of L whenever L is not a multiple of 3, so the
    # last shuffled record would end up in neither set.
    n_test <- L %/% 3
    # A logical mask (rather than negative indexing) keeps the two sets
    # disjoint and complete even when n_test is 0.
    is_test <- seq_len(L) <= n_test
    d_test <- raw.ds[p[is_test], ]     # test set     (1/3 of the data, rounded down)
    d_train <- raw.ds[p[!is_test], ]   # training set (all remaining records)
    # set input variables (everything that is not in response.variables):
    input.variables <- setdiff(names(raw.ds), response.variables)
    if (SHOW_SUMM) {
      cat("Summary of raw.ds:\n")
      print(summary(raw.ds))             # most columns are of numeric type
                                         # -> summary min,max,quantiles...
    }
    cat("\n")

    #=============================================
    # PART 3: TREAT MISSING VALUES
    #=============================================
    #cat(filename,": Missing Values ...\n")
    #--- currently not necessary -----------------

    # Helpers for PART 8: open / close the device for one plot.
    # With GRAPH2FILE the plot goes to a file
    #   <dir.graphics><dataset name><suffix><graphicext>
    # otherwise a new on-screen window is opened.
    file_stem <- sub(".csv", "", filename)
    open_plot_device <- function(suffix) {
        if (GRAPH2FILE) {
            # write graphical output to file
            graphic(filename = paste0(dir.graphics, file_stem, suffix, graphicext))
        } else {
            X11()         # synonym for win.graph(), works on Unix and Win
        }
    }
    close_plot_device <- function() {
        # only file devices are closed; screen windows stay open
        if (GRAPH2FILE) dev.off()
    }

    for (response.variable in response.variables) {
        #=============================================
        # PART 4a: TRAIN RANDOM FOREST (IMPORTANCE)
        #=============================================
        # rank all input variables by building a test RF
        cat(filename, ": Importance check ...\n")
        s_input <- sorted_rf_importance(dir.rdata, filename, d_train, response.variable,
                                        input.variables, LOAD_IMPORTANCE, CLASSWT,
                                        GRAPH2FILE, graphic, graphicext)

        # discard the first NDROP entries of s_input
        # NOTE(review): this removes the least important variables only if
        # sorted_rf_importance returns them in ascending order of importance,
        # and it misbehaves for NDROP >= length(s_input) -- confirm.
        input.variables <- s_input[(NDROP + 1):length(s_input)]
        cat("Dropped columns: ", setdiff(s_input, input.variables), "\n")
        # keep the response as FIRST column, followed by the surviving inputs
        keep_cols <- c(response.variable, input.variables)
        d_train <- d_train[, keep_cols]
        d_test <- d_test[, keep_cols]


        #=============================================
        # PART 4b: MODELING: TRAIN RANDOM FOREST
        #=============================================
        cat(filename, ": Train RF ...\n")
        cat("Class weights: ", CLASSWT, "\n")
        # "<response> ~ ." : model the response with all remaining columns
        formula <- formula(paste(response.variable, "~ ."))
        # fit the forest on the reduced training set:
        res.rf <- randomForest(formula = formula, data = d_train, nodesize = 1,
                               classwt = CLASSWT, sampsize = nrow(d_train),
                               proximity = FALSE, na.action = na.roughfix)
        print(res.rf)

        #=============================================
        # PART 5: APPLY RANDOM FOREST
        #=============================================
        cat(filename, ": Apply RF ...\n")
        test.predict <- predict(res.rf, newdata = d_test, predict.all = PRED_ALL)
        if (PRED_ALL) {
            # with predict.all the aggregated prediction is one list element
            test.predict <- test.predict$aggregate
        }
        # res.rf$predicted is the *OOB-prediction* on the training set.
        # Think about this! Why is it WRONG (or too optimistic) to use here
        #        train.predict <- predict(res.rf, newdata=d_train)
        # as the prediction for the training set?
        train.predict <- res.rf$predicted

        if (NAIVE_BAYES) {
            cat(filename, ": Train & predict with Naive Bayes ...\n")
            library(e1071)          # required for naiveBayes
            res.nb <- naiveBayes(formula = formula, data = d_train)
            # CAUTION!! predict.naiveBayes requires that only the input
            # variables enter via 'newdata', so the 1st column (the response)
            # has to be excluded!! Otherwise a strange error appears:
            #   Error in FUN(1:6[[1L]], ...) : subscript out of bounds
            # Only the test prediction is replaced; train.predict still holds
            # the RF OOB prediction.
            test.predict <- predict(res.nb, newdata = d_test[, -1])
        }

        # append the predicted class as last column "pred_<response>" to both
        # data frames
        name.of.prediction <- paste0("pred_", response.variable)
        d_train <- bind_response(d_train, name.of.prediction, train.predict)
        d_test  <- bind_response(d_test, name.of.prediction, test.predict)

        #=============================================
        # PART 6: POSTPROCESSING
        #=============================================
        #cat(filename,": Postprocessing ...\n")
        #--- currently not necessary -----------------

        #=============================================
        # PART 7: EVAL: CALC CONFUSION MATRIX + GAIN
        # --- this part has to be replaced by something else if   ---
        # --- you do regression instead of classification         ---
        #=============================================
        cat(filename, ": Calc confusion matrix + gain ...\n")
        # 3x3 gain matrix: +1 on the diagonal (correct class), -1 elsewhere;
        # rows = true class levels, columns = predicted class levels
        costmat <- matrix(c(+1, -1, -1,
                            -1, +1, -1,
                            -1, -1, +1),
                          byrow = TRUE, nrow = 3, ncol = 3,
                          dimnames = list(levels(d_test[, response.variable]),
                                          levels(d_test[, name.of.prediction])))

        cat("\nTraining cases:\n")
        cm.train <- confmat(d_train, response.variable, name.of.prediction, costmat)
        cat("           --- predicted ---\n")
        print(cm.train$mat)                     # confusion matrix
        #print(colSums(cm.train$mat))
        print(cm.train$cerr)                    # error of OOB-prediction on training set
        print(res.rf$err.rate[res.rf$ntree, ])  # OOB training error (all trees)
        print(cm.train$gain.vector)
        cat("total gain:", cm.train$gain, "(is", cm.train$rgain, "% of max. gain)\n")

        cat("\nTest cases:\n")
        cm.test <- confmat(d_test, response.variable, name.of.prediction, costmat)
        cat("           --- predicted ---\n")
        print(cm.test$mat)                      # confusion matrix on test set
        #print(colSums(cm.test$mat))
        cat("Test set error:\n")
        print(cm.test$cerr)
        print(cm.test$gain.vector)
        cat("total gain: ", cm.test$gain, "(is", cm.test$rgain, "% of max. gain)\n")

        #=============================================
        # PART 8: GRAPHICS
        # --- this part has to be replaced by something else if   ---
        # --- you do regression instead of classification         ---
        #=============================================
        # scatter plot for each pair of input variables, colored with 'Species'
        open_plot_device("_scatter")
        pairs(cbind(raw.ds[, 1:4], sum_c = raw.ds$sum_c), cex = 0.6, gap = 0,
              col = c("red", "green", "blue")[as.numeric(raw.ds$Species)],
              main = "Iris Data: Predictors and their sum")
        close_plot_device()

        # bar plot of true/false test cases for all classes
        open_plot_device("_bar_test")
        cmt <- cm.test$mat
        # 2x3 matrix: 1st row = number of correct classifications per class
        # (diagonal of the confusion matrix), 2nd row = number of wrong
        # classifications per class (row sum minus diagonal element)
        bar_heights <- matrix(c(diag(cmt), rowSums(cmt) - diag(cmt)),
                              nrow = 2, ncol = 3, byrow = TRUE)
        barplot(bar_heights, beside = TRUE, col = c("blue", "red"),
                legend = c("true", "false"),
                names.arg = colnames(cmt),
                main = "True/false classification on test set")
        close_plot_device()

        #=============================================
        # PART 9: WRITE RESULTS TO FILE
        #=============================================
        #save.image(paste(dir.rdata, filename, "_", response.variable, ".RData", sep=""))
        # the file name does not contain the response variable, so only the
        # .csv of the last response.variable survives
        write.table(d_test, file = paste0(dir.output, file_stem, "_predictions.csv"),
                    quote = FALSE, sep = ";", dec = ".",
                    row.names = FALSE, col.names = TRUE)

    } # for (response.variable)
    cat(filename, ": All done.\n")


    sink()            # close log file