#
# Simple template for a data mining process
# (dataset: IRIS, method: Random Forest)
#
# Author: Wolfgang Konen, FHK, March-Oct 2009
#
rm(list=ls())       # delete all variables and functions
                    # NOTE(review): rm(list=ls()) in scripts is generally
                    # discouraged; kept to preserve this template's behavior
graphics.off()      # close all graphics windows

library(randomForest)
source("utils_DMC.R")       # project helpers: collectGarbage, bind_response,
                            # sorted_rf_importance, confmat
collectGarbage()

# class weights for Species (the higher, the more costly
# is a misclassification of that real class). NULL for no weights
CLASSWT <- c(10, 10, 10)

# 0..N: how many variables (those with lowest importance) to drop
NDROP <- 0

#directory <- "C:/user/datasets/Vorlesungen/DMC-Cup/IrisTest/"
directory <- "./"
dir.data <- directory       # data files live directly in `directory`
filename <- "iris.csv"

#===============================================
# PART 1: READ DATA
#===============================================
cat(filename,": Read data ...\n")
# `na.strings` and `nrows` are spelled out in full: the original used
# na.string= / nrow= and silently relied on R's partial argument matching.
raw.ds <- read.csv2(file=paste0(dir.data, filename), dec=".",
                    na.strings="-1", nrows=-1)
# REMARK: when you have a large dataset (e.g. 100000 records), you may use
# nrows=100 as long as you experiment with R script and code (during
# debugging) in order to speed up things. When your real experiment starts,
# set nrows=-1 to fetch all records.
# which variable is response variable:
response.variable <- "Species"

#=============================================
# PART 2a: SELECT + PREPOC DATA
# --- this step can be skipped if you don't need additional variables ---
#=============================================
cat(filename,": Select + preproc data ...\n")
# just as an example how to add new variables to training and test data set:
# add a new column "sum_c" = sum of all columns 1:4
# (bind_response is a short function in utils_DMC.R)
raw.ds <- bind_response(raw.ds, "sum_c", rowSums(raw.ds[, 1:4]))

#=============================================
# PART 2b: DIVIDE INTO TRAINING SET / TEST SET
#=============================================
# the division is done by random sampling
set.seed(44)                    # if you want reproducibly the same sets
L <- nrow(raw.ds)               # number of records (was length(raw.ds[,1]))
p <- sample(L)                  # random permutation of indices 1:L
n.test <- floor(L / 3)          # floor() guards against non-integer L/3
                                # when L is not divisible by 3
d_test  <- raw.ds[p[1:n.test], ]            # test set (1/3 of the data)
d_train <- raw.ds[p[(n.test + 1):L], ]      # training set (2/3 of the data)

# set input variables (everything what is not response.variable):
input.variables <- setdiff(names(raw.ds), response.variable)

cat("Summary of raw.ds:\n")
print(summary(raw.ds))          # most columns are of numeric type
                                # -> summary min,max,quantiles...
cat("\n")
#=============================================
# PART 3: TRAIN RANDOM FOREST (IMPORTANCE)
# --- this step can be skipped if you use all input variables (NDROP=0) ---
# --- and if you do not need to see the importance of variables ---
#=============================================
# determine the importance of all input var's by constructing a test RF
cat(filename,": Importance check ...\n")
s_input <- sorted_rf_importance(filename, d_train, response.variable,
                                input.variables, CLASSWT)
# remove the NDROP input variables which have the lowest importance:
input.variables <- s_input[(NDROP + 1):length(s_input)]
cat("Dropped columns: ", setdiff(s_input, input.variables), "\n")
d_train <- d_train[, c(response.variable, input.variables)]
d_test  <- d_test[, c(response.variable, input.variables)]

#=============================================
# PART 4: MODELING: TRAIN RANDOM FOREST
#=============================================
cat(filename,": Train RF ...\n")
cat("Class weights: ", CLASSWT, "\n")
# "Species ~ ." : use all possible input variables.
# Named rf.formula so it does not shadow stats::formula (the original
# assigned to a variable called `formula`).
rf.formula <- formula(paste(response.variable, "~ ."))
# estimate random forest based on previous step:
res.rf <- randomForest(formula=rf.formula,
                       data=d_train,
                       nodesize=1,
                       classwt=CLASSWT,
                       sampsize=nrow(d_train),   # was length(d_train[,1])
                       proximity=FALSE,          # TRUE/FALSE, never T/F
                       na.action=na.roughfix)
print(res.rf)

#=============================================
# PART 5: APPLY RANDOM FOREST
#=============================================
cat(filename,": Apply RF ...\n")
test.predict  <- predict(res.rf, newdata=d_test)
train.predict <- res.rf$predicted
# (res.rf$predicted is the *OOB-prediction* on the training set)
# Think about this! Why is it WRONG (or too optimistic) to use here
#     train.predict <- predict(res.rf, newdata=d_train)
# as the prediction for the training set?

# bind the predicted class pred_... as last column to the data frames
name.of.prediction <- paste0("pred_", response.variable)
d_train <- bind_response(d_train, name.of.prediction, train.predict)
d_test  <- bind_response(d_test, name.of.prediction, test.predict)

#=============================================
# PART 7: EVAL: CALC CONFUSION MATRIX + GAIN
# --- this part has to be replaced by something else if ---
# --- you do regression instead of classification ---
#=============================================
cat(filename,": Calc confusion matrix + gain ...\n")
# gain matrix: +1 for a correct classification, -1 for any error
costmat <- matrix(c(+1, -1, -1,
                    -1, +1, -1,
                    -1, -1, +1),
                  byrow=TRUE, nrow=3, ncol=3,
                  dimnames=list(levels(d_test[, response.variable]),  # row names
                                levels(d_test[, name.of.prediction])  # column names
                  ))

cat("\nTraining cases:\n")
cm.train <- confmat(d_train, response.variable, name.of.prediction, costmat)
cat(" --- predicted ---\n")
print(cm.train$mat)     # confusion matrix on training set
cat("total gain:", cm.train$gain, "(is", cm.train$rgain, "% of max. gain)\n")

cat("\nTest cases:\n")
cm.test <- confmat(d_test, response.variable, name.of.prediction, costmat)
cat(" --- predicted ---\n")
print(cm.test$mat)      # confusion matrix on test set
cat("Test set error:\n")
print(cm.test$cerr)
#print(cm.test$gain.vector)
cat("total gain: ", cm.test$gain, "(is", cm.test$rgain, "% of max. gain)\n")

#=============================================
# PART 8: GRAPHICS
# --- this part has to be replaced by something else if ---
# --- you do regression instead of classification ---
#=============================================
X11()   # synonym for win.graph(), works on Unix and Win
# scatter plot for each pair of input variables, colored with 'Species'
pairs(cbind(raw.ds[, 1:4], sum_c=raw.ds$sum_c),
      cex=0.6, gap=0,
      col=c("red", "green", "blue")[as.numeric(raw.ds$Species)],
      main="Iris Data: Predictors and their sum")

# bar plot of true/false test cases for all classes
X11()   # synonym for win.graph(), works on Unix and Win
cmt <- cm.test$mat
# 'height' is a 2*3 matrix containing in its first row the number of
# correct classifications for each Species (diagonal of confusion matrix)
# and in its 2nd row the number of wrong classifications for each Species
# (sum of off-diagonal elements in each row of the confusion matrix)
# (variable was misspelled 'heigth' in the original)
height <- t(matrix(c(diag(cmt), rowSums(cmt) - diag(cmt)), nrow=3, ncol=2))
barplot(height, beside=TRUE, col=c("blue", "red"),
        legend=c("true", "false"), names.arg=colnames(cmt),
        main="True/false classification on test set")

#=============================================
# PART 9: WRITE RESULTS ON TEST SET TO FILE
#=============================================
# "\\.csv$" anchors the pattern and escapes the dot: the original
# sub(".csv", ...) treated "." as a regex wildcard matching any character.
outfile <- paste0(sub("\\.csv$", "", filename), "_predictions.csv")
write.table(d_test, file=outfile, quote=FALSE, sep=";", dec=".",
            row.names=FALSE, col.names=TRUE)

cat(filename,": All done.\n")