#
# More advanced template for a data mining process
# (does the same as ../main_iris.R, but has more 'bells and whistles', e.g.
# more switches to adjust, speed-up items for large datasets,
# more methods for graphics and output)
#
# Author: Wolfgang Konen, FHK, March 2009
#
# This first section resets the session, loads dependencies, defines the
# user-adjustable switches, derives all working directories and opens the
# log file. Everything defined here is read by the later parts of the script.
#
rm(list = ls())   # delete all variables and functions
graphics.off()    # close all graphics windows

library(randomForest)
source("utils_DMC_adv.R")
collectGarbage()

# ---- Switches ----
READ_CSV <- TRUE          # =TRUE: read data from .csv & save on .Rdata
                          # =FALSE: load from .Rdata (faster)
SHOW_SUMM <- TRUE         # =TRUE: show summary of data
GRAPH2FILE <- FALSE       # =TRUE: write graphical output to image files in
                          #        the dir.graphics directory
                          # =FALSE: plot to screen
PRED_ALL <- FALSE         # =TRUE/FALSE: setting for keyword predict.all in
                          # predict.randomForest
NAIVE_BAYES <- FALSE      # =TRUE/FALSE: use Naive Bayes modeling instead of RF
                          # (requires package e1071)
CLASSWT <- c(10, 10, 10)  # class weights for A,B,C (the higher, the more costly
                          # is a misclassification of that real class).
                          # NULL for no weights
LOAD_IMPORTANCE <- FALSE  # =TRUE: load importance from ..._imp.Rdata,
                          # =FALSE: calculate importance
NDROP <- 0                # 0..N: how many variables (those with lowest
                          # importance) to drop

# ---- Directories and graphics device ----
#basedir <- "C:/user/datasets/Vorlesungen/DMC-Cup/IrisTest/"
basedir <- "./"

# The working directory is basedir on every platform. (The non-Windows branch
# used to point at the Windows drive path "D:/tmp/", which cannot be resolved
# as an absolute path outside Windows.)
directory <- basedir
if (.Platform$OS.type == "windows") {
  graphic <- function(filename, ...) png(filename, ...)
  graphicext <- ".png"
} else {
  graphic <- function(filename, ...) jpeg(filename, ...)
  graphicext <- ".jpg"
}

dir.data <- directory
dir.graphics <- paste0(directory, "Graphics/")
dir.rdata <- paste0(directory, "RData/")
dir.output <- paste0(directory, "Output/")

# Make sure the output directories exist; otherwise sink(), save() and the
# graphics devices fail with "cannot open file". The trailing slash is
# stripped because some platforms reject it in dir.create().
for (out.dir in c(dir.graphics, dir.rdata, dir.output)) {
  dir.create(sub("/+$", "", out.dir), showWarnings = FALSE, recursive = TRUE)
}

filenames <- c("iris.csv")
filename <- filenames[1]

# write standard output also into log file:
# With 'append=FALSE' we indicate that we will write a new file. 'split=TRUE'
# means that the output is written both to R console AND to file.
# The pattern "\\.csv$" only strips a literal trailing ".csv" extension
# (a bare ".csv" is a regex where "." matches any character).
sink(paste0(dir.output, sub("\\.csv$", "", filename), "_sink.txt"),
     append = FALSE, split = TRUE)
#===============================================
# PART 1: READ DATA
#===============================================
# Reads the raw data set into 'raw.ds', either from the .csv file (and caches
# it as .Rdata) or from the cached .Rdata file.
# "\\.csv$" replaces only a literal trailing ".csv" extension.
rdata.file <- paste0(dir.rdata, sub("\\.csv$", ".Rdata", filename))
if (READ_CSV) {
  cat(filename, ": Read data ...\n")
  # stringsAsFactors=TRUE is stated explicitly: since R 4.0.0 the default is
  # FALSE, which would leave the class column as character instead of factor.
  raw.ds <- read.csv2(file = paste0(dir.data, filename), dec = ".",
                      na.strings = "-1", nrows = -1,
                      stringsAsFactors = TRUE)
  # REMARK: when you have a large dataset (e.g. 100000 records), you may use
  # nrows=100 or so as long as you experiment with R script and code (during
  # debugging) in order to speed up things.
  # When your real experiment starts, set nrows=-1 to fetch all records.
  save(raw.ds, file = rdata.file)
} else {
  # this branch is to speed up the reading of large data sets:
  cat(filename, ": Load data ...\n")
  system.time(load(file = rdata.file))
}

# which variables are response variables:
response.variables <- "Species"

# special for iris-dataset: force columns to be numeric (otherwise R thinks
# that they are factors and rowSums below does not work)
# for (n in input.variables) {
#   raw.ds[,n] <- as.numeric(raw.ds[,n])
# }

#=============================================
# PART 2a: SELECT + PREPROC DATA
# --- this step can be skipped if you don't need additional variables ---
#=============================================
cat(filename, ": Select + preproc data ...\n")
# just as an example how to add new variables to training and test data set:
# add a new column "sum_c" = sum of all columns 1:4
# (bind_response is a short helper from the sourced utils file)
raw.ds <- bind_response(raw.ds, "sum_c", rowSums(raw.ds[, 1:4]))

#=============================================
# PART 2b: DIVIDE INTO TRAINING SET / TEST SET
#=============================================
# the division is done by random sampling
#set.seed(44)                 # if you want reproducibly the same sets
L <- nrow(raw.ds)
p <- sample(L)                # random permutation of indices 1:L
# Use a whole-number test-set size. With a fractional L/3 the sequence
# (L/3+1):L consists of fractional indices that are truncated on subsetting,
# which silently dropped the last record from the training set.
n.test <- floor(L / 3)
d_test <- raw.ds[p[seq_len(n.test)], ]       # test set (1/3 of the data)
d_train <- raw.ds[p[seq_len(L) > n.test], ]  # training set (the remainder)
# set input variables (everything that is not in response.variables):
input.variables <- setdiff(names(raw.ds), response.variables)

if (SHOW_SUMM) {
  cat("Summary of raw.ds:\n")
  print(summary(raw.ds))      # most columns are of numeric type
                              # -> summary min,max,quantiles...
}
cat("\n")

#=============================================
# PART 3: TREAT MISSING VALUES
#=============================================
#cat(filename,": Missing Values ...\n")
#--- currently not necessary -----------------

# File name without its literal ".csv" extension, used for all output files.
file.stem <- sub("\\.csv$", "", filename)

# Opens the device for the next plot: an image file in dir.graphics when
# GRAPH2FILE is set, otherwise a new on-screen device (dev.new() picks the
# platform's default device).
open_plot_device <- function(suffix) {
  if (GRAPH2FILE) {
    graphic(filename = paste0(dir.graphics, file.stem, suffix, graphicext))
  } else {
    dev.new()
  }
}

# The loop body prunes columns and appends a prediction column. Keep pristine
# copies so that every response variable starts from the same data instead of
# from the leftovers of the previous iteration.
all.input.variables <- input.variables
d_train.all <- d_train
d_test.all <- d_test

for (response.variable in response.variables) {
  input.variables <- all.input.variables
  d_train <- d_train.all
  d_test <- d_test.all

  #=============================================
  # PART 4a: TRAIN RANDOM FOREST (IMPORTANCE)
  #=============================================
  # determine the importance of all input var's by constructing a test RF
  cat(filename, ": Importance check ...\n")
  s_input <- sorted_rf_importance(dir.rdata, filename, d_train,
                                  response.variable, input.variables,
                                  LOAD_IMPORTANCE, CLASSWT, GRAPH2FILE,
                                  graphic, graphicext)
  # remove the NDROP input variables which have the lowest importance
  # (the first NDROP entries of s_input are the ones removed).
  # A logical mask avoids the reversed range (n+1):n when NDROP is too large;
  # dropping every variable is rejected explicitly.
  stopifnot(NDROP >= 0, NDROP < length(s_input))
  input.variables <- s_input[seq_along(s_input) > NDROP]
  cat("Dropped columns: ", setdiff(s_input, input.variables), "\n")

  # response in the first column, followed by the retained inputs
  d_train <- d_train[, c(response.variable, input.variables)]
  d_test <- d_test[, c(response.variable, input.variables)]

  #=============================================
  # PART 4b: MODELING: TRAIN RANDOM FOREST
  #=============================================
  cat(filename, ": Train RF ...\n")
  cat("Class weights: ", CLASSWT, "\n")
  # use all possible input variables
  rf.formula <- as.formula(paste(response.variable, "~ ."))
  # estimate random forest based on previous step:
  res.rf <- randomForest(formula = rf.formula,
                         data = d_train,
                         nodesize = 1,
                         classwt = CLASSWT,
                         sampsize = nrow(d_train),
                         proximity = FALSE,
                         na.action = na.roughfix)
  print(res.rf)

  #=============================================
  # PART 5: APPLY RANDOM FOREST
  #=============================================
  cat(filename, ": Apply RF ...\n")
  test.predict <- predict(res.rf, newdata = d_test, predict.all = PRED_ALL)
  if (PRED_ALL) {
    # with predict.all the result is a list; keep the aggregated prediction
    test.predict <- test.predict$aggregate
  }
  train.predict <- res.rf$predicted
  # (res.rf$predicted is the *OOB-prediction* on the training set)
  # Think about this! Why is it WRONG (or too optimistic) to use here
  #     train.predict <- predict(res.rf, newdata=d_train)
  # as the prediction for the training set?

  if (NAIVE_BAYES) {
    cat(filename, ": Train & predict with Naive Bayes ...\n")
    library(e1071)            # required for naiveBayes
    res.nb <- naiveBayes(formula = rf.formula, data = d_train)
    # CAUTION!! predict.naiveBayes requires that only the input variables
    # enter via 'newdata', so the response column has to be excluded.
    # Otherwise a strange error appears:
    #   Error in FUN(1:6[[1L]], ...) : subscript out of bounds
    # Selecting by name with drop=FALSE keeps a data frame even when only
    # one input variable is left.
    test.predict <- predict(res.nb,
                            newdata = d_test[, input.variables, drop = FALSE])
    # NOTE(review): train.predict is still the RF OOB prediction here, so the
    # training-set figures below describe the random forest, not Naive Bayes.
  }

  # bind the predicted class pred_... as last column to data frames
  # (res.rf$predicted is the *OOB-prediction* on the training set)
  name.of.prediction <- paste0("pred_", response.variable)
  d_train <- bind_response(d_train, name.of.prediction, train.predict)
  d_test <- bind_response(d_test, name.of.prediction, test.predict)

  #=============================================
  # PART 6: POSTPROCESSING
  #=============================================
  #cat(filename,": Postprocessing ...\n")
  #--- currently not necessary -----------------

  #=============================================
  # PART 7: EVAL: CALC CONFUSION MATRIX + GAIN
  # --- this part has to be replaced by something else if ---
  # --- you do regression instead of classification       ---
  #=============================================
  cat(filename, ": Calc confusion matrix + gain ...\n")

  # gain +1 for each correct classification, -1 for each wrong one;
  # sized by the number of classes instead of a fixed 3x3 layout
  true.levels <- levels(d_test[, response.variable])    # row names
  pred.levels <- levels(d_test[, name.of.prediction])   # column names
  n.class <- length(true.levels)
  costmat <- matrix(-1, nrow = n.class, ncol = n.class,
                    dimnames = list(true.levels, pred.levels))
  diag(costmat) <- 1

  cat("\nTraining cases:\n")
  cm.train <- confmat(d_train, response.variable, name.of.prediction, costmat)
  cat("     --- predicted ---\n")
  print(cm.train$mat)                     # confusion matrix
  #print(colSums(cm.train$mat))
  print(cm.train$cerr)                    # error of OOB-prediction on training set
  print(res.rf$err.rate[res.rf$ntree, ])  # OOB training error (all trees)
  print(cm.train$gain.vector)
  cat("total gain:", cm.train$gain, "(is", cm.train$rgain, "% of max. gain)\n")

  cat("\nTest cases:\n")
  cm.test <- confmat(d_test, response.variable, name.of.prediction, costmat)
  cat("     --- predicted ---\n")
  print(cm.test$mat)                      # confusion matrix on test set
  #print(colSums(cm.test$mat))
  cat("Test set error:\n")
  print(cm.test$cerr)
  print(cm.test$gain.vector)
  cat("total gain: ", cm.test$gain, "(is", cm.test$rgain, "% of max. gain)\n")

  #=============================================
  # PART 8: GRAPHICS
  # --- this part has to be replaced by something else if ---
  # --- you do regression instead of classification       ---
  #=============================================
  # scatter plot for each pair of input variables, colored with 'Species'
  open_plot_device("_scatter")
  pairs(cbind(raw.ds[, 1:4], sum_c = raw.ds$sum_c), cex = 0.6, gap = 0,
        col = c("red", "green", "blue")[as.numeric(raw.ds$Species)],
        main = "Iris Data: Predictors and their sum")
  if (GRAPH2FILE) dev.off()

  # bar plot of true/false test cases for all classes
  open_plot_device("_bar_test")
  cmt <- cm.test$mat
  # 'height' is a 2*K matrix (K = number of classes) containing in its first
  # row the number of correct classifications for each class (diagonal of
  # confusion matrix) and in its 2nd row the number of wrong classifications
  # for each class (sum of off-diagonal elements in each row of the
  # confusion matrix)
  height <- t(matrix(c(diag(cmt), rowSums(cmt) - diag(cmt)),
                     nrow = nrow(cmt), ncol = 2))
  barplot(height, beside = TRUE, col = c("blue", "red"),
          legend = c("true", "false"),
          names.arg = colnames(cmt),
          main = "True/false classification on test set")
  if (GRAPH2FILE) dev.off()

  #=============================================
  # PART 9: WRITE RESULTS TO FILE
  #=============================================
  #save.image(paste(dir.rdata, filename, "_", response.variable, ".RData", sep=""))
  write.table(d_test,
              file = paste0(dir.output, file.stem, "_predictions.csv"),
              quote = FALSE, sep = ";", dec = ".",
              row.names = FALSE, col.names = TRUE)
  # (only .csv for the last response.variable survives)

} # for (response.variable)

cat(filename, ": All done.\n")

sink()  # close log file