#
# Simple template for a data mining process
# (dataset: IRIS, method: Random Forest)
#
# Author: Wolfgang Konen, FHK, March-Oct 2009
#
rm(list=ls())       # delete all variables and functions
                    # NOTE(review): rm(list=ls()) in scripts is generally
                    # discouraged; kept to preserve this template's behavior
graphics.off()      # close all graphics windows

library(randomForest)
source("utils_DMC.R")       # project helpers: collectGarbage, bind_response,
                            # sorted_rf_importance, confmat
collectGarbage()

# class weights for Species (the higher, the more costly
# is a misclassification of that real class). NULL for no weights
CLASSWT <- c(10, 10, 10)

# 0..N: how many variables (those with lowest importance) to drop
NDROP <- 0

#directory <- "C:/user/datasets/Vorlesungen/DMC-Cup/IrisTest/"
directory <- "./"
dir.data <- directory       # data files live directly in `directory`
filename <- "iris.csv"

#===============================================
# PART 1: READ DATA
#===============================================
cat(filename,": Read data ...\n")
# `na.strings` and `nrows` are spelled out in full: the original used
# na.string= / nrow= and silently relied on R's partial argument matching.
raw.ds <- read.csv2(file=paste0(dir.data, filename), dec=".",
                    na.strings="-1", nrows=-1)
# REMARK: when you have a large dataset (e.g. 100000 records), you may use
# nrows=100 as long as you experiment with R script and code (during
# debugging) in order to speed up things. When your real experiment starts,
# set nrows=-1 to fetch all records.
# which variable is response variable:
response.variable <- "Species"

#=============================================
# PART 2a: SELECT + PREPOC DATA
# --- this step can be skipped if you don't need additional variables ---
#=============================================
cat(filename,": Select + preproc data ...\n")
# just as an example how to add new variables to training and test data set:
# add a new column "sum_c" = sum of all columns 1:4
# (bind_response is a short function in utils_DMC.R)
raw.ds <- bind_response(raw.ds, "sum_c", rowSums(raw.ds[, 1:4]))

#=============================================
# PART 2b: DIVIDE INTO TRAINING SET / TEST SET
#=============================================
# the division is done by random sampling
set.seed(44)                    # if you want reproducibly the same sets
L <- nrow(raw.ds)               # number of records (was length(raw.ds[,1]))
p <- sample(L)                  # random permutation of indices 1:L
n.test <- floor(L / 3)          # floor() guards against non-integer L/3
                                # when L is not divisible by 3
d_test  <- raw.ds[p[1:n.test], ]            # test set (1/3 of the data)
d_train <- raw.ds[p[(n.test + 1):L], ]      # training set (2/3 of the data)

# set input variables (everything what is not response.variable):
input.variables <- setdiff(names(raw.ds), response.variable)

cat("Summary of raw.ds:\n")
print(summary(raw.ds))          # most columns are of numeric type
                                # -> summary min,max,quantiles...
cat("\n")
#=============================================
# PART 3: TRAIN RANDOM FOREST (IMPORTANCE)
# --- this step can be skipped if you use all input variables (NDROP=0) ---
# --- and if you do not need to see the importance of variables ---
#=============================================
# determine the importance of all input var's by constructing a test RF
cat(filename,": Importance check ...\n")
s_input <- sorted_rf_importance(filename, d_train, response.variable,
                                input.variables, CLASSWT)
# remove the NDROP input variables which have the lowest importance:
input.variables <- s_input[(NDROP + 1):length(s_input)]
cat("Dropped columns: ", setdiff(s_input, input.variables), "\n")
d_train <- d_train[, c(response.variable, input.variables)]
d_test  <- d_test[, c(response.variable, input.variables)]

#=============================================
# PART 4: MODELING: TRAIN RANDOM FOREST
#=============================================
cat(filename,": Train RF ...\n")
cat("Class weights: ", CLASSWT, "\n")
# "Species ~ ." : use all possible input variables.
# Named rf.formula so it does not shadow stats::formula (the original
# assigned to a variable called `formula`).
rf.formula <- formula(paste(response.variable, "~ ."))
# estimate random forest based on previous step:
res.rf <- randomForest(formula=rf.formula,
                       data=d_train,
                       nodesize=1,
                       classwt=CLASSWT,
                       sampsize=nrow(d_train),   # was length(d_train[,1])
                       proximity=FALSE,          # TRUE/FALSE, never T/F
                       na.action=na.roughfix)
print(res.rf)

#=============================================
# PART 5: APPLY RANDOM FOREST
#=============================================
cat(filename,": Apply RF ...\n")
test.predict  <- predict(res.rf, newdata=d_test)
train.predict <- res.rf$predicted
# (res.rf$predicted is the *OOB-prediction* on the training set)
# Think about this! Why is it WRONG (or too optimistic) to use here
#     train.predict <- predict(res.rf, newdata=d_train)
# as the prediction for the training set?

# bind the predicted class pred_... as last column to the data frames
name.of.prediction <- paste0("pred_", response.variable)
d_train <- bind_response(d_train, name.of.prediction, train.predict)
d_test  <- bind_response(d_test, name.of.prediction, test.predict)

#=============================================
# PART 7: EVAL: CALC CONFUSION MATRIX + GAIN
# --- this part has to be replaced by something else if ---
# --- you do regression instead of classification ---
#=============================================
cat(filename,": Calc confusion matrix + gain ...\n")
# gain matrix: +1 for a correct classification, -1 for any error
costmat <- matrix(c(+1, -1, -1,
                    -1, +1, -1,
                    -1, -1, +1),
                  byrow=TRUE, nrow=3, ncol=3,
                  dimnames=list(levels(d_test[, response.variable]),  # row names
                                levels(d_test[, name.of.prediction])  # column names
                  ))

cat("\nTraining cases:\n")
cm.train <- confmat(d_train, response.variable, name.of.prediction, costmat)
cat(" --- predicted ---\n")
print(cm.train$mat)     # confusion matrix on training set
cat("total gain:", cm.train$gain, "(is", cm.train$rgain, "% of max. gain)\n")

cat("\nTest cases:\n")
cm.test <- confmat(d_test, response.variable, name.of.prediction, costmat)
cat(" --- predicted ---\n")
print(cm.test$mat)      # confusion matrix on test set
cat("Test set error:\n")
print(cm.test$cerr)
#print(cm.test$gain.vector)
cat("total gain: ", cm.test$gain, "(is", cm.test$rgain, "% of max. gain)\n")

#=============================================
# PART 8: GRAPHICS
# --- this part has to be replaced by something else if ---
# --- you do regression instead of classification ---
#=============================================
X11()   # synonym for win.graph(), works on Unix and Win
# scatter plot for each pair of input variables, colored with 'Species'
pairs(cbind(raw.ds[, 1:4], sum_c=raw.ds$sum_c),
      cex=0.6, gap=0,
      col=c("red", "green", "blue")[as.numeric(raw.ds$Species)],
      main="Iris Data: Predictors and their sum")

# bar plot of true/false test cases for all classes
X11()   # synonym for win.graph(), works on Unix and Win
cmt <- cm.test$mat
# 'height' is a 2*3 matrix containing in its first row the number of
# correct classifications for each Species (diagonal of confusion matrix)
# and in its 2nd row the number of wrong classifications for each Species
# (sum of off-diagonal elements in each row of the confusion matrix)
# (variable was misspelled 'heigth' in the original)
height <- t(matrix(c(diag(cmt), rowSums(cmt) - diag(cmt)), nrow=3, ncol=2))
barplot(height, beside=TRUE, col=c("blue", "red"),
        legend=c("true", "false"), names.arg=colnames(cmt),
        main="True/false classification on test set")

#=============================================
# PART 9: WRITE RESULTS ON TEST SET TO FILE
#=============================================
# "\\.csv$" anchors the pattern and escapes the dot: the original
# sub(".csv", ...) treated "." as a regex wildcard matching any character.
outfile <- paste0(sub("\\.csv$", "", filename), "_predictions.csv")
write.table(d_test, file=outfile, quote=FALSE, sep=";", dec=".",
            row.names=FALSE, col.names=TRUE)

cat(filename,": All done.\n")