rm(list=objects()) library(tidyverse) library(lubridate) library(ranger) library(rpart) source("R/rmse.R") Data0 <- read.csv(file="Data/train.csv", sep=",", dec='.') Data1 <- read.csv(file="Data/test.csv", sep=",", dec='.') ################################################################# #####complétion des valeurs manquantes ################################################################# summary(Data0) percentageNA <- function(x){100*which(is.na(x))%>%length/length(x)} apply(Data0, 2, percentageNA) apply(Data1, 2, percentageNA) Data <-rbind(Data0[,-2], Data1[,-ncol(Data1)]) dim(Data) Data_WTHNA <- Data[-which(is.na(Data$RH_6)), ] RF_NA <- ranger(RH_6 ~ RH_1 + RH_2 + RH_3 + RH_4 + RH_5 + RH_7 + RH_8, data=Data_WTHNA) Data$RH_6[which(is.na(Data$RH_6))] <- predict(RF_NA,data=Data[which(is.na(Data$RH_6)),])$predictions Data_WTHNA <- Data[-which(is.na(Data$Visibility)), ] RF_NA <- ranger(Visibility ~ Tdewpoint + Windspeed + T_out + Instant, data=Data_WTHNA) Data$Visibility[which(is.na(Data$Visibility))] <- predict(RF_NA, data=Data[which(is.na(Data$Visibility)),])$predictions Data0[,-2] <- Data[1:nrow(Data0),] Data1[,-ncol(Data1)] <- Data[(nrow(Data0)+1):nrow(Data),] ############################################################################################### #####LM ############################################################################################### n <- nrow(Data0) set.seed(100) eval_sample <- sample(c(1:n), (n*20/100)%>%floor, replace=F) reg <- lm(Appliances~InstantF, data=Data0[-eval_sample, ]) reg.forecast <- predict(reg, newdata=Data0[eval_sample, ]) rmse(y=Data0[eval_sample, ]$Appliances, ychap=reg.forecast) ############################################################################################### #####rpart ############################################################################################### n <- nrow(Data0) set.seed(100) eval_sample <- sample(c(1:n), (n*20/100)%>%floor, replace=F) cart <- rpart(Appliances~lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + T4 + RH_4 + T5 + RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 + RH_9 + T_out + Press_mm_hg + RH_out + Windspeed + Visibility + Tdewpoint + rv1 + rv2 + Day_of_week + WeekStatus + Instant + BE_load_actual_entsoe_transparency , data=Data0[-eval_sample, ]) summary(cart) cart.forecast <- predict(cart, newdata=Data0[eval_sample, ]) rmse(y=Data0[eval_sample, ]$Appliances, ychap=cart.forecast) ############################################################################################### #####gam ############################################################################################### library(mgcv) n <- nrow(Data0) set.seed(100) eval_sample <- sample(c(1:n), (n*20/100)%>%floor, replace=F) cart <- rpart(Appliances~lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + T4 + RH_4 + T5 + RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 + RH_9 + T_out + Press_mm_hg + RH_out + Windspeed + Visibility + Tdewpoint + rv1 + rv2 + Day_of_week + WeekStatus + Instant + BE_load_actual_entsoe_transparency , data=Data0[-eval_sample, ]) summary(cart) cart.forecast <- predict(cart, newdata=Data0[eval_sample, ]) rmse(y=Data0[eval_sample, ]$Appliances, ychap=cart.forecast) ############################################################################################### #####RF ############################################################################################### rf0 <- ranger(Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + T4 + RH_4 + T5 + RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 + RH_9 + T_out + Press_mm_hg + RH_out + Windspeed + Visibility + Tdewpoint + rv1 + rv2 + Day_of_week + WeekStatus + Instant + BE_load_actual_entsoe_transparency, data=Data0[-eval_sample, ], importance='permutation', oob.error=T, mtry=10, max.depth=1, num.trees = 1000) rf0.forecast <- predict(rf0, data=Data0[eval_sample, ])$predictions rmse(y=Data0[eval_sample, ]$Appliances, ychap=rf0.forecast) rf0.trees <- predict(rf0, predict.all=T,data=Data0[eval_sample, ])$predictions rmse(y=Data0[eval_sample, ]$Appliances, ychap=rowMeans(rf0.trees)) seq(2, 1000, by=100) rmse_tree <- lapply(seq(2, 1000, by=100), function(x){rmse(y=Data0[eval_sample, ]$Appliances, ychap=rowMeans(rf0.trees[,1:x]))} ) plot(seq(2, 1000, by=100), rmse_tree%>%unlist, type='l') which.min(rmse_tree%>%unlist) ############################################################################################### rmse_mtry <- NULL for(mtry in c(1:10)) { rf0 <- ranger(Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + T4 + RH_4 + T5 + RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 + RH_9 + T_out + Press_mm_hg + RH_out + Windspeed + Visibility + Tdewpoint + rv1 + rv2 + Day_of_week + WeekStatus + Instant + BE_load_actual_entsoe_transparency, data=Data0[-eval_sample, ], importance='permutation', oob.error=T, mtry=mtry, num.trees=200) rf0.forecast <- predict(rf0, data=Data0[eval_sample, ])$predictions rmse_mtry <- c(rmse_mtry, rmse(y=Data0[eval_sample, ]$Appliances, ychap=rf0.forecast)) print(mtry) } plot(rmse_mtry, type='b') rf0.forecast <- predict(rf0, data=Data0[eval_sample, ])$predictions rmse(y=Data0[eval_sample, ]$Appliances, ychap=rf0.forecast) names(rf0) plot(rf0$prediction.error) ?ranger library(randomForest) rf0 <- randomForest(Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + T4 + RH_4 + T5 + RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 + RH_9 + T_out + Press_mm_hg + RH_out + Windspeed + Visibility + Tdewpoint + rv1 + rv2 + Day_of_week + WeekStatus + Instant + BE_load_actual_entsoe_transparency, data=Data0[-eval_sample, ]) #####quantiles rf1 <- ranger(Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + T4 + RH_4 + T5 + RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 + RH_9 + T_out + Press_mm_hg + RH_out + Windspeed + Visibility + Tdewpoint + rv1 + rv2 + Day_of_week + WeekStatus + Instant + BE_load_actual_entsoe_transparency, data=Data0[-eval_sample, ], importance='permutation', quantreg=T) rf1.forecast <- predict(rf1, data=Data0[eval_sample, ])$predictions rmse(y=Data0[eval_sample, ]$Appliances, ychap=rf1.forecast) q <- predict(rf1, data=Data0[eval_sample, ], quantiles=0.5, type = 'quantiles')$prediction rmse(y=Data0[eval_sample, ]$Appliances, ychap=q) rmse(y=Data0[eval_sample, ]$Appliances, ychap=rf1.forecast) plot(Data0[eval_sample, ]$Appliances%>%head(500), type='l') lines(q, col='red') lines(rf0.forecast, col='blue') rf1.fitted <- predict(rf1, data=Data0[-eval_sample, ])$predictions hist(Data0[-eval_sample, ]$Appliances-rf1.fitted, breaks=100) ################################### #####variable selection: importance ################################### imp <- rf0$variable.importance o <- order(imp, decreasing=T) plot(imp[o], type='b', pch=20) cov <- names(imp)[o[1:10]] form_rf <- paste0("Appliances ~ ", paste0(cov, collapse='+'))%>%as.formula() rf1 <- ranger(form_rf, data=Data0[-eval_sample, ], importance='permutation') rf1.forecast <- predict(rf1, data=Data0[eval_sample, ])$predictions rmse(y=Data0[eval_sample, ]$Appliances, ychap=rf1.forecast) ################################### #####instant model ################################### Data0_a <- Data0[-eval_sample,] Data0_b <- Data0[eval_sample,] pred.rf.inst <- array(0, dim=nrow(Data0_b)) valInst <- unique(Data0$Instant) rmse_inst1 <- NULL rmse_inst2 <- NULL for(i in valInst) { sela <- which(Data0_a$Instant==i) Data0_a_inst <- Data0_a[sela, ] selb <- which(Data0_b$Instant==i) Data0_b_inst <- Data0_b[selb, ] formule <- Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + T4 + RH_4 + T5 + RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 + RH_9 + T_out + Press_mm_hg + RH_out + Windspeed + Visibility + Tdewpoint + rv1 + rv2 + Day_of_week + WeekStatus + BE_load_actual_entsoe_transparency #cov <- names(imp)[o[2:10]] #form_rf <- paste0("Appliances ~ ", paste0(cov, collapse='+'))%>%as.formula() rf_inst <- ranger(formule, data=Data0_a_inst, importance='permutation') o <- order(imp, decreasing=T) cov <- names(imp)[o[1:20]] form_rf <- paste0("Appliances ~ ", paste0(cov, collapse='+'))%>%as.formula() rf_inst <- ranger(form_rf, data=Data0_a_inst) rf_inst.forecast <- predict(rf_inst, data=Data0_b_inst)$prediction pred.rf.inst[selb] <- rf_inst.forecast rmse_inst1 <- c(rmse_inst1, rmse(Data0_b_inst$Appliances, rf_inst.forecast)) rmse_inst2 <- c(rmse_inst2, rmse(Data0_b_inst$Appliances, rf1.forecast[selb])) } plot(rmse_inst, type='l') plot(Data0_b$Appliances%>%head(24*6), type='l') lines(pred.rf.inst%>%head(24*6),col='red') lines(rf1.forecast%>%head(24*6), col='blue') plot(rmse_inst1, type='l', ylim=range(rmse_inst1, rmse_inst2)) lines(rmse_inst2, col='red') rmse(Data0_b$Appliances, rf1.forecast) rmse(Data0_b$Appliances, pred.rf.inst) ################################### #####variable selection: glmnet ################################### library(glmnet) ?glmnet names(Data0) fact <- lapply(Data0, is.character)%>%unlist which(fact==T) is.factor(Data0$InstantF) Xfact <- model.matrix(Appliances ~ InstantF+Day_of_week-1, data = Data0) Xfact dim(Xfact) levels(as.factor(Data0$InstantF))%>%length+levels(as.factor(Data0$Day_of_week))%>%length-1 #X <- cbind(as.matrix(Xfact), as.matrix(Data0[,-which(fact==T)])) X <- as.matrix(Xfact) glm <- glmnet(x=X, y=Data0$Appliances, lambda=seq(0.0001,10, length=100), alpha=1) plot(glm) ?cv.glmnet cvfit <- cv.glmnet(x=X, y=Data0$Appliances, family = "gaussian", type.measure="mse", lambda=seq(0.0001,10, length=100)) cvfit$lambda.min plot(cvfit) plot(cvfit$cvm) names(cvfit) plot(cvfit$lambda, cvfit$glmnet.fit$ plot(cvfit$glmnet.fit$lambda, cvfit$glmnet.fit$dev.ratio) ################################################################################################################ n <- nrow(Data) Data0 <- Data[1:(n-7*144*2),] Data1 <- Data[(n-7*144*2+1):n,] s <- c(1000:2000) Data0 <- Data[-s,] Data1 <- Data[s,] j1 <- which(Data$date=='2016-01-15 00:00:00') j2 <- which(Data$date=='2016-02-15 00:00:00') j3 <- which(Data$date=='2016-03-15 00:00:00') j4 <- which(Data$date=='2016-04-15 00:00:00') names(Data0) Data0$date randomDay <- c(j1+ c(0:143), j2+ c(0:143), j3+ c(0:143), j4+ c(0:143)) Data0 <- Data[-randomDay,] Data1 <- Data[randomDay,] randomObs <- sample(c(1:nrow(Data)), 1000) Data0 <- Data[-randomObs,] Data1 <- Data[randomObs,] ############################################################################################### #####sélection de variable ############################################################################################### #####regression model n <- nrow(Data0) set.seed(100) eval_sample <- sample(c(1:n), (n*20/100)%>%floor, replace=F) reg <- lm(Appliances~InstantF, data=Data0[-eval_sample, ]) # test <- lm(Appliances~InstantF, data=Data0) # summary(test) # plot(test$coefficients[-1], type='l') reg.forecast <- predict(reg, newdata=Data0[eval_sample, ])