library(dplyr)
library(tidyr)
library(caret)
library(randomForest)
library(randomForestExplainer)
library(rattle)
library(rpart)
library(ggplot2)
library(plotly)
library(xlsx)
library(readxl)
Bring in both the Model Tiering Sheet and Model Inventory Sheet from the Application Tiering Workbook. Fields from both sheets (tabs) will be used.
dir <- getwd()
filename <- "Enterprise Business Metrics V4.xlsx"
#Bring in tiering Sheet
modeltiering.df <- read_excel(paste0(dir,"/",filename),
sheet = "Model Tiering", skip = 2)
## New names:
## * `Tier Compare` -> `Tier Compare...9`
## * `Tier Compare` -> `Tier Compare...12`
#Bring in inventory Sheet
modelinventory.df <- read_excel(paste0(dir,"/",filename),
sheet = "Models", skip = 1, n_max = 621)
Join the sheets together with an inner join.
#Join sheets together
jointiering.df <- inner_join(x = modeltiering.df, y = modelinventory.df, by = "ID")
Pull in only the selected variables and update the column names so they contain no spaces.
The fields used do not include any information related to application dependencies. We would like to see whether a model can predict a model’s tier by looking only at simple information from the model inventory and the “Prioritization” process. Prioritization will likely have a high impact on understanding the importance of each model.
#First test will be to not use the application identification. Just using current inventory field
mitrim.df <- jointiering.df[ , c(1:5,11,13,55:65)]
#take spaces out of column names
colnames(mitrim.df) <- gsub(" ", "_", x = colnames(mitrim.df))
colnames(mitrim.df)
## [1] "ID" "Title.x"
## [3] "Model_Status.x" "Model_Classification.x"
## [5] "Business_Unit.x" "Proposed_Tier"
## [7] "Prioritization" "Model_Owner"
## [9] "Model_Executive" "Primary_Governance_Body_Owner"
## [11] "Primary_Governance_Body" "Author"
## [13] "MOAG_Member" "Interested_Party_1"
## [15] "Interested_Party_2" "Interested_Party_3"
## [17] "Products" "Production_Cycle"
If Prioritization was blank, I included “Maybe” as the identifier.
#change NAs in prioritization to Maybe
mitrim.df$Prioritization[is.na(mitrim.df$Prioritization)] <- "Maybe"
Clean the data by removing zero-variance columns and columns containing NAs.
#Clean data of zero variance
nzv <- nearZeroVar(mitrim.df,saveMetrics=TRUE)
zero.var.ind <- sum(nzv$nzv)
if (zero.var.ind > 0) {
  mitrim.df <- mitrim.df[, nzv$nzv == FALSE]
}
#drop columns with NAs
mitrim.df <- mitrim.df[, colSums(is.na(mitrim.df)) == 0]
#Current Columns
colnames(mitrim.df)
## [1] "ID" "Title.x"
## [3] "Model_Status.x" "Model_Classification.x"
## [5] "Business_Unit.x" "Proposed_Tier"
## [7] "Prioritization" "Model_Owner"
## [9] "Model_Executive"
Convert all variables to factors.
#set as factors
mitrim.df <- lapply(mitrim.df, as.factor)
mitrim.df <- as.data.frame(mitrim.df)
Add features specifically related to models with VA (Variable Annuity) or FIA (Fixed Index Annuity) implications. Additional salient features may be added later. Some labeling isn’t obvious from the title alone, so flags were added manually for specific models.
#add VA and FIA Variables (This should help with accuracy)
mitrim.df <- mitrim.df %>% mutate(VA = grepl("VA|Variable Annuity", mitrim.df$Title.x),
                                  FIA = grepl("FIA|Fixed Index Annuity", mitrim.df$Title.x))
#### USE FUNCTION FROM BOTTOM####
mitrim.df <- mitrim.df %>% mutate_cond(ID %in% c("ERM_MOD_0184", "ERM_MOD_0565", "ERM_MOD_0587", "ERM_MOD_0589", "ERM_MOD_0642"), VA = TRUE)
mitrim.df <- mitrim.df %>% mutate_cond(ID %in% c("ERM_MOD_0498", "ERM_MOD_0587"), FIA = TRUE)
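The mutate_cond() calls above rely on a helper defined at the bottom of the full script (per the comment above). For reference, a minimal sketch of one commonly used dplyr-based definition is shown here; this is an assumption, and the exact version at the bottom of the script may differ.
#Hypothetical mutate_cond(): apply mutate() only to rows meeting a condition
mutate_cond <- function(.data, condition, ..., envir = parent.frame()) {
  #evaluate the condition within the data frame
  condition <- eval(substitute(condition), .data, envir)
  #mutate only the matching rows, leaving all other rows untouched
  .data[condition, ] <- .data[condition, ] %>% mutate(...)
  .data
}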
colnames(mitrim.df)
## [1] "ID" "Title.x"
## [3] "Model_Status.x" "Model_Classification.x"
## [5] "Business_Unit.x" "Proposed_Tier"
## [7] "Prioritization" "Model_Owner"
## [9] "Model_Executive" "VA"
## [11] "FIA"
Move the predictor (Proposed_Tier) to the end of the data frame.
#Moved predictor to end
col_idx <- grep("Proposed_Tier", names(mitrim.df))
mitrim.df <- mitrim.df[, c((1:ncol(mitrim.df))[-col_idx], col_idx)]
names(mitrim.df)
## [1] "ID" "Title.x"
## [3] "Model_Status.x" "Model_Classification.x"
## [5] "Business_Unit.x" "Prioritization"
## [7] "Model_Owner" "Model_Executive"
## [9] "VA" "FIA"
## [11] "Proposed_Tier"
We only have 400 rows of data, so the data was split into just a training set and a testing set. Cross-validation may be considered if multiple techniques are being explored (a sketch follows the partitioning code).
#Partition Data
set.seed(123)
in.training <- createDataPartition(mitrim.df$Proposed_Tier , p=0.75, list=F)
train.df <- mitrim.df[in.training, ]
testing.df <- mitrim.df[-in.training, ]
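If cross-validation is explored later, it can be layered on through caret's trainControl. A minimal sketch, assuming 5-fold cross-validation (the fold count and the cv.ctrl / rpart.cv.model names are illustrative assumptions, not part of this analysis):
#Hypothetical 5-fold cross-validation setup using caret's trainControl
cv.ctrl <- trainControl(method = "cv", number = 5)
set.seed(123)
rpart.cv.model <- train(Proposed_Tier ~ ., data = train.df,
                        method = "rpart", trControl = cv.ctrl)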
Run a decision tree using the caret package.
#Run Decision Tree
set.seed(123)
rpart.model <- train(Proposed_Tier ~ ., data = train.df, method = "rpart")
Confusion Matrix on Testing Set
#confusion Matrix for Classification Tree
rpart.predict <- predict(rpart.model, newdata = testing.df)
confusion <- confusionMatrix(testing.df$Proposed_Tier, rpart.predict)
confusion
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tier 1 Tier 2 Tier 3
## Tier 1 2 6 0
## Tier 2 3 32 7
## Tier 3 0 22 26
##
## Overall Statistics
##
## Accuracy : 0.6122
## 95% CI : (0.5085, 0.709)
## No Information Rate : 0.6122
## P-Value [Acc > NIR] : 0.5443
##
## Kappa : 0.3179
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Tier 1 Class: Tier 2 Class: Tier 3
## Sensitivity 0.40000 0.5333 0.7879
## Specificity 0.93548 0.7368 0.6615
## Pos Pred Value 0.25000 0.7619 0.5417
## Neg Pred Value 0.96667 0.5000 0.8600
## Prevalence 0.05102 0.6122 0.3367
## Detection Rate 0.02041 0.3265 0.2653
## Detection Prevalence 0.08163 0.4286 0.4898
## Balanced Accuracy 0.66774 0.6351 0.7247
Confusion Matrix on Entire Data Set
rpart.predict2 <- predict(rpart.model, newdata = mitrim.df)
rpart.confusion2 <- confusionMatrix(mitrim.df$Proposed_Tier, rpart.predict2)
rpart.confusion2
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tier 1 Tier 2 Tier 3
## Tier 1 13 22 0
## Tier 2 5 146 20
## Tier 3 1 48 145
##
## Overall Statistics
##
## Accuracy : 0.76
## 95% CI : (0.7151, 0.801)
## No Information Rate : 0.54
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5752
##
## Mcnemar's Test P-Value : 3.611e-05
##
## Statistics by Class:
##
## Class: Tier 1 Class: Tier 2 Class: Tier 3
## Sensitivity 0.6842 0.6759 0.8788
## Specificity 0.9423 0.8641 0.7915
## Pos Pred Value 0.3714 0.8538 0.7474
## Neg Pred Value 0.9836 0.6943 0.9029
## Prevalence 0.0475 0.5400 0.4125
## Detection Rate 0.0325 0.3650 0.3625
## Detection Prevalence 0.0875 0.4275 0.4850
## Balanced Accuracy 0.8132 0.7700 0.8351
The testing accuracy was only 0.6122449, and the positive predictive values varied by tier. The decision tree overfit the training data; including the training rows lifts the overall accuracy on the full data set to 0.76.
#Run Random Forest
set.seed(123)
#rf.model <- randomForest(Proposed_Tier ~ ., data = train.df, ntree = 251, localImp = TRUE)
rf.model <- train(Proposed_Tier ~ ., data = train.df, method = "rf", localImp = TRUE)
Confusion Matrix on Testing Set
#confusion Matrix for Random Forest
rf.predict <- predict(rf.model, newdata = testing.df)
rf.confusion <- confusionMatrix(testing.df$Proposed_Tier, rf.predict)
rf.confusion
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tier 1 Tier 2 Tier 3
## Tier 1 3 5 0
## Tier 2 3 32 7
## Tier 3 0 4 44
##
## Overall Statistics
##
## Accuracy : 0.8061
## 95% CI : (0.7139, 0.879)
## No Information Rate : 0.5204
## P-Value [Acc > NIR] : 3.905e-09
##
## Kappa : 0.6543
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Tier 1 Class: Tier 2 Class: Tier 3
## Sensitivity 0.50000 0.7805 0.8627
## Specificity 0.94565 0.8246 0.9149
## Pos Pred Value 0.37500 0.7619 0.9167
## Neg Pred Value 0.96667 0.8393 0.8600
## Prevalence 0.06122 0.4184 0.5204
## Detection Rate 0.03061 0.3265 0.4490
## Detection Prevalence 0.08163 0.4286 0.4898
## Balanced Accuracy 0.72283 0.8025 0.8888
Confusion Matrix on Entire Data Set
rf.predict2 <- predict(rf.model, newdata = mitrim.df)
rf.confusion2 <- confusionMatrix(mitrim.df$Proposed_Tier, rf.predict2)
rf.confusion2
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tier 1 Tier 2 Tier 3
## Tier 1 26 9 0
## Tier 2 3 159 9
## Tier 3 0 4 190
##
## Overall Statistics
##
## Accuracy : 0.9375
## 95% CI : (0.9091, 0.9591)
## No Information Rate : 0.4975
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8901
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Tier 1 Class: Tier 2 Class: Tier 3
## Sensitivity 0.8966 0.9244 0.9548
## Specificity 0.9757 0.9474 0.9801
## Pos Pred Value 0.7429 0.9298 0.9794
## Neg Pred Value 0.9918 0.9432 0.9563
## Prevalence 0.0725 0.4300 0.4975
## Detection Rate 0.0650 0.3975 0.4750
## Detection Prevalence 0.0875 0.4275 0.4850
## Balanced Accuracy 0.9361 0.9359 0.9674
#Write to excel file Predictions
#scoringpredict <- data.frame(mitrim.df, modeltiering.df$Criticality,rf.predict2)
#write.xlsx(scoringpredict, file="Scoring Prediction.xlsx")
The testing accuracy was 0.8061224, which was a great gain. Looking at the testing run more deeply, we see that the positive predictive value for Tier 3 models was 0.9166667. This means the model had great success in separating Tier 3 models from Tier 1/Tier 2 models.
With all the data, the model had an overall accuracy of 0.9375. The positive predictive value for Tier 3 models was even higher, at 0.9793814. In both the testing set and the overall set, the results for Tier 1 models could use lift; their positive predictive value on the full data set is 0.7428571. This can be achieved by adding more salient features that highlight our most important models. Just as features identifying VA and FIA were added, we could add features that identify BOP, BA, Auto, Home, and Structured Products.
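A minimal sketch of how such flags could be added, mirroring the VA/FIA approach above (the search patterns and column names are assumptions and would need to be validated against the actual model titles):
#Hypothetical additional product flags, built the same way as VA and FIA
mitrim.df <- mitrim.df %>%
  mutate(BOP = grepl("BOP|Business Owners", Title.x),
         Auto = grepl("Auto", Title.x),
         Home = grepl("Home", Title.x),
         Structured_Products = grepl("Structured Product", Title.x))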
Plot Random Forest
plot(rf.model)
plot(rf.model$finalModel)
Variable Importance
#VI_F <-importance(rf.model$finalMode)
varImp(rf.model)
## rf variable importance
##
## variables are sorted by maximum importance across the classes
## only 20 most important variables shown (out of 1018)
##
## Tier 1 Tier 2 Tier 3
## PrioritizationNo 59.54 99.33 100.00
## PrioritizationYes 36.88 93.99 94.22
## Model_Classification.xCapital 81.89 54.73 43.90
## Model_Classification.xStrategic Plan / Forecast 35.04 57.85 61.98
## Model_Status.xProduction 11.94 59.72 54.08
## Model_Classification.xPricing 42.80 59.14 40.45
## Model_Ownernungek1 38.36 43.70 59.06
## Model_Executivekraicm1 20.20 57.71 28.95
## Model_Ownerwunderp 31.08 52.52 56.65
## VATRUE 55.96 20.40 53.79
## Model_Executiveperrinj4 27.15 45.03 55.57
## Model_Executivefrenchr1 31.52 53.18 26.57
## Model_Ownerchakrar1 19.18 32.87 50.53
## Business_Unit.xPC 33.01 50.02 27.49
## Model_Executivescottn7 19.18 49.62 42.07
## Model_Executiveschleyd 23.91 43.55 48.33
## Model_Ownersterlia4 29.94 48.20 15.85
## Model_Executiveconovem2 47.48 38.73 33.24
## Business_Unit.xNF 28.78 45.32 25.02
## Model_Ownershenq4 19.18 34.71 44.87
varImpPlot(rf.model$finalModel, type = 2, main = "Random Forest Variable Mean Decrease Gini")
Create an Importance frame and list the important variables
#Multi-way importance plot
importance_frame <- measure_importance(rf.model$finalModel)
## Warning: Factor `split var` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `split var` contains implicit NA, consider using
## `forcats::fct_explicit_na`
#List important variables
important_variables(importance_frame)
## [1] "PrioritizationNo"
## [2] "PrioritizationYes"
## [3] "Model_Classification.xCapital"
## [4] "Model_Classification.xPricing"
## [5] "Model_Classification.xStrategic Plan / Forecast"
## [6] "Model_Status.xProduction"
## [7] "Business_Unit.xPC"
## [8] "Model_Ownerwunderp"
## [9] "Business_Unit.xNF"
## [10] "Model_Executiveperrinj4"
## [11] "Model_Executivekraicm1"
## [12] "Model_Ownernungek1"
## [13] "VATRUE"
## [14] "Model_Executivefrenchr1"
## [15] "Model_Executiveryanc2"
Multiple Importance Plots
plot_multi_way_importance(importance_frame)
#plot of pairs
plot_importance_ggpairs(importance_frame)
#Plot Rankings
plot_importance_rankings(importance_frame)
#plot 1
plot(rpart.model$finalModel, uniform = TRUE, main = "Classification Tree")
text(rpart.model$finalModel, use.n = TRUE, all = TRUE, cex = .8)
#plot 2
fancyRpartPlot(rpart.model$finalModel)