Library Install and Data Load

library(dplyr)
library(tidyr)
library(caret)
library(randomForest)
library(randomForestExplainer)
library(rattle)
library(rpart)
library(ggplot2)
library(plotly)
library(xlsx)
library(readxl)

Data Load and Manipulation

Bring in both the Model Tiering Sheet and the Model Inventory Sheet from the Application Tiering Workbook. Fields from both sheets (tabs) will be used.

dir <- getwd()
filename <- "Enterprise Business Metrics V4.xlsx"

#Bring in tiering Sheet
modeltiering.df <- read_excel(paste0(dir,"/",filename), 
                                             sheet = "Model Tiering", skip = 2)
## New names:
## * `Tier Compare` -> `Tier Compare...9`
## * `Tier Compare` -> `Tier Compare...12`
#Bring in inventory Sheet
modelinventory.df <- read_excel(paste0(dir,"/",filename), 
                              sheet = "Models", skip = 1, n_max = 621)

Join the sheets together with an inner join.

#Join sheets together
jointiering.df <- inner_join(x = modeltiering.df, y = modelinventory.df, by = "ID")

Pull in only the selected variables and update the column names to be continuous text (no spaces).

The fields used do not include any information related to application dependencies. We would like to see whether a model can predict a model’s tier by looking only at simple information from the model inventory and the “Prioritization Process”. Prioritization will likely have a high impact on understanding the importance of a model.

#First test will not use the application identification, just the current inventory fields
mitrim.df <- jointiering.df[ , c(1:5,11,13,55:65)]

#take spaces out of column names
colnames(mitrim.df) <- gsub(" ", "_", x = colnames(mitrim.df))

colnames(mitrim.df)
##  [1] "ID"                            "Title.x"                      
##  [3] "Model_Status.x"                "Model_Classification.x"       
##  [5] "Business_Unit.x"               "Proposed_Tier"                
##  [7] "Prioritization"                "Model_Owner"                  
##  [9] "Model_Executive"               "Primary_Governance_Body_Owner"
## [11] "Primary_Governance_Body"       "Author"                       
## [13] "MOAG_Member"                   "Interested_Party_1"           
## [15] "Interested_Party_2"            "Interested_Party_3"           
## [17] "Products"                      "Production_Cycle"

If Prioritization was blank, I included “Maybe” as an identifier.

#change NAs in prioritization to Maybe
mitrim.df$Prioritization[is.na(mitrim.df$Prioritization)] <- "Maybe"

Clean the data of zero-variance columns and columns with NAs.

#Clean data of zero variance
nzv <- nearZeroVar(mitrim.df,saveMetrics=TRUE)
zero.var.ind <- sum(nzv$nzv)

if (zero.var.ind > 0) {
  mitrim.df <- mitrim.df[, nzv$nzv == FALSE]
}

#drop columns with NAs
mitrim.df <- mitrim.df[, colSums(is.na(mitrim.df)) == 0] 

#Current Columns
colnames(mitrim.df)
## [1] "ID"                     "Title.x"               
## [3] "Model_Status.x"         "Model_Classification.x"
## [5] "Business_Unit.x"        "Proposed_Tier"         
## [7] "Prioritization"         "Model_Owner"           
## [9] "Model_Executive"

Make all variables a factor

#set as factors
mitrim.df <- lapply(mitrim.df, as.factor)
mitrim.df <- as.data.frame(mitrim.df)

Add features specifically related to models with VA (Variable Annuity) or FIA (Fixed Index Annuity) implications. Additional salient features may be added later. Where the title does not make the labeling obvious, the flags were set manually for specific models by ID.

#add VA and FIA Variables (This should help with accuracy)
mitrim.df <- mitrim.df %>% mutate(VA = grepl("VA|Variable Annuity", Title.x)
                                  , FIA = grepl("FIA|Fixed Index Annuity", Title.x))

#### USE FUNCTION FROM BOTTOM####
mitrim.df <- mitrim.df %>% mutate_cond(ID %in% c("ERM_MOD_0184", "ERM_MOD_0565", "ERM_MOD_0587", "ERM_MOD_0589", "ERM_MOD_0642"), VA = TRUE)
mitrim.df <- mitrim.df %>% mutate_cond(ID %in% c("ERM_MOD_0498", "ERM_MOD_0587"), FIA = TRUE)
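
mutate_cond() is not a dplyr function; it is the helper referenced by the “USE FUNCTION FROM BOTTOM” comment above. A minimal sketch of the commonly circulated definition, assuming the helper at the bottom of the script follows the same pattern:

#Assumed definition of mutate_cond(): mutate only the rows matching a condition
mutate_cond <- function(.data, condition, ..., envir = parent.frame()) {
  condition <- eval(substitute(condition), .data, envir)
  .data[condition, ] <- .data[condition, ] %>% mutate(...)
  .data
}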

colnames(mitrim.df)
##  [1] "ID"                     "Title.x"               
##  [3] "Model_Status.x"         "Model_Classification.x"
##  [5] "Business_Unit.x"        "Proposed_Tier"         
##  [7] "Prioritization"         "Model_Owner"           
##  [9] "Model_Executive"        "VA"                    
## [11] "FIA"

Move the predictor to the end of the data frame.

#Moved predictor to end
col_idx <- grep("Proposed_Tier", names(mitrim.df))
mitrim.df <- mitrim.df[, c((1:ncol(mitrim.df))[-col_idx], col_idx)]
names(mitrim.df)
##  [1] "ID"                     "Title.x"               
##  [3] "Model_Status.x"         "Model_Classification.x"
##  [5] "Business_Unit.x"        "Prioritization"        
##  [7] "Model_Owner"            "Model_Executive"       
##  [9] "VA"                     "FIA"                   
## [11] "Proposed_Tier"

Partition Data

We only have 400 rows of data, so the data was split into just a training set and a testing set. Cross-validation may be considered if multiple techniques are explored (see the sketch after the partition code below).

#Partition Data
set.seed(123)
in.training <- createDataPartition(mitrim.df$Proposed_Tier, p = 0.75, list = FALSE)
train.df <- mitrim.df[in.training, ]
testing.df <- mitrim.df[-in.training, ]
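
If cross-validation is adopted in a later iteration, caret can handle the resampling directly through a trainControl() object. A minimal sketch, assuming 5 folds purely for illustration (not a setting used in this analysis):

#Illustrative only: 5-fold cross-validation control for caret::train()
cv.control <- trainControl(method = "cv", number = 5)
#Example usage (not run): pass trControl to train()
#rpart.cv.model <- train(Proposed_Tier ~ ., data = train.df, method = "rpart", trControl = cv.control)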

Modeling

Decision Tree

Run a decision tree using the caret package.

#Run Decision Tree
set.seed(123)
rpart.model <- train(Proposed_Tier ~ ., data = train.df, method = "rpart")

Confusion Matrix on Testing Set

#confusion Matrix for Classification Tree
rpart.predict <- predict(rpart.model, newdata = testing.df)
confusion <- confusionMatrix(testing.df$Proposed_Tier, rpart.predict)
confusion
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Tier 1 Tier 2 Tier 3
##     Tier 1      2      6      0
##     Tier 2      3     32      7
##     Tier 3      0     22     26
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6122         
##                  95% CI : (0.5085, 0.709)
##     No Information Rate : 0.6122         
##     P-Value [Acc > NIR] : 0.5443         
##                                          
##                   Kappa : 0.3179         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Tier 1 Class: Tier 2 Class: Tier 3
## Sensitivity                0.40000        0.5333        0.7879
## Specificity                0.93548        0.7368        0.6615
## Pos Pred Value             0.25000        0.7619        0.5417
## Neg Pred Value             0.96667        0.5000        0.8600
## Prevalence                 0.05102        0.6122        0.3367
## Detection Rate             0.02041        0.3265        0.2653
## Detection Prevalence       0.08163        0.4286        0.4898
## Balanced Accuracy          0.66774        0.6351        0.7247

Confusion Matrix on Entire Data Set

rpart.predict2 <- predict(rpart.model, newdata = mitrim.df)
rpart.confusion2 <- confusionMatrix(mitrim.df$Proposed_Tier, rpart.predict2)
rpart.confusion2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Tier 1 Tier 2 Tier 3
##     Tier 1     13     22      0
##     Tier 2      5    146     20
##     Tier 3      1     48    145
## 
## Overall Statistics
##                                          
##                Accuracy : 0.76           
##                  95% CI : (0.7151, 0.801)
##     No Information Rate : 0.54           
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5752         
##                                          
##  Mcnemar's Test P-Value : 3.611e-05      
## 
## Statistics by Class:
## 
##                      Class: Tier 1 Class: Tier 2 Class: Tier 3
## Sensitivity                 0.6842        0.6759        0.8788
## Specificity                 0.9423        0.8641        0.7915
## Pos Pred Value              0.3714        0.8538        0.7474
## Neg Pred Value              0.9836        0.6943        0.9029
## Prevalence                  0.0475        0.5400        0.4125
## Detection Rate              0.0325        0.3650        0.3625
## Detection Prevalence        0.0875        0.4275        0.4850
## Balanced Accuracy           0.8132        0.7700        0.8351

Decision Tree Results

The testing accuracy was only 0.6122449, and the positive predictive values varied by tier. The decision tree overfit the training data, which is why the overall accuracy of 0.76 on the full data set (which includes the training rows) is noticeably higher than the testing accuracy.
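
One option for reining in the overfitting would be to tune the rpart complexity parameter (cp) over a wider grid than caret's default. A minimal sketch, where the cp grid is an illustrative assumption rather than a setting used in this analysis:

#Illustrative only: wider cp grid for the decision tree
cp.grid <- expand.grid(cp = seq(0.001, 0.051, by = 0.005))
#Example usage (not run):
#rpart.tuned <- train(Proposed_Tier ~ ., data = train.df, method = "rpart", tuneGrid = cp.grid)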

Random Forest

#Run Random Forest
set.seed(123)
#rf.model <- randomForest(Proposed_Tier ~ ., data = train.df, ntree = 251, localImp = TRUE)
rf.model <- train(Proposed_Tier ~ ., data = train.df, method = "rf", localImp = TRUE)

Confusion Matrix on Testing Set

#confusion Matrix for Random Forest
rf.predict <- predict(rf.model, newdata = testing.df)
rf.confusion <- confusionMatrix(testing.df$Proposed_Tier, rf.predict)
rf.confusion
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Tier 1 Tier 2 Tier 3
##     Tier 1      3      5      0
##     Tier 2      3     32      7
##     Tier 3      0      4     44
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8061         
##                  95% CI : (0.7139, 0.879)
##     No Information Rate : 0.5204         
##     P-Value [Acc > NIR] : 3.905e-09      
##                                          
##                   Kappa : 0.6543         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Tier 1 Class: Tier 2 Class: Tier 3
## Sensitivity                0.50000        0.7805        0.8627
## Specificity                0.94565        0.8246        0.9149
## Pos Pred Value             0.37500        0.7619        0.9167
## Neg Pred Value             0.96667        0.8393        0.8600
## Prevalence                 0.06122        0.4184        0.5204
## Detection Rate             0.03061        0.3265        0.4490
## Detection Prevalence       0.08163        0.4286        0.4898
## Balanced Accuracy          0.72283        0.8025        0.8888

Confusion Matrix on Entire Data Set

rf.predict2 <- predict(rf.model, newdata = mitrim.df)
rf.confusion2 <- confusionMatrix(mitrim.df$Proposed_Tier, rf.predict2)
rf.confusion2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Tier 1 Tier 2 Tier 3
##     Tier 1     26      9      0
##     Tier 2      3    159      9
##     Tier 3      0      4    190
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9375          
##                  95% CI : (0.9091, 0.9591)
##     No Information Rate : 0.4975          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8901          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Tier 1 Class: Tier 2 Class: Tier 3
## Sensitivity                 0.8966        0.9244        0.9548
## Specificity                 0.9757        0.9474        0.9801
## Pos Pred Value              0.7429        0.9298        0.9794
## Neg Pred Value              0.9918        0.9432        0.9563
## Prevalence                  0.0725        0.4300        0.4975
## Detection Rate              0.0650        0.3975        0.4750
## Detection Prevalence        0.0875        0.4275        0.4850
## Balanced Accuracy           0.9361        0.9359        0.9674
#Write to excel file Predictions
#scoringpredict <- data.frame(mitrim.df, modeltiering.df$Criticality,rf.predict2)
#write.xlsx(scoringpredict, file="Scoring Prediction.xlsx")

Random Forest Results

The testing accuracy was 0.8061224, a substantial gain over the decision tree. Looking at the testing run more closely, the positive predictive value for Tier 3 models was 0.9166667, meaning the model had great success separating Tier 3 models from Tier 1/Tier 2 models.

With all the data, the model had an overall accuracy of 0.9375, and the positive predictive value for Tier 3 models was even higher at 0.9793814. In both the testing set and the overall set, however, the accuracy for Tier 1 models could use some lift; the positive predictive value on the full data set is 0.7428571. This could be achieved by adding more salient features that highlight our most important models: just as features identifying VA and FIA were added, we could add features that identify BOP, BA, Auto, Home, and Structured Products, as sketched below.
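
Those additional flags could be built the same way the VA and FIA indicators were, by matching keywords in the model titles. A minimal sketch, where the exact keyword patterns are assumptions that would need to be checked against the actual titles (not run in this analysis):

#Illustrative only: additional product flags from the model titles (patterns are assumptions)
#mitrim.df <- mitrim.df %>% mutate(BOP = grepl("BOP", Title.x),
#                                  Auto = grepl("Auto", Title.x),
#                                  Home = grepl("Home", Title.x))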

Appendix

Random Forest Plots

Plot Random Forest

plot(rf.model)

plot(rf.model$finalModel)

Variable Importance

#VI_F <- importance(rf.model$finalModel)
varImp(rf.model)
## rf variable importance
## 
##   variables are sorted by maximum importance across the classes
##   only 20 most important variables shown (out of 1018)
## 
##                                                 Tier 1 Tier 2 Tier 3
## PrioritizationNo                                 59.54  99.33 100.00
## PrioritizationYes                                36.88  93.99  94.22
## Model_Classification.xCapital                    81.89  54.73  43.90
## Model_Classification.xStrategic Plan / Forecast  35.04  57.85  61.98
## Model_Status.xProduction                         11.94  59.72  54.08
## Model_Classification.xPricing                    42.80  59.14  40.45
## Model_Ownernungek1                               38.36  43.70  59.06
## Model_Executivekraicm1                           20.20  57.71  28.95
## Model_Ownerwunderp                               31.08  52.52  56.65
## VATRUE                                           55.96  20.40  53.79
## Model_Executiveperrinj4                          27.15  45.03  55.57
## Model_Executivefrenchr1                          31.52  53.18  26.57
## Model_Ownerchakrar1                              19.18  32.87  50.53
## Business_Unit.xPC                                33.01  50.02  27.49
## Model_Executivescottn7                           19.18  49.62  42.07
## Model_Executiveschleyd                           23.91  43.55  48.33
## Model_Ownersterlia4                              29.94  48.20  15.85
## Model_Executiveconovem2                          47.48  38.73  33.24
## Business_Unit.xNF                                28.78  45.32  25.02
## Model_Ownershenq4                                19.18  34.71  44.87
varImpPlot(rf.model$finalModel, type = 2, main = "Random Forest Variable Mean Decrease Gini")

Create an Importance frame and list the important variables

#Multi-way importance plot
importance_frame <- measure_importance(rf.model$finalModel)
## Warning: Factor `split var` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `split var` contains implicit NA, consider using
## `forcats::fct_explicit_na`
#List important variables
important_variables(importance_frame)
##  [1] "PrioritizationNo"                               
##  [2] "PrioritizationYes"                              
##  [3] "Model_Classification.xCapital"                  
##  [4] "Model_Classification.xPricing"                  
##  [5] "Model_Classification.xStrategic Plan / Forecast"
##  [6] "Model_Status.xProduction"                       
##  [7] "Business_Unit.xPC"                              
##  [8] "Model_Ownerwunderp"                             
##  [9] "Business_Unit.xNF"                              
## [10] "Model_Executiveperrinj4"                        
## [11] "Model_Executivekraicm1"                         
## [12] "Model_Ownernungek1"                             
## [13] "VATRUE"                                         
## [14] "Model_Executivefrenchr1"                        
## [15] "Model_Executiveryanc2"

Multiple Importance Plots

plot_multi_way_importance(importance_frame)

#plot of pairs
plot_importance_ggpairs(importance_frame)

#Plot Rankings
plot_importance_rankings(importance_frame)

Decision Tree Plots

#plot 1
plot(rpart.model$finalModel, uniform = TRUE, main = "Classification Tree")
text(rpart.model$finalModel, use.n = TRUE, all = TRUE, cex = .8)

#plot 2
fancyRpartPlot(rpart.model$finalModel)