> # Name: UCDiseaseActivity_2ndNNModel.R
> # Author: Iolanda Valentina Popa, iolivp@gmail.com
> # Year: 2020
> # Software: R Studio Version 1.2.1335 © 2009-2019 RStudio, Inc. Build 1379 (f1ac3452)
> # Language: R
> # Program size: 6.31kB
> 
> 
> library(mice); library(ggplot2); library(dplyr); library(caret); library(grid); library(pROC); library(mice); library(UBL)
Loading required package: lattice

Attaching package: ‘mice’

The following objects are masked from ‘package:base’:

    cbind, rbind


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var

Loading required package: MBA
Loading required package: gstat
Registered S3 method overwritten by 'xts':
  method     from
  as.zoo.xts zoo 
Loading required package: automap
Loading required package: sp
Loading required package: randomForest
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.

Attaching package: ‘randomForest’

The following object is masked from ‘package:dplyr’:

    combine

The following object is masked from ‘package:ggplot2’:

    margin

> 
> rcuh <- read.csv('pacientibii.csv')
> rcuh <- subset(rcuh, !is.na(Mayo))
> 
> rcuh <- subset(rcuh, select = c("Mayo", "WBC", "PLT", "MONO",
+                                 "VSH1h", "Fibrinogen", 
+                                 "Fier", "ProteineTotale", "alfa1GLOBULINE",
+                                 "CRP", "RBC", "HGB", "PDW"))
> 
> sapply(rcuh, function(x) sum(is.na(x)))  #Checking for missing values
          Mayo            WBC            PLT           MONO          VSH1h 
             0              0              0              1             55 
    Fibrinogen           Fier ProteineTotale alfa1GLOBULINE            CRP 
            90             26             61             78             33 
           RBC            HGB            PDW 
             0              0              7 
> 
> 
> ## ----------------------------------------------------------------------------------------------
> ## Imputing missing values using MICE function 
> ## ----------------------------------------------------------------------------------------------
> 
> rcuh <- rcuh %>%
+   mutate(
+     Mayo = as.factor(Mayo),
+     WBC = as.numeric(WBC),
+     HGB = as.numeric(HGB),
+     PLT = as.numeric(PLT),
+     MONO = as.numeric(MONO),
+     VSH1h = as.numeric(VSH1h),
+     Fibrinogen = as.numeric(Fibrinogen),
+     Fier = as.numeric(Fier),
+     ProteineTotale = as.numeric(ProteineTotale),
+     CRP = as.numeric(CRP),
+     alfa1GLOBULINE = as.numeric(alfa1GLOBULINE),
+     RBC = as.numeric(RBC),
+     PDW = as.numeric(PDW),
+   )
> 
> init = mice(rcuh, maxit=0) 
> meth = init$method
> predM = init$predictorMatrix
> meth[c("RBC")]="norm" 
> meth[c("PDW")]="norm" 
> meth[c("WBC")]="norm" 
> meth[c("HGB")]="norm" 
> meth[c("PLT")]="norm" 
> meth[c("MONO")]="norm"
> meth[c("VSH1h")]="norm" 
> meth[c("Fibrinogen")]="norm" 
> meth[c("Fier")]="norm" 
> meth[c("ProteineTotale")]="norm" 
> meth[c("CRP")]="norm" 
> meth[c("alfa1GLOBULINE")]="norm" 
> 
> set.seed(103)
> imputed = mice(rcuh, method=meth, predictorMatrix=predM, m=5)

 iter imp variable
  1   1  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  1   2  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  1   3  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  1   4  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  1   5  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  2   1  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  2   2  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  2   3  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  2   4  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  2   5  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  3   1  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  3   2  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  3   3  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  3   4  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  3   5  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  4   1  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  4   2  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  4   3  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  4   4  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  4   5  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  5   1  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  5   2  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  5   3  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  5   4  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
  5   5  MONO  VSH1h  Fibrinogen  Fier  ProteineTotale  alfa1GLOBULINE  CRP  PDW
> imputed <- complete(imputed)
> 
> rcuh <- imputed
> 
> sapply(rcuh, function(x) sum(is.na(x))) #checking for missing values after imputation (no missing values detected)
          Mayo            WBC            PLT           MONO          VSH1h 
             0              0              0              0              0 
    Fibrinogen           Fier ProteineTotale alfa1GLOBULINE            CRP 
             0              0              0              0              0 
           RBC            HGB            PDW 
             0              0              0 
> 
> rcuh$Mayo <- as.numeric(rcuh$Mayo) - 1
> 
> 
> ## ----------------------------------------------------------------------------------------------
> ## Normalize numeric variables using the min-max method
> ## ----------------------------------------------------------------------------------------------
> 
> temp <- rcuh$Mayo
> 
> normalize <- function(x, ...) {
+   return ((x - min(x, ...)) / (max(x, ...) - min(x, ...)))
+ }
> maxmindf <- as.data.frame(lapply(rcuh, normalize, na.rm = T))
> rcuh <- maxmindf
> rcuh$Mayo <- temp
> 
> ## Transform Mayo score into a binary score (Mayo 0 and 1 become 0 and Mayo 2 and 3 become 1) 
> 
> rcuh$Mayo <- replace(rcuh$Mayo, rcuh$Mayo==1, 0)
> rcuh$Mayo <- replace(rcuh$Mayo, rcuh$Mayo==2, 1)
> rcuh$Mayo <- replace(rcuh$Mayo, rcuh$Mayo==3, 1)
> 
> 
> ## ----------------------------------------------------------------------------------------------
> ## Randomly divide initial data into the training set (80%) and the test set (20%)
> ## ----------------------------------------------------------------------------------------------
> 
> validSetSize <- 30
> initDataSize <- nrow(rcuh) - validSetSize
> 
> rcuh_init <- rcuh[1 : initDataSize, ]
> set.seed(2)
> trainset_idx <- sample(1:nrow(rcuh_init), size = round(initDataSize * 0.8))
> trainset <- rcuh_init[trainset_idx,]
> testset <- rcuh_init[-trainset_idx,]
> 
> ## Validation set constituted by the last 30 records added independently
> 
> validset <- rcuh[(initDataSize + 1) : nrow(rcuh), ]
> 
> 
> ## ----------------------------------------------------------------------------------------------
> ## Train the Neural Network model with caret::train function 
> ## ----------------------------------------------------------------------------------------------
> 
> trainset$Mayo <- factor(trainset$Mayo)
> testset$Mayo <- factor(testset$Mayo)
> validset$Mayo <- factor(validset$Mayo)
> 
> set.seed(405)
> model_nn <- caret::train(Mayo ~ .,
+                          data = trainset,
+                          method = "mlpML",
+                          preProcess = c("scale", "center"),
+                          trControl = trainControl(method = "repeatedcv", 
+                                                   number = 10, 
+                                                   repeats = 10, 
+                                                   verboseIter = FALSE,
+                                                   sampling = "smote"))
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
> 
> 
> ## ----------------------------------------------------------------------------------------------
> ## Test the Neural Network model
> ## ----------------------------------------------------------------------------------------------
> 
> ## Model's performance metrics on the train set
> 
> final <- data.frame(actual = trainset$Mayo,
+                     predict(model_nn, newdata = trainset, type = "prob"))
> final$predict <- factor(ifelse(final$X0 > 0.5, 0, 1))
> caret::confusionMatrix(final$predict, trainset$Mayo)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0  82  27
         1  10 166
                                          
               Accuracy : 0.8702          
                 95% CI : (0.8255, 0.9069)
    No Information Rate : 0.6772          
    P-Value [Acc > NIR] : 4.33e-14        
                                          
                  Kappa : 0.7168          
                                          
 Mcnemar's Test P-Value : 0.008529        
                                          
            Sensitivity : 0.8913          
            Specificity : 0.8601          
         Pos Pred Value : 0.7523          
         Neg Pred Value : 0.9432          
             Prevalence : 0.3228          
         Detection Rate : 0.2877          
   Detection Prevalence : 0.3825          
      Balanced Accuracy : 0.8757          
                                          
       'Positive' Class : 0               
                                          
> resultRoc1 <- roc(trainset$Mayo, as.numeric(final$X1))
Setting levels: control = 0, case = 1
Setting direction: controls < cases
> 
> ## Model's performance metrics on the test set
> 
> final <- data.frame(actual = testset$Mayo,
+                     predict(model_nn, newdata = testset, type = "prob"))
> final$predict <- factor(ifelse(final$X0 > 0.5, 0, 1))
> 
> caret::confusionMatrix(final$predict, testset$Mayo)
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 22  5
         1  3 41
                                        
               Accuracy : 0.8873        
                 95% CI : (0.79, 0.9501)
    No Information Rate : 0.6479        
    P-Value [Acc > NIR] : 4.3e-06       
                                        
                  Kappa : 0.7575        
                                        
 Mcnemar's Test P-Value : 0.7237        
                                        
            Sensitivity : 0.8800        
            Specificity : 0.8913        
         Pos Pred Value : 0.8148        
         Neg Pred Value : 0.9318        
             Prevalence : 0.3521        
         Detection Rate : 0.3099        
   Detection Prevalence : 0.3803        
      Balanced Accuracy : 0.8857        
                                        
       'Positive' Class : 0             
                                        
> resultRoc2 <- roc(testset$Mayo, as.numeric(final$X1))
Setting levels: control = 0, case = 1
Setting direction: controls < cases
> 
> ## Model's performance metrics on the validation set
> 
> final <- data.frame(actual = validset$Mayo,
+                     predict(model_nn, newdata = validset, type = "prob"))
> final$predict <- factor(ifelse(final$X0 > 0.5, 0, 1))
> caret::confusionMatrix(final$predict, validset$Mayo)
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 11  2
         1  3 14
                                          
               Accuracy : 0.8333          
                 95% CI : (0.6528, 0.9436)
    No Information Rate : 0.5333          
    P-Value [Acc > NIR] : 0.0005955       
                                          
                  Kappa : 0.6637          
                                          
 Mcnemar's Test P-Value : 1.0000000       
                                          
            Sensitivity : 0.7857          
            Specificity : 0.8750          
         Pos Pred Value : 0.8462          
         Neg Pred Value : 0.8235          
             Prevalence : 0.4667          
         Detection Rate : 0.3667          
   Detection Prevalence : 0.4333          
      Balanced Accuracy : 0.8304          
                                          
       'Positive' Class : 0               
                                          
> resultRoc3 <- roc(validset$Mayo, as.numeric(final$X1))
Setting levels: control = 0, case = 1
Setting direction: controls < cases
> 
> 
> ## Plot ROC curves obtained on the train, test and validation sets using ggroc function from pROC package
> 
> ggroc(list("Train set" = resultRoc1, "Test set" = resultRoc2, 
+            "Validation set" = resultRoc3),
+       legacy.axes = TRUE, linetype=1) +
+   theme_minimal() +
+   geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), 
+                color="darkgrey", linetype=2) + 
+   theme (axis.title.x = element_text(size = 10),
+          axis.text=element_text(size=7),
+          axis.title.y = element_text(size = 10),
+          panel.grid.major = element_line(colour = "#edecec", linetype = 3),
+          panel.grid.minor = element_blank())
> 
> ggsave("roc_bin_biol.png", limitsize = FALSE)
Saving 6.01 x 3.77 in image