> # Name: UCDiseaseActivity_2ndNNModel.R > # Author: Iolanda Valentina Popa, iolivp@gmail.com > # Year: 2020 > # Software: R Studio Version 1.2.1335 © 2009-2019 RStudio, Inc. Build 1379 (f1ac3452) > # Language: R > # Program size: 6.31kB > > > library(mice); library(ggplot2); library(dplyr); library(caret); library(grid); library(pROC); library(mice); library(UBL) Loading required package: lattice Attaching package: ‘mice’ The following objects are masked from ‘package:base’: cbind, rbind Attaching package: ‘dplyr’ The following objects are masked from ‘package:stats’: filter, lag The following objects are masked from ‘package:base’: intersect, setdiff, setequal, union Type 'citation("pROC")' for a citation. Attaching package: ‘pROC’ The following objects are masked from ‘package:stats’: cov, smooth, var Loading required package: MBA Loading required package: gstat Registered S3 method overwritten by 'xts': method from as.zoo.xts zoo Loading required package: automap Loading required package: sp Loading required package: randomForest randomForest 4.6-14 Type rfNews() to see new features/changes/bug fixes. Attaching package: ‘randomForest’ The following object is masked from ‘package:dplyr’: combine The following object is masked from ‘package:ggplot2’: margin > > rcuh <- read.csv('pacientibii.csv') > rcuh <- subset(rcuh, !is.na(Mayo)) > > rcuh <- subset(rcuh, select = c("Mayo", "WBC", "PLT", "MONO", + "VSH1h", "Fibrinogen", + "Fier", "ProteineTotale", "alfa1GLOBULINE", + "CRP", "RBC", "HGB", "PDW")) > > sapply(rcuh, function(x) sum(is.na(x))) #Checking for missing values Mayo WBC PLT MONO VSH1h 0 0 0 1 55 Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP 90 26 61 78 33 RBC HGB PDW 0 0 7 > > > ## ---------------------------------------------------------------------------------------------- > ## Imputing missing values using MICE function > ## ---------------------------------------------------------------------------------------------- > > rcuh <- rcuh %>% + mutate( + Mayo = as.factor(Mayo), + WBC = as.numeric(WBC), + HGB = as.numeric(HGB), + PLT = as.numeric(PLT), + MONO = as.numeric(MONO), + VSH1h = as.numeric(VSH1h), + Fibrinogen = as.numeric(Fibrinogen), + Fier = as.numeric(Fier), + ProteineTotale = as.numeric(ProteineTotale), + CRP = as.numeric(CRP), + alfa1GLOBULINE = as.numeric(alfa1GLOBULINE), + RBC = as.numeric(RBC), + PDW = as.numeric(PDW), + ) > > init = mice(rcuh, maxit=0) > meth = init$method > predM = init$predictorMatrix > meth[c("RBC")]="norm" > meth[c("PDW")]="norm" > meth[c("WBC")]="norm" > meth[c("HGB")]="norm" > meth[c("PLT")]="norm" > meth[c("MONO")]="norm" > meth[c("VSH1h")]="norm" > meth[c("Fibrinogen")]="norm" > meth[c("Fier")]="norm" > meth[c("ProteineTotale")]="norm" > meth[c("CRP")]="norm" > meth[c("alfa1GLOBULINE")]="norm" > > set.seed(103) > imputed = mice(rcuh, method=meth, predictorMatrix=predM, m=5) iter imp variable 1 1 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 1 2 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 1 3 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 1 4 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 1 5 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 1 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 2 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 3 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 4 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 5 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 1 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 2 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 3 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 4 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 5 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 1 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 2 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 3 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 4 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 5 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 1 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 2 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 3 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 4 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 5 MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW > imputed <- complete(imputed) > > rcuh <- imputed > > sapply(rcuh, function(x) sum(is.na(x))) #checking for missing values after imputation (no missing values detected) Mayo WBC PLT MONO VSH1h 0 0 0 0 0 Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP 0 0 0 0 0 RBC HGB PDW 0 0 0 > > rcuh$Mayo <- as.numeric(rcuh$Mayo) - 1 > > > ## ---------------------------------------------------------------------------------------------- > ## Normalize numeric variables using the min-max method > ## ---------------------------------------------------------------------------------------------- > > temp <- rcuh$Mayo > > normalize <- function(x, ...) { + return ((x - min(x, ...)) / (max(x, ...) - min(x, ...))) + } > maxmindf <- as.data.frame(lapply(rcuh, normalize, na.rm = T)) > rcuh <- maxmindf > rcuh$Mayo <- temp > > ## Transform Mayo score into a binary score (Mayo 0 and 1 become 0 and Mayo 2 and 3 become 1) > > rcuh$Mayo <- replace(rcuh$Mayo, rcuh$Mayo==1, 0) > rcuh$Mayo <- replace(rcuh$Mayo, rcuh$Mayo==2, 1) > rcuh$Mayo <- replace(rcuh$Mayo, rcuh$Mayo==3, 1) > > > ## ---------------------------------------------------------------------------------------------- > ## Randomly divide initial data into the training set (80%) and the test set (20%) > ## ---------------------------------------------------------------------------------------------- > > validSetSize <- 30 > initDataSize <- nrow(rcuh) - validSetSize > > rcuh_init <- rcuh[1 : initDataSize, ] > set.seed(2) > trainset_idx <- sample(1:nrow(rcuh_init), size = round(initDataSize * 0.8)) > trainset <- rcuh_init[trainset_idx,] > testset <- rcuh_init[-trainset_idx,] > > ## Validation set constituted by the last 30 records added independently > > validset <- rcuh[(initDataSize + 1) : nrow(rcuh), ] > > > ## ---------------------------------------------------------------------------------------------- > ## Train the Neural Network model with caret::train function > ## ---------------------------------------------------------------------------------------------- > > trainset$Mayo <- factor(trainset$Mayo) > testset$Mayo <- factor(testset$Mayo) > validset$Mayo <- factor(validset$Mayo) > > set.seed(405) > model_nn <- caret::train(Mayo ~ ., + data = trainset, + method = "mlpML", + preProcess = c("scale", "center"), + trControl = trainControl(method = "repeatedcv", + number = 10, + repeats = 10, + verboseIter = FALSE, + sampling = "smote")) Registered S3 method overwritten by 'quantmod': method from as.zoo.data.frame zoo > > > ## ---------------------------------------------------------------------------------------------- > ## Test the Neural Network model > ## ---------------------------------------------------------------------------------------------- > > ## Model's performance metrics on the train set > > final <- data.frame(actual = trainset$Mayo, + predict(model_nn, newdata = trainset, type = "prob")) > final$predict <- factor(ifelse(final$X0 > 0.5, 0, 1)) > caret::confusionMatrix(final$predict, trainset$Mayo) Confusion Matrix and Statistics Reference Prediction 0 1 0 82 27 1 10 166 Accuracy : 0.8702 95% CI : (0.8255, 0.9069) No Information Rate : 0.6772 P-Value [Acc > NIR] : 4.33e-14 Kappa : 0.7168 Mcnemar's Test P-Value : 0.008529 Sensitivity : 0.8913 Specificity : 0.8601 Pos Pred Value : 0.7523 Neg Pred Value : 0.9432 Prevalence : 0.3228 Detection Rate : 0.2877 Detection Prevalence : 0.3825 Balanced Accuracy : 0.8757 'Positive' Class : 0 > resultRoc1 <- roc(trainset$Mayo, as.numeric(final$X1)) Setting levels: control = 0, case = 1 Setting direction: controls < cases > > ## Model's performance metrics on the test set > > final <- data.frame(actual = testset$Mayo, + predict(model_nn, newdata = testset, type = "prob")) > final$predict <- factor(ifelse(final$X0 > 0.5, 0, 1)) > > caret::confusionMatrix(final$predict, testset$Mayo) Confusion Matrix and Statistics Reference Prediction 0 1 0 22 5 1 3 41 Accuracy : 0.8873 95% CI : (0.79, 0.9501) No Information Rate : 0.6479 P-Value [Acc > NIR] : 4.3e-06 Kappa : 0.7575 Mcnemar's Test P-Value : 0.7237 Sensitivity : 0.8800 Specificity : 0.8913 Pos Pred Value : 0.8148 Neg Pred Value : 0.9318 Prevalence : 0.3521 Detection Rate : 0.3099 Detection Prevalence : 0.3803 Balanced Accuracy : 0.8857 'Positive' Class : 0 > resultRoc2 <- roc(testset$Mayo, as.numeric(final$X1)) Setting levels: control = 0, case = 1 Setting direction: controls < cases > > ## Model's performance metrics on the validation set > > final <- data.frame(actual = validset$Mayo, + predict(model_nn, newdata = validset, type = "prob")) > final$predict <- factor(ifelse(final$X0 > 0.5, 0, 1)) > caret::confusionMatrix(final$predict, validset$Mayo) Confusion Matrix and Statistics Reference Prediction 0 1 0 11 2 1 3 14 Accuracy : 0.8333 95% CI : (0.6528, 0.9436) No Information Rate : 0.5333 P-Value [Acc > NIR] : 0.0005955 Kappa : 0.6637 Mcnemar's Test P-Value : 1.0000000 Sensitivity : 0.7857 Specificity : 0.8750 Pos Pred Value : 0.8462 Neg Pred Value : 0.8235 Prevalence : 0.4667 Detection Rate : 0.3667 Detection Prevalence : 0.4333 Balanced Accuracy : 0.8304 'Positive' Class : 0 > resultRoc3 <- roc(validset$Mayo, as.numeric(final$X1)) Setting levels: control = 0, case = 1 Setting direction: controls < cases > > > ## Plot ROC curves obtained on the train, test and validation sets using ggroc function from pROC package > > ggroc(list("Train set" = resultRoc1, "Test set" = resultRoc2, + "Validation set" = resultRoc3), + legacy.axes = TRUE, linetype=1) + + theme_minimal() + + geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), + color="darkgrey", linetype=2) + + theme (axis.title.x = element_text(size = 10), + axis.text=element_text(size=7), + axis.title.y = element_text(size = 10), + panel.grid.major = element_line(colour = "#edecec", linetype = 3), + panel.grid.minor = element_blank()) > > ggsave("roc_bin_biol.png", limitsize = FALSE) Saving 6.01 x 3.77 in image