> # Name: UCDiseaseActivity_1stNNModel.R > # Author: Iolanda Valentina Popa, iolivp@gmail.com > # Year: 2020 > # Software: R Studio Version 1.2.1335 © 2009-2019 RStudio, Inc. Build 1379 (f1ac3452) > # Language: R > # Program size: 7.4kB > > > library(mice); library(ggplot2); library(dplyr); library(caret); library(grid); library(pROC); library(mice); library(UBL) Loading required package: lattice Attaching package: ‘mice’ The following objects are masked from ‘package:base’: cbind, rbind Attaching package: ‘dplyr’ The following objects are masked from ‘package:stats’: filter, lag The following objects are masked from ‘package:base’: intersect, setdiff, setequal, union Type 'citation("pROC")' for a citation. Attaching package: ‘pROC’ The following objects are masked from ‘package:stats’: cov, smooth, var Loading required package: MBA Loading required package: gstat Registered S3 method overwritten by 'xts': method from as.zoo.xts zoo Loading required package: automap Loading required package: sp Loading required package: randomForest randomForest 4.6-14 Type rfNews() to see new features/changes/bug fixes. Attaching package: ‘randomForest’ The following object is masked from ‘package:dplyr’: combine The following object is masked from ‘package:ggplot2’: margin > > rcuh <- read.csv('pacientibii.csv') > rcuh <- subset(rcuh, !is.na(Mayo)) > rcuh <- subset(rcuh, select = c("Mayo", "NrScaune24H", "Diaree", "TenesmeRectale", + "HDI", "DurereAbdominala","ScaderePonderala", + "Astenie", "PaloareCutanata", + "WBC", "PLT", "MONO", "VSH1h", "Fibrinogen", + "Fier", "ProteineTotale", "alfa1GLOBULINE", + "CRP", "RBC", "HGB", "PDW")) > > > sapply(rcuh, function(x) sum(is.na(x))) #Checking for missing values Mayo NrScaune24H Diaree TenesmeRectale HDI 0 0 0 3 1 DurereAbdominala ScaderePonderala Astenie PaloareCutanata WBC 2 4 2 4 0 PLT MONO VSH1h Fibrinogen Fier 0 1 55 90 26 ProteineTotale alfa1GLOBULINE CRP RBC HGB 61 78 33 0 0 PDW 7 > > ## ---------------------------------------------------------------------------------------------- > ## Imputing missing values using MICE function > ## ---------------------------------------------------------------------------------------------- > > rcuh <- rcuh %>% + mutate( + Mayo = as.factor(Mayo), + Diaree = as.factor(Diaree), + TenesmeRectale = as.factor(TenesmeRectale), + HDI = as.factor(HDI), + DurereAbdominala = as.factor(DurereAbdominala), + ScaderePonderala = as.factor(ScaderePonderala), + Astenie = as.factor(Astenie), + PaloareCutanata = as.factor(PaloareCutanata), + + NrScaune24H = as.numeric(NrScaune24H), + WBC = as.numeric(WBC), + HGB = as.numeric(HGB), + PLT = as.numeric(PLT), + MONO = as.numeric(MONO), + VSH1h = as.numeric(VSH1h), + Fibrinogen = as.numeric(Fibrinogen), + Fier = as.numeric(Fier), + ProteineTotale = as.numeric(ProteineTotale), + CRP = as.numeric(CRP), + alfa1GLOBULINE = as.numeric(alfa1GLOBULINE), + RBC = as.numeric(RBC), + PDW = as.numeric(PDW) + ) > > init = mice(rcuh, maxit=0) > meth = init$method > predM = init$predictorMatrix > > meth[c("RBC")]="norm" > meth[c("PDW")]="norm" > meth[c("WBC")]="norm" > meth[c("HGB")]="norm" > meth[c("PLT")]="norm" > meth[c("MONO")]="norm" > meth[c("VSH1h")]="norm" > meth[c("Fibrinogen")]="norm" > meth[c("Fier")]="norm" > meth[c("ProteineTotale")]="norm" > meth[c("CRP")]="norm" > meth[c("alfa1GLOBULINE")]="norm" > meth[c("NrScaune24H")]="norm" > > meth[c("Diaree")]="logreg" > meth[c("TenesmeRectale")]="logreg" > meth[c("HDI")]="logreg" > meth[c("DurereAbdominala")]="logreg" > meth[c("ScaderePonderala")]="logreg" > meth[c("Astenie")]="logreg" > meth[c("PaloareCutanata")]="logreg" > > set.seed(103) > imputed = mice(rcuh, method=meth, predictorMatrix=predM, m=5) iter imp variable 1 1 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 1 2 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 1 3 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 1 4 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 1 5 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 1 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 2 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 3 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 4 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 5 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 1 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 2 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 3 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 4 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 5 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 1 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 2 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 3 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 4 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 5 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 1 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 2 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 3 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 4 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 5 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW > imputed <- complete(imputed) > > rcuh <- imputed > > sapply(rcuh, function(x) sum(is.na(x))) #checking for missing values after imputation (no missing values detected) Mayo NrScaune24H Diaree TenesmeRectale HDI 0 0 0 0 0 DurereAbdominala ScaderePonderala Astenie PaloareCutanata WBC 0 0 0 0 0 PLT MONO VSH1h Fibrinogen Fier 0 0 0 0 0 ProteineTotale alfa1GLOBULINE CRP RBC HGB 0 0 0 0 0 PDW 0 > > rcuh$Diaree <- as.numeric(rcuh$Diaree) > rcuh$TenesmeRectale <- as.numeric(rcuh$TenesmeRectale) > rcuh$HDI <- as.numeric(rcuh$HDI) > rcuh$DurereAbdominala <- as.numeric(rcuh$DurereAbdominala) > rcuh$ScaderePonderala <- as.numeric(rcuh$ScaderePonderala) > rcuh$Astenie <- as.numeric(rcuh$Astenie) > rcuh$PaloareCutanata <- as.numeric(rcuh$PaloareCutanata) > rcuh$Mayo <- as.numeric(rcuh$Mayo) - 1 > > > ## ---------------------------------------------------------------------------------------------- > ## Normalize numeric variables using the min-max method > ## ---------------------------------------------------------------------------------------------- > > temp <- rcuh$Mayo > normalize <- function(x, ...) { + return ((x - min(x, ...)) / (max(x, ...) - min(x, ...))) + } > maxmindf <- as.data.frame(lapply(rcuh, normalize, na.rm = T)) > rcuh <- maxmindf > rcuh$Mayo <- temp > > ## Transform Mayo score into a binary score (Mayo 0 and 1 become 0 and Mayo 2 and 3 become 1) > > rcuh$Mayo <- replace(rcuh$Mayo, rcuh$Mayo==1, 0) > rcuh$Mayo <- replace(rcuh$Mayo, rcuh$Mayo==2, 1) > rcuh$Mayo <- replace(rcuh$Mayo, rcuh$Mayo==3, 1) > > > ## ---------------------------------------------------------------------------------------------- > ## Randomly divide initial data into the training set (80%) and the test set (20%) > ## ---------------------------------------------------------------------------------------------- > > validSetSize <- 30 > initDataSize <- nrow(rcuh) - validSetSize > > rcuh_init <- rcuh[1 : initDataSize, ] > set.seed(2) > trainset_idx <- sample(1:nrow(rcuh_init), size = round(initDataSize * 0.8)) > trainset <- rcuh_init[trainset_idx,] > testset <- rcuh_init[-trainset_idx,] > > ## Validation set constituted by the last 30 records added independently > > validset <- rcuh[(initDataSize + 1) : nrow(rcuh), ] > > > ## ---------------------------------------------------------------------------------------------- > ## Train the Neural Network model with caret::train function > ## ---------------------------------------------------------------------------------------------- > > trainset$Mayo <- factor(trainset$Mayo) > testset$Mayo <- factor(testset$Mayo) > validset$Mayo <- factor(validset$Mayo) > > set.seed(91) > model_nn <- caret::train(Mayo ~ ., + data = trainset, + method = "mlpML", + preProcess = c("scale", "center"), + trControl = trainControl(method = "repeatedcv", + number = 10, + repeats = 10, + verboseIter = FALSE, + sampling = "smote")) Registered S3 method overwritten by 'quantmod': method from as.zoo.data.frame zoo > > > ## ---------------------------------------------------------------------------------------------- > ## Test the Neural Network model > ## ---------------------------------------------------------------------------------------------- > > ## Model's performance metrics on the train set > > final <- data.frame(actual = trainset$Mayo, + predict(model_nn, newdata = trainset, type = "prob")) > final$predict <- factor(ifelse(final$X0 > 0.5, 0, 1)) > caret::confusionMatrix(final$predict, trainset$Mayo) Confusion Matrix and Statistics Reference Prediction 0 1 0 83 12 1 9 181 Accuracy : 0.9263 95% CI : (0.8896, 0.9538) No Information Rate : 0.6772 P-Value [Acc > NIR] : <2e-16 Kappa : 0.8329 Mcnemar's Test P-Value : 0.6625 Sensitivity : 0.9022 Specificity : 0.9378 Pos Pred Value : 0.8737 Neg Pred Value : 0.9526 Prevalence : 0.3228 Detection Rate : 0.2912 Detection Prevalence : 0.3333 Balanced Accuracy : 0.9200 'Positive' Class : 0 > resultRoc1 <- roc(trainset$Mayo, as.numeric(final$X1)) Setting levels: control = 0, case = 1 Setting direction: controls < cases > > ## Model's performance metrics on the test set > > final <- data.frame(actual = testset$Mayo, + predict(model_nn, newdata = testset, type = "prob")) > final$predict <- factor(ifelse(final$X0 > 0.5, 0, 1)) > caret::confusionMatrix(final$predict, testset$Mayo) Confusion Matrix and Statistics Reference Prediction 0 1 0 22 1 1 3 45 Accuracy : 0.9437 95% CI : (0.862, 0.9844) No Information Rate : 0.6479 P-Value [Acc > NIR] : 3.916e-09 Kappa : 0.8742 Mcnemar's Test P-Value : 0.6171 Sensitivity : 0.8800 Specificity : 0.9783 Pos Pred Value : 0.9565 Neg Pred Value : 0.9375 Prevalence : 0.3521 Detection Rate : 0.3099 Detection Prevalence : 0.3239 Balanced Accuracy : 0.9291 'Positive' Class : 0 > resultRoc2 <- roc(testset$Mayo, as.numeric(final$X1)) Setting levels: control = 0, case = 1 Setting direction: controls < cases > > ## Model's performance metrics on the validation set > > final <- data.frame(actual = validset$Mayo, + predict(model_nn, newdata = validset, type = "prob")) > final$predict <- factor(ifelse(final$X0 > 0.5, 0, 1)) > caret::confusionMatrix(final$predict, validset$Mayo) Confusion Matrix and Statistics Reference Prediction 0 1 0 13 1 1 1 15 Accuracy : 0.9333 95% CI : (0.7793, 0.9918) No Information Rate : 0.5333 P-Value [Acc > NIR] : 2.326e-06 Kappa : 0.8661 Mcnemar's Test P-Value : 1 Sensitivity : 0.9286 Specificity : 0.9375 Pos Pred Value : 0.9286 Neg Pred Value : 0.9375 Prevalence : 0.4667 Detection Rate : 0.4333 Detection Prevalence : 0.4667 Balanced Accuracy : 0.9330 'Positive' Class : 0 > resultRoc3 <- roc(validset$Mayo, as.numeric(final$X1)) Setting levels: control = 0, case = 1 Setting direction: controls < cases > > > ## Plot ROC curves obtained on the train, test and validation sets using ggroc function from pROC package > > ggroc(list("Train set" = resultRoc1, "Test set" = resultRoc2, + "Validation set" = resultRoc3), + legacy.axes = TRUE, linetype=1) + + theme_minimal() + + geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), + color="darkgrey", linetype=2) + + facet_grid(.~name) + + theme (axis.title.x = element_text(size = 10), + axis.text=element_text(size=7), + axis.title.y = element_text(size = 10), + panel.grid.major = element_line(colour = "#edecec", linetype = 3), + panel.grid.minor = element_blank()) > > ggsave("roc_bin_clin_biol.png", limitsize = FALSE) Saving 6.01 x 3.77 in image