> # Name: UCDiseaseActivity_3rdNNModel.R > # Author: Iolanda Valentina Popa, iolivp@gmail.com > # Year: 2020 > # Software: R Studio Version 1.2.1335 © 2009-2019 RStudio, Inc. Build 1379 (f1ac3452) > # Language: R > # Program size: 7.57kB > > > library(mice); library(ggplot2); library(dplyr); library(caret); library(grid); library(pROC); library(mice); library(UBL) Loading required package: lattice Attaching package: ‘mice’ The following objects are masked from ‘package:base’: cbind, rbind Attaching package: ‘dplyr’ The following objects are masked from ‘package:stats’: filter, lag The following objects are masked from ‘package:base’: intersect, setdiff, setequal, union Type 'citation("pROC")' for a citation. Attaching package: ‘pROC’ The following objects are masked from ‘package:stats’: cov, smooth, var Loading required package: MBA Loading required package: gstat Registered S3 method overwritten by 'xts': method from as.zoo.xts zoo Loading required package: automap Loading required package: sp Loading required package: randomForest randomForest 4.6-14 Type rfNews() to see new features/changes/bug fixes. Attaching package: ‘randomForest’ The following object is masked from ‘package:dplyr’: combine The following object is masked from ‘package:ggplot2’: margin > > rcuh <- read.csv('pacientibii.csv') > rcuh <- subset(rcuh, !is.na(Mayo)) > > rcuh <- subset(rcuh, select = c("Mayo", "NrScaune24H", "Diaree", "TenesmeRectale", + "HDI", "DurereAbdominala","ScaderePonderala", + "Astenie", "PaloareCutanata", + "WBC", "PLT", "MONO", "VSH1h", "Fibrinogen", + "Fier", "ProteineTotale", "alfa1GLOBULINE", + "CRP", "RBC", "HGB", "PDW")) > > > sapply(rcuh, function(x) sum(is.na(x))) #Checking for missing values Mayo NrScaune24H Diaree TenesmeRectale HDI 0 0 0 3 1 DurereAbdominala ScaderePonderala Astenie PaloareCutanata WBC 2 4 2 4 0 PLT MONO VSH1h Fibrinogen Fier 0 1 55 90 26 ProteineTotale alfa1GLOBULINE CRP RBC HGB 61 78 33 0 0 PDW 7 > > > ## ---------------------------------------------------------------------------------------------- > ## Imputing missing values using MICE function > ## ---------------------------------------------------------------------------------------------- > > > rcuh <- rcuh %>% + mutate( + Mayo = as.factor(Mayo), + Diaree = as.factor(Diaree), + TenesmeRectale = as.factor(TenesmeRectale), + HDI = as.factor(HDI), + DurereAbdominala = as.factor(DurereAbdominala), + ScaderePonderala = as.factor(ScaderePonderala), + Astenie = as.factor(Astenie), + PaloareCutanata = as.factor(PaloareCutanata), + + NrScaune24H = as.numeric(NrScaune24H), + WBC = as.numeric(WBC), + HGB = as.numeric(HGB), + PLT = as.numeric(PLT), + MONO = as.numeric(MONO), + VSH1h = as.numeric(VSH1h), + Fibrinogen = as.numeric(Fibrinogen), + Fier = as.numeric(Fier), + ProteineTotale = as.numeric(ProteineTotale), + CRP = as.numeric(CRP), + alfa1GLOBULINE = as.numeric(alfa1GLOBULINE), + RBC = as.numeric(RBC), + PDW = as.numeric(PDW) + ) > > init = mice(rcuh, maxit=0) > meth = init$method > predM = init$predictorMatrix > > meth[c("RBC")]="norm" > meth[c("PDW")]="norm" > meth[c("WBC")]="norm" > meth[c("HGB")]="norm" > meth[c("PLT")]="norm" > meth[c("MONO")]="norm" > meth[c("VSH1h")]="norm" > meth[c("Fibrinogen")]="norm" > meth[c("Fier")]="norm" > meth[c("ProteineTotale")]="norm" > meth[c("CRP")]="norm" > meth[c("alfa1GLOBULINE")]="norm" > meth[c("NrScaune24H")]="norm" > > meth[c("Diaree")]="logreg" > meth[c("TenesmeRectale")]="logreg" > meth[c("HDI")]="logreg" > meth[c("DurereAbdominala")]="logreg" > meth[c("ScaderePonderala")]="logreg" > meth[c("Astenie")]="logreg" > meth[c("PaloareCutanata")]="logreg" > > set.seed(103) > imputed = mice(rcuh, method=meth, predictorMatrix=predM, m=5) iter imp variable 1 1 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 1 2 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 1 3 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 1 4 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 1 5 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 1 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 2 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 3 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 4 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 2 5 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 1 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 2 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 3 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 4 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 3 5 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 1 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 2 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 3 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 4 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 4 5 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 1 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 2 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 3 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 4 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW 5 5 TenesmeRectale HDI DurereAbdominala ScaderePonderala Astenie PaloareCutanata MONO VSH1h Fibrinogen Fier ProteineTotale alfa1GLOBULINE CRP PDW > imputed <- complete(imputed) > > rcuh <- imputed > > sapply(rcuh, function(x) sum(is.na(x))) #checking for missing values after imputation (no missing values detected) Mayo NrScaune24H Diaree TenesmeRectale HDI 0 0 0 0 0 DurereAbdominala ScaderePonderala Astenie PaloareCutanata WBC 0 0 0 0 0 PLT MONO VSH1h Fibrinogen Fier 0 0 0 0 0 ProteineTotale alfa1GLOBULINE CRP RBC HGB 0 0 0 0 0 PDW 0 > > > rcuh$Diaree <- as.numeric(rcuh$Diaree) > rcuh$TenesmeRectale <- as.numeric(rcuh$TenesmeRectale) > rcuh$HDI <- as.numeric(rcuh$HDI) > rcuh$DurereAbdominala <- as.numeric(rcuh$DurereAbdominala) > rcuh$ScaderePonderala <- as.numeric(rcuh$ScaderePonderala) > rcuh$Astenie <- as.numeric(rcuh$Astenie) > rcuh$PaloareCutanata <- as.numeric(rcuh$PaloareCutanata) > rcuh$Mayo <- as.numeric(rcuh$Mayo) - 1 > > > ## ---------------------------------------------------------------------------------------------- > ## Normalize numeric variables using the min-max method > ## ---------------------------------------------------------------------------------------------- > > temp <- rcuh$Mayo > normalize <- function(x, ...) { + return ((x - min(x, ...)) / (max(x, ...) - min(x, ...))) + } > maxmindf <- as.data.frame(lapply(rcuh, normalize, na.rm = T)) > rcuh <- maxmindf > rcuh$Mayo <- temp > > > ## ---------------------------------------------------------------------------------------------- > ## Randomly divide initial data into the training set (80%) and the test set (20%) > ## ---------------------------------------------------------------------------------------------- > > validSetSize <- 30 > initDataSize <- nrow(rcuh) - validSetSize > > rcuh_init <- rcuh[1 : initDataSize, ] > set.seed(2) > trainset_idx <- sample(1:nrow(rcuh_init), size = round(initDataSize * 0.8)) > trainset <- rcuh_init[trainset_idx,] > testset <- rcuh_init[-trainset_idx,] > > ## Validation set constituted by the last 30 records added independently > > validset <- rcuh[(initDataSize + 1) : nrow(rcuh), ] > > trainset$Mayo <- factor(trainset$Mayo) > testset$Mayo <- factor(testset$Mayo) > validset$Mayo <- factor(validset$Mayo) > > ## ---------------------------------------------------------------------------------------------- > ## Balance Mayo classes in the train set using synthetic minority over-sampling technique (SMOTE) > ## ---------------------------------------------------------------------------------------------- > > set.seed(8) > balancedtrainset <- SmoteClassif(Mayo ~ ., trainset) > > #Shuffle the balanced dataset returned by the SMOTE sampling > set.seed(3165) > balancedtrainShuffled_idx <- sample(1:nrow(balancedtrainset), size=nrow(balancedtrainset)) > balancedtrainShuffled <- balancedtrainset[balancedtrainShuffled_idx,] > > > ## ---------------------------------------------------------------------------------------------- > ## Train the Neural Network model with caret::train function > ## ---------------------------------------------------------------------------------------------- > > set.seed(9) > model_nn <- caret::train(Mayo ~ ., + data = balancedtrainShuffled, + method = "mlpML", + preProcess = c("scale", "center"), + trControl = trainControl(method = "repeatedcv", + number = 10, + repeats = 10, + verboseIter = FALSE)) > > > ## ---------------------------------------------------------------------------------------------- > ## Test the Neural Network model > ## ---------------------------------------------------------------------------------------------- > > ## Model's performance metrics on the train set > > final <- data.frame(actual = balancedtrainShuffled$Mayo, + predict(model_nn, newdata = balancedtrainShuffled, type = "prob")) > > final$predict <- max.col(subset(final, select = c("X0", "X1", "X2", "X3"))) > confusionMatrix(as.factor(final$predict-1), balancedtrainShuffled$Mayo) Confusion Matrix and Statistics Reference Prediction 0 1 2 3 0 69 2 1 0 1 2 65 8 0 2 0 3 58 9 3 0 1 4 62 Overall Statistics Accuracy : 0.8944 95% CI : (0.8526, 0.9276) No Information Rate : 0.25 P-Value [Acc > NIR] : < 2.2e-16 Kappa : 0.8592 Mcnemar's Test P-Value : NA Statistics by Class: Class: 0 Class: 1 Class: 2 Class: 3 Sensitivity 0.9718 0.9155 0.8169 0.8732 Specificity 0.9859 0.9531 0.9437 0.9765 Pos Pred Value 0.9583 0.8667 0.8286 0.9254 Neg Pred Value 0.9906 0.9713 0.9393 0.9585 Prevalence 0.2500 0.2500 0.2500 0.2500 Detection Rate 0.2430 0.2289 0.2042 0.2183 Detection Prevalence 0.2535 0.2641 0.2465 0.2359 Balanced Accuracy 0.9789 0.9343 0.8803 0.9249 > levels(balancedtrainShuffled$Mayo) <- list(X0="0", X1="1", X2="2", X3="3") > resultRoc <- multiclass.roc(balancedtrainShuffled$Mayo, subset(final, select = c("X0", "X1", "X2", "X3"))) > resultRoc$auc Multi-class area under the curve: 0.9541 > > ## Model's performance metrics on the test set > > final <- data.frame(actual = testset$Mayo, + predict(model_nn, newdata = testset, type = "prob")) > > final$predict <- max.col(subset(final, select = c("X0", "X1", "X2", "X3"))) > confusionMatrix(as.factor(final$predict-1), testset$Mayo) Confusion Matrix and Statistics Reference Prediction 0 1 2 3 0 12 1 0 0 1 3 7 2 1 2 0 1 23 2 3 0 1 6 12 Overall Statistics Accuracy : 0.7606 95% CI : (0.6446, 0.8539) No Information Rate : 0.4366 P-Value [Acc > NIR] : 2.887e-08 Kappa : 0.667 Mcnemar's Test P-Value : NA Statistics by Class: Class: 0 Class: 1 Class: 2 Class: 3 Sensitivity 0.8000 0.70000 0.7419 0.8000 Specificity 0.9821 0.90164 0.9250 0.8750 Pos Pred Value 0.9231 0.53846 0.8846 0.6316 Neg Pred Value 0.9483 0.94828 0.8222 0.9423 Prevalence 0.2113 0.14085 0.4366 0.2113 Detection Rate 0.1690 0.09859 0.3239 0.1690 Detection Prevalence 0.1831 0.18310 0.3662 0.2676 Balanced Accuracy 0.8911 0.80082 0.8335 0.8375 > levels(testset$Mayo) <- list(X0="0", X1="1", X2="2", X3="3") > resultRoc <- multiclass.roc(testset$Mayo, subset(final, select = c("X0", "X1", "X2", "X3"))) > resultRoc$auc Multi-class area under the curve: 0.8726 > > ## Model's performance metrics on the validation set > > final <- data.frame(actual = validset$Mayo, + predict(model_nn, newdata = validset, type = "prob")) > > final$predict <- max.col(subset(final, select = c("X0", "X1", "X2", "X3"))) > confusionMatrix(as.factor(final$predict-1), validset$Mayo) Confusion Matrix and Statistics Reference Prediction 0 1 2 3 0 8 1 1 0 1 0 5 1 0 2 0 0 5 1 3 0 0 2 6 Overall Statistics Accuracy : 0.8 95% CI : (0.6143, 0.9229) No Information Rate : 0.3 P-Value [Acc > NIR] : 2.194e-08 Kappa : 0.7329 Mcnemar's Test P-Value : NA Statistics by Class: Class: 0 Class: 1 Class: 2 Class: 3 Sensitivity 1.0000 0.8333 0.5556 0.8571 Specificity 0.9091 0.9583 0.9524 0.9130 Pos Pred Value 0.8000 0.8333 0.8333 0.7500 Neg Pred Value 1.0000 0.9583 0.8333 0.9545 Prevalence 0.2667 0.2000 0.3000 0.2333 Detection Rate 0.2667 0.1667 0.1667 0.2000 Detection Prevalence 0.3333 0.2000 0.2000 0.2667 Balanced Accuracy 0.9545 0.8958 0.7540 0.8851 > levels(validset$Mayo) <- list(X0="0", X1="1", X2="2", X3="3") > resultRoc <- multiclass.roc(validset$Mayo, subset(final, select = c("X0", "X1", "X2", "X3"))) > resultRoc$auc Multi-class area under the curve: 0.9175