BAB 3. Pemodelan Klasifikasi

3.1. Regresi Logistik

Program 3.1

# Import the data
dataEHR <- read.csv("data-EHR.csv", sep = ",")
# Convert columns 10 and 11 to factors.
# NOTE(review): the original comment named SOURCE and AGE, but the model
# summary reports AGE as a single numeric coefficient and SEX as the dummy
# SEXM, so these columns appear to be SEX and SOURCE -- confirm against the
# CSV header.
library(dplyr)
dataEHR <- dataEHR %>% mutate(across(c(10, 11), factor))
# Split the data: 20% of the rows are drawn at random (without replacement)
# as the test set, the remaining rows form the training set.
set.seed(123)
n <- round(0.2 * nrow(dataEHR), 0)
contoh <- sample(nrow(dataEHR), n, replace = FALSE)
EHRlatih <- dataEHR[-contoh, ]
EHRuji <- dataEHR[contoh, ]
# Fit a logistic regression of SOURCE on all remaining columns
logit <- glm(SOURCE ~ ., data = EHRlatih, family = binomial)
# Print the model summary
summary(logit)
# Report Accuracy, Sensitivity and Specificity for a set of predictions.
#   pred: predicted class labels, passed as `data` to caret::confusionMatrix()
#   data: object whose SOURCE component holds the reference labels
# Returns the first element of the confusion matrix's `overall` vector followed
# by the first two elements of its `byClass` vector; "in" is the positive class.
perform <- function(pred, data) {
  cm <- caret::confusionMatrix(pred, data$SOURCE, positive = "in")
  c(cm$overall[1], cm$byClass[1:2])
}
# Predicted probabilities for the test set. With a factor response, glm()
# models the probability of the non-first level, so (assuming the levels of
# SOURCE are "in" then "out") these are probabilities of "out".
pred.logit <- list(
  pred = predict(logit, newdata = EHRuji, type = "response")
)
# Classify with a 0.50 cut-off. The factor levels are taken from the reference
# column so that both classes stay present even when one is never predicted.
pred.logit$in_pred <- factor(
  ifelse(pred.logit$pred > 0.50, "out", "in"),
  levels = levels(EHRuji$SOURCE)
)
# Accuracy, sensitivity and specificity on the test set
data.frame(nilai = perform(pred.logit$in_pred, EHRuji))

Output 3.1

## 
## Call:
## glm(formula = SOURCE ~ ., family = binomial, data = EHRlatih)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  12.7040360  9.2818415   1.369   0.1711    
## HAEMATOCRIT   0.0087554  0.0605721   0.145   0.8851    
## HAEMOGLOBINS  0.0991677  0.2089742   0.475   0.6351    
## ERYTHROCYTE   0.5399139  0.4188916   1.289   0.1974    
## LEUCOCYTE    -0.0850973  0.0094811  -8.976  < 2e-16 ***
## THROMBOCYTE   0.0071632  0.0004233  16.923  < 2e-16 ***
## MCH           0.8117858  0.3244528   2.502   0.0123 *  
## MCHC         -0.6368730  0.2695991  -2.362   0.0182 *  
## MCV          -0.2206734  0.1081518  -2.040   0.0413 *  
## AGE          -0.0048489  0.0020418  -2.375   0.0176 *  
## SEXM         -0.3694524  0.0828202  -4.461 8.16e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4761.0  on 3529  degrees of freedom
## Residual deviance: 4045.6  on 3519  degrees of freedom
## AIC: 4067.6
## 
## Number of Fisher Scoring iterations: 4
##                 nilai
## Accuracy    0.6882086
## Sensitivity 0.4333333
## Specificity 0.8639847

3.2. Pohon Klasifikasi

Program 3.2

# Load the data
dataEHR <- read.csv("data-EHR.csv", sep = ",")
# Convert columns 10 and 11 to factors.
# NOTE(review): the original comment named SOURCE and AGE, but the printed
# model outputs show AGE as numeric and SEX as the dummy SEXM, so these
# columns appear to be SEX and SOURCE -- confirm against the CSV header.
library(dplyr)
dataEHR <- dataEHR %>% mutate(across(c(10, 11), factor))
# Split the data: 20% of the rows are drawn at random (without replacement)
# as the test set, the remaining rows form the training set.
set.seed(123)
n <- round(0.2 * nrow(dataEHR), 0)
contoh <- sample(nrow(dataEHR), n, replace = FALSE)
EHRlatih <- dataEHR[-contoh, ]
EHRuji <- dataEHR[contoh, ]
# Fit a classification tree of SOURCE on all other columns, with splitting
# controlled by minsplit = 50, cp = 0 and maxdepth = 4
library(rpart)
pohon <- rpart(
  SOURCE ~ .,
  data = EHRlatih,
  method = "class",
  control = rpart.control(minsplit = 50, cp = 0, maxdepth = 4)
)
# Draw the fitted tree
library(rpart.plot)
rpart.plot(pohon, type = 2, extra = 101, under = TRUE, cex = 0.8)

Gambar 3.2

3.3. Support Vector Machine untuk Klasifikasi

Program 3.3

# Import the data
dataEHR <- read.csv("data-EHR.csv", sep = ",")
# Convert columns 10 and 11 to factors.
# NOTE(review): the original comment named SOURCE and AGE, but the printed
# model outputs show AGE as numeric and SEX as the dummy SEXM, so these
# columns appear to be SEX and SOURCE -- confirm against the CSV header.
library(dplyr)
dataEHR <- dataEHR %>% mutate(across(c(10, 11), factor))
# Split the data: 20% of the rows are drawn at random (without replacement)
# as the test set, the remaining rows form the training set.
set.seed(123)
n <- round(0.2 * nrow(dataEHR), 0)
contoh <- sample(nrow(dataEHR), n, replace = FALSE)
EHRlatih <- dataEHR[-contoh, ]
EHRuji <- dataEHR[contoh, ]
# Fit one SVM per kernel on the training set and predict the test set with it
library(e1071)
# Linear kernel
model.linear <- svm(SOURCE ~ ., data = EHRlatih, kernel = "linear")
pred.linear <- predict(model.linear, newdata = EHRuji)
# Polynomial kernel
model.polynomial <- svm(SOURCE ~ ., data = EHRlatih, kernel = "polynomial")
pred.polynomial <- predict(model.polynomial, newdata = EHRuji)
# Radial kernel
model.radial <- svm(SOURCE ~ ., data = EHRlatih, kernel = "radial")
pred.radial <- predict(model.radial, newdata = EHRuji)
# Sigmoid kernel
model.sigmoid <- svm(SOURCE ~ ., data = EHRlatih, kernel = "sigmoid")
pred.sigmoid <- predict(model.sigmoid, newdata = EHRuji)
# Report Accuracy, Sensitivity and Specificity for a set of predictions.
#   pred: predicted class labels, passed as `data` to caret::confusionMatrix()
#   data: object whose SOURCE component holds the reference labels
# Returns the first element of the confusion matrix's `overall` vector followed
# by the first two elements of its `byClass` vector; "in" is the positive class.
perform <- function(pred, data) {
  cm <- caret::confusionMatrix(pred, data$SOURCE, positive = "in")
  c(cm$overall[1], cm$byClass[1:2])
}
# Tabulate the test-set metrics: one column per kernel, one row per metric
data.frame(
  svm.linear = perform(pred.linear, EHRuji),
  svm.polynomial = perform(pred.polynomial, EHRuji),
  svm.radial = perform(pred.radial, EHRuji),
  svm.sigmoid = perform(pred.sigmoid, EHRuji)
)
##             svm.linear svm.polynomial svm.radial svm.sigmoid
## Accuracy     0.6916100      0.7142857  0.7346939   0.6099773
## Sensitivity  0.4250000      0.4416667  0.5138889   0.4666667
## Specificity  0.8754789      0.9022989  0.8869732   0.7088123

3.4. Neural Network untuk Klasifikasi

Program 3.4

# Import the data
dataEHR <- read.csv("data-EHR.csv", sep = ",")
# Convert columns 10 and 11 to factors.
# NOTE(review): the original comment named SOURCE and AGE, but the printed
# model outputs show AGE as numeric and SEX as the dummy SEXM, so these
# columns appear to be SEX and SOURCE -- confirm against the CSV header.
library(dplyr)
dataEHR <- dataEHR %>% mutate(across(c(10, 11), factor))
# Split the data: 20% of the rows are drawn at random (without replacement)
# as the test set, the remaining rows form the training set.
set.seed(123)
n <- round(0.2 * nrow(dataEHR), 0)
contoh <- sample(nrow(dataEHR), n, replace = FALSE)
EHRlatih <- dataEHR[-contoh, ]
EHRuji <- dataEHR[contoh, ]
# Turn the categorical variables into dummy columns: both data sets are
# replaced by the design matrix of the same one-sided formula.
EHRlatih <- model.matrix(
  object = ~ SOURCE + SEX + ERYTHROCYTE + HAEMOGLOBINS + HAEMATOCRIT,
  data = EHRlatih
)
# Peek at the first rows of the training design matrix
head(EHRlatih)
EHRuji <- model.matrix(
  object = ~ SOURCE + SEX + ERYTHROCYTE + HAEMOGLOBINS + HAEMATOCRIT,
  data = EHRuji
)

Output 3.3

##   (Intercept) SOURCEout SEXM ERYTHROCYTE HAEMOGLOBINS HAEMATOCRIT
## 1           1         1    0        4.65         11.8        35.1
## 2           1         1    0        5.39         14.8        43.5
## 3           1         1    0        4.74         11.3        33.5
## 4           1         1    0        4.98         13.7        39.1
## 5           1         1    1        4.23          9.9        30.9
## 6           1         1    1        4.53         11.6        34.3

Program 3.5

# Fit the neural network
library(caret)
library(neuralnet)
# Fix the RNG seed before fitting: neuralnet() initialises its starting
# weights at random by default, so without a seed each run of this program
# yields different weights, errors and step counts.
set.seed(123)
# One hidden layer with 3 units, logistic activation, and the activation also
# applied to the output node (linear.output = FALSE).
model.nn <- neuralnet(
  SOURCEout ~ HAEMOGLOBINS,
  data = EHRlatih,
  hidden = 3,
  linear.output = FALSE,
  act.fct = "logistic"
)
# Error, convergence information and fitted weights
model.nn$result.matrix

Output 3.4

##                                   [,1]
## error                     3.882273e+02
## reached.threshold         9.943177e-03
## steps                     2.450100e+04
## Intercept.to.1layhid1     4.770737e+01
## HAEMOGLOBINS.to.1layhid1 -1.224816e+01
## Intercept.to.1layhid2     3.959587e+00
## HAEMOGLOBINS.to.1layhid2 -4.803760e-01
## Intercept.to.1layhid3     1.532838e+01
## HAEMOGLOBINS.to.1layhid3 -1.887416e+00
## Intercept.to.SOURCEout    1.280384e+00
## 1layhid1.to.SOURCEout     1.609495e+03
## 1layhid2.to.SOURCEout    -6.941681e+00
## 1layhid3.to.SOURCEout     3.746391e+00

Program 3.6

# Draw the fitted neural network
plot(x = model.nn)

Output 3.5

##                                   [,1]
## error                     3.879311e+02
## reached.threshold         8.991188e-03
## steps                     3.578200e+04
## Intercept.to.1layhid1    -1.379596e+01
## HAEMOGLOBINS.to.1layhid1  1.597750e+00
## Intercept.to.1layhid2     1.059891e+01
## HAEMOGLOBINS.to.1layhid2 -6.412510e-01
## Intercept.to.1layhid3    -1.142406e+01
## HAEMOGLOBINS.to.1layhid3  1.157201e+00
## Intercept.to.SOURCEout    1.847248e+00
## 1layhid1.to.SOURCEout    -3.371541e+00
## 1layhid2.to.SOURCEout    -1.638484e+00
## 1layhid3.to.SOURCEout     3.716425e+00

Program 3.7

# Run the test-set design matrix through the fitted network
pred.nn <- neuralnet::compute(x = model.nn, covariate = EHRuji)
# First few network outputs
head(pred.nn$net.result)

Output 3.6

##                                   [,1]
## error                     3.882653e+02
## reached.threshold         9.420197e-03
## steps                     8.025000e+03
## Intercept.to.1layhid1     1.733201e+01
## HAEMOGLOBINS.to.1layhid1 -2.112837e+00
## Intercept.to.1layhid2    -4.336221e+00
## HAEMOGLOBINS.to.1layhid2  4.963030e-01
## Intercept.to.1layhid3     2.633239e+01
## HAEMOGLOBINS.to.1layhid3 -7.189382e+00
## Intercept.to.SOURCEout   -4.594180e+00
## 1layhid1.to.SOURCEout     3.169463e+00
## 1layhid2.to.SOURCEout     5.864738e+00
## 1layhid3.to.SOURCEout     4.695038e+02
##           [,1]
## 2463 0.7120028
## 2511 0.7393380
## 2227 0.5650440
## 526  0.6776682
## 4291 0.6191826
## 2986 0.7046642

Program 3.8

# Turn the predicted probabilities into class labels using a 0.5 cut-off:
# "1" when the network output is >= 0.5, otherwise "0".
yhat <- data.frame("yhat" = ifelse(pred.nn$net.result >= 0.5, "1", "0"))
# Build the confusion matrix. Both factors are given the explicit levels
# 0 and 1 so they stay aligned even if one class never occurs, and the
# reference column is selected by name instead of by position.
caret::confusionMatrix(
  data = factor(yhat$yhat, levels = c("0", "1")),
  reference = factor(EHRuji[, "SOURCEout"], levels = c(0, 1)),
  positive = "1"
)

Output 3.7

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 107  83
##          1 253 439
##                                           
##                Accuracy : 0.619           
##                  95% CI : (0.5861, 0.6512)
##     No Information Rate : 0.5918          
##     P-Value [Acc > NIR] : 0.05331         
##                                           
##                   Kappa : 0.1491          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.8410          
##             Specificity : 0.2972          
##          Pos Pred Value : 0.6344          
##          Neg Pred Value : 0.5632          
##              Prevalence : 0.5918          
##          Detection Rate : 0.4977          
##    Detection Prevalence : 0.7846          
##       Balanced Accuracy : 0.5691          
##                                           
##        'Positive' Class : 1               
## 

Leave a Comment