BAB 5. Metode Seleksi secara Penyaringan (Filtering) dalam Pemodelan Klasifikasi

5.1 Statistik Uji ANOVA

Program 5.1

# Program 5.1: rank the numeric predictors by the F statistic of a one-way
# test of each predictor against the promo response.
promo <- read.csv("promo.csv")

# Drop columns 1, 2 and 4; they are not scanned by this program.
promo2 <- promo[, -c(1, 2, 4)]

# Derived predictors: frequency x value per product group, and the ratio of
# the total cash value to the sum of those three products.
promo2$X12 <- promo2$frekuensi.fashion * promo2$nilai.fashion
promo2$X13 <- promo2$frekuensi.footwear * promo2$nilai.footwear
promo2$X14 <- promo2$frekuensi.lainnya * promo2$nilai.lainnya
promo2$X15 <- promo2$total.nilai.tunai / (promo2$X12 + promo2$X13 + promo2$X14)

# Column 10 of promo2 is skipped: the grouping variable promo is taken by name
# below, and the original name lookup excluded that same position.
kolom_prediktor <- c(1:9, 11:14)

# One F statistic per predictor. oneway.test() is called with its default
# var.equal = FALSE, i.e. the Welch version of the test.
# The statistic is stored in `nilai_f` rather than `F` so that the built-in
# alias F (FALSE) is not masked.
nilai_f <- vapply(kolom_prediktor, function(i) {
  y <- promo2[, i]
  x <- promo2$promo
  unname(oneway.test(y ~ x)$statistic)
}, numeric(1))

# Names are looked up with the same indices that were scanned, so the labels
# cannot drift from the statistics if the column layout changes.
namavar <- colnames(promo2)[kolom_prediktor]
hasil <- data.frame(namavar, V1 = kolom_prediktor, F = nilai_f)

# Show the ranking, largest F first (this is the table listed as Output 5.1).
hasil[order(hasil$F, decreasing = TRUE), ]

Output 5.1

##               namavar V1      F
## 11                X13 12 17.523
## 5      nilai.footwear  5 12.593
## 4  frekuensi.footwear  4  7.501
## 9         lama.member  9  6.086
## 1                usia  1  4.335
## 8   total.nilai.tunai  8  2.360
## 6   frekuensi.lainnya  6  1.270
## 3       nilai.fashion  3  0.753
## 2   frekuensi.fashion  2  0.517
## 10                X12 11  0.466
## 13                X15 14  0.252
## 12                X14 13  0.101
## 7       nilai.lainnya  7  0.000

5.2 Statistik Uji Chi-Square

Program 5.2

# Program 5.2: rank predictors by the chi-square statistic of their
# contingency table against the promo response.
promo <- read.csv("promo.csv")
promo$X12 <- promo$frekuensi.fashion * promo$nilai.fashion
promo$X13 <- promo$frekuensi.footwear * promo$nilai.footwear
promo$X14 <- promo$frekuensi.lainnya * promo$nilai.lainnya
promo$X15 <- promo$total.nilai.tunai / (promo$X12 + promo$X13 + promo$X14)

library(classInt)

# Chi-square statistic of one categorical vector against promo$promo.
chi_terhadap_promo <- function(x) {
  unname(chisq.test(x = x, y = promo$promo)$statistic)
}

# Columns 2 and 4 are tested as they are (no binning).
kolom_kategorik <- c(2, 4)
chi_kategorik <- vapply(
  kolom_kategorik,
  function(i) chi_terhadap_promo(promo[, i]),
  numeric(1)
)
## Warning in chisq.test(x = x, y = promo$promo): Chi-squared approximation may be
## incorrect

# The remaining predictors are binned at their tertiles before testing.
kolom_numerik <- c(3, 5:12, 14:17)
chi_numerik <- vapply(kolom_numerik, function(i) {
  batas <- classIntervals(promo[, i], style = "quantile", n = 3)$brks
  # Tied quantiles produce repeated breaks, which cut() rejects.
  batas <- unique(batas)
  # include.lowest = TRUE keeps observations equal to the smallest break.
  # Without it cut() builds (a, b] intervals only, turns every minimum value
  # into NA, and those rows silently vanish from the contingency table.
  x <- cut(promo[, i], breaks = batas, include.lowest = TRUE)
  chi_terhadap_promo(x)
}, numeric(1))

# Column 1: index into promo, column 2: chi-square statistic.
hasil <- cbind(c(kolom_kategorik, kolom_numerik), c(chi_kategorik, chi_numerik))
namavar <- colnames(promo)[hasil[, 1]]
hasil <- data.frame(namavar, hasil[, 2])

# Largest statistic first. Values for the binned predictors can differ from
# listings produced without include.lowest = TRUE, because those runs left out
# the observations sitting on the lowest break.
hasil[order(hasil[, 2], decreasing = TRUE), ]
##               namavar  hasil...2.
## 13                X13 13.05116836
## 7      nilai.footwear  9.65383753
## 2          pendidikan  5.83806675
## 3                usia  5.22957670
## 11        lama.member  4.94429513
## 6  frekuensi.footwear  4.30378447
## 8   frekuensi.lainnya  3.55309707
## 14                X14  2.80560292
## 12                X12  1.90548398
## 10  total.nilai.tunai  1.72696472
## 4   frekuensi.fashion  1.47473733
## 5       nilai.fashion  1.07881176
## 15                X15  1.03711611
## 9       nilai.lainnya  0.20361673
## 1       jenis.kelamin  0.09438563

5.3 Performa Model Klasifikasi Sederhana

Program 5.3

# Program 5.3: rank predictors by the AUC of a single-variable classification
# tree fitted to promo.
promo <- read.csv("promo.csv")
promo$X12 <- promo$frekuensi.fashion * promo$nilai.fashion
promo$X13 <- promo$frekuensi.footwear * promo$nilai.footwear
promo$X14 <- promo$frekuensi.lainnya * promo$nilai.lainnya
promo$X15 <- promo$total.nilai.tunai / (promo$X12 + promo$X13 + promo$X14)

library(rpart)
library(pROC)

# Column 13 is the response; column 1 is not scanned.
y <- promo[, 13]
kolom_prediktor <- c(2:12, 14:17)

# Preallocated result: column 1 = index into promo, column 2 = AUC.
hasil <- matrix(NA_real_, nrow = length(kolom_prediktor), ncol = 2)

for (k in seq_along(kolom_prediktor)) {
  i <- kolom_prediktor[k]
  x <- promo[, i]

  # cp = -1 disables cost-complexity pruning, so the tree keeps splitting
  # while a node has at least 15 observations.
  model.x <- rpart(
    as.factor(y) ~ x,
    control = rpart.control(cp = -1, minsplit = 15)
  )

  # Second column of the prediction matrix: probability of the second class.
  # The predictions are made on the same rows the tree was fitted on.
  prediksi <- predict(model.x, data.frame(x))[, 2]

  # Levels and direction are stated explicitly instead of being auto-detected:
  # control = 0, case = 1, controls < cases. This is the configuration pROC
  # reported choosing for every predictor, so the AUC values are unchanged,
  # the "Setting levels/direction" messages are no longer emitted, and the
  # direction can never flip from one predictor to the next.
  kurva.roc <- roc(y, prediksi, levels = c(0, 1), direction = "<")
  hasil[k, ] <- c(i, as.numeric(auc(kurva.roc)))
}

namavar <- colnames(promo)[hasil[, 1]]
hasil <- data.frame(namavar, hasil[, 2])

# Largest AUC first.
hasil[order(hasil[, 2], decreasing = TRUE), ]
##               namavar hasil...2.
## 13                X13  0.8322176
## 7      nilai.footwear  0.8321827
## 14                X14  0.8305265
## 9       nilai.lainnya  0.8065202
## 5       nilai.fashion  0.7940028
## 12                X12  0.7812936
## 11        lama.member  0.7152894
## 15                X15  0.7012204
## 10  total.nilai.tunai  0.6999128
## 2                usia  0.6625523
## 6  frekuensi.footwear  0.5932531
## 8   frekuensi.lainnya  0.5761158
## 4   frekuensi.fashion  0.5759589
## 3          pendidikan  0.5612796
## 1       jenis.kelamin  0.5116283

Program 5.4

# Program 5.4: rank predictors with FSelector::oneR() after binning the
# numeric predictors at their quintiles.
promo <- read.csv("promo.csv")
promo$X12 <- promo$frekuensi.fashion * promo$nilai.fashion
promo$X13 <- promo$frekuensi.footwear * promo$nilai.footwear
promo$X14 <- promo$frekuensi.lainnya * promo$nilai.lainnya
promo$X15 <- promo$total.nilai.tunai / (promo$X12 + promo$X13 + promo$X14)

library(FSelector)
library(classInt)

# Work on a copy so the raw values in promo stay available.
promo.diskret <- promo
for (i in c(3, 5:12, 14:17)) {
  batas <- classIntervals(promo[, i], style = "quantile", n = 5)$brks
  # Tied quantiles produce repeated breaks, which cut() rejects.
  batas <- unique(batas)
  # include.lowest = TRUE keeps observations equal to the smallest break;
  # without it they are converted to NA instead of landing in the first bin.
  promo.diskret[, i] <- cut(promo[, i], breaks = batas, include.lowest = TRUE)
}

# Column 1 is dropped before scoring so it is not treated as a predictor.
promo.diskret <- promo.diskret[, -1]

kinerja <- oneR(promo ~ ., promo.diskret)
kinerja <- data.frame(namavar = rownames(kinerja), kinerja = kinerja[, 1])

# Sort the results, best score first. Scores of the binned predictors can
# differ from listings produced without include.lowest = TRUE.
kinerja[order(kinerja[, 2], decreasing = TRUE), ]
##               namavar   kinerja
## 10  total.nilai.tunai 0.7994429
## 15                X15 0.7715877
## 14                X14 0.5292479
## 13                X13 0.5236769
## 8   frekuensi.lainnya 0.5208914
## 12                X12 0.5208914
## 6  frekuensi.footwear 0.5097493
## 4   frekuensi.fashion 0.4930362
## 7      nilai.footwear 0.4902507
## 5       nilai.fashion 0.4818942
## 2                usia 0.4791086
## 11        lama.member 0.4707521
## 9       nilai.lainnya 0.4596100
## 1       jenis.kelamin 0.4233983
## 3          pendidikan 0.4233983

5.4 Information Gain dan Gain Ratio

Program 5.5

# Program 5.5: rank predictors by information gain (in bits) after binning
# the numeric predictors into six quantile classes.
promo <- read.csv("promo.csv")
promo$X12 <- promo$frekuensi.fashion * promo$nilai.fashion
promo$X13 <- promo$frekuensi.footwear * promo$nilai.footwear
promo$X14 <- promo$frekuensi.lainnya * promo$nilai.lainnya
promo$X15 <- promo$total.nilai.tunai / (promo$X12 + promo$X13 + promo$X14)

library(FSelector)
library(classInt)

# Work on a copy so the raw values in promo stay available.
# NOTE(review): columns 2 and 4 are passed on exactly as read.csv() returned
# them; confirm that FSelector treats them as categorical predictors.
promo.diskret <- promo
for (i in c(3, 5:12, 14:17)) {
  batas <- classIntervals(promo[, i], style = "quantile", n = 6)$brks
  # Tied quantiles produce repeated breaks, which cut() rejects.
  batas <- unique(batas)
  # include.lowest = TRUE keeps observations equal to the smallest break;
  # without it they are converted to NA instead of landing in the first bin.
  promo.diskret[, i] <- cut(promo[, i], breaks = batas, include.lowest = TRUE)
}

# Column 1 is dropped before scoring so it is not treated as a predictor.
# promo.diskret is left in the workspace for the programs that follow.
promo.diskret <- promo.diskret[, -1]

inf.gain <- information.gain(promo ~ ., promo.diskret, unit = "log2")
inf.gain <- data.frame(namavar = rownames(inf.gain), kinerja = inf.gain[, 1])

# Sort the results, largest gain first. Gains of the binned predictors can
# differ from listings produced without include.lowest = TRUE.
inf.gain[order(inf.gain[, 2], decreasing = TRUE), ]
##               namavar    kinerja
## 13                X13 0.07177722
## 14                X14 0.06093536
## 7      nilai.footwear 0.06046003
## 11        lama.member 0.05865637
## 8   frekuensi.lainnya 0.04552890
## 5       nilai.fashion 0.04546746
## 4   frekuensi.fashion 0.04330867
## 6  frekuensi.footwear 0.04264099
## 10  total.nilai.tunai 0.04257620
## 2                usia 0.03935223
## 9       nilai.lainnya 0.03614214
## 12                X12 0.03303504
## 15                X15 0.01648100
## 1       jenis.kelamin 0.00000000
## 3          pendidikan 0.00000000

Program 5.6

# Program 5.6: gain ratio (in bits) of every predictor in promo.diskret with
# respect to promo. promo.diskret must already exist in the workspace.
skor_gain_ratio <- gain.ratio(promo ~ ., promo.diskret, unit = "log2")
gain.r <- data.frame(
  namavar = rownames(skor_gain_ratio),
  kinerja = skor_gain_ratio[, 1]
)

# Sort the results, largest ratio first.
urutan <- order(gain.r[, 2], decreasing = TRUE)
gain.r[urutan, ]
##               namavar     kinerja
## 13                X13 0.026476710
## 7      nilai.footwear 0.023205901
## 14                X14 0.022230694
## 11        lama.member 0.022121073
## 10  total.nilai.tunai 0.019943318
## 6  frekuensi.footwear 0.018772561
## 8   frekuensi.lainnya 0.018458150
## 5       nilai.fashion 0.017451422
## 4   frekuensi.fashion 0.016819272
## 2                usia 0.015200164
## 9       nilai.lainnya 0.013872154
## 12                X12 0.012163445
## 15                X15 0.007719942
## 1       jenis.kelamin 0.000000000
## 3          pendidikan 0.000000000

5.5 Information Value

Program 5.7

# Program 5.7: information value (IV) of every predictor in promo.diskret
# with respect to promo. promo.diskret must already exist in the workspace;
# its column 12 is the promo response.
library(woe)

kolom_prediktor <- c(1:11, 13:16)

# Preallocated result: column 1 = index into promo.diskret, column 2 = IV.
hasil <- matrix(NA_real_, nrow = length(kolom_prediktor), ncol = 2)

for (k in seq_along(kolom_prediktor)) {
  i <- kolom_prediktor[k]

  # Two-column frame: the predictor under test plus the response.
  data <- promo.diskret[, c(i, 12)]
  namavariable <- colnames(data)[1]

  # Row count of the frame. The previous nrow(data[, 1]) asked a vector for
  # its row count, which is always NULL.
  # NOTE(review): this value fills woe()'s fifth argument; with the third
  # argument FALSE it is presumably unused - confirm against the woe docs.
  n <- nrow(data)

  # Column 9 of woe()'s result is summed to obtain the predictor's IV.
  # NOTE(review): presumably the per-level IV column - confirm.
  iv <- sum(woe(data, namavariable, FALSE, "promo", n, Bad = 0, Good = 1)[, 9])
  hasil[k, ] <- c(i, iv)
}

iv <- data.frame(namavar = colnames(promo.diskret[, -12]), iv = hasil[, 2])

# Largest information value first.
iv[order(iv[, 2], decreasing = TRUE), ]
##               namavar    iv
## 13                X13 0.293
## 7      nilai.footwear 0.168
## 2                usia 0.111
## 11        lama.member 0.102
## 4   frekuensi.fashion 0.098
## 10  total.nilai.tunai 0.081
## 14                X14 0.079
## 6  frekuensi.footwear 0.074
## 3          pendidikan 0.069
## 15                X15 0.067
## 12                X12 0.051
## 8   frekuensi.lainnya 0.050
## 5       nilai.fashion 0.024
## 9       nilai.lainnya 0.009
## 1       jenis.kelamin 0.002

# Save the table in its original (unsorted) row order.
write.csv(iv, "hasil inf value.csv")

Leave a Comment