BAB 4. Metode Seleksi secara Penyaringan (Filtering) dalam Pemodelan Regresi

4.2 Variance Threshold

Program 4.1

# Read the cattle data set and discard its first column, leaving the
# measurement columns that appear in the preview below.
data <- read.csv(file = "data sapi.csv")
data <- data[, -1]
# Preview the first six rows; format() is applied to the whole data frame
# first so every column is rendered with six significant digits.
head(format(data, digits = 6), n = 6L)
##   X1pjg_badan X2ttg_badan X3lkr_badan X4pjg_ekor X5pjg_tanduk Ybbt_badan
## 1     118.488     118.634     191.772    36.1714     15.99607    2836.12
## 2     170.237     127.834     174.372    35.6906     14.27944    3058.19
## 3     157.333     155.058     171.637    31.5397     11.45499    3153.16
## 4     116.805     151.166     194.035    30.3483     10.04847    2923.64
## 5     194.384     118.115     188.246    39.9795     12.58484    3315.49
## 6     194.347     137.987     152.743    38.3557     16.57711    3144.16

Program 4.2

# Sample variance of every explanatory variable, sorted from largest to
# smallest, for use as a variance-threshold filter.
# vapply() iterates over the data-frame columns directly (apply() would
# first coerce the whole data frame to a matrix) and guarantees exactly one
# numeric value per column. The response Ybbt_badan is excluded by name
# instead of by its position (6), so a change in column order cannot
# silently drop the wrong variable.
ragam <- vapply(
  data[setdiff(names(data), "Ybbt_badan")],
  var,
  numeric(1)
)
ragam <- sort(ragam, decreasing = TRUE)
ragam
##  X1pjg_badan  X2ttg_badan  X3lkr_badan X5pjg_tanduk   X4pjg_ekor 
##   891.199765   251.548587   233.559358    15.106889     8.605239
# Bar chart of the sorted variances, one bar per explanatory variable,
# labelled with the variable names carried by `ragam`.
barplot(
  height = ragam,
  names.arg = names(ragam),
  col = "darkgreen",
  xlab = "variabel penjelas",
  ylab = "ragam"
)

4.3 Penyaringan menggunakan Koefisien Korelasi

Program 4.3

# Reload the cattle data and discard its first column.
data <- read.csv(file = "data sapi.csv")
data <- data[, -1]
# Pairwise correlations between all remaining columns (explanatory
# variables and the response), using cor()'s default method.
matriks.korelasi <- cor(x = data)
# Display the matrix rounded to four decimal places.
round(matriks.korelasi, digits = 4)
##              X1pjg_badan X2ttg_badan X3lkr_badan X4pjg_ekor X5pjg_tanduk
## X1pjg_badan       1.0000     -0.0153      0.1339     0.0438      -0.0882
## X2ttg_badan      -0.0153      1.0000     -0.1216    -0.0776      -0.0946
## X3lkr_badan       0.1339     -0.1216      1.0000    -0.0110       0.0735
## X4pjg_ekor        0.0438     -0.0776     -0.0110     1.0000       0.0808
## X5pjg_tanduk     -0.0882     -0.0946      0.0735     0.0808       1.0000
## Ybbt_badan        0.8300      0.2266      0.5436     0.0194      -0.0576
##              Ybbt_badan
## X1pjg_badan      0.8300
## X2ttg_badan      0.2266
## X3lkr_badan      0.5436
## X4pjg_ekor       0.0194
## X5pjg_tanduk    -0.0576
## Ybbt_badan       1.0000
library(ggplot2)
library(reshape2)
# Reshape the correlation matrix into long format: one row per pair of
# variables (columns Var1 and Var2) with the correlation in `value`.
# Note that this overwrites the wide matrix stored in `matriks.korelasi`.
matriks.korelasi <- melt(data = matriks.korelasi)
# Heatmap of the correlations: one tile per variable pair, coloured by the
# correlation and annotated with its value to two decimals.
ggplot(
  data = matriks.korelasi,
  mapping = aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  scale_fill_gradient(low = "coral", high = "skyblue") +
  geom_text(mapping = aes(label = sprintf("%.2f", value)), vjust = 1) +
  ggtitle("Correlation Heatmap") +
  xlab("Fitur") +
  ylab("Target") +
  theme_minimal()

4.4 Predictive Power Score

Program 4.4

# Generate X and Y with a quadratic relationship plus Gaussian noise.
set.seed(123)
X <- runif(100, -15, 15)
Y <- X^2 + rnorm(100, 0, 25)
data <- data.frame(X, Y)

# Fit a regression tree with the "rpart" library.
library(rpart)
model.pohon <- rpart(Y ~ X, data = data, method = "anova")

# Predictions of the regression tree on the same data it was fitted to.
prediksi <- predict(model.pohon, newdata = data)

# Mean absolute error (MAE) of the regression tree.
mae_pohon <- mean(abs(data$Y - prediksi))

# MAE of the naive model, which always predicts the median of Y.
mae_naive <- mean(abs(data$Y - median(data$Y)))

# Predictive power score: the share of the naive model's MAE that the tree
# removes (0 = no better than the median, 1 = perfect prediction).
ppskor <- 1 - (mae_pohon / mae_naive)

# Scatter plot of y against x.
plot(X, Y,
     type = "p",
     col = "blue",
     xlab = "x",
     ylab = "y",
     main = "PPS model Pohon Regresi")

# Add the true trend line y = x^2 on a grid from -15 to 15 in steps of 0.1.
# `lty` is spelled out in full: the abbreviated `lt` only worked through
# partial argument matching.
kisi.x <- (-150:150) / 10
lines(kisi.x, kisi.x^2, col = "black", lty = 1)
# Annotate the plot with the PPS(x, y) value.
text(0, 150, sprintf("PPS (x,y) = %.2f", ppskor), col = "blue")

Program 4.5

# Reload the cattle data and drop the first column, as Programs 4.1 and 4.3
# do, so that `data` holds only the explanatory variables and the response.
# Without this, any later call that scans every column of `data` would
# treat the first column as one more predictor.
# NOTE(review): assumes the first CSV column is not an explanatory
# variable (e.g. a row index) -- confirm against the data file.
data <- read.csv("data sapi.csv")
data <- data[, -1]
library(ppsr)

# Predictive power score of each explanatory variable for predicting
# Ybbt_badan, printed one value per line in the order listed.
peubah.penjelas <- c(
  "X1pjg_badan",
  "X2ttg_badan",
  "X3lkr_badan",
  "X4pjg_ekor",
  "X5pjg_tanduk"
)
for (peubah in peubah.penjelas) {
  print(score(df = data, x = peubah, y = "Ybbt_badan")$pps)
}

Output 4.1

## [1] 0.4355636
## [1] 0.01688277
## [1] 0.07520535
## [1] 0
## [1] 0.06049186

Gambar 4.5

# Plot the predictive power scores of the columns of `data` for the target
# Ybbt_badan, with include_target switched off.
visualize_pps(
  df = data,
  y = "Ybbt_badan",
  include_target = FALSE
)

Gambar 4.6

# Same PPS plot for the target Ybbt_badan, but with the scoring algorithm
# set explicitly to "glm" instead of the package default.
visualize_pps(
  df = data,
  y = "Ybbt_badan",
  include_target = FALSE,
  algorithm = "glm"
)

Leave a Comment