4.2 Variance Threshold
Program 4.1
# Load the cattle data set, discard the first column of the CSV, and preview
# the first six rows with every value formatted to 6 significant digits.
data <- read.csv(file = "data sapi.csv")[, -1]
head(format(data, digits = 6), n = 6L)
##   X1pjg_badan X2ttg_badan X3lkr_badan X4pjg_ekor X5pjg_tanduk Ybbt_badan
## 1     118.488     118.634     191.772    36.1714     15.99607    2836.12
## 2     170.237     127.834     174.372    35.6906     14.27944    3058.19
## 3     157.333     155.058     171.637    31.5397     11.45499    3153.16
## 4     116.805     151.166     194.035    30.3483     10.04847    2923.64
## 5     194.384     118.115     188.246    39.9795     12.58484    3315.49
## 6     194.347     137.987     152.743    38.3557     16.57711    3144.16
Program 4.2
# Variance of every explanatory variable, sorted from largest to smallest.
# The target column is excluded by name instead of by position, and vapply()
# replaces apply() so the data frame is not coerced to a matrix and each
# result is checked to be a single numeric value.
kolom_penjelas <- setdiff(names(data), "Ybbt_badan")
ragam <- vapply(data[kolom_penjelas], var, numeric(1))
ragam <- sort(ragam, decreasing = TRUE)
ragam
##  X1pjg_badan  X2ttg_badan  X3lkr_badan X5pjg_tanduk   X4pjg_ekor
##   891.199765   251.548587   233.559358    15.106889     8.605239
# Bar chart of the sorted variances: one dark-green bar per explanatory
# variable, labelled with the variable's name.
barplot(
  height = ragam,
  names.arg = names(ragam),
  col = "darkgreen",
  xlab = "variabel penjelas",
  ylab = "ragam"
)
4.3 Penyaringan menggunakan Koefisien Korelasi
Program 4.3
# Reload the cattle data set without the first column of the CSV.
data <- read.csv(file = "data sapi.csv")[, -1]

# Pairwise correlations between all remaining columns (explanatory variables
# and target), displayed rounded to four decimal places.
matriks.korelasi <- cor(x = data)
round(matriks.korelasi, digits = 4)
##              X1pjg_badan X2ttg_badan X3lkr_badan X4pjg_ekor X5pjg_tanduk
## X1pjg_badan       1.0000     -0.0153      0.1339     0.0438      -0.0882
## X2ttg_badan      -0.0153      1.0000     -0.1216    -0.0776      -0.0946
## X3lkr_badan       0.1339     -0.1216      1.0000    -0.0110       0.0735
## X4pjg_ekor        0.0438     -0.0776     -0.0110     1.0000       0.0808
## X5pjg_tanduk     -0.0882     -0.0946      0.0735     0.0808       1.0000
## Ybbt_badan        0.8300      0.2266      0.5436     0.0194      -0.0576
##              Ybbt_badan
## X1pjg_badan      0.8300
## X2ttg_badan      0.2266
## X3lkr_badan      0.5436
## X4pjg_ekor       0.0194
## X5pjg_tanduk    -0.0576
## Ybbt_badan       1.0000
# Packages used for reshaping the matrix and drawing the heatmap.
library(reshape2)
library(ggplot2)

# Reshape the correlation matrix to long format: one row per pair of
# variables (Var1, Var2) with the correlation stored in `value`.
matriks.korelasi <- melt(matriks.korelasi)

# Heatmap: one tile per variable pair, coloured from coral (low) to sky blue
# (high), with the correlation printed on each tile to two decimals.
ggplot(
  data = matriks.korelasi,
  mapping = aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  geom_text(aes(label = sprintf("%.2f", value)), vjust = 1) +
  scale_fill_gradient(low = "coral", high = "skyblue") +
  labs(title = "Correlation Heatmap", x = "Fitur", y = "Target") +
  theme_minimal()
4.4 Predictive Power Score
Program 4.4
# Simulate X and Y with a quadratic relationship plus Gaussian noise.
set.seed(123)
X <- runif(100, -15, 15)
Y <- X^2 + rnorm(100, 0, 25)
data <- data.frame(X, Y)

# Regression tree fitted with the "rpart" library.
library(rpart)
model.pohon <- rpart(Y ~ X, data = data, method = "anova")

# Predictions of the regression tree on the same data it was fitted on.
prediksi <- predict(model.pohon, newdata = data)

# MAE of the regression tree.
mae_pohon <- mean(abs(data$Y - prediksi))

# MAE of the naive model that always predicts the median of Y.
mae_naive <- mean(abs(data$Y - median(data$Y)))

# Predictive power score: share of the naive MAE removed by the tree.
ppskor <- 1 - (mae_pohon / mae_naive)

# Scatter plot of Y against X.
plot(
  X, Y,
  type = "p",
  col = "blue",
  xlab = "x",
  ylab = "y",
  main = "PPS model Pohon Regresi"
)

# Overlay the trend line y = x^2 on a grid from -15 to 15 in steps of 0.1.
# `lty` is spelled out in full; the abbreviated `lt` only worked through
# partial argument matching.
lines((-150:150) / 10, ((-150:150) / 10)^2, col = "black", lty = 1)

# Annotate the plot with the PPS(x, y) value.
text(0, 150, sprintf("PPS (x,y) = %.2f", ppskor), col = "blue")
Program 4.5
# Reload the cattle data and drop the first column of the CSV, as the earlier
# programs do, so that later whole-data-frame PPS calls only see the five
# measurements and the target.
# NOTE(review): presumably the first column is a row identifier -- confirm
# against the CSV file.
data <- read.csv("data sapi.csv")
data <- data[, -1]
library(ppsr)

# Predictive power score of each explanatory variable for the target
# Ybbt_badan, printed one value per line in column order.
for (prediktor in c("X1pjg_badan", "X2ttg_badan", "X3lkr_badan",
                    "X4pjg_ekor", "X5pjg_tanduk")) {
  print(score(df = data, x = prediktor, y = "Ybbt_badan")$pps)
}
Output 4.1
## [1] 0.4355636
## [1] 0.01688277
## [1] 0.07520535
## [1] 0
## [1] 0.06049186
Gambar 4.5
# Plot the PPS of the columns of `data` as predictors of Ybbt_badan, leaving
# the target itself out of the chart.
visualize_pps(
  df = data,
  y = "Ybbt_badan",
  include_target = FALSE
)
Gambar 4.6
# Same PPS chart for the target Ybbt_badan, but computed with the "glm"
# algorithm instead of the package default.
visualize_pps(
  df = data,
  y = "Ybbt_badan",
  include_target = FALSE,
  algorithm = "glm"
)