# Attach the packages the rest of this script calls into:
#   dplyr  - %>%, filter(), select(), mutate(), group_by(), summarise()
#   rpart  - rpart() tree fitting and its predict() method
#   rattle - fancyRpartPlot()
library(dplyr)
library(rpart)
library(rattle)

# Load the saved workspace (expected to provide `spam_data`) and preview the
# first rows.
# NOTE(review): the path is machine-specific; adjust it when running elsewhere.
load("D:/Users/SUUSER/Desktop/spam_data.RData")
head(spam_data)
## # A tibble: 6 x 59
## train_test spam_or_not V1 V2 V3 V4 V5 V6 V7 V8
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 0 0.640 0.640 0 0.320 0 0 0
## 2 0 1 0.210 0.280 0.500 0 0.140 0.280 0.210 0.0700
## 3 0 1 0.0600 0 0.710 0 1.23 0.190 0.190 0.120
## 4 0 1 0 0 0 0 0.630 0 0.310 0.630
## 5 0 1 0 0 0 0 0.630 0 0.310 0.630
## 6 0 1 0 0 0 0 1.85 0 0 1.85
## # ... with 49 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>, V12 <dbl>,
## # V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>,
## # V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>,
## # V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>,
## # V31 <dbl>, V32 <dbl>, V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>,
## # V37 <dbl>, V38 <dbl>, V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>,
## # V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## # V49 <dbl>, V50 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>,
## # V55 <dbl>, V56 <int>, V57 <int>
Firstly, I created a classification tree model with rpart for visualization purposes.
# Keep only the rows with train_test == 0 and drop the flag column so it is
# not picked up as a predictor by the `~ .` formula below.
spam_train <- select(filter(spam_data, train_test == 0), -train_test)

# Fit a tree of spam_or_not on every remaining column, then draw it.
# NOTE(review): spam_or_not is printed as <int> above, so rpart presumably
# treats this as a numeric response; confirm that is intended.
spam_model <- rpart(spam_or_not ~ ., data = spam_train)
fancyRpartPlot(spam_model)
Then, I computed the error rate on the training data.
# Predictions of the fitted tree with no new data supplied, i.e. for the rows
# the model was fitted on; preview the first few.
spam_in_sample <- spam_model %>% predict()
spam_in_sample %>% head()
## 1 2 3 4 5 6
## 0.86772487 0.94753747 0.94753747 0.90812721 0.90812721 0.05763952
My prediction model is evaluated as above, and it works well.
# In-sample accuracy table: one row per outcome (correct / incorrect) with the
# number of training rows and their share of the total.
#
# The predicted label is 1 when the model output is >= 0.5, otherwise 0; the
# actual label is 1 when spam_or_not == 1, otherwise 0.
# The columns are built directly in tibble() rather than via the deprecated
# tbl_df() + cbind() route; as.numeric() also drops the names carried by the
# prediction vector.
in_sample_prediction <-
  tibble(
    spam_predict = as.numeric(spam_in_sample >= 0.5),
    spam_actual = as.numeric(spam_train$spam_or_not == 1)
  ) %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  # Share of all rows; the counts sum to the total number of rows.
  mutate(percentage = count / sum(count))
in_sample_prediction
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 F 398 0.0970
## 2 T 3703 0.903
# Rows with train_test == 1, again without the flag column.
spam_test <- select(filter(spam_data, train_test == 1), -train_test)

# Score those rows with the already-fitted tree and preview the result.
spam_predict <- predict(spam_model, newdata = spam_test)
spam_predict %>% head()
## 1 2 3 4 5 6
## 0.9475375 0.9081272 0.4009662 0.9475375 0.3214286 0.9475375
Let’s look at the out-of-sample results:
# Out-of-sample accuracy table: one row per outcome (correct / incorrect) with
# the number of test rows and their share of the total.
#
# The predicted label is 1 when the model output is >= 0.5, otherwise 0; the
# actual label is 1 when spam_or_not == 1, otherwise 0.
# The columns are built directly in tibble() rather than via the deprecated
# tbl_df() + cbind() route. On the right-hand side, `spam_predict` is the
# prediction vector from the enclosing environment, since the column of the
# same name does not exist yet at that point.
out_of_sample_prediction <-
  tibble(
    spam_predict = as.numeric(spam_predict >= 0.5),
    spam_actual = as.numeric(spam_test$spam_or_not == 1)
  ) %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  # Share of all rows; the counts sum to the total number of rows.
  mutate(percentage = count / sum(count))
print(out_of_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 F 50 0.100
## 2 T 450 0.900
Therefore, since the in-sample and out-of-sample results are consistent, I can predict whether an email is spam or not with approximately 90% accuracy.
#Reference:
Statistical Models in R: Part 2 : https://boun-etm58d.github.io/files/intro_to_ml_2.html