The aim of this study is to create a model that detects spam emails by looking at specific words.
First, we load the tidyverse, rpart, rpart.plot and rattle packages.
library(tidyverse)
library(rpart)
library(rpart.plot)
library(rattle)
Then we load the spam data and create two different sets, train and test respectively. We also take a look at the data, fit a decision tree to the training set and plot it using rattle.
# Restore the objects saved in spam_data.RData into the workspace, then
# preview the first rows of spam_data.
load(file = "spam_data.RData")
spam_data %>% head()
## # A tibble: 6 x 59
## train_test spam_or_not V1 V2 V3 V4 V5 V6 V7 V8
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 0 0.640 0.640 0 0.320 0 0 0
## 2 0 1 0.210 0.280 0.500 0 0.140 0.280 0.210 0.0700
## 3 0 1 0.0600 0 0.710 0 1.23 0.190 0.190 0.120
## 4 0 1 0 0 0 0 0.630 0 0.310 0.630
## 5 0 1 0 0 0 0 0.630 0 0.310 0.630
## 6 0 1 0 0 0 0 1.85 0 0 1.85
## # ... with 49 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>, V12 <dbl>,
## # V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>,
## # V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>,
## # V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>,
## # V31 <dbl>, V32 <dbl>, V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>,
## # V37 <dbl>, V38 <dbl>, V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>,
## # V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## # V49 <dbl>, V50 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>,
## # V55 <dbl>, V56 <int>, V57 <int>
# Training partition: rows flagged train_test == 0. The flag column is dropped
# so that it is not offered to the tree as a predictor.
spam_train <- spam_data %>%
  filter(train_test == 0) %>%
  select(-train_test)

# Fit a tree of spam_or_not on every remaining column. spam_or_not is stored as
# an integer, so the predictions from this model are numeric scores rather than
# class labels (they are thresholded later).
spam_model <- rpart(spam_or_not ~ ., data = spam_train)

# Draw the fitted tree.
fancyRpartPlot(spam_model)
After splitting the data, we build a prediction model and make an in-sample prediction. We then check how accurate our prediction is.
# With no newdata supplied, predict() returns the fitted scores for the rows
# the model was trained on.
spam_in_sample <- predict(spam_model)

# Show the first few scores.
spam_in_sample %>%
  head() %>%
  print()
## 1 2 3 4 5 6
## 0.86772487 0.94753747 0.94753747 0.90812721 0.90812721 0.05763952
# Score the in-sample predictions.
#
# A message is labelled spam when its predicted score is at least 0.5. The
# label is compared with the actual spam_or_not value of the same training row,
# and the rows are tabulated by whether the classification was correct
# (count and share of all rows).
#
# The two vectors are placed side by side with tibble() instead of the
# deprecated tbl_df() + cbind() construction, so the prediction vector is an
# explicit column rather than something transmute() has to find in the
# enclosing environment.
in_sample_prediction <-
  tibble(
    predicted = ifelse(unname(spam_in_sample) >= 0.5, 1, 0),
    actual = ifelse(spam_train$spam_or_not == 1, 1, 0)
  ) %>%
  mutate(correct_class = (predicted == actual)) %>%
  group_by(correct_class) %>%
  # `.` is the whole piped table, so nrow(.) is the total row count and
  # percentage is each group's share of all rows.
  summarise(count = n(), percentage = n() / nrow(.))
print(in_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 F 398 0.0970
## 2 T 3703 0.903
# Hold-out partition: rows flagged train_test == 1, with the flag column
# removed.
spam_test <- spam_data %>%
  filter(train_test == 1) %>%
  select(-train_test)

# Score the hold-out rows with the tree fitted on the training partition.
spam_predict <- predict(spam_model, newdata = spam_test)

# Show the first few scores.
spam_predict %>%
  head() %>%
  print()
## 1 2 3 4 5 6
## 0.9475375 0.9081272 0.4009662 0.9475375 0.3214286 0.9475375
As we can see, our in-sample prediction rate is very high (about 90%), which indicates that our model is accurate.
Lastly, we make an out-of-sample prediction using our test data and also check its accuracy.
# Build the test partition (train_test == 1) and drop the partition flag.
spam_test <- spam_data %>%
  filter(train_test == 1) %>%
  select(-train_test)

# Apply the fitted tree to the unseen test rows.
spam_predict <- predict(spam_model, newdata = spam_test)

# Preview the first few predicted scores.
spam_predict %>%
  head() %>%
  print()
## 1 2 3 4 5 6
## 0.9475375 0.9081272 0.4009662 0.9475375 0.3214286 0.9475375
# Score the out-of-sample predictions.
#
# A message is labelled spam when its predicted score is at least 0.5. The
# label is compared with the actual spam_or_not value of the same test row, and
# the rows are tabulated by whether the classification was correct (count and
# share of all rows).
#
# The two vectors are placed side by side with tibble() instead of the
# deprecated tbl_df() + cbind() construction. The intermediate columns are
# named predicted/actual so that no column shadows the spam_predict vector it
# is computed from.
out_of_sample_prediction <-
  tibble(
    predicted = ifelse(unname(spam_predict) >= 0.5, 1, 0),
    actual = ifelse(spam_test$spam_or_not == 1, 1, 0)
  ) %>%
  mutate(correct_class = (predicted == actual)) %>%
  group_by(correct_class) %>%
  # `.` is the whole piped table, so nrow(.) is the total row count and
  # percentage is each group's share of all rows.
  summarise(count = n(), percentage = n() / nrow(.))
print(out_of_sample_prediction)
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 F 50 0.100
## 2 T 450 0.900