The aim of this study is to create a model that detects spam emails by looking at specific words.

First we load the tidyverse, rpart, rpart.plot and rattle packages.

library(tidyverse)
library(rpart)
library(rpart.plot)
library(rattle)

Then we load the spam data and split off the training set (the test set is created further below). We also fit a decision tree to the training data and take a look at it using rattle.

# Restore the objects stored in spam_data.RData into the workspace (the
# following line expects it to provide `spam_data`), then preview the first
# six rows.
load(file = "spam_data.RData")
head(spam_data, n = 6)
## # A tibble: 6 x 59
##   train_test spam_or_not     V1    V2    V3    V4    V5    V6    V7     V8
##        <dbl>       <int>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1          0           1 0      0.640 0.640     0 0.320 0     0     0     
## 2          0           1 0.210  0.280 0.500     0 0.140 0.280 0.210 0.0700
## 3          0           1 0.0600 0     0.710     0 1.23  0.190 0.190 0.120 
## 4          0           1 0      0     0         0 0.630 0     0.310 0.630 
## 5          0           1 0      0     0         0 0.630 0     0.310 0.630 
## 6          0           1 0      0     0         0 1.85  0     0     1.85  
## # ... with 49 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>, V12 <dbl>,
## #   V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>,
## #   V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>,
## #   V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>,
## #   V31 <dbl>, V32 <dbl>, V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>,
## #   V37 <dbl>, V38 <dbl>, V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>,
## #   V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## #   V49 <dbl>, V50 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>,
## #   V55 <dbl>, V56 <int>, V57 <int>
# Training rows are the ones flagged train_test == 0; the flag itself is
# dropped so it cannot end up as a predictor in the formula below.
spam_train <- spam_data %>%
  filter(train_test == 0) %>%
  select(-train_test)

# Fit a tree on every remaining column. spam_or_not is stored as an integer
# (not a factor), so rpart falls back to its regression ("anova") method and
# each leaf predicts the fraction of spam among its training rows.
spam_model <- rpart(formula = spam_or_not ~ ., data = spam_train)

# Draw the fitted tree.
fancyRpartPlot(spam_model)

After splitting the data and fitting the model, we make an in-sample prediction and check how accurate it is.

# Without `newdata`, predict() returns the fitted values for the rows the
# tree was trained on, i.e. the in-sample predictions.
spam_in_sample <- predict(object = spam_model)

# Show the first few fitted values.
print(head(spam_in_sample))
##          1          2          3          4          5          6 
## 0.86772487 0.94753747 0.94753747 0.90812721 0.90812721 0.05763952
# In-sample accuracy table.
#
# The fitted values are thresholded at 0.5 to obtain a 0/1 spam label, which
# is compared with the true label of each training row. The result has one
# row per outcome (correct / incorrect) with the number of rows (`count`) and
# their share of all training rows (`percentage`).
#
# The two label columns are built directly with tibble() rather than via the
# deprecated tbl_df() + cbind(): this makes it explicit that the predictions
# come from the `spam_in_sample` vector, and tibble() refuses to recycle
# vectors of mismatched length.
in_sample_prediction <-
  tibble(
    spam_predict = ifelse(unname(spam_in_sample) >= 0.5, 1, 0),
    spam_actual = ifelse(spam_train$spam_or_not == 1, 1, 0)
  ) %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  # Share of all rows; sum(count) is the total number of training rows.
  mutate(percentage = count / sum(count))

print(in_sample_prediction)
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 F               398     0.0970
## 2 T              3703     0.903
# Hold-out rows are the ones flagged train_test == 1; drop the flag column and
# score them with the tree fitted on the training rows.
# NOTE(review): the same split and prediction are computed again in the
# out-of-sample section further down -- this copy looks redundant; confirm
# before removing.
spam_test <- spam_data %>%
  filter(train_test == 1) %>%
  select(-train_test)
spam_predict <- predict(object = spam_model, newdata = spam_test)
print(head(spam_predict))
##         1         2         3         4         5         6 
## 0.9475375 0.9081272 0.4009662 0.9475375 0.3214286 0.9475375

As we can see, about 90% of the training emails are classified correctly in sample, which suggests that our model is accurate on the training data.

Lastly, we make an out-of-sample prediction using our test data and check its accuracy as well.

# Build the test set: rows with train_test == 1, minus the split flag.
spam_test <- spam_data %>%
  filter(train_test == 1) %>%
  select(-train_test)

# Score the unseen rows with the tree fitted on the training set and preview
# the first few predictions.
spam_predict <- predict(object = spam_model, newdata = spam_test)
print(head(spam_predict))
##         1         2         3         4         5         6 
## 0.9475375 0.9081272 0.4009662 0.9475375 0.3214286 0.9475375
# Out-of-sample accuracy table.
#
# The test-set predictions are thresholded at 0.5 to obtain a 0/1 spam label,
# which is compared with the true label of each test row. The result has one
# row per outcome (correct / incorrect) with the number of rows (`count`) and
# their share of all test rows (`percentage`).
#
# The two label columns are built directly with tibble() rather than via the
# deprecated tbl_df() + cbind(): this makes it explicit that the predictions
# come from the `spam_predict` vector, and tibble() refuses to recycle
# vectors of mismatched length.
out_of_sample_prediction <-
  tibble(
    spam_predict = ifelse(unname(spam_predict) >= 0.5, 1, 0),
    spam_actual = ifelse(spam_test$spam_or_not == 1, 1, 0)
  ) %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  # Share of all rows; sum(count) is the total number of test rows.
  mutate(percentage = count / sum(count))

print(out_of_sample_prediction)
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 F                50      0.100
## 2 T               450      0.900