library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rpart)
library(rpart.plot)
load("spam_data.RData")
head(spam_data)
## # A tibble: 6 x 59
## train_test spam_or_not V1 V2 V3 V4 V5 V6 V7 V8
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 0 0.64 0.64 0 0.32 0 0 0
## 2 0 1 0.21 0.28 0.5 0 0.14 0.28 0.21 0.07
## 3 0 1 0.06 0 0.71 0 1.23 0.19 0.19 0.12
## 4 0 1 0 0 0 0 0.63 0 0.31 0.63
## 5 0 1 0 0 0 0 0.63 0 0.31 0.63
## 6 0 1 0 0 0 0 1.85 0 0 1.85
## # ... with 49 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>, V12 <dbl>,
## # V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>,
## # V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>,
## # V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>,
## # V31 <dbl>, V32 <dbl>, V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>,
## # V37 <dbl>, V38 <dbl>, V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>,
## # V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## # V49 <dbl>, V50 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>,
## # V55 <dbl>, V56 <int>, V57 <int>
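Rows with train_test == 0 are used as the training set below and rows with train_test == 1 as the test set, with spam_or_not as the 0/1 response and V1-V57 as the predictors. A minimal sketch to check the split sizes (output not shown):
# Tally how many rows fall into the training (0) and test (1) partitions
spam_data %>% count(train_test)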
A meaningful classification and regression tree model was built with rpart on the training data.
spam_train <- spam_data %>% filter(train_test == 0) %>% select(-train_test)
spam_model <- rpart(spam_or_not ~ ., data = spam_train)
rpart.plot(spam_model)
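Because spam_or_not is stored as a number, rpart defaults to a regression (anova) tree here, so the fitted values are spam scores between 0 and 1 rather than class labels. As a minimal alternative sketch (not part of the analysis above), the response could be converted to a factor to fit an explicit classification tree, and the cross-validated complexity table could guide pruning; the cp value below is only an illustrative placeholder.
# Sketch: fit the same tree as an explicit classification tree
spam_train_class <- spam_train %>% mutate(spam_or_not = factor(spam_or_not))
spam_model_class <- rpart(spam_or_not ~ ., data = spam_train_class, method = "class")
rpart.plot(spam_model_class)
# Inspect the cross-validated complexity table and prune at a chosen cp
printcp(spam_model_class)
spam_model_pruned <- prune(spam_model_class, cp = 0.01)  # cp = 0.01 is illustrative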
Next, the model's predictions were evaluated. First, in-sample predictions on the training data were checked; if those look reasonable, the model is then applied to the held-out test observations (out-of-sample analysis).
spam_in_sample <- predict(spam_model)
head(spam_in_sample)
## 1 2 3 4 5 6
## 0.86772487 0.94753747 0.94753747 0.90812721 0.90812721 0.05763952
in_sample_prediction <-
  cbind(
    # Threshold the fitted spam scores at 0.5 to get a 0/1 prediction
    spam_in_sample %>% tbl_df %>%
      transmute(spam_predict = ifelse(spam_in_sample >= 0.5, 1, 0)),
    # Actual 0/1 labels from the training data
    spam_train %>% tbl_df %>%
      transmute(spam_actual = ifelse(spam_or_not == 1, 1, 0))
  ) %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n(), percentage = n() / nrow(.))
in_sample_prediction
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 398 0.0970
## 2 TRUE 3703 0.903
The model classifies the in-sample (training) observations well. Now let's evaluate it on the held-out test set:
spam_test <- spam_data %>% filter(train_test == 1) %>% select(-train_test)
spam_predict <- predict(spam_model,newdata=spam_test)
head(spam_predict)
## 1 2 3 4 5 6
## 0.9475375 0.9081272 0.4009662 0.9475375 0.3214286 0.9475375
out_of_sample_prediction <-
  cbind(
    # Threshold the predicted spam scores at 0.5 to get a 0/1 prediction
    spam_predict %>% tbl_df %>%
      transmute(spam_predict = ifelse(spam_predict >= 0.5, 1, 0)),
    # Actual 0/1 labels from the test data
    spam_test %>% tbl_df %>%
      transmute(spam_actual = ifelse(spam_or_not == 1, 1, 0))
  ) %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  summarise(count = n(), percentage = n() / nrow(.))
out_of_sample_prediction
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 50 0.1
## 2 TRUE 450 0.9
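As a quick cross-check, the same out-of-sample accuracy can be read from a confusion matrix; the sketch below reuses spam_predict and spam_test from above and only introduces a helper vector test_pred_class for clarity.
# Confusion matrix of thresholded predictions vs. actual labels on the test set
test_pred_class <- ifelse(spam_predict >= 0.5, 1, 0)
table(predicted = test_pred_class, actual = spam_test$spam_or_not)
# Overall out-of-sample accuracy
mean(test_pred_class == spam_test$spam_or_not)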
So the tree predicts whether a mail is spam or not with roughly 90% accuracy, both in-sample and on the held-out test data.
Statistical Models in R, Part 2: https://boun-etm58d.github.io/files/intro_to_ml_2.html