Assignment 3: Spam Data

We begin by loading necessary packages and the dataset:

# Load the tidyverse metapackage (dplyr, ggplot2, tidyr, ...) for data wrangling.
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.3.0
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# rpart fits CART (classification and regression tree) models;
# rpart.plot draws the fitted trees.
library(rpart)
library(rpart.plot)
# Loads the `spam_data` tibble into the workspace.
# NOTE(review): absolute, user-specific path — consider a relative path
# (or the here package) so the analysis is reproducible on other machines.
load("~/Desktop/ETM 2018 Spring/ETM 58D/Assignments/data/spam_data.RData")
# Quick look at the structure: a train/test flag, the 0/1 spam label,
# and 57 word/character-frequency predictors V1..V57.
head(spam_data)
## # A tibble: 6 x 59
##   train_test spam_or_not     V1    V2    V3    V4    V5    V6    V7     V8
##        <dbl>       <int>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1         0.           1 0.     0.640 0.640    0. 0.320 0.    0.    0.    
## 2         0.           1 0.210  0.280 0.500    0. 0.140 0.280 0.210 0.0700
## 3         0.           1 0.0600 0.    0.710    0. 1.23  0.190 0.190 0.120 
## 4         0.           1 0.     0.    0.       0. 0.630 0.    0.310 0.630 
## 5         0.           1 0.     0.    0.       0. 0.630 0.    0.310 0.630 
## 6         0.           1 0.     0.    0.       0. 1.85  0.    0.    1.85  
## # ... with 49 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>, V12 <dbl>,
## #   V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>,
## #   V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>,
## #   V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>,
## #   V31 <dbl>, V32 <dbl>, V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>,
## #   V37 <dbl>, V38 <dbl>, V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>,
## #   V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## #   V49 <dbl>, V50 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>,
## #   V55 <dbl>, V56 <int>, V57 <int>
## # A tibble: 6 x 59
##   train_test spam_or_not     V1    V2    V3    V4    V5    V6    V7     V8
##        <dbl>       <int>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1         0.           1 0.     0.640 0.640    0. 0.320 0.    0.    0.    
## 2         0.           1 0.210  0.280 0.500    0. 0.140 0.280 0.210 0.0700
## 3         0.           1 0.0600 0.    0.710    0. 1.23  0.190 0.190 0.120 
## 4         0.           1 0.     0.    0.       0. 0.630 0.    0.310 0.630 
## 5         0.           1 0.     0.    0.       0. 0.630 0.    0.310 0.630 
## 6         0.           1 0.     0.    0.       0. 1.85  0.    0.    1.85  
## # ... with 49 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>, V12 <dbl>,
## #   V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>,
## #   V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>,
## #   V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>,
## #   V31 <dbl>, V32 <dbl>, V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>,
## #   V37 <dbl>, V38 <dbl>, V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>,
## #   V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## #   V49 <dbl>, V50 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>,
## #   V55 <dbl>, V56 <int>, V57 <int>

We are asked to build a CART model that identifies whether a mail is spam or not from the given dataset. In class, we performed the same analysis on the Titanic data, so we can use a similar model here:

# Fit a CART model on the rows flagged as training data (train_test == 0),
# using every remaining column as a predictor, then draw the fitted tree.
spam_train <- select(filter(spam_data, train_test == 0), -train_test)
spam_model <- rpart(spam_or_not ~ ., data = spam_train)
rpart.plot(spam_model)

We can comment on this tree as follows:

We can also choose which columns to focus on:

# Build a reduced dataset from three hand-picked predictors and add a fresh
# random ~25%/75% test/train split. The seed makes the split reproducible.
set.seed(39) # Set the random seed
spam2 <-
  spam_data %>%
  select(V16, V20, V24, spam_or_not) %>% # Select only the required columns
  filter(complete.cases(.)) %>% # Remove all rows containing NA
  # Split the data into train and test (each row ~25% likely to be "test")
  mutate(train_test = ifelse(runif(nrow(.)) < 0.25, "test", "train")) %>%
  as_tibble() # tbl_df() is deprecated in dplyr; as_tibble() is its drop-in replacement
print(spam2)
## # A tibble: 4,601 x 5
##       V16    V20    V24 spam_or_not train_test
##     <dbl>  <dbl>  <dbl>       <int> <chr>     
##  1 0.320  0.     0.               1 train     
##  2 0.140  0.     0.430            1 test      
##  3 0.0600 0.320  0.0600           1 test      
##  4 0.310  0.     0.               1 test      
##  5 0.310  0.     0.               1 train     
##  6 0.     0.     0.               1 train     
##  7 0.960  0.     0.               1 train     
##  8 0.     0.     0.               1 train     
##  9 0.     0.0600 0.               1 train     
## 10 0.     0.     0.               1 train     
## # ... with 4,591 more rows
# Fit the CART model on the training portion of the reduced dataset
# and draw the resulting tree.
spam2_train <- select(filter(spam2, train_test == "train"), -train_test)
spam2_model <- rpart(spam_or_not ~ ., data = spam2_train)
rpart.plot(spam2_model)

Here, I have chosen 3 keywords that are highly likely to appear in spam mails:

Now, when we look at the results, we see that the majority of the data (67%) has a low frequency of these words, and that same portion has a lower spam percentage (18%).

On the other hand, when we look at the data with a high probability of containing these words, we observe that it makes up 9% of the overall data and has a very high spam percentage of 97%.

I also add here the summary of my findings in this second example:

summary(spam2_model)
## Call:
## rpart(formula = spam_or_not ~ ., data = spam2_train)
##   n= 3381 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.23425540      0 1.0000000 1.0006582 0.007865354
## 2 0.11660836      1 0.7657446 0.7663865 0.016064174
## 3 0.02724381      2 0.6491362 0.6509947 0.017572466
## 4 0.01723728      3 0.6218924 0.6272518 0.017816018
## 5 0.01071989      4 0.6046551 0.6107841 0.017149485
## 6 0.01000000      5 0.5939352 0.6065485 0.017340960
## 
## Variable importance
## V16 V24 V20 
##  54  36  10 
## 
## Node number 1: 3381 observations,    complexity param=0.2342554
##   mean=0.3886424, MSE=0.2375995 
##   left son=2 (2526 obs) right son=3 (855 obs)
##   Primary splits:
##       V16 < 0.095 to the left,  improve=0.2342554, (0 missing)
##       V24 < 0.035 to the left,  improve=0.2203433, (0 missing)
##       V20 < 0.075 to the left,  improve=0.1094768, (0 missing)
##   Surrogate splits:
##       V24 < 0.095 to the left,  agree=0.778, adj=0.122, (0 split)
##       V20 < 0.11  to the left,  agree=0.758, adj=0.043, (0 split)
## 
## Node number 2: 2526 observations,    complexity param=0.1166084
##   mean=0.2513856, MSE=0.1881909 
##   left son=4 (2305 obs) right son=5 (221 obs)
##   Primary splits:
##       V24 < 0.035 to the left,  improve=0.197055400, (0 missing)
##       V20 < 0.045 to the left,  improve=0.111308600, (0 missing)
##       V16 < 0.04  to the left,  improve=0.002114324, (0 missing)
##   Surrogate splits:
##       V20 < 0.075 to the left,  agree=0.917, adj=0.05, (0 split)
## 
## Node number 3: 855 observations,    complexity param=0.01723728
##   mean=0.794152, MSE=0.1634746 
##   left son=6 (560 obs) right son=7 (295 obs)
##   Primary splits:
##       V24 < 0.07  to the left,  improve=0.09907024, (0 missing)
##       V20 < 0.08  to the left,  improve=0.03825054, (0 missing)
##       V16 < 0.295 to the left,  improve=0.01967431, (0 missing)
##   Surrogate splits:
##       V20 < 0.08  to the left,  agree=0.713, adj=0.169, (0 split)
##       V16 < 0.105 to the right, agree=0.674, adj=0.054, (0 split)
## 
## Node number 4: 2305 observations,    complexity param=0.02724381
##   mean=0.191757, MSE=0.1549863 
##   left son=8 (2265 obs) right son=9 (40 obs)
##   Primary splits:
##       V20 < 0.505 to the left,  improve=0.061262440, (0 missing)
##       V16 < 0.055 to the right, improve=0.001554051, (0 missing)
## 
## Node number 5: 221 observations
##   mean=0.8733032, MSE=0.1106447 
## 
## Node number 6: 560 observations,    complexity param=0.01071989
##   mean=0.7017857, MSE=0.2092825 
##   left son=12 (109 obs) right son=13 (451 obs)
##   Primary splits:
##       V16 < 0.305 to the left,  improve=0.073478480, (0 missing)
##       V20 < 0.225 to the left,  improve=0.017700070, (0 missing)
##       V24 < 0.025 to the right, improve=0.001027802, (0 missing)
##   Surrogate splits:
##       V24 < 0.025 to the right, agree=0.814, adj=0.046, (0 split)
##       V20 < 0.025 to the right, agree=0.807, adj=0.009, (0 split)
## 
## Node number 7: 295 observations
##   mean=0.9694915, MSE=0.02957771 
## 
## Node number 8: 2265 observations
##   mean=0.1788079, MSE=0.1468357 
## 
## Node number 9: 40 observations
##   mean=0.925, MSE=0.069375 
## 
## Node number 12: 109 observations
##   mean=0.4495413, MSE=0.2474539 
## 
## Node number 13: 451 observations
##   mean=0.7627494, MSE=0.1809627

One final analysis regarding CART is the out-of-sample analysis.

First we begin with in-sample analysis of “spam_model” to see a prediction about the data:

# In-sample check: predict() with no newdata scores the data the model was
# trained on. rpart was fit with its default method on a numeric 0/1 target,
# so the predictions are numeric spam probabilities, not class labels.
spam_in_sample <- predict(spam_model)
print(head(spam_in_sample))
##          1          2          3          4          5          6 
## 0.86772487 0.94753747 0.94753747 0.90812721 0.90812721 0.05763952
# Threshold the probabilities at 0.5, bind them column-wise to the true
# labels, and tabulate the share of correct classifications.
# NOTE(review): tbl_df() is deprecated in dplyr; as_tibble() is the
# recommended replacement.
in_sample_prediction <-
cbind(
spam_in_sample %>% tbl_df %>%
transmute(spam_data_predict = ifelse(spam_in_sample >= 0.5,1,0)),
spam_train %>% tbl_df %>%
transmute(spam_data_actual = ifelse(spam_or_not == 1, 1,0))
) %>%
# TRUE when the thresholded prediction matches the actual label.
mutate(correct_class = (spam_data_predict == spam_data_actual)) %>%
group_by(correct_class) %>%
# nrow(.) is the full (ungrouped) row count, so percentage sums to 1.
summarise(count=n(),percentage=n()/nrow(.))
print(in_sample_prediction)
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           398     0.0970
## 2 TRUE           3703     0.903

This result means that our “predict” function correctly tells whether a mail is spam or not in 90% of all cases. This is a good prediction. Now, let’s also try out-of-sample prediction:

# Out-of-sample check: score the held-out rows (train_test == 1) with the
# model that was fitted on the training rows only.
spam_data_test <- spam_data %>% filter(train_test==1) %>% select(-train_test)
spam_data_predict <- predict(spam_model,newdata=spam_data_test)
print(head(spam_data_predict))
##         1         2         3         4         5         6 
## 0.9475375 0.9081272 0.4009662 0.9475375 0.3214286 0.9475375
# Same 0.5 threshold and accuracy tabulation as the in-sample analysis above.
# NOTE(review): tbl_df() is deprecated in dplyr; as_tibble() is the
# recommended replacement.
out_of_sample_prediction <-
cbind(
spam_data_predict %>% tbl_df %>%
transmute(spam_data_predict = ifelse(spam_data_predict >= 0.5,1,0)),
spam_data_test %>% tbl_df %>%
transmute(spam_data_actual = ifelse(spam_or_not == 1,1,0))
) %>%
# TRUE when the thresholded prediction matches the actual label.
mutate(correct_class = (spam_data_predict == spam_data_actual)) %>%
group_by(correct_class) %>%
# nrow(.) is the full (ungrouped) test row count, so percentage sums to 1.
summarise(count=n(),percentage=n()/nrow(.))
print(out_of_sample_prediction)
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE            50      0.100
## 2 TRUE            450      0.900

It can be seen from this result that out-of-sample prediction also works well, enabling us to predict with 90% accuracy whether a mail is spam or not.