esoph
## agegp alcgp tobgp ncases ncontrols
## 1 25-34 0-39g/day 0-9g/day 0 40
## 2 25-34 0-39g/day 10-19 0 10
## 3 25-34 0-39g/day 20-29 0 6
## 4 25-34 0-39g/day 30+ 0 5
## 5 25-34 40-79 0-9g/day 0 27
## 6 25-34 40-79 10-19 0 7
## 7 25-34 40-79 20-29 0 4
## 8 25-34 40-79 30+ 0 7
## 9 25-34 80-119 0-9g/day 0 2
## 10 25-34 80-119 10-19 0 1
## 11 25-34 80-119 30+ 0 2
## 12 25-34 120+ 0-9g/day 0 1
## 13 25-34 120+ 10-19 1 1
## 14 25-34 120+ 20-29 0 1
## 15 25-34 120+ 30+ 0 2
## 16 35-44 0-39g/day 0-9g/day 0 60
## 17 35-44 0-39g/day 10-19 1 14
## 18 35-44 0-39g/day 20-29 0 7
## 19 35-44 0-39g/day 30+ 0 8
## 20 35-44 40-79 0-9g/day 0 35
## 21 35-44 40-79 10-19 3 23
## 22 35-44 40-79 20-29 1 14
## 23 35-44 40-79 30+ 0 8
## 24 35-44 80-119 0-9g/day 0 11
## 25 35-44 80-119 10-19 0 6
## 26 35-44 80-119 20-29 0 2
## 27 35-44 80-119 30+ 0 1
## 28 35-44 120+ 0-9g/day 2 3
## 29 35-44 120+ 10-19 0 3
## 30 35-44 120+ 20-29 2 4
## 31 45-54 0-39g/day 0-9g/day 1 46
## 32 45-54 0-39g/day 10-19 0 18
## 33 45-54 0-39g/day 20-29 0 10
## 34 45-54 0-39g/day 30+ 0 4
## 35 45-54 40-79 0-9g/day 6 38
## 36 45-54 40-79 10-19 4 21
## 37 45-54 40-79 20-29 5 15
## 38 45-54 40-79 30+ 5 7
## 39 45-54 80-119 0-9g/day 3 16
## 40 45-54 80-119 10-19 6 14
## 41 45-54 80-119 20-29 1 5
## 42 45-54 80-119 30+ 2 4
## 43 45-54 120+ 0-9g/day 4 4
## 44 45-54 120+ 10-19 3 4
## 45 45-54 120+ 20-29 2 3
## 46 45-54 120+ 30+ 4 4
## 47 55-64 0-39g/day 0-9g/day 2 49
## 48 55-64 0-39g/day 10-19 3 22
## 49 55-64 0-39g/day 20-29 3 12
## 50 55-64 0-39g/day 30+ 4 6
## 51 55-64 40-79 0-9g/day 9 40
## 52 55-64 40-79 10-19 6 21
## 53 55-64 40-79 20-29 4 17
## 54 55-64 40-79 30+ 3 6
## 55 55-64 80-119 0-9g/day 9 18
## 56 55-64 80-119 10-19 8 15
## 57 55-64 80-119 20-29 3 6
## 58 55-64 80-119 30+ 4 4
## 59 55-64 120+ 0-9g/day 5 10
## 60 55-64 120+ 10-19 6 7
## 61 55-64 120+ 20-29 2 3
## 62 55-64 120+ 30+ 5 6
## 63 65-74 0-39g/day 0-9g/day 5 48
## 64 65-74 0-39g/day 10-19 4 14
## 65 65-74 0-39g/day 20-29 2 7
## 66 65-74 0-39g/day 30+ 0 2
## 67 65-74 40-79 0-9g/day 17 34
## 68 65-74 40-79 10-19 3 10
## 69 65-74 40-79 20-29 5 9
## 70 65-74 80-119 0-9g/day 6 13
## 71 65-74 80-119 10-19 4 12
## 72 65-74 80-119 20-29 2 3
## 73 65-74 80-119 30+ 1 1
## 74 65-74 120+ 0-9g/day 3 4
## 75 65-74 120+ 10-19 1 2
## 76 65-74 120+ 20-29 1 1
## 77 65-74 120+ 30+ 1 1
## 78 75+ 0-39g/day 0-9g/day 1 18
## 79 75+ 0-39g/day 10-19 2 6
## 80 75+ 0-39g/day 30+ 1 3
## 81 75+ 40-79 0-9g/day 2 5
## 82 75+ 40-79 10-19 1 3
## 83 75+ 40-79 20-29 0 3
## 84 75+ 40-79 30+ 1 1
## 85 75+ 80-119 0-9g/day 1 1
## 86 75+ 80-119 10-19 1 1
## 87 75+ 120+ 0-9g/day 2 2
## 88 75+ 120+ 10-19 1 1
library(stats) #stats is attached by default in R; loaded explicitly here for completeness
summary(esoph)
## agegp alcgp tobgp ncases ncontrols
## 25-34:15 0-39g/day:23 0-9g/day:24 Min. : 0.000 Min. : 1.00
## 35-44:15 40-79 :23 10-19 :24 1st Qu.: 0.000 1st Qu.: 3.00
## 45-54:16 80-119 :21 20-29 :20 Median : 1.000 Median : 6.00
## 55-64:16 120+ :21 30+ :20 Mean : 2.273 Mean :11.08
## 65-74:15 3rd Qu.: 4.000 3rd Qu.:14.00
## 75+ :11 Max. :17.000 Max. :60.00
#Let's initially check the effects of Alcohol and Tobacco on Cancer separately:
cancer_tobacco_alcohol_sep <- glm(cbind(ncases, ncontrols) ~ agegp + unclass(tobgp) + unclass(alcgp), data = esoph, family = binomial())
anova(cancer_tobacco_alcohol_sep)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: cbind(ncases, ncontrols)
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 87 227.241
## agegp 5 88.128 82 139.112
## unclass(tobgp) 1 17.522 81 121.591
## unclass(alcgp) 1 62.314 80 59.277
The results suggest that age is the strongest predictor of cancer occurrence, followed by alcohol consumption; tobacco consumption shows a comparatively weak association. These conclusions are based on the reduction in residual deviance contributed by each term as it is added to the model.
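To see whether each deviance reduction is statistically meaningful, the same table can be produced with chi-squared tests attached; this is a minimal sketch, not part of the original output:
#Hedged sketch: sequential deviance table with chi-squared p-values
anova(cancer_tobacco_alcohol_sep, test = "Chisq")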
As a second step, we fit an additional model to check the combined (interaction) effect of alcohol and tobacco.
#Model tobacco and alcohol as factors, including their interaction
cancer_tobacco_alcohol_comb <- glm(cbind(ncases, ncontrols) ~ agegp + tobgp * alcgp, data = esoph, family = binomial())
anova(cancer_tobacco_alcohol_comb)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: cbind(ncases, ncontrols)
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 87 227.241
## agegp 5 88.128 82 139.112
## tobgp 3 19.085 79 120.028
## alcgp 3 66.054 76 53.973
## tobgp:alcgp 9 6.489 67 47.484
The results suggest that the combination of alcohol and tobacco use affects the rate of cancer more than alcohol and tobacco consumption considered separately, although the interaction term itself reduces the residual deviance only modestly (6.489 on 9 degrees of freedom).
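As a quick, hedged way to compare the two fits as whole models, one could also look at their AIC values (lower is better); a sketch, not part of the original analysis:
#Compare the separate-effects model and the interaction model by AIC
AIC(cancer_tobacco_alcohol_sep, cancer_tobacco_alcohol_comb)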
Reference: http://stat.ethz.ch/R-manual/R-devel/library/datasets/html/esoph.html
Let's run a Principal Component Analysis to understand the relationship between the number of principal components and the variance explained.
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 2.2.1 √ purrr 0.2.4
## √ tibble 1.4.2 √ dplyr 0.7.4
## √ tidyr 0.8.0 √ stringr 1.3.0
## √ readr 1.1.1 √ forcats 0.2.0
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
youth_responses <- read.csv("C:/Users/Sezgin/Desktop/youth_responses.csv",sep=",")
youth_responses_data <- youth_responses %>% filter(complete.cases(.)) %>% tbl_df()
#Prepare PCA data
youth_responses_pca <- youth_responses_data[,sapply(youth_responses_data,class)=="integer"] %>%
select(Music:Pets)
#Run PCA analysis
youth_responses_pca_result<-princomp(youth_responses_pca,cor=T)
#See the PCA output
ggplot(data=data.frame(PC=1:length(youth_responses_pca_result$sdev),
var_exp=cumsum(youth_responses_pca_result$sdev^2/sum(youth_responses_pca_result$sdev^2))),
aes(x=PC,y=var_exp)) + geom_line() +
geom_point() +
scale_y_continuous(labels = scales::percent,breaks=seq(0,1,length.out=11)) +
scale_x_continuous(breaks=seq(0,135,by=5))
#The plot shows that around 110 principal components are needed to explain 90% of the variance.
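Rather than reading the threshold off the plot, the cumulative variance shares can be queried directly; a small sketch using the PCA object fitted above:
#Smallest number of components whose cumulative variance share reaches 90%
var_share <- cumsum(youth_responses_pca_result$sdev^2 / sum(youth_responses_pca_result$sdev^2))
which(var_share >= 0.9)[1]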
#Now let's apply Multidimensional Scaling (MDS) to the Likert-scale responses for the columns from History to Pets.
youth_responses_mdsa_data <- youth_responses_pca %>% select(History:Pets)
print(head(as.data.frame(youth_responses_mdsa_data)))
## History Psychology Politics Mathematics Physics Internet PC
## 1 1 5 1 3 3 5 3
## 2 1 3 4 5 2 4 4
## 3 1 2 1 5 2 4 2
## 4 3 2 3 2 2 2 2
## 5 5 3 4 2 3 4 4
## 6 3 3 1 1 1 2 1
## Economy.Management Biology Chemistry Reading Geography Foreign.languages
## 1 5 3 3 3 3 5
## 2 5 1 1 4 4 5
## 3 4 1 1 5 2 5
## 4 2 3 3 5 2 3
## 5 1 4 4 3 3 4
## 6 3 5 5 3 3 4
## Medicine Law Cars Art.exhibitions Religion Countryside..outdoors Dancing
## 1 3 1 1 1 1 5 3
## 2 1 2 2 2 1 1 1
## 3 2 3 1 5 5 5 5
## 4 3 2 3 1 4 4 1
## 5 4 3 5 2 2 5 1
## 6 5 3 4 1 1 4 3
## Musical.instruments Writing Passive.sport Active.sport Gardening
## 1 3 2 1 5 5
## 2 1 1 1 1 1
## 3 5 5 5 2 1
## 4 3 1 3 1 4
## 5 5 1 5 4 2
## 6 2 1 5 3 3
## Celebrities Shopping Science.and.technology Theatre Fun.with.friends
## 1 1 4 4 2 5
## 2 2 3 3 2 4
## 3 1 4 2 5 5
## 4 3 3 3 2 4
## 5 1 2 3 1 3
## 6 1 3 4 3 5
## Adrenaline.sports Pets
## 1 4 4
## 2 2 5
## 3 5 5
## 4 2 1
## 5 3 2
## 6 1 5
#As discussed in the lecture notes, let's form a correlation-based distance measure (1 - correlation), which guarantees non-negative distances.
youth_responses_dist <- 1 - cor(youth_responses_mdsa_data)
#Apply MDSA
youth_responses_mdsa <- cmdscale(youth_responses_dist,k=2)
colnames(youth_responses_mdsa) <- c("x","y")
print(youth_responses_mdsa)
## x y
## History 0.031306226 -0.057585867
## Psychology 0.221673919 -0.034255291
## Politics -0.210706395 0.084207670
## Mathematics -0.366106787 -0.301011922
## Physics -0.338602615 -0.488946202
## Internet -0.435984609 0.193575251
## PC -0.602886441 -0.104205122
## Economy.Management -0.313464728 0.369369369
## Biology 0.249496019 -0.352229723
## Chemistry 0.111150203 -0.427188341
## Reading 0.525002226 -0.036403993
## Geography -0.082496760 -0.010735122
## Foreign.languages 0.197742370 0.164499851
## Medicine 0.190107055 -0.319538506
## Law -0.091720302 0.234211611
## Cars -0.593017958 0.091655704
## Art.exhibitions 0.380132671 -0.025937391
## Religion 0.177085206 -0.229380678
## Countryside..outdoors 0.159766147 -0.101417980
## Dancing 0.326087243 0.151882648
## Musical.instruments 0.165516710 -0.156630640
## Writing 0.291051922 -0.065898705
## Passive.sport -0.288889494 0.153049302
## Active.sport -0.180541543 0.077474365
## Gardening 0.203069385 -0.051532921
## Celebrities 0.059109759 0.515094714
## Shopping 0.248971459 0.505415685
## Science.and.technology -0.368579088 -0.307211932
## Theatre 0.452232345 0.006680821
## Fun.with.friends 0.002109219 0.282818197
## Adrenaline.sports -0.251016420 0.072054588
## Pets 0.132403055 0.168120560
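As a sanity check on the two-dimensional representation, cmdscale() can be rerun with eig = TRUE, which also returns a goodness-of-fit measure; a hedged sketch:
#Proportion of the distance structure captured by the first two MDS axes
youth_responses_mds_fit <- cmdscale(youth_responses_dist, k = 2, eig = TRUE)
youth_responses_mds_fit$GOF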
#Now let's apply the K-Means method to see which hobbies and interests fall into the same cluster.
library(tidyverse)
#Set the seed because the K-Means algorithm starts from random initial centers
set.seed(58)
#Apply K-Means
genre_cluster<-kmeans(youth_responses_mdsa,centers=6)
#Get the clusters
mds_clusters<-
data.frame(genre=names(genre_cluster$cluster),
cluster_mds=genre_cluster$cluster) %>% arrange(cluster_mds,genre)
mds_clusters
## genre cluster_mds
## 1 Art.exhibitions 1
## 2 Dancing 1
## 3 Gardening 1
## 4 Psychology 1
## 5 Reading 1
## 6 Theatre 1
## 7 Writing 1
## 8 Mathematics 2
## 9 Physics 2
## 10 Science.and.technology 2
## 11 Celebrities 3
## 12 Foreign.languages 3
## 13 Fun.with.friends 3
## 14 Pets 3
## 15 Shopping 3
## 16 Cars 4
## 17 Internet 4
## 18 PC 4
## 19 Active.sport 5
## 20 Adrenaline.sports 5
## 21 Economy.Management 5
## 22 Geography 5
## 23 Law 5
## 24 Passive.sport 5
## 25 Politics 5
## 26 Biology 6
## 27 Chemistry 6
## 28 Countryside..outdoors 6
## 29 History 6
## 30 Medicine 6
## 31 Musical.instruments 6
## 32 Religion 6
ggplot(
data.frame(youth_responses_mdsa) %>% mutate(clusters=as.factor(genre_cluster$cluster),
genres=rownames(youth_responses_mdsa)),aes(x=x,y=y)) +
geom_text(aes(label=genres,color=clusters),angle=45,size=2) +
geom_point(data=as.data.frame(genre_cluster$centers),aes(x=x,y=y)
)
The plot and the K-Means analysis suggest that, for example, Psychology, Gardening, Writing, Reading, Theatre, Dancing and Art exhibitions fall into the same cluster; likewise, Adrenaline sports, Active sport and Politics sit close together in a shared cluster. As a result, it is possible to infer that a person who likes Active sport may also be into Politics, or that a person interested in Psychology may also be into Reading or Art exhibitions.
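The choice of six clusters is not examined in the text; a hedged elbow-style check over candidate values of k could look like the sketch below:
#Total within-cluster sum of squares for k = 2..10 (random starts, so set the seed)
set.seed(58)
wss <- sapply(2:10, function(k) kmeans(youth_responses_mdsa, centers = k, nstart = 10)$tot.withinss)
plot(2:10, wss, type = "b", xlab = "Number of clusters k", ylab = "Total within-cluster SS")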
set.seed(503)
library(tidyverse)
library(rpart) #for the CART models
library(rpart.plot)
diamonds_test <- diamonds %>% mutate(diamond_id = row_number()) %>%
group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()
diamonds_train <- anti_join(diamonds %>% mutate(diamond_id = row_number()),
diamonds_test, by = "diamond_id")
diamonds_train
## # A tibble: 43,143 x 11
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.230 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
## 2 0.210 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
## 3 0.230 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
## 5 0.240 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
## 6 0.240 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
## 7 0.260 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
## 8 0.220 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
## 9 0.230 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39
## 10 0.300 Good J SI1 64.0 55.0 339 4.25 4.28 2.73
## # ... with 43,133 more rows, and 1 more variable: diamond_id <int>
diamonds_test
## # A tibble: 10,797 x 11
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 3.40 Fair D I1 66.8 52.0 15964 9.42 9.34 6.27
## 2 0.900 Fair D SI2 64.7 59.0 3205 6.09 5.99 3.91
## 3 0.950 Fair D SI2 64.4 60.0 3384 6.06 6.02 3.89
## 4 1.00 Fair D SI2 65.2 56.0 3634 6.27 6.21 4.07
## 5 0.700 Fair D SI2 58.1 60.0 2358 5.79 5.82 3.37
## 6 1.04 Fair D SI2 64.9 56.0 4398 6.39 6.34 4.13
## 7 0.700 Fair D SI2 65.6 55.0 2167 5.59 5.50 3.64
## 8 1.03 Fair D SI2 66.4 56.0 3743 6.31 6.19 4.15
## 9 1.10 Fair D SI2 64.6 54.0 4725 6.56 6.49 4.22
## 10 2.01 Fair D SI2 59.4 66.0 15627 8.20 8.17 4.86
## # ... with 10,787 more rows, and 1 more variable: diamond_id <int>
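Before modelling, it is worth confirming that the stratified split is clean; a minimal sketch using the diamond_id column created above:
#Train and test rows should add up to the full data, with no overlapping ids
nrow(diamonds_train) + nrow(diamonds_test) == nrow(diamonds)
length(intersect(diamonds_train$diamond_id, diamonds_test$diamond_id)) == 0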
#Taking the link below as a reference, we proceed to create our model.
#https://mef-bda503.github.io/pj-Bengisunz/files/diamonds_price_prediction_r_markdownfile_v3.html
#In the assignment description, price is defined as the response variable, with the other columns (carat, cut, color, clarity, etc.) as the predictors.
#Let's first plot price against color, cut and clarity to obtain some exploratory information.
ggplot(diamonds,aes(x=color,y=price))+geom_jitter()
#Colors G and H seem to reach the highest prices overall.
ggplot(diamonds,aes(x=cut,y=price))+geom_jitter()
#As expected, Premium and Ideal cuts have the highest price overall.
ggplot(diamonds,aes(x=clarity,y=price))+geom_jitter()
#VS2 and SI2 clarities have the highest price levels overall.
#Now let's proceed to form our model with the predictors stated.
(formula <- as.formula(price ~ carat + cut + color + clarity + depth + table + x + y + z))
## price ~ carat + cut + color + clarity + depth + table + x + y +
## z
m1 <- rpart(formula, data = diamonds_train, method = "anova")
m1
## n= 43143
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 43143 687673200000 3933.419
## 2) carat< 0.995 27919 34912140000 1634.795
## 4) y< 5.525 19911 5428982000 1056.168 *
## 5) y>=5.525 8008 6241528000 3073.487 *
## 3) carat>=0.995 15224 234721600000 8148.821
## 6) y< 7.195 10290 48143650000 6134.445
## 12) clarity=I1,SI2,SI1,VS2 7828 16158110000 5394.948 *
## 13) clarity=VS1,VVS2,VVS1,IF 2462 14093830000 8485.700 *
## 7) y>=7.195 4934 57745320000 12349.860
## 14) y< 7.855 3214 27840990000 10961.400 *
## 15) y>=7.855 1720 12130460000 14944.340 *
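Since rpart.plot is already attached, the fitted regression tree can also be drawn directly; a hedged sketch:
#Visualize the regression tree fitted above
rpart.plot(m1, main = "Regression tree for diamond price")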
#Making a prediction with the model we've constructed.
prediction <- predict(m1, diamonds_test)
summary(prediction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1056 1056 3073 3942 5395 14944
#Let's calculate Mean Absolute Error and utilize our model on the test data to obtain an estimation for price.
MAE <- function(actual, predicted) {mean(abs(actual - predicted))}
MAE(diamonds_test$price, prediction)
## [1] 889.7092
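MAE weights all errors equally; as a complementary, hedged check, the root mean squared error (which penalizes large misses more heavily) can be computed the same way:
#Hedged sketch: RMSE on the test set
RMSE <- function(actual, predicted) {sqrt(mean((actual - predicted)^2))}
RMSE(diamonds_test$price, prediction)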
Estimation_price <- predict(m1,newdata = diamonds_test)
diamonds_test_new <- diamonds_test %>% select(price)
print(diamonds_test_new)
## # A tibble: 10,797 x 1
## price
## <int>
## 1 15964
## 2 3205
## 3 3384
## 4 3634
## 5 2358
## 6 4398
## 7 2167
## 8 3743
## 9 4725
## 10 15627
## # ... with 10,787 more rows
print(head(Estimation_price))
## 1 2 3 4 5 6
## 14944.339 3073.487 3073.487 5394.948 3073.487 5394.948
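For easier inspection, the actual and predicted prices can be placed side by side; a small sketch (the price_comparison name is illustrative):
#Actual versus predicted price on the test set
price_comparison <- diamonds_test %>% select(price) %>% mutate(predicted_price = Estimation_price)
print(head(price_comparison))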
library(tidyverse)
library(rpart)
library(rpart.plot)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
load("C:/Users/Sezgin/Desktop/spam_data.RData")
head(spam_data)
## # A tibble: 6 x 59
## train_test spam_or_not V1 V2 V3 V4 V5 V6 V7 V8
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 0 0.640 0.640 0 0.320 0 0 0
## 2 0 1 0.210 0.280 0.500 0 0.140 0.280 0.210 0.0700
## 3 0 1 0.0600 0 0.710 0 1.23 0.190 0.190 0.120
## 4 0 1 0 0 0 0 0.630 0 0.310 0.630
## 5 0 1 0 0 0 0 0.630 0 0.310 0.630
## 6 0 1 0 0 0 0 1.85 0 0 1.85
## # ... with 49 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>, V12 <dbl>,
## # V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>,
## # V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>,
## # V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>,
## # V31 <dbl>, V32 <dbl>, V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>,
## # V37 <dbl>, V38 <dbl>, V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>,
## # V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## # V49 <dbl>, V50 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>,
## # V55 <dbl>, V56 <int>, V57 <int>
#In the data set and the assignment description, it is stated that the column "spam_or_not" is given as an integer. Let's convert it to a factor.
spam_data$spam_or_not <- as.factor(spam_data$spam_or_not)
#Another given column, train_test, marks training rows with 0 and test rows with 1. Before splitting, let's check the spam/non-spam balance of the response:
table(spam_data$spam_or_not==0)
##
## FALSE TRUE
## 1813 2788
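Since the split below is driven by train_test, it may also help to check how many rows fall into each partition; a minimal sketch:
#Number of training (0) and test (1) rows
table(spam_data$train_test)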
#To build a model, we need two sets of data: one for training and one for testing, as done in the previous "diamonds" assignment.
spam_data_train<-subset(spam_data,train_test==0)
spam_data_test<-subset(spam_data,train_test==1)
#Now let's start to build our CART model.
spam_data_CART_model <- rpart(spam_or_not ~ ., data=spam_data_train, method = 'class')
printcp(spam_data_CART_model)
##
## Classification tree:
## rpart(formula = spam_or_not ~ ., data = spam_data_train, method = "class")
##
## Variables actually used in tree construction:
## [1] V16 V25 V52 V53 V57 V7
##
## Root node error: 1605/4101 = 0.39137
##
## n= 4101
##
## CP nsplit rel error xerror xstd
## 1 0.481620 0 1.00000 1.00000 0.019473
## 2 0.143925 1 0.51838 0.54206 0.016312
## 3 0.049221 2 0.37445 0.44673 0.015155
## 4 0.037383 3 0.32523 0.35950 0.013873
## 5 0.030530 4 0.28785 0.33583 0.013481
## 6 0.011838 5 0.25732 0.30530 0.012942
## 7 0.010000 6 0.24548 0.28723 0.012603
fancyRpartPlot(spam_data_CART_model)
summary(spam_data_CART_model)
## Call:
## rpart(formula = spam_or_not ~ ., data = spam_data_train, method = "class")
## n= 4101
##
## CP nsplit rel error xerror xstd
## 1 0.48161994 0 1.0000000 1.0000000 0.01947331
## 2 0.14392523 1 0.5183801 0.5420561 0.01631204
## 3 0.04922118 2 0.3744548 0.4467290 0.01515496
## 4 0.03738318 3 0.3252336 0.3595016 0.01387350
## 5 0.03052960 4 0.2878505 0.3358255 0.01348098
## 6 0.01183801 5 0.2573209 0.3052960 0.01294172
## 7 0.01000000 6 0.2454829 0.2872274 0.01260321
##
## Variable importance
## V53 V7 V23 V24 V52 V56 V20 V9 V25 V57 V26 V55 V16 V21 V50 V5 V31
## 29 13 10 10 7 7 5 5 4 2 2 1 1 1 1 1 1
##
## Node number 1: 4101 observations, complexity param=0.4816199
## predicted class=0 expected loss=0.391368 P(node) =1
## class counts: 2496 1605
## probabilities: 0.609 0.391
## left son=2 (3092 obs) right son=3 (1009 obs)
## Primary splits:
## V53 < 0.0555 to the left, improve=647.0601, (0 missing)
## V52 < 0.0795 to the left, improve=637.7163, (0 missing)
## V7 < 0.01 to the left, improve=527.5502, (0 missing)
## V16 < 0.075 to the left, improve=498.3240, (0 missing)
## V21 < 0.595 to the left, improve=482.5029, (0 missing)
## Surrogate splits:
## V23 < 0.055 to the left, agree=0.842, adj=0.356, (0 split)
## V24 < 0.035 to the left, agree=0.835, adj=0.331, (0 split)
## V20 < 0.025 to the left, agree=0.797, adj=0.173, (0 split)
## V56 < 71.5 to the left, agree=0.796, adj=0.169, (0 split)
## V9 < 0.18 to the left, agree=0.794, adj=0.164, (0 split)
##
## Node number 2: 3092 observations, complexity param=0.1439252
## predicted class=0 expected loss=0.2309185 P(node) =0.7539624
## class counts: 2378 714
## probabilities: 0.769 0.231
## left son=4 (2809 obs) right son=5 (283 obs)
## Primary splits:
## V7 < 0.055 to the left, improve=285.7257, (0 missing)
## V52 < 0.0915 to the left, improve=250.0214, (0 missing)
## V16 < 0.135 to the left, improve=236.2023, (0 missing)
## V21 < 0.445 to the left, improve=139.5265, (0 missing)
## V55 < 3.6835 to the left, improve=136.5273, (0 missing)
## Surrogate splits:
## V56 < 131.5 to the left, agree=0.911, adj=0.032, (0 split)
## V20 < 1.635 to the left, agree=0.910, adj=0.018, (0 split)
## V54 < 0.826 to the left, agree=0.910, adj=0.018, (0 split)
## V4 < 8.115 to the left, agree=0.909, adj=0.007, (0 split)
## V17 < 4.325 to the left, agree=0.909, adj=0.007, (0 split)
##
## Node number 3: 1009 observations, complexity param=0.0305296
## predicted class=1 expected loss=0.1169475 P(node) =0.2460376
## class counts: 118 891
## probabilities: 0.117 0.883
## left son=6 (61 obs) right son=7 (948 obs)
## Primary splits:
## V25 < 0.4 to the right, improve=79.95414, (0 missing)
## V26 < 0.12 to the right, improve=38.11919, (0 missing)
## V52 < 0.0495 to the left, improve=34.98541, (0 missing)
## V37 < 0.085 to the right, improve=32.13206, (0 missing)
## V27 < 0.21 to the right, improve=31.55716, (0 missing)
## Surrogate splits:
## V26 < 0.305 to the right, agree=0.966, adj=0.443, (0 split)
## V31 < 0.05 to the right, agree=0.949, adj=0.164, (0 split)
## V28 < 0.025 to the right, agree=0.947, adj=0.131, (0 split)
## V27 < 0.225 to the right, agree=0.945, adj=0.098, (0 split)
## V29 < 0.075 to the right, agree=0.945, adj=0.098, (0 split)
##
## Node number 4: 2809 observations, complexity param=0.04922118
## predicted class=0 expected loss=0.1626913 P(node) =0.6849549
## class counts: 2352 457
## probabilities: 0.837 0.163
## left son=8 (2452 obs) right son=9 (357 obs)
## Primary splits:
## V52 < 0.3775 to the left, improve=164.13240, (0 missing)
## V16 < 0.225 to the left, improve=139.18240, (0 missing)
## V55 < 3.687 to the left, improve= 68.03708, (0 missing)
## V21 < 0.865 to the left, improve= 58.60639, (0 missing)
## V25 < 0.025 to the right, improve= 57.38702, (0 missing)
## Surrogate splits:
## V16 < 2.415 to the left, agree=0.876, adj=0.028, (0 split)
## V23 < 0.62 to the left, agree=0.876, adj=0.028, (0 split)
## V24 < 3.305 to the left, agree=0.874, adj=0.006, (0 split)
## V13 < 2.53 to the left, agree=0.873, adj=0.003, (0 split)
## V18 < 3.93 to the left, agree=0.873, adj=0.003, (0 split)
##
## Node number 5: 283 observations
## predicted class=1 expected loss=0.09187279 P(node) =0.06900756
## class counts: 26 257
## probabilities: 0.092 0.908
##
## Node number 6: 61 observations
## predicted class=0 expected loss=0.09836066 P(node) =0.01487442
## class counts: 55 6
## probabilities: 0.902 0.098
##
## Node number 7: 948 observations
## predicted class=1 expected loss=0.0664557 P(node) =0.2311631
## class counts: 63 885
## probabilities: 0.066 0.934
##
## Node number 8: 2452 observations
## predicted class=0 expected loss=0.09747145 P(node) =0.597903
## class counts: 2213 239
## probabilities: 0.903 0.097
##
## Node number 9: 357 observations, complexity param=0.03738318
## predicted class=1 expected loss=0.3893557 P(node) =0.08705194
## class counts: 139 218
## probabilities: 0.389 0.611
## left son=18 (168 obs) right son=19 (189 obs)
## Primary splits:
## V57 < 64.5 to the left, improve=53.08715, (0 missing)
## V56 < 10.5 to the left, improve=49.11356, (0 missing)
## V55 < 2.654 to the left, improve=43.90820, (0 missing)
## V16 < 0.04 to the left, improve=38.03825, (0 missing)
## V21 < 0.765 to the left, improve=21.76457, (0 missing)
## Surrogate splits:
## V56 < 12.5 to the left, agree=0.854, adj=0.690, (0 split)
## V55 < 2.8055 to the left, agree=0.754, adj=0.476, (0 split)
## V21 < 0.115 to the left, agree=0.731, adj=0.429, (0 split)
## V50 < 0.008 to the left, agree=0.709, adj=0.381, (0 split)
## V5 < 0.065 to the left, agree=0.697, adj=0.357, (0 split)
##
## Node number 18: 168 observations, complexity param=0.01183801
## predicted class=0 expected loss=0.3214286 P(node) =0.04096562
## class counts: 114 54
## probabilities: 0.679 0.321
## left son=36 (147 obs) right son=37 (21 obs)
## Primary splits:
## V16 < 0.775 to the left, improve=19.108840, (0 missing)
## V55 < 2.654 to the left, improve=11.583360, (0 missing)
## V52 < 0.8045 to the left, improve= 8.926531, (0 missing)
## V56 < 8.5 to the left, improve= 7.012698, (0 missing)
## V57 < 22.5 to the left, improve= 6.549351, (0 missing)
## Surrogate splits:
## V49 < 0.294 to the left, agree=0.893, adj=0.143, (0 split)
## V1 < 1.39 to the left, agree=0.887, adj=0.095, (0 split)
## V6 < 1.145 to the left, agree=0.887, adj=0.095, (0 split)
## V18 < 3.84 to the left, agree=0.887, adj=0.095, (0 split)
## V55 < 3.757 to the left, agree=0.887, adj=0.095, (0 split)
##
## Node number 19: 189 observations
## predicted class=1 expected loss=0.1322751 P(node) =0.04608632
## class counts: 25 164
## probabilities: 0.132 0.868
##
## Node number 36: 147 observations
## predicted class=0 expected loss=0.2312925 P(node) =0.03584492
## class counts: 113 34
## probabilities: 0.769 0.231
##
## Node number 37: 21 observations
## predicted class=1 expected loss=0.04761905 P(node) =0.005120702
## class counts: 1 20
## probabilities: 0.048 0.952
#What is the test-set accuracy of the CART model (spam_data_CART_model), using a threshold of 0.5 for predictions?
predPercCART.test <- predict(spam_data_CART_model, newdata = spam_data_test)[ , 2]
head(predPercCART.test)
## 1 2 3 4 5 6
## 0.93354430 0.90812721 0.09747145 0.93354430 0.23129252 0.93354430
predCART.test <- ifelse(predPercCART.test > 0.5, 1, 0)
table(predCART.test, spam_data_test$spam_or_not)
##
## predCART.test 0 1
## 0 278 40
## 1 14 168
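From the confusion matrix above, the accuracy works out to (278 + 168) / 500 ≈ 0.892; a small sketch computing it directly:
#Test-set accuracy of the CART model at the 0.5 threshold
conf_mat <- table(predCART.test, spam_data_test$spam_or_not)
sum(diag(conf_mat)) / sum(conf_mat)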
#To visualize the tree more cleanly, let's first extract the best complexity parameter
bestcp <- spam_data_CART_model$cptable[which.min(spam_data_CART_model$cptable[,"xerror"]),"CP"]
#Prune the tree using the best Complexity Parameter, i.e. the CP value whose cross-validated error rate (xerror) is lowest.
tree.pruned <- prune(spam_data_CART_model, cp = bestcp)
plot(tree.pruned)
text(tree.pruned, cex = 0.8, use.n = TRUE, xpd = TRUE)
#To improve the visualization, the assignment linked below is taken as a reference.
#https://mef-bda503.github.io/pj-ferayece/files/Assignment3_SpamDataAnalysis.html
count_only <- function(x, labs, digits, varlen)
{
paste(x$frame$n)
}
boxcols <- c("green", "red")[tree.pruned$frame$yval]
par(xpd=TRUE)
prp(tree.pruned, faclen = 0, cex = 0.8, node.fun = count_only, box.col = boxcols, main = 'Classification Tree for Spam')
legend("bottomright", legend = c("Not Spam","Spam"), fill = c("green", "red"),
title = "Spam or Not?")
The model shows, for example, that the first split is on V53 (char_freq_$): observations with V53 < 0.056 go to the left branch and are split further, while the 1009 observations with V53 >= 0.056 are mostly spam (891 of 1009). Within that right branch, the 948 observations with V25 < 0.4 contain 885 spam emails and only 63 non-spam ones, so that node is classified as spam.
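As a final, hedged check, one could verify that pruning at the chosen CP does not hurt performance on the test set:
#Accuracy of the pruned tree on the test data
pred_pruned <- predict(tree.pruned, newdata = spam_data_test, type = "class")
mean(pred_pruned == spam_data_test$spam_or_not)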