library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(tibble)
library(readr)
library(ggcorrplot)
library(dplyr)
library(purrr)
esoph
## agegp alcgp tobgp ncases ncontrols
## 1 25-34 0-39g/day 0-9g/day 0 40
## 2 25-34 0-39g/day 10-19 0 10
## 3 25-34 0-39g/day 20-29 0 6
## 4 25-34 0-39g/day 30+ 0 5
## 5 25-34 40-79 0-9g/day 0 27
## 6 25-34 40-79 10-19 0 7
## 7 25-34 40-79 20-29 0 4
## 8 25-34 40-79 30+ 0 7
## 9 25-34 80-119 0-9g/day 0 2
## 10 25-34 80-119 10-19 0 1
## 11 25-34 80-119 30+ 0 2
## 12 25-34 120+ 0-9g/day 0 1
## 13 25-34 120+ 10-19 1 1
## 14 25-34 120+ 20-29 0 1
## 15 25-34 120+ 30+ 0 2
## 16 35-44 0-39g/day 0-9g/day 0 60
## 17 35-44 0-39g/day 10-19 1 14
## 18 35-44 0-39g/day 20-29 0 7
## 19 35-44 0-39g/day 30+ 0 8
## 20 35-44 40-79 0-9g/day 0 35
## 21 35-44 40-79 10-19 3 23
## 22 35-44 40-79 20-29 1 14
## 23 35-44 40-79 30+ 0 8
## 24 35-44 80-119 0-9g/day 0 11
## 25 35-44 80-119 10-19 0 6
## 26 35-44 80-119 20-29 0 2
## 27 35-44 80-119 30+ 0 1
## 28 35-44 120+ 0-9g/day 2 3
## 29 35-44 120+ 10-19 0 3
## 30 35-44 120+ 20-29 2 4
## 31 45-54 0-39g/day 0-9g/day 1 46
## 32 45-54 0-39g/day 10-19 0 18
## 33 45-54 0-39g/day 20-29 0 10
## 34 45-54 0-39g/day 30+ 0 4
## 35 45-54 40-79 0-9g/day 6 38
## 36 45-54 40-79 10-19 4 21
## 37 45-54 40-79 20-29 5 15
## 38 45-54 40-79 30+ 5 7
## 39 45-54 80-119 0-9g/day 3 16
## 40 45-54 80-119 10-19 6 14
## 41 45-54 80-119 20-29 1 5
## 42 45-54 80-119 30+ 2 4
## 43 45-54 120+ 0-9g/day 4 4
## 44 45-54 120+ 10-19 3 4
## 45 45-54 120+ 20-29 2 3
## 46 45-54 120+ 30+ 4 4
## 47 55-64 0-39g/day 0-9g/day 2 49
## 48 55-64 0-39g/day 10-19 3 22
## 49 55-64 0-39g/day 20-29 3 12
## 50 55-64 0-39g/day 30+ 4 6
## 51 55-64 40-79 0-9g/day 9 40
## 52 55-64 40-79 10-19 6 21
## 53 55-64 40-79 20-29 4 17
## 54 55-64 40-79 30+ 3 6
## 55 55-64 80-119 0-9g/day 9 18
## 56 55-64 80-119 10-19 8 15
## 57 55-64 80-119 20-29 3 6
## 58 55-64 80-119 30+ 4 4
## 59 55-64 120+ 0-9g/day 5 10
## 60 55-64 120+ 10-19 6 7
## 61 55-64 120+ 20-29 2 3
## 62 55-64 120+ 30+ 5 6
## 63 65-74 0-39g/day 0-9g/day 5 48
## 64 65-74 0-39g/day 10-19 4 14
## 65 65-74 0-39g/day 20-29 2 7
## 66 65-74 0-39g/day 30+ 0 2
## 67 65-74 40-79 0-9g/day 17 34
## 68 65-74 40-79 10-19 3 10
## 69 65-74 40-79 20-29 5 9
## 70 65-74 80-119 0-9g/day 6 13
## 71 65-74 80-119 10-19 4 12
## 72 65-74 80-119 20-29 2 3
## 73 65-74 80-119 30+ 1 1
## 74 65-74 120+ 0-9g/day 3 4
## 75 65-74 120+ 10-19 1 2
## 76 65-74 120+ 20-29 1 1
## 77 65-74 120+ 30+ 1 1
## 78 75+ 0-39g/day 0-9g/day 1 18
## 79 75+ 0-39g/day 10-19 2 6
## 80 75+ 0-39g/day 30+ 1 3
## 81 75+ 40-79 0-9g/day 2 5
## 82 75+ 40-79 10-19 1 3
## 83 75+ 40-79 20-29 0 3
## 84 75+ 40-79 30+ 1 1
## 85 75+ 80-119 0-9g/day 1 1
## 86 75+ 80-119 10-19 1 1
## 87 75+ 120+ 0-9g/day 2 2
## 88 75+ 120+ 10-19 1 1
summary(esoph)
## agegp alcgp tobgp ncases ncontrols
## 25-34:15 0-39g/day:23 0-9g/day:24 Min. : 0.000 Min. : 1.00
## 35-44:15 40-79 :23 10-19 :24 1st Qu.: 0.000 1st Qu.: 3.00
## 45-54:16 80-119 :21 20-29 :20 Median : 1.000 Median : 6.00
## 55-64:16 120+ :21 30+ :20 Mean : 2.273 Mean :11.08
## 65-74:15 3rd Qu.: 4.000 3rd Qu.:14.00
## 75+ :11 Max. :17.000 Max. :60.00
cancerrelation <- glm(cbind(ncases,ncontrols) ~ agegp + tobgp + alcgp , data = esoph , family = binomial() )
anova(cancerrelation)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: cbind(ncases, ncontrols)
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 87 227.241
## agegp 5 88.128 82 139.112
## tobgp 3 19.085 79 120.028
## alcgp 3 66.054 76 53.973
We can see that age is the most related to cancer with 88 deviance. After age factor, the alcohol is related to the cancer.
Grouping by age, according to sum of control counts.
d1<- esoph %>% group_by(agegp) %>%
summarise(count = n(), total_cases = sum(ncases), total_controls = sum(ncontrols),
percentage=total_cases*100/total_controls)
ggplot(d1, aes(x=d1$agegp, y=d1$percentage,fill=d1$agegp)) +
geom_bar(stat="identity", position = "dodge") +
scale_fill_brewer(palette ="Set1")+
labs(x= 'Age Groups', y= 'Percentage Of Cancer Cases')+
guides(fill=guide_legend(title="Age Groups"))
youth_resp <- read.csv("/cloud/project/youth_responses.csv",sep=",")
youth_resp <- youth_resp %>% select(History:Pets)
print(head(as.data.frame(youth_resp)))
## History Psychology Politics Mathematics Physics Internet PC
## 1 1 5 1 3 3 5 3
## 2 1 3 4 5 2 4 4
## 3 1 2 1 5 2 4 2
## 4 4 4 5 4 1 3 1
## 5 3 2 3 2 2 2 2
## 6 5 3 4 2 3 4 4
## Economy.Management Biology Chemistry Reading Geography Foreign.languages
## 1 5 3 3 3 3 5
## 2 5 1 1 4 4 5
## 3 4 1 1 5 2 5
## 4 2 3 3 5 4 4
## 5 2 3 3 5 2 3
## 6 1 4 4 3 3 4
## Medicine Law Cars Art.exhibitions Religion Countryside..outdoors Dancing
## 1 3 1 1 1 1 5 3
## 2 1 2 2 2 1 1 1
## 3 2 3 1 5 5 5 5
## 4 2 5 1 5 4 1 1
## 5 3 2 3 1 4 4 1
## 6 4 3 5 2 2 5 1
## Musical.instruments Writing Passive.sport Active.sport Gardening
## 1 3 2 1 5 5
## 2 1 1 1 1 1
## 3 5 5 5 2 1
## 4 1 3 1 1 1
## 5 3 1 3 1 4
## 6 5 1 5 4 2
## Celebrities Shopping Science.and.technology Theatre Fun.with.friends
## 1 1 4 4 2 5
## 2 2 3 3 2 4
## 3 1 4 2 5 5
## 4 2 4 3 1 2
## 5 3 3 3 2 4
## 6 1 2 3 1 3
## Adrenaline.sports Pets
## 1 4 4
## 2 2 5
## 3 5 5
## 4 1 1
## 5 2 1
## 6 3 2
summary(youth_resp)
## History Psychology Politics Mathematics
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000
## Median :3.000 Median :3.000 Median :2.000 Median :2.000
## Mean :3.207 Mean :3.138 Mean :2.596 Mean :2.335
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NA's :2 NA's :5 NA's :1 NA's :3
## Physics Internet PC Economy.Management
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:1.000
## Median :2.000 Median :4.000 Median :3.000 Median :2.000
## Mean :2.065 Mean :4.176 Mean :3.136 Mean :2.644
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NA's :3 NA's :4 NA's :6 NA's :5
## Biology Chemistry Reading Geography
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.000 Median :3.000 Median :3.000
## Mean :2.665 Mean :2.165 Mean :3.159 Mean :3.083
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NA's :6 NA's :10 NA's :6 NA's :9
## Foreign.languages Medicine Law Cars
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :4.000 Median :2.000 Median :2.000 Median :3.000
## Mean :3.778 Mean :2.516 Mean :2.257 Mean :2.687
## 3rd Qu.:5.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NA's :5 NA's :5 NA's :1 NA's :4
## Art.exhibitions Religion Countryside..outdoors Dancing
## Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.00 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.000
## Median :2.00 Median :2.000 Median :4.000 Median :2.000
## Mean :2.59 Mean :2.273 Mean :3.687 Mean :2.462
## 3rd Qu.:4.00 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.00 Max. :5.000 Max. :5.000 Max. :5.000
## NA's :6 NA's :3 NA's :7 NA's :3
## Musical.instruments Writing Passive.sport Active.sport
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :1.000 Median :3.000 Median :3.000
## Mean :2.324 Mean :1.901 Mean :3.388 Mean :3.291
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NA's :1 NA's :6 NA's :15 NA's :4
## Gardening Celebrities Shopping Science.and.technology
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000
## Median :1.000 Median :2.000 Median :3.000 Median :3.000
## Mean :1.907 Mean :2.362 Mean :3.277 Mean :3.234
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NA's :7 NA's :2 NA's :2 NA's :6
## Theatre Fun.with.friends Adrenaline.sports Pets
## Min. :1.000 Min. :2.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:2.000
## Median :3.000 Median :5.000 Median :3.000 Median :4.000
## Mean :3.025 Mean :4.558 Mean :2.948 Mean :3.335
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## NA's :8 NA's :4 NA's :3 NA's :4
library(rpart) #To construct CART models
library(rpart.plot) # It also includes titanic data
library(rattle.data) #For visualization
ggplot(diamonds) + geom_point(aes(x = carat, y = price, color = cut ))
set.seed(503)
library(tidyverse)
diamonds_test <- diamonds %>% mutate(diamond_id = row_number()) %>%
group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()
diamonds_train <- anti_join(diamonds %>% mutate(diamond_id = row_number()),
diamonds_test, by = "diamond_id")
diamonds_train
## # A tibble: 43,143 x 11
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 6 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 7 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 8 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 9 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## 10 0.3 Good J SI1 64 55 339 4.25 4.28 2.73
## # ... with 43,133 more rows, and 1 more variable: diamond_id <int>
library(tidyverse)
library(rpart)
library(rpart.plot)
load("/cloud/project/spam_data.RData")
head(spam_data)
## # A tibble: 6 x 59
## train_test spam_or_not V1 V2 V3 V4 V5 V6 V7 V8
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 0 0.64 0.64 0 0.32 0 0 0
## 2 0 1 0.21 0.28 0.5 0 0.14 0.28 0.21 0.07
## 3 0 1 0.06 0 0.71 0 1.23 0.19 0.19 0.12
## 4 0 1 0 0 0 0 0.63 0 0.31 0.63
## 5 0 1 0 0 0 0 0.63 0 0.31 0.63
## 6 0 1 0 0 0 0 1.85 0 0 1.85
## # ... with 49 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>, V12 <dbl>,
## # V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>,
## # V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>,
## # V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>,
## # V31 <dbl>, V32 <dbl>, V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>,
## # V37 <dbl>, V38 <dbl>, V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>,
## # V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## # V49 <dbl>, V50 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>,
## # V55 <dbl>, V56 <int>, V57 <int>
spam_train <- spam_data %>% filter(train_test == 0) %>% select(-train_test)
spam_model <- rpart(spam_or_not ~ ., data = spam_train)
rpart.plot(spam_model)
spam_in_sample <- predict(spam_model)
head(spam_in_sample)
## 1 2 3 4 5 6
## 0.86772487 0.94753747 0.94753747 0.90812721 0.90812721 0.05763952
in_sample_prediction <-
cbind(
spam_in_sample %>% tbl_df %>%
transmute(spam_predict = ifelse(spam_in_sample >= 0.5,1,0)),
spam_train %>% tbl_df %>%
transmute(spam_actual = ifelse(spam_or_not == 1,1,0))
) %>%
mutate(correct_class = (spam_predict == spam_actual)) %>%
group_by(correct_class) %>%
summarise(count=n(),percentage=n()/nrow(.))
in_sample_prediction
## # A tibble: 2 x 3
## correct_class count percentage
## <lgl> <int> <dbl>
## 1 FALSE 398 0.0970
## 2 TRUE 3703 0.903