Esoph And Young People Survey

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.3.0
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(tibble)
library(readr)
library(ggcorrplot)
library(dplyr)
library(purrr)

Esoph

esoph
##    agegp     alcgp    tobgp ncases ncontrols
## 1  25-34 0-39g/day 0-9g/day      0        40
## 2  25-34 0-39g/day    10-19      0        10
## 3  25-34 0-39g/day    20-29      0         6
## 4  25-34 0-39g/day      30+      0         5
## 5  25-34     40-79 0-9g/day      0        27
## 6  25-34     40-79    10-19      0         7
## 7  25-34     40-79    20-29      0         4
## 8  25-34     40-79      30+      0         7
## 9  25-34    80-119 0-9g/day      0         2
## 10 25-34    80-119    10-19      0         1
## 11 25-34    80-119      30+      0         2
## 12 25-34      120+ 0-9g/day      0         1
## 13 25-34      120+    10-19      1         1
## 14 25-34      120+    20-29      0         1
## 15 25-34      120+      30+      0         2
## 16 35-44 0-39g/day 0-9g/day      0        60
## 17 35-44 0-39g/day    10-19      1        14
## 18 35-44 0-39g/day    20-29      0         7
## 19 35-44 0-39g/day      30+      0         8
## 20 35-44     40-79 0-9g/day      0        35
## 21 35-44     40-79    10-19      3        23
## 22 35-44     40-79    20-29      1        14
## 23 35-44     40-79      30+      0         8
## 24 35-44    80-119 0-9g/day      0        11
## 25 35-44    80-119    10-19      0         6
## 26 35-44    80-119    20-29      0         2
## 27 35-44    80-119      30+      0         1
## 28 35-44      120+ 0-9g/day      2         3
## 29 35-44      120+    10-19      0         3
## 30 35-44      120+    20-29      2         4
## 31 45-54 0-39g/day 0-9g/day      1        46
## 32 45-54 0-39g/day    10-19      0        18
## 33 45-54 0-39g/day    20-29      0        10
## 34 45-54 0-39g/day      30+      0         4
## 35 45-54     40-79 0-9g/day      6        38
## 36 45-54     40-79    10-19      4        21
## 37 45-54     40-79    20-29      5        15
## 38 45-54     40-79      30+      5         7
## 39 45-54    80-119 0-9g/day      3        16
## 40 45-54    80-119    10-19      6        14
## 41 45-54    80-119    20-29      1         5
## 42 45-54    80-119      30+      2         4
## 43 45-54      120+ 0-9g/day      4         4
## 44 45-54      120+    10-19      3         4
## 45 45-54      120+    20-29      2         3
## 46 45-54      120+      30+      4         4
## 47 55-64 0-39g/day 0-9g/day      2        49
## 48 55-64 0-39g/day    10-19      3        22
## 49 55-64 0-39g/day    20-29      3        12
## 50 55-64 0-39g/day      30+      4         6
## 51 55-64     40-79 0-9g/day      9        40
## 52 55-64     40-79    10-19      6        21
## 53 55-64     40-79    20-29      4        17
## 54 55-64     40-79      30+      3         6
## 55 55-64    80-119 0-9g/day      9        18
## 56 55-64    80-119    10-19      8        15
## 57 55-64    80-119    20-29      3         6
## 58 55-64    80-119      30+      4         4
## 59 55-64      120+ 0-9g/day      5        10
## 60 55-64      120+    10-19      6         7
## 61 55-64      120+    20-29      2         3
## 62 55-64      120+      30+      5         6
## 63 65-74 0-39g/day 0-9g/day      5        48
## 64 65-74 0-39g/day    10-19      4        14
## 65 65-74 0-39g/day    20-29      2         7
## 66 65-74 0-39g/day      30+      0         2
## 67 65-74     40-79 0-9g/day     17        34
## 68 65-74     40-79    10-19      3        10
## 69 65-74     40-79    20-29      5         9
## 70 65-74    80-119 0-9g/day      6        13
## 71 65-74    80-119    10-19      4        12
## 72 65-74    80-119    20-29      2         3
## 73 65-74    80-119      30+      1         1
## 74 65-74      120+ 0-9g/day      3         4
## 75 65-74      120+    10-19      1         2
## 76 65-74      120+    20-29      1         1
## 77 65-74      120+      30+      1         1
## 78   75+ 0-39g/day 0-9g/day      1        18
## 79   75+ 0-39g/day    10-19      2         6
## 80   75+ 0-39g/day      30+      1         3
## 81   75+     40-79 0-9g/day      2         5
## 82   75+     40-79    10-19      1         3
## 83   75+     40-79    20-29      0         3
## 84   75+     40-79      30+      1         1
## 85   75+    80-119 0-9g/day      1         1
## 86   75+    80-119    10-19      1         1
## 87   75+      120+ 0-9g/day      2         2
## 88   75+      120+    10-19      1         1
summary(esoph)
##    agegp          alcgp         tobgp        ncases         ncontrols    
##  25-34:15   0-39g/day:23   0-9g/day:24   Min.   : 0.000   Min.   : 1.00  
##  35-44:15   40-79    :23   10-19   :24   1st Qu.: 0.000   1st Qu.: 3.00  
##  45-54:16   80-119   :21   20-29   :20   Median : 1.000   Median : 6.00  
##  55-64:16   120+     :21   30+     :20   Mean   : 2.273   Mean   :11.08  
##  65-74:15                                3rd Qu.: 4.000   3rd Qu.:14.00  
##  75+  :11                                Max.   :17.000   Max.   :60.00

Analysis data

cancerrelation <- glm(cbind(ncases,ncontrols) ~ agegp + tobgp + alcgp , data = esoph , family = binomial() )

anova(cancerrelation)
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: cbind(ncases, ncontrols)
## 
## Terms added sequentially (first to last)
## 
## 
##       Df Deviance Resid. Df Resid. Dev
## NULL                     87    227.241
## agegp  5   88.128        82    139.112
## tobgp  3   19.085        79    120.028
## alcgp  3   66.054        76     53.973

We can see that age is the most related to cancer with 88 deviance. After age factor, the alcohol is related to the cancer.

Plotting

Grouping by age, according to sum of control counts.

d1<- esoph %>%  group_by(agegp) %>%
    summarise(count = n(), total_cases = sum(ncases), total_controls = sum(ncontrols),
              percentage=total_cases*100/total_controls)

ggplot(d1, aes(x=d1$agegp, y=d1$percentage,fill=d1$agegp)) +
   geom_bar(stat="identity", position = "dodge") +
   scale_fill_brewer(palette ="Set1")+
   labs(x= 'Age Groups', y= 'Percentage Of Cancer Cases')+
   guides(fill=guide_legend(title="Age Groups"))

Young People Survey

youth_resp <- read.csv("/cloud/project/youth_responses.csv",sep=",") 
youth_resp <- youth_resp %>% select(History:Pets)
print(head(as.data.frame(youth_resp)))
##   History Psychology Politics Mathematics Physics Internet PC
## 1       1          5        1           3       3        5  3
## 2       1          3        4           5       2        4  4
## 3       1          2        1           5       2        4  2
## 4       4          4        5           4       1        3  1
## 5       3          2        3           2       2        2  2
## 6       5          3        4           2       3        4  4
##   Economy.Management Biology Chemistry Reading Geography Foreign.languages
## 1                  5       3         3       3         3                 5
## 2                  5       1         1       4         4                 5
## 3                  4       1         1       5         2                 5
## 4                  2       3         3       5         4                 4
## 5                  2       3         3       5         2                 3
## 6                  1       4         4       3         3                 4
##   Medicine Law Cars Art.exhibitions Religion Countryside..outdoors Dancing
## 1        3   1    1               1        1                     5       3
## 2        1   2    2               2        1                     1       1
## 3        2   3    1               5        5                     5       5
## 4        2   5    1               5        4                     1       1
## 5        3   2    3               1        4                     4       1
## 6        4   3    5               2        2                     5       1
##   Musical.instruments Writing Passive.sport Active.sport Gardening
## 1                   3       2             1            5         5
## 2                   1       1             1            1         1
## 3                   5       5             5            2         1
## 4                   1       3             1            1         1
## 5                   3       1             3            1         4
## 6                   5       1             5            4         2
##   Celebrities Shopping Science.and.technology Theatre Fun.with.friends
## 1           1        4                      4       2                5
## 2           2        3                      3       2                4
## 3           1        4                      2       5                5
## 4           2        4                      3       1                2
## 5           3        3                      3       2                4
## 6           1        2                      3       1                3
##   Adrenaline.sports Pets
## 1                 4    4
## 2                 2    5
## 3                 5    5
## 4                 1    1
## 5                 2    1
## 6                 3    2
summary(youth_resp)
##     History        Psychology       Politics      Mathematics   
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :3.000   Median :3.000   Median :2.000   Median :2.000  
##  Mean   :3.207   Mean   :3.138   Mean   :2.596   Mean   :2.335  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NA's   :2       NA's   :5       NA's   :1       NA's   :3      
##     Physics         Internet           PC        Economy.Management
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000     
##  1st Qu.:1.000   1st Qu.:4.000   1st Qu.:2.000   1st Qu.:1.000     
##  Median :2.000   Median :4.000   Median :3.000   Median :2.000     
##  Mean   :2.065   Mean   :4.176   Mean   :3.136   Mean   :2.644     
##  3rd Qu.:3.000   3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:4.000     
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000     
##  NA's   :3       NA's   :4       NA's   :6       NA's   :5         
##     Biology        Chemistry        Reading        Geography    
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :2.000   Median :3.000   Median :3.000  
##  Mean   :2.665   Mean   :2.165   Mean   :3.159   Mean   :3.083  
##  3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:5.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NA's   :6       NA's   :10      NA's   :6       NA's   :9      
##  Foreign.languages    Medicine          Law             Cars      
##  Min.   :1.000     Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000     1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :4.000     Median :2.000   Median :2.000   Median :3.000  
##  Mean   :3.778     Mean   :2.516   Mean   :2.257   Mean   :2.687  
##  3rd Qu.:5.000     3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:4.000  
##  Max.   :5.000     Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NA's   :5         NA's   :5       NA's   :1       NA's   :4      
##  Art.exhibitions    Religion     Countryside..outdoors    Dancing     
##  Min.   :1.00    Min.   :1.000   Min.   :1.000         Min.   :1.000  
##  1st Qu.:1.00    1st Qu.:1.000   1st Qu.:3.000         1st Qu.:1.000  
##  Median :2.00    Median :2.000   Median :4.000         Median :2.000  
##  Mean   :2.59    Mean   :2.273   Mean   :3.687         Mean   :2.462  
##  3rd Qu.:4.00    3rd Qu.:3.000   3rd Qu.:5.000         3rd Qu.:4.000  
##  Max.   :5.00    Max.   :5.000   Max.   :5.000         Max.   :5.000  
##  NA's   :6       NA's   :3       NA's   :7             NA's   :3      
##  Musical.instruments    Writing      Passive.sport    Active.sport  
##  Min.   :1.000       Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000       1st Qu.:1.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000       Median :1.000   Median :3.000   Median :3.000  
##  Mean   :2.324       Mean   :1.901   Mean   :3.388   Mean   :3.291  
##  3rd Qu.:4.000       3rd Qu.:3.000   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :5.000       Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  NA's   :1           NA's   :6       NA's   :15      NA's   :4      
##    Gardening      Celebrities       Shopping     Science.and.technology
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000         
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:2.000         
##  Median :1.000   Median :2.000   Median :3.000   Median :3.000         
##  Mean   :1.907   Mean   :2.362   Mean   :3.277   Mean   :3.234         
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:4.000         
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000         
##  NA's   :7       NA's   :2       NA's   :2       NA's   :6             
##     Theatre      Fun.with.friends Adrenaline.sports      Pets      
##  Min.   :1.000   Min.   :2.000    Min.   :1.000     Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:4.000    1st Qu.:2.000     1st Qu.:2.000  
##  Median :3.000   Median :5.000    Median :3.000     Median :4.000  
##  Mean   :3.025   Mean   :4.558    Mean   :2.948     Mean   :3.335  
##  3rd Qu.:4.000   3rd Qu.:5.000    3rd Qu.:4.000     3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000    Max.   :5.000     Max.   :5.000  
##  NA's   :8       NA's   :4        NA's   :3         NA's   :4

DIAMOND

library(rpart) #To construct CART models
library(rpart.plot) # It also includes titanic data
library(rattle.data) #For visualization


ggplot(diamonds) + geom_point(aes(x = carat, y = price, color = cut ))

set.seed(503)
library(tidyverse)
diamonds_test <- diamonds %>% mutate(diamond_id = row_number()) %>%
group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()
diamonds_train <- anti_join(diamonds %>% mutate(diamond_id = row_number()),
diamonds_test, by = "diamond_id")
diamonds_train
## # A tibble: 43,143 x 11
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  6 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  7 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  8 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
##  9 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
## 10 0.3   Good      J     SI1      64      55   339  4.25  4.28  2.73
## # ... with 43,133 more rows, and 1 more variable: diamond_id <int>

Spam Data

library(tidyverse)
library(rpart)
library(rpart.plot)
load("/cloud/project/spam_data.RData")
head(spam_data)
## # A tibble: 6 x 59
##   train_test spam_or_not    V1    V2    V3    V4    V5    V6    V7    V8
##        <dbl>       <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1          0           1  0     0.64  0.64     0  0.32  0     0     0   
## 2          0           1  0.21  0.28  0.5      0  0.14  0.28  0.21  0.07
## 3          0           1  0.06  0     0.71     0  1.23  0.19  0.19  0.12
## 4          0           1  0     0     0        0  0.63  0     0.31  0.63
## 5          0           1  0     0     0        0  0.63  0     0.31  0.63
## 6          0           1  0     0     0        0  1.85  0     0     1.85
## # ... with 49 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>, V12 <dbl>,
## #   V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>,
## #   V19 <dbl>, V20 <dbl>, V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>,
## #   V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>,
## #   V31 <dbl>, V32 <dbl>, V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>,
## #   V37 <dbl>, V38 <dbl>, V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>,
## #   V43 <dbl>, V44 <dbl>, V45 <dbl>, V46 <dbl>, V47 <dbl>, V48 <dbl>,
## #   V49 <dbl>, V50 <dbl>, V51 <dbl>, V52 <dbl>, V53 <dbl>, V54 <dbl>,
## #   V55 <dbl>, V56 <int>, V57 <int>
spam_train <- spam_data %>% filter(train_test == 0) %>% select(-train_test)
spam_model <- rpart(spam_or_not ~ ., data = spam_train)
rpart.plot(spam_model)

spam_in_sample <- predict(spam_model)
head(spam_in_sample)
##          1          2          3          4          5          6 
## 0.86772487 0.94753747 0.94753747 0.90812721 0.90812721 0.05763952
in_sample_prediction <-
  cbind(
    spam_in_sample %>% tbl_df %>%
      transmute(spam_predict = ifelse(spam_in_sample >= 0.5,1,0)),
    spam_train %>% tbl_df %>%
      transmute(spam_actual = ifelse(spam_or_not == 1,1,0))
  ) %>%
  mutate(correct_class = (spam_predict == spam_actual)) %>%
  group_by(correct_class) %>%
  summarise(count=n(),percentage=n()/nrow(.))
in_sample_prediction
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           398     0.0970
## 2 TRUE           3703     0.903