The objective of this assignment is to help you become self-sufficient at obtaining data from outside sources, analyzing the data, validating your model, and conveying your conclusions in a clear and reproducible way.

# Print the esoph data frame in full so every row can be inspected.
esoph 
##    agegp     alcgp    tobgp ncases ncontrols
## 1  25-34 0-39g/day 0-9g/day      0        40
## 2  25-34 0-39g/day    10-19      0        10
## 3  25-34 0-39g/day    20-29      0         6
## 4  25-34 0-39g/day      30+      0         5
## 5  25-34     40-79 0-9g/day      0        27
## 6  25-34     40-79    10-19      0         7
## 7  25-34     40-79    20-29      0         4
## 8  25-34     40-79      30+      0         7
## 9  25-34    80-119 0-9g/day      0         2
## 10 25-34    80-119    10-19      0         1
## 11 25-34    80-119      30+      0         2
## 12 25-34      120+ 0-9g/day      0         1
## 13 25-34      120+    10-19      1         1
## 14 25-34      120+    20-29      0         1
## 15 25-34      120+      30+      0         2
## 16 35-44 0-39g/day 0-9g/day      0        60
## 17 35-44 0-39g/day    10-19      1        14
## 18 35-44 0-39g/day    20-29      0         7
## 19 35-44 0-39g/day      30+      0         8
## 20 35-44     40-79 0-9g/day      0        35
## 21 35-44     40-79    10-19      3        23
## 22 35-44     40-79    20-29      1        14
## 23 35-44     40-79      30+      0         8
## 24 35-44    80-119 0-9g/day      0        11
## 25 35-44    80-119    10-19      0         6
## 26 35-44    80-119    20-29      0         2
## 27 35-44    80-119      30+      0         1
## 28 35-44      120+ 0-9g/day      2         3
## 29 35-44      120+    10-19      0         3
## 30 35-44      120+    20-29      2         4
## 31 45-54 0-39g/day 0-9g/day      1        46
## 32 45-54 0-39g/day    10-19      0        18
## 33 45-54 0-39g/day    20-29      0        10
## 34 45-54 0-39g/day      30+      0         4
## 35 45-54     40-79 0-9g/day      6        38
## 36 45-54     40-79    10-19      4        21
## 37 45-54     40-79    20-29      5        15
## 38 45-54     40-79      30+      5         7
## 39 45-54    80-119 0-9g/day      3        16
## 40 45-54    80-119    10-19      6        14
## 41 45-54    80-119    20-29      1         5
## 42 45-54    80-119      30+      2         4
## 43 45-54      120+ 0-9g/day      4         4
## 44 45-54      120+    10-19      3         4
## 45 45-54      120+    20-29      2         3
## 46 45-54      120+      30+      4         4
## 47 55-64 0-39g/day 0-9g/day      2        49
## 48 55-64 0-39g/day    10-19      3        22
## 49 55-64 0-39g/day    20-29      3        12
## 50 55-64 0-39g/day      30+      4         6
## 51 55-64     40-79 0-9g/day      9        40
## 52 55-64     40-79    10-19      6        21
## 53 55-64     40-79    20-29      4        17
## 54 55-64     40-79      30+      3         6
## 55 55-64    80-119 0-9g/day      9        18
## 56 55-64    80-119    10-19      8        15
## 57 55-64    80-119    20-29      3         6
## 58 55-64    80-119      30+      4         4
## 59 55-64      120+ 0-9g/day      5        10
## 60 55-64      120+    10-19      6         7
## 61 55-64      120+    20-29      2         3
## 62 55-64      120+      30+      5         6
## 63 65-74 0-39g/day 0-9g/day      5        48
## 64 65-74 0-39g/day    10-19      4        14
## 65 65-74 0-39g/day    20-29      2         7
## 66 65-74 0-39g/day      30+      0         2
## 67 65-74     40-79 0-9g/day     17        34
## 68 65-74     40-79    10-19      3        10
## 69 65-74     40-79    20-29      5         9
## 70 65-74    80-119 0-9g/day      6        13
## 71 65-74    80-119    10-19      4        12
## 72 65-74    80-119    20-29      2         3
## 73 65-74    80-119      30+      1         1
## 74 65-74      120+ 0-9g/day      3         4
## 75 65-74      120+    10-19      1         2
## 76 65-74      120+    20-29      1         1
## 77 65-74      120+      30+      1         1
## 78   75+ 0-39g/day 0-9g/day      1        18
## 79   75+ 0-39g/day    10-19      2         6
## 80   75+ 0-39g/day      30+      1         3
## 81   75+     40-79 0-9g/day      2         5
## 82   75+     40-79    10-19      1         3
## 83   75+     40-79    20-29      0         3
## 84   75+     40-79      30+      1         1
## 85   75+    80-119 0-9g/day      1         1
## 86   75+    80-119    10-19      1         1
## 87   75+      120+ 0-9g/day      2         2
## 88   75+      120+    10-19      1         1
# Attach the packages used below. library() stops with an error when a package
# is unavailable, whereas require() only returns FALSE and lets the script
# carry on until a later, less obvious failure.
library(stats)
library(graphics) # for mosaicplot
# Per-column overview: level counts for the factors, quartiles for the counts.
summary(esoph)
##    agegp          alcgp         tobgp        ncases         ncontrols    
##  25-34:15   0-39g/day:23   0-9g/day:24   Min.   : 0.000   Min.   : 1.00  
##  35-44:15   40-79    :23   10-19   :24   1st Qu.: 0.000   1st Qu.: 3.00  
##  45-54:16   80-119   :21   20-29   :20   Median : 1.000   Median : 6.00  
##  55-64:16   120+     :21   30+     :20   Mean   : 2.273   Mean   :11.08  
##  65-74:15                                3rd Qu.: 4.000   3rd Qu.:14.00  
##  75+  :11                                Max.   :17.000   Max.   :60.00
# Binomial (logit link) model of cases vs. controls: age group as a main
# effect, plus tobacco, alcohol and the tobacco-by-alcohol interaction.
model1 <- glm(
  formula = cbind(ncases, ncontrols) ~ agegp + tobgp * alcgp,
  family = binomial(),
  data = esoph
)
# Sequential analysis-of-deviance table for the terms above.
anova(model1)
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: cbind(ncases, ncontrols)
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev
## NULL                           87    227.241
## agegp        5   88.128        82    139.112
## tobgp        3   19.085        79    120.028
## alcgp        3   66.054        76     53.973
## tobgp:alcgp  9    6.489        67     47.484

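Try a linear effect of alcohol and tobacco. `unclass()` replaces each ordered factor with its integer level codes, so each variable enters the model as a single numeric term instead of one term per contrast.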

# Linear-trend model: unclass() turns each ordered factor into its integer
# level codes, so tobacco and alcohol each contribute one numeric term.
model2 <- glm(
  formula = cbind(ncases, ncontrols) ~ agegp + unclass(tobgp) + unclass(alcgp),
  family = binomial(),
  data = esoph
)

# Coefficient table, deviances and AIC for the linear-trend fit.
summary(model2)
## 
## Call:
## glm(formula = cbind(ncases, ncontrols) ~ agegp + unclass(tobgp) + 
##     unclass(alcgp), family = binomial(), data = esoph)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7628  -0.6426  -0.2709   0.3043   2.0421  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -4.01097    0.31224 -12.846  < 2e-16 ***
## agegp.L         2.96113    0.65092   4.549 5.39e-06 ***
## agegp.Q        -1.33735    0.58918  -2.270  0.02322 *  
## agegp.C         0.15292    0.44792   0.341  0.73281    
## agegp^4         0.06668    0.30776   0.217  0.82848    
## agegp^5        -0.20288    0.19523  -1.039  0.29872    
## unclass(tobgp)  0.26162    0.08198   3.191  0.00142 ** 
## unclass(alcgp)  0.65308    0.08452   7.727 1.10e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 227.241  on 87  degrees of freedom
## Residual deviance:  59.277  on 80  degrees of freedom
## AIC: 222.76
## 
## Number of Fisher Scoring iterations: 6

Re-arrange data for a mosaic plot

# Cross-tabulate case and control counts by age, alcohol and tobacco group.
# xtabs() adds each row's count to the cell matching its factor levels, so the
# result does not depend on row order or on every combination occurring
# exactly once; combinations absent from the data become 0.
ttt <- xtabs(ncases ~ agegp + alcgp + tobgp, data = esoph)
tt1 <- xtabs(ncontrols ~ agegp + alcgp + tobgp, data = esoph)

# xtabs() names the dimensions after the variables; remove those names so they
# are not picked up as axis titles by mosaicplot().
names(dimnames(ttt)) <- NULL
names(dimnames(tt1)) <- NULL

# Stack both tables along a fourth dimension: cases vs. controls.
tt <- array(
  c(ttt, tt1),
  dim = c(dim(ttt), 2),
  dimnames = c(dimnames(ttt), list(c("Cancer", "control")))
)
mosaicplot(tt, main = "esoph data set", color = TRUE)

References

http://stat.ethz.ch/R-manual/R-devel/library/datasets/html/esoph.html