Observing the data:
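The functions used throughout this report come from several packages that are not shown being loaded in the original output; the list below is inferred from the calls used, so treat it as a minimal setup sketch.
# Assumed packages: ggplot2 supplies the diamonds data, dplyr supplies glimpse(),
# rpart and rpart.plot build and draw the regression tree, caret supplies train()
library(ggplot2)
library(dplyr)
library(rpart)
library(rpart.plot)
library(caret)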
data(diamonds)
head(diamonds)
## # A tibble: 6 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
glimpse(diamonds)
## Observations: 53,940
## Variables: 10
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, ...
## $ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very G...
## $ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, ...
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI...
## $ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, ...
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54...
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339,...
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, ...
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, ...
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, ...
A linear model was fit to the whole dataset to predict price, and its in-sample RMSE was computed.
model <- lm(price ~ ., diamonds)
model
##
## Call:
## lm(formula = price ~ ., data = diamonds)
##
## Coefficients:
## (Intercept) carat cut.L cut.Q cut.C
## 5753.762 11256.978 584.457 -301.908 148.035
## cut^4 color.L color.Q color.C color^4
## -20.794 -1952.160 -672.054 -165.283 38.195
## color^5 color^6 clarity.L clarity.Q clarity.C
## -95.793 -48.466 4097.431 -1925.004 982.205
## clarity^4 clarity^5 clarity^6 clarity^7 depth
## -364.918 233.563 6.883 90.640 -63.806
## table x y z
## -26.474 -1008.261 9.609 -50.119
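The cut.L, cut.Q, cut.C and similar terms appear because cut, color and clarity are ordered factors, which lm() encodes with orthogonal polynomial contrasts (linear, quadratic, cubic, ...) by default. The encoding can be inspected directly:
# Contrast matrix used for the ordered factor cut (.L = linear, .Q = quadratic, .C = cubic, ^4 = quartic)
contrasts(diamonds$cut)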
p <- predict(model, diamonds)
error <- p - diamonds$price
sqrt(mean(error^2))
## [1] 1129.843
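The same root-mean-squared-error calculation is reused below, so it can be wrapped in a small helper; the function name rmse is not part of the original code, just a convenience sketch.
# Hypothetical helper: RMSE between predictions and observed values
rmse <- function(predicted, actual) sqrt(mean((predicted - actual)^2))
rmse(p, diamonds$price)  # reproduces the in-sample RMSE above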
Then, the data was shuffled and split into an 80% training set and a 20% test set.
set.seed(354)
rows <- sample(nrow(diamonds))
head(rows)
## [1] 22142 41030 27447 29157 8361 50348
diamonds <- diamonds[rows, ]
split <- round(nrow(diamonds) * .80)
split
## [1] 43152
train <- diamonds[1:split,]
test <- diamonds[(split + 1):nrow(diamonds),]
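Because caret is used later anyway, an equivalent split could also be produced with createDataPartition(), which stratifies on the outcome; this is an alternative sketch, not the split the results below are based on.
# Alternative (assumed) 80/20 split stratified on price, using caret
in_train    <- createDataPartition(diamonds$price, p = 0.80, list = FALSE)[, 1]
train_strat <- diamonds[in_train, ]
test_strat  <- diamonds[-in_train, ]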
A linear model was then fit on the training data and evaluated on the test set. The resulting test RMSE (about 1091) is slightly lower than the in-sample RMSE obtained on the full data above (about 1130).
model <- lm(price ~ ., train)
model
##
## Call:
## lm(formula = price ~ ., data = train)
##
## Coefficients:
## (Intercept) carat cut.L cut.Q cut.C
## 5752.1128 11170.0700 577.4729 -302.9939 140.8146
## cut^4 color.L color.Q color.C color^4
## -29.8035 -1977.1021 -677.4466 -157.9084 40.5503
## color^5 color^6 clarity.L clarity.Q clarity.C
## -88.4332 -47.8604 4118.0989 -1882.0788 991.5657
## clarity^4 clarity^5 clarity^6 clarity^7 depth
## -350.0420 238.3572 7.4447 87.0422 -62.5827
## table x y z
## -30.3431 -976.8938 -0.8539 -19.2194
p <- predict(model, test)
error <- p - test$price
sqrt(mean(error^2))
## [1] 1091.328
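A quick visual check of the test-set fit can be made by plotting predicted against actual prices; this plot is not part of the original output, just a sketch using ggplot2.
# Predicted vs. actual test-set prices; points near the dashed identity line indicate a good fit
ggplot(data.frame(actual = test$price, predicted = p), aes(actual, predicted)) +
  geom_point(alpha = 0.2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "red")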
A CART regression tree was then fit to the training data and plotted:
diamond_model <- rpart(price ~ ., data = train)
rpart.plot(diamond_model)
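For comparison with the linear model, the tree's test-set RMSE can be computed as well; these lines and their result are not part of the original output.
# Test-set RMSE of the regression tree
tree_pred <- predict(diamond_model, newdata = test)
sqrt(mean((tree_pred - test$price)^2))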
Finally, k-fold cross-validation with k = 5 was applied with caret (on the full, shuffled dataset) to obtain a more reliable estimate of the linear model's out-of-sample error.
model <- train(
  price ~ ., diamonds,
  method = "lm",
  trControl = trainControl(
    method = "cv", number = 5,
    verboseIter = TRUE
  )
)
## + Fold1: intercept=TRUE
## - Fold1: intercept=TRUE
## + Fold2: intercept=TRUE
## - Fold2: intercept=TRUE
## + Fold3: intercept=TRUE
## - Fold3: intercept=TRUE
## + Fold4: intercept=TRUE
## - Fold4: intercept=TRUE
## + Fold5: intercept=TRUE
## - Fold5: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
model
## Linear Regression
##
## 53940 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 43152, 43152, 43152, 43152, 43152
## Resampling results:
##
## RMSE Rsquared MAE
## 1136.362 0.9189443 741.2053
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
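caret also keeps the per-fold metrics, which show how stable the error estimate is across the five folds:
# RMSE, R-squared and MAE for each of the five folds
model$resample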
summary(model)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21376.0 -592.4 -183.5 376.4 10694.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5753.762 396.630 14.507 < 2e-16 ***
## carat 11256.978 48.628 231.494 < 2e-16 ***
## cut.L 584.457 22.478 26.001 < 2e-16 ***
## cut.Q -301.908 17.994 -16.778 < 2e-16 ***
## cut.C 148.035 15.483 9.561 < 2e-16 ***
## `cut^4` -20.794 12.377 -1.680 0.09294 .
## color.L -1952.160 17.342 -112.570 < 2e-16 ***
## color.Q -672.054 15.777 -42.597 < 2e-16 ***
## color.C -165.283 14.725 -11.225 < 2e-16 ***
## `color^4` 38.195 13.527 2.824 0.00475 **
## `color^5` -95.793 12.776 -7.498 6.59e-14 ***
## `color^6` -48.466 11.614 -4.173 3.01e-05 ***
## clarity.L 4097.431 30.259 135.414 < 2e-16 ***
## clarity.Q -1925.004 28.227 -68.197 < 2e-16 ***
## clarity.C 982.205 24.152 40.668 < 2e-16 ***
## `clarity^4` -364.918 19.285 -18.922 < 2e-16 ***
## `clarity^5` 233.563 15.752 14.828 < 2e-16 ***
## `clarity^6` 6.883 13.715 0.502 0.61575
## `clarity^7` 90.640 12.103 7.489 7.06e-14 ***
## depth -63.806 4.535 -14.071 < 2e-16 ***
## table -26.474 2.912 -9.092 < 2e-16 ***
## x -1008.261 32.898 -30.648 < 2e-16 ***
## y 9.609 19.333 0.497 0.61918
## z -50.119 33.486 -1.497 0.13448
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1130 on 53916 degrees of freedom
## Multiple R-squared: 0.9198, Adjusted R-squared: 0.9198
## F-statistic: 2.688e+04 on 23 and 53916 DF, p-value: < 2.2e-16
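The object returned by train() can be used for prediction directly; caret refits the chosen model on the full dataset and dispatches predict() to it. A short sketch (not part of the original output):
# Predict prices for a few rows with the final cross-validated model
predict(model, newdata = head(diamonds))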
Reference: Machine Learning Toolbox, https://rpubs.com/williamsurles/310197