Observing the data:
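The functions used throughout this report come from several packages that are not shown being loaded in the original output; the list below is inferred from the calls used, so treat it as a minimal setup sketch.
# Assumed packages: ggplot2 supplies the diamonds data, dplyr supplies glimpse(),
# rpart and rpart.plot build and draw the regression tree, caret supplies train()
library(ggplot2)
library(dplyr)
library(rpart)
library(rpart.plot)
library(caret)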
data(diamonds)
head(diamonds)
## # A tibble: 6 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
glimpse(diamonds)
## Observations: 53,940
## Variables: 10
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, ...
## $ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very G...
## $ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, ...
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI...
## $ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, ...
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54...
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339,...
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, ...
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, ...
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, ...
A linear model was fit to the whole dataset to predict price, and its in-sample RMSE was computed.
model <- lm(price ~ ., diamonds)
model
##
## Call:
## lm(formula = price ~ ., data = diamonds)
##
## Coefficients:
## (Intercept) carat cut.L cut.Q cut.C
## 5753.762 11256.978 584.457 -301.908 148.035
## cut^4 color.L color.Q color.C color^4
## -20.794 -1952.160 -672.054 -165.283 38.195
## color^5 color^6 clarity.L clarity.Q clarity.C
## -95.793 -48.466 4097.431 -1925.004 982.205
## clarity^4 clarity^5 clarity^6 clarity^7 depth
## -364.918 233.563 6.883 90.640 -63.806
## table x y z
## -26.474 -1008.261 9.609 -50.119
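The cut.L, cut.Q, cut.C and similar terms appear because cut, color and clarity are ordered factors, which lm() encodes with orthogonal polynomial contrasts (linear, quadratic, cubic, ...) by default. The encoding can be inspected directly:
# Contrast matrix used for the ordered factor cut (.L = linear, .Q = quadratic, .C = cubic, ^4 = quartic)
contrasts(diamonds$cut)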
p <- predict(model, diamonds)
error <- p - diamonds$price
sqrt(mean(error^2))
## [1] 1129.843
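The same root-mean-squared-error calculation is reused below, so it can be wrapped in a small helper; the function name rmse is not part of the original code, just a convenience sketch.
# Hypothetical helper: RMSE between predictions and observed values
rmse <- function(predicted, actual) sqrt(mean((predicted - actual)^2))
rmse(p, diamonds$price)  # reproduces the in-sample RMSE above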
Then, the data was shuffled and split into an 80% training set and a 20% test set.
set.seed(354)
rows <- sample(nrow(diamonds))
head(rows)
## [1] 22142 41030 27447 29157 8361 50348
diamonds <- diamonds[rows, ]
split <- round(nrow(diamonds) * .80)
split
## [1] 43152
train <- diamonds[1:split,]
test <- diamonds[(split + 1):nrow(diamonds),]
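Because caret is used later anyway, an equivalent split could also be produced with createDataPartition(), which stratifies on the outcome; this is an alternative sketch, not the split the results below are based on.
# Alternative (assumed) 80/20 split stratified on price, using caret
in_train    <- createDataPartition(diamonds$price, p = 0.80, list = FALSE)[, 1]
train_strat <- diamonds[in_train, ]
test_strat  <- diamonds[-in_train, ]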
A linear model was then fit on the training data and evaluated on the test set. The resulting test RMSE (about 1091) is slightly lower than the in-sample RMSE obtained on the full data above (about 1130).
model <- lm(price ~ ., train)
model
##
## Call:
## lm(formula = price ~ ., data = train)
##
## Coefficients:
## (Intercept) carat cut.L cut.Q cut.C
## 5752.1128 11170.0700 577.4729 -302.9939 140.8146
## cut^4 color.L color.Q color.C color^4
## -29.8035 -1977.1021 -677.4466 -157.9084 40.5503
## color^5 color^6 clarity.L clarity.Q clarity.C
## -88.4332 -47.8604 4118.0989 -1882.0788 991.5657
## clarity^4 clarity^5 clarity^6 clarity^7 depth
## -350.0420 238.3572 7.4447 87.0422 -62.5827
## table x y z
## -30.3431 -976.8938 -0.8539 -19.2194
p <- predict(model, test)
error <- p - test$price
sqrt(mean(error^2))
## [1] 1091.328
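A quick visual check of the test-set fit can be made by plotting predicted against actual prices; this plot is not part of the original output, just a sketch using ggplot2.
# Predicted vs. actual test-set prices; points near the dashed identity line indicate a good fit
ggplot(data.frame(actual = test$price, predicted = p), aes(actual, predicted)) +
  geom_point(alpha = 0.2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "red")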
A CART regression tree was then fit to the training data and plotted:
diamond_model <- rpart(price ~ ., data = train)
rpart.plot(diamond_model)
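For comparison with the linear model, the tree's test-set RMSE can be computed as well; these lines and their result are not part of the original output.
# Test-set RMSE of the regression tree
tree_pred <- predict(diamond_model, newdata = test)
sqrt(mean((tree_pred - test$price)^2))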
Finally, k-fold cross-validation with k = 5 was applied with caret (on the full, shuffled dataset) to obtain a more reliable estimate of the linear model's out-of-sample error.
model <- train(
  price ~ ., diamonds,
  method = "lm",
  trControl = trainControl(
    method = "cv", number = 5,
    verboseIter = TRUE
  )
)
## + Fold1: intercept=TRUE
## - Fold1: intercept=TRUE
## + Fold2: intercept=TRUE
## - Fold2: intercept=TRUE
## + Fold3: intercept=TRUE
## - Fold3: intercept=TRUE
## + Fold4: intercept=TRUE
## - Fold4: intercept=TRUE
## + Fold5: intercept=TRUE
## - Fold5: intercept=TRUE
## Aggregating results
## Fitting final model on full training set
model
## Linear Regression
##
## 53940 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 43152, 43152, 43152, 43152, 43152
## Resampling results:
##
## RMSE Rsquared MAE
## 1136.362 0.9189443 741.2053
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
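caret also keeps the per-fold metrics, which show how stable the error estimate is across the five folds:
# RMSE, R-squared and MAE for each of the five folds
model$resample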
summary(model)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21376.0 -592.4 -183.5 376.4 10694.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5753.762 396.630 14.507 < 2e-16 ***
## carat 11256.978 48.628 231.494 < 2e-16 ***
## cut.L 584.457 22.478 26.001 < 2e-16 ***
## cut.Q -301.908 17.994 -16.778 < 2e-16 ***
## cut.C 148.035 15.483 9.561 < 2e-16 ***
## `cut^4` -20.794 12.377 -1.680 0.09294 .
## color.L -1952.160 17.342 -112.570 < 2e-16 ***
## color.Q -672.054 15.777 -42.597 < 2e-16 ***
## color.C -165.283 14.725 -11.225 < 2e-16 ***
## `color^4` 38.195 13.527 2.824 0.00475 **
## `color^5` -95.793 12.776 -7.498 6.59e-14 ***
## `color^6` -48.466 11.614 -4.173 3.01e-05 ***
## clarity.L 4097.431 30.259 135.414 < 2e-16 ***
## clarity.Q -1925.004 28.227 -68.197 < 2e-16 ***
## clarity.C 982.205 24.152 40.668 < 2e-16 ***
## `clarity^4` -364.918 19.285 -18.922 < 2e-16 ***
## `clarity^5` 233.563 15.752 14.828 < 2e-16 ***
## `clarity^6` 6.883 13.715 0.502 0.61575
## `clarity^7` 90.640 12.103 7.489 7.06e-14 ***
## depth -63.806 4.535 -14.071 < 2e-16 ***
## table -26.474 2.912 -9.092 < 2e-16 ***
## x -1008.261 32.898 -30.648 < 2e-16 ***
## y 9.609 19.333 0.497 0.61918
## z -50.119 33.486 -1.497 0.13448
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1130 on 53916 degrees of freedom
## Multiple R-squared: 0.9198, Adjusted R-squared: 0.9198
## F-statistic: 2.688e+04 on 23 and 53916 DF, p-value: < 2.2e-16
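The object returned by train() can be used for prediction directly; caret refits the chosen model on the full dataset and dispatches predict() to it. A short sketch (not part of the original output):
# Predict prices for a few rows with the final cross-validated model
predict(model, newdata = head(diamonds))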
Reference: Machine Learning Toolbox, https://rpubs.com/williamsurles/310197