The aim of this study is to predict diamond prices using an existing dataset. Dataset set consists of different properties of diamonds, some of which are carat, cut, color, clarity.
First we load tidyverse,rpart and our dataset. After that we form our train and test data.
set.seed(503)
library(tidyverse)
library(rpart)
library(rpart.plot)
diamonds_test <- diamonds %>% mutate(diamond_id= row_number()) %>%
group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()
diamonds_train <- anti_join(diamonds %>% mutate(diamond_id=row_number()),
diamonds_test, by="diamond_id")
diamonds_train
## # A tibble: 43,143 x 11
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.230 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
## 2 0.210 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
## 3 0.230 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
## 5 0.240 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
## 6 0.240 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
## 7 0.260 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
## 8 0.220 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
## 9 0.230 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39
## 10 0.300 Good J SI1 64.0 55.0 339 4.25 4.28 2.73
## # ... with 43,133 more rows, and 1 more variable: diamond_id <int>
Then we create our model using CART method.
#creating model https://mef-bda503.github.io/pj-nesipoglud/files/Diamonds.html
m1 <- rpart(price ~ ., data=diamonds_train, method ="anova")
We make our prediction using the model.
#prediction
diamonds_in_sample <- predict(m1)
print(head(diamonds_in_sample))
## 1 2 3 4 5 6
## 1056.168 1056.168 1056.168 1056.168 1056.168 1056.168
Lastly we calculate our mean absolute error and estimate prices for test values.
#Mean Absolute Error and Test with diamonds_test
Mean_Absolute_Error <- function(actual, predicted) {mean(abs(actual - predicted))}
Mean_Absolute_Error(diamonds_train$price, diamonds_in_sample)
## [1] 614.0494
modelEstimate <- predict(m1,newdata = diamonds_test)
diamonds_test1 <- diamonds_test %>% select(price)
print(diamonds_test1)
## # A tibble: 10,797 x 1
## price
## <int>
## 1 15964
## 2 3205
## 3 3384
## 4 3634
## 5 2358
## 6 4398
## 7 2167
## 8 3743
## 9 4725
## 10 15627
## # ... with 10,787 more rows
print(head(modelEstimate))
## 1 2 3 4 5 6
## 15901.960 3073.487 3073.487 4887.342 3073.487 4887.342