The aim of this study is to predict diamond prices using an existing dataset. Dataset set consists of different properties of diamonds, some of which are carat, cut, color, clarity.

First we load tidyverse,rpart and our dataset. After that we form our train and test data.

set.seed(503)
library(tidyverse)
library(rpart)
library(rpart.plot)
diamonds_test <- diamonds %>% mutate(diamond_id= row_number()) %>%
  group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()

diamonds_train <- anti_join(diamonds %>% mutate(diamond_id=row_number()),
diamonds_test, by="diamond_id")
diamonds_train
## # A tibble: 43,143 x 11
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1 0.230 Ideal     E     SI2      61.5  55.0   326  3.95  3.98  2.43
##  2 0.210 Premium   E     SI1      59.8  61.0   326  3.89  3.84  2.31
##  3 0.230 Good      E     VS1      56.9  65.0   327  4.05  4.07  2.31
##  4 0.290 Premium   I     VS2      62.4  58.0   334  4.20  4.23  2.63
##  5 0.240 Very Good J     VVS2     62.8  57.0   336  3.94  3.96  2.48
##  6 0.240 Very Good I     VVS1     62.3  57.0   336  3.95  3.98  2.47
##  7 0.260 Very Good H     SI1      61.9  55.0   337  4.07  4.11  2.53
##  8 0.220 Fair      E     VS2      65.1  61.0   337  3.87  3.78  2.49
##  9 0.230 Very Good H     VS1      59.4  61.0   338  4.00  4.05  2.39
## 10 0.300 Good      J     SI1      64.0  55.0   339  4.25  4.28  2.73
## # ... with 43,133 more rows, and 1 more variable: diamond_id <int>

Then we create our model using CART method.

#creating model https://mef-bda503.github.io/pj-nesipoglud/files/Diamonds.html

m1 <- rpart(price ~ ., data=diamonds_train, method ="anova")

We make our prediction using the model.

#prediction

diamonds_in_sample <- predict(m1)
print(head(diamonds_in_sample))
##        1        2        3        4        5        6 
## 1056.168 1056.168 1056.168 1056.168 1056.168 1056.168

Lastly we calculate our mean absolute error and estimate prices for test values.

#Mean Absolute Error and Test with diamonds_test

Mean_Absolute_Error <- function(actual, predicted) {mean(abs(actual - predicted))}
Mean_Absolute_Error(diamonds_train$price, diamonds_in_sample)
## [1] 614.0494
modelEstimate <- predict(m1,newdata = diamonds_test)
diamonds_test1 <- diamonds_test %>% select(price)
print(diamonds_test1)
## # A tibble: 10,797 x 1
##    price
##    <int>
##  1 15964
##  2  3205
##  3  3384
##  4  3634
##  5  2358
##  6  4398
##  7  2167
##  8  3743
##  9  4725
## 10 15627
## # ... with 10,787 more rows
print(head(modelEstimate))
##         1         2         3         4         5         6 
## 15901.960  3073.487  3073.487  4887.342  3073.487  4887.342