Section - 1 Data Preparation

Split data to 2 frames to train and test

diamondsTest <- diamonds %>% mutate(diamond_id = row_number()) %>%
  group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()
diamondsTrain <- anti_join(diamonds %>% mutate(diamond_id = row_number()),
                            diamondsTest, by = "diamond_id")

Section - 2 Predicting Price With rpart

Crate model and make prediction, exlude diamon_id and set price as response column while creating model

pricingModel = rpart(price ~., data = (diamondsTrain %>% select(-diamond_id) ))
pricingPrediction <- predict(pricingModel, newdata = diamondsTest %>% select(-diamond_id))

Append predicted prices to test datafarme

diamondsTest$predictedPrice = pricingPrediction

Calculate the price change percentage

diamondsTest$priceChange =abs ((diamondsTest$price - diamondsTest$predictedPrice) / diamondsTest$price)

Lets see mean of the price deviation with predicted price

mean(diamondsTest$priceChange)
## [1] 0.3231341

Section 3 - Make same analyses with Linear Model

Create lienar model with with features, then make the prediction

linearModel = lm(price~carat+ cut + color + clarity +depth+table+x+y+z,data = diamondsTrain)
linearPrediction = predict.lm(linearModel,newdata = diamondsTest)

Append the result to test data farme and calculate the price deviation

diamondsTest$linearPredictedPrice = linearPrediction
diamondsTest$linearPredictedPriceChange = abs ((diamondsTest$price - diamondsTest$linearPredictedPrice) / diamondsTest$price)

See mean of the price deviation with predicted price with Linear Model

mean(diamondsTest$linearPredictedPriceChange)
## [1] 0.3912863

Section 4 - My Comments

I did the same operation with both rpart and Linear Model, in rpart I got %32 devication on prices, however in Linear Model I got %39 for same metric. In this case using rpart is better than using linear model.

Section 5 - Graph Fantasies

These are the graphs that I created before I did the real homework to see corelation between features. This are not required in homework

Plot 1

gg <- ggplot(diamondsTrain, aes(x=diamondsTrain$price, y=diamondsTrain$carat)) + 
  geom_point(aes(col=diamondsTrain$cut, size=diamondsTrain$depth)) + 
  geom_smooth(method="loess", se=F) + 
  labs(subtitle="Price Vs Carat", 
       y="Carat", 
       x="Price", 
       title="Diamond Price Analysis", 
       caption = "Source: Diamonds Train")
plot(gg)

Plot 2

theme_set(theme_bw())  
g <- ggplot(diamondsTrain, aes(diamondsTrain$price, diamondsTrain$table))
g + geom_jitter(width = .3, size=0.5) +
  labs(subtitle="Price Vs Table", 
       y="Table", 
       x="Price", 
       title="Diamond Price Analysis")

Plot 3

theme_set(theme_bw()) 
g <- ggplot(diamondsTrain, aes(diamondsTrain$price, diamondsTrain$depth)) + 
  labs(subtitle="Price Vs Depth",
       title="Diamond Price Analysis")

g + geom_jitter(aes(col=diamondsTrain$color, size=diamondsTrain$x)) + 
  geom_smooth(aes(col=diamondsTrain$color), method="lm", se=F)

Plot 4

theme_set(theme_bw())

ggplot(diamondsTrain, aes(x=diamondsTrain$cut, y=diamondsTrain$carat, label=diamondsTrain$carat)) + 
  geom_point(stat='identity', fill="blue", size=6)  +
  geom_segment(aes(y = 0, 
                   x = diamondsTrain$cut, 
                   yend = diamondsTrain$carat, 
                   xend = diamondsTrain$cut), 
               color = "black") +
  geom_text(color="white", size=2) +
  labs(title="Diamond Price Analysis") + 
  ylim(0, 10) +
  coord_flip()

Plot 5

theme_set(theme_classic())
g <- ggplot(diamondsTrain, aes(diamondsTrain$color))
g + geom_bar(aes(fill=diamondsTrain$cut), width = 0.5) + 
  theme(axis.text.x = element_text(angle=65, vjust=0.6)) + 
  labs(title="Diamond Color Vs Cut Analysis")