Split data to 2 frames to train and test
diamondsTest <- diamonds %>% mutate(diamond_id = row_number()) %>%
group_by(cut, color, clarity) %>% sample_frac(0.2) %>% ungroup()
diamondsTrain <- anti_join(diamonds %>% mutate(diamond_id = row_number()),
diamondsTest, by = "diamond_id")
Crate model and make prediction, exlude diamon_id and set price as response column while creating model
pricingModel = rpart(price ~., data = (diamondsTrain %>% select(-diamond_id) ))
pricingPrediction <- predict(pricingModel, newdata = diamondsTest %>% select(-diamond_id))
Append predicted prices to test datafarme
diamondsTest$predictedPrice = pricingPrediction
Calculate the price change percentage
diamondsTest$priceChange =abs ((diamondsTest$price - diamondsTest$predictedPrice) / diamondsTest$price)
Lets see mean of the price deviation with predicted price
mean(diamondsTest$priceChange)
## [1] 0.3231341
Create lienar model with with features, then make the prediction
linearModel = lm(price~carat+ cut + color + clarity +depth+table+x+y+z,data = diamondsTrain)
linearPrediction = predict.lm(linearModel,newdata = diamondsTest)
Append the result to test data farme and calculate the price deviation
diamondsTest$linearPredictedPrice = linearPrediction
diamondsTest$linearPredictedPriceChange = abs ((diamondsTest$price - diamondsTest$linearPredictedPrice) / diamondsTest$price)
See mean of the price deviation with predicted price with Linear Model
mean(diamondsTest$linearPredictedPriceChange)
## [1] 0.3912863
I did the same operation with both rpart and Linear Model, in rpart I got %32 devication on prices, however in Linear Model I got %39 for same metric. In this case using rpart is better than using linear model.
These are the graphs that I created before I did the real homework to see corelation between features. This are not required in homework
gg <- ggplot(diamondsTrain, aes(x=diamondsTrain$price, y=diamondsTrain$carat)) +
geom_point(aes(col=diamondsTrain$cut, size=diamondsTrain$depth)) +
geom_smooth(method="loess", se=F) +
labs(subtitle="Price Vs Carat",
y="Carat",
x="Price",
title="Diamond Price Analysis",
caption = "Source: Diamonds Train")
plot(gg)
theme_set(theme_bw())
g <- ggplot(diamondsTrain, aes(diamondsTrain$price, diamondsTrain$table))
g + geom_jitter(width = .3, size=0.5) +
labs(subtitle="Price Vs Table",
y="Table",
x="Price",
title="Diamond Price Analysis")
theme_set(theme_bw())
g <- ggplot(diamondsTrain, aes(diamondsTrain$price, diamondsTrain$depth)) +
labs(subtitle="Price Vs Depth",
title="Diamond Price Analysis")
g + geom_jitter(aes(col=diamondsTrain$color, size=diamondsTrain$x)) +
geom_smooth(aes(col=diamondsTrain$color), method="lm", se=F)
theme_set(theme_bw())
ggplot(diamondsTrain, aes(x=diamondsTrain$cut, y=diamondsTrain$carat, label=diamondsTrain$carat)) +
geom_point(stat='identity', fill="blue", size=6) +
geom_segment(aes(y = 0,
x = diamondsTrain$cut,
yend = diamondsTrain$carat,
xend = diamondsTrain$cut),
color = "black") +
geom_text(color="white", size=2) +
labs(title="Diamond Price Analysis") +
ylim(0, 10) +
coord_flip()
theme_set(theme_classic())
g <- ggplot(diamondsTrain, aes(diamondsTrain$color))
g + geom_bar(aes(fill=diamondsTrain$cut), width = 0.5) +
theme(axis.text.x = element_text(angle=65, vjust=0.6)) +
labs(title="Diamond Color Vs Cut Analysis")