Preparing the data:

# Load the raw match table from the Excel workbook.
tr_super_league_matches <- read_excel("tr_super_league_matches.xlsx")
# Drop the id, date and result columns, then derive the modelling targets in
# one pass: the home-minus-away goal difference and two logical flags for
# total goals above 2.5 and above 3.5.
tsl_veri <- tr_super_league_matches %>%
  select(-matchid, -Match_Date, -Match_Result) %>%
  mutate(
    difference = Home_Score - Away_Score,
    ou25 = Home_Score + Away_Score > 2.5,
    ou35 = Home_Score + Away_Score > 3.5
  )

The data before 2017 is selected as train data for over/under 2.5 goals model.

# Training set for the over/under 2.5 goals model: every season before 2017,
# without the season label and without the competing 3.5 target.
tsl_train <- tsl_veri %>%
  filter(season < 2017) %>%
  select(-season, -ou35)
# Keep the final score out of the predictors. `ou25` is computed directly from
# Home_Score and Away_Score, so with `ou25 ~ .` the tree only has to read the
# answer back from those columns (target leakage).
# The response is logical rather than a factor, so rpart() grows a regression
# tree; its predictions are the share of "over" matches in each leaf.
# NOTE: results printed later in this document predate this change and need
# to be regenerated by re-knitting.
tsl_model <- rpart(
  ou25 ~ . - Home_Score - Away_Score - difference,
  data = tsl_train
)
rpart.plot(tsl_model)

Teams that are new in the 2017 season are removed from the test data.

# Hold-out set: 2017 matches, minus the season label and the 3.5 target.
# Matches involving goztepe or yeni malatyaspor, at home or away, are dropped.
tsl_test <- tsl_veri %>%
  filter(season == 2017) %>%
  select(-c(season, ou35)) %>%
  filter(
    !grepl("goztepe|yeni malatyaspor", Home),
    !grepl("goztepe|yeni malatyaspor", Away)
  )
# Predict for the hold-out matches and peek at the first few values.
tsl_predict <- predict(tsl_model, newdata = tsl_test)
head(tsl_predict)
## 1 2 3 4 5 6 
## 0 0 0 1 1 0

The model with train data is applied to test data.

# Hold-out accuracy of the over/under 2.5 model.
# `ou25` is logical, so the rpart regression tree predicts the share of "over"
# matches in a leaf, i.e. a number in [0, 1]. Classify at 0.5: the earlier
# cut-off of 2.5 (the goal line itself) can never be reached, so every match
# was labelled "under" and the table only reported the base rate.
# NOTE: the figures printed below this chunk were produced with the previous
# cut-off and need to be regenerated.
tsl_test_prediction <-
  tibble(
    tsl_predict = ifelse(tsl_predict >= 0.5, 1, 0),
    tsl_actual = ifelse(tsl_test$ou25 == 1, 1, 0)
  ) %>%
  mutate(correct_class = (tsl_predict == tsl_actual)) %>%
  group_by(correct_class) %>%
  # nrow(.) is the full row count, so `percentage` is the share of all matches.
  summarise(count = n(), percentage = n() / nrow(.))
tsl_test_prediction
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE            86      0.610
## 2 TRUE             55      0.390

The accuracy of train data is tested with in sample prediction.

# In-sample accuracy of the over/under 2.5 model.
# Without `newdata`, predict() returns the fitted values for the training rows.
tsl_in_sample <- predict(tsl_model)
# The fitted values are leaf means of the logical `ou25`, i.e. numbers in
# [0, 1]. Classify at 0.5: the earlier cut-off of 2.5 can never be reached, so
# every match was labelled "under".
# NOTE: the figures printed below this chunk were produced with the previous
# cut-off and need to be regenerated.
tsl_train_prediction <-
  tibble(
    tsl_predict = ifelse(tsl_in_sample >= 0.5, 1, 0),
    tsl_actual = ifelse(tsl_train$ou25 == 1, 1, 0)
  ) %>%
  mutate(correct_class = (tsl_predict == tsl_actual)) %>%
  group_by(correct_class) %>%
  # nrow(.) is the full row count, so `percentage` is the share of all matches.
  summarise(count = n(), percentage = n() / nrow(.))
tsl_train_prediction
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE          1108      0.517
## 2 TRUE           1034      0.483

The data before 2017 is selected as train data for the over/under 3.5 goals model.

# Training set for the over/under 3.5 goals model: every season before 2017,
# without the season label and without the competing 2.5 target.
tsl_train <- tsl_veri %>%
  filter(season < 2017) %>%
  select(-season, -ou25)
# Keep the final score out of the predictors. `ou35` is computed directly from
# Home_Score and Away_Score, so with `ou35 ~ .` the tree only has to read the
# answer back from those columns (target leakage).
# The response is logical rather than a factor, so rpart() grows a regression
# tree; its predictions are the share of "over" matches in each leaf.
# NOTE: results printed later in this document predate this change and need
# to be regenerated by re-knitting.
tsl_model <- rpart(
  ou35 ~ . - Home_Score - Away_Score - difference,
  data = tsl_train
)
rpart.plot(tsl_model)

Teams that are new in the 2017 season are removed from the test data.

# Hold-out set: 2017 matches, minus the season label and the 2.5 target.
# Matches involving goztepe or yeni malatyaspor, at home or away, are dropped.
tsl_test <- tsl_veri %>%
  filter(season == 2017) %>%
  select(-c(season, ou25)) %>%
  filter(
    !grepl("goztepe|yeni malatyaspor", Home),
    !grepl("goztepe|yeni malatyaspor", Away)
  )
# Predict for the hold-out matches.
tsl_predict <- predict(tsl_model, newdata = tsl_test)

The model with train data is applied to test data.

# Hold-out accuracy of the over/under 3.5 model.
# `ou35` is logical, so the rpart regression tree predicts the share of "over"
# matches in a leaf, i.e. a number in [0, 1]. Classify at 0.5: the earlier
# cut-off of 3.5 (the goal line itself) can never be reached, so every match
# was labelled "under" and the table only reported the base rate.
# NOTE: the figures printed below this chunk were produced with the previous
# cut-off and need to be regenerated.
tsl_test_prediction <-
  tibble(
    tsl_predict = ifelse(tsl_predict >= 0.5, 1, 0),
    tsl_actual = ifelse(tsl_test$ou35 == 1, 1, 0)
  ) %>%
  mutate(correct_class = (tsl_predict == tsl_actual)) %>%
  group_by(correct_class) %>%
  # nrow(.) is the full row count, so `percentage` is the share of all matches.
  summarise(count = n(), percentage = n() / nrow(.))
tsl_test_prediction
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE            46      0.326
## 2 TRUE             95      0.674
# In-sample predictions: with no `newdata`, predict() returns the fitted
# values for the rows the model was trained on.
tsl_in_sample <- predict(object = tsl_model)

The accuracy of train data is tested with in sample prediction.

# In-sample accuracy of the over/under 3.5 model.
# The fitted values are leaf means of the logical `ou35`, i.e. numbers in
# [0, 1]. Classify at 0.5: the earlier cut-off of 3.5 can never be reached, so
# every match was labelled "under".
# NOTE: the figures printed below this chunk were produced with the previous
# cut-off and need to be regenerated.
tsl_train_prediction <-
  tibble(
    tsl_predict = ifelse(tsl_in_sample >= 0.5, 1, 0),
    tsl_actual = ifelse(tsl_train$ou35 == 1, 1, 0)
  ) %>%
  mutate(correct_class = (tsl_predict == tsl_actual)) %>%
  group_by(correct_class) %>%
  # nrow(.) is the full row count, so `percentage` is the share of all matches.
  summarise(count = n(), percentage = n() / nrow(.))
tsl_train_prediction
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE           634      0.296
## 2 TRUE           1508      0.704

The data before 2017 is selected as train data for the goal difference model.

# Training set for the goal-difference model: every season before 2017,
# without the season label and without both over/under targets.
tsl_train <- tsl_veri %>%
  filter(season < 2017) %>%
  select(-season, -ou25, -ou35)
# Keep the final score out of the predictors. `difference` is exactly
# Home_Score - Away_Score, so with `difference ~ .` the tree only has to read
# the answer back from those two columns (target leakage).
# NOTE: results printed later in this document predate this change and need
# to be regenerated by re-knitting.
tsl_model <- rpart(
  difference ~ . - Home_Score - Away_Score,
  data = tsl_train
)
rpart.plot(tsl_model)

Teams that are new in the 2017 season are removed from the test data.

# Hold-out set: 2017 matches, minus the season label and both over/under
# targets. Matches involving goztepe or yeni malatyaspor, at home or away,
# are dropped.
tsl_test <- tsl_veri %>%
  filter(season == 2017) %>%
  select(-c(season, ou25, ou35)) %>%
  filter(
    !grepl("goztepe|yeni malatyaspor", Home),
    !grepl("goztepe|yeni malatyaspor", Away)
  )
# Predict for the hold-out matches.
tsl_predict <- predict(tsl_model, newdata = tsl_test)

The model with train data is applied to test data.

# Hold-out accuracy of the goal-difference model, scored as a two-class
# problem: class 1 means the home side does not lose (difference >= 0).
# The same rule is applied to the predicted and the observed difference. The
# earlier code marked the observed class as 1 only when the difference was
# exactly 1, so the two labels described different events.
# NOTE: the figures printed below this chunk were produced with the previous
# labelling and need to be regenerated.
tsl_test_prediction <-
  tibble(
    tsl_predict = ifelse(tsl_predict >= 0, 1, 0),
    tsl_actual = ifelse(tsl_test$difference >= 0, 1, 0)
  ) %>%
  mutate(correct_class = (tsl_predict == tsl_actual)) %>%
  group_by(correct_class) %>%
  # nrow(.) is the full row count, so `percentage` is the share of all matches.
  summarise(count = n(), percentage = n() / nrow(.))
tsl_test_prediction
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE            75      0.532
## 2 TRUE             66      0.468
# In-sample predictions: with no `newdata`, predict() returns the fitted
# values for the rows the model was trained on.
tsl_in_sample <- predict(object = tsl_model)

The accuracy of train data is tested with in sample prediction.

# In-sample accuracy of the goal-difference model, scored as a two-class
# problem: class 1 means the home side does not lose (difference >= 0).
# The same rule is applied to the fitted and the observed difference. The
# earlier code marked the observed class as 1 only when the difference was
# exactly 1, so the two labels described different events.
# NOTE: the figures printed below this chunk were produced with the previous
# labelling and need to be regenerated.
tsl_train_prediction <-
  tibble(
    tsl_predict = ifelse(tsl_in_sample >= 0, 1, 0),
    tsl_actual = ifelse(tsl_train$difference >= 0, 1, 0)
  ) %>%
  mutate(correct_class = (tsl_predict == tsl_actual)) %>%
  group_by(correct_class) %>%
  # nrow(.) is the full row count, so `percentage` is the share of all matches.
  summarise(count = n(), percentage = n() / nrow(.))
tsl_train_prediction
## # A tibble: 2 x 3
##   correct_class count percentage
##   <lgl>         <int>      <dbl>
## 1 FALSE          1022      0.477
## 2 TRUE           1120      0.523