Task 1 Q1

library(readr)
# Load the match results CSV into a tibble; readr guesses the column types
# and reports the specification it used.
tr_super_league_matches <- readr::read_csv(
  file = "C:/Users/Sezgin/Desktop/tr_super_league_matches.csv"
)
## Parsed with column specification:
## cols(
##   matchid = col_character(),
##   season = col_integer(),
##   Home = col_character(),
##   Away = col_character(),
##   Match_Date = col_date(format = ""),
##   Round = col_integer(),
##   Match_Hour = col_integer(),
##   weekDay = col_integer(),
##   month = col_integer(),
##   AWA_FLAG = col_integer(),
##   Home_Score = col_integer(),
##   Away_Score = col_integer(),
##   Match_Result = col_character()
## )
# Auto-print the tibble to inspect its dimensions, column types and first rows.
tr_super_league_matches
## # A tibble: 2,322 x 13
##    matchid  season Home   Away   Match_Date Round Match_Hour weekDay month
##    <chr>     <int> <chr>  <chr>  <date>     <int>      <int>   <int> <int>
##  1 04ouflDc   2010 karde~ manis~ 2010-08-15     1         20       1     8
##  2 0AUXYwsk   2010 eskis~ gencl~ 2010-08-14     1         21       7     8
##  3 6mVTZJRr   2010 gazia~ kasim~ 2010-08-14     1         19       7     8
##  4 6NaYWHB1   2010 sivas~ galat~ 2010-08-14     1         19       7     8
##  5 8UlmhSsA   2010 ankar~ trabz~ 2010-08-15     1         18       1     8
##  6 jgvhincG   2010 bucas~ besik~ 2010-08-14     1         21       7     8
##  7 MDnyeUci   2010 fener~ antal~ 2010-08-15     1         20       1     8
##  8 Uwjqg8S3   2010 basak~ kayse~ 2010-08-16     1         20       2     8
##  9 v1TyYcde   2010 bursa~ konya~ 2010-08-16     1         20       2     8
## 10 2ZASfL38   2010 konya~ eskis~ 2010-08-20     2         20       6     8
## # ... with 2,312 more rows, and 4 more variables: AWA_FLAG <int>,
## #   Home_Score <int>, Away_Score <int>, Match_Result <chr>
# NOTE(review): attach() is kept only so that any later code relying on bare
# column names keeps working; the plots below no longer depend on it.
attach(tr_super_league_matches)

# Draw a histogram of integer goal values with one bar per whole number.
#
# goals:   numeric vector of goal counts or goal differences; NAs are ignored
#          when the bin range is computed.
# title:   plot title.
# x_label: x-axis label.
# fill:    bar colour.
#
# With hist()'s default right-closed bins, breaks that fall on whole numbers
# put goal value k in the bar (k - 1, k], and the two lowest values share the
# first bar. Breaks at k - 0.5 and k + 0.5 centre each bar on exactly one
# goal value instead.
plot_goal_hist <- function(goals, title, x_label, fill) {
  goal_range <- range(goals, na.rm = TRUE)
  hist(
    goals,
    breaks = seq(goal_range[1] - 0.5, goal_range[2] + 0.5, by = 1),
    main = title,
    xlab = x_label,
    ylab = "Number of Games",
    col = fill
  )
}

plot_goal_hist(
  tr_super_league_matches$Home_Score,
  title = "Home Score Distribution",
  x_label = "Home Score",
  fill = "blue"
)

plot_goal_hist(
  tr_super_league_matches$Away_Score,
  title = "Away Score Distribution",
  x_label = "Away Score",
  fill = "green"
)

plot_goal_hist(
  tr_super_league_matches$Home_Score - tr_super_league_matches$Away_Score,
  title = "Home Score - Away Score Distribution",
  x_label = "Home Score - Away Score",
  fill = "red"
)

Task 1 Q2

What distribution do you think home and away goals are coming from? Calculate the expected number of games corresponding to each quantile (number of goals) by using sample means as distribution mean and plot these values on the histogram.

Answer: Based on the histogram of the score difference, the distribution appears to come from a standard normal distribution. The histogram also shows that the score difference is most often 0, meaning that a draw is the most common result; this could also indicate that the mean value is 0. A draw has the highest “probability” because it accounts for the largest number of games. The value with the second-highest “probability” is a goal difference of +1, meaning that the home team scores one more goal than the away team.

Task 2

library(readr)
# Load the bookmaker odds CSV into a tibble; readr guesses the column types
# and reports the specification it used.
tr_super_league_odd_details <- readr::read_csv(
  file = "C:/Users/Sezgin/Desktop/tr_super_league_odd_details.csv"
)
## Parsed with column specification:
## cols(
##   matchid = col_character(),
##   Home = col_character(),
##   Away = col_character(),
##   Bookmaker = col_character(),
##   HomeOdd = col_double(),
##   AwayOdd = col_double(),
##   TieOdd = col_double()
## )
# Auto-print the tibble to inspect its dimensions, column types and first rows.
tr_super_league_odd_details
## # A tibble: 10,072 x 7
##    matchid  Home                 Away     Bookmaker HomeOdd AwayOdd TieOdd
##    <chr>    <chr>                <chr>    <chr>       <dbl>   <dbl>  <dbl>
##  1 0029LnfI konyaspor            sivassp~ Betsson      2.82    2.56   3.28
##  2 0029LnfI konyaspor            sivassp~ Pinnacle     2.73    2.76   3.42
##  3 0029LnfI konyaspor            sivassp~ bet365       2.75    2.56   3.35
##  4 0029LnfI konyaspor            sivassp~ bwin         2.72    2.45   3.32
##  5 00ApPQui trabzonspor          galatas~ Betfair      2.52    2.75   3.20
##  6 00ApPQui trabzonspor          galatas~ Betsson      2.17    2.90   3.20
##  7 00ApPQui trabzonspor          galatas~ Pinnacle     2.67    2.88   3.42
##  8 00ApPQui trabzonspor          galatas~ bet365       2.25    3.00   3.25
##  9 00ApPQui trabzonspor          galatas~ bwin         2.33    2.98   3.20
## 10 00IbEw7m akhisar-genclik-spor gaziant~ Betsson      2.04    3.75   3.32
## # ... with 10,062 more rows
# NOTE(review): attach() puts the columns on the search path; per the message
# printed right after this call, Away, Home and matchid now mask the
# same-named columns attached from tr_super_league_matches. The code visible
# after this point works on the data frames directly, so confirm whether this
# attach() is still needed.
attach(tr_super_league_odd_details)
## The following objects are masked from tr_super_league_matches:
## 
##     Away, Home, matchid
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 2.2.1     <U+221A> purrr   0.2.4
## <U+221A> tibble  1.4.2     <U+221A> dplyr   0.7.4
## <U+221A> tidyr   0.8.0     <U+221A> stringr 1.2.0
## <U+221A> ggplot2 2.2.1     <U+221A> forcats 0.2.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
# Implied outcome probabilities from the raw odds, using P(x) = 1 / odd.
# The three values of a row are not normalised and need not sum to 1.
#
# This is a row-wise calculation, so mutate() is used rather than
# group_by() + summarise(): summarise() is meant to return one row per group
# and may error or warn (depending on the dplyr version) if a
# (matchid, Bookmaker) pair ever has more than one row of odds.
# Rows are ordered by matchid and Bookmaker, as the grouped summary was; the
# exact order of mixed-case bookmaker names depends on the sort locale.
probabilities <- tr_super_league_odd_details %>%
  mutate(
    Phomewin = 1 / HomeOdd,
    Pawaywin = 1 / AwayOdd,
    Ptie = 1 / TieOdd
  ) %>%
  select(matchid, Bookmaker, Phomewin, Pawaywin, Ptie) %>%
  arrange(matchid, Bookmaker)
probabilities
## # A tibble: 10,072 x 5
##    matchid  Bookmaker Phomewin Pawaywin  Ptie
##    <chr>    <chr>        <dbl>    <dbl> <dbl>
##  1 0029LnfI bet365       0.364    0.390 0.299
##  2 0029LnfI Betsson      0.354    0.391 0.305
##  3 0029LnfI bwin         0.367    0.408 0.301
##  4 0029LnfI Pinnacle     0.366    0.362 0.292
##  5 00ApPQui bet365       0.444    0.333 0.308
##  6 00ApPQui Betfair      0.396    0.364 0.312
##  7 00ApPQui Betsson      0.460    0.345 0.312
##  8 00ApPQui bwin         0.430    0.336 0.312
##  9 00ApPQui Pinnacle     0.375    0.347 0.292
## 10 00IbEw7m bet365       0.460    0.299 0.308
## # ... with 10,062 more rows
# Normalised outcome probabilities: each inverse odd is divided by the sum of
# the three inverse odds of the same row, so the three values sum to 1.
#
# The sum is computed once per row instead of being repeated in every
# expression, and mutate() replaces group_by() + summarise() because this is
# a row-wise calculation (summarise() expects one value per group).
# Rows are ordered by matchid and Bookmaker, as the grouped summary was; the
# exact order of mixed-case bookmaker names depends on the sort locale.
probabilities_norm <- tr_super_league_odd_details %>%
  mutate(
    inv_home = 1 / HomeOdd,
    inv_away = 1 / AwayOdd,
    inv_tie = 1 / TieOdd,
    inv_total = inv_home + inv_away + inv_tie,
    Pnorhomewin = inv_home / inv_total,
    Pnorawaywin = inv_away / inv_total,
    Pnortie = inv_tie / inv_total
  ) %>%
  select(matchid, Bookmaker, Pnorhomewin, Pnorawaywin, Pnortie) %>%
  arrange(matchid, Bookmaker)
probabilities_norm
## # A tibble: 10,072 x 5
##    matchid  Bookmaker Pnorhomewin Pnorawaywin Pnortie
##    <chr>    <chr>           <dbl>       <dbl>   <dbl>
##  1 0029LnfI bet365          0.346       0.371   0.284
##  2 0029LnfI Betsson         0.337       0.372   0.291
##  3 0029LnfI bwin            0.341       0.379   0.280
##  4 0029LnfI Pinnacle        0.359       0.354   0.287
##  5 00ApPQui bet365          0.409       0.307   0.283
##  6 00ApPQui Betfair         0.369       0.339   0.291
##  7 00ApPQui Betsson         0.412       0.309   0.280
##  8 00ApPQui bwin            0.399       0.312   0.290
##  9 00ApPQui Pinnacle        0.370       0.342   0.288
## 10 00IbEw7m bet365          0.431       0.280   0.289
## # ... with 10,062 more rows
# Plot P(home win) - P(away win) against P(tie) for both probability
# calculation approaches (raw inverse odds and normalised inverse odds).
calculation1 <- probabilities %>%
  mutate(diff1 = Phomewin - Pawaywin)
calculation2 <- probabilities_norm %>%
  mutate(diff2 = Pnorhomewin - Pnorawaywin)

# First approach: raw inverse odds.
ggplot(data = calculation1, aes(x = diff1, y = Ptie)) +
  geom_point() +
  labs(
    x = "P(home win) - P(away win)",
    y = "P(tie)",
    title = paste(
      "P(home win) - P(away win) vs. P(tie)",
      "with First Probability Calculation Approach"
    )
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5, size = 12)
  )

# Second approach: normalised inverse odds.
ggplot(data = calculation2, aes(x = diff2, y = Pnortie)) +
  geom_point() +
  labs(
    x = "P(home win) - P(away win)",
    y = "P(tie)",
    title = paste(
      "P(home win) - P(away win) vs. P(tie)",
      "with Second Probability Calculation Approach"
    )
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5, size = 12)
  )

# Plot P(home win) - P(away win) against P(tie) separately for each bookmaker.
#
# The bookmakers are taken from the data rather than hard-coded, so every
# bookmaker present in `probabilities` gets a plot (the previous hand-written
# list covered Betsson, Pinnacle, bet365 and bwin only).

# Build the scatter plot for one bookmaker.
#
# bookmaker_name: a single value of the Bookmaker column; also used as the
#                 plot title.
# Returns the ggplot object (not printed).
plot_bookmaker <- function(bookmaker_name) {
  bookmaker_probs <- probabilities %>%
    filter(Bookmaker == bookmaker_name) %>%
    mutate(win_diff = Phomewin - Pawaywin)

  ggplot(data = bookmaker_probs, aes(x = win_diff, y = Ptie)) +
    geom_point() +
    labs(
      x = "P(home win) - P(away win)",
      y = "P(tie)",
      title = bookmaker_name
    ) +
    theme_bw() +
    theme(
      legend.position = "none",
      axis.text.x = element_text(
        angle = 45, vjust = 0.5, hjust = 0.5, size = 12
      )
    )
}

# ggplot objects are not auto-printed inside a loop, so print() is explicit.
for (bookmaker_name in sort(unique(probabilities$Bookmaker))) {
  print(plot_bookmaker(bookmaker_name))
}