Tidy Analysis of OSYM Data

This is an individual in-class exercise. At the end of the exercise, please upload html outputs to your Progress Journals.

library(tidyverse)

## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --

## <U+221A> ggplot2 2.2.1     <U+221A> purrr   0.2.4
## <U+221A> tibble  1.4.2     <U+221A> dplyr   0.7.4
## <U+221A> tidyr   0.8.0     <U+221A> stringr 1.2.0
## <U+221A> readr   1.1.1     <U+221A> forcats 0.2.0

## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

load("C:/Users/Sezgin/Desktop/osym_data_2017_v2.RData")
head(osym_data_2017)

## # A tibble: 6 x 14
##   program_id university_name  city  faculty_name   program_name  exam_type
##   <chr>      <chr>            <chr> <chr>          <chr>         <chr>    
## 1 100110266  ABANT İZZET BAY~ BOLU  Bolu Sağlık Y~ Hemşirelik    YGS_2    
## 2 100110487  ABANT İZZET BAY~ BOLU  Bolu Turizm İ~ Gastronomi v~ YGS_4    
## 3 100110724  ABANT İZZET BAY~ BOLU  Bolu Turizm İ~ Turizm İşlet~ YGS_6    
## 4 100130252  ABANT İZZET BAY~ BOLU  Bolu Turizm İ~ Turizm İşlet~ YGS_6    
## 5 100110433  ABANT İZZET BAY~ BOLU  Diş Hekimliği~ Diş Hekimliği MF_3     
## 6 100110609  ABANT İZZET BAY~ BOLU  Diş Hekimliği~ Diş Hekimliğ~ MF_3     
## # ... with 8 more variables: general_quota <chr>, general_placement <chr>,
## #   min_score <dbl>, max_score <dbl>, val_quota <dbl>,
## #   val_placement <dbl>, val_min_score <dbl>, val_max_score <dbl>

osym_data_2017 <- osym_data_2017 %>% mutate(general_quota= as.numeric(general_quota), general_placement= as.numeric(general_placement))

Get the highest max_score programs from each exam_type.

osym_data_2017 %>% group_by(exam_type) %>% summarise(Max=max(max_score))

## # A tibble: 19 x 2
##    exam_type   Max
##    <chr>     <dbl>
##  1 DİL_1       543
##  2 DİL_2       485
##  3 DİL_3       490
##  4 MF          408
##  5 MF_1        537
##  6 MF_2        527
##  7 MF_3        564
##  8 MF_4        575
##  9 TM_1        561
## 10 TM_2        458
## 11 TM_3        558
## 12 TS_1        527
## 13 TS_2        536
## 14 YGS_1       516
## 15 YGS_2       473
## 16 YGS_3       315
## 17 YGS_4       466
## 18 YGS_5       419
## 19 YGS_6       519

Plot the top 10 programs of Ä°STANBUL ÃœNÄ°VERSÄ°TESÄ° in terms of total quota in a bar chart.

top_10_iu <- osym_data_2017 %>% filter(university_name == "İSTANBUL ÜNİVERSİTESİ") %>% arrange(desc(general_quota)) %>% slice(1:10)

ggplot(data=top_10_iu)  + geom_bar(aes(x=reorder(program_name, -general_quota),y=general_quota), stat="identity") + theme(axis.text.x=element_text(angle=90))

Calculate the fill rate (sum(general_placement)/sum(general_quota)) per city and return the top 10 cities.

osym_data_2017 %>% group_by(city) %>% summarise(Fill_rate= sum(general_placement)/sum(general_quota)) %>% ungroup() %>% arrange(desc(Fill_rate)) %>% slice(1:10)

## # A tibble: 10 x 2
##    city                  Fill_rate
##    <chr>                     <dbl>
##  1 GEBZE                     1.02 
##  2 TEKİRDAĞ                  1.01 
##  3 MANİSA                    1.01 
##  4 KOCAELİ                   1.00 
##  5 BURSA                     1.00 
##  6 SUMGAYIT - AZERBAYCAN     1.00 
##  7 DİYARBAKIR                0.999
##  8 DENİZLİ                   0.998
##  9 EDİRNE                    0.998
## 10 SAKARYA                   0.997

Find full (general_placement == general_quota) EndÃ¼stri MÃ¼hendisliÄŸi programs (use grepl) and draw a scatterplot of min_score vs max_score. Set transparency parameter (alpha) to 0.7. Set program colors according to whether it is a foundation university or state university. (Tip: State university programs ids start with 1, foundation 2, KKTC 3 and other abroad 4. You can use substr function.).

#grepl returns a logical output for each indices in the original vector/data set
#utilizing grepl with dplyr (filter) enables to filter the data set where "Endüstri Mühendisliği" is involved.
Industrial_eng <- osym_data_2017 %>% filter(grepl("Endüstri Mühendisliği", program_name))
#now that we have filtered the Industrial engineering programs, we need to find data where the program is full.
diff_Industrial_eng <- Industrial_eng %>% mutate(difference= general_placement - general_quota)
full_Industrial_eng <- diff_Industrial_eng %>% filter(difference == 0)
ggplot(data=full_Industrial_eng) + geom_point(aes(x=min_score , y =max_score , color = substr(program_id,1,1), alpha = 0.7)) +xlab("min_score") + ylab ("max_score") + labs (color = "University_type") + guides(alpha = "none") + scale_color_manual( values = c(5,2,6), labels = c("state","foundation","KKTC"))

Find the top 10 faculties with the highest quotas and draw a bar chart. Ignore similar names and typos in faculty names.

top_10_faculty <- osym_data_2017 %>% arrange(desc(general_quota)) %>% slice(1:10)
ggplot(data=top_10_faculty) + geom_bar(aes(x=reorder(program_name, -general_quota),y=general_quota), stat="identity") + theme(axis.text.x=element_text(angle=90,size = 8))

Find all full medicine programs (TÄ±p but not TÄ±p MÃ¼hendisliÄŸi) of foundation universities group by university calculate total quotas per university and maximum max_score and minimum min_score as bounds, ordered and colored by total quota. (Tip: Use geom_crossbar)

Medicine <- osym_data_2017 %>% filter(grepl("Tıp", program_name)) %>% filter(!grepl("Mühendis",program_name)) %>% filter(substr(program_id,1,1) == 1) %>% group_by(university_name) %>% summarize(total_quota = sum(general_quota), maximum_score = max(max_score),minimum_score = min(min_score) ) %>%ungroup() %>% arrange(desc(total_quota))
ggplot(data=Medicine) + geom_crossbar(aes(x=reorder(university_name,-total_quota) , y = ((maximum_score+minimum_score)/2) , ymax= maximum_score , ymin = minimum_score , color = total_quota)) + labs (y = "Max and Min Scores") + theme(axis.text.x=element_text(angle=90,hjust=0.5,vjust=0.5)) + xlab ("University")

Freestyle: Do an analysis that compares the mechanical engineering (Makine MÃ¼hendisliÄŸi) and civil engineering (Ä°nÅŸaat MÃ¼hendisliÄŸi) programs.

Let’s find average scores for both Mechanical Engineering and Civil Engineering. Then let’s compare to check which program has a higher average score depending on top 10 results.

Mech <- osym_data_2017 %>% filter(grepl("Makine Mühendisliği", program_name))
Avg_Mech_top10 <- Mech %>% mutate(avg_score= (max_score + min_score)/2) %>% arrange(desc(avg_score)) %>% slice(1:10)
Civil <- osym_data_2017 %>% filter(grepl("İnşaat Mühendisliği", program_name))
Avg_Civil_top10 <- Civil %>% mutate(avg_score= (max_score + min_score)/2) %>% arrange(desc(avg_score)) %>% slice(1:10)
ggplot(data= Avg_Mech_top10) + geom_bar(aes(x= reorder(university_name, -avg_score), y= avg_score), stat= "identity") + theme(axis.text.x=element_text(angle=90))

ggplot(data= Avg_Civil_top10) + geom_bar(aes(x= reorder(university_name, -avg_score), y= avg_score), stat= "identity") + theme(axis.text.x=element_text(angle=90))

Boğaziçi University has the highest average score for both Mechanical and Civil engineering programs.

Freestyle: Compare KoÃ§ University with Bilkent University.

Let’s compare the average top 10 scores and fill rates of both university’s programmes.

Koc <- osym_data_2017 %>% filter(grepl("KOÇ ÜNİVERSİTESİ", university_name))
Bilkent <- osym_data_2017 %>% filter(grepl("BİLKENT ÜNİVERSİTESİ", university_name))

Koc %>% mutate(avg_score= (max_score + min_score)/2, Fill_rate= general_placement/general_quota) %>% arrange(desc(avg_score)) %>% slice(1:10)

## # A tibble: 10 x 16
##    program_id university_name  city   faculty_name program_name  exam_type
##    <chr>      <chr>            <chr>  <chr>        <chr>         <chr>    
##  1 203910399  KOÇ ÜNİVERSİTESİ İSTAN~ Mühendislik~ Elektrik - E~ MF_4     
##  2 203910699  KOÇ ÜNİVERSİTESİ İSTAN~ Tıp Fakülte~ Tıp (İngiliz~ MF_3     
##  3 203910363  KOÇ ÜNİVERSİTESİ İSTAN~ Mühendislik~ Bilgisayar M~ MF_4     
##  4 203910796  KOÇ ÜNİVERSİTESİ İSTAN~ Hukuk Fakül~ Hukuk (Tam B~ TM_3     
##  5 203910424  KOÇ ÜNİVERSİTESİ İSTAN~ Mühendislik~ Endüstri Müh~ MF_4     
##  6 203910487  KOÇ ÜNİVERSİTESİ İSTAN~ Mühendislik~ Makine Mühen~ MF_4     
##  7 203910072  KOÇ ÜNİVERSİTESİ İSTAN~ İnsani Bili~ İngiliz Dili~ DİL_1    
##  8 203910309  KOÇ ÜNİVERSİTESİ İSTAN~ İktisadi ve~ İşletme (İng~ TM_1     
##  9 203910706  KOÇ ÜNİVERSİTESİ İSTAN~ Tıp Fakülte~ Tıp (İngiliz~ MF_3     
## 10 203910133  KOÇ ÜNİVERSİTESİ İSTAN~ İnsani Bili~ Psikoloji (İ~ TM_3     
## # ... with 10 more variables: general_quota <dbl>,
## #   general_placement <dbl>, min_score <dbl>, max_score <dbl>,
## #   val_quota <dbl>, val_placement <dbl>, val_min_score <dbl>,
## #   val_max_score <dbl>, avg_score <dbl>, Fill_rate <dbl>

Bilkent %>% mutate(avg_score= (max_score + min_score)/2, Fill_rate= general_placement/general_quota) %>% arrange(desc(avg_score)) %>% slice(1:10)

## # A tibble: 10 x 16
##    program_id university_name  city   faculty_name program_name  exam_type
##    <chr>      <chr>            <chr>  <chr>        <chr>         <chr>    
##  1 202110426  İHSAN DOĞRAMACI~ ANKARA Mühendislik~ Elektrik - E~ MF_4     
##  2 202111106  İHSAN DOĞRAMACI~ ANKARA Hukuk Fakül~ Hukuk (Tam B~ TM_3     
##  3 202110408  İHSAN DOĞRAMACI~ ANKARA Mühendislik~ Bilgisayar M~ MF_4     
##  4 202110877  İHSAN DOĞRAMACI~ ANKARA Mühendislik~ Makine Mühen~ MF_4     
##  5 202110992  İHSAN DOĞRAMACI~ ANKARA İnsani Bili~ Mütercim - T~ DİL_1    
##  6 202110744  İHSAN DOĞRAMACI~ ANKARA Mühendislik~ Elektrik - E~ MF_4     
##  7 202110444  İHSAN DOĞRAMACI~ ANKARA Mühendislik~ Endüstri Müh~ MF_4     
##  8 202110735  İHSAN DOĞRAMACI~ ANKARA Mühendislik~ Bilgisayar M~ MF_4     
##  9 202110241  İHSAN DOĞRAMACI~ ANKARA İktisadi, İ~ Psikoloji (İ~ TM_3     
## 10 202111115  İHSAN DOĞRAMACI~ ANKARA Hukuk Fakül~ Hukuk (%50 B~ TM_3     
## # ... with 10 more variables: general_quota <dbl>,
## #   general_placement <dbl>, min_score <dbl>, max_score <dbl>,
## #   val_quota <dbl>, val_placement <dbl>, val_min_score <dbl>,
## #   val_max_score <dbl>, avg_score <dbl>, Fill_rate <dbl>

Both universities have 100% fill rate for their top 10 highest score programmes. Both universities have “Electrical & Electronics Enginnering” as a highest average score programme.

Freestyle: Do your best.

We can calculate the averange difference between maximum and minimum scores for the universities in Istanbul.

chance_to_enroll <- osym_data_2017 %>% filter(city== "İSTANBUL") %>% group_by(university_name, program_name) %>% summarise(Score_diff=round(mean(abs(max_score-min_score)),1)) %>%  arrange(desc(Score_diff)) %>% ungroup() %>% slice(1:10)
ggplot(data=chance_to_enroll) + geom_bar(aes(x= program_name, y= Score_diff), stat="identity") + theme(axis.text.x=element_text(angle=90,size = 8))

Assuming the student wants to study in Istanbul, above calculation shows that German Teaching program in Istanbul University has the highest score difference, could be demonstrating that the student has a higher chance to enroll.

Tidy Analysis of OSYM Data

Sezgin Yıldız

April 20, 2018