This is an individual in-class exercise. At the end of the exercise, please upload html outputs to your Progress Journals.
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 2.2.1 <U+221A> purrr 0.2.4
## <U+221A> tibble 1.4.2 <U+221A> dplyr 0.7.4
## <U+221A> tidyr 0.8.0 <U+221A> stringr 1.2.0
## <U+221A> readr 1.1.1 <U+221A> forcats 0.2.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
load("C:/Users/Sezgin/Desktop/osym_data_2017_v2.RData")
head(osym_data_2017)
## # A tibble: 6 x 14
## program_id university_name city faculty_name program_name exam_type
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 100110266 ABANT İZZET BAY~ BOLU Bolu Sağlık Y~ Hemşirelik YGS_2
## 2 100110487 ABANT İZZET BAY~ BOLU Bolu Turizm İ~ Gastronomi v~ YGS_4
## 3 100110724 ABANT İZZET BAY~ BOLU Bolu Turizm İ~ Turizm İşlet~ YGS_6
## 4 100130252 ABANT İZZET BAY~ BOLU Bolu Turizm İ~ Turizm İşlet~ YGS_6
## 5 100110433 ABANT İZZET BAY~ BOLU Diş Hekimliği~ Diş Hekimliği MF_3
## 6 100110609 ABANT İZZET BAY~ BOLU Diş Hekimliği~ Diş Hekimliğ~ MF_3
## # ... with 8 more variables: general_quota <chr>, general_placement <chr>,
## # min_score <dbl>, max_score <dbl>, val_quota <dbl>,
## # val_placement <dbl>, val_min_score <dbl>, val_max_score <dbl>
osym_data_2017 <- osym_data_2017 %>% mutate(general_quota= as.numeric(general_quota), general_placement= as.numeric(general_placement))
max_score
programs from each exam_type
.osym_data_2017 %>% group_by(exam_type) %>% summarise(Max=max(max_score))
## # A tibble: 19 x 2
## exam_type Max
## <chr> <dbl>
## 1 DİL_1 543
## 2 DİL_2 485
## 3 DİL_3 490
## 4 MF 408
## 5 MF_1 537
## 6 MF_2 527
## 7 MF_3 564
## 8 MF_4 575
## 9 TM_1 561
## 10 TM_2 458
## 11 TM_3 558
## 12 TS_1 527
## 13 TS_2 536
## 14 YGS_1 516
## 15 YGS_2 473
## 16 YGS_3 315
## 17 YGS_4 466
## 18 YGS_5 419
## 19 YGS_6 519
top_10_iu <- osym_data_2017 %>% filter(university_name == "İSTANBUL ÜNİVERSİTESİ") %>% arrange(desc(general_quota)) %>% slice(1:10)
ggplot(data=top_10_iu) + geom_bar(aes(x=reorder(program_name, -general_quota),y=general_quota), stat="identity") + theme(axis.text.x=element_text(angle=90))
general_placement
)/sum(general_quota
)) per city and return the top 10 cities.osym_data_2017 %>% group_by(city) %>% summarise(Fill_rate= sum(general_placement)/sum(general_quota)) %>% ungroup() %>% arrange(desc(Fill_rate)) %>% slice(1:10)
## # A tibble: 10 x 2
## city Fill_rate
## <chr> <dbl>
## 1 GEBZE 1.02
## 2 TEKİRDAĞ 1.01
## 3 MANİSA 1.01
## 4 KOCAELİ 1.00
## 5 BURSA 1.00
## 6 SUMGAYIT - AZERBAYCAN 1.00
## 7 DİYARBAKIR 0.999
## 8 DENİZLİ 0.998
## 9 EDİRNE 0.998
## 10 SAKARYA 0.997
general_placement
== general_quota
) Endüstri Mühendisliği programs (use grepl
) and draw a scatterplot of min_score vs max_score. Set transparency parameter (alpha
) to 0.7. Set program colors according to whether it is a foundation university or state university. (Tip: State university programs ids start with 1, foundation 2, KKTC 3 and other abroad 4. You can use substr
function.).#grepl returns a logical output for each indices in the original vector/data set
#utilizing grepl with dplyr (filter) enables to filter the data set where "Endüstri Mühendisliği" is involved.
Industrial_eng <- osym_data_2017 %>% filter(grepl("Endüstri Mühendisliği", program_name))
#now that we have filtered the Industrial engineering programs, we need to find data where the program is full.
diff_Industrial_eng <- Industrial_eng %>% mutate(difference= general_placement - general_quota)
full_Industrial_eng <- diff_Industrial_eng %>% filter(difference == 0)
ggplot(data=full_Industrial_eng) + geom_point(aes(x=min_score , y =max_score , color = substr(program_id,1,1), alpha = 0.7)) +xlab("min_score") + ylab ("max_score") + labs (color = "University_type") + guides(alpha = "none") + scale_color_manual( values = c(5,2,6), labels = c("state","foundation","KKTC"))
top_10_faculty <- osym_data_2017 %>% arrange(desc(general_quota)) %>% slice(1:10)
ggplot(data=top_10_faculty) + geom_bar(aes(x=reorder(program_name, -general_quota),y=general_quota), stat="identity") + theme(axis.text.x=element_text(angle=90,size = 8))
geom_crossbar
)Medicine <- osym_data_2017 %>% filter(grepl("Tıp", program_name)) %>% filter(!grepl("Mühendis",program_name)) %>% filter(substr(program_id,1,1) == 1) %>% group_by(university_name) %>% summarize(total_quota = sum(general_quota), maximum_score = max(max_score),minimum_score = min(min_score) ) %>%ungroup() %>% arrange(desc(total_quota))
ggplot(data=Medicine) + geom_crossbar(aes(x=reorder(university_name,-total_quota) , y = ((maximum_score+minimum_score)/2) , ymax= maximum_score , ymin = minimum_score , color = total_quota)) + labs (y = "Max and Min Scores") + theme(axis.text.x=element_text(angle=90,hjust=0.5,vjust=0.5)) + xlab ("University")
Let’s find average scores for both Mechanical Engineering and Civil Engineering. Then let’s compare to check which program has a higher average score depending on top 10 results.
Mech <- osym_data_2017 %>% filter(grepl("Makine Mühendisliği", program_name))
Avg_Mech_top10 <- Mech %>% mutate(avg_score= (max_score + min_score)/2) %>% arrange(desc(avg_score)) %>% slice(1:10)
Civil <- osym_data_2017 %>% filter(grepl("İnşaat Mühendisliği", program_name))
Avg_Civil_top10 <- Civil %>% mutate(avg_score= (max_score + min_score)/2) %>% arrange(desc(avg_score)) %>% slice(1:10)
ggplot(data= Avg_Mech_top10) + geom_bar(aes(x= reorder(university_name, -avg_score), y= avg_score), stat= "identity") + theme(axis.text.x=element_text(angle=90))
ggplot(data= Avg_Civil_top10) + geom_bar(aes(x= reorder(university_name, -avg_score), y= avg_score), stat= "identity") + theme(axis.text.x=element_text(angle=90))
Boğaziçi University has the highest average score for both Mechanical and Civil engineering programs.
Let’s compare the average top 10 scores and fill rates of both university’s programmes.
Koc <- osym_data_2017 %>% filter(grepl("KOÇ ÜNİVERSİTESİ", university_name))
Bilkent <- osym_data_2017 %>% filter(grepl("BİLKENT ÜNİVERSİTESİ", university_name))
Koc %>% mutate(avg_score= (max_score + min_score)/2, Fill_rate= general_placement/general_quota) %>% arrange(desc(avg_score)) %>% slice(1:10)
## # A tibble: 10 x 16
## program_id university_name city faculty_name program_name exam_type
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 203910399 KOÇ ÜNİVERSİTESİ İSTAN~ Mühendislik~ Elektrik - E~ MF_4
## 2 203910699 KOÇ ÜNİVERSİTESİ İSTAN~ Tıp Fakülte~ Tıp (İngiliz~ MF_3
## 3 203910363 KOÇ ÜNİVERSİTESİ İSTAN~ Mühendislik~ Bilgisayar M~ MF_4
## 4 203910796 KOÇ ÜNİVERSİTESİ İSTAN~ Hukuk Fakül~ Hukuk (Tam B~ TM_3
## 5 203910424 KOÇ ÜNİVERSİTESİ İSTAN~ Mühendislik~ Endüstri Müh~ MF_4
## 6 203910487 KOÇ ÜNİVERSİTESİ İSTAN~ Mühendislik~ Makine Mühen~ MF_4
## 7 203910072 KOÇ ÜNİVERSİTESİ İSTAN~ İnsani Bili~ İngiliz Dili~ DİL_1
## 8 203910309 KOÇ ÜNİVERSİTESİ İSTAN~ İktisadi ve~ İşletme (İng~ TM_1
## 9 203910706 KOÇ ÜNİVERSİTESİ İSTAN~ Tıp Fakülte~ Tıp (İngiliz~ MF_3
## 10 203910133 KOÇ ÜNİVERSİTESİ İSTAN~ İnsani Bili~ Psikoloji (İ~ TM_3
## # ... with 10 more variables: general_quota <dbl>,
## # general_placement <dbl>, min_score <dbl>, max_score <dbl>,
## # val_quota <dbl>, val_placement <dbl>, val_min_score <dbl>,
## # val_max_score <dbl>, avg_score <dbl>, Fill_rate <dbl>
Bilkent %>% mutate(avg_score= (max_score + min_score)/2, Fill_rate= general_placement/general_quota) %>% arrange(desc(avg_score)) %>% slice(1:10)
## # A tibble: 10 x 16
## program_id university_name city faculty_name program_name exam_type
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 202110426 İHSAN DOĞRAMACI~ ANKARA Mühendislik~ Elektrik - E~ MF_4
## 2 202111106 İHSAN DOĞRAMACI~ ANKARA Hukuk Fakül~ Hukuk (Tam B~ TM_3
## 3 202110408 İHSAN DOĞRAMACI~ ANKARA Mühendislik~ Bilgisayar M~ MF_4
## 4 202110877 İHSAN DOĞRAMACI~ ANKARA Mühendislik~ Makine Mühen~ MF_4
## 5 202110992 İHSAN DOĞRAMACI~ ANKARA İnsani Bili~ Mütercim - T~ DİL_1
## 6 202110744 İHSAN DOĞRAMACI~ ANKARA Mühendislik~ Elektrik - E~ MF_4
## 7 202110444 İHSAN DOĞRAMACI~ ANKARA Mühendislik~ Endüstri Müh~ MF_4
## 8 202110735 İHSAN DOĞRAMACI~ ANKARA Mühendislik~ Bilgisayar M~ MF_4
## 9 202110241 İHSAN DOĞRAMACI~ ANKARA İktisadi, İ~ Psikoloji (İ~ TM_3
## 10 202111115 İHSAN DOĞRAMACI~ ANKARA Hukuk Fakül~ Hukuk (%50 B~ TM_3
## # ... with 10 more variables: general_quota <dbl>,
## # general_placement <dbl>, min_score <dbl>, max_score <dbl>,
## # val_quota <dbl>, val_placement <dbl>, val_min_score <dbl>,
## # val_max_score <dbl>, avg_score <dbl>, Fill_rate <dbl>
Both universities have 100% fill rate for their top 10 highest score programmes. Both universities have “Electrical & Electronics Enginnering” as a highest average score programme.
We can calculate the averange difference between maximum and minimum scores for the universities in Istanbul.
chance_to_enroll <- osym_data_2017 %>% filter(city== "İSTANBUL") %>% group_by(university_name, program_name) %>% summarise(Score_diff=round(mean(abs(max_score-min_score)),1)) %>% arrange(desc(Score_diff)) %>% ungroup() %>% slice(1:10)
ggplot(data=chance_to_enroll) + geom_bar(aes(x= program_name, y= Score_diff), stat="identity") + theme(axis.text.x=element_text(angle=90,size = 8))
Assuming the student wants to study in Istanbul, above calculation shows that German Teaching program in Istanbul University has the highest score difference, could be demonstrating that the student has a higher chance to enroll.