diff --git a/submissions/Junpeng_Jiang_submission.html b/submissions/Junpeng_Jiang_submission.html new file mode 100644 index 0000000..9290285 --- /dev/null +++ b/submissions/Junpeng_Jiang_submission.html @@ -0,0 +1,790 @@ + + + + +
+ + + + + + + + +library(dplyr)
+##
+## Attaching package: 'dplyr'
+## The following objects are masked from 'package:stats':
+##
+## filter, lag
+## The following objects are masked from 'package:base':
+##
+## intersect, setdiff, setequal, union
+library(ggplot2)
+school <- read.csv(here::here("data/nys_schools.csv"))
+acs <- read.csv(here::here("data/nys_acs.csv"))
+str(school)
+## 'data.frame': 35663 obs. of 12 variables:
+## $ school_cd : num 1.01e+10 1.01e+10 1.01e+10 1.01e+10 1.01e+10 ...
+## $ school_name : chr "MONTESSORI MAGNET SCHOOL" "MONTESSORI MAGNET SCHOOL" "MONTESSORI MAGNET SCHOOL" "MONTESSORI MAGNET SCHOOL" ...
+## $ district_name : chr "ALBANY CITY SCHOOL DISTRICT" "ALBANY CITY SCHOOL DISTRICT" "ALBANY CITY SCHOOL DISTRICT" "ALBANY CITY SCHOOL DISTRICT" ...
+## $ county_name : chr "ALBANY" "ALBANY" "ALBANY" "ALBANY" ...
+## $ region : chr "CAPITAL DISTRICT" "CAPITAL DISTRICT" "CAPITAL DISTRICT" "CAPITAL DISTRICT" ...
+## $ year : int 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 ...
+## $ total_enroll : num 316 312 300 326 294 321 335 347 366 359 ...
+## $ per_free_lunch : num 0.33 0.32 0.26 0.22 0.31 0.25 0.35 0.22 0.29 0.26 ...
+## $ per_reduced_lunch: num 0.1 0.08 0.08 0.07 0.06 0.08 0.04 0 0.02 0.01 ...
+## $ per_lep : num 0.01 0.02 0.03 0.03 0.03 0.03 0.06 0.06 0.05 0.07 ...
+## $ mean_ela_score : num 658 673 670 667 670 ...
+## $ mean_math_score : num 670 679 683 681 687 ...
+str(acs)
+## 'data.frame': 496 obs. of 5 variables:
+## $ county_name : chr "ALBANY" "ALBANY" "ALBANY" "ALBANY" ...
+## $ year : int 2009 2010 2011 2012 2013 2014 2015 2016 2009 2010 ...
+## $ county_per_poverty : num 0.118 0.119 0.121 0.124 0.123 ...
+## $ median_household_income: int 55350 56090 57715 59359 59394 59940 59887 60904 40917 41305 ...
+## $ county_per_bach : num 0.19 0.197 0.199 0.198 0.205 ...
+summary(school)
+## school_cd school_name district_name county_name
+## Min. :1.010e+10 Length:35663 Length:35663 Length:35663
+## 1st Qu.:2.802e+11 Class :character Class :character Class :character
+## Median :3.317e+11 Mode :character Mode :character Mode :character
+## Mean :3.568e+11
+## 3rd Qu.:4.725e+11
+## Max. :6.808e+11
+## region year total_enroll per_free_lunch
+## Length:35663 Min. :2008 Min. : -99.0 Min. :-99.0000
+## Class :character 1st Qu.:2010 1st Qu.: 339.0 1st Qu.: 0.1900
+## Mode :character Median :2013 Median : 469.0 Median : 0.4200
+## Mean :2013 Mean : 523.6 Mean : 0.4188
+## 3rd Qu.:2015 3rd Qu.: 648.0 3rd Qu.: 0.7200
+## Max. :2017 Max. :2347.0 Max. :257.0000
+## per_reduced_lunch per_lep mean_ela_score mean_math_score
+## Min. :-99.00000 Min. :-99.00000 Min. :-99.0 Min. :-99.0
+## 1st Qu.: 0.03000 1st Qu.: 0.00000 1st Qu.:296.0 1st Qu.:298.0
+## Median : 0.06000 Median : 0.03000 Median :324.2 Median :330.8
+## Mean : 0.02852 Mean : 0.04124 Mean :447.1 Mean :456.0
+## 3rd Qu.: 0.10000 3rd Qu.: 0.11000 3rd Qu.:666.3 3rd Qu.:683.5
+## Max. : 53.00000 Max. : 1.00000 Max. :720.8 Max. :738.7
+summary(acs)
+## county_name year county_per_poverty median_household_income
+## Length:496 Min. :2009 Min. :0.04689 Min. : 33794
+## Class :character 1st Qu.:2011 1st Qu.:0.10903 1st Qu.: 46347
+## Mode :character Median :2012 Median :0.12884 Median : 50135
+## Mean :2012 Mean :0.13085 Mean : 54116
+## 3rd Qu.:2014 3rd Qu.:0.14929 3rd Qu.: 56448
+## Max. :2016 Max. :0.29935 Max. :102044
+## county_per_bach
+## Min. :0.07574
+## 1st Qu.:0.11018
+## Median :0.13169
+## Mean :0.14410
+## 3rd Qu.:0.17431
+## Max. :0.31795
+# Count occurrences of the -99 sentinel in each of columns 7 to 12
+# (total_enroll through mean_math_score).
+num_of_na <- unname(vapply(
+  school[7:12],
+  function(column) sum(column == -99),
+  integer(1)
+))
+num_of_na
+## [1] 13 15 15 13 2208 2210
+dim(school)
+## [1] 35663 12
+# Recode the -99 sentinel as NA, then drop every row that has a missing value.
+new_school <- na.omit(replace(school, school == -99, NA))
+dim(new_school)
+## [1] 33437 12
+# Average each county's median household income over its rows in `acs`.
+# The result column is called `n` because the code below refers to it by
+# that name.
+county_avg_median <- summarise(
+  group_by(acs, county_name),
+  n = mean(median_household_income)
+)
+## `summarise()` ungrouping output (override with `.groups` argument)
+county_avg_median <- as.data.frame(county_avg_median)
+hist(county_avg_median$n)
+summary(county_avg_median$n)
+## Min. 1st Qu. Median Mean 3rd Qu. Max.
+## 34422 46731 50069 54116 56768 97067
+# Label each county by where its average income falls relative to the
+# quartiles of all county averages: below the first quartile is "low",
+# above the third quartile is "high", and everything in between (bounds
+# included) is "medium". The cut-offs are computed from the data rather
+# than copied from the rounded values printed by summary().
+income_cutoffs <- quantile(county_avg_median$n, probs = c(0.25, 0.75),
+                           names = FALSE)
+county_avg_median$group <- ifelse(
+  county_avg_median$n < income_cutoffs[1], "low",
+  ifelse(county_avg_median$n > income_cutoffs[2], "high", "medium")
+)
+head(county_avg_median)
+## county_name n group
+## 1 ALBANY 58579.88 high
+## 2 ALLEGANY 42281.12 low
+## 3 BRONX 34421.88 low
+## 4 BROOME 45716.25 low
+## 5 CATTARAUGUS 42811.88 low
+## 6 CAYUGA 50686.00 medium
+# Copy each county's income group onto every one of its rows in `acs`.
+# match() performs the lookup for all rows at once and yields NA for a
+# county that has no entry in county_avg_median.
+acs$group <- county_avg_median$group[
+  match(acs$county_name, county_avg_median$county_name)
+]
+head(acs,15)
+## county_name year county_per_poverty median_household_income county_per_bach
+## 1 ALBANY 2009 0.1183511 55350 0.19036819
+## 2 ALBANY 2010 0.1194052 56090 0.19718856
+## 3 ALBANY 2011 0.1207243 57715 0.19898258
+## 4 ALBANY 2012 0.1237525 59359 0.19755045
+## 5 ALBANY 2013 0.1229728 59394 0.20453518
+## 6 ALBANY 2014 0.1286178 59940 0.20119979
+## 7 ALBANY 2015 0.1279796 59887 0.20136847
+## 8 ALBANY 2016 0.1226051 60904 0.20611462
+## 9 ALLEGANY 2009 0.1521532 40917 0.09468291
+## 10 ALLEGANY 2010 0.1491905 41305 0.09019852
+## 11 ALLEGANY 2011 0.1483946 41900 0.08712904
+## 12 ALLEGANY 2012 0.1528964 42095 0.09225430
+## 13 ALLEGANY 2013 0.1474557 42445 0.09361813
+## 14 ALLEGANY 2014 0.1524583 42726 0.09504335
+## 15 ALLEGANY 2015 0.1515498 42776 0.09833813
+## group
+## 1 high
+## 2 high
+## 3 high
+## 4 high
+## 5 high
+## 6 high
+## 7 high
+## 8 high
+## 9 low
+## 10 low
+## 11 low
+## 12 low
+## 13 low
+## 14 low
+## 15 low
+# Standardize the ELA and math scores within each year.
+# A grouped mutate() keeps every z-score on the row it was computed from.
+# Summarising by year and then assigning the result back by position would
+# not: the summarised rows come back ordered by year, while new_school is
+# ordered by school, so the scores would be attached to the wrong rows.
+# scale() returns a one-column matrix, hence the as.numeric().
+new_school <- new_school %>%
+  group_by(year) %>%
+  mutate(z_ela = as.numeric(scale(mean_ela_score)),
+         z_math = as.numeric(scale(mean_math_score))) %>%
+  ungroup() %>%
+  as.data.frame()
+head(new_school)
+## school_cd school_name district_name county_name
+## 1 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT ALBANY
+## 2 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT ALBANY
+## 3 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT ALBANY
+## 4 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT ALBANY
+## 5 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT ALBANY
+## 6 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT ALBANY
+## region year total_enroll per_free_lunch per_reduced_lunch per_lep
+## 1 CAPITAL DISTRICT 2008 316 0.33 0.10 0.01
+## 2 CAPITAL DISTRICT 2009 312 0.32 0.08 0.02
+## 3 CAPITAL DISTRICT 2010 300 0.26 0.08 0.03
+## 4 CAPITAL DISTRICT 2011 326 0.22 0.07 0.03
+## 5 CAPITAL DISTRICT 2012 294 0.31 0.06 0.03
+## 6 CAPITAL DISTRICT 2013 321 0.25 0.08 0.03
+## mean_ela_score mean_math_score z_ela z_math
+## 1 658.0000 669.6667 -0.4645310 -0.5325526
+## 2 672.6667 678.6667 -0.1377583 -0.8628189
+## 3 670.3333 683.0000 -1.3577096 -1.4866552
+## 4 666.6667 681.3333 0.6682810 0.1096319
+## 5 670.3333 687.3333 -0.9329051 -1.5416996
+## 6 309.6667 311.0000 -0.1050810 -0.2389825
+# Join the cleaned school data (rows containing the -99 sentinel removed)
+# to the county-level ACS data. Merging the raw `school` table instead
+# would let -99 values leak into every sum and mean computed from total_dt.
+total_dt <- merge(new_school, acs, by = c("county_name", "year"))
+head(total_dt)
+## county_name year school_cd school_name
+## 1 ALBANY 2009 10306060001 CLARKSVILLE ELEMENTARY SCHOOL
+## 2 ALBANY 2009 10306060005 HAMAGRAEL ELEMENTARY SCHOOL
+## 3 ALBANY 2009 10623060006 LATHAM RIDGE SCHOOL
+## 4 ALBANY 2009 10802060009 PINE BUSH ELEMENTARY SCHOOL
+## 5 ALBANY 2009 10100010043 PHILIP J SCHUYLER ACHIEVEMENT ACADEMY
+## 6 ALBANY 2009 10100860899 ALBANY COMMUNITY CHARTER SCHOOL
+## district_name region total_enroll
+## 1 BETHLEHEM CENTRAL SCHOOL DISTRICT CAPITAL DISTRICT 226
+## 2 BETHLEHEM CENTRAL SCHOOL DISTRICT CAPITAL DISTRICT 388
+## 3 NORTH COLONIE CSD CAPITAL DISTRICT 424
+## 4 GUILDERLAND CENTRAL SCHOOL DISTRICT CAPITAL DISTRICT 485
+## 5 ALBANY CITY SCHOOL DISTRICT CAPITAL DISTRICT 310
+## 6 CAPITAL DISTRICT 232
+## per_free_lunch per_reduced_lunch per_lep mean_ela_score mean_math_score
+## 1 0.04 0.03 0.04 681.3333 697.6667
+## 2 0.02 0.03 0.00 689.0000 697.6667
+## 3 0.07 0.04 0.03 675.2500 692.7500
+## 4 0.06 0.01 0.00 689.0000 710.6667
+## 5 0.89 0.07 0.09 652.3333 664.0000
+## 6 0.77 0.15 0.02 669.0000 709.0000
+## county_per_poverty median_household_income county_per_bach group
+## 1 0.1183511 55350 0.1903682 high
+## 2 0.1183511 55350 0.1903682 high
+## 3 0.1183511 55350 0.1903682 high
+## 4 0.1183511 55350 0.1903682 high
+## 5 0.1183511 55350 0.1903682 high
+## 6 0.1183511 55350 0.1903682 high
+# Per county: total enrolment, the enrolment-weighted share of students on
+# free or reduced-price lunch, and the mean county poverty rate.
+tb1 <- total_dt %>%
+  group_by(county_name) %>%
+  summarise(
+    total_erm = sum(total_enroll),
+    q_for_lunch = (sum(total_enroll * per_free_lunch) +
+                     sum(total_enroll * per_reduced_lunch)) / total_erm,
+    per_poverty = mean(county_per_poverty)
+  ) %>%
+  as.data.frame()
+## `summarise()` ungrouping output (override with `.groups` argument)
+head(tb1)
+## county_name total_erm q_for_lunch per_poverty
+## 1 ALBANY 207584 0.3905571 0.1229838
+## 2 ALLEGANY 44973 0.5195804 0.1508549
+## 3 BRONX 1313511 0.8594981 0.2872294
+## 4 BROOME 154617 0.4711154 0.1584756
+## 5 CATTARAUGUS 74573 0.5091349 0.1642522
+## 6 CAYUGA 46201 0.4272994 0.1148119
+# The five counties with the lowest and the five with the highest mean
+# poverty rate. They are taken from the two ends of the sorted table so the
+# selection does not depend on the number of counties.
+for_names <- tb1 %>% arrange(per_poverty)
+c_names <- c(head(for_names$county_name, 5), tail(for_names$county_name, 5))
+c_names
+## [1] "NASSAU" "PUTNAM" "SUFFOLK" "SARATOGA" "DUTCHESS"
+## [6] "CHAUTAUQUA" "TOMPKINS" "MONTGOMERY" "KINGS" "BRONX"
+# For the counties listed in c_names: enrolment-weighted share of students
+# on free or reduced-price lunch, mean poverty rate, and mean ELA and math
+# scores.
+tb2 <- total_dt %>%
+  filter(county_name %in% c_names) %>%
+  group_by(county_name) %>%
+  summarise(
+    q_for_lunch = (sum(total_enroll * per_free_lunch) +
+                     sum(total_enroll * per_reduced_lunch)) / sum(total_enroll),
+    per_poverty = mean(county_per_poverty),
+    avg_read = mean(mean_ela_score),
+    avg_math = mean(mean_math_score)
+  ) %>%
+  as.data.frame()
+## `summarise()` ungrouping output (override with `.groups` argument)
+head(tb2)
+## county_name q_for_lunch per_poverty avg_read avg_math
+## 1 BRONX 0.8594981 0.28722937 435.9952 443.8711
+## 2 CHAUTAUQUA 0.5429325 0.17495020 457.6790 468.2254
+## 3 DUTCHESS 0.3252967 0.08351837 427.4772 434.7679
+## 4 KINGS 0.7957843 0.22478133 445.8000 454.8801
+## 5 MONTGOMERY 0.5392405 0.17723786 480.3711 490.9603
+## 6 NASSAU 0.1608488 0.05556675 444.1582 455.2362
+length(unique(school$school_name))
+## [1] 5042
+# Build the per-school data for one year: the combined share of students
+# with free or reduced-price lunch, and the sum of the mean ELA and math
+# scores. This is a row-wise calculation, not an aggregation, so
+# transmute() is used; summarise() is meant to return one row per group.
+lunch_vs_score <- function(data, yr) {
+  data %>%
+    filter(year == yr) %>%
+    transmute(school_name,
+              access = per_free_lunch + per_reduced_lunch,
+              total_mean_score = mean_ela_score + mean_math_score)
+}
+
+# Scatter plot of lunch access against the combined score.
+plot_lunch_vs_score <- function(plot_data) {
+  ggplot(plot_data) +
+    geom_point(aes(x = access, y = total_mean_score))
+}
+
+plt1 <- lunch_vs_score(new_school, 2008)
+plot_lunch_vs_score(plt1)
+plt2 <- lunch_vs_score(new_school, 2009)
+plot_lunch_vs_score(plt2)
+plt3 <- lunch_vs_score(new_school, 2010)
+plot_lunch_vs_score(plt3)
+Using the data for 2008, 2009 and 2010, I drew one scatter plot per year showing, for each school, the combined share of students with access to free or reduced-price lunch against the sum of the school's mean ELA and math scores.
The plots show a negative linear trend: in schools where a larger share of students has access to free or reduced-price lunch, students tend to have lower combined scores on the ELA and math exams.
plt22 <- total_dt %>%
+  # One row per income group and year: the midpoint of the mean ELA score
+  # and the mean math score.
+  group_by(group, year) %>%
+  summarise(
+    mean_score = (mean(mean_ela_score) + mean(mean_math_score)) / 2
+  ) %>%
+  as.data.frame()
+## `summarise()` regrouping output by 'group' (override with `.groups` argument)
+# One line per income group, tracing mean_score across years.
+ggplot(plt22, aes(x = year, y = mean_score, group = group, col = group)) +
+  geom_line()
+# One row per county: mean combined (ELA + math) score, mean combined share
+# of students on free or reduced-price lunch, and the county's income group.
+t7 <- total_dt %>%
+  group_by(county_name) %>%
+  summarise(
+    mean_score = mean(mean_ela_score + mean_math_score),
+    total_per = mean(per_free_lunch + per_reduced_lunch),
+    group = group[1]
+  ) %>%
+  as.data.frame()
+## `summarise()` ungrouping output (override with `.groups` argument)
+
+# One point per county, coloured by income group.
+ggplot(t7, aes(x = mean_score, y = total_per, group = group, col = group)) +
+  geom_point()
+In the plot, each point represents a county, and the colors represent the county's income group (low, medium or high, based on its average median household income).
From the plot we cannot see a clear relationship between each county's poverty level (measured by the total percentage of students eligible for free or reduced-price lunch) and its students' test performance.
However, this plot uses county-level data, which means that the test scores and lunch data are averaged across all schools within a county. If we go back to the first plot in Task 6, we do see that the schools with lower average scores often give more students access to free or reduced-price lunch.
Also, this trend has been consistent across the years, as illustrated by the three plots in Task 6.