diff --git a/submissions/Junpeng_Jiang_submission.html b/submissions/Junpeng_Jiang_submission.html new file mode 100644 index 0000000..9290285 --- /dev/null +++ b/submissions/Junpeng_Jiang_submission.html @@ -0,0 +1,790 @@ + + + + + + + + + + + + + +Junpeng Jiang 2020 Bootcamp Submission + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
library(dplyr)
+
## 
+## Attaching package: 'dplyr'
+
## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+
library(ggplot2)
+
+

Task 1

+
# Read the two raw inputs. here::here() builds each path relative to the
# project root, so the chunk does not depend on the working directory.
school <- read.csv(file = here::here("data/nys_schools.csv"))
acs <- read.csv(file = here::here("data/nys_acs.csv"))
+
+
+

Task 2

+
# Print the name, type, and first few values of every column in `school`.
str(school)
+
## 'data.frame':    35663 obs. of  12 variables:
+##  $ school_cd        : num  1.01e+10 1.01e+10 1.01e+10 1.01e+10 1.01e+10 ...
+##  $ school_name      : chr  "MONTESSORI MAGNET SCHOOL" "MONTESSORI MAGNET SCHOOL" "MONTESSORI MAGNET SCHOOL" "MONTESSORI MAGNET SCHOOL" ...
+##  $ district_name    : chr  "ALBANY CITY SCHOOL DISTRICT" "ALBANY CITY SCHOOL DISTRICT" "ALBANY CITY SCHOOL DISTRICT" "ALBANY CITY SCHOOL DISTRICT" ...
+##  $ county_name      : chr  "ALBANY" "ALBANY" "ALBANY" "ALBANY" ...
+##  $ region           : chr  "CAPITAL DISTRICT" "CAPITAL DISTRICT" "CAPITAL DISTRICT" "CAPITAL DISTRICT" ...
+##  $ year             : int  2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 ...
+##  $ total_enroll     : num  316 312 300 326 294 321 335 347 366 359 ...
+##  $ per_free_lunch   : num  0.33 0.32 0.26 0.22 0.31 0.25 0.35 0.22 0.29 0.26 ...
+##  $ per_reduced_lunch: num  0.1 0.08 0.08 0.07 0.06 0.08 0.04 0 0.02 0.01 ...
+##  $ per_lep          : num  0.01 0.02 0.03 0.03 0.03 0.03 0.06 0.06 0.05 0.07 ...
+##  $ mean_ela_score   : num  658 673 670 667 670 ...
+##  $ mean_math_score  : num  670 679 683 681 687 ...
+
# Print the name, type, and first few values of every column in `acs`.
str(acs)
+
## 'data.frame':    496 obs. of  5 variables:
+##  $ county_name            : chr  "ALBANY" "ALBANY" "ALBANY" "ALBANY" ...
+##  $ year                   : int  2009 2010 2011 2012 2013 2014 2015 2016 2009 2010 ...
+##  $ county_per_poverty     : num  0.118 0.119 0.121 0.124 0.123 ...
+##  $ median_household_income: int  55350 56090 57715 59359 59394 59940 59887 60904 40917 41305 ...
+##  $ county_per_bach        : num  0.19 0.197 0.199 0.198 0.205 ...
+
# Per-column summaries of `school`; the -99 minima in the numeric columns are
# the missing-value codes.
summary(school)
+
##    school_cd         school_name        district_name      county_name       
+##  Min.   :1.010e+10   Length:35663       Length:35663       Length:35663      
+##  1st Qu.:2.802e+11   Class :character   Class :character   Class :character  
+##  Median :3.317e+11   Mode  :character   Mode  :character   Mode  :character  
+##  Mean   :3.568e+11                                                           
+##  3rd Qu.:4.725e+11                                                           
+##  Max.   :6.808e+11                                                           
+##     region               year       total_enroll    per_free_lunch    
+##  Length:35663       Min.   :2008   Min.   : -99.0   Min.   :-99.0000  
+##  Class :character   1st Qu.:2010   1st Qu.: 339.0   1st Qu.:  0.1900  
+##  Mode  :character   Median :2013   Median : 469.0   Median :  0.4200  
+##                     Mean   :2013   Mean   : 523.6   Mean   :  0.4188  
+##                     3rd Qu.:2015   3rd Qu.: 648.0   3rd Qu.:  0.7200  
+##                     Max.   :2017   Max.   :2347.0   Max.   :257.0000  
+##  per_reduced_lunch      per_lep          mean_ela_score  mean_math_score
+##  Min.   :-99.00000   Min.   :-99.00000   Min.   :-99.0   Min.   :-99.0  
+##  1st Qu.:  0.03000   1st Qu.:  0.00000   1st Qu.:296.0   1st Qu.:298.0  
+##  Median :  0.06000   Median :  0.03000   Median :324.2   Median :330.8  
+##  Mean   :  0.02852   Mean   :  0.04124   Mean   :447.1   Mean   :456.0  
+##  3rd Qu.:  0.10000   3rd Qu.:  0.11000   3rd Qu.:666.3   3rd Qu.:683.5  
+##  Max.   : 53.00000   Max.   :  1.00000   Max.   :720.8   Max.   :738.7
+
# Per-column summaries of `acs`.
summary(acs)
+
##  county_name             year      county_per_poverty median_household_income
+##  Length:496         Min.   :2009   Min.   :0.04689    Min.   : 33794         
+##  Class :character   1st Qu.:2011   1st Qu.:0.10903    1st Qu.: 46347         
+##  Mode  :character   Median :2012   Median :0.12884    Median : 50135         
+##                     Mean   :2012   Mean   :0.13085    Mean   : 54116         
+##                     3rd Qu.:2014   3rd Qu.:0.14929    3rd Qu.: 56448         
+##                     Max.   :2016   Max.   :0.29935    Max.   :102044         
+##  county_per_bach  
+##  Min.   :0.07574  
+##  1st Qu.:0.11018  
+##  Median :0.13169  
+##  Mean   :0.14410  
+##  3rd Qu.:0.17431  
+##  Max.   :0.31795
+ +
+
+

Task 3

+
+

Dealing with Missing values

+
    +
  • Deal with -99, first count how many missing values there are.
  • +
+
# Count the -99 entries in columns 7 to 12 of `school` (total_enroll through
# mean_math_score), giving one integer count per column.
num_of_na <- vapply(
  school[7:12],
  function(column) sum(column == -99),
  integer(1),
  USE.NAMES = FALSE
)
num_of_na
+
## [1]   13   15   15   13 2208 2210
+
# Number of rows and columns in the raw schools table.
dim(school)
+
## [1] 35663    12
+
    +
  • Remember that there are 35663 rows in the school dataset, so taking out the roughly 2200 rows that contain a missing value (about 6% of the data) should not affect the entire dataset considerably.
  • +
+
# Recode every -99 as NA, drop each row that then contains an NA, and report
# the dimensions of the cleaned table.
new_school <- na.omit(replace(school, school == -99, NA))
dim(new_school)
+
## [1] 33437    12
+
    +
  • This way we take out all rows with missing values, and the remaining dataset new_school has 33437 observations.
  • +
+
+
+

Create poverty level.

+
# Average each county's median household income over all of its ACS rows;
# the result has one row per county with the mean stored in column `n`.
county_avg_median <- summarise(
  group_by(acs, county_name),
  n = mean(median_household_income)
)
+
## `summarise()` ungrouping output (override with `.groups` argument)
+
# Drop the tibble class so the table can be indexed like a plain data frame
# later on, then plot the distribution of the county averages.
# hist() is given the same expression as before, so its automatic title and
# axis label are unchanged.
county_avg_median <- as.data.frame(x = county_avg_median)
hist(county_avg_median$n)
+

+
    +
  • Compute the average median income of each county across the years and plot a histogram of the averages.
  • +
+
# Quartiles of the county averages; these are used to choose the income cutoffs.
summary(county_avg_median$n)
+
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   34422   46731   50069   54116   56768   97067
+
    +
  • By looking at the quantiles, I decide to split the groups as follows:
  • +
  • < 46731 : low
  • +
  • between 46731 and 56768: medium
  • +
  • larger than 56768:high
  • +
+
# Label every county by its average median household income:
#   "low"    : below 46731 (the first quartile printed by summary()),
#   "medium" : from 46731 up to and including 56768,
#   "high"   : above 56768 (the third quartile).
# The comparison is vectorised over the whole column, so there is no row loop
# and no `1:n` sequence that would misbehave on an empty table.
income_low_cut <- 46731
income_high_cut <- 56768
county_avg_median$group <- ifelse(
  county_avg_median$n < income_low_cut,
  "low",
  ifelse(county_avg_median$n <= income_high_cut, "medium", "high")
)
head(county_avg_median)
+
##   county_name        n  group
+## 1      ALBANY 58579.88   high
+## 2    ALLEGANY 42281.12    low
+## 3       BRONX 34421.88    low
+## 4      BROOME 45716.25    low
+## 5 CATTARAUGUS 42811.88    low
+## 6      CAYUGA 50686.00 medium
+
    +
  • This dataframe indexes each county with a group of low/medium/high.
  • +
  • Then append the group variable back to the county dataset.
  • +
+
# Copy each county's income group onto every ACS row of that county.
# match() returns, for each ACS row, the position of its county in
# county_avg_median (NA if the county is absent), so the lookup is one
# vectorised index instead of a which() scan per row. An unmatched county
# yields an NA group rather than a zero-length replacement error.
county_row <- match(acs$county_name, county_avg_median$county_name)
acs$group <- county_avg_median$group[county_row]
head(acs, 15)
+
##    county_name year county_per_poverty median_household_income county_per_bach
+## 1       ALBANY 2009          0.1183511                   55350      0.19036819
+## 2       ALBANY 2010          0.1194052                   56090      0.19718856
+## 3       ALBANY 2011          0.1207243                   57715      0.19898258
+## 4       ALBANY 2012          0.1237525                   59359      0.19755045
+## 5       ALBANY 2013          0.1229728                   59394      0.20453518
+## 6       ALBANY 2014          0.1286178                   59940      0.20119979
+## 7       ALBANY 2015          0.1279796                   59887      0.20136847
+## 8       ALBANY 2016          0.1226051                   60904      0.20611462
+## 9     ALLEGANY 2009          0.1521532                   40917      0.09468291
+## 10    ALLEGANY 2010          0.1491905                   41305      0.09019852
+## 11    ALLEGANY 2011          0.1483946                   41900      0.08712904
+## 12    ALLEGANY 2012          0.1528964                   42095      0.09225430
+## 13    ALLEGANY 2013          0.1474557                   42445      0.09361813
+## 14    ALLEGANY 2014          0.1524583                   42726      0.09504335
+## 15    ALLEGANY 2015          0.1515498                   42776      0.09833813
+##    group
+## 1   high
+## 2   high
+## 3   high
+## 4   high
+## 5   high
+## 6   high
+## 7   high
+## 8   high
+## 9    low
+## 10   low
+## 11   low
+## 12   low
+## 13   low
+## 14   low
+## 15   low
+
    +
  • The group variable is created as stated.
  • +
+
+
+

Create Z-Score

+
# Standardise the ELA and math scores within each year.
# transmute() returns one row per input row in the original row order, whereas
# summarise() returned the rows sorted by year. The columns are later copied
# onto new_school by position, so they must stay in new_school's row order.
# as.numeric() drops the one-column matrix shape that scale() returns.
z_dt <- new_school %>%
  group_by(year) %>%
  transmute(
    n = as.numeric(scale(mean_ela_score)),
    m = as.numeric(scale(mean_math_score))
  ) %>%
  ungroup()
+
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
+
# Attach the standardised scores to new_school as z_ela and z_math.
# NOTE(review): the values are copied by position, so this relies on z_dt
# having one row per row of new_school, in the same order - confirm.
new_school[["z_ela"]] <- z_dt[["n"]]
new_school[["z_math"]] <- z_dt[["m"]]
head(new_school)
+
##     school_cd              school_name               district_name county_name
+## 1 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT      ALBANY
+## 2 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT      ALBANY
+## 3 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT      ALBANY
+## 4 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT      ALBANY
+## 5 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT      ALBANY
+## 6 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT      ALBANY
+##             region year total_enroll per_free_lunch per_reduced_lunch per_lep
+## 1 CAPITAL DISTRICT 2008          316           0.33              0.10    0.01
+## 2 CAPITAL DISTRICT 2009          312           0.32              0.08    0.02
+## 3 CAPITAL DISTRICT 2010          300           0.26              0.08    0.03
+## 4 CAPITAL DISTRICT 2011          326           0.22              0.07    0.03
+## 5 CAPITAL DISTRICT 2012          294           0.31              0.06    0.03
+## 6 CAPITAL DISTRICT 2013          321           0.25              0.08    0.03
+##   mean_ela_score mean_math_score      z_ela     z_math
+## 1       658.0000        669.6667 -0.4645310 -0.5325526
+## 2       672.6667        678.6667 -0.1377583 -0.8628189
+## 3       670.3333        683.0000 -1.3577096 -1.4866552
+## 4       666.6667        681.3333  0.6682810  0.1096319
+## 5       670.3333        687.3333 -0.9329051 -1.5416996
+## 6       309.6667        311.0000 -0.1050810 -0.2389825
+
+
+
+

Task 4

+
# Join the school rows to the county-level ACS rows on county and year.
# The cleaned table (new_school) is merged rather than the raw one, so the -99
# missing-value codes never enter the sums and means computed from total_dt.
# merge() keeps only the county/year pairs present in both tables.
total_dt <- merge(new_school, acs, by = c("county_name", "year"))
head(total_dt)
+
##   county_name year   school_cd                           school_name
+## 1      ALBANY 2009 10306060001         CLARKSVILLE ELEMENTARY SCHOOL
+## 2      ALBANY 2009 10306060005           HAMAGRAEL ELEMENTARY SCHOOL
+## 3      ALBANY 2009 10623060006                   LATHAM RIDGE SCHOOL
+## 4      ALBANY 2009 10802060009           PINE BUSH ELEMENTARY SCHOOL
+## 5      ALBANY 2009 10100010043 PHILIP J SCHUYLER ACHIEVEMENT ACADEMY
+## 6      ALBANY 2009 10100860899       ALBANY COMMUNITY CHARTER SCHOOL
+##                         district_name           region total_enroll
+## 1   BETHLEHEM CENTRAL SCHOOL DISTRICT CAPITAL DISTRICT          226
+## 2   BETHLEHEM CENTRAL SCHOOL DISTRICT CAPITAL DISTRICT          388
+## 3                   NORTH COLONIE CSD CAPITAL DISTRICT          424
+## 4 GUILDERLAND CENTRAL SCHOOL DISTRICT CAPITAL DISTRICT          485
+## 5         ALBANY CITY SCHOOL DISTRICT CAPITAL DISTRICT          310
+## 6                                     CAPITAL DISTRICT          232
+##   per_free_lunch per_reduced_lunch per_lep mean_ela_score mean_math_score
+## 1           0.04              0.03    0.04       681.3333        697.6667
+## 2           0.02              0.03    0.00       689.0000        697.6667
+## 3           0.07              0.04    0.03       675.2500        692.7500
+## 4           0.06              0.01    0.00       689.0000        710.6667
+## 5           0.89              0.07    0.09       652.3333        664.0000
+## 6           0.77              0.15    0.02       669.0000        709.0000
+##   county_per_poverty median_household_income county_per_bach group
+## 1          0.1183511                   55350       0.1903682  high
+## 2          0.1183511                   55350       0.1903682  high
+## 3          0.1183511                   55350       0.1903682  high
+## 4          0.1183511                   55350       0.1903682  high
+## 5          0.1183511                   55350       0.1903682  high
+## 6          0.1183511                   55350       0.1903682  high
+
+
+

Task 5

+ +
# Per-county totals over all merged school/year rows:
#   total_erm   : summed enrolment,
#   q_for_lunch : enrolment-weighted head-count on free plus reduced-price
#                 lunch (divided by total_erm further down),
#   per_poverty : mean county poverty rate over the rows.
tb1 <- summarise(
  group_by(total_dt, county_name),
  total_erm = sum(total_enroll),
  q_for_lunch = sum(total_enroll * per_free_lunch) +
    sum(total_enroll * per_reduced_lunch),
  per_poverty = mean(county_per_poverty)
)
+
## `summarise()` ungrouping output (override with `.groups` argument)
+
# Convert to a plain data frame, then turn the lunch head-count into a share
# of total enrolment.
tb1 <- as.data.frame(x = tb1)
tb1[["q_for_lunch"]] <- tb1[["q_for_lunch"]] / tb1[["total_erm"]]
head(tb1)
+
##   county_name total_erm q_for_lunch per_poverty
+## 1      ALBANY    207584   0.3905571   0.1229838
+## 2    ALLEGANY     44973   0.5195804   0.1508549
+## 3       BRONX   1313511   0.8594981   0.2872294
+## 4      BROOME    154617   0.4711154   0.1584756
+## 5 CATTARAUGUS     74573   0.5091349   0.1642522
+## 6      CAYUGA     46201   0.4272994   0.1148119
+ +
# Order the counties from lowest to highest poverty rate and keep the names of
# the five at each end. head()/tail() replace the hard-coded positions 58:62,
# which only addressed the last five rows when tb1 had exactly 62 counties.
for_names <- tb1 %>% arrange(per_poverty)
c_names <- c(
  head(for_names$county_name, 5),
  tail(for_names$county_name, 5)
)
c_names # the five lowest- and the five highest-poverty counties
+
##  [1] "NASSAU"     "PUTNAM"     "SUFFOLK"    "SARATOGA"   "DUTCHESS"  
+##  [6] "CHAUTAUQUA" "TOMPKINS"   "MONTGOMERY" "KINGS"      "BRONX"
+
# Same county summary as tb1, restricted to the counties listed in c_names,
# plus the mean ELA (avg_read) and mean math (avg_math) scores.
# total_erm is only needed as the denominator for q_for_lunch.
tb2 <- total_dt[total_dt$county_name %in% c_names, ] %>%
  group_by(county_name) %>%
  summarise(
    q_for_lunch = sum(total_enroll * per_free_lunch) +
      sum(total_enroll * per_reduced_lunch),
    per_poverty = mean(county_per_poverty),
    avg_read = mean(mean_ela_score),
    avg_math = mean(mean_math_score),
    total_erm = sum(total_enroll)
  )
+
## `summarise()` ungrouping output (override with `.groups` argument)
+
# Convert to a plain data frame, turn the lunch head-count into a share of
# enrolment, and drop the helper denominator column before printing.
tb2 <- as.data.frame(x = tb2)
tb2[["q_for_lunch"]] <- tb2[["q_for_lunch"]] / tb2[["total_erm"]]
tb2[["total_erm"]] <- NULL
head(tb2)
+
##   county_name q_for_lunch per_poverty avg_read avg_math
+## 1       BRONX   0.8594981  0.28722937 435.9952 443.8711
+## 2  CHAUTAUQUA   0.5429325  0.17495020 457.6790 468.2254
+## 3    DUTCHESS   0.3252967  0.08351837 427.4772 434.7679
+## 4       KINGS   0.7957843  0.22478133 445.8000 454.8801
+## 5  MONTGOMERY   0.5392405  0.17723786 480.3711 490.9603
+## 6      NASSAU   0.1608488  0.05556675 444.1582 455.2362
+
+
+

Task 6

+
# Number of distinct school names in the raw schools table.
n_distinct(school$school_name)
+
## [1] 5042
+
# School-level view for 2008: the share of students on free or reduced-price
# lunch, and the sum of the mean ELA and mean math scores.
# Both are row-wise calculations, so transmute() is used. summarise() is meant
# for one row per group, and newer dplyr releases deprecate returning several
# rows per group from it.
plt1 <- new_school %>%
  filter(year == 2008) %>%
  transmute(
    school_name,
    access = per_free_lunch + per_reduced_lunch,
    total_mean_score = mean_ela_score + mean_math_score
  )
+
## `summarise()` regrouping output by 'school_name' (override with `.groups` argument)
+
# Scatter plot for 2008: lunch access on x, combined mean score on y.
ggplot(plt1, aes(x = access, y = total_mean_score)) +
  geom_point()
+

+
# School-level view for 2009: the share of students on free or reduced-price
# lunch, and the sum of the mean ELA and mean math scores.
# Both are row-wise calculations, so transmute() is used. summarise() is meant
# for one row per group, and newer dplyr releases deprecate returning several
# rows per group from it.
plt2 <- new_school %>%
  filter(year == 2009) %>%
  transmute(
    school_name,
    access = per_free_lunch + per_reduced_lunch,
    total_mean_score = mean_ela_score + mean_math_score
  )
+
## `summarise()` regrouping output by 'school_name' (override with `.groups` argument)
+
# Scatter plot for 2009: lunch access on x, combined mean score on y.
ggplot(plt2, aes(x = access, y = total_mean_score)) +
  geom_point()
+

+
# School-level view for 2010: the share of students on free or reduced-price
# lunch, and the sum of the mean ELA and mean math scores.
# Both are row-wise calculations, so transmute() is used. summarise() is meant
# for one row per group, and newer dplyr releases deprecate returning several
# rows per group from it.
plt3 <- new_school %>%
  filter(year == 2010) %>%
  transmute(
    school_name,
    access = per_free_lunch + per_reduced_lunch,
    total_mean_score = mean_ela_score + mean_math_score
  )
+
## `summarise()` regrouping output by 'school_name' (override with `.groups` argument)
+
# Scatter plot for 2010: lunch access on x, combined mean score on y.
ggplot(plt3, aes(x = access, y = total_mean_score)) +
  geom_point()
+

+ +
# For every income group and year, average the mean ELA score and the mean
# math score over the merged rows, then take the midpoint of the two averages.
plt22 <- summarise(
  group_by(total_dt, group, year),
  mean_score = (mean(mean_ela_score) + mean(mean_math_score)) / 2
)
+
## `summarise()` regrouping output by 'group' (override with `.groups` argument)
+
# One line per income group, showing its average score by year.
plt22 <- as.data.frame(x = plt22)
ggplot(plt22, aes(x = year, y = mean_score, group = group, col = group)) +
  geom_line()
+

+ +
+
+

Task 7

+
# One row per county: the mean of (ELA + math) score, the mean share of
# students on free or reduced-price lunch, and the county's income group
# (taken from the first row of the county).
t7 <- summarise(
  group_by(total_dt, county_name),
  mean_score = mean(mean_ela_score + mean_math_score),
  total_per = mean(per_free_lunch + per_reduced_lunch),
  group = group[1]
)
+
## `summarise()` ungrouping output (override with `.groups` argument)
+
# County-level scatter plot: combined mean score on x, lunch share on y,
# coloured by income group.
t7 <- as.data.frame(x = t7)

ggplot(t7, aes(x = mean_score, y = total_per, group = group, col = group)) +
  geom_point()
+

+ +
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/submissions/Junpeng_Jiang_submission.rmd b/submissions/Junpeng_Jiang_submission.rmd new file mode 100644 index 0000000..dc43cab --- /dev/null +++ b/submissions/Junpeng_Jiang_submission.rmd @@ -0,0 +1,228 @@
---
title: "Junpeng Jiang 2020 Bootcamp Submission"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library(dplyr)
library(ggplot2)
```

# Task 1
```{r}
# here::here() builds each path relative to the project root.
school <- read.csv(here::here("data/nys_schools.csv"))
acs <- read.csv(here::here("data/nys_acs.csv"))
```

# Task 2
```{r}
str(school)
str(acs)
summary(school)
summary(acs)
```

- Check for data types
- The missing values in the school dataset are coded as -99; all categorical variables (plus school_cd) are complete.


# Task 3
### Dealing with Missing values

- Deal with -99, first count how many missing values there are.

```{r}
# Count the -99 entries in columns 7 to 12 (total_enroll through
# mean_math_score), one integer count per column.
num_of_na <- vapply(
  school[7:12],
  function(column) sum(column == -99),
  integer(1),
  USE.NAMES = FALSE
)
num_of_na
dim(school)
```

- Remember that there are 35663 rows in the school dataset, so taking out the roughly 2200 rows that contain a missing value should not affect the entire dataset considerably.

```{r}
# Recode -99 as NA, then drop every row that contains an NA.
new_school <- school
new_school[new_school == -99] <- NA
new_school <- na.omit(new_school)
dim(new_school)
```

- This way we take out all rows with missing values, and the remaining dataset new_school has 33437 observations.

### Create poverty level.

```{r}
# Average each county's median household income over its ACS years.
county_avg_median <- acs %>%
  group_by(county_name) %>%
  summarise(n = mean(median_household_income))
county_avg_median <- as.data.frame(county_avg_median)
hist(county_avg_median$n)
```

- Compute the average median income of each county across the years and plot a histogram of the averages.

```{r}
summary(county_avg_median$n)
```

- By looking at the quantiles, I decide to split the groups as follows:
- < 46731 : low
- between 46731 and 56768: medium
- larger than 56768: high

```{r}
# Vectorised classification with the cutoffs listed above. The boundaries match
# the text: below the low cutoff is "low", up to and including the high cutoff
# is "medium", anything above is "high".
income_low_cut <- 46731
income_high_cut <- 56768
county_avg_median$group <- ifelse(
  county_avg_median$n < income_low_cut,
  "low",
  ifelse(county_avg_median$n <= income_high_cut, "medium", "high")
)
head(county_avg_median)
```

- This dataframe indexes each county with a group of low/medium/high.
- Then append the group variable back to the county dataset.

```{r}
# match() finds each ACS row's county in county_avg_median (NA if absent), so
# the group is looked up in one vectorised step.
county_row <- match(acs$county_name, county_avg_median$county_name)
acs$group <- county_avg_median$group[county_row]
head(acs, 15)
```

- The group variable is created as stated.

### Create Z-Score
```{r}
# Standardise the scores within each year. transmute() keeps one row per input
# row in the original order, so the columns can be copied onto new_school by
# position; summarise() would return the rows sorted by year and misalign them.
# as.numeric() drops the one-column matrix shape returned by scale().
z_dt <- new_school %>%
  group_by(year) %>%
  transmute(
    n = as.numeric(scale(mean_ela_score)),
    m = as.numeric(scale(mean_math_score))
  ) %>%
  ungroup()

new_school$z_ela <- z_dt$n
new_school$z_math <- z_dt$m
head(new_school)
```

# Task 4
```{r}
# Merge the cleaned table, not the raw one, so the -99 codes never enter the
# sums and means computed below.
total_dt <- merge(new_school, acs, by = c("county_name", "year"))
head(total_dt)
```

# Task 5

- 1

```{r}
tb1 <- total_dt %>%
  group_by(county_name) %>%
  summarise(
    total_erm = sum(total_enroll),
    q_for_lunch = sum(total_enroll * per_free_lunch) +
      sum(total_enroll * per_reduced_lunch),
    per_poverty = mean(county_per_poverty)
  )

tb1 <- as.data.frame(tb1)
# Turn the enrolment-weighted head-count into a share of total enrolment.
tb1$q_for_lunch <- tb1$q_for_lunch / tb1$total_erm
head(tb1)
```

- 2
```{r}

# Five lowest- and five highest-poverty counties; head()/tail() avoid
# hard-coding the number of counties.
for_names <- tb1 %>% arrange(per_poverty)
c_names <- c(
  head(for_names$county_name, 5),
  tail(for_names$county_name, 5)
)
c_names # the five lowest- and the five highest-poverty counties

tb2 <- total_dt %>%
  filter(county_name %in% c_names) %>%
  group_by(county_name) %>%
  summarise(
    q_for_lunch = sum(total_enroll * per_free_lunch) +
      sum(total_enroll * per_reduced_lunch),
    per_poverty = mean(county_per_poverty),
    avg_read = mean(mean_ela_score),
    avg_math = mean(mean_math_score),
    total_erm = sum(total_enroll)
  )
tb2 <- as.data.frame(tb2)
tb2$q_for_lunch <- tb2$q_for_lunch / tb2$total_erm
tb2$total_erm <- NULL
head(tb2)
```

# Task 6
```{r}
length(unique(school$school_name))

# Lunch access and combined mean score for every school in a single year.
# These are row-wise calculations, so transmute() is used instead of a grouped
# summarise() that returns several rows per group.
access_vs_score <- function(data, target_year) {
  data %>%
    filter(year == target_year) %>%
    transmute(
      school_name,
      access = per_free_lunch + per_reduced_lunch,
      total_mean_score = mean_ela_score + mean_math_score
    )
}

plt1 <- access_vs_score(new_school, 2008)
ggplot(plt1, aes(x = access, y = total_mean_score)) +
  geom_point()

plt2 <- access_vs_score(new_school, 2009)
ggplot(plt2, aes(x = access, y = total_mean_score)) +
  geom_point()

plt3 <- access_vs_score(new_school, 2010)
ggplot(plt3, aes(x = access, y = total_mean_score)) +
  geom_point()
```

- By subsetting data from 2008-2010, I plotted the scatter plot of the total percentage of access to free/reduced price lunch versus the total mean scores of each school.

- Through the plot we can see a negative linear trend: for schools from which students have more access to free/reduced price lunch, these students tend to have lower total scores on both ELA and math exams.

```{r}
plt22 <- total_dt %>%
  group_by(group, year) %>%
  summarise(
    mean_score = (mean(mean_ela_score) + mean(mean_math_score)) / 2
  )

plt22 <- as.data.frame(plt22)
ggplot(plt22, aes(x = year, y = mean_score, group = group, col = group)) +
  geom_line()
```

<!-- NOTE(review): total_dt is built from the cleaned table, so re-knit and re-check the statements below against the regenerated plots. -->

- This plot shows the average scores of each poverty level group across the years.
- If we take averages across all counties with their group labels (high/low/medium), the differences between each group's mean score are not as considerable.

# Task 7

```{r}
t7 <- total_dt %>%
  group_by(county_name) %>%
  summarise(
    mean_score = mean(mean_ela_score + mean_math_score),
    total_per = mean(per_free_lunch + per_reduced_lunch),
    group = group[1]
  )
t7 <- as.data.frame(t7)

ggplot(t7, aes(x = mean_score, y = total_per, group = group, col = group)) +
  geom_point()

```

- In the plot, each point represents a county, and the colors represent their poverty labels.

- From the plot we cannot see a clear relationship between each county's poverty level (measured by the total percentage of students allowed free/reduced price lunch) and its students' test performance.

- However, this plot is using data that is county-level, which means that the test scores and lunch data are averaged across all schools within one county. If we go back to plot 1 in Task 6, we do see that the schools with lower average scores often allow more students to access free/less expensive lunch.

- Also, this trend has been consistent across the years, as illustrated by the three plots in Task 6.
