From 77ff9a0ff206cb915145bd3afb6cfaa59a949470 Mon Sep 17 00:00:00 2001 From: Junpeng Jiang Date: Thu, 3 Sep 2020 23:50:16 +0800 Subject: [PATCH 1/3] test for 9/3 --- ...Rmd-datamanip1_exercises_Junpeng_Jiang.Rmd | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 submissions/day4_Rmd-datamanip1_exercises_Junpeng_Jiang.Rmd diff --git a/submissions/day4_Rmd-datamanip1_exercises_Junpeng_Jiang.Rmd b/submissions/day4_Rmd-datamanip1_exercises_Junpeng_Jiang.Rmd new file mode 100644 index 0000000..1a6b54d --- /dev/null +++ b/submissions/day4_Rmd-datamanip1_exercises_Junpeng_Jiang.Rmd @@ -0,0 +1,148 @@ +--- +title: "Exercises Day 2" +author: "Richard Paquin Morel, adapted from exercises by Christina Maimone" +date: "`r Sys.Date()`" +output: html_document +params: + answers: FALSE +--- + + +```{r, echo=FALSE, eval=TRUE} +answers<-params$answers +``` + +```{r global_options, echo = FALSE, include = FALSE} +knitr::opts_chunk$set(echo=answers, eval=answers, + warning = FALSE, message = FALSE, + cache = FALSE, tidy = FALSE) +``` + +## Load the data + +Load the `gapminder` dataset. + +```{r} +### Answer +dt <- read.csv(here::here("data/gapminder5.csv")) +``` + +```{r} +gapminder <- read.csv(here::here("data/gapminder5.csv"), stringsAsFactors=FALSE) +``` + + +## If Statement + +Use an if() statement to print a suitable message reporting whether there are any records from 2002 in the gapminder dataset. Now do the same for 2012. + +Hint: use the `any` function. 
+ +```{asis} +### Answer +``` + +```{r} +year<-2002 +if(any(gapminder$year == year)){ + print(paste("Record(s) for the year",year,"found.")) +} else { + print(paste("No records for year",year)) +} +``` + + +## Loop and If Statements + +Write a script that finds the mean life expectancy by country for countries whose population is below the mean for the dataset + +Write a script that loops through the `gapminder` data by continent and prints out whether the mean life expectancy is smaller than 50, between 50 and 70, or greater than 70. + +```{asis} +### Answer +``` + +```{r} +overall_mean <- mean(gapminder$pop) + +for (i in unique(gapminder$country)) { + country_mean <- mean(gapminder$pop[gapminder$country==i]) + + if (country_mean < overall_mean) { + mean_le <- mean(gapminder$lifeExp[gapminder$country==i]) + print(paste("Mean Life Expectancy in", i, "is", mean_le)) + } +} # end for loop +``` + +```{r} +lower_threshold <- 50 +upper_threshold <- 70 + +for (i in unique(gapminder$continent)){ + tmp <- mean(gapminder$lifeExp[gapminder$continent==i]) + + if (tmp < lower_threshold){ + print(paste("Average Life Expectancy in", i, "is less than", lower_threshold)) + } + else if (tmp > lower_threshold & tmp < upper_threshold){ + print(paste("Average Life Expectancy in", i, "is between", lower_threshold, "and", upper_threshold)) + } + else { + print(paste("Average Life Expectancy in", i, "is greater than", upper_threshold)) + } + +} +``` + + +## Exercise: Write Functions + +Create a function that given a data frame will print the name of each column and the class of data it contains. Use the gapminder dataset. Hint: Use `mode()` or `class()` to get the class of the data in each column. Remember that `names()` or `colnames()` returns the name of the columns in a dataset. 
+ +```{asis} +### Answer + +Note: Some of these were taken or modified from https://www.r-bloggers.com/functions-exercises/ +``` + +```{r} +data_frame_info <- function(df) { + cols <- names(df) + for (i in cols) { + print(paste0(i, ": ", mode(df[, i]))) + } +} +data_frame_info(gapminder) +``` + +Create a function that given a vector will print the mean and the standard deviation of a **vector**, it will optionally also print the median. Hint: include an argument that takes a boolean (`TRUE`/`FALSE`) operator and then include an `if` statement. + +```{asis} +### Answer + +``` + +```{r} +vector_info <- function(x, include_median=FALSE) { + print(paste("Mean:", mean(x))) + print(paste("Standard Deviation:", sd(x))) + if (include_median) { + print(paste("Median:", median(x))) + } +} + +le <- gapminder$lifeExp +vector_info(le, include_median = F) +vector_info(le, include_median = T) +``` + +## Analyzing the relationship + +Use what you've learned so far to answer the following questions using the `gapminder` dataset. Be sure to include some visualizations! + +1. What is the relationship between GDP per capita and life expectancy? Does this relationship change over time? (Hint: Use the natural log of both variables.) + +2. Does the relationship between GDP per capita and life expectacy vary by continent? Make sure you divide the Americas into North and South America. + +- This file submitted on 9/3 is a practice for making a pull request. The excercise are not done yet. 
\ No newline at end of file From 0c5fd83a67202112d0b063fb06019295a3bdac0f Mon Sep 17 00:00:00 2001 From: Junpeng Jiang Date: Fri, 18 Sep 2020 05:53:38 +0800 Subject: [PATCH 2/3] add my submission --- submissions/Junpeng_Jiang_submission.html | 790 ++++++++++++++++++++++ submissions/Junpeng_Jiang_submission.rmd | 228 +++++++ 2 files changed, 1018 insertions(+) create mode 100644 submissions/Junpeng_Jiang_submission.html create mode 100644 submissions/Junpeng_Jiang_submission.rmd diff --git a/submissions/Junpeng_Jiang_submission.html b/submissions/Junpeng_Jiang_submission.html new file mode 100644 index 0000000..9290285 --- /dev/null +++ b/submissions/Junpeng_Jiang_submission.html @@ -0,0 +1,790 @@ + + + + + + + + + + + + + +Junpeng Jiang 2020 Bootcamp Submission + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
library(dplyr)
+
## 
+## Attaching package: 'dplyr'
+
## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+
library(ggplot2)
+
+

Task 1

+
school <- read.csv(here::here("data/nys_schools.csv"))  # school-level records (data/nys_schools.csv)
+acs <- read.csv(here::here("data/nys_acs.csv"))  # county-level records (data/nys_acs.csv)
+
+
+

Task 2

+
str(school)  # column types and first values of the school-level data
+
## 'data.frame':    35663 obs. of  12 variables:
+##  $ school_cd        : num  1.01e+10 1.01e+10 1.01e+10 1.01e+10 1.01e+10 ...
+##  $ school_name      : chr  "MONTESSORI MAGNET SCHOOL" "MONTESSORI MAGNET SCHOOL" "MONTESSORI MAGNET SCHOOL" "MONTESSORI MAGNET SCHOOL" ...
+##  $ district_name    : chr  "ALBANY CITY SCHOOL DISTRICT" "ALBANY CITY SCHOOL DISTRICT" "ALBANY CITY SCHOOL DISTRICT" "ALBANY CITY SCHOOL DISTRICT" ...
+##  $ county_name      : chr  "ALBANY" "ALBANY" "ALBANY" "ALBANY" ...
+##  $ region           : chr  "CAPITAL DISTRICT" "CAPITAL DISTRICT" "CAPITAL DISTRICT" "CAPITAL DISTRICT" ...
+##  $ year             : int  2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 ...
+##  $ total_enroll     : num  316 312 300 326 294 321 335 347 366 359 ...
+##  $ per_free_lunch   : num  0.33 0.32 0.26 0.22 0.31 0.25 0.35 0.22 0.29 0.26 ...
+##  $ per_reduced_lunch: num  0.1 0.08 0.08 0.07 0.06 0.08 0.04 0 0.02 0.01 ...
+##  $ per_lep          : num  0.01 0.02 0.03 0.03 0.03 0.03 0.06 0.06 0.05 0.07 ...
+##  $ mean_ela_score   : num  658 673 670 667 670 ...
+##  $ mean_math_score  : num  670 679 683 681 687 ...
+
str(acs)  # column types and first values of the county-level data
+
## 'data.frame':    496 obs. of  5 variables:
+##  $ county_name            : chr  "ALBANY" "ALBANY" "ALBANY" "ALBANY" ...
+##  $ year                   : int  2009 2010 2011 2012 2013 2014 2015 2016 2009 2010 ...
+##  $ county_per_poverty     : num  0.118 0.119 0.121 0.124 0.123 ...
+##  $ median_household_income: int  55350 56090 57715 59359 59394 59940 59887 60904 40917 41305 ...
+##  $ county_per_bach        : num  0.19 0.197 0.199 0.198 0.205 ...
+
summary(school)  # per-column summaries; a minimum of -99 marks the missing-value code
+
##    school_cd         school_name        district_name      county_name       
+##  Min.   :1.010e+10   Length:35663       Length:35663       Length:35663      
+##  1st Qu.:2.802e+11   Class :character   Class :character   Class :character  
+##  Median :3.317e+11   Mode  :character   Mode  :character   Mode  :character  
+##  Mean   :3.568e+11                                                           
+##  3rd Qu.:4.725e+11                                                           
+##  Max.   :6.808e+11                                                           
+##     region               year       total_enroll    per_free_lunch    
+##  Length:35663       Min.   :2008   Min.   : -99.0   Min.   :-99.0000  
+##  Class :character   1st Qu.:2010   1st Qu.: 339.0   1st Qu.:  0.1900  
+##  Mode  :character   Median :2013   Median : 469.0   Median :  0.4200  
+##                     Mean   :2013   Mean   : 523.6   Mean   :  0.4188  
+##                     3rd Qu.:2015   3rd Qu.: 648.0   3rd Qu.:  0.7200  
+##                     Max.   :2017   Max.   :2347.0   Max.   :257.0000  
+##  per_reduced_lunch      per_lep          mean_ela_score  mean_math_score
+##  Min.   :-99.00000   Min.   :-99.00000   Min.   :-99.0   Min.   :-99.0  
+##  1st Qu.:  0.03000   1st Qu.:  0.00000   1st Qu.:296.0   1st Qu.:298.0  
+##  Median :  0.06000   Median :  0.03000   Median :324.2   Median :330.8  
+##  Mean   :  0.02852   Mean   :  0.04124   Mean   :447.1   Mean   :456.0  
+##  3rd Qu.:  0.10000   3rd Qu.:  0.11000   3rd Qu.:666.3   3rd Qu.:683.5  
+##  Max.   : 53.00000   Max.   :  1.00000   Max.   :720.8   Max.   :738.7
+
summary(acs)  # per-column summaries of the county-level data
+
##  county_name             year      county_per_poverty median_household_income
+##  Length:496         Min.   :2009   Min.   :0.04689    Min.   : 33794         
+##  Class :character   1st Qu.:2011   1st Qu.:0.10903    1st Qu.: 46347         
+##  Mode  :character   Median :2012   Median :0.12884    Median : 50135         
+##                     Mean   :2012   Mean   :0.13085    Mean   : 54116         
+##                     3rd Qu.:2014   3rd Qu.:0.14929    3rd Qu.: 56448         
+##                     Max.   :2016   Max.   :0.29935    Max.   :102044         
+##  county_per_bach  
+##  Min.   :0.07574  
+##  1st Qu.:0.11018  
+##  Median :0.13169  
+##  Mean   :0.14410  
+##  3rd Qu.:0.17431  
+##  Max.   :0.31795
+
    +
  • Check for data types
  • +
  • Missing values in the school dataset are coded as -99; all categorical variables (plus school_cd) are complete.
  • +
+
+
+

Task 3

+
+

Dealing with Missing values

+
    +
  • Deal with -99, first count how many missing values there are.
  • +
+
# Count how often the -99 missing-value code occurs in each of columns 7-12.
+num_of_na <- vapply(
+  school[, 7:12],
+  function(column) sum(column == -99),
+  integer(1),
+  USE.NAMES = FALSE
+)
+num_of_na
+
## [1]   13   15   15   13 2208 2210
+
dim(school)  # rows and columns before any cleaning
+
## [1] 35663    12
+
    +
  • Remember that there are 35663 rows in the school dataset, so taking out 2200 rows should not affect the entire dataset considerably.
  • +
+
# Recode the -99 sentinel as NA, then keep only the rows with no NA left.
+is_sentinel <- school == -99
+new_school <- replace(school, is_sentinel, NA)
+new_school <- na.omit(new_school)
+dim(new_school)
+
## [1] 33437    12
+
    +
  • This way we take out all rows with missing values, and the remaining dataset new_school has 33437 observations.
  • +
+
+
+

Create poverty level.

+
# Average each county's median household income over all of its ACS years.
+income_by_county <- group_by(acs, county_name)
+county_avg_median <- summarise(income_by_county, n = mean(median_household_income))
+
## `summarise()` ungrouping output (override with `.groups` argument)
+
county_avg_median <- as.data.frame(county_avg_median)  # convert the summarise() result to a plain data.frame
+hist(county_avg_median$n)  # distribution of the per-county averages
+

+
    +
  • Compute the average median income of each county across the years and plot a histogram of the averages.
  • +
+
summary(county_avg_median$n)  # quartiles of the per-county averages
+
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   34422   46731   50069   54116   56768   97067
+
    +
  • By looking at the quantiles, I decide to split the groups as follows:
  • +
  • < 46731 : low
  • +
  • between 46731 and 56768: medium
  • +
  • larger than 56768:high
  • +
+
# Label every county by its average median income: below 46731 is "low",
+# 46731 to 56768 inclusive is "medium", above 56768 is "high".
+# The vectorised form also handles an empty table and NA incomes (which
+# become NA), where the row-by-row loop over 1:n stopped with an error.
+low_cutoff <- 46731
+high_cutoff <- 56768
+county_avg_median$group <- ifelse(
+  county_avg_median$n < low_cutoff,
+  "low",
+  ifelse(county_avg_median$n <= high_cutoff, "medium", "high")
+)
+head(county_avg_median)
+
##   county_name        n  group
+## 1      ALBANY 58579.88   high
+## 2    ALLEGANY 42281.12    low
+## 3       BRONX 34421.88    low
+## 4      BROOME 45716.25    low
+## 5 CATTARAUGUS 42811.88    low
+## 6      CAYUGA 50686.00 medium
+
    +
  • This dataframe indexes each county with a group of low/medium/high.
  • +
  • Then append the group variable back to the county dataset.
  • +
+
# Copy each county's income group onto every yearly ACS row. match() looks
+# up all rows at once and gives NA for a county that has no entry in
+# county_avg_median, where the row-by-row loop stopped with an error.
+county_row <- match(acs$county_name, county_avg_median$county_name)
+acs$group <- county_avg_median$group[county_row]
+head(acs,15)
+
##    county_name year county_per_poverty median_household_income county_per_bach
+## 1       ALBANY 2009          0.1183511                   55350      0.19036819
+## 2       ALBANY 2010          0.1194052                   56090      0.19718856
+## 3       ALBANY 2011          0.1207243                   57715      0.19898258
+## 4       ALBANY 2012          0.1237525                   59359      0.19755045
+## 5       ALBANY 2013          0.1229728                   59394      0.20453518
+## 6       ALBANY 2014          0.1286178                   59940      0.20119979
+## 7       ALBANY 2015          0.1279796                   59887      0.20136847
+## 8       ALBANY 2016          0.1226051                   60904      0.20611462
+## 9     ALLEGANY 2009          0.1521532                   40917      0.09468291
+## 10    ALLEGANY 2010          0.1491905                   41305      0.09019852
+## 11    ALLEGANY 2011          0.1483946                   41900      0.08712904
+## 12    ALLEGANY 2012          0.1528964                   42095      0.09225430
+## 13    ALLEGANY 2013          0.1474557                   42445      0.09361813
+## 14    ALLEGANY 2014          0.1524583                   42726      0.09504335
+## 15    ALLEGANY 2015          0.1515498                   42776      0.09833813
+##    group
+## 1   high
+## 2   high
+## 3   high
+## 4   high
+## 5   high
+## 6   high
+## 7   high
+## 8   high
+## 9    low
+## 10   low
+## 11   low
+## 12   low
+## 13   low
+## 14   low
+## 15   low
+
    +
  • The group variable is created as stated.
  • +
+
+
+

Create Z-Score

+
# Standardise the scores within each year. ave() returns its result in the
+# original row order, so every z-score stays on the school-year row it was
+# computed from. Assigning the columns of a summarise() result back by
+# position does not: that result is ordered by year, new_school is not.
+z_score <- function(x) as.numeric(scale(x))
+new_school$z_ela <- ave(new_school$mean_ela_score, new_school$year,
+                        FUN = z_score)
+new_school$z_math <- ave(new_school$mean_math_score, new_school$year,
+                         FUN = z_score)
+head(new_school)
+
##     school_cd              school_name               district_name county_name
+## 1 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT      ALBANY
+## 2 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT      ALBANY
+## 3 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT      ALBANY
+## 4 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT      ALBANY
+## 5 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT      ALBANY
+## 6 10100010014 MONTESSORI MAGNET SCHOOL ALBANY CITY SCHOOL DISTRICT      ALBANY
+##             region year total_enroll per_free_lunch per_reduced_lunch per_lep
+## 1 CAPITAL DISTRICT 2008          316           0.33              0.10    0.01
+## 2 CAPITAL DISTRICT 2009          312           0.32              0.08    0.02
+## 3 CAPITAL DISTRICT 2010          300           0.26              0.08    0.03
+## 4 CAPITAL DISTRICT 2011          326           0.22              0.07    0.03
+## 5 CAPITAL DISTRICT 2012          294           0.31              0.06    0.03
+## 6 CAPITAL DISTRICT 2013          321           0.25              0.08    0.03
+##   mean_ela_score mean_math_score      z_ela     z_math
+## 1       658.0000        669.6667 -0.4645310 -0.5325526
+## 2       672.6667        678.6667 -0.1377583 -0.8628189
+## 3       670.3333        683.0000 -1.3577096 -1.4866552
+## 4       666.6667        681.3333  0.6682810  0.1096319
+## 5       670.3333        687.3333 -0.9329051 -1.5416996
+## 6       309.6667        311.0000 -0.1050810 -0.2389825
+
+
+
+

Task 4

+
# Merge the cleaned school data rather than the raw `school` table, which
+# still contains the -99 missing-value code and would distort every mean
+# computed from total_dt.
+total_dt <- merge(new_school,acs,by = c("county_name","year"))
+head(total_dt)
+
##   county_name year   school_cd                           school_name
+## 1      ALBANY 2009 10306060001         CLARKSVILLE ELEMENTARY SCHOOL
+## 2      ALBANY 2009 10306060005           HAMAGRAEL ELEMENTARY SCHOOL
+## 3      ALBANY 2009 10623060006                   LATHAM RIDGE SCHOOL
+## 4      ALBANY 2009 10802060009           PINE BUSH ELEMENTARY SCHOOL
+## 5      ALBANY 2009 10100010043 PHILIP J SCHUYLER ACHIEVEMENT ACADEMY
+## 6      ALBANY 2009 10100860899       ALBANY COMMUNITY CHARTER SCHOOL
+##                         district_name           region total_enroll
+## 1   BETHLEHEM CENTRAL SCHOOL DISTRICT CAPITAL DISTRICT          226
+## 2   BETHLEHEM CENTRAL SCHOOL DISTRICT CAPITAL DISTRICT          388
+## 3                   NORTH COLONIE CSD CAPITAL DISTRICT          424
+## 4 GUILDERLAND CENTRAL SCHOOL DISTRICT CAPITAL DISTRICT          485
+## 5         ALBANY CITY SCHOOL DISTRICT CAPITAL DISTRICT          310
+## 6                                     CAPITAL DISTRICT          232
+##   per_free_lunch per_reduced_lunch per_lep mean_ela_score mean_math_score
+## 1           0.04              0.03    0.04       681.3333        697.6667
+## 2           0.02              0.03    0.00       689.0000        697.6667
+## 3           0.07              0.04    0.03       675.2500        692.7500
+## 4           0.06              0.01    0.00       689.0000        710.6667
+## 5           0.89              0.07    0.09       652.3333        664.0000
+## 6           0.77              0.15    0.02       669.0000        709.0000
+##   county_per_poverty median_household_income county_per_bach group
+## 1          0.1183511                   55350       0.1903682  high
+## 2          0.1183511                   55350       0.1903682  high
+## 3          0.1183511                   55350       0.1903682  high
+## 4          0.1183511                   55350       0.1903682  high
+## 5          0.1183511                   55350       0.1903682  high
+## 6          0.1183511                   55350       0.1903682  high
+
+
+

Task 5

+
    +
  • 1
  • +
+
# Per county: total enrolment, enrolment-weighted count of students with
+# free or reduced-price lunch, and the mean county poverty rate.
+tb1_groups <- group_by(total_dt, county_name)
+tb1 <- summarise(
+  tb1_groups,
+  total_erm = sum(total_enroll),
+  q_for_lunch = sum(total_enroll*per_free_lunch)+sum(total_enroll*per_reduced_lunch),
+  per_poverty = mean(county_per_poverty)
+)
+
## `summarise()` ungrouping output (override with `.groups` argument)
+
# Turn the weighted lunch count into a share of total enrolment.
+tb1 <- transform(
+  as.data.frame(tb1),
+  q_for_lunch = q_for_lunch / total_erm
+)
+head(tb1)
+
##   county_name total_erm q_for_lunch per_poverty
+## 1      ALBANY    207584   0.3905571   0.1229838
+## 2    ALLEGANY     44973   0.5195804   0.1508549
+## 3       BRONX   1313511   0.8594981   0.2872294
+## 4      BROOME    154617   0.4711154   0.1584756
+## 5 CATTARAUGUS     74573   0.5091349   0.1642522
+## 6      CAYUGA     46201   0.4272994   0.1148119
+
    +
  • 2
  • +
+
# Sort counties by poverty rate and keep the five lowest and five highest.
+# head()/tail() do not depend on the table having exactly 62 rows.
+for_names <- tb1 %>% arrange(per_poverty)
+c_names <- c(head(for_names$county_name, 5), tail(for_names$county_name, 5))
+c_names # Counties with the five lowest and the five highest poverty rates.
+
##  [1] "NASSAU"     "PUTNAM"     "SUFFOLK"    "SARATOGA"   "DUTCHESS"  
+##  [6] "CHAUTAUQUA" "TOMPKINS"   "MONTGOMERY" "KINGS"      "BRONX"
+
# Lunch eligibility, poverty and test scores for the selected counties only.
+selected_counties <- filter(total_dt, county_name %in% c_names)
+tb2 <- summarise(
+  group_by(selected_counties, county_name),
+  q_for_lunch = sum(total_enroll*per_free_lunch)+sum(total_enroll*per_reduced_lunch),
+  per_poverty = mean(county_per_poverty),
+  avg_read = mean(mean_ela_score),
+  avg_math = mean(mean_math_score),
+  total_erm = sum(total_enroll)
+)
+
## `summarise()` ungrouping output (override with `.groups` argument)
+
# Convert the weighted lunch count to a share, then drop the helper column.
+tb2 <- as.data.frame(tb2)
+tb2[["q_for_lunch"]] <- tb2[["q_for_lunch"]] / tb2[["total_erm"]]
+tb2 <- tb2[, names(tb2) != "total_erm"]
+head(tb2)
+
##   county_name q_for_lunch per_poverty avg_read avg_math
+## 1       BRONX   0.8594981  0.28722937 435.9952 443.8711
+## 2  CHAUTAUQUA   0.5429325  0.17495020 457.6790 468.2254
+## 3    DUTCHESS   0.3252967  0.08351837 427.4772 434.7679
+## 4       KINGS   0.7957843  0.22478133 445.8000 454.8801
+## 5  MONTGOMERY   0.5392405  0.17723786 480.3711 490.9603
+## 6      NASSAU   0.1608488  0.05556675 444.1582 455.2362
+
+
+

Task 6

+
length(unique(school$school_name))  # number of distinct school names in the raw data
+
## [1] 5042
+
# 2008, one point per school: share of students with free or reduced-price
+# lunch against the combined ELA + math mean score. Both are row-wise sums,
+# so mutate() is used; summarise() is meant for one value per group.
+plt1 <- new_school %>%
+  filter(year == 2008) %>%
+  mutate(access = per_free_lunch+per_reduced_lunch,
+         total_mean_score = mean_ela_score+mean_math_score)
+
+plt1 %>% ggplot() +
+  geom_point(aes(x=access,y = total_mean_score))
+

+
# 2009, one point per school: share of students with free or reduced-price
+# lunch against the combined ELA + math mean score. Both are row-wise sums,
+# so mutate() is used; summarise() is meant for one value per group.
+plt2 <- new_school %>%
+  filter(year == 2009) %>%
+  mutate(access = per_free_lunch+per_reduced_lunch,
+         total_mean_score = mean_ela_score+mean_math_score)
+
+plt2 %>% ggplot() +
+  geom_point(aes(x=access,y = total_mean_score))
+

+
# 2010, one point per school: share of students with free or reduced-price
+# lunch against the combined ELA + math mean score. Both are row-wise sums,
+# so mutate() is used; summarise() is meant for one value per group.
+plt3 <- new_school %>%
+  filter(year == 2010) %>%
+  mutate(access = per_free_lunch+per_reduced_lunch,
+         total_mean_score = mean_ela_score+mean_math_score)
+
+plt3 %>% ggplot() +
+  geom_point(aes(x=access,y = total_mean_score))
+

+
    +
  • By subsetting data from 2008-2010, I plotted the scatter plot of the total percentage of access to free/reduced price lunch versus the total mean scores of each school.

  • +
  • The plots show a negative linear trend: schools where more students have access to free/reduced-price lunch tend to have lower combined scores on the ELA and math exams.

  • +
+
# Mean of the ELA and math averages for each income group and year.
+by_group_year <- group_by(total_dt, group, year)
+plt22 <- summarise(
+  by_group_year,
+  mean_score = (mean(mean_ela_score) + mean(mean_math_score))/2
+)
+
## `summarise()` regrouping output by 'group' (override with `.groups` argument)
+
# One line per income group, tracing its mean score across the years.
+plt22 <- as.data.frame(plt22)
+ggplot(plt22) +
+  geom_line(aes(x = year, y = mean_score, group = group, colour = group))
+

+
    +
  • This plot shows the average scores of each poverty level group across the years.
  • +
  • If we average across all counties within each group label (high/low/medium), the differences between the groups’ mean scores are much less pronounced.
  • +
+
+
+

Task 7

+
# County-level averages: combined score, lunch-eligibility share, income group.
+t7_groups <- group_by(total_dt, county_name)
+t7 <- summarise(
+  t7_groups,
+  mean_score = mean(mean_ela_score+mean_math_score),
+  total_per = mean(per_free_lunch+per_reduced_lunch),
+  group = first(group)
+)
+
## `summarise()` ungrouping output (override with `.groups` argument)
+
# One point per county, coloured by its income group.
+t7 <- as.data.frame(t7)
+
+ggplot(t7) +
+  geom_point(aes(x = mean_score, y = total_per, group = group, colour = group))
+

+
    +
  • In the plot, each point represents a county, and the colors represent their poverty labels.

  • +
  • From the plot we cannot see a clear relationship between each county’s poverty level (measured by the total percentage of students eligible for free/reduced-price lunch) and its students’ test performance.

  • +
  • However, this plot uses county-level data, which means that the test scores and lunch data are averaged across all schools within one county. If we go back to plot 1 in Task 6, we do see that the schools with lower average scores often allow more students to access free/reduced-price lunch.

  • +
  • Also, this trend has been consistent across the years, as illustrated by the three plots in Task 6.

  • +
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/submissions/Junpeng_Jiang_submission.rmd b/submissions/Junpeng_Jiang_submission.rmd new file mode 100644 index 0000000..dc43cab --- /dev/null +++ b/submissions/Junpeng_Jiang_submission.rmd @@ -0,0 +1,228 @@ +--- +title: "Junpeng Jiang 2020 Bootcamp Submission" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +```{r} +library(dplyr) +library(ggplot2) +``` + +# Task 1 +```{r} +school <- read.csv(here::here("data/nys_schools.csv")) +acs <- read.csv(here::here("data/nys_acs.csv")) +``` + +# Task 2 +```{r} +str(school) +str(acs) +summary(school) +summary(acs) +``` + +- Check for data types +- There missing values in school dataset are labeled as -99, all categorical variables (plus school_cd) are complete. + + +# Task 3 +### Dealing with Missing values + +- Deal with -99, first count how many missing values there are. + +```{r} +num_of_na <- rep(NA,6) +for (i in 7:12) { + num_of_na[i-6] <- sum(school[,i] == -99) +} +num_of_na +dim(school) +``` + +- Remember that there are 35663 rows in the school dataset, so taking out 2200 rows should not affect the entire dataset considerably. + +```{r} +new_school <- school +new_school[new_school == -99] <- NA +new_school <- na.omit(new_school) +dim(new_school) +``` + +- This way we take out all rows with missing values, and the remaining dataset new_school has 33437 observations. + +### Create poverty level. + +```{r} +county_avg_median <- acs %>% + group_by(county_name) %>% + summarise(n = mean(median_household_income)) +county_avg_median <- as.data.frame(county_avg_median) +hist(county_avg_median$n) +``` + +- Compute average median income of each county across the years and plot a hisgram of the averages. 
+ +```{r} +summary(county_avg_median$n) +``` + +- By looking at the quantiles, I decide to split the groups as follows: +- < 46731 : low +- between 46731 and 56768: medium +- larger than 56768:high + +```{r} +county_avg_median$group <- rep(NA,dim(county_avg_median)[1]) +for (i in 1:dim(county_avg_median)[1]) { + if(county_avg_median[i,'n'] < 46731){ + county_avg_median[i,'group'] <- "low" + } else if(county_avg_median[i,'n'] >= 46731 && county_avg_median[i,'n'] <= 56768){ + county_avg_median[i,'group'] <- "medium" + } else if(county_avg_median[i,'n'] > 56768){ + county_avg_median[i,'group'] <- "high" + } +} +head(county_avg_median) +``` + +- This dataframe indexes each county with a group of low/medium/high. +- Then append the group variable back to the county dataset. + +```{r} +acs$group <- rep(NA,dim(acs)[1]) + +for (i in 1:dim(acs)[1]) { + rnum <- which(acs$county_name[i] == county_avg_median$county_name) + acs[i,"group"] <- county_avg_median[rnum,"group"] +} +head(acs,15) +``` + +- The group variable is created as stated. + +### Create Z-Score +```{r} +z_dt <- new_school %>% group_by(year) %>% summarise(n = scale(mean_ela_score), m = scale(mean_math_score)) + +new_school$z_ela <- z_dt$n +new_school$z_math <- z_dt$m +head(new_school) +``` + +# Task 4 +```{r} +total_dt <- merge(school,acs,by = c("county_name","year")) +head(total_dt) +``` + +# Task 5 + +- 1 + +```{r} +tb1 <- total_dt %>% + group_by(county_name) %>% + summarise(total_erm = sum(total_enroll), + q_for_lunch = sum(total_enroll*per_free_lunch)+sum(total_enroll*per_reduced_lunch), + per_poverty = mean(county_per_poverty)) + +tb1 <- as.data.frame(tb1) +tb1$q_for_lunch <- tb1$q_for_lunch/tb1$total_erm +head(tb1) +``` + +- 2 +```{r} + +for_names <- tb1 %>% arrange(per_poverty) +c_names <- for_names$county_name[c(1:5,58:62)] +c_names# These are the counties has top5 and bottom5 poverty rate. 
+ +tb2 <- total_dt %>% + filter(county_name %in% c_names) %>% + group_by(county_name) %>% + summarise(q_for_lunch = sum(total_enroll*per_free_lunch)+sum(total_enroll*per_reduced_lunch), + per_poverty = mean(county_per_poverty), + avg_read = mean(mean_ela_score), + avg_math = mean(mean_math_score), + total_erm = sum(total_enroll)) +tb2 <- as.data.frame(tb2) +tb2$q_for_lunch <- tb2$q_for_lunch/tb2$total_erm +tb2$total_erm <- NULL +head(tb2) +``` + +# Task 6 +```{r} +length(unique(school$school_name)) +plt1 <- new_school %>% + filter(year == 2008) %>% + group_by(school_name) %>% + summarise(access = per_free_lunch+per_reduced_lunch, + total_mean_score = mean_ela_score+mean_math_score) +plt1 %>% ggplot() + + geom_point(aes(x=access,y = total_mean_score)) + +plt2 <- new_school %>% + filter(year == 2009) %>% + group_by(school_name) %>% + summarise(access = per_free_lunch+per_reduced_lunch, + total_mean_score = mean_ela_score+mean_math_score) +plt2 %>% ggplot() + + geom_point(aes(x=access,y = total_mean_score)) + +plt3 <- new_school %>% + filter(year == 2010) %>% + group_by(school_name) %>% + summarise(access = per_free_lunch+per_reduced_lunch, + total_mean_score = mean_ela_score+mean_math_score) +plt3 %>% ggplot() + + geom_point(aes(x=access,y = total_mean_score)) +``` + +- By subsetting data from 2008-2010, I plotted the scatter plot of the total percentage of access to free/reduced price lunch versus the total mean scores of each school. + +- Through the plot we can see a negative linear trend: for schools from which students have more access to free/reduced price lunch, these students tend to have lower total scores on both ELA and math exams. 
+ +```{r} +plt22 <- total_dt %>% + group_by(group,year) %>% + summarise(mean_score = (mean(mean_ela_score) + mean(mean_math_score))/2) + +plt22 <- as.data.frame(plt22) +plt22 %>% ggplot + + geom_line(aes(x = year,y = mean_score,group = group,col = group)) +``` + +- This plot shows the average scores of each poverty level group across the years. +- If we take averages across all counties with their group labels (high/low/medium), the differences between each group's mean score are not as considerable. + +# Task 7 + +```{r} +t7 <- total_dt %>% + group_by(county_name) %>% + summarise(mean_score = mean(mean_ela_score+mean_math_score), + total_per = mean(per_free_lunch+per_reduced_lunch), + group = group[1]) +t7 <- as.data.frame(t7) + +t7 %>% ggplot() + + geom_point(aes(x = mean_score,y = total_per,group = group,col = group)) + +``` + +- In the plot, each point represents a county, and the colors represent their poverty labels. + +- From the plot we cannot see clear relationship between each counties poverty level(measured by the total percentage of students allowed for free/reduced price lunch) and their students' test performances. + +- However, this plot is using data that is county-level, which means that the test scores and lunch data are averaged across all schools within one county. If we go back to the plot 1 in Task 6, we do see that the schools with lowers average scores often allow more students to access free/less expensive lunch. + +- Also, this trend has been consistend across the years, as illustrated by the three plots in Task 6. 
+ + From 1a05f56249c0c583b4ab22204cda30642eeda572 Mon Sep 17 00:00:00 2001 From: JunpengJiang <49254991+JunpengJiang@users.noreply.github.com> Date: Fri, 18 Sep 2020 05:54:37 +0800 Subject: [PATCH 3/3] Delete day4_Rmd-datamanip1_exercises_Junpeng_Jiang.Rmd --- ...Rmd-datamanip1_exercises_Junpeng_Jiang.Rmd | 148 ------------------ 1 file changed, 148 deletions(-) delete mode 100644 submissions/day4_Rmd-datamanip1_exercises_Junpeng_Jiang.Rmd diff --git a/submissions/day4_Rmd-datamanip1_exercises_Junpeng_Jiang.Rmd b/submissions/day4_Rmd-datamanip1_exercises_Junpeng_Jiang.Rmd deleted file mode 100644 index 1a6b54d..0000000 --- a/submissions/day4_Rmd-datamanip1_exercises_Junpeng_Jiang.Rmd +++ /dev/null @@ -1,148 +0,0 @@ ---- -title: "Exercises Day 2" -author: "Richard Paquin Morel, adapted from exercises by Christina Maimone" -date: "`r Sys.Date()`" -output: html_document -params: - answers: FALSE ---- - - -```{r, echo=FALSE, eval=TRUE} -answers<-params$answers -``` - -```{r global_options, echo = FALSE, include = FALSE} -knitr::opts_chunk$set(echo=answers, eval=answers, - warning = FALSE, message = FALSE, - cache = FALSE, tidy = FALSE) -``` - -## Load the data - -Load the `gapminder` dataset. - -```{r} -### Answer -dt <- read.csv(here::here("data/gapminder5.csv")) -``` - -```{r} -gapminder <- read.csv(here::here("data/gapminder5.csv"), stringsAsFactors=FALSE) -``` - - -## If Statement - -Use an if() statement to print a suitable message reporting whether there are any records from 2002 in the gapminder dataset. Now do the same for 2012. - -Hint: use the `any` function. 
- -```{asis} -### Answer -``` - -```{r} -year<-2002 -if(any(gapminder$year == year)){ - print(paste("Record(s) for the year",year,"found.")) -} else { - print(paste("No records for year",year)) -} -``` - - -## Loop and If Statements - -Write a script that finds the mean life expectancy by country for countries whose population is below the mean for the dataset - -Write a script that loops through the `gapminder` data by continent and prints out whether the mean life expectancy is smaller than 50, between 50 and 70, or greater than 70. - -```{asis} -### Answer -``` - -```{r} -overall_mean <- mean(gapminder$pop) - -for (i in unique(gapminder$country)) { - country_mean <- mean(gapminder$pop[gapminder$country==i]) - - if (country_mean < overall_mean) { - mean_le <- mean(gapminder$lifeExp[gapminder$country==i]) - print(paste("Mean Life Expectancy in", i, "is", mean_le)) - } -} # end for loop -``` - -```{r} -lower_threshold <- 50 -upper_threshold <- 70 - -for (i in unique(gapminder$continent)){ - tmp <- mean(gapminder$lifeExp[gapminder$continent==i]) - - if (tmp < lower_threshold){ - print(paste("Average Life Expectancy in", i, "is less than", lower_threshold)) - } - else if (tmp > lower_threshold & tmp < upper_threshold){ - print(paste("Average Life Expectancy in", i, "is between", lower_threshold, "and", upper_threshold)) - } - else { - print(paste("Average Life Expectancy in", i, "is greater than", upper_threshold)) - } - -} -``` - - -## Exercise: Write Functions - -Create a function that given a data frame will print the name of each column and the class of data it contains. Use the gapminder dataset. Hint: Use `mode()` or `class()` to get the class of the data in each column. Remember that `names()` or `colnames()` returns the name of the columns in a dataset. 
- -```{asis} -### Answer - -Note: Some of these were taken or modified from https://www.r-bloggers.com/functions-exercises/ -``` - -```{r} -data_frame_info <- function(df) { - cols <- names(df) - for (i in cols) { - print(paste0(i, ": ", mode(df[, i]))) - } -} -data_frame_info(gapminder) -``` - -Create a function that given a vector will print the mean and the standard deviation of a **vector**, it will optionally also print the median. Hint: include an argument that takes a boolean (`TRUE`/`FALSE`) operator and then include an `if` statement. - -```{asis} -### Answer - -``` - -```{r} -vector_info <- function(x, include_median=FALSE) { - print(paste("Mean:", mean(x))) - print(paste("Standard Deviation:", sd(x))) - if (include_median) { - print(paste("Median:", median(x))) - } -} - -le <- gapminder$lifeExp -vector_info(le, include_median = F) -vector_info(le, include_median = T) -``` - -## Analyzing the relationship - -Use what you've learned so far to answer the following questions using the `gapminder` dataset. Be sure to include some visualizations! - -1. What is the relationship between GDP per capita and life expectancy? Does this relationship change over time? (Hint: Use the natural log of both variables.) - -2. Does the relationship between GDP per capita and life expectacy vary by continent? Make sure you divide the Americas into North and South America. - -- This file submitted on 9/3 is a practice for making a pull request. The excercise are not done yet. \ No newline at end of file