diff --git a/exercises/day1_syntax-fileIO_exercises.R b/exercises/day1_syntax-fileIO_exercises.R index 8ffe0cc..d29ab0d 100644 --- a/exercises/day1_syntax-fileIO_exercises.R +++ b/exercises/day1_syntax-fileIO_exercises.R @@ -6,46 +6,46 @@ #### Arithmetic #### # Pick a number; save it as x - +x <- 5 # Multiply x by 3 - +x * 3 # Take the log of the above (Hint, you need the function log() here) - +log(x * 3) # Subtract 4 from the above - +log(x * 3) - 4 # Square the above - +(log(x * 3) - 4)^2 #### Comparisons and Logical Operators #### # Check if 1 is bigger than 2 - +1 > 2 # Check if 1 + 1 is equal to 2 - +1 + 1 == 2 # Check if it is true that the strings "eat" and "drink" are not equal to each other - +"eat" != "drink" # Check if it is true that 1 is equal to 1 *AND* 1 is equal to 2 # (Hint: remember what the operators & and | do) - +(1==1) & (1==2) # Check if it is true that 1 is equal to 1 *OR* 1 is equal to 2 - +(1==1)|(1==2) #### Packages and Functions #### # Load the package tidyverse - +library(tidyverse) # Open the help file for the function recode # (Hint: remember what ? 
does) - +?recode #### REVIEW: DATA STRUCTURES #### @@ -57,19 +57,20 @@ x1 <- rnorm(5) x2 <- rnorm(20, mean=0.5) # Select the 3rd element in x1 - +x1[3] # Select the elements of x1 that are less than 0 - +x1[x1 < 0] # Select the elements of x2 that are greater than 1 - +x2[x2 > 1] # Create x3 containing the first five elements of x2 - +x3 <- x2[1:5] +x3 # Select all but the third element of x1 - +x1[-3] #### Missing values #### @@ -77,10 +78,10 @@ x2 <- rnorm(20, mean=0.5) vec <- c(1, 8, NA, 7, 3) # Calculate the mean of vec, excluding the NA value - +mean(vec, na.rm = TRUE) # Count the number of missing values in vec - +sum(is.na(vec)) #### Factors #### @@ -98,13 +99,16 @@ vec <- c(1, 8, NA, 7, 3) mat <- matrix(c(1:51, rep(NA,4)), ncol=5) # Select row 4, column 5 - +mat[4,5] # Select column 3 +mat[,3] +# Select column 1 and 5 +mat[,c(1,5)] # Bonus: How many NA values are there in this matrix? - +sum(is.na(mat)) #### Data frames #### @@ -112,20 +116,25 @@ mat <- matrix(c(1:51, rep(NA,4)), ncol=5) data(mtcars) # Identify the number of observations (rows) and number of variables (columns) - +nrow(mtcars) +ncol(mtcars) +dim(mtcars) # Identify the names of the variables - +names(mtcars) # Select the variable 'mpg' - +mtcars$mpg +mtcars["mpg"] # Select the 4th row - +mtcars[4,] # Square the value of the 'cyl' variable and store this as a new variable 'cylsq' - - +mtcars$cyl +cylsq <- (mtcars$cyl)^2 +cylsq + #### READING FILES #### # Check your working directory. It should be the root folder where you downloaded the boot camp materials. If that's not the case, set your working directory accordingly. 
@@ -135,6 +144,6 @@ data(mtcars) gapminder <- read.csv("data/gapminder5.csv", stringsAsFactors=FALSE) # Load the readr package - +library(readr) # Read gapminder data with read_csv() diff --git a/exercises/day2_dataintro_exercises.R b/exercises/day2_dataintro_exercises.R index 33634f9..29203d7 100644 --- a/exercises/day2_dataintro_exercises.R +++ b/exercises/day2_dataintro_exercises.R @@ -2,37 +2,45 @@ ### by Kumar Ramanathan, based on materials from Christina Maimone ### #### DATA MANIPULATION #### +library(readr) +gapminder <- read_csv("data/gapminder5.csv") #### Exploring data frames #### # Run summary() on the gapminder data - +summary(gapminder) # Find the mean of the variable pop - +mean(gapminder$pop) # Create a frequency table of the variable 'year' # Hint: use table() - +table(gapminder$year) # Create a proportion table of the variable 'continent' # Hint: use prop.table() - +prop.table(table(gapminder$continent)) #### Subsetting and Sorting #### -# Create a new data frame called gapminder07 contaning only those rows in the gapminder data where year is 2007 +# Create a new data frame called gapminder07 containing only those rows in the gapminder data where year is 2007 +gapminder07 <- gapminder[gapminder$year==2007,] +gapminder07 <- subset(gapminder, subset=year==2007) +# More conditions in subset +gapminder07mex <- gapminder[gapminder$year == 2007 & gapminder$country=="Mexico",] +gapminder07mex <- subset(gapminder, year==2007 & country =="Mexico") # Created a sorted frequency table of the variable continent in gapminder07 - +sort(table(gapminder07$continent)) # Print out the population of Mexico in 2007 +gapminder07$pop[gapminder07$country=="Mexico"] +gapminder$pop[gapminder$country=="Mexico"& gapminder$year == 2007] - -# BONUS: Print out the rows represnting the 5 countries with the highest population in 2007 +# BONUS: Print out the rows representing the 5 countries with the highest population in 2007 # Hint: Use order(), which we learned about, and 
head(), which prints out the first 5 rows of a data frame - +head(gapminder07[order(gapminder07$pop, decreasing = TRUE),]) #### Adding and removing columns #### @@ -42,10 +50,10 @@ #### Recoding variables #### # Round the values of the variable `lifeExp` using `round()` and store this as a new variable `lifeExp_round` - +lifeExp_round <- round(gapminder$lifeExp) # Print out the new variable to see what it looks like - +lifeExp_round # This code creates the new variable 'lifeExp_over70'. Try to understand what it does. gapminder07$lifeExp_over70 <- NA # Initialize a variable containing all "NA" values @@ -56,36 +64,100 @@ table(gapminder07$lifeExp_over70) # Try to create a new variable 'lifeExp_highlow' that has the value # "High" when life expectancy is over the mean and the value "Low" # when it is below the mean. When you are done, print a frequency table. +gapminder07$lifeExp_highlow <- NA +gapminder07$lifeExp_highlow[gapminder07$lifeExp>mean(gapminder07$lifeExp)] <- "Low" +############ Day 2 Exercises ############# +### by Kumar Ramanathan, based on materials from Christina Maimone ### + +#### DATA MANIPULATION #### + +#### Exploring data frames #### + +# Run summary() on the gapminder data +summary(gapminder) + +# Find the mean of the variable pop +mean(gapminder$pop) + +# Create a frequency table of the variable 'year' +# Hint: use table() +table(gapminder$year) + +# Create a proportion table of the variable 'continent' +# Hint: use prop.table() +prop.table(table(gapminder$continent)) + +#### Subsetting and Sorting #### + +# Create a new data frame called gapminder07 containing only those rows in the gapminder data where year is 2007 +gapminder07 <- gapminder[gapminder$year==2007,] +gapminder07 <- subset(gapminder, subset=year==2007) + +# Created a sorted frequency table of the variable continent in gapminder07 +sort(table(gapminder07$continent)) + +# Print out the population of Mexico in 2007 +gapminder07$pop[gapminder07$country=="Mexico"] 
+gapminder$pop[gapminder$country=="Mexico"|year == 2007] + +# BONUS: Print out the rows representing the 5 countries with the highest population in 2007 +# Hint: Use order(), which we learned about, and head(), which prints out the first 5 rows of a data frame +head(gapminder07[order(gapminder07$pop, decreasing = TRUE),]) +#### Adding and removing columns #### +# See lecture notes for more guidance. We will practice this skill in the next section. +#### Recoding variables #### + +# Round the values of the variable `lifeExp` using `round()` and store this as a new variable `lifeExp_round` +lifeExp_round <- round(gapminder$lifeExp) + +# Print out the new variable to see what it looks like +lifeExp_round + +# This code creates the new variable 'lifeExp_over70'. Try to understand what it does. +gapminder07$lifeExp_over70 <- NA # Initialize a variable containing all "NA" values +gapminder07$lifeExp_over70[gapminder07$lifeExp>70] <- "Yes" +gapminder07$lifeExp_over70[gapminder07$lifeExp<70] <- "No" +table(gapminder07$lifeExp_over70) + +# Try to create a new variable 'lifeExp_highlow' that has the value +# "High" when life expectancy is over the mean and the value "Low" +# when it is below the mean. When you are done, print a frequency table. +gapminder07$lifeExp_highlow <- NA +gapminder07$lifeExp_highlow[gapminder07$lifeExp>mean(gapminder07$lifeExp)] <- "High" +gapminder07$lifeExp_highlow[gapminder07$lifeExpmedian(gapminder$gdpPercap, na.rm = T)){ + print(paste0(i, mean_gdpPercap)) + } else { + print(paste0(i, "Sorry, this country does not meet the median threshold")) + } +} ##################### ##Functions #Create a function called report_min_max to calculate the minimum and maximum log gdp (i.e. not per capita) for Europe. 
+report_min_max <- + function(df, variable, continent){ + #subset + var <- df[[variable]][df$continent == continent] + min_log_gdp <- min(log(var)) + max_log_gdp <- max(log(var)) + cat("Continent:" , continent, + "\nMin log gdp:", min_log_gdp, + "\nMax log gdp:", max_log_gdp) + } + +report_min_max2 <- + function(df, variable, continent){ + var <- subset(df, df$continent == continent, select = c(variable)) + min_log_gdp <- min(log(var)) + max_log_gdp <- max(log(var)) + cat("Continent:" , continent, + "\nMin log gdp:", min_log_gdp, + "\nMax log gdp:", max_log_gdp) + } + + +report_min_max(gapminder, "gdp", "Europe") ##################### #Putting things all together. @@ -73,6 +159,7 @@ viz_lm <- function(df, dv, iv, year) { + #same as gapminder[gapminder$year == 1977,] dat <- df[df[["year"]] == year, ] y <- log(dat[[dv]]) x <- log(dat[[iv]]) @@ -82,4 +169,5 @@ viz_lm <- lines(x, predict(fit), col = 'blue') } +viz_lm(gapminder, "lifeExp", "gdpPercap", 1977) diff --git a/submissions/Day4Excercise_YuXuan.Rmd b/submissions/Day4Excercise_YuXuan.Rmd new file mode 100644 index 0000000..5db9d96 --- /dev/null +++ b/submissions/Day4Excercise_YuXuan.Rmd @@ -0,0 +1,151 @@ +--- +title: "Exercises Day 2" +author: "Richard Paquin Morel, adapted from exercises by Christina Maimone" +date: "`r Sys.Date()`" +output: html_document +params: + answers: FALSE +--- + + +```{r, echo=FALSE, eval=TRUE} +answers<-params$answers +``` + +```{r global_options, echo = FALSE, include = FALSE} +# name of the chuck: global_option +knitr::opts_chunk$set(echo=answers, eval=answers, + warning = FALSE, message = FALSE, + cache = FALSE, tidy = FALSE) +``` + +## Load the data + +Load the `gapminder` dataset. 
+ +```{asis} +### Answer +``` + +```{r} +library("here") +here() + +gapminder <- read.csv(here::here("MSiA/Academic/Bootcamp/bootcamp-2020/data","gapminder5.csv"), stringsAsFactors=FALSE) + +#gapminder <- read.csv(here::here("data/gapminder5.csv"), stringsAsFactors=FALSE) +``` + + +## If Statement + +Use an if() statement to print a suitable message reporting whether there are any records from 2002 in the gapminder dataset. Now do the same for 2012. + +Hint: use the `any` function. + +```{asis} +### Answer +``` + +```{r} +year<-2002 +if(any(gapminder$year == year)){ + print(paste("Record(s) for the year",year,"found.")) +} else { + print(paste("No records for year",year)) +} +``` + + +## Loop and If Statements + +Write a script that finds the mean life expectancy by country for countries whose population is below the mean for the dataset + +Write a script that loops through the `gapminder` data by continent and prints out whether the mean life expectancy is smaller than 50, between 50 and 70, or greater than 70. 
+ +```{asis} +### Answer +``` + +```{r} +overall_mean <- mean(gapminder$pop) + +for (i in unique(gapminder$country)) { + country_mean <- mean(gapminder$pop[gapminder$country==i]) + + if (country_mean < overall_mean) { + mean_le <- mean(gapminder$lifeExp[gapminder$country==i]) + print(paste("Mean Life Expectancy in", i, "is", mean_le)) + } +} # end for loop +``` + +```{r} +lower_threshold <- 50 +upper_threshold <- 70 + +for (i in unique(gapminder$continent)){ + tmp <- mean(gapminder$lifeExp[gapminder$continent==i]) + + if (tmp < lower_threshold){ + print(paste("Average Life Expectancy in", i, "is less than", lower_threshold)) + } + else if (tmp > lower_threshold & tmp < upper_threshold){ + print(paste("Average Life Expectancy in", i, "is between", lower_threshold, "and", upper_threshold)) + } + else { + print(paste("Average Life Expectancy in", i, "is greater than", upper_threshold)) + } + +} +``` + + +## Exercise: Write Functions + +Create a function that given a data frame will print the name of each column and the class of data it contains. Use the gapminder dataset. Hint: Use `mode()` or `class()` to get the class of the data in each column. Remember that `names()` or `colnames()` returns the name of the columns in a dataset. + +```{asis} +### Answer + +Note: Some of these were taken or modified from https://www.r-bloggers.com/functions-exercises/ +``` + +```{r} +data_frame_info <- function(df) { + cols <- names(df) + for (i in cols) { + print(paste0(i, ": ", mode(df[, i]))) + } +} +data_frame_info(gapminder) +``` + +Create a function that given a vector will print the mean and the standard deviation of a **vector**, it will optionally also print the median. Hint: include an argument that takes a boolean (`TRUE`/`FALSE`) operator and then include an `if` statement. 
+ +```{asis} +### Answer + +``` + +```{r} +vector_info <- function(x, include_median=FALSE) { + print(paste("Mean:", mean(x))) + print(paste("Standard Deviation:", sd(x))) + if (include_median) { + print(paste("Median:", median(x))) + } +} + +le <- gapminder$lifeExp +vector_info(le, include_median = F) +vector_info(le, include_median = T) +``` + +## Analyzing the relationship + +Use what you've learned so far to answer the following questions using the `gapminder` dataset. Be sure to include some visualizations! + +1. What is the relationship between GDP per capita and life expectancy? Does this relationship change over time? (Hint: Use the natural log of both variables.) + +2. Does the relationship between GDP per capita and life expectacy vary by continent? Make sure you divide the Americas into North and South America. \ No newline at end of file diff --git a/submissions/Day4Excercise_YuXuan.html b/submissions/Day4Excercise_YuXuan.html new file mode 100644 index 0000000..f0c177a --- /dev/null +++ b/submissions/Day4Excercise_YuXuan.html @@ -0,0 +1,447 @@ + + + + + + + + + + + + + + + +Exercises Day 2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Load the data

+

Load the gapminder dataset.

+
+
+

If Statement

+

Use an if() statement to print a suitable message reporting whether there are any records from 2002 in the gapminder dataset. Now do the same for 2012.

+

Hint: use the any function.

+
+
+

Loop and If Statements

+

Write a script that finds the mean life expectancy by country for countries whose population is below the mean for the dataset.

+

Write a script that loops through the gapminder data by continent and prints out whether the mean life expectancy is smaller than 50, between 50 and 70, or greater than 70.

+
+
+

Exercise: Write Functions

+

Create a function that, given a data frame, prints the name of each column and the class of data it contains. Use the gapminder dataset. Hint: Use mode() or class() to get the class of the data in each column. Remember that names() or colnames() return the names of the columns in a dataset.

+

Create a function that, given a vector, prints its mean and standard deviation; optionally, it also prints the median. Hint: include an argument that takes a boolean (TRUE/FALSE) value, then use an if statement to decide whether to print the median.

+
+
+

Analyzing the relationship

+

Use what you’ve learned so far to answer the following questions using the gapminder dataset. Be sure to include some visualizations!

+
    +
  1. What is the relationship between GDP per capita and life expectancy? Does this relationship change over time? (Hint: Use the natural log of both variables.)

  2. +
  3. Does the relationship between GDP per capita and life expectancy vary by continent? Make sure you divide the Americas into North and South America.

  4. +
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/submissions/Day4_yuxuan.Rmd b/submissions/Day4_yuxuan.Rmd new file mode 100644 index 0000000..5db9d96 --- /dev/null +++ b/submissions/Day4_yuxuan.Rmd @@ -0,0 +1,151 @@ +--- +title: "Exercises Day 2" +author: "Richard Paquin Morel, adapted from exercises by Christina Maimone" +date: "`r Sys.Date()`" +output: html_document +params: + answers: FALSE +--- + + +```{r, echo=FALSE, eval=TRUE} +answers<-params$answers +``` + +```{r global_options, echo = FALSE, include = FALSE} +# name of the chuck: global_option +knitr::opts_chunk$set(echo=answers, eval=answers, + warning = FALSE, message = FALSE, + cache = FALSE, tidy = FALSE) +``` + +## Load the data + +Load the `gapminder` dataset. + +```{asis} +### Answer +``` + +```{r} +library("here") +here() + +gapminder <- read.csv(here::here("MSiA/Academic/Bootcamp/bootcamp-2020/data","gapminder5.csv"), stringsAsFactors=FALSE) + +#gapminder <- read.csv(here::here("data/gapminder5.csv"), stringsAsFactors=FALSE) +``` + + +## If Statement + +Use an if() statement to print a suitable message reporting whether there are any records from 2002 in the gapminder dataset. Now do the same for 2012. + +Hint: use the `any` function. + +```{asis} +### Answer +``` + +```{r} +year<-2002 +if(any(gapminder$year == year)){ + print(paste("Record(s) for the year",year,"found.")) +} else { + print(paste("No records for year",year)) +} +``` + + +## Loop and If Statements + +Write a script that finds the mean life expectancy by country for countries whose population is below the mean for the dataset + +Write a script that loops through the `gapminder` data by continent and prints out whether the mean life expectancy is smaller than 50, between 50 and 70, or greater than 70. 
+ +```{asis} +### Answer +``` + +```{r} +overall_mean <- mean(gapminder$pop) + +for (i in unique(gapminder$country)) { + country_mean <- mean(gapminder$pop[gapminder$country==i]) + + if (country_mean < overall_mean) { + mean_le <- mean(gapminder$lifeExp[gapminder$country==i]) + print(paste("Mean Life Expectancy in", i, "is", mean_le)) + } +} # end for loop +``` + +```{r} +lower_threshold <- 50 +upper_threshold <- 70 + +for (i in unique(gapminder$continent)){ + tmp <- mean(gapminder$lifeExp[gapminder$continent==i]) + + if (tmp < lower_threshold){ + print(paste("Average Life Expectancy in", i, "is less than", lower_threshold)) + } + else if (tmp > lower_threshold & tmp < upper_threshold){ + print(paste("Average Life Expectancy in", i, "is between", lower_threshold, "and", upper_threshold)) + } + else { + print(paste("Average Life Expectancy in", i, "is greater than", upper_threshold)) + } + +} +``` + + +## Exercise: Write Functions + +Create a function that given a data frame will print the name of each column and the class of data it contains. Use the gapminder dataset. Hint: Use `mode()` or `class()` to get the class of the data in each column. Remember that `names()` or `colnames()` returns the name of the columns in a dataset. + +```{asis} +### Answer + +Note: Some of these were taken or modified from https://www.r-bloggers.com/functions-exercises/ +``` + +```{r} +data_frame_info <- function(df) { + cols <- names(df) + for (i in cols) { + print(paste0(i, ": ", mode(df[, i]))) + } +} +data_frame_info(gapminder) +``` + +Create a function that given a vector will print the mean and the standard deviation of a **vector**, it will optionally also print the median. Hint: include an argument that takes a boolean (`TRUE`/`FALSE`) operator and then include an `if` statement. 
+ +```{asis} +### Answer + +``` + +```{r} +vector_info <- function(x, include_median=FALSE) { + print(paste("Mean:", mean(x))) + print(paste("Standard Deviation:", sd(x))) + if (include_median) { + print(paste("Median:", median(x))) + } +} + +le <- gapminder$lifeExp +vector_info(le, include_median = F) +vector_info(le, include_median = T) +``` + +## Analyzing the relationship + +Use what you've learned so far to answer the following questions using the `gapminder` dataset. Be sure to include some visualizations! + +1. What is the relationship between GDP per capita and life expectancy? Does this relationship change over time? (Hint: Use the natural log of both variables.) + +2. Does the relationship between GDP per capita and life expectacy vary by continent? Make sure you divide the Americas into North and South America. \ No newline at end of file diff --git a/submissions/Day8_Final_XuanYu.Rmd b/submissions/Day8_Final_XuanYu.Rmd new file mode 100644 index 0000000..b49b96b --- /dev/null +++ b/submissions/Day8_Final_XuanYu.Rmd @@ -0,0 +1,235 @@ +--- +title: "Day 8 final excercise" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + + +### Task 1: Import your data +```{r, message=FALSE} +library(here) +library(tidyverse) +here() + + +schools <- read.csv(here::here("data/nys_schools.csv"), + stringsAsFactors = F) +survey <- read.csv(here::here("data/nys_acs.csv"), + stringsAsFactors = F) +``` + + +### Task 2: Explore your data +```{r} +str(schools) +str(survey) +``` + +#### Check missing values (NA) in both data frames +```{r} +sum(is.na(schools$region)) +sum(is.na(survey)) +``` + +There is no missing value as NA. + +#### Summary statistics +```{r} +summary(schools) # we have -99 for many variables +summary(survey) +``` + +#### School data +```{r} +# number of schools in each region +table(schools$region) # "-99" has 19 values indicating missing info. 
+ +#regional lunch +schools %>% + filter(region!="-99")%>% + group_by(region) %>% + summarise(ave_free_lunch = mean(per_free_lunch), ave_reduced_lunch = mean(per_reduced_lunch)) +``` + +#### Survey data +```{r} +# Poverty rate in each year +survey %>% + group_by(year) %>% + summarise(max_poverty=max(county_per_poverty), min_poverty=min(county_per_poverty), ave_poverty=mean(county_per_poverty)) +``` + + +### Task 3: Recoding and variable manipulation +#### 1. Deal with missing values +```{r} +# Where are missing values? +library(tidyverse) + +tmp1 <- filter(schools, per_free_lunch==-99 | per_reduced_lunch ==-99 | per_lep==-99| mean_ela_score==-99| mean_math_score==-99) +tmp2 <- filter(schools, region=="-99"|per_free_lunch==-99 | per_reduced_lunch ==-99 | per_lep==-99) +nrow(tmp1) +nrow(tmp2) +rm(tmp1,tmp2) +``` + + +It seems that most missing value comes from scores. I decide to only remove other columns with missing values and change -99 in scores to NA (try to keep more data points). + + +```{r} +# Change missing score data to NA +schools$mean_ela_score[schools$mean_ela_score == -99] <- NA +schools$mean_math_score[schools$mean_math_score == -99] <- NA + +# Drop rows in other columns containing missing value +schools <- schools[schools$region !="-99" & schools$per_free_lunch!=-99 & schools$per_reduced_lunch !=-99 & schools$per_lep !=-99,] + +# Check summary statistics again +summary(schools) +``` + +#### 2. Create a categorical variable that groups counties into "high", "medium", and "low" poverty groups. +```{r} +hist(survey$county_per_poverty) +``` + +```{r} +summary(survey$county_per_poverty) +sd(survey$county_per_poverty) +``` + + +The histogram is bell-shape and right-skewed. Median poverty has the largest number of people, which is also true in reality. I decided to take the 1 sd. away from the median as lower and upper thresholds. 
+ + +```{r} +lower_threshold <- 0.12884-0.031741 +upper_threshold <- 0.12884+0.031741 + +for (i in 1:nrow(survey)){ + if (survey$county_per_poverty[i]<=lower_threshold){ + survey[i, "poverty_group"] <- "Low" + } + else if (survey$county_per_poverty[i]>=upper_threshold){ + survey[i, "poverty_group"] <- "High" + } + else{ + survey[i, "poverty_group"] <- "Medium" + } +} + +table(survey$poverty_group) # follow the bell-shape + +boxplot(county_per_poverty~poverty_group,survey) +``` + +#### 3. Standardize score for math and english +```{r} +schools_new <- schools %>% + group_by(year) %>% + mutate(mean_ela_score_std = scale(mean_ela_score), + mean_math_score_std = scale(mean_math_score)) +``` + + +### Task 4: Merge datasets +```{r} +school_county <- merge(schools_new, survey, by = c("county_name","year"), all = F) +dim(school_county) +``` + + +### Task 5: Create summary tables +#### 1. For each county: total enrollment, percent of students qualifying for free or reduced price lunch, and percent of population in poverty. +```{r} +school_county %>% + mutate(free_reduced = per_free_lunch + per_reduced_lunch) %>% + group_by(county_name) %>% + summarize(mean_lunch = mean(free_reduced), poverty=mean(county_per_poverty)) + +``` + +#### 2. For the counties with the top 5 and bottom 5 poverty rate: percent of population in poverty, percent of students qualifying for free or reduced price lunch, mean reading scores and mean math score. +```{r} +school_county %>% + mutate(free_reduced = per_free_lunch + per_reduced_lunch) %>% # addition to get total + group_by(county_name) %>% + summarize(mean_lunch = mean(free_reduced), poverty=mean(county_per_poverty), + ela_score=mean(mean_ela_score, na.rm = T), + math_score=mean(mean_math_score, na.rm=T)) %>% + arrange(desc(poverty)) %>% + filter(row_number()<=5|row_number()>=max(row_number())-5) + +``` + +### Task 6: Data visualization +#### 1. 
The relationship between access to free/reduced price lunch and test performance, at the *school* level. +```{r} +school_county %>% + mutate(free_reduced = per_free_lunch + per_reduced_lunch) %>% + ggplot() + + geom_point(aes(x=free_reduced, y=mean_ela_score_std),na.rm = T, size=0.5) + + xlim(0,1)+ + ylim(-5,5)+ + labs(title="Free/reduced price lunch and ela test performance", + x="Free/reduced lunch", y="Standardized ela score") + +school_county %>% + mutate(free_reduced = per_free_lunch + per_reduced_lunch) %>% + ggplot() + + geom_point(aes(x=free_reduced, y=mean_math_score_std),na.rm = T, size=0.5) + + xlim(0,1)+ + ylim(-5,5)+ + labs(title="Free/reduced price lunch and math test performance", + x="Free/reduced lunch", y="Standardized math score") +``` + + +In both graph, we can clearly notice there is a negative relationship between test performance and percentage of free/reduced price lunch. I think larger percentage of free/reduced lunch indicates the school might be located at a region with higher poverty rate thus relatively poor academic performance. + + +#### 2. Average test performance across *counties* with high, low, and medium poverty. +```{r} +positions <- c("High", "Medium", "Low") + +school_county %>% + group_by(poverty_group)%>% + summarize(mean_math_score = mean(mean_math_score_std, na.rm=T), mean_ela_score = mean(mean_ela_score_std, na.rm=T))%>% + gather(key = subject, value = scaled_score, -poverty_group) %>% + ggplot() + + geom_col(aes(x = poverty_group, y = scaled_score, group = subject, fill = subject), position = "dodge") + + scale_x_discrete(limits = positions) + + labs(title ="Standardized test score of different poverty group", x = "Poverty group", y = "Standardized score") +``` + + +### Task 7: Answering questions +#### What can the data tell us about the relationship between poverty and test performance in New York public schools? Has this relationship changed over time? 
Is this relationship at all moderated by access to free/reduced price lunch? +```{r} +# Relationship between poverty and ela score +year <- unique(school_county$year) + +for (i in year){ + result <- lm(mean_ela_score_std ~ county_per_poverty,school_county[school_county$year == i,]) + print(paste0(i,": ")) + print(result$coefficients) +} +``` + +```{r} +# Relationship between poverty and math score +year <- unique(school_county$year) + +for (i in year){ + result <- lm(mean_math_score_std ~ county_per_poverty,school_county[school_county$year == i,]) + print(paste0(i,": ")) + print(result$coefficients) +} +``` + +From the linear regression, we can see there is a negative correlation between poverty and score. Also, the effect of poverty on both score generally decreases over time. +(I'm not sure how to answer the last question..)