From 56f7dd0fe3fc5478b3c2b4eef71e8e56210ca63d Mon Sep 17 00:00:00 2001
From: zixiaohuang9874 <zixiaohuang2021@u.northwestern.edu>
Date: Thu, 3 Sep 2020 23:36:01 +0800
Subject: [PATCH 1/9] Update on submission folder

---
 submissions/Day4Exercise_HuangZixiao.Rmd | 156 +++++++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 submissions/Day4Exercise_HuangZixiao.Rmd

diff --git a/submissions/Day4Exercise_HuangZixiao.Rmd b/submissions/Day4Exercise_HuangZixiao.Rmd
new file mode 100644
index 0000000..00757ae
--- /dev/null
+++ b/submissions/Day4Exercise_HuangZixiao.Rmd
@@ -0,0 +1,156 @@
+---
+title: "Exercises Day 2"
+author: "Richard Paquin Morel, adapted from exercises by Christina Maimone"
+date: "`r Sys.Date()`"
+output:
+  pdf_document: default
+  html_document: default
+params:
+  answers: yes
+---
+
+
+```{r, echo=FALSE, eval=TRUE}
+answers<-params$answers
+```
+
+```{r global_options, echo = FALSE, include = FALSE}
+knitr::opts_chunk$set(echo=answers, eval=answers,
+                      warning = FALSE, message = FALSE,
+                      cache = FALSE, tidy = FALSE)
+```
+
+## Load the data
+
+Load the `gapminder` dataset.
+
+```{asis}
+### Answer
+```
+
+```{r}
+gapminder <- read.csv(here::here("data", "gapminder5.csv"), stringsAsFactors=FALSE)  # path relative to the project root, not one user's Desktop
+```
+
+## Class Example
+generation <- read_csv("Desktop/Northwestern/Bootcamp/bootcamp-2020/data/ca_energy_generation.csv")
+imports <- read_csv("Desktop/Northwestern/Bootcamp/bootcamp-2020/data/ca_energy_imports.csv")
+merged_energy <- merge(generation, imports, by = "datetime")
+dim(merged_energy)
+head(merged_energy)
+long_merged_energy <- gather(merged_energy, key = source, value = usage, -datetime)
+head(long_merged_energy)
+dim(long_merged_energy)
+
+## If Statement
+
+Use an if() statement to print a suitable message reporting whether there are any records from 2002 in the gapminder dataset. Now do the same for 2012.
+
+Hint: use the `any` function.
+
+```{asis}
+### Answer
+```
+
+```{r}
+# The exercise asks about both 2002 and 2012, so check each year in turn.
+for (year in c(2002, 2012)) {
+  has_records <- any(gapminder$year == year)
+  print(if (has_records) paste("Record(s) for the year", year, "found.")
+        else paste("No records for year", year))
+}
+```
+
+
+## Loop and If Statements
+
+Write a script that finds the mean life expectancy by country for countries whose population is below the mean for the dataset
+
+Write a script that loops through the `gapminder` data by continent and prints out whether the mean life expectancy is smaller than 50, between 50 and 70, or greater than 70.
+
+```{asis}
+### Answer
+```
+
+```{r}
+# Print the mean life expectancy of every country whose average population
+# is below the average population of the whole dataset.
+pop_cutoff <- mean(gapminder$pop)
+for (country in unique(gapminder$country)) {
+  rows <- gapminder$country == country
+  if (mean(gapminder$pop[rows]) >= pop_cutoff) {
+    next
+  }
+  print(paste("Mean Life Expectancy in", country, "is", mean(gapminder$lifeExp[rows])))
+}
+```
+
+```{r}
+# Classify each continent's mean life expectancy against two cut-offs.
+lower_threshold <- 50
+upper_threshold <- 70
+
+for (i in unique(gapminder$continent)) {
+  tmp <- mean(gapminder$lifeExp[gapminder$continent == i])
+
+  if (tmp < lower_threshold) {
+    print(paste("Average Life Expectancy in", i, "is less than", lower_threshold))
+  } else if (tmp <= upper_threshold) {
+    # Reaching here means tmp >= lower_threshold, so the boundaries 50 and 70
+    # are reported as "between" instead of falling through to "greater than".
+    print(paste("Average Life Expectancy in", i, "is between", lower_threshold, "and", upper_threshold))
+  } else {
+    print(paste("Average Life Expectancy in", i, "is greater than", upper_threshold))
+  }
+}
+```
+
+
+## Writing Functions
+
+Create a function that given a data frame will print the name of each column and the class of data it contains. Use the gapminder dataset.  Hint: Use `mode()` or `class()` to get the class of the data in each column. Remember that `names()` or `colnames()` returns the name of the columns in a dataset.
+
+```{asis}
+### Answer
+
+Note: Some of these were taken or modified from https://www.r-bloggers.com/functions-exercises/
+```
+
+```{r}
+# Print "<column>: <mode>" for each column of `df` (data.frame or tibble).
+data_frame_info <- function(df) {
+  for (i in names(df)) {
+    print(paste0(i, ": ", mode(df[[i]])))  # `[[` yields a vector even for tibbles
+  }
+}
+data_frame_info(gapminder)
+```
+
+Create a function that given a vector will print the mean and the standard deviation of a **vector**, it will optionally also print the median. Hint: include an argument that takes a boolean (`TRUE`/`FALSE`) operator and then include an `if` statement.
+
+```{asis}
+### Answer
+
+```
+
+```{r}
+# Print the mean and standard deviation of numeric vector `x`; the median is
+# printed as well when `include_median` is TRUE.
+vector_info <- function(x, include_median=FALSE) {
+  print(paste("Mean:", mean(x)))
+  print(paste("Standard Deviation:", sd(x)))
+  if (include_median) print(paste("Median:", median(x)))
+}
+
+le <- gapminder$lifeExp
+vector_info(le, include_median = F)
+vector_info(le, include_median = T)
+```
+
+## Analyzing the relationship between GDP per capita and life expectancy
+
+Use what you've learned so far to answer the following questions using the `gapminder` dataset. Be sure to include some visualizations!
+
+1. What is the relationship between GDP per capita and life expectancy? Does this relationship change over time? (Hint: Use the natural log of both variables.)
+
+2. Does the relationship between GDP per capita and life expectancy vary by continent? Make sure you divide the Americas into North and South America.
\ No newline at end of file

From 8a440c8ac33e9f0b6c33d542a9aeaaa414aa354a Mon Sep 17 00:00:00 2001
From: zixiaohuang9874 <zixiaohuang2021@u.northwestern.edu>
Date: Tue, 15 Sep 2020 22:23:26 +0800
Subject: [PATCH 2/9] Add submission file

---
 submissions/FinalRExercise_HuangZixiao.Rmd | 46 ++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 submissions/FinalRExercise_HuangZixiao.Rmd

diff --git a/submissions/FinalRExercise_HuangZixiao.Rmd b/submissions/FinalRExercise_HuangZixiao.Rmd
new file mode 100644
index 0000000..956c9f7
--- /dev/null
+++ b/submissions/FinalRExercise_HuangZixiao.Rmd
@@ -0,0 +1,46 @@
+---
+title: "FinalRExercise_HuangZixiao.Rmd"
+author: "Zixiao Huang"
+date: "9/15/2020"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## R Markdown
+
+This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
+
+When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
+
+```{r}
+library(tidyverse) 
+library(data.table)
+```
+
+# Task 1: Import your data (nys_schools.csv and nys_acs.csv)
+```{r}
+# Read the school-level data (path is relative to the project root located by here)
+schools <- read.csv(here::here("data", "nys_schools.csv"))
+
+# Read the county-level ACS data
+counties <- read.csv(here::here("data", "nys_acs.csv"))
+```
+
+# Task 2: Explore your data
+
+# Task 3: Recoding and Variable Manipulation
+1. Deal with missing values, which are currently encoded as -99
+```{r}
+
+
+
+
+```
+
+
+
+
+

From 5a7b22bdd4ea24e60b580aeebddcad55d11cf918 Mon Sep 17 00:00:00 2001
From: zixiaohuang9874 <zixiaohuang2021@u.northwestern.edu>
Date: Tue, 15 Sep 2020 23:52:31 +0800
Subject: [PATCH 3/9] Add submission file

---
 submissions/FinalRExercise_HuangZixiao.Rmd | 38 +++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/submissions/FinalRExercise_HuangZixiao.Rmd b/submissions/FinalRExercise_HuangZixiao.Rmd
index 956c9f7..ad63139 100644
--- a/submissions/FinalRExercise_HuangZixiao.Rmd
+++ b/submissions/FinalRExercise_HuangZixiao.Rmd
@@ -30,17 +30,53 @@ counties <- read.csv(here::here("Desktop/Northwestern/Bootcamp/bootcamp-2020/dat
 ```
 
 # Task 2: Explore your data
+```{r}
+summary(schools)
+summary(counties)
+```
 
 # Task 3: Recoding and Variable Manipulation
 1. Deal with missing values, which are currently encoded as -99
 ```{r}
+# Recode the -99 sentinel as NA so that later summaries can skip missing
+# values via na.rm instead of treating -99 as real data.
+schools[schools == -99] <- NA
+counties[counties == -99] <- NA
+```
 
+2. Create a categorical variable that groups counties into "high", "medium", and "low" poverty groups.
+```{r}
+# Group counties into three different poverty groups by using the median_household_income column
+# Set the counties with lowest 25% median household income (first quartile) as "high" poverty group (income <= 46347)
+# Set the counties with highest 25% median household income (fourth quartile) as "low" poverty group (income > 56448)
+# Set the middle 50% as "medium" poverty group (46347 < income <= 56448)
+
+# Bucket counties by median household income; intervals are right-closed,
+# so 46347 falls in "high" and 56448 falls in "medium".
+income_breaks <- c(-Inf, 46347, 56448, Inf)
+poverty_labels <- c("high", "medium", "low")
+income_group <- cut(counties$median_household_income,
+                    breaks = income_breaks, labels = poverty_labels)
+# Store as character (not factor); counties with a missing income stay NA
+counties$poverty_level <- as.character(income_group)
+```
 
+3. Create a new variable that is the standardized z-score for math and English Language Arts (ELA)
+for each year.
+```{r}
+schools <- schools %>%
+  group_by(year) %>%  # standardize within each year
+  mutate(z_score_math = as.numeric(scale(mean_math_score)),
+         z_score_ela = as.numeric(scale(mean_ela_score))) %>%  # scale() returns a 1-column matrix
+  ungroup()
+```
 
+# Task 4: Merge datasets
+Create a county-level dataset that merges variables from the schools dataset and the ACS dataset.
+```{r}
 
 ```
 
 
 
 
-

From c2d3c18bf69a0af1a89a45daf150a1d2a462b4e0 Mon Sep 17 00:00:00 2001
From: zixiaohuang9874 <zixiaohuang2021@u.northwestern.edu>
Date: Wed, 16 Sep 2020 00:45:01 +0800
Subject: [PATCH 4/9] Finishing Several Tasks

---
 submissions/FinalRExercise_HuangZixiao.Rmd | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/submissions/FinalRExercise_HuangZixiao.Rmd b/submissions/FinalRExercise_HuangZixiao.Rmd
index ad63139..497d0a4 100644
--- a/submissions/FinalRExercise_HuangZixiao.Rmd
+++ b/submissions/FinalRExercise_HuangZixiao.Rmd
@@ -74,9 +74,27 @@ schools <- schools %>%
 # Task 4: Merge datasets
 Create a county-level dataset that merges variables from the schools dataset and the ACS dataset.
 ```{r}
+county_school <- merge(schools, counties, by = c("county_name", "year"))
+```
+
+# Task 5: Generate summary tables
+1. For each county: total enrollment, percent of students qualifying for free or reduced price lunch, and percent of 
+population in poverty. 
+```{r}
+tmp <- county_school %>%
+       group_by(county_name) %>%
+       select(total_enroll, per_free_lunch, per_reduced_lunch, county_per_poverty) 
+
 
 ```
 
+2. For the counties with the top 5 and bottom 5 poverty rate: percent of population in poverty, percent of students 
+qualifying for free or reduced price lunch, mean reading score, and mean math score.
 
+# Task 6: Data Visualization
+1. The relationship between access to free/reduced price lunch and test performance, at the school level.
 
+2. Average test performance across counties with high, low, and medium poverty.
 
+# Task 7: Answering questions
+What can the data tell us about the relationship between poverty and test performance in New York public schools? Has this relationship changed over time? Is this relationship at all moderated by access to free/reduced price lunch?

From 84d8c8d8874c371384ff7fadaa2a8f6626cad1e5 Mon Sep 17 00:00:00 2001
From: zixiaohuang9874 <zixiaohuang2021@u.northwestern.edu>
Date: Wed, 16 Sep 2020 15:38:58 +0800
Subject: [PATCH 5/9] Changes in Task 5

---
 submissions/FinalRExercise_HuangZixiao.Rmd | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/submissions/FinalRExercise_HuangZixiao.Rmd b/submissions/FinalRExercise_HuangZixiao.Rmd
index 497d0a4..0d81379 100644
--- a/submissions/FinalRExercise_HuangZixiao.Rmd
+++ b/submissions/FinalRExercise_HuangZixiao.Rmd
@@ -82,14 +82,23 @@ county_school <- merge(schools, counties, by = c("county_name", "year"))
 population in poverty. 
 ```{r}
 tmp <- county_school %>%
-       group_by(county_name) %>%
-       select(total_enroll, per_free_lunch, per_reduced_lunch, county_per_poverty) 
-
-
+        mutate(free_lunch = total_enroll * per_free_lunch, reduced_lunch = total_enroll * per_reduced_lunch) %>%
+        group_by(county_name) %>%
+        summarise(sum_enroll = sum(total_enroll, na.rm = T),
+                 per_free_lunch = sum(total_enroll, na.rm = T) / sum(free_lunch, na.rm = T),
+                 per_reduced_lunch = sum(total_enroll, na.rm = T) / sum(reduced_lunch, na.rm = T),
+                 per_poverty = mean(county_per_poverty)) 
+
+tmp
 ```
 
 2. For the counties with the top 5 and bottom 5 poverty rate: percent of population in poverty, percent of students 
 qualifying for free or reduced price lunch, mean reading score, and mean math score.
+```{r}
+# Use the table generated in task5.1 to find the counties with the top 5 and bottom 5 poverty rate.
+# Then use dplyr to select the desired rows and columns 
+
+```
 
 # Task 6: Data Visualization
 1. The relationship between access to free/reduced price lunch and test performance, at the school level.

From 162ee58f0aa7de984f0e5a93ab0a0138a643a2a2 Mon Sep 17 00:00:00 2001
From: zixiaohuang9874 <zixiaohuang2021@u.northwestern.edu>
Date: Wed, 16 Sep 2020 16:37:47 +0800
Subject: [PATCH 6/9] Finishing Task 5

---
 submissions/FinalRExercise_HuangZixiao.Rmd | 42 ++++++++++++++++------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/submissions/FinalRExercise_HuangZixiao.Rmd b/submissions/FinalRExercise_HuangZixiao.Rmd
index 0d81379..f0b447c 100644
--- a/submissions/FinalRExercise_HuangZixiao.Rmd
+++ b/submissions/FinalRExercise_HuangZixiao.Rmd
@@ -81,27 +81,47 @@ county_school <- merge(schools, counties, by = c("county_name", "year"))
 1. For each county: total enrollment, percent of students qualifying for free or reduced price lunch, and percent of 
 population in poverty. 
 ```{r}
-tmp <- county_school %>%
-        mutate(free_lunch = total_enroll * per_free_lunch, reduced_lunch = total_enroll * per_reduced_lunch) %>%
-        group_by(county_name) %>%
-        summarise(sum_enroll = sum(total_enroll, na.rm = T),
-                 per_free_lunch = sum(total_enroll, na.rm = T) / sum(free_lunch, na.rm = T),
-                 per_reduced_lunch = sum(total_enroll, na.rm = T) / sum(reduced_lunch, na.rm = T),
-                 per_poverty = mean(county_per_poverty)) 
-
-tmp
+summary1 <- county_school %>%
+              mutate(free_lunch = total_enroll * per_free_lunch, reduced_lunch = total_enroll * per_reduced_lunch) %>%
+              group_by(county_name) %>%
+              summarise(sum_enroll = sum(total_enroll, na.rm = T),
+                  per_free_lunch = sum(total_enroll, na.rm = T) / sum(free_lunch, na.rm = T),
+                  per_reduced_lunch = sum(total_enroll, na.rm = T) / sum(reduced_lunch, na.rm = T),
+                  per_poverty = mean(county_per_poverty)) 
+
+summary1
 ```
 
 2. For the counties with the top 5 and bottom 5 poverty rate: percent of population in poverty, percent of students 
 qualifying for free or reduced price lunch, mean reading score, and mean math score.
 ```{r}
-# Use the table generated in task5.1 to find the counties with the top 5 and bottom 5 poverty rate.
-# Then use dplyr to select the desired rows and columns 
+# Create a temporary table with the mean reading score and mean math score for each county
+tmp <- county_school %>%
+        group_by(county_name) %>%
+        summarise(mean_ela = mean(mean_ela_score, na.rm = T),
+                  mean_math = mean(mean_math_score, na.rm = T))
 
+# Merge the temporary table with the summary1 table in the previous task
+summary2 <- merge(summary1, tmp, by = c("county_name"))
+
+# Sort by poverty rate (highest first), then keep the 5 highest and 5 lowest
+# counties; head()/tail() work for any number of counties (no hard-coded 57).
+summary2 <- summary2[order(-summary2$per_poverty), ]
+tmp <- head(summary2, 5)
+tmp2 <- tail(summary2, 5)
+
+summary2 <- rbind(tmp, tmp2)
+
+# Select the required columns of summary2
+summary2 <- summary2 %>% select(-sum_enroll)
+summary2
 ```
 
 # Task 6: Data Visualization
 1. The relationship between access to free/reduced price lunch and test performance, at the school level.
+```{r}
+# Use the schools dataframe and 
+```
 
 2. Average test performance across counties with high, low, and medium poverty.
 

From 5a08fbf6da44bbc9b0220893f6c578c3d106a87f Mon Sep 17 00:00:00 2001
From: zixiaohuang9874 <zixiaohuang2021@u.northwestern.edu>
Date: Wed, 16 Sep 2020 16:56:09 +0800
Subject: [PATCH 7/9] Finishing Task 6.1

---
 submissions/FinalRExercise_HuangZixiao.Rmd | 39 +++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/submissions/FinalRExercise_HuangZixiao.Rmd b/submissions/FinalRExercise_HuangZixiao.Rmd
index f0b447c..f48b142 100644
--- a/submissions/FinalRExercise_HuangZixiao.Rmd
+++ b/submissions/FinalRExercise_HuangZixiao.Rmd
@@ -120,7 +120,44 @@ summary2
 # Task 6: Data Visualization
 1. The relationship between access to free/reduced price lunch and test performance, at the school level.
 ```{r}
-# Use the schools dataframe and 
+# Use the schools dataframe
+# Relationship between free price lunch and ela score
+ggplot(data = schools) +
+  geom_point(aes(x = per_free_lunch, y = z_score_ela)) +
+  labs(title = "Relationship between percentage of free lunch and ela score", x = "Percentage of free lunch", 
+       y = "z-score of ELA") +
+  scale_x_continuous(limits = c(0,1)) +
+  scale_y_continuous(limits = c(-5,5))
+```
+
+```{r}
+# Relationship between free price lunch and math score
+ggplot(data = schools) +
+  geom_point(aes(x = per_free_lunch, y = z_score_math)) +
+  labs(title = "Relationship between percentage of free lunch and math score", x = "Percentage of free lunch", 
+       y = "z-score of math") +
+  scale_x_continuous(limits = c(0,1)) +
+  scale_y_continuous(limits = c(-5,5))
+```
+
+```{r}
+# Relationship between reduced price lunch and ela score
+ggplot(data = schools) +
+  geom_point(aes(x = per_reduced_lunch, y = z_score_ela)) +
+  labs(title = "Relationship between percentage of reduced lunch and ela score", x = "Percentage of reduced lunch", 
+       y = "z-score of ELA") +
+  scale_x_continuous(limits = c(0,1)) +
+  scale_y_continuous(limits = c(-5,5))
+```
+
+```{r}
+# Relationship between reduced price lunch and math score
+ggplot(data = schools) +
+  geom_point(aes(x = per_reduced_lunch, y = z_score_math)) +
+  labs(title = "Relationship between percentage of reduced lunch and math score", x = "Percentage of reduced lunch", 
+       y = "z-score of ELA") +
+  scale_x_continuous(limits = c(0,1)) +
+  scale_y_continuous(limits = c(-5,5))
 ```
 
 2. Average test performance across counties with high, low, and medium poverty.

From df406abe1c53f9c8c6517bc9324dfdc5daf69c57 Mon Sep 17 00:00:00 2001
From: zixiaohuang9874 <zixiaohuang2021@u.northwestern.edu>
Date: Wed, 16 Sep 2020 17:24:05 +0800
Subject: [PATCH 8/9] Finishing all tasks

---
 submissions/FinalRExercise_HuangZixiao.Rmd | 27 ++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/submissions/FinalRExercise_HuangZixiao.Rmd b/submissions/FinalRExercise_HuangZixiao.Rmd
index f48b142..b61a478 100644
--- a/submissions/FinalRExercise_HuangZixiao.Rmd
+++ b/submissions/FinalRExercise_HuangZixiao.Rmd
@@ -161,6 +161,33 @@ ggplot(data = schools) +
 ```
 
 2. Average test performance across counties with high, low, and medium poverty.
+```{r}
+# Yearly mean ELA z-score for each poverty group, drawn as one line per group
+ela_by_poverty <- summarise(group_by(county_school, year, poverty_level),
+                            mean_z_score_ela = mean(z_score_ela, na.rm = TRUE))
+ggplot(ela_by_poverty,
+       aes(x = year, y = mean_z_score_ela, group = poverty_level, col = poverty_level)) +
+  geom_line() +
+  labs(title = "Relationship between poverty level and ela score across years", x = "year", y = "ELA z-score") +
+  theme_classic() +
+  theme(plot.title = element_text(hjust = 0.5, face="bold"), panel.border = element_blank())
+```
+```{r}
+# Yearly mean math z-score for each poverty group, drawn as one line per group
+math_by_poverty <- summarise(group_by(county_school, year, poverty_level),
+                             mean_z_score_math = mean(z_score_math, na.rm = TRUE))
+ggplot(math_by_poverty,
+       aes(x = year, y = mean_z_score_math, group = poverty_level, col = poverty_level)) +
+  geom_line() +
+  labs(title = "Relationship between poverty level and math score across years", x = "year", y = "Math z-score") +
+  theme_classic() +
+  theme(plot.title = element_text(hjust = 0.5, face="bold"), panel.border = element_blank())
+```
 
 # Task 7: Answering questions
 What can the data tell us about the relationship between poverty and test performance in New York public schools? Has this relationship changed over time? Is this relationship at all moderated by access to free/reduced price lunch?
+
+Answer:
+The data tells us that the lower the poverty level, the better the test performance in New York public schools. This
+relationship hasn't changed over time. This relationship does not seem to be moderated by access to free/reduced price
+lunch because the gap in performance between different poverty levels grew during the past few years.
\ No newline at end of file

From d42488ad1ce46721a2e740cc6f08a7c5c5f8a011 Mon Sep 17 00:00:00 2001
From: zixiaohuang9874 <zixiaohuang2021@u.northwestern.edu>
Date: Thu, 17 Sep 2020 15:19:50 +0800
Subject: [PATCH 9/9] Finishing all tasks

---
 submissions/FinalRExercise_HuangZixiao.Rmd | 26 ++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/submissions/FinalRExercise_HuangZixiao.Rmd b/submissions/FinalRExercise_HuangZixiao.Rmd
index b61a478..13f7588 100644
--- a/submissions/FinalRExercise_HuangZixiao.Rmd
+++ b/submissions/FinalRExercise_HuangZixiao.Rmd
@@ -35,6 +35,14 @@ summary(schools)
 summary(counties)
 ```
 
+Answer:
+There are several different types of variables in these datasets, such as character and numerical
+variables (integer/double). There are missing data: some of the variables have a minimum
+value of -99, which is how missing values are currently encoded. The time ranges of the two datasets
+are different. The schools data frame has a time range from 2008 to 2017, while that of counties
+is 2009 to 2016. Therefore, if we want to merge these two data frames together, some entries in
+the schools data frame might be dropped to facilitate our analysis.
+
 # Task 3: Recoding and Variable Manipulation
 1. Deal with missing values, which are currently encoded as -99
 ```{r}
@@ -82,11 +90,13 @@ county_school <- merge(schools, counties, by = c("county_name", "year"))
 population in poverty. 
 ```{r}
 summary1 <- county_school %>%
+              # Calculate the total number of students with free/reduced lunch over the period
               mutate(free_lunch = total_enroll * per_free_lunch, reduced_lunch = total_enroll * per_reduced_lunch) %>%
               group_by(county_name) %>%
               summarise(sum_enroll = sum(total_enroll, na.rm = T),
                   per_free_lunch = sum(total_enroll, na.rm = T) / sum(free_lunch, na.rm = T),
                   per_reduced_lunch = sum(total_enroll, na.rm = T) / sum(reduced_lunch, na.rm = T),
+                  # Calculate the poverty rate by calculating the average over years
                   per_poverty = mean(county_per_poverty)) 
 
 summary1
@@ -127,7 +137,9 @@ ggplot(data = schools) +
   labs(title = "Relationship between percentage of free lunch and ela score", x = "Percentage of free lunch", 
        y = "z-score of ELA") +
   scale_x_continuous(limits = c(0,1)) +
-  scale_y_continuous(limits = c(-5,5))
+  scale_y_continuous(limits = c(-5,5)) +
+  theme_classic() +
+  theme(plot.title = element_text(hjust = 0.5, face="bold"), panel.border = element_blank())
 ```
 
 ```{r}
@@ -137,7 +149,9 @@ ggplot(data = schools) +
   labs(title = "Relationship between percentage of free lunch and math score", x = "Percentage of free lunch", 
        y = "z-score of math") +
   scale_x_continuous(limits = c(0,1)) +
-  scale_y_continuous(limits = c(-5,5))
+  scale_y_continuous(limits = c(-5,5)) +
+  theme_classic() +
+  theme(plot.title = element_text(hjust = 0.5, face="bold"), panel.border = element_blank())
 ```
 
 ```{r}
@@ -147,7 +161,9 @@ ggplot(data = schools) +
   labs(title = "Relationship between percentage of reduced lunch and ela score", x = "Percentage of reduced lunch", 
        y = "z-score of ELA") +
   scale_x_continuous(limits = c(0,1)) +
-  scale_y_continuous(limits = c(-5,5))
+  scale_y_continuous(limits = c(-5,5)) +
+  theme_classic() +
+  theme(plot.title = element_text(hjust = 0.5, face="bold"), panel.border = element_blank())
 ```
 
 ```{r}
@@ -157,7 +173,9 @@ ggplot(data = schools) +
   labs(title = "Relationship between percentage of reduced lunch and math score", x = "Percentage of reduced lunch", 
        y = "z-score of ELA") +
   scale_x_continuous(limits = c(0,1)) +
-  scale_y_continuous(limits = c(-5,5))
+  scale_y_continuous(limits = c(-5,5)) +
+  theme_classic() +
+  theme(plot.title = element_text(hjust = 0.5, face="bold"), panel.border = element_blank())
 ```
 
 2. Average test performance across counties with high, low, and medium poverty.