diff --git a/.gitignore b/.gitignore index 5b6a065..7b732e7 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .Rhistory .RData .Ruserdata +.DS_Store diff --git a/submissions/FinalRExercise_KeChengHao.Rmd b/submissions/FinalRExercise_KeChengHao.Rmd new file mode 100644 index 0000000..6a9d28b --- /dev/null +++ b/submissions/FinalRExercise_KeChengHao.Rmd @@ -0,0 +1,497 @@ +--- +title: "MSIA Boot Camp - Final R exercise" +author: "Cheng Hao Ke" +output: + html_document: + includes: + in_header: header.html +--- + +

+```{r setup, message = FALSE, warning=FALSE} +library(car) +library(htmltools) +library(rmarkdown) +library(knitr) +library(kableExtra) +library(pander) +library(glue) +library(gtable) +library(grid) +library(gridExtra) +library(nnet) +library(sjPlot) +library(data.table) +library(tidyverse) + +knitr::opts_chunk$set(echo = TRUE) +options(scipen = 999) +``` + +```{r htmlTemplate, echo=FALSE} +img <- htmltools::img(src = "msia.png", + alt = 'logo', + style = 'position:absolute; top:10px; right:1%; padding:10px;z-index:200;', + width="300px") +htmlhead <- paste0(' + +') +readr::write_lines(htmlhead, path = "header.html") +``` +

+ +#### Task 1: Import your data + +Read the data files `nys_schools.csv` and `nys_acs.csv` into R. These data come from two different sources: one is data on *schools* in New York state from the [New York State Department of Education](http://data.nysed.gov/downloads.php), and the other is data on *counties* from the American Communities Survey from the US Census Bureau. Review the codebook file so that you know what each variable name means in each dataset. + +```{r read1, message = FALSE, warning=FALSE} +# read in data +school0 <- read_csv('../data/nys_schools.csv') +county0 <- read_csv('../data/nys_acs.csv') +``` +

+ +#### Task 2: Explore your data + +Getting to know your data is a critical part of data analysis. Take the time to explore the structure of the two dataframes you have imported. What types of variables are there? Is there any missing data? How can you tell? What else do you notice about the data? +

+```{r eda1, message = FALSE, warning=FALSE} +# function to obtain data type of each column +cltype <- function(df) { + colt0 <- as.data.table(lapply(df, class)) + colt0$No <- 1 + colmelt <- names(colt0[, !c('No'), with = FALSE]) + colt1 <- melt(colt0, id.vars = c("No"), measure.vars = colmelt, + variable.name = 'Columns', value.name = 'DataType') + colt1$No <- rownames(colt1) + return(colt1) + } + +schcol <- cltype(school0) +ctycol <- cltype(county0) + +# combine both data type dfs for kable +addcol <- dim(schcol)[1] - dim(ctycol)[1] +ctycol <- rbind(ctycol, as.data.table(matrix('--', ncol = dim(ctycol)[2], nrow = addcol)), + use.names=FALSE) +allcol <- cbind(schcol, ctycol) +``` +

+**Table 1. Data Types**
+```{r table1, message = FALSE, results='asis'}
+kable(allcol, caption = "Data types for school and county files",
+      booktabs = T, longtable = T) %>%
+  kable_styling(bootstrap_options = c("striped", "condensed", "responsive"),
+                full_width = T, position = "left") %>%
+  add_header_above(c("Schools" = 3, "Counties" = 3))
+```
+
+```{r eda2, message = FALSE, warning=FALSE}
+missingf <- function(df) {
+  # count number of missing values of each type
+  mis1 <- list()
+  for (i in c(1:length(df))) {
+    j <- names(df)[i]
+    mis0 <- df %>% 
+      filter(.data[[j]] %in% c('-99', -99) | is.na(.data[[j]])) %>%
+      count(.data[[j]])
+    mis1[[i]] <- mis0
+  }
+  mis2 <- as.data.table(mis1)
+  # get columns for melt
+  col0 <- names(mis2)
+  
+  # first melt
+  nam0 <- grep('[.]', col0, value = TRUE, invert = TRUE)
+  nam0 <- nam0[!nam0 == 'n']
+  mis3a <- melt(mis2[, ..nam0], measure.vars = nam0, 
+                variable.name = 'Columns', value.name = 'Missing')
+  # second melt
+  nam1 <- col0[!(col0 %in% nam0)]
+  mis3b <- melt(mis2[, ..nam1], measure.vars = nam1, 
+                variable.name = 'Columns', value.name = 'Amount')
+  # combine
+  mis4 <- cbind(mis3a, mis3b)
+  # drop col
+  mis4 <- mis4[, c(1, 2, 4)]
+  # remove duplicate
+  mis5 <- unique(mis4)
+  # recode values for better looking table
+  mis5$Missing[is.na(mis5$Missing)] <- 'NA'
+  mis5$Amount[is.na(mis5$Amount)] <- '' 
+  
+  return(mis5)
+}
+
+mschool <- missingf(school0)
+mcounty <- missingf(county0)
+
+miall <- rbind(mschool, mcounty)
+```
+

+As shown in the table below, there are quite a number of missing values. Missing values are especially prevalent for math and reading scores. It also seems that there were various data entry errors present for the percentage of students with free and reduced price lunch. These entries were all excluded in subsequent analyses.
+
+**Table 2. Missing Values**
+```{r table2, message = FALSE, results='asis'}
+kable(miall, caption = "Missing values by column",
+      booktabs = T, longtable = T) %>%
+  kable_styling(bootstrap_options = c("striped", "condensed", "responsive"),
+                full_width = T, position = "left") %>%
+  footnote(general = "Both NA and -99 are considered to be missing values.")
+```

+ +#### Task 3: Recoding and variable manipulation + +1. Deal with missing values, which are currently coded as `-99`. +2. Create a categorical variable that groups counties into "high", "medium", and "low" poverty groups. Decide how you want to split up the groups and briefly explain your decision. +3. The tests that the NYS Department of Education administers changes from time to time, so scale scores are not directly comparable year-to-year. Create a new variable that is the standardized z-score for math and English Language Arts (ELA) for each year (hint: group by year and use the `scale()` function) + +```{r recode1, message=FALSE, warning=FALSE} +# replace -99 with NA +school1 <- school0 +school1[school1 == -99 |school1 == '-99'] <- NA +# remove rows that have wrong lunch data (percentage should be lower than 1) +school1 <- school1[(school1$per_free_lunch < 1) & (school1$per_reduced_lunch < 1),] + +county1 <- county0 +county1[county1 == -99 |county1 == '-99'] <- NA +# test +# miss0 <- school1 %>% filter_all(any_vars(. %in% c('-99', -99) | is.na(.))) + +# group by county +county2 <- county1 %>% group_by(county_name) %>% + summarize(county_per_poverty = mean(county_per_poverty, na.rm = TRUE), + median_household_income = mean(median_household_income, na.rm = TRUE), + county_per_bach = mean(county_per_bach, na.rm = TRUE)) + +# quantiles 3 +q3 <- quantile(county2$county_per_poverty, probs = c(0, 1/3, 2/3)) +q3 <- append(q3, 1) +print(q3) +county2$povlvl <- cut(county2$county_per_poverty, breaks = q3, labels = c('low', 'medium', 'high'), + include.lowest = TRUE, right = FALSE) + +# scaling: z-score +school2 <- school1 %>% group_by(year) %>% + mutate(zmath = scale(mean_math_score), zeng = scale(mean_ela_score)) %>% + ungroup(year) +``` + +Poverty levels were recoded according to quantiles calculated using the r `quantile` function. +

+ +#### Task 4: Merge datasets + +Create a county-level dataset that merges variables from the schools dataset and the ACS dataset. Remember that you have learned multiple approaches on how to do this, and that you will have to decide how to summarize data when moving from the school to the county level. + +

+ +```{r merge0, message=FALSE, warning=FALSE} +# group school by county name +school3 <- school2 %>% group_by(county_name) %>% + summarize(total_enroll = mean(total_enroll, na.rm = TRUE), + per_free_lunch = mean(per_free_lunch, na.rm = TRUE), + per_reduced_lunch = mean(per_reduced_lunch, na.rm = TRUE), + mean_math_score = mean(mean_math_score, na.rm = TRUE), + mean_ela_score = mean(mean_ela_score, na.rm = TRUE), + zmath = mean(zmath, na.rm = TRUE), + zeng = mean(zeng, na.rm = TRUE)) + +# inner join both data +ctysch0 <- merge(county2, school3, by = "county_name", + all = FALSE, all.x = FALSE, all.y = FALSE) +ctysch0 <- as.data.table(ctysch0) +``` + +

+ +#### Task 5: Create summary tables + +Generate tables showing the following: + +1. For each county: total enrollment, percent of students qualifying for free or reduced price lunch, and percent of population in poverty. +2. For the counties with the top 5 and bottom 5 poverty rate: percent of population in poverty, percent of students qualifying for free or reduced price lunch, mean reading score, and mean math score. + +```{r table3, message=FALSE, warning=FALSE} +ctysch0[, c("county_name", 'total_enroll', 'per_free_lunch', 'per_reduced_lunch', + 'county_per_poverty')] %>% + # change percentage format + mutate(per_free_lunch = scales::percent(per_free_lunch), + per_reduced_lunch = scales::percent(per_reduced_lunch), + county_per_poverty = scales::percent(county_per_poverty) + ) %>% + mutate_if(is.numeric, function(x) {format(x, digits = 1, big.mark = ",", + decimal.mark = ".")}) %>% + kable(caption = "County performance on student enrollment, lunch price reduction, and percent of poverty", + booktabs = T, longtable = T) %>% + kable_styling(bootstrap_options = c("striped", "condensed", "responsive"), + full_width = T, position = "left") +``` + +

+
+```{r table4, message=FALSE, warning=FALSE}
+top5 <- ctysch0[order(county_per_poverty, decreasing = TRUE),]
+top5 <- top5[1:5, c('county_name', 'county_per_poverty', 'per_free_lunch', 
+                    'per_reduced_lunch', 'mean_math_score', 'mean_ela_score')]
+
+top5 %>%
+  mutate(county_per_poverty = scales::percent(county_per_poverty),
+         per_free_lunch = scales::percent(per_free_lunch),
+         per_reduced_lunch = scales::percent(per_reduced_lunch)
+         ) %>%
+  kable(caption = "Top 5 Counties with the highest poverty percentage",
+        booktabs = T, longtable = T) %>%
+  kable_styling(bootstrap_options = c("striped", "condensed", "responsive"),
+                full_width = T, position = "left") %>%
+  footnote(general = "Both NA and -99 are considered to be missing values.")
+# tables side by side:
+# list(top51, matrix(numeric(), nrow=0, ncol=1), bot51) %>%
+```
+
+```{r table5, message=FALSE, warning=FALSE}
+bot5 <- ctysch0[order(county_per_poverty)]
+bot5 <- bot5[1:5, c('county_name', 'county_per_poverty', 'per_free_lunch', 
+                    'per_reduced_lunch', 'mean_math_score', 'mean_ela_score')]
+bot5 %>%
+  mutate(county_per_poverty = scales::percent(county_per_poverty),
+         per_free_lunch = scales::percent(per_free_lunch),
+         per_reduced_lunch = scales::percent(per_reduced_lunch)
+         ) %>%
+  kable(caption = "Bottom 5 Counties with the lowest poverty percentage",
+        booktabs = T, longtable = T) %>%
+  kable_styling(bootstrap_options = c("striped", "condensed", "responsive"),
+                full_width = T, position = "left") %>%
+  footnote(general = "Both NA and -99 are considered to be missing values.")
+```

+
+#### Task 6: Data visualization
+
+Using `ggplot2`, visualize the following:
+
+1. The relationship between access to free/reduced price lunch and test performance, at the *school* level.
+2. Average test performance across *counties* with high, low, and medium poverty.
+
+```{r plot1, message=FALSE, warning=FALSE, fig.align="center", fig.width=12, fig.height=6}
+# group by school
+bysch0 <- school2 %>% group_by(school_name) %>%
+  summarize(per_free_lunch = mean(per_free_lunch, na.rm = TRUE),
+            per_reduced_lunch = mean(per_reduced_lunch, na.rm = TRUE),
+            mean_math_score = mean(mean_math_score, na.rm = TRUE),
+            mean_ela_score = mean(mean_ela_score, na.rm = TRUE),
+            zmath = mean(zmath, na.rm = TRUE),
+            zeng = mean(zeng, na.rm = TRUE))
+# remove schools with NA
+bysch1 <- na.omit(bysch0)
+
+sp1 <- ggplot(bysch1, aes(x=per_free_lunch, y=zmath)) + geom_point() +
+  geom_smooth(method=lm) + theme_classic() +
+  labs(title = 'Free lunch percentage and math score',
+       x = 'Free lunch percentage', y = 'Standardized math score')
+sp2 <- ggplot(bysch1, aes(x=per_free_lunch, y=zeng)) + geom_point() +
+  geom_smooth(method=lm) + theme_classic() +
+  labs(title = 'Free lunch percentage and reading score',
+       x = 'Free lunch percentage', y = 'Standardized reading score')
+sp3 <- ggplot(bysch1, aes(x=per_reduced_lunch, y=zmath)) + geom_point() +
+  geom_smooth(method=lm) + theme_classic() +
+  labs(title = 'Reduced lunch fee percentage and math score',
+       x = 'Reduced lunch fee percentage', y = 'Standardized math score')
+sp4 <- ggplot(bysch1, aes(x=per_reduced_lunch, y=zeng)) + geom_point() +
+  geom_smooth(method=lm) + theme_classic() +
+  labs(title = 'Reduced lunch fee percentage and reading score',
+       x = 'Reduced lunch fee percentage', y = 'Standardized reading score')
+
+grid.arrange(sp1, sp2, sp3, sp4, ncol = 2, nrow = 2)
+```
+The above plots seemed to suggest that percentage of students 
with reduced lunch price is positively associated with standardized scores. +

+```{r plot2, message=FALSE, warning=FALSE, fig.align = "center", fig.width=12, fig.height=6}
+cp1 <- ggplot(ctysch0, aes(x=zmath, y=zeng, color=povlvl, shape=povlvl)) +
+  geom_point(size = 5) + theme_classic() +
+  # geom_text(label=ctysch0$county_name) +
+  labs(title = 'County poverty level and student standardized scores',
+       x = 'Standardized math score', y = 'Standardized reading score')
+cp2 <- ggplot(ctysch0, aes(x=zmath, y=zeng, color=povlvl, shape=povlvl)) +
+  geom_smooth(method=lm) + theme_classic() +
+  labs(title = 'County poverty level associations with student standardized scores',
+       x = 'Standardized math score', y = 'Standardized reading score')
+
+grid.arrange(cp1, cp2, ncol = 2, nrow = 1)
+```
+The above plots indicate that low poverty levels are associated with higher math and reading standardized scores, while high poverty levels are associated with the reverse.

+ +#### Task 7: Answering questions + +Using the skills you have learned in the past three days, tackle the following question: + +> What can the data tell us about the relationship between poverty and test performance in New York public schools? Has this relationship changed over time? Is this relationship at all moderated by access to free/reduced price lunch? + +You may use summary tables, statistical models, and/or data visualization in pursuing an answer to this question. Feel free to build on the tables and plots you generated above in Tasks 5 and 6. + +

+```{r plot3, message=FALSE, warning=FALSE, fig.align = "center", fig.width=12, fig.height=6} +syear1 <- school2 %>% group_by(school_name, county_name, year) %>% + summarize(per_free_lunch = mean(per_free_lunch, na.rm = TRUE), + per_reduced_lunch = mean(per_reduced_lunch, na.rm = TRUE), + mean_math_score = mean(mean_math_score, na.rm = TRUE), + mean_ela_score = mean(mean_ela_score, na.rm = TRUE), + zmath = mean(zmath, na.rm = TRUE), + zeng = mean(zeng, na.rm = TRUE)) + +cyear1 <- county1 %>% group_by(county_name, year) %>% + summarize(county_per_poverty = mean(county_per_poverty, na.rm = TRUE)) +cyear2 <- merge(cyear1, county2[, c('county_name', 'povlvl')], by = "county_name", + all = FALSE, all.x = FALSE, all.y = FALSE) + +# inner join both data +csyr1 <- merge(cyear2, syear1, by = c("county_name", 'year'), + all = FALSE, all.x = FALSE, all.y = FALSE) +csyr2 <- as.data.table(csyr1) +csyr2 <- na.omit(csyr2) +csyr2$years <- factor(csyr2$year) + +cs1 <- ggplot(csyr2, aes(x=zmath, y=zeng, color=years, shape=povlvl)) + + geom_point(size = 5) + theme_classic() + + # geom_text(label=ctysch0$county_name) + + labs(title = 'County poverty level and student standardized scores by year', + x = 'Standardized math score', y = 'Standardized reading score') +cs2 <- ggplot(csyr2, aes(x=zmath, y=zeng, color=years, shape=povlvl)) + + geom_smooth(method=lm) + theme_classic() + + labs(title = 'County poverty level associations with student standardized scores by year', + x = 'Standardized math score', y = 'Standardized reading score') + +grid.arrange(cs1, cs2, ncol = 2, nrow = 1) +``` +The plots above revealed that poverty and standardized score associations did not have drastic changes from year to year. Thus, it would be appropriate to aggregate and analyze the above data without accounting for errors with time series structures. +

+ +```{r stats0, message=FALSE, warning=FALSE} +# group by school +school2$lunch <- school2$per_free_lunch + school2$per_reduced_lunch + +bysch2 <- school2 %>% group_by(school_name, county_name) %>% + summarize(per_free_lunch = mean(per_free_lunch, na.rm = TRUE), + per_reduced_lunch = mean(per_reduced_lunch, na.rm = TRUE), + lunch = mean(lunch, na.rm = TRUE), + mean_math_score = mean(mean_math_score, na.rm = TRUE), + mean_ela_score = mean(mean_ela_score, na.rm = TRUE), + zmath = mean(zmath, na.rm = TRUE), + zeng = mean(zeng, na.rm = TRUE)) +# remove schools with NA +bysch3 <- as.data.table(na.omit(bysch2)) + +county3 <- as.data.table(county2[, c('county_name', 'county_per_poverty', 'povlvl')]) + +ctysch1 <- merge(bysch3, county3, by = "county_name", + all = FALSE, all.x = FALSE, all.y = FALSE) +ctysch1 <- na.omit(ctysch1) + +# bivariate tests +print(cor.test(ctysch1$county_per_poverty, ctysch1$zmath)) +print(cor.test(ctysch1$county_per_poverty, ctysch1$zeng)) +print(cor.test(ctysch1$per_free_lunch, ctysch1$zmath)) +print(cor.test(ctysch1$per_free_lunch, ctysch1$zeng)) +print(cor.test(ctysch1$per_reduced_lunch, ctysch1$zmath)) +print(cor.test(ctysch1$per_reduced_lunch, ctysch1$zeng)) +``` + +Pearson correlation tests showed that there exists significant negative associations between county poverty levels and student standardized math and reading scores. In other words, the higher a county's percent of poverty, the lower its students score in math and reading. It's also interesting to note that the percentage of students with free lunch has a significant negative association with both standardized scores. Finally, the percentage of student with reduced lunch price is positively associated with both standardized scores. Although, the association between reading scores and reduced lunch price is low. 
+ +```{r stats1, message=FALSE, warning=FALSE} +# multivariate linear regression +lr1 <- lm(zmath ~ county_per_poverty + per_free_lunch + per_reduced_lunch, data=ctysch1) +summary(lr1) +print(durbinWatsonTest(lr1)) +``` + +The multivariate linear regression model showed that in the presence of the effects of the percentage of students with free or reduced lunch price, the association between county poverty and standardized math score is reversed. Associations between free lunch and reduced lunch price with math scores were not changed in the presence of county poverty. The Durbin Watson Test statistic indicated a slight positive autocorrelation within the residuals, this means that time series regression models might provide more accurate results. +

+```{r stats2, message=FALSE, warning=FALSE} +# multivariate linear regression +lr2 <- lm(zeng ~ county_per_poverty + per_free_lunch + per_reduced_lunch, data=ctysch1) +summary(lr2) +print(durbinWatsonTest(lr2)) +``` + +Standardized reading scores also have the similar associations with county poverty, free lunch and reduced lunch price. The Durbin Watson Test statistic of this reading score regression model also indicated a slight positive autocorrelation within the residuals. + +```{r inter0, message=FALSE, warning=FALSE} +# multivariate linear regression +lr3 <- lm(zmath ~ county_per_poverty + per_free_lunch + county_per_poverty*per_free_lunch, data=ctysch1) +summary(lr3) +lr4 <- lm(zmath ~ county_per_poverty + per_reduced_lunch + county_per_poverty*per_reduced_lunch, data=ctysch1) +summary(lr4) + +lr5 <- lm(zeng ~ county_per_poverty + per_free_lunch + county_per_poverty*per_free_lunch, data=ctysch1) +summary(lr5) +lr6 <- lm(zeng ~ county_per_poverty + per_reduced_lunch + county_per_poverty*per_reduced_lunch, data=ctysch1) +summary(lr6) +``` + +Regression models showed that the interaction between percent of students with free lunch and county poverty percentage is not significant. However, the interaction between percent of students with reduced lunch price and county poverty percentage is significant for both types of standardized scores. The coefficients for county poverty in all models were negative, this signifies that the association between poverty and standardized scores is negative when there is no lunch price reduction. However, since the coefficient of interaction is positive, this means that as the percentage of students with reduced lunch price increases, the negative association between poverty and scores also becomes less negative. In other words, as a school provides more students with reduced lunch the better the school's students score despite the school being located in a county with higher percentage of poverty. +

+ +In order to illustrate the above models, percentage of students with reduced lunch price was recoded to a categorical variable with 4 levels based on quartiles. The resulting plots clearly showed that when percentage of students with reduced lunch price increases, the negative association between poverty and scores lessen. This lessening effect gradually reverses to a positive association when the percentage of students with reduced lunch price rises over 10%. + +```{r inter1, message=FALSE, warning=FALSE, fig.align = "center", fig.width=12, fig.height=6} +# change reduced lunch to quartile +q4 <- quantile(ctysch1$per_reduced_lunch) +q4[5] <- 1 +print(q4) + +ctysch1$rlunchlvl <- cut(ctysch1$per_reduced_lunch, breaks = q4, labels = c('q1', 'q2', 'q3', 'q4'), + include.lowest = TRUE, right = FALSE) + +in0 <- lm(zmath ~ county_per_poverty + rlunchlvl + county_per_poverty*rlunchlvl, data=ctysch1) +in1 <- lm(zeng ~ county_per_poverty + rlunchlvl + county_per_poverty*rlunchlvl, data=ctysch1) + +ipl0 <- plot_model(in0, type = "pred", terms = c("county_per_poverty", "rlunchlvl")) +ipl1 <- plot_model(in1, type = "pred", terms = c("county_per_poverty", "rlunchlvl")) + +grid.arrange(ipl0, ipl1, ncol = 2, nrow = 1, clip = FALSE) +``` + +

+```{r stats3, message=FALSE, warning=FALSE}
+# Compute anova
+ano1 <- aov(zmath ~ povlvl, data = ctysch1)
+summary(ano1)
+# post hoc tests
+TukeyHSD(ano1)
+
+ano2 <- aov(zeng ~ povlvl, data = ctysch1)
+summary(ano2)
+TukeyHSD(ano2)
+```
+
+ANOVA tests and post hoc Tukey HSD tests between county poverty levels and standardized scores showed significant associations. These associations also align with previous test results.
+
+```{r stats4, message=FALSE, warning=FALSE}
+# multinomial logistic regression
+# Setting the baseline
+ctysch1$poverty <- relevel(ctysch1$povlvl, ref = "low")
+# Training the multinomial model
+mlog1 <- multinom(poverty ~ zmath + zeng + per_free_lunch + per_reduced_lunch, 
+                  data = ctysch1)
+
+# Checking the model
+print(summary(mlog1))
+print(exp(coef(mlog1)))
+
+# p-value
+z <- summary(mlog1)$coefficients/summary(mlog1)$standard.errors
+p <- (1 - pnorm(abs(z), 0, 1)) * 2
+print(p)
+# https://towardsdatascience.com/a-deep-dive-on-vector-autoregression-in-r-58767ebb3f06
+```
+
+The multinomial logistic regression model showed similar results to the multivariate linear regression model. Of note is the lack of significance between the association of standardized reading scores and different levels of poverty. In other words, there were no significant differences between schools located within counties classified into different poverty levels and mean student standardized reading scores. There was also no difference between high and low poverty levels for reduced lunch price. This corroborates the results obtained above: when the effect of lunch subsidies is accounted for, scores have a weak positive association with poverty levels. An explanation of the positive association between lunch and poverty levels is that poorer counties were more likely to subsidize student lunches.
+

diff --git a/submissions/FinalRExercise_KeChengHao.html b/submissions/FinalRExercise_KeChengHao.html new file mode 100644 index 0000000..e5df0fb --- /dev/null +++ b/submissions/FinalRExercise_KeChengHao.html @@ -0,0 +1,2910 @@ + + + + + + + + + + + + + + +MSIA Boot Camp - Final R exercise + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +



+
library(htmltools)
+library(rmarkdown)
+library(knitr)
+library(kableExtra)
+library(pander)
+library(glue)
+library(gtable)
+library(grid)
+library(gridExtra)
+library(nnet)
+library(sjPlot)
+library(data.table)
+library(tidyverse)
+
+knitr::opts_chunk$set(echo = TRUE)
+options(scipen = 999)
+



+
+

Task 1: Import your data

+

Read the data files nys_schools.csv and nys_acs.csv into R. These data come from two different sources: one is data on schools in New York state from the New York State Department of Education, and the other is data on counties from the American Communities Survey from the US Census Bureau. Review the codebook file so that you know what each variable name means in each dataset.

+
# read in data
+school0 <- read_csv('../data/nys_schools.csv')
+county0 <- read_csv('../data/nys_acs.csv')
+



+
+
+

Task 2: Explore your data

+

Getting to know your data is a critical part of data analysis. Take the time to explore the structure of the two dataframes you have imported. What types of variables are there? Is there any missing data? How can you tell? What else do you notice about the data?

+
# function to obtain data type of each column
+cltype <- function(df) {
+    colt0 <- as.data.table(lapply(df, class))
+    colt0$No <- 1
+    colmelt <- names(colt0[, !c('No'), with = FALSE])
+    colt1 <- melt(colt0, id.vars = c("No"), measure.vars = colmelt, 
+                  variable.name = 'Columns', value.name = 'DataType')
+    colt1$No <- rownames(colt1)
+    return(colt1)
+  }
+
+schcol <- cltype(school0)
+ctycol <- cltype(county0)
+
+# combine both data type dfs for kable
+addcol <- dim(schcol)[1] - dim(ctycol)[1]
+ctycol <- rbind(ctycol, as.data.table(matrix('--', ncol = dim(ctycol)[2], nrow = addcol)), 
+                use.names=FALSE)
+allcol <- cbind(schcol, ctycol)
+



Table 1. Data Types

+
kable(allcol, caption = "Data types for school and county files",
+      booktabs = T, longtable = T) %>%
+  kable_styling(bootstrap_options = c("striped", "condensed", "responsive"),
+                full_width = T, position = "left") %>%
+  add_header_above(c("Schools" = 3, "Counties" = 3))
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+Data types for school and county files +
+
+Schools +
+
+
+Counties +
+
+No + +Columns + +DataType + +No + +Columns + +DataType +
+1 + +school_cd + +character + +1 + +county_name + +character +
+2 + +school_name + +character + +2 + +year + +numeric +
+3 + +district_name + +character + +3 + +county_per_poverty + +numeric +
+4 + +county_name + +character + +4 + +median_household_income + +numeric +
+5 + +region + +character + +5 + +county_per_bach + +numeric +
+6 + +year + +numeric + +– + +– + +– +
+7 + +total_enroll + +numeric + +– + +– + +– +
+8 + +per_free_lunch + +numeric + +– + +– + +– +
+9 + +per_reduced_lunch + +numeric + +– + +– + +– +
+10 + +per_lep + +numeric + +– + +– + +– +
+11 + +mean_ela_score + +numeric + +– + +– + +– +
+12 + +mean_math_score + +numeric + +– + +– + +– +
+
missingf <- function(df) {
+  # count number of missing values of each type
+  mis1 <- list()
+  for (i in c(1:length(df))) {
+    j <- names(df)[i]
+    mis0 <- df %>% 
+      filter(.data[[j]] %in% c('-99', -99) | is.na(.data[[j]])) %>%
+      count(.data[[j]])
+    mis1[[i]] <- mis0
+  }
+  mis2 <- as.data.table(mis1)
+  # get columns for melt
+  col0 <- names(mis2)
+  
+  # first melt
+  nam0 <- grep('[.]', col0, value = TRUE, invert = TRUE)
+  nam0 <- nam0[!nam0 == 'n']
+  mis3a <- melt(mis2[, ..nam0], measure.vars = nam0, 
+                variable.name = 'Columns', value.name = 'Missing')
+  # second melt
+  nam1 <- col0[!(col0 %in% nam0)]
+  mis3b <- melt(mis2[, ..nam1], measure.vars = nam1, 
+                variable.name = 'Columns', value.name = 'Amount')
+  # combine
+  mis4 <- cbind(mis3a, mis3b)
+  # drop col
+  mis4 <- mis4[, c(1, 2, 4)]
+  # remove duplicate
+  mis5 <- unique(mis4)
+  # recode values for better looking table
+  mis5$Missing[is.na(mis5$Missing)] <- 'NA'
+  mis5$Amount[is.na(mis5$Amount)] <- '' 
+  
+  return(mis5)
+}
+
+mschool <- missingf(school0)
+mcounty <- missingf(county0)
+
+miall <- rbind(mschool, mcounty)
+



As shown in the table below, there are quite a number of missing values. Missing values are especially prevalent for math and reading scores. It also seems that there were various data entry errors present for the percentage of students with free and reduced price lunch. These entries were all excluded in subsequent analyses.

+

Table 2. Missing Values

+
kable(miall, caption = "Missing values by column",
+      booktabs = T, longtable = T) %>%
+  kable_styling(bootstrap_options = c("striped", "condensed", "responsive"),
+                full_width = T, position = "left") %>%
+  footnote(general = "Both NA and -99 are considered to be missing values.")
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+Missing values by column +
+Columns + +Missing + +Amount +
+school_cd + +NA + +
+school_name + +NA + +
+district_name + +-99 + +19 +
+district_name + +NA + +1706 +
+county_name + +-99 + +19 +
+region + +-99 + +19 +
+year + +NA + +
+total_enroll + +-99 + +13 +
+per_free_lunch + +-99 + +15 +
+per_reduced_lunch + +-99 + +15 +
+per_lep + +-99 + +13 +
+mean_ela_score + +-99 + +2208 +
+mean_math_score + +-99 + +2210 +
+Note: +
+ Both NA and -99 are considered to be missing values. +
+



+
+
+

Task 3: Recoding and variable manipulation

+
    +
  1. Deal with missing values, which are currently coded as -99.
  2. +
  3. Create a categorical variable that groups counties into “high”, “medium”, and “low” poverty groups. Decide how you want to split up the groups and briefly explain your decision.
  4. +
  5. The tests that the NYS Department of Education administers changes from time to time, so scale scores are not directly comparable year-to-year. Create a new variable that is the standardized z-score for math and English Language Arts (ELA) for each year (hint: group by year and use the scale() function)
  6. +
+
# replace -99 with NA
+school1 <- school0
+school1[school1 == -99 |school1 == '-99'] <- NA
+# remove rows that have wrong lunch data (percentage should be lower than 1)
+school1 <- school1[(school1$per_free_lunch < 1) & (school1$per_reduced_lunch < 1),]
+
+county1 <- county0
+county1[county1 == -99 |county1 == '-99'] <- NA
+# test
+# miss0 <- school1 %>% filter_all(any_vars(. %in% c('-99', -99) | is.na(.)))
+
+# group by county
+county2 <- county1 %>% group_by(county_name) %>% 
+  summarize(county_per_poverty = mean(county_per_poverty, na.rm = TRUE),
+            median_household_income = mean(median_household_income, na.rm = TRUE),
+            county_per_bach = mean(county_per_bach, na.rm = TRUE))
+
+# quantiles 3
+q3 <- quantile(county2$county_per_poverty, probs = c(0, 1/3, 2/3))
+q3 <- append(q3, 1)
+print(q3)
+
##         0%  33.33333%  66.66667%            
+## 0.05557705 0.11667442 0.14051301 1.00000000
+
county2$povlvl <- cut(county2$county_per_poverty, breaks = q3, labels = c('low', 'medium', 'high'),
+                      include.lowest = TRUE, right = FALSE)
+
+# scaling: z-score
+school2 <- school1 %>% group_by(year) %>% 
+  mutate(zmath = scale(mean_math_score), zeng = scale(mean_ela_score)) %>%
+  ungroup(year)
+

Poverty levels were recoded according to quantiles calculated using the R quantile function.

+
+
+

Task 4: Merge datasets

+

Create a county-level dataset that merges variables from the schools dataset and the ACS dataset. Remember that you have learned multiple approaches on how to do this, and that you will have to decide how to summarize data when moving from the school to the county level.

+



+
# Average each school-level measure up to the county level.
school3 <- school2 %>%
  group_by(county_name) %>%
  summarize(across(
    c(total_enroll, per_free_lunch, per_reduced_lunch,
      mean_math_score, mean_ela_score, zmath, zeng),
    ~ mean(.x, na.rm = TRUE)
  ))

# Inner join (all = FALSE on both sides) of the county ACS aggregates with the
# county-level school averages, converted to data.table for later subsetting.
ctysch0 <- merge(county2, school3, by = "county_name",
                 all = FALSE, all.x = FALSE, all.y = FALSE)
ctysch0 <- as.data.table(ctysch0)
+



+
+
+

Task 5: Create summary tables

+

Generate tables showing the following:

+
    +
  1. For each county: total enrollment, percent of students qualifying for free or reduced price lunch, and percent of population in poverty.
  2. +
  3. For the counties with the top 5 and bottom 5 poverty rate: percent of population in poverty, percent of students qualifying for free or reduced price lunch, mean reading score, and mean math score.
  4. +
+
# Task 5, table 1: per-county enrollment, lunch-subsidy shares, and poverty.
ctysch0[, c("county_name", "total_enroll", "per_free_lunch",
            "per_reduced_lunch", "county_per_poverty")] %>%
  # render the three share columns as percentages
  mutate(across(c(per_free_lunch, per_reduced_lunch, county_per_poverty),
                scales::percent)) %>%
  # pretty-print whatever is still numeric (total_enroll)
  mutate(across(where(is.numeric),
                ~ format(.x, digits = 1, big.mark = ",", decimal.mark = "."))) %>%
  kable(caption = "County performance on student enrollment, lunch price reduction, and percent of poverty",
        booktabs = TRUE, longtable = TRUE) %>%
  kable_styling(bootstrap_options = c("striped", "condensed", "responsive"),
                full_width = TRUE, position = "left")
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+County performance on student enrollment, lunch price reduction, and percent of poverty +
+county_name + +total_enroll + +per_free_lunch + +per_reduced_lunch + +county_per_poverty +
+ALBANY + +433 + +36.403% + +5.7665% + +12.30511% +
+ALLEGANY + +383 + +40.616% + +11.1712% + +15.08504% +
+BRONX + +561 + +80.233% + +5.6204% + +28.67147% +
+BROOME + +461 + +39.683% + +7.7214% + +15.86677% +
+CATTARAUGUS + +398 + +40.677% + +10.2681% + +16.51814% +
+CAYUGA + +387 + +34.338% + +7.1060% + +11.48767% +
+CHAUTAUQUA + +365 + +45.616% + +8.5630% + +17.52520% +
+CHEMUNG + +480 + +42.497% + +6.5683% + +14.85521% +
+CHENANGO + +345 + +43.026% + +11.0321% + +14.47734% +
+CLINTON + +366 + +35.172% + +8.5419% + +13.21178% +
+COLUMBIA + +426 + +30.947% + +8.3540% + +9.89453% +
+CORTLAND + +299 + +39.484% + +9.8710% + +13.37571% +
+DELAWARE + +320 + +39.586% + +12.6621% + +13.88953% +
+DUTCHESS + +448 + +26.308% + +6.8615% + +8.36447% +
+ERIE + +547 + +40.442% + +6.7577% + +13.99995% +
+ESSEX + +250 + +32.675% + +11.4917% + +10.98214% +
+FRANKLIN + +354 + +44.108% + +10.5324% + +15.64112% +
+FULTON + +369 + +42.487% + +9.5133% + +16.12855% +
+GENESEE + +417 + +32.883% + +9.0583% + +12.06421% +
+GREENE + +381 + +32.871% + +7.9483% + +12.66660% +
+HAMILTON + +97 + +19.250% + +10.3269% + +10.38686% +
+HERKIMER + +390 + +38.636% + +10.6848% + +13.89260% +
+JEFFERSON + +438 + +34.081% + +13.0129% + +14.23137% +
+KINGS + +570 + +72.274% + +6.4320% + +22.44771% +
+LEWIS + +324 + +37.295% + +13.8632% + +13.64802% +
+LIVINGSTON + +363 + +30.236% + +7.7826% + +11.90124% +
+MADISON + +331 + +33.809% + +8.5931% + +10.04190% +
+MONROE + +530 + +40.283% + +5.9072% + +14.04920% +
+MONTGOMERY + +460 + +43.294% + +7.7745% + +17.75885% +
+NASSAU + +532 + +18.170% + +4.2915% + +5.55770% +
+NEW YORK + +455 + +63.208% + +5.7118% + +17.15754% +
+NIAGARA + +471 + +38.340% + +8.5508% + +12.95836% +
+ONEIDA + +416 + +40.936% + +8.3503% + +14.89985% +
+ONONDAGA + +503 + +37.599% + +6.0243% + +13.92367% +
+ONTARIO + +523 + +28.775% + +7.4742% + +9.27393% +
+ORANGE + +632 + +30.120% + +7.5153% + +11.66177% +
+ORLEANS + +537 + +40.480% + +9.5333% + +12.80679% +
+OSWEGO + +447 + +41.467% + +9.3399% + +16.16781% +
+OTSEGO + +337 + +33.194% + +11.0971% + +14.70162% +
+PUTNAM + +632 + +11.331% + +3.2938% + +5.76395% +
+QUEENS + +757 + +61.588% + +10.0887% + +13.99263% +
+RENSSELAER + +425 + +33.121% + +6.3313% + +11.78365% +
+RICHMOND + +726 + +49.686% + +8.6370% + +11.37478% +
+ROCKLAND + +502 + +26.788% + +4.8303% + +12.71141% +
+SAINT LAWRENCE + +395 + +42.083% + +10.2310% + +16.35377% +
+SARATOGA + +550 + +17.077% + +4.5569% + +6.39764% +
+SCHENECTADY + +424 + +39.133% + +6.1333% + +11.67878% +
+SCHOHARIE + +374 + +34.443% + +11.4205% + +11.33890% +
+SCHUYLER + +312 + +37.378% + +8.4865% + +10.98782% +
+SENECA + +320 + +32.322% + +9.3444% + +11.54927% +
+STEUBEN + +391 + +38.434% + +11.6742% + +14.79747% +
+SUFFOLK + +583 + +22.286% + +5.1200% + +6.19403% +
+SULLIVAN + +507 + +41.764% + +8.3150% + +16.40239% +
+TIOGA + +369 + +39.458% + +10.0486% + +9.57096% +
+TOMPKINS + +343 + +33.313% + +7.4000% + +17.50210% +
+ULSTER + +420 + +31.981% + +7.5989% + +11.79799% +
+WARREN + +431 + +31.273% + +7.5400% + +10.78112% +
+WASHINGTON + +383 + +34.919% + +9.1284% + +11.86004% +
+WAYNE + +377 + +34.736% + +10.0377% + +11.33986% +
+WESTCHESTER + +535 + +29.052% + +5.1459% + +8.87114% +
+WYOMING + +308 + +29.387% + +9.8933% + +9.68185% +
+YATES + +490 + +45.333% + +9.5000% + +14.05235% +
+



+
# The five counties with the highest poverty rates.
top5 <- ctysch0[order(-county_per_poverty)][
  1:5,
  c("county_name", "county_per_poverty", "per_free_lunch",
    "per_reduced_lunch", "mean_math_score", "mean_ela_score")]

top5 %>%
  mutate(across(c(county_per_poverty, per_free_lunch, per_reduced_lunch),
                scales::percent)) %>%
  kable(caption = "Top 5 Counties with the highest poverty percentage",
        booktabs = TRUE, longtable = TRUE) %>%
  kable_styling(bootstrap_options = c("striped", "condensed", "responsive"),
                full_width = TRUE, position = "left") %>%
  footnote(general = "Both NA and -99 are considered to be missing values.")
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+Top 5 Counties with the highest poverty percentage +
+county_name + +county_per_poverty + +per_free_lunch + +per_reduced_lunch + +mean_math_score + +mean_ela_score +
+BRONX + +28.671% + +80.2% + +5.62% + +469.5195 + +462.1041 +
+KINGS + +22.448% + +72.3% + +6.43% + +476.6196 + +467.9365 +
+MONTGOMERY + +17.759% + +43.3% + +7.77% + +501.2010 + +491.5319 +
+CHAUTAUQUA + +17.525% + +45.6% + +8.56% + +497.9509 + +487.2825 +
+TOMPKINS + +17.502% + +33.3% + +7.40% + +490.7822 + +481.5070 +
+Note: +
+ Both NA and -99 are considered to be missing values. +
+
# (A side-by-side layout of the two tables was considered but not used;
#  the tables are rendered sequentially instead.)

# The five counties with the lowest poverty rates.
bot5 <- ctysch0[order(county_per_poverty)][
  1:5,
  c("county_name", "county_per_poverty", "per_free_lunch",
    "per_reduced_lunch", "mean_math_score", "mean_ela_score")]

bot5 %>%
  mutate(across(c(county_per_poverty, per_free_lunch, per_reduced_lunch),
                scales::percent)) %>%
  kable(caption = "Top 5 Counties with the lowest poverty percentage",
        booktabs = TRUE, longtable = TRUE) %>%
  kable_styling(bootstrap_options = c("striped", "condensed", "responsive"),
                full_width = TRUE, position = "left") %>%
  footnote(general = "Both NA and -99 are considered to be missing values.")
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+Top 5 Counties with the lowest poverty percentage +
+county_name + +county_per_poverty + +per_free_lunch + +per_reduced_lunch + +mean_math_score + +mean_ela_score +
+NASSAU + +5.56% + +18.2% + +4.29% + +509.5901 + +498.1457 +
+PUTNAM + +5.76% + +11.3% + +3.29% + +516.3372 + +504.9122 +
+SUFFOLK + +6.19% + +22.3% + +5.12% + +501.2089 + +491.2172 +
+SARATOGA + +6.40% + +17.1% + +4.56% + +503.4612 + +493.3317 +
+DUTCHESS + +8.36% + +26.3% + +6.86% + +501.6445 + +493.9521 +
+Note: +
+ Both NA and -99 are considered to be missing values. +
+



+
+
+

Task 6: Data visualization

+

Using ggplot2, visualize the following:

+
    +
  1. The relationship between access to free/reduced price lunch and test performance, at the school level.
  2. +
  3. Average test performance across counties with high, low, and medium poverty.
  4. +
+
# Task 6, plot 1: school-level relationship between lunch-subsidy access and
# standardized test performance. Each school is first averaged across years.
bysch0 <- school2 %>%
  group_by(school_name) %>%
  summarize(across(
    c(per_free_lunch, per_reduced_lunch,
      mean_math_score, mean_ela_score, zmath, zeng),
    ~ mean(.x, na.rm = TRUE)
  ))
# drop schools with any missing aggregate value
bysch1 <- na.omit(bysch0)

# Helper: one scatter plot with a linear fit, shared theme and labeling.
lunch_plot <- function(data, xvar, yvar, title, xlab, ylab) {
  ggplot(data, aes(x = {{ xvar }}, y = {{ yvar }})) +
    geom_point() +
    geom_smooth(method = lm) +
    theme_classic() +
    labs(title = title, x = xlab, y = ylab)
}

sp1 <- lunch_plot(bysch1, per_free_lunch, zmath,
                  'Free lunch percentage and math score',
                  'Free lunch percentage', 'Standardized math score')
sp2 <- lunch_plot(bysch1, per_free_lunch, zeng,
                  'Free lunch percentage and reading score',
                  'Free lunch percentage', 'Standardized reading score')
sp3 <- lunch_plot(bysch1, per_reduced_lunch, zmath,
                  'Reduced lunch fee percentage and math score',
                  'Reduced lunch fee percentage', 'Standardized math score')
# FIX: the original title said "math score" here although the y variable is
# the standardized reading score (zeng) -- corrected to match the plot.
sp4 <- lunch_plot(bysch1, per_reduced_lunch, zeng,
                  'Reduced lunch fee percentage and reading score',
                  'Reduced lunch fee percentage', 'Standardized reading score')

grid.arrange(sp1, sp2, sp3, sp4, ncol = 2, nrow = 2)
+

The above plots seemed to suggest that percentage of students with free lunch is negatively associated with standardized scores, while percentage of students with reduced lunch price is positively associated with standardized scores.

+
# County-level view: math vs reading z-scores, colored/shaped by poverty level.
score_aes <- aes(x = zmath, y = zeng, color = povlvl, shape = povlvl)

cp1 <- ggplot(ctysch0, score_aes) +
  geom_point(size = 5) +
  theme_classic() +
  # geom_text(label = ctysch0$county_name) +
  labs(title = 'County poverty level and student standardized scores',
       x = 'Standardized math score', y = 'Standardized reading score')

cp2 <- ggplot(ctysch0, score_aes) +
  geom_smooth(method = lm) +
  theme_classic() +
  labs(title = 'County poverty level associations with student standardized scores',
       x = 'Standardized math score', y = 'Standardized reading score')

grid.arrange(cp1, cp2, ncol = 2, nrow = 1)
+

The above plots indicate that low poverty levels are associated with higher standardized math and reading scores, while high poverty levels are associated with the reverse.

+
+
+

Task 7: Answering questions

+

Using the skills you have learned in the past three days, tackle the following question:

+
+

What can the data tell us about the relationship between poverty and test performance in New York public schools? Has this relationship changed over time? Is this relationship at all moderated by access to free/reduced price lunch?

+
+

You may use summary tables, statistical models, and/or data visualization in pursuing an answer to this question. Feel free to build on the tables and plots you generated above in Tasks 5 and 6.

+



+
# School-by-year averages of lunch shares and test scores.
syear1 <- school2 %>%
  group_by(school_name, county_name, year) %>%
  summarize(across(
    c(per_free_lunch, per_reduced_lunch,
      mean_math_score, mean_ela_score, zmath, zeng),
    ~ mean(.x, na.rm = TRUE)
  ))

# County-by-year poverty, tagged with each county's overall poverty level.
cyear1 <- county1 %>%
  group_by(county_name, year) %>%
  summarize(county_per_poverty = mean(county_per_poverty, na.rm = TRUE))
cyear2 <- merge(cyear1, county2[, c("county_name", "povlvl")],
                by = "county_name",
                all = FALSE, all.x = FALSE, all.y = FALSE)

# Inner join county-year and school-year data; keep complete rows only and
# add a factor version of year for discrete color scales.
csyr1 <- merge(cyear2, syear1, by = c("county_name", "year"),
               all = FALSE, all.x = FALSE, all.y = FALSE)
csyr2 <- na.omit(as.data.table(csyr1))
csyr2$years <- factor(csyr2$year)
+
# Same county-level score view as before, now colored by year to check
# whether the poverty/score pattern shifts over time.
yearly_aes <- aes(x = zmath, y = zeng, color = years, shape = povlvl)

cs1 <- ggplot(csyr2, yearly_aes) +
  geom_point(size = 5) +
  theme_classic() +
  labs(title = 'County poverty level and student standardized scores by year',
       x = 'Standardized math score', y = 'Standardized reading score')

cs2 <- ggplot(csyr2, yearly_aes) +
  geom_smooth(method = lm) +
  theme_classic() +
  labs(title = 'County poverty level associations with student standardized scores by year',
       x = 'Standardized math score', y = 'Standardized reading score')

grid.arrange(cs1, cs2, ncol = 2, nrow = 1)
+

The plots above revealed that poverty and standardized score associations did not have drastic changes from year to year. Thus, it would be appropriate to aggregate and analyze the above data without accounting for errors with time series structures.

+
# Combined lunch-subsidy share (free + reduced) per school-year record.
school2$lunch <- school2$per_free_lunch + school2$per_reduced_lunch

# Per-school averages across years, keeping the county for the later join.
bysch2 <- school2 %>%
  group_by(school_name, county_name) %>%
  summarize(across(
    c(per_free_lunch, per_reduced_lunch, lunch,
      mean_math_score, mean_ela_score, zmath, zeng),
    ~ mean(.x, na.rm = TRUE)
  ))
# drop schools with any missing aggregate value
bysch3 <- as.data.table(na.omit(bysch2))

# County-level poverty rate and poverty tercile.
county3 <- as.data.table(county2[, c("county_name", "county_per_poverty", "povlvl")])

# Inner join school averages with county poverty; keep complete rows only.
ctysch1 <- merge(bysch3, county3, by = "county_name",
                 all = FALSE, all.x = FALSE, all.y = FALSE)
ctysch1 <- na.omit(ctysch1)
+
# Bivariate Pearson correlation tests of each covariate against the
# standardized scores (all were significant; see inline notes).

# County poverty: moderate negative correlation with both scores.
print(cor.test(ctysch1$county_per_poverty, ctysch1$zmath))  # r ~ -0.277, p < 2.2e-16
print(cor.test(ctysch1$county_per_poverty, ctysch1$zeng))   # r ~ -0.348, p < 2.2e-16

# Free-lunch share: strong negative correlation with both scores.
print(cor.test(ctysch1$per_free_lunch, ctysch1$zmath))      # r ~ -0.588, p < 2.2e-16
print(cor.test(ctysch1$per_free_lunch, ctysch1$zeng))       # r ~ -0.686, p < 2.2e-16

# Reduced-lunch share: weak positive correlation with both scores.
print(cor.test(ctysch1$per_reduced_lunch, ctysch1$zmath))   # r ~  0.078, p ~ 7.3e-8
print(cor.test(ctysch1$per_reduced_lunch, ctysch1$zeng))    # r ~  0.029, p ~ 0.041
+

Pearson correlation tests showed that there exist significant negative associations between county poverty rates and student standardized math and reading scores. In other words, the higher a county's poverty rate, the lower its students score in math and reading. It is also interesting to note that the percentage of students with free lunch has a significant negative association with both standardized scores. Finally, the percentage of students with reduced lunch price is positively associated with both standardized scores, although the association between reading scores and reduced lunch price is weak.

+
# Multivariate OLS: standardized math score regressed on county poverty and
# both lunch-subsidy shares jointly.
lr1 <- lm(zmath ~ county_per_poverty + per_free_lunch + per_reduced_lunch,
          data = ctysch1)
summary(lr1)
+
## 
+## Call:
+## lm(formula = zmath ~ county_per_poverty + per_free_lunch + per_reduced_lunch, 
+##     data = ctysch1)
+## 
+## Residuals:
+##     Min      1Q  Median      3Q     Max 
+## -7.4409 -0.4874 -0.0431  0.4518  3.4801 
+## 
+## Coefficients:
+##                    Estimate Std. Error t value            Pr(>|t|)    
+## (Intercept)         0.61474    0.03670  16.751 <0.0000000000000002 ***
+## county_per_poverty  2.46881    0.23071  10.701 <0.0000000000000002 ***
+## per_free_lunch     -2.45703    0.05245 -46.846 <0.0000000000000002 ***
+## per_reduced_lunch   2.48427    0.27100   9.167 <0.0000000000000002 ***
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## Residual standard error: 0.8139 on 4795 degrees of freedom
+## Multiple R-squared:  0.3705, Adjusted R-squared:  0.3702 
+## F-statistic: 940.9 on 3 and 4795 DF,  p-value: < 0.00000000000000022
+

The multivariate linear regression model showed that in the presence of the effects of the percentage of students with free or reduced lunch price, the association between county poverty and standardized math score is reversed. Associations between free lunch and reduced lunch price with math scores were not changed in the presence of county poverty.

+
# Same multivariate OLS as lr1, with the standardized reading score as the
# response.
lr2 <- lm(zeng ~ county_per_poverty + per_free_lunch + per_reduced_lunch,
          data = ctysch1)
summary(lr2)
+
## 
+## Call:
+## lm(formula = zeng ~ county_per_poverty + per_free_lunch + per_reduced_lunch, 
+##     data = ctysch1)
+## 
+## Residuals:
+##     Min      1Q  Median      3Q     Max 
+## -6.0435 -0.4379 -0.0532  0.4003  3.0517 
+## 
+## Coefficients:
+##                    Estimate Std. Error t value             Pr(>|t|)    
+## (Intercept)         0.82679    0.03254  25.408 < 0.0000000000000002 ***
+## county_per_poverty  2.14118    0.20457  10.467 < 0.0000000000000002 ***
+## per_free_lunch     -2.70462    0.04651 -58.155 < 0.0000000000000002 ***
+## per_reduced_lunch   1.39071    0.24030   5.787         0.0000000076 ***
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## Residual standard error: 0.7217 on 4795 degrees of freedom
+## Multiple R-squared:  0.4851, Adjusted R-squared:  0.4848 
+## F-statistic:  1506 on 3 and 4795 DF,  p-value: < 0.00000000000000022
+

Standardized reading scores also have the similar associations with county poverty, free lunch and reduced lunch price.

+
# Interaction model: does free-lunch access moderate the poverty effect on
# math scores? In an R formula, a * b expands to a + b + a:b, so the main
# effects need not be repeated (the original listed them redundantly).
lr3 <- lm(zmath ~ county_per_poverty * per_free_lunch, data = ctysch1)
summary(lr3)
+
## 
+## Call:
+## lm(formula = zmath ~ county_per_poverty + per_free_lunch + county_per_poverty * 
+##     per_free_lunch, data = ctysch1)
+## 
+## Residuals:
+##     Min      1Q  Median      3Q     Max 
+## -7.6201 -0.4794 -0.0633  0.4349  3.6363 
+## 
+## Coefficients:
+##                                   Estimate Std. Error t value
+## (Intercept)                        0.81724    0.06103  13.391
+## county_per_poverty                 2.20678    0.48005   4.597
+## per_free_lunch                    -2.46940    0.11179 -22.090
+## county_per_poverty:per_free_lunch  0.27956    0.69432   0.403
+##                                               Pr(>|t|)    
+## (Intercept)                       < 0.0000000000000002 ***
+## county_per_poverty                           0.0000044 ***
+## per_free_lunch                    < 0.0000000000000002 ***
+## county_per_poverty:per_free_lunch                0.687    
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## Residual standard error: 0.821 on 4795 degrees of freedom
+## Multiple R-squared:  0.3595, Adjusted R-squared:  0.3591 
+## F-statistic: 897.3 on 3 and 4795 DF,  p-value: < 0.00000000000000022
+
# Interaction of county poverty with the reduced-lunch share, math response.
# a * b already expands to a + b + a:b, so the redundant main-effect terms
# from the original formula are dropped.
lr4 <- lm(zmath ~ county_per_poverty * per_reduced_lunch, data = ctysch1)
summary(lr4)
+
## 
+## Call:
+## lm(formula = zmath ~ county_per_poverty + per_reduced_lunch + 
+##     county_per_poverty * per_reduced_lunch, data = ctysch1)
+## 
+## Residuals:
+##     Min      1Q  Median      3Q     Max 
+## -7.0665 -0.5450  0.0243  0.5461  3.7439 
+## 
+## Coefficients:
+##                                       Estimate Std. Error t value
+## (Intercept)                            1.29928    0.06457   20.12
+## county_per_poverty                    -9.46732    0.37358  -25.34
+## per_reduced_lunch                    -11.64104    0.86532  -13.45
+## county_per_poverty:per_reduced_lunch  85.88016    5.15697   16.65
+##                                                 Pr(>|t|)    
+## (Intercept)                          <0.0000000000000002 ***
+## county_per_poverty                   <0.0000000000000002 ***
+## per_reduced_lunch                    <0.0000000000000002 ***
+## county_per_poverty:per_reduced_lunch <0.0000000000000002 ***
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## Residual standard error: 0.9554 on 4795 degrees of freedom
+## Multiple R-squared:  0.1326, Adjusted R-squared:  0.1321 
+## F-statistic: 244.4 on 3 and 4795 DF,  p-value: < 0.00000000000000022
+
# Interaction of county poverty with the free-lunch share, reading response.
# a * b already expands to a + b + a:b, so the redundant main-effect terms
# from the original formula are dropped.
lr5 <- lm(zeng ~ county_per_poverty * per_free_lunch, data = ctysch1)
summary(lr5)
+
## 
+## Call:
+## lm(formula = zeng ~ county_per_poverty + per_free_lunch + county_per_poverty * 
+##     per_free_lunch, data = ctysch1)
+## 
+## Residuals:
+##     Min      1Q  Median      3Q     Max 
+## -6.1453 -0.4295 -0.0557  0.3930  3.0158 
+## 
+## Coefficients:
+##                                   Estimate Std. Error t value
+## (Intercept)                        0.94513    0.05383  17.557
+## county_per_poverty                 1.95460    0.42345   4.616
+## per_free_lunch                    -2.72091    0.09861 -27.594
+## county_per_poverty:per_free_lunch  0.22247    0.61245   0.363
+##                                               Pr(>|t|)    
+## (Intercept)                       < 0.0000000000000002 ***
+## county_per_poverty                          0.00000402 ***
+## per_free_lunch                    < 0.0000000000000002 ***
+## county_per_poverty:per_free_lunch                0.716    
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## Residual standard error: 0.7242 on 4795 degrees of freedom
+## Multiple R-squared:  0.4815, Adjusted R-squared:  0.4812 
+## F-statistic:  1484 on 3 and 4795 DF,  p-value: < 0.00000000000000022
+
# Interaction of county poverty with the reduced-lunch share, reading
# response. a * b already expands to a + b + a:b, so the redundant
# main-effect terms from the original formula are dropped.
lr6 <- lm(zeng ~ county_per_poverty * per_reduced_lunch, data = ctysch1)
summary(lr6)
+
## 
+## Call:
+## lm(formula = zeng ~ county_per_poverty + per_reduced_lunch + 
+##     county_per_poverty * per_reduced_lunch, data = ctysch1)
+## 
+## Residuals:
+##     Min      1Q  Median      3Q     Max 
+## -7.1183 -0.5050  0.0037  0.5098  3.8358 
+## 
+## Coefficients:
+##                                       Estimate Std. Error t value
+## (Intercept)                            1.51589    0.06169   24.57
+## county_per_poverty                   -10.58523    0.35691  -29.66
+## per_reduced_lunch                    -13.08200    0.82673  -15.82
+## county_per_poverty:per_reduced_lunch  87.64059    4.92695   17.79
+##                                                 Pr(>|t|)    
+## (Intercept)                          <0.0000000000000002 ***
+## county_per_poverty                   <0.0000000000000002 ***
+## per_reduced_lunch                    <0.0000000000000002 ***
+## county_per_poverty:per_reduced_lunch <0.0000000000000002 ***
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## Residual standard error: 0.9128 on 4795 degrees of freedom
+## Multiple R-squared:  0.1763, Adjusted R-squared:  0.1758 
+## F-statistic: 342.1 on 3 and 4795 DF,  p-value: < 0.00000000000000022
+

Regression models showed that the interaction between the percent of students with free lunch and the county poverty percentage is not significant. However, the interaction between the percent of students with reduced lunch price and the county poverty percentage is significant for both types of standardized scores. The coefficients for county poverty in the reduced-lunch interaction models were negative, which signifies that the association between poverty and standardized scores is negative when there is no lunch price reduction. However, since the interaction coefficient is positive, the negative association between poverty and scores becomes less negative as the percentage of students with reduced lunch price increases. In other words, the more students a school provides with reduced-price lunch, the better its students score, even when the school is located in a county with a higher percentage of poverty.

+

In order to illustrate the above models, percentage of students with reduced lunch price was recoded to a categorical variable with 4 levels based on quartiles. The resulting plots clearly showed that when percentage of students with reduced lunch price increases, the negative association between poverty and scores lessen. This lessening effect gradually reverses to a positive association when the percentage of students with reduced lunch price rises over 10%.

+
# Quartile cut points for the reduced-lunch share; the sample maximum is
# replaced with 1 so the top bin covers any possible share up to 100%.
q4 <- quantile(ctysch1$per_reduced_lunch)
q4[5] <- 1
print(q4)

##         0%        25%        50%        75%       100% 
## 0.00000000 0.03845238 0.07000000 0.10000000 1.00000000

# Bin schools into reduced-lunch quartiles (left-closed bins).
ctysch1$rlunchlvl <- cut(ctysch1$per_reduced_lunch, breaks = q4,
                         labels = c('q1', 'q2', 'q3', 'q4'),
                         include.lowest = TRUE, right = FALSE)

# Interaction models by quartile. a * b already expands to a + b + a:b, so
# the redundant main-effect terms from the original formulas are dropped.
in0 <- lm(zmath ~ county_per_poverty * rlunchlvl, data = ctysch1)
in1 <- lm(zeng ~ county_per_poverty * rlunchlvl, data = ctysch1)

# Predicted scores over poverty, one line per reduced-lunch quartile.
ipl0 <- plot_model(in0, type = "pred", terms = c("county_per_poverty", "rlunchlvl"))
ipl1 <- plot_model(in1, type = "pred", terms = c("county_per_poverty", "rlunchlvl"))

grid.arrange(ipl0, ipl1, ncol = 2, nrow = 1, clip = FALSE)
+

+



+
# One-way ANOVA: do mean standardized math scores differ across the three
# county poverty levels? (F(2, 4796) = 140, p < 2e-16 in the original run.)
ano1 <- aov(zmath ~ povlvl, data = ctysch1)
summary(ano1)

# Tukey HSD post hoc comparisons for math: every pairwise difference was
# significant, with scores falling as poverty rises
# (medium-low -0.31, high-low -0.59, high-medium -0.29; all adj. p ~ 0).
TukeyHSD(ano1)

# Same ANOVA for the standardized reading score.
# (F(2, 4796) = 234.9, p < 2e-16 in the original run.)
ano2 <- aov(zeng ~ povlvl, data = ctysch1)
summary(ano2)

# Tukey HSD for reading: again all pairwise differences significant
# (medium-low -0.40, high-low -0.74, high-medium -0.34; all adj. p ~ 0).
TukeyHSD(ano2)
+

ANOVA tests and post hoc Tukey HSD tests between county poverty levels and standardized scores showed significant associations. These associations also align with previous test results.

+
# Multinomial logistic regression: county poverty level as a function of the
# standardized scores and the lunch-subsidy shares.
# Set "low" poverty as the baseline category.
ctysch1$poverty <- relevel(ctysch1$povlvl, ref = "low")

# Fit the multinomial model (nnet::multinom prints its optimizer trace).
mlog1 <- multinom(poverty ~ zmath + zeng + per_free_lunch + per_reduced_lunch,
                  data = ctysch1)

# Coefficients and standard errors for the medium and high levels vs "low".
print(summary(mlog1))

# Odds ratios (exponentiated coefficients).
print(exp(coef(mlog1)))

# Wald z statistics and two-sided p-values -- multinom() does not report
# p-values itself, so they are computed from the coefficient/SE ratio.
z <- summary(mlog1)$coefficients/summary(mlog1)$standard.errors
p <- (1 - pnorm(abs(z), 0, 1)) * 2
print(p)

# https://towardsdatascience.com/a-deep-dive-on-vector-autoregression-in-r-58767ebb3f06
+

The multinomial logistic regression model showed similar results to the multivariate linear regression models. Of note is the lack of a significant association between standardized reading scores and the different poverty levels: there were no significant differences in mean standardized reading scores between schools located in counties classified into different poverty levels. There was also no significant difference between the high and low poverty levels for reduced lunch price. This corroborates the results obtained above: once the effect of lunch subsidies is accounted for, scores have a weak positive association with poverty levels. A plausible explanation for the positive association between lunch subsidies and poverty levels is that poorer counties were more likely to subsidize student lunches.

+



+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/submissions/header.html b/submissions/header.html new file mode 100644 index 0000000..46fef1f --- /dev/null +++ b/submissions/header.html @@ -0,0 +1,5 @@ + + + diff --git a/submissions/msia.png b/submissions/msia.png new file mode 100644 index 0000000..b3cd1cc Binary files /dev/null and b/submissions/msia.png differ