diff --git a/submissions/Final R Exercise Markdown.Rmd b/submissions/Final R Exercise Markdown.Rmd
new file mode 100644
index 0000000..d0b3897
--- /dev/null
+++ b/submissions/Final R Exercise Markdown.Rmd
@@ -0,0 +1,166 @@
+---
+title: "Final R Exercise"
+author: "ZZ"
+date: "2024-09-25"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## R final exercise
+
+```{r}
+library(dplyr)
+library(ggplot2)
+
+# NOTE(review): nys_acs and nys_schools are used below but never created in
+# this document; presumably they come from the workspace. Load them here so
+# the file knits on its own.
+```
+
+```{r}
+# Structure of both inputs.
+str(nys_acs)
+str(nys_schools)
+```
+
+```{r}
+# Per-column summaries of both inputs.
+summary(nys_acs)
+summary(nys_schools)
+```
+
+## Recoding and variable manipulation
+### Deal with missing values, which are currently coded as -99.
+#### dplyr
+```{r}
+# Split the schools data on the -99 sentinel. `%in%` is used rather than `==`
+# because it never yields NA, so a row holding an ordinary NA is classified as
+# "no sentinel" instead of being dropped by filter() from both subsets.
+schools_neg99 <- nys_schools %>%
+  filter(if_any(everything(), ~ .x %in% -99))
+nys_schools_clean <- nys_schools %>%
+  filter(!if_any(everything(), ~ .x %in% -99))
+
+# Preview only: printing the whole table would flood the report.
+head(nys_schools_clean)
+```
+
+```{r}
+num_rows_with_neg99 <- nrow(schools_neg99)
+num_rows_with_neg99 # not too much, drop all
+
+# Up to 10 affected rows; slice_sample() truncates instead of failing when
+# fewer than 10 rows match, unlike sample_n().
+slice_sample(schools_neg99, n = 10)
+```
+
+```{r}
+total_rows <- nrow(nys_schools)
+rows_with_neg99 <- num_rows_with_neg99
+rows_without_neg99 <- nrow(nys_schools_clean)
+
+# Verify counts directly linked to the condition: the two groups must
+# partition the original rows.
+print(rows_with_neg99)
+print(rows_without_neg99)
+stopifnot(rows_with_neg99 + rows_without_neg99 == total_rows)
+```
+
+```{r}
+# Count duplicates: rows that distinct() would remove.
+duplicate_count <- total_rows - nrow(distinct(nys_schools))
+print(paste("Duplicate rows:", duplicate_count))
+```
+
+#### Base R
+
+```{r}
+# Same cleaning without dplyr. rowSums(..., na.rm = TRUE) counts sentinel
+# cells per row while ignoring NAs; apply(..., 1, any) returns NA for a row
+# with an NA and no -99, and an NA row index yields an all-NA row.
+has_neg99 <- rowSums(nys_schools == -99, na.rm = TRUE) > 0
+nys_schools_clean1 <- nys_schools[!has_neg99, ]
+
+nrow(nys_schools_clean1)
+head(nys_schools_clean1)
+```
+## Create a categorical variable that 
groups counties into "high", "medium", and "low" poverty groups.
+
+```{r}
+summary(nys_acs)
+```
+#### Base R
+```{r}
+# Tercile cut points of income; the lowest third is the "High" poverty group.
+breaks <- quantile(nys_acs$median_household_income,
+                   probs = c(0, 1 / 3, 2 / 3, 1), na.rm = TRUE)
+nys_acs$poverty_level <- cut(nys_acs$median_household_income,
+                             breaks = breaks,
+                             labels = c("High", "Medium", "Low"),
+                             include.lowest = TRUE)
+```
+
+#### dplyr
+```{r}
+# NOTE(review): hard-coded cut-offs; confirm they are the intended quantiles.
+income_cuts <- c(47680, 53345)
+nys_acs <- nys_acs %>%
+  mutate(poverty_level1 = case_when(
+    median_household_income < income_cuts[1] ~ "High",
+    median_household_income < income_cuts[2] ~ "Medium",
+    median_household_income >= income_cuts[2] ~ "Low"
+  ))
+table(nys_acs$poverty_level1)
+```
+
+## The tests that the NYS Department of Education administers change from time to time, so scale scores are not directly comparable year-to-year. Create a new variable that is the standardized z-score for math and English Language Arts (ELA) for each year (hint: group by year and use the scale() function).
+
+#### dplyr
+```{r}
+# scale() centres/scales by default; as.numeric() drops its matrix wrapper.
+nys_schools_clean1 <- nys_schools_clean1 %>%
+  group_by(year) %>%
+  mutate(
+    ELA_normalized = as.numeric(scale(mean_ela_score)),
+    Math_normalized = as.numeric(scale(mean_math_score))
+  ) %>%
+  ungroup()
+```
+
+## Merge datasets
+#### Create a dataset that merges variables from the schools dataset and the ACS dataset.
+
+```{r}
+# Attach school rows to each county-year of the ACS data (keeps all ACS rows).
+data <- left_join(nys_acs, nys_schools_clean1, by = c("county_name", "year"))
+```
+
+## Create Summary Tables
+
+```{r}
+# Columns kept for the summary table; run inside a chunk so knitr executes it.
+summary_cols <- c("county_name", "school_name", "year", "poverty_level",
+                  "per_free_lunch", "ELA_normalized", "Math_normalized")
+data_summary <- data[summary_cols]
+head(data_summary)
+```
+
+```{r}
+library(reshape2)
+# Mean z-scores per poverty group, melted to long form for a dodged bar chart.
+score_means <- data %>%
+  group_by(poverty_level) %>%
+  summarise(
+    ela_avg = mean(ELA_normalized, na.rm = TRUE),
+    math_avg = mean(Math_normalized, na.rm = TRUE)
+  )
+data_bar_chart <- melt(score_means, id.vars = "poverty_level",
+                       variable.name = "test", value.name = "scores")
+ggplot(data_bar_chart) +
+  geom_col(aes(x = poverty_level, y = scores, group = test, fill = test),
+           position = "dodge") +
+  labs(title = "Test Scores By Poverty Level", x = "Poverty Level",
+       y = "Above or Below Average")
+```
diff --git a/submissions/r last day something upload.R b/submissions/r last day something upload.R
new file mode 100644
index 0000000..8720b81
--- /dev/null
+++ b/submissions/r last day something upload.R
@@ -0,0 +1,32 @@
+library(dplyr)
+
+# Some EDA
+
+names(nys_acs)
+names(nys_schools)
+
+str(nys_acs)
+str(nys_schools)
+
+summary(nys_acs)
+summary(nys_schools)
+
+# Recoding and variable manipulation
+
+## Deal with missing values, which are currently coded as -99.
+
+# Keep schools whose ELA and math scores are both recorded. The data frame is
+# supplied by the pipe only; passing it again inside filter() would make dplyr
+# treat it as a filter condition and fail.
+nys_schools <- nys_schools %>%
+  filter(mean_ela_score != -99, mean_math_score != -99)
+
+summary(nys_schools)
+
+# Number of columns that still contain -99. The comparison is done on the data
+# frame itself: apply() would first convert it to a character matrix, and
+# formatted numbers need not compare equal to -99.
+sum(colSums(nys_schools == -99, na.rm = TRUE) > 0)
+
+# Counts of TRUE/FALSE cells for the -99 sentinel in the ACS data.
+table(nys_acs == -99)
\ No newline at end of file