diff --git a/submissions/FinalRExercise_LionGlenysCharity.Rmd b/submissions/FinalRExercise_LionGlenysCharity.Rmd
new file mode 100644
index 0000000..5e83580
--- /dev/null
+++ b/submissions/FinalRExercise_LionGlenysCharity.Rmd
@@ -0,0 +1,156 @@
+---
+title: "Bootcamp Final Exercise"
+output: html_notebook
+---
+
+
+```{r}
+library(tidyverse)
+library(here)
+library(reshape2)
+
+# School-level table and county-level ACS table used throughout the notebook.
+nys_school <- read.csv("data/nys_schools.csv")
+nys_acs <- read.csv("data/nys_acs.csv")
+```
+
+```{r}
+head(nys_school)
+```
+
+```{r}
+head(nys_acs)
+```
+
+```{r}
+str(nys_school)
+```
+
+```{r}
+str(nys_acs)
+```
+
+
+```{r}
+summary(nys_school)
+```
+total_enroll, mean_ela_score, mean_math_score -> should be positive, but the minimum value is -99 (missing-value code)
+per_free_lunch, per_reduced_lunch, per_lep -> should be ratios, but the minimum value is -99 (missing-value code)
+
+```{r}
+summary(nys_acs)
+```
+county_per_poverty, county_per_bach -> ratio
+median_household_income -> integer
+
+```{r}
+colSums(is.na(nys_school))
+```
+
+```{r}
+colSums(is.na(nys_acs))
+```
+
+No NA values, just -99 as the missing values
+
+Variables with missing values -> total_enroll, mean_ela_score, mean_math_score, per_free_lunch, per_reduced_lunch, per_lep
+
+Dealing with missing values -> replace every -99 with NA
+```{r}
+# -99 is the missing-value code; recode it to NA so that summaries using
+# na.rm = TRUE skip it instead of averaging it in.
+nys_school[nys_school == -99] <- NA
+summary(nys_school)
+```
+NOTE: after this step per_free_lunch and per_reduced_lunch still have maxima above 1 (257 and 53 in the summary), which are not valid ratios; they are left unchanged here because the analysis below does not use them.
+
+```{r}
+head(nys_acs)
+```
+```{r}
+boxplot(county_per_poverty ~ year, data = nys_acs,
+        main = "County Poverty Rate Distribution by Year",
+        xlab = "Year",
+        ylab = "County poverty rate",
+        las = 2) # las = 2 draws axis labels perpendicular to the axis
+```
+Based on the distribution -> within each year, most county poverty rates lie in a similar range.
+Set Low -> less than 0.07, Medium -> [0.07, 0.17], High -> greater than 0.17
+
+```{r}
+# Bucket each county-year by poverty rate using the cut-offs stated above.
+nys_acs_with_county_level <- nys_acs %>%
+  mutate(county_level = case_when(
+    county_per_poverty < 0.07 ~ "Low",
+    county_per_poverty >= 0.07 & county_per_poverty <= 0.17 ~ "Medium",
+    county_per_poverty > 0.17 ~ "High"
+    )
+  )
+```
+
+Task 3.3
+```{r}
+# Per-year mean and standard deviation of the school-level ELA and math scores.
+scores_std <- nys_school %>%
+  select(year, contains("score")) %>%
+  group_by(year) %>%
+  summarize(all_mean_ela = mean(mean_ela_score, na.rm = TRUE),
+            all_mean_math = mean(mean_math_score, na.rm = TRUE),
+            all_sd_ela = sd(mean_ela_score, na.rm = TRUE),
+            all_sd_math = sd(mean_math_score, na.rm = TRUE))
+```
+
+```{r}
+# Standardize each school's scores against that year's mean and sd (z-score).
+with_all_value <- inner_join(nys_school, scores_std, by = "year")
+std_school <- mutate(with_all_value,
+                     z_score_ela = (mean_ela_score - all_mean_ela) / all_sd_ela,
+                     z_score_math = (mean_math_score - all_mean_math) / all_sd_math)
+```
+
+```{r}
+head(nys_acs_with_county_level)
+```
+
+```{r}
+# inner_join keeps only the county-years that appear in both tables.
+join_data <- inner_join(std_school, nys_acs_with_county_level,
+                        by = c("county_name", "year"))
+head(join_data)
+```
+
+```{r}
+# Average z-scores per year and poverty level; .groups = "drop" returns an
+# ungrouped result and avoids the summarise() regrouping message.
+summary_data <- join_data %>%
+  group_by(year, county_level) %>%
+  summarize(mean_ela = mean(z_score_ela, na.rm = TRUE),
+            mean_math = mean(z_score_math, na.rm = TRUE),
+            .groups = "drop")
+
+ggplot(summary_data, aes(x = year, y = mean_ela, color = county_level, group = county_level)) +
+  geom_line() +
+  geom_point() +
+  labs(
+    title = "Trend of Average ELA Scores by County Poverty Level",
+    x = "Year",
+    y = "Average ELA z-score", # mean_ela is a mean of z-scores, not raw scores
+    color = "County Poverty Level"
+  ) +
+  scale_color_brewer(palette = "Dark2") +
+  theme_minimal()
+```
+
+```{r}
+ggplot(summary_data, aes(x = year, y = mean_math, color = county_level, group = county_level)) +
+  geom_line() +
+  geom_point() +
+  labs(
+    title = "Trend of Average Math Scores by County Poverty Level",
+    x = "Year",
+    y = "Average Math z-score", # mean_math is a mean of z-scores, not raw scores
+    color = "County Poverty Level"
+  ) +
+  scale_color_brewer(palette = "Dark2") +
+  theme_minimal()
+```
+
diff --git 
a/submissions/FinalRExercise_LionGlenysCharity.nb.html b/submissions/FinalRExercise_LionGlenysCharity.nb.html new file mode 100644 index 0000000..ae0006f --- /dev/null +++ b/submissions/FinalRExercise_LionGlenysCharity.nb.html @@ -0,0 +1,2142 @@ + + + + +
+ + + + + + + + +library(tidyverse)
+library(here)
+library(reshape2)
+
+nys_school <- read.csv("data/nys_schools.csv")
+nys_acs <- read.csv("data/nys_acs.csv")
+
+
+
+
+
+
+head(nys_school)
+
+
+str(nys_school)
+
+
+
+'data.frame': 35663 obs. of 12 variables:
+ $ school_cd : num 1.01e+10 1.01e+10 1.01e+10 1.01e+10 1.01e+10 ...
+ $ school_name : chr "MONTESSORI MAGNET SCHOOL" "MONTESSORI MAGNET SCHOOL" "MONTESSORI MAGNET SCHOOL" "MONTESSORI MAGNET SCHOOL" ...
+ $ district_name : chr "ALBANY CITY SCHOOL DISTRICT" "ALBANY CITY SCHOOL DISTRICT" "ALBANY CITY SCHOOL DISTRICT" "ALBANY CITY SCHOOL DISTRICT" ...
+ $ county_name : chr "ALBANY" "ALBANY" "ALBANY" "ALBANY" ...
+ $ region : chr "CAPITAL DISTRICT" "CAPITAL DISTRICT" "CAPITAL DISTRICT" "CAPITAL DISTRICT" ...
+ $ year : int 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 ...
+ $ total_enroll : num 316 312 300 326 294 321 335 347 366 359 ...
+ $ per_free_lunch : num 0.33 0.32 0.26 0.22 0.31 0.25 0.35 0.22 0.29 0.26 ...
+ $ per_reduced_lunch: num 0.1 0.08 0.08 0.07 0.06 0.08 0.04 0 0.02 0.01 ...
+ $ per_lep : num 0.01 0.02 0.03 0.03 0.03 0.03 0.06 0.06 0.05 0.07 ...
+ $ mean_ela_score : num 658 673 670 667 670 ...
+ $ mean_math_score : num 670 679 683 681 687 ...
+
+
+
+
+
+
+str(nys_acs)
+
+
+
+'data.frame': 496 obs. of 5 variables:
+ $ county_name : chr "ALBANY" "ALBANY" "ALBANY" "ALBANY" ...
+ $ year : int 2009 2010 2011 2012 2013 2014 2015 2016 2009 2010 ...
+ $ county_per_poverty : num 0.118 0.119 0.121 0.124 0.123 ...
+ $ median_household_income: int 55350 56090 57715 59359 59394 59940 59887 60904 40917 41305 ...
+ $ county_per_bach : num 0.19 0.197 0.199 0.198 0.205 ...
+
+
+
+
+
+
+summary(nys_school)
+
+
+
+ school_cd school_name district_name county_name region year
+ Min. :1.010e+10 Length:35663 Length:35663 Length:35663 Length:35663 Min. :2008
+ 1st Qu.:2.802e+11 Class :character Class :character Class :character Class :character 1st Qu.:2010
+ Median :3.317e+11 Mode :character Mode :character Mode :character Mode :character Median :2013
+ Mean :3.568e+11 Mean :2013
+ 3rd Qu.:4.725e+11 3rd Qu.:2015
+ Max. :6.808e+11 Max. :2017
+ total_enroll per_free_lunch per_reduced_lunch per_lep mean_ela_score mean_math_score
+ Min. : -99.0 Min. :-99.0000 Min. :-99.00000 Min. :-99.00000 Min. :-99.0 Min. :-99.0
+ 1st Qu.: 339.0 1st Qu.: 0.1900 1st Qu.: 0.03000 1st Qu.: 0.00000 1st Qu.:296.0 1st Qu.:298.0
+ Median : 469.0 Median : 0.4200 Median : 0.06000 Median : 0.03000 Median :324.2 Median :330.8
+ Mean : 523.6 Mean : 0.4188 Mean : 0.02852 Mean : 0.04124 Mean :447.1 Mean :456.0
+ 3rd Qu.: 648.0 3rd Qu.: 0.7200 3rd Qu.: 0.10000 3rd Qu.: 0.11000 3rd Qu.:666.3 3rd Qu.:683.5
+ Max. :2347.0 Max. :257.0000 Max. : 53.00000 Max. : 1.00000 Max. :720.8 Max. :738.7
+
+
+
+total_enroll, mean_ela_score, mean_math_score -> positive value, +but the minimum value -> -99 (Missing value) per_free_lunch, +per_reduced_lunch, per_lep -> should be ratio, but the minimum value +-> -99 (Missing values)
+ + + +summary(nys_acs)
+
+
+
+ county_name year county_per_poverty median_household_income county_per_bach
+ Length:496 Min. :2009 Min. :0.04689 Min. : 33794 Min. :0.07574
+ Class :character 1st Qu.:2011 1st Qu.:0.10903 1st Qu.: 46347 1st Qu.:0.11018
+ Mode :character Median :2012 Median :0.12884 Median : 50134 Median :0.13169
+ Mean :2012 Mean :0.13085 Mean : 54116 Mean :0.14410
+ 3rd Qu.:2014 3rd Qu.:0.14929 3rd Qu.: 56448 3rd Qu.:0.17431
+ Max. :2016 Max. :0.29935 Max. :102044 Max. :0.31795
+
+
+
+county_per_poverty, county_per_bach -> ratio +median_household_income -> integer
+ + + +colSums(is.na(nys_school))
+
+
+
+ school_cd school_name district_name county_name region year
+ 0 0 0 0 0 0
+ total_enroll per_free_lunch per_reduced_lunch per_lep mean_ela_score mean_math_score
+ 0 0 0 0 0 0
+
+
+
+
+
+
+colSums(is.na(nys_acs))
+
+
+
+ county_name year county_per_poverty median_household_income county_per_bach
+ 0 0 0 0 0
+
+
+
+No NA values, just -99 as the missing values
+Variables with missing values -> total_enroll, mean_ela_score, +mean_math_score, per_free_lunch, per_reduced_lunch, per_lep
+Dealing with missing values -> set it to NA for the -99
+ + + +summary(nys_school)
+
+
+
+ school_cd school_name district_name county_name region year
+ Min. :1.010e+10 Length:35663 Length:35663 Length:35663 Length:35663 Min. :2008
+ 1st Qu.:2.802e+11 Class :character Class :character Class :character Class :character 1st Qu.:2010
+ Median :3.317e+11 Mode :character Mode :character Mode :character Mode :character Median :2013
+ Mean :3.568e+11 Mean :2013
+ 3rd Qu.:4.725e+11 3rd Qu.:2015
+ Max. :6.808e+11 Max. :2017
+
+ total_enroll per_free_lunch per_reduced_lunch per_lep mean_ela_score mean_math_score
+ Min. : 3.0 Min. : 0.0000 Min. : 0.00000 Min. :0.00000 Min. :191.0 Min. :213.0
+ 1st Qu.: 339.0 1st Qu.: 0.1900 1st Qu.: 0.03000 1st Qu.:0.00000 1st Qu.:300.0 1st Qu.:303.0
+ Median : 469.0 Median : 0.4200 Median : 0.06000 Median :0.03000 Median :347.3 Median :361.0
+ Mean : 523.8 Mean : 0.4606 Mean : 0.07019 Mean :0.07736 Mean :483.2 Mean :492.7
+ 3rd Qu.: 648.0 3rd Qu.: 0.7225 3rd Qu.: 0.10000 3rd Qu.:0.11000 3rd Qu.:667.3 3rd Qu.:684.7
+ Max. :2347.0 Max. :257.0000 Max. :53.00000 Max. :1.00000 Max. :720.8 Max. :738.7
+ NA's :13 NA's :15 NA's :15 NA's :13 NA's :2208 NA's :2210
+
+
+
+
+
+
+boxplot(county_per_poverty ~ year, data = nys_acs,
+        main = "County Poverty Rate Distribution by Year",
+        xlab = "Year",
+        ylab = "County poverty rate", # the plotted variable is county_per_poverty, not income
+        las = 2) # las = 2 draws axis labels perpendicular to the axis
+
+
+Based on the distribution -> it looks like within a year, most of +the distribution lies on a similar value. Set Low -> Less than 0.07, +Medium [0.07, 0.17], High (> 0.17)
+ + + +nys_acs_with_county_level <- nys_acs %>%
+ mutate(county_level = case_when(
+ county_per_poverty < 0.07 ~ "Low",
+ county_per_poverty >= 0.07 & county_per_poverty <= 0.17 ~ "Medium",
+ county_per_poverty > 0.17 ~ "High"
+ )
+ )
+
+
+
+Task 3.3
+ + + +scores_std <- nys_school %>%
+ select(year, contains("score")) %>%
+ group_by(year) %>%
+ summarize(all_mean_ela = mean(mean_ela_score, na.rm=TRUE),
+ all_mean_math = mean(mean_math_score, na.rm=TRUE),
+ all_sd_ela = sd(mean_ela_score, na.rm=TRUE),
+ all_sd_math = sd(mean_math_score, na.rm=TRUE))
+
+
+
+
+
+
+with_all_value <- inner_join(nys_school, scores_std, by = 'year')
+std_school <- mutate(with_all_value,
+ z_score_ela = (mean_ela_score-all_mean_ela)/all_sd_ela,
+ z_score_math = (mean_math_score-all_mean_math)/all_sd_math)
+
+
+
+
+
+
+head(nys_acs_with_county_level)
+
+
+join_data <- inner_join(std_school, nys_acs_with_county_level, by = c('county_name', 'year'))
+head(join_data)
+
+
+summary_data <- join_data %>%
+ group_by(year, county_level) %>%
+ summarize(mean_ela = mean(z_score_ela, na.rm = TRUE),
+ mean_math = mean(z_score_math, na.rm = TRUE))
+
+
+`summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
+
+
+ggplot(summary_data, aes(x = year, y = mean_ela, color = county_level, group = county_level)) +
+  geom_line() +
+  geom_point() +
+  labs(
+    title = "Trend of Average ELA Scores by County Poverty Level",
+    x = "Year",
+    y = "Average ELA z-score", # mean_ela is a mean of z-scores, not raw scores
+    color = "County Poverty Level"
+  ) +
+  scale_color_brewer(palette = "Dark2") +
+  theme_minimal()
+
+
+
+ggplot(summary_data, aes(x = year, y = mean_math, color = county_level, group = county_level)) +
+  geom_line() +
+  geom_point() +
+  labs(
+    title = "Trend of Average Math Scores by County Poverty Level",
+    x = "Year",
+    y = "Average Math z-score", # mean_math is a mean of z-scores, not raw scores
+    color = "County Poverty Level"
+  ) +
+  scale_color_brewer(palette = "Dark2") +
+  theme_minimal()
+
+
+