diff --git a/submissions/FinalRExercise_SeokhyunKim.Rmd b/submissions/FinalRExercise_SeokhyunKim.Rmd
new file mode 100644
index 0000000..5738ebf
--- /dev/null
+++ b/submissions/FinalRExercise_SeokhyunKim.Rmd
@@ -0,0 +1,125 @@
+---
+title: "FinalRExercise_SeokhyunKim"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Task 1: Import your data
+
+```{r}
+
+# Build the file paths with here::here() so they are resolved from the
+# project root rather than from the current working directory.
+library(here)
+here() # print the detected project root as a sanity check
+nys_schools <- read.csv(here::here("data", "nys_schools.csv"))
+nys_acs <- read.csv(here::here("data", "nys_acs.csv"))
+
+```
+
+## Task 2: Explore your data
+
+```{r}
+
+head(nys_schools) # first rows of the data frame
+summary(nys_schools) # per-column summary statistics
+# Something looks wrong in nys_schools: variables such as total_enroll or
+# per_free_lunch should not be negative, yet summary(nys_schools) shows a
+# minimum of -99 for them. Presumably -99 is a missing-value code (to confirm).
+
+str(nys_schools) # column types and number of observations
+sum(is.na(nys_schools)) # number of NA cells across all columns
+
+# Same checks for nys_acs
+head(nys_acs)
+summary(nys_acs)
+str(nys_acs)
+sum(is.na(nys_acs))
+
+```
+
+## Task 3: Recoding and variable manipulation
+
+```{r}
+
+library(tidyverse)
+library(data.table)
+# Count the -99 placeholders in every column that is filtered below, to check
+# whether dropping those rows discards a significant share of the data.
+# sum-style counting with na.rm = TRUE is used instead of
+# nrow(df[df$col == -99, ]) because logical row indexing turns NA comparisons
+# into extra all-NA rows, which would inflate the count.
+sentinel_cols <- c(
+  "total_enroll", "per_free_lunch", "per_reduced_lunch",
+  "per_lep", "mean_ela_score", "mean_math_score"
+)
+colSums(nys_schools[sentinel_cols] == -99, na.rm = TRUE)
+
+# Rows containing -99 are removed: they would distort later results, and they
+# do not appear to make up a significant portion of nys_schools.
+nys_schools <- filter(
+  nys_schools,
+  total_enroll != -99, per_free_lunch != -99, per_reduced_lunch != -99,
+  per_lep != -99, mean_ela_score != -99, mean_math_score != -99
+)
+nrow(nys_schools) # rows remaining after dropping the -99 placeholders
+
+# check that all the missing values were removed
+summary(nys_schools)
+
+# To split the counties into groups, take the mean of county_per_poverty for
+# each county and use the 25th and 75th percentiles of those means as the
+# group boundaries: >= 75th is "high", >= 25th is "medium", otherwise "low".
+pov_grouping <- nys_acs %>%
+  group_by(county_name) %>%
+  summarize(mean_poverty = mean(county_per_poverty, na.rm = TRUE))
+# Compute both cut-offs once instead of once per ifelse() branch.
+pov_cutoffs <- quantile(
+  pov_grouping$mean_poverty,
+  probs = c(0.25, 0.75),
+  na.rm = TRUE
+)
+pov_grouping$pov_group <- ifelse(
+  pov_grouping$mean_poverty >= pov_cutoffs[[2]],
+  "high",
+  ifelse(pov_grouping$mean_poverty >= pov_cutoffs[[1]], "medium", "low")
+)
+nys_acs <- merge(nys_acs, pov_grouping, by = "county_name")
+
+# Create z-scores of the test scores, standardized within each year
+nys_schools <- as.data.table(nys_schools)
+nys_acs <- as.data.table(nys_acs)
+
+# scale() returns a one-column matrix; as.vector() stores a plain numeric
+# column with the same values.
+nys_schools[, z_ela := as.vector(scale(mean_ela_score)), by = year]
+nys_schools[, z_math := as.vector(scale(mean_math_score)), by = year]
+
+```
+
+#### Task 4: Merge datasets
+
+```{r}
+
+# To bring nys_schools to the same level as nys_acs, summarize the schools
+# data by taking the mean of each column by county and year.
+summ_schools <- nys_schools[
+  ,
+  .(
+    total_enroll = mean(total_enroll),
+    per_free_lunch = mean(per_free_lunch),
+    per_reduced_lunch = mean(per_reduced_lunch),
+    per_lep = mean(per_lep),
+    mean_ela_score = mean(mean_ela_score),
+    mean_math_score = mean(mean_math_score)
+  ),
+  by = .(county_name, year)
+]
+
+# merge into one table (one row per county and year present in both tables)
+merged <- merge(summ_schools, nys_acs, by = c("county_name", "year"))
+
+```
+
+#### Task 5: Create summary tables
+
+```{r}
+
+# The merged table already holds the values needed for the first summary table
+merged[
+  ,
+  .(county_name, total_enroll, per_free_lunch, per_reduced_lunch,
+    county_per_poverty)
+]
+
+# Five county-year rows with the lowest poverty rate, then the five with the
+# highest. county_name and year are included so each row can be identified,
+# and head() is used so that fewer than five rows never produces NA padding.
+head(
+  merged[
+    order(county_per_poverty),
+    .(county_name, year, county_per_poverty, per_free_lunch,
+      per_reduced_lunch, mean_ela_score, mean_math_score)
+  ],
+  5
+)
+head(
+  merged[
+    order(-county_per_poverty),
+    .(county_name, year, county_per_poverty, per_free_lunch,
+      per_reduced_lunch, mean_ela_score, mean_math_score)
+  ],
+  5
+)
+
+```
+
+#### Task 6: Data visualization
+
+```{r}
+
+# Per school: average of the free and reduced-price lunch rates, and average
+# of the ELA and math scores
+rel_schools <- nys_schools[
+  ,
+  .(
+    price_effect = (mean(per_free_lunch) + mean(per_reduced_lunch)) / 2,
+    test_per = (mean(mean_ela_score) + mean(mean_math_score)) / 2
+  ),
+  by = .(school_name)
+]
+
+rel_schools %>%
+  ggplot() +
+  geom_point(aes(x = price_effect, y = test_per))
+
+# Build the county-level table for the second plot
+pov_grouping
+nys_schools
+test_county <- nys_schools[
+  ,
+  .(test = (mean(mean_ela_score) + mean(mean_math_score)) / 2),
+  by = county_name
+]
+test_county <- merge(test_county, pov_grouping, by = "county_name")
+
+test_county %>%
+  ggplot() +
+  geom_point(aes(x = county_name, y = test, col = pov_group))
+
+```
+
+#### Task 7: Answering questions
+
+Test performance appears to be lower in the "high" poverty group, and the Task 6 scatter plot shows test scores decreasing as the share of free/reduced-price lunch increases.
\ No newline at end of file
diff --git a/submissions/day4_seokhyunkim.Rmd b/submissions/day4_seokhyunkim.Rmd
new file mode 100644
index 0000000..76e042b
--- /dev/null
+++ b/submissions/day4_seokhyunkim.Rmd
@@ -0,0 +1,145 @@
+---
+title: "Exercises Day 2"
+author: "Richard Paquin Morel, adapted from exercises by Christina Maimone"
+date: "`r Sys.Date()`"
+output: html_document
+params:
+  answers: FALSE
+---
+
+
+```{r, echo=FALSE, eval=TRUE}
+# Copy the document parameter so the chunk options below can refer to it.
+answers <- params$answers
+```
+
+```{r global_options, echo = FALSE, include = FALSE}
+# Answer chunks are only echoed and evaluated when `answers` is TRUE.
+knitr::opts_chunk$set(echo = answers, eval = answers,
+                      warning = FALSE, message = FALSE,
+                      cache = FALSE, tidy = FALSE)
+```
+
+## Load the data
+
+Load the `gapminder` dataset.
+
+```{asis}
+### Answer
+```
+
+```{r}
+gapminder <- read.csv(
+  here::here("data", "gapminder5.csv"),
+  stringsAsFactors = FALSE
+)
+```
+
+
+## If Statement
+
+Use an if() statement to print a suitable message reporting whether there are any records from 2002 in the gapminder dataset. Now do the same for 2012.
+
+Hint: use the `any` function.
+
+```{asis}
+### Answer
+```
+
+```{r}
+# Run the same check for both years named in the exercise.
+for (year in c(2002, 2012)) {
+  if (any(gapminder$year == year)) {
+    print(paste("Record(s) for the year", year, "found."))
+  } else {
+    print(paste("No records for year", year))
+  }
+}
+```
+
+
+## Loop and If Statements
+
+Write a script that finds the mean life expectancy by country for countries whose population is below the mean for the dataset.
+
+Write a script that loops through the `gapminder` data by continent and prints out whether the mean life expectancy is smaller than 50, between 50 and 70, or greater than 70.
+
+```{asis}
+### Answer
+```
+
+```{r}
+# Mean population over all rows of the dataset; countries whose own mean
+# population is below this value are reported.
+overall_mean <- mean(gapminder$pop)
+
+for (i in unique(gapminder$country)) {
+  country_mean <- mean(gapminder$pop[gapminder$country == i])
+
+  if (country_mean < overall_mean) {
+    mean_le <- mean(gapminder$lifeExp[gapminder$country == i])
+    print(paste("Mean Life Expectancy in", i, "is", mean_le))
+  }
+} # end for loop
+```
+
+```{r}
+lower_threshold <- 50
+upper_threshold <- 70
+
+for (i in unique(gapminder$continent)) {
+  continent_mean <- mean(gapminder$lifeExp[gapminder$continent == i])
+
+  # The middle branch only needs the upper bound: the first branch already
+  # handled values below lower_threshold. Using <= keeps a mean that is
+  # exactly 50 or 70 from being reported as "greater than 70".
+  if (continent_mean < lower_threshold) {
+    print(paste("Average Life Expectancy in", i, "is less than", lower_threshold))
+  } else if (continent_mean <= upper_threshold) {
+    print(paste("Average Life Expectancy in", i, "is between", lower_threshold, "and", upper_threshold))
+  } else {
+    print(paste("Average Life Expectancy in", i, "is greater than", upper_threshold))
+  }
+
+}
+```
+
+
+## Exercise: Write Functions
+
+Create a function that given a data frame will print the name of each column and the class of data it contains. Use the gapminder dataset. Hint: Use `mode()` or `class()` to get the class of the data in each column. Remember that `names()` or `colnames()` returns the name of the columns in a dataset.
+
+```{asis}
+### Answer
+
+Note: Some of these were taken or modified from https://www.r-bloggers.com/functions-exercises/
+```
+
+```{r}
+#' Print the name and storage mode of every column of a data frame
+#'
+#' @param df A data frame (or other named list of columns).
+#' @return Called for its printed output: one "name: mode" line per column.
+data_frame_info <- function(df) {
+  cols <- names(df)
+  for (i in cols) {
+    # df[[i]] always extracts the column itself; df[, i] would return a
+    # one-column table for tibbles and is not evaluated as a column name
+    # for data.tables.
+    print(paste0(i, ": ", mode(df[[i]])))
+  }
+}
+data_frame_info(gapminder)
+```
+
+Create a function that, given a **vector**, will print its mean and standard deviation and will optionally also print the median. Hint: include an argument that takes a boolean (`TRUE`/`FALSE`) operator and then include an `if` statement.
+
+```{asis}
+### Answer
+
+```
+
+```{r}
+#' Print the mean and standard deviation of a vector
+#'
+#' @param x A numeric vector.
+#' @param include_median If `TRUE`, also print the median.
+#' @param na.rm If `TRUE`, missing values are dropped before each statistic
+#'   is computed. Defaults to `FALSE`, which matches the behaviour of
+#'   `mean()`, `sd()` and `median()` when called without `na.rm`.
+#' @return Called for its printed output.
+vector_info <- function(x, include_median = FALSE, na.rm = FALSE) {
+  print(paste("Mean:", mean(x, na.rm = na.rm)))
+  print(paste("Standard Deviation:", sd(x, na.rm = na.rm)))
+  if (include_median) {
+    print(paste("Median:", median(x, na.rm = na.rm)))
+  }
+}
+
+le <- gapminder$lifeExp
+vector_info(le, include_median = FALSE)
+vector_info(le, include_median = TRUE)
+```
+
+## Analyzing the relationship
+
+Use what you've learned so far to answer the following questions using the `gapminder` dataset. Be sure to include some visualizations!
+
+1. What is the relationship between GDP per capita and life expectancy? Does this relationship change over time? (Hint: Use the natural log of both variables.)
+
+2. Does the relationship between GDP per capita and life expectancy vary by continent? Make sure you divide the Americas into North and South America.
\ No newline at end of file