diff --git a/exercises/day1part1_R-intro_exercises.R b/exercises/day1part1_R-intro_exercises.R index d1c697c..665bb8d 100644 --- a/exercises/day1part1_R-intro_exercises.R +++ b/exercises/day1part1_R-intro_exercises.R @@ -6,46 +6,46 @@ #### Arithmetic #### # Pick a number; save it as x - +x <- 6 # Multiply x by 3 - +x*3 # Take the log of the above (Hint, you need the function log() here) - +log(x) # Subtract 4 from the above - +log(x)-4 # Square the above - +(log(x)-4)^2 #### Comparisons and Logical Operators #### # Check if 1 is bigger than 2 - +1>2 # Check if 1 + 1 is equal to 2 - +(1+1) == 2 # Check if it is true that the strings "eat" and "drink" are not equal to each other - +"eat" != "drink" # Check if it is true that 1 is equal to 1 *AND* 1 is equal to 2 # (Hint: remember what the operators & and | do) - +(1==1) $ (1==2) # Check if it is true that 1 is equal to 1 *OR* 1 is equal to 2 - +(1==1) | (1==2) #### Packages and Functions #### # Load the package tidyverse - +install.packages("tidyverse") #load it every time you use: library(tidyverse) # Open the help file for the function recode # (Hint: remember what ? 
does) - +?recode #### REVIEW: DATA STRUCTURES #### @@ -57,19 +57,19 @@ x1 <- rnorm(5) x2 <- rnorm(20, mean=0.5) # Select the 3rd element in x1 - +x1[3] # Select the elements of x1 that are less than 0 - +x1[x1 < 0] # Select the elements of x2 that are greater than 1 - +x2[x2 > 1] # Create x3 containing the first five elements of x2 - +x3 <- x2[1:5] # Select all but the third element of x1 - +x1[-3] #### Missing values #### @@ -77,20 +77,23 @@ x2 <- rnorm(20, mean=0.5) vec <- c(1, 8, NA, 7, 3) # Calculate the mean of vec, excluding the NA value - +mean(vec) +mean(vec, na.rm=TRUE) # Count the number of missing values in vec - +is.na(vec) +sum(is.na(vec)) +mean(is.na(vec)) #### Factors #### # See lecture notes and DataCamp for guidance and practice - +#Create factors with factor(), which includes an argument for levels = #### Lists #### # See lecture notes and DataCamp for guidance and practice - +#index an element in a list using double brackets: [[1]] #### Matricies #### @@ -98,93 +101,102 @@ vec <- c(1, 8, NA, 7, 3) mat <- matrix(c(1:51, rep(NA,4)), ncol=5) # Select row 4, column 5 - +mat(4,5) # Select column 3 - +mat(,3) # Bonus: How many NA values are there in this matrix? #### Data frames #### +#data.frame(), or by combining vectors with cbind() or rbind() +#df$var or df["var"] # Load one of R's example data frames, mtcars data(mtcars) # Identify the number of observations (rows) and number of variables (columns) - +dim(mtcars) +ncol(mtcars) +nrow(mtcars) # Identify the names of the variables - +names(mtcars) # mtcars$var # Select the variable 'mpg' - +mtcars["mpg"] # Select the 4th row - +mtcars[4,] # Square the value of the 'cyl' variable and store this as a new variable 'cylsq' - +mtcars$cylsq <- (mtcars$cyl)^2 #### READING FILES #### # Check your working directory. It should be the root folder where you downloaded the boot camp materials. If that's not the case, set your working directory accordingly. 
- +getwd() # Read gapminder data with read.csv() gapminder <- read.csv("data/gapminder5.csv", stringsAsFactors=FALSE) # Load the readr package - +library(readr) # Read gapminder data with read_csv() - +gapminder <- read.csv("data/gapminder5.csv") # read_csv("data/gapminder5.csv") #### DATA MANIPULATION #### #### Exploring data frames #### # Run summary() on the gapminder data - +summary(gapminder) # Find the mean of the variable pop - +mean(gapminder$pop) # Create a frequency table of the variable 'year' # Hint: use table() - +table(gapminder$year) # Create a proportion table of the variable 'continent' # Hint: use prop.table() - +prop.table(table(gapminder$continent)) #### Subsetting and Sorting #### # Create a new data frame called gapminder07 contaning only those rows in the gapminder data where year is 2007 - +gapminder07 <- gapminder[gapminder$year==2007,] +ggapminder07 <- subset(gapminder, subset = year==2007) # Created a sorted frequency table of the variable continent in gapminder07 - +sort(table(gapminder07$continent)) # Print out the population of Mexico in 2007 - +gapminder07$pop[gapminder07$country=="Mexico"] # BONUS: Print out the rows represnting the 5 countries with the highest population in 2007 # Hint: Use order(), which we learned about, and head(), which prints out the first 5 rows of a data frame - +head(gapminder07[order(gapminder07$pop, decreasing=TRUE),]) #### Adding and removing columns #### # See lecture notes for more guidance. We will practice this skill in the next section. 
+gapminder$newvar <- newvar +#or gapminder <- cbind(gapminder, newvar) +gapminder$newvar <- NULL +#or gapminder <- gapminder[-"newvar"] #### Recoding variables #### # Round the values of the variable `lifeExp` using `round()` and store this as a new variable `lifeExp_round` - +gapminder07$lifeExp_round <- round(gapminder07$lifeExp) # Print out the new variable to see what it looks like - +head(gapminder07$lifeExp_round) # This code creates the new variable 'lifeExp_over70'. Try to understand what it does. gapminder07$lifeExp_over70 <- NA # Initialize a variable containing all "NA" values @@ -195,36 +207,39 @@ table(gapminder07$lifeExp_over70) # Try to create a new variable 'lifeExp_highlow' that has the value # "High" when life expectancy is over the mean and the value "Low" # when it is below the mean. When you are done, print a frequency table. - - - +gapminder07$lifeExp_highlow <- NA # Initialize a variable containing all "NA" values +gapminder07$lifeExp_highlow[gapminder07$lifeExp>mean(gapminder07$lifeExp)] <- "High" +gapminder07$lifeExp_highlow[gapminder07$lifeExp lower_bound & mean_le < higher_bound){ + print(paste0(i, "'s mean life expectancy is between 50 and 70 which is ", mean_le )) + }else{ + print(paste0(i, "'s mean life expectancy is greater an 70 which is ", mean_le )) + } +} +``` + +##Exercise: Write Functions +Create a function that given a data frame will print the name of each column and the class of data it contains. Use the gapminder dataset. + +Hint: Use mode() or class() to get the class of the data in each column. Remember that names() or colnames() returns the name of the columns in a dataset. 
+ +```{asis} +### Answer +``` + +```{r 5} +report_name_class <- + function(df) { + for(i in names(df)){ + print(paste("Column names:", i, "; Class:", class(df[,i]))) + } + } + +report_name_class(gapminder) + +``` + +##Exercise: Write Functions +Create a function that given a vector will print the mean and the standard deviation of a vector, it will optionally also print the median. + +Hint: include an argument that takes a boolean (TRUE/FALSE) operator and then include an if statement. + +```{asis} +### Answer +``` + +```{r 6} +vector_mean_sd_med <- + function(v,include_median) { + print(paste("Mean:", mean(v), "; STD:", sd(v))) + if (include_median == TRUE){ + print(paste("Median:", median(v))) + } + } + +x <- gapminder$lifeExp +vector_mean_sd_med(x, TRUE) + +``` + +## Analyzing the relationship + +What is the relationship between GDP per capita and life expectancy? Does this relationship change over time? (Hint: Use the natural log of both variables.) + +```{asis} +### Answer +``` + +```{r 7} +years <- unique(gapminder$year) + +for(i in years){ + plot(log(gapminder$lifeExp[gapminder$year==i]) ~ log(gapminder$gdpPercap[gapminder$year==i]), + main=paste0("Relationship between life expectancy and GDP per capita in ",i), + ylab="Life expectancy", xlab="GDP per capita") + abline(h = mean(gapminder$lifeExp[gapminder$year == i])) +} + +``` +##Does the relationship between GDP per capita and life expectacy vary by continent? Make sure you divide the Americas into North and South America. 
+ +```{asis} +### Answer +``` + +```{r 7} +``` + diff --git a/submissions/Day2_R_Exercise_ZhuZekai.Rmd b/submissions/Day2_R_Exercise_ZhuZekai.Rmd new file mode 100644 index 0000000..efad4f8 --- /dev/null +++ b/submissions/Day2_R_Exercise_ZhuZekai.Rmd @@ -0,0 +1,223 @@ +--- +title: "Day 2 Exercise *Zach*" +date: "`r Sys.Date()`" +output: html_document +params: + answers: TRUE +--- + +```{r, echo=FALSE, eval=TRUE} +answers<-params$answers +``` + +```{r global_options, echo = FALSE, include = FALSE} +knitr::opts_chunk$set(echo=answers, eval=answers, + warning = FALSE, message = FALSE, + cache = FALSE, tidy = FALSE) +``` + +## Load the data + +Read both California energy datasets. Make sure the datetime variable is in an appropriate data type (i.e. not character). + +```{asis} +### Answer +``` + +```{asis} +**`dplyr`** +``` + +```{r 1} +library(dplyr) +library(lubridate) + +generation_ca_dp <- read.csv(here::here("data/ca_energy_generation.csv"), + stringsAsFactors = F) +imports_ca_dp <- read.csv(here::here("data/ca_energy_imports.csv"), + stringsAsFactors = F) + +generation_ca_dp$datetime <- as_datetime(generation_ca_dp$datetime) +imports_ca_dp$datetime <- as_datetime(imports_ca_dp$datetime) +``` + +```{asis} +**`data.table`** +``` + +```{r 2} +library(data.table) +library(lubridate) +generation_ca_dt <- fread(here::here("data/ca_energy_generation.csv")) +imports_ca_dt <- fread(here::here("data/ca_energy_imports.csv")) + +generation_ca_dt[,datetime := as_datetime(datetime)] +imports_ca_dt[,datetime := as_datetime(datetime)] +``` + +## Merge and reshape the data +Merge the two datasets and then melt the resulting dataframe/datatable to make it tidy. 
+ + +```{asis} +### Answer +``` + +```{asis} +**`dplyr`** +``` + +```{r 3} +library(reshape2) +merged_ca_energy_dp <- merge(generation_ca_dp, imports_ca_dp, by = "datetime") +long_ca_energy_dp <- melt(merged_ca_energy_dp, id.vars = "datetime", + variable.name = "source", + value.name = "output") + +``` + +```{asis} +**`data.table`** +``` + +```{r 4} +library(reshape2) +merged_ca_energy_dt <- generation_ca_dt[imports_ca_dt, on = "datetime"] +long_ca_energy_dt <- melt(merged_ca_energy_dt, id.vars = "datetime", + variable.name = "source", + value.name = "output") +``` + +## Creating new variables +Create a series of new variables: + +1. day, which is the year-month-day, without the hour. The lubridate function as_date will do this. +2. log_output, which is the natural log of the output. +3. Challenge: per_output, which is the percent of daily output represented by each observation. You will need to use group_by and to create a new variable with the total output for the day. (Make sure to use ungroup() after this!) +Bonus: If you are using dplyr, try to do this all in one pipe! + +```{asis} +### Answer +``` + +```{asis} +**`dplyr`** +``` + +```{r 5} +long_ca_energy_dp <- long_ca_energy_dp%>% + mutate(day = as_date(datetime),log_output = log(output)) %>% + group_by(day) %>% + mutate(total_daily_output = sum(output, na.rm = T)) %>% + ungroup(day) %>% + mutate(per_output = output/total_daily_output) + +# Check results +long_ca_energy_dp %>% select(day, log_output, per_output) %>% head() +``` + +```{asis} +**`data.table`** +``` + +```{r 6} +long_ca_energy_dt[,day := as_date(datetime)] +long_ca_energy_dt[,log_output := log(output)] +long_ca_energy_dt[,per_output := output/sum(output, na.rm = TRUE), by = day] + +# Check +long_ca_energy_dt +``` + +## Summarizing and analyzing data + +1. Which source has the greatest mean output by hour? (Hint: Use the dplyr verb arrange(desc(variable)) to order the data frame so that the largest value of variable is first. 
Don???t use desc and it arranges in ascending order. The data.table function is setorder.) Which has the least? + +```{asis} +### Answer +``` + +```{asis} +**`dplyr`** +``` + +```{r 7} +long_ca_energy_dp %>% + group_by(source) %>% + summarize(mean_hourly = mean(output, na.rm = T)) %>% + arrange(desc(mean_hourly)) +``` + +```{asis} +**`data.table`** +``` + +```{r 8} +#long_ca_energy_dt[, mean_hourly:= mean(output), by = source] +#setorder(long_ca_energy_dt, -mean_hourly) +#long_ca_energy_dt + +mean_hourly <- long_ca_energy_dt[,.(mean_hourly = mean(output)), by = source] +setorder(mean_hourly, -mean_hourly) +mean_hourly +``` + +## Summarizing and analyzing data + +Which source has the greatest mean output by day? Which has the least? (Do not include zero values.) + +```{asis} +### Answer +``` + +```{asis} +**`dplyr`** +``` + +```{r 9} +long_ca_energy_dp %>% + filter(output>0) %>% + group_by(day, source) %>% + summarize(mean_daily = mean(output, na.rm = T)) %>% + arrange(desc(mean_daily)) +``` + +```{asis} +**`data.table`** +``` + +```{r 10} +mean_by_day <- long_ca_energy_dt[output > 0,.(mean_daily = mean(output, na.rm = T)), by = .(source, day)] +setorder(mean_by_day, -mean_daily) +mean_by_day +``` + +## Summarizing and analyzing data + +Which sources has the greatest variance in usage over the course of a dataset? Which has the least? (Do not include zero values.) 
+ +```{asis} +### Answer +``` + +```{asis} +**`dplyr`** +``` + +```{r 11} +long_ca_energy_dp %>% + filter(output>0) %>% + group_by(source) %>% + summarize(var_usage = var(output, na.rm = T)) %>% + arrange(desc(var_usage)) +``` + +```{asis} +**`data.table`** +``` + +```{r 12} +var_in_usage <- long_ca_energy_dt[output > 0,.(var_usage = var(output, na.rm = T)), by = source] +setorder(var_in_usage, -var_usage) +var_in_usage +``` diff --git a/submissions/Day2_R_Exercise_ZhuZekai.html b/submissions/Day2_R_Exercise_ZhuZekai.html new file mode 100644 index 0000000..19df0cf --- /dev/null +++ b/submissions/Day2_R_Exercise_ZhuZekai.html @@ -0,0 +1,544 @@ + + + + + + + + + + + + + + + +Day 2 Exercise Zach + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Load the data

+

Read both California energy datasets. Make sure the datetime variable is in an appropriate data type (i.e. not character).

+
+

Answer

+

dplyr

+
library(dplyr)
+library(lubridate)
+
+generation_ca_dp <- read.csv(here::here("data/ca_energy_generation.csv"), 
+                         stringsAsFactors = F)
+imports_ca_dp <- read.csv(here::here("data/ca_energy_imports.csv"), 
+                          stringsAsFactors = F)
+
+generation_ca_dp$datetime <- as_datetime(generation_ca_dp$datetime)
+imports_ca_dp$datetime <- as_datetime(imports_ca_dp$datetime)
+

data.table

+
library(data.table)
+library(lubridate)
+generation_ca_dt <- fread(here::here("data/ca_energy_generation.csv"))
+imports_ca_dt <- fread(here::here("data/ca_energy_imports.csv"))
+
+generation_ca_dt[,datetime := as_datetime(datetime)]
+imports_ca_dt[,datetime := as_datetime(datetime)]
+
+
+
+

Merge and reshape the data

+

Merge the two datasets and then melt the resulting dataframe/datatable to make it tidy.

+
+

Answer

+

dplyr

+
library(reshape2)
+merged_ca_energy_dp <- merge(generation_ca_dp, imports_ca_dp, by = "datetime")
+long_ca_energy_dp <- melt(merged_ca_energy_dp, id.vars = "datetime",
+                 variable.name = "source",
+                 value.name = "output")
+

data.table

+
library(reshape2)
+merged_ca_energy_dt <- generation_ca_dt[imports_ca_dt, on = "datetime"]
+long_ca_energy_dt <- melt(merged_ca_energy_dt, id.vars = "datetime",
+                 variable.name = "source",
+                 value.name = "output")
+
+
+
+

Creating new variables

+

Create a series of new variables:

+
    +
  1. day, which is the year-month-day, without the hour. The lubridate function as_date will do this.
  2. +
  3. log_output, which is the natural log of the output.
  4. +
  5. Challenge: per_output, which is the percent of daily output represented by each observation. You will need to use group_by and to create a new variable with the total output for the day. (Make sure to use ungroup() after this!) Bonus: If you are using dplyr, try to do this all in one pipe!
  6. +
+
+

Answer

+

dplyr

+
long_ca_energy_dp <- long_ca_energy_dp%>% 
+    mutate(day = as_date(datetime),log_output = log(output)) %>%
+    group_by(day) %>% 
+    mutate(total_daily_output = sum(output, na.rm = T)) %>%
+    ungroup(day) %>% 
+    mutate(per_output = output/total_daily_output)
+
+# Check results
+long_ca_energy_dp %>% select(day, log_output, per_output) %>% head()
+
## # A tibble: 6 x 3
+##   day        log_output per_output
+##   <date>          <dbl>      <dbl>
+## 1 2019-09-03       5.48   0.000293
+## 2 2019-09-03       5.48   0.000293
+## 3 2019-09-03       5.48   0.000293
+## 4 2019-09-03       5.48   0.000293
+## 5 2019-09-03       5.47   0.000292
+## 6 2019-09-03       5.47   0.000291
+

data.table

+
long_ca_energy_dt[,day := as_date(datetime)]
+long_ca_energy_dt[,log_output := log(output)]
+long_ca_energy_dt[,per_output := output/sum(output, na.rm = TRUE), by = day]
+
+# Check
+long_ca_energy_dt
+
##                  datetime  source     output        day log_output
+##    1: 2019-09-03 00:00:00  biogas   238.9167 2019-09-03   5.476115
+##    2: 2019-09-03 01:00:00  biogas   239.0000 2019-09-03   5.476464
+##    3: 2019-09-03 02:00:00  biogas   239.0000 2019-09-03   5.476464
+##    4: 2019-09-03 03:00:00  biogas   238.9167 2019-09-03   5.476115
+##    5: 2019-09-03 04:00:00  biogas   237.9167 2019-09-03   5.471920
+##   ---                                                             
+## 2012: 2019-09-09 19:00:00 imports 10002.6667 2019-09-09   9.210607
+## 2013: 2019-09-09 20:00:00 imports  9994.0833 2019-09-09   9.209749
+## 2014: 2019-09-09 21:00:00 imports  9690.1667 2019-09-09   9.178867
+## 2015: 2019-09-09 22:00:00 imports  7998.0000 2019-09-09   8.986947
+## 2016: 2019-09-09 23:00:00 imports  7570.7500 2019-09-09   8.932047
+##         per_output
+##    1: 0.0002933610
+##    2: 0.0002934633
+##    3: 0.0002934633
+##    4: 0.0002933610
+##    5: 0.0002921331
+##   ---             
+## 2012: 0.0149567162
+## 2013: 0.0149438818
+## 2014: 0.0144894434
+## 2015: 0.0119591925
+## 2016: 0.0113203372
+
+
+
+

Summarizing and analyzing data

+
    +
  1. Which source has the greatest mean output by hour? (Hint: Use the dplyr verb arrange(desc(variable)) to order the data frame so that the largest value of variable is first. Don't use desc and it arranges in ascending order. The data.table function is setorder.) Which has the least?
  2. +
+
+

Answer

+

dplyr

+
long_ca_energy_dp %>%
+  group_by(source) %>%
+  summarize(mean_hourly = mean(output, na.rm = T)) %>%
+  arrange(desc(mean_hourly))
+
## # A tibble: 12 x 2
+##    source      mean_hourly
+##    <fct>             <dbl>
+##  1 natural_gas    10634.  
+##  2 imports         6683.  
+##  3 solar           3930.  
+##  4 large_hydro     2968.  
+##  5 wind            2620.  
+##  6 nuclear         2254.  
+##  7 geothermal       965.  
+##  8 small_hydro      419.  
+##  9 biomass          374.  
+## 10 biogas           235.  
+## 11 coal               9.09
+## 12 other              0
+

data.table

+
#long_ca_energy_dt[, mean_hourly:= mean(output), by = source]
+#setorder(long_ca_energy_dt, -mean_hourly)
+#long_ca_energy_dt 
+
+mean_hourly <- long_ca_energy_dt[,.(mean_hourly = mean(output)), by = source]
+setorder(mean_hourly, -mean_hourly)
+mean_hourly
+
##          source  mean_hourly
+##  1: natural_gas 10634.254960
+##  2:     imports  6682.663194
+##  3:       solar  3930.446429
+##  4: large_hydro  2968.263889
+##  5:        wind  2619.791171
+##  6:     nuclear  2253.677083
+##  7:  geothermal   964.964782
+##  8: small_hydro   419.438492
+##  9:     biomass   374.343254
+## 10:      biogas   234.685516
+## 11:        coal     9.092758
+## 12:       other     0.000000
+
+
+
+

Summarizing and analyzing data

+

Which source has the greatest mean output by day? Which has the least? (Do not include zero values.)

+
+

Answer

+

dplyr

+
long_ca_energy_dp %>%
+  filter(output>0) %>%
+  group_by(day, source) %>%
+  summarize(mean_daily = mean(output, na.rm = T)) %>%
+  arrange(desc(mean_daily))
+
## # A tibble: 77 x 3
+## # Groups:   day [7]
+##    day        source      mean_daily
+##    <date>     <fct>            <dbl>
+##  1 2019-09-05 natural_gas     15403.
+##  2 2019-09-04 natural_gas     14928.
+##  3 2019-09-03 natural_gas     13051.
+##  4 2019-09-06 natural_gas     12786.
+##  5 2019-09-07 natural_gas      7508.
+##  6 2019-09-08 solar            7278.
+##  7 2019-09-07 solar            7186.
+##  8 2019-09-09 solar            7133.
+##  9 2019-09-03 imports          7076.
+## 10 2019-09-09 imports          7058.
+## # ... with 67 more rows
+

data.table

+
mean_by_day <- long_ca_energy_dt[output > 0,.(mean_daily = mean(output, na.rm = T)), by = .(source, day)]
+setorder(mean_by_day, -mean_daily)
+mean_by_day
+
##          source        day   mean_daily
+##  1: natural_gas 2019-09-05 15403.371528
+##  2: natural_gas 2019-09-04 14927.760417
+##  3: natural_gas 2019-09-03 13050.864583
+##  4: natural_gas 2019-09-06 12786.263889
+##  5: natural_gas 2019-09-07  7508.375000
+##  6:       solar 2019-09-08  7278.422619
+##  7:       solar 2019-09-07  7185.732143
+##  8:       solar 2019-09-09  7132.797619
+##  9:     imports 2019-09-03  7076.062500
+## 10:     imports 2019-09-09  7058.482639
+## 11:       solar 2019-09-06  6841.357143
+## 12:     imports 2019-09-07  6746.111111
+## 13:     imports 2019-09-08  6720.090278
+## 14:     imports 2019-09-04  6683.656250
+## 15:     imports 2019-09-06  6426.944444
+## 16:       solar 2019-09-04  6381.148810
+## 17:       solar 2019-09-03  6298.616667
+## 18: natural_gas 2019-09-09  6245.145833
+## 19:     imports 2019-09-05  6067.295139
+## 20:       solar 2019-09-05  5597.380952
+## 21: natural_gas 2019-09-08  4518.003472
+## 22:        wind 2019-09-07  3907.954861
+## 23:        wind 2019-09-09  3718.628472
+## 24:        wind 2019-09-08  3696.673611
+## 25: large_hydro 2019-09-04  3580.166667
+## 26: large_hydro 2019-09-03  3477.944444
+## 27: large_hydro 2019-09-05  3432.618056
+## 28: large_hydro 2019-09-06  3352.111111
+## 29: large_hydro 2019-09-07  2421.420139
+## 30: large_hydro 2019-09-09  2417.555556
+## 31:     nuclear 2019-09-09  2264.385417
+## 32:     nuclear 2019-09-08  2257.489583
+## 33:     nuclear 2019-09-07  2254.881944
+## 34:     nuclear 2019-09-05  2253.395833
+## 35:     nuclear 2019-09-06  2252.579861
+## 36:     nuclear 2019-09-03  2246.920139
+## 37:     nuclear 2019-09-04  2246.086806
+## 38:        wind 2019-09-06  2225.656250
+## 39: large_hydro 2019-09-08  2096.031250
+## 40:        wind 2019-09-03  2095.381944
+## 41:        wind 2019-09-05  1504.118056
+## 42:        wind 2019-09-04  1190.125000
+## 43:  geothermal 2019-09-09   974.159722
+## 44:  geothermal 2019-09-08   967.381944
+## 45:  geothermal 2019-09-05   964.569444
+## 46:  geothermal 2019-09-06   964.326389
+## 47:  geothermal 2019-09-07   963.840278
+## 48:  geothermal 2019-09-03   960.274306
+## 49:  geothermal 2019-09-04   960.201389
+## 50: small_hydro 2019-09-03   435.368056
+## 51: small_hydro 2019-09-04   430.854167
+## 52: small_hydro 2019-09-06   429.125000
+## 53: small_hydro 2019-09-05   423.746528
+## 54: small_hydro 2019-09-09   413.465278
+## 55:     biomass 2019-09-03   409.645833
+## 56: small_hydro 2019-09-08   406.833333
+## 57: small_hydro 2019-09-07   396.677083
+## 58:     biomass 2019-09-04   385.854167
+## 59:     biomass 2019-09-07   380.704861
+## 60:     biomass 2019-09-09   371.135417
+## 61:     biomass 2019-09-08   362.388889
+## 62:     biomass 2019-09-06   358.510417
+## 63:     biomass 2019-09-05   352.163194
+## 64:      biogas 2019-09-07   238.090278
+## 65:      biogas 2019-09-06   236.118056
+## 66:      biogas 2019-09-08   236.017361
+## 67:      biogas 2019-09-03   235.746528
+## 68:      biogas 2019-09-05   235.270833
+## 69:      biogas 2019-09-04   233.055556
+## 70:      biogas 2019-09-09   228.500000
+## 71:        coal 2019-09-09    13.336806
+## 72:        coal 2019-09-05     9.500000
+## 73:        coal 2019-09-04     9.128472
+## 74:        coal 2019-09-03     8.986111
+## 75:        coal 2019-09-06     8.892361
+## 76:        coal 2019-09-08     7.236111
+## 77:        coal 2019-09-07     6.569444
+##          source        day   mean_daily
+
+
+
+

Summarizing and analyzing data

+

Which sources has the greatest variance in usage over the course of a dataset? Which has the least? (Do not include zero values.)

+
+

Answer

+

dplyr

+
long_ca_energy_dp %>%
+  filter(output>0) %>%
+  group_by(source) %>%
+  summarize(var_usage = var(output, na.rm = T)) %>%
+  arrange(desc(var_usage))
+
## # A tibble: 11 x 2
+##    source        var_usage
+##    <fct>             <dbl>
+##  1 natural_gas 27827066.  
+##  2 solar       15534019.  
+##  3 imports      3100104.  
+##  4 wind         1667027.  
+##  5 large_hydro  1110193.  
+##  6 small_hydro      940.  
+##  7 biomass          529.  
+##  8 geothermal        73.0 
+##  9 nuclear           44.0 
+## 10 biogas            17.8 
+## 11 coal               7.42
+

data.table

+
var_in_usage <- long_ca_energy_dt[output > 0,.(var_usage = var(output, na.rm = T)), by = source]
+setorder(var_in_usage, -var_usage)
+var_in_usage
+
##          source    var_usage
+##  1: natural_gas 2.782707e+07
+##  2:       solar 1.553402e+07
+##  3:     imports 3.100104e+06
+##  4:        wind 1.667027e+06
+##  5: large_hydro 1.110193e+06
+##  6: small_hydro 9.397124e+02
+##  7:     biomass 5.294015e+02
+##  8:  geothermal 7.296303e+01
+##  9:     nuclear 4.399819e+01
+## 10:      biogas 1.783838e+01
+## 11:        coal 7.415288e+00
+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/submissions/Day2_notes.R b/submissions/Day2_notes.R new file mode 100644 index 0000000..194222b --- /dev/null +++ b/submissions/Day2_notes.R @@ -0,0 +1,226 @@ +#### DAY 2 #### + +# Reading in the data +generation <- read.csv(here::here("data/ca_energy_generation.csv"), stringsAsFactors = F) +imports <- read.csv(here::here("data/ca_energy_imports.csv"), stringsAsFactors = F) +str(generation) +class(generation$datetime) +class(imports$datetime) + +# The best way to deal with date-time data is to use the lubridate package +# and the as_datetime function +library(lubridate) +generation$datetime <- as_datetime(generation$datetime) +class(generation$datetime) +head(generation$datetime) + +#### Reshaping data #### +head(generation) + +# Using reshape2 +# melt ???> make data long +# dcast ???> make data wide +# recast???> melt then cast data + +# melt +library(reshape2) +long_gen <- melt(generation, id.vars = "datetime", + variable.name = "source", + value.name = "usage") +head(long_gen) +long_gen[order(long_gen$datetime)[1:20], ] + +# Merging data +merged_energy <- merge(generation, imports, by = "datetime") +dim(merged_energy) +head(merged_energy) + +long_merged_energy <- melt(merged_energy, id.vars = "datetime", + variable.name = "source", + value.name = "usage") +head(long_merged_energy) + +#### dplyr #### + +# select ???> subset variables +# filter ???> subset observations based on conditions +# mutate ???> add new variables +# summarize ???> reduce multiple observations to a single value (e.g., find the mean) + +# select + +# select by name: select(gapminder, continent, pop) +# select by position: select(df, c(1, 3, 10)) +# select by range: select(df, country:pop) or select(df, 1:3) +# drop variables with -: select(df, -gdpPercap) + +library(tidyverse) +library(dplyr) +tmp <- select(merged_energy, biogas, biomass, geothermal, solar) +names(tmp) + +# filter +tmp <- filter(merged_energy, imports > 7000) +nrow(tmp) +head(tmp) + 
+tmp <- filter(merged_energy, imports > 7000, natural_gas < 7000) +nrow(tmp) +head(tmp) + +# mutate-----create new variables +tmp <- mutate(long_merged_energy, log_usage = log(usage)) +head(tmp) + +# summarize +summarize(long_merged_energy, total = sum(usage, na.rm = T)) +summarize(long_merged_energy, mean_cons = mean(usage, na.rm = T)) + +# %>% operator lets you chain together functions +# While piping, the piped dataframe is not changed! +long_merged_energy %>% + filter(source == "geothermal") %>% + select(-datetime) %>% + mutate(log_usage = log(usage)) %>% + summarize(mean_log_usage = mean(log_usage, na.rm = T)) + +merged_energy %>% + select(contains("hydro")) %>% # filter(source == "hydro") %>% can't work her + mutate(total_hydro = rowSums(., na.rm = T)) %>% + summarize(mean_hydro = mean(total_hydro, na.rm = T)) + +# group by +long_merged_energy %>% + group_by(source) %>% + summarize(sum_usage = sum(usage, na.rm = T)) + +# comparing to loop +gapminder <- read.csv(here::here("data/gapminder5.csv")) +gapminder %>% + group_by(year) %>% + summarize(mean_le = mean(lifeExp, na.rm = T), + sd_lf = sd(lifeExp, na.rm = T)) + +# Use your knowledge of dplyr to find the mean usage for small hydro, +# large hydro, biogas, and biomass +long_merged_energy %>% + filter(source %in% c("small_hydro","large_hydro", "biogas", "biomass")) %>% + group_by(source) %>% + summarize(mean_usage = mean(usage, na.rm = T)) + +# or +merged_energy %>% + select(datetime, contains("hydro"), contains("bio")) %>% + melt(id.vars = "datetime", + variable.name = "source", + value.name = "usage") %>% + group_by(source) %>% + summarize(mean_usage = mean(usage, na.rm = T)) + +# join v.s. 
merge +left_join = merge(all.x = T) +right_join = merge(all.y = T) +full_join = merge(all = T) +inner_join = merge(all = F) + +#### Data Table #### +# dt(i, j, by) i:row, j: col, by: group by + +library(data.table) +data_file <- here::here("data", "ca_energy_generation.csv") +generation_df <- read.csv(data_file, stringsAsFactors = F) +generation_dt <- fread(data_file) # fread similar to read.table but faster and more convenient + +class(generation_df) +class(generation_dt) +View(generation_df) +View(generation_dt) +generation_df +generation_dt +str(generation_df) +str(generation_dt) + +# Row filtering +generation_dt[wind > 4400] +generation_dt[wind > 4400 & mday(datetime) == 7] + +# Telect rows for which natural gas generation is less than +# or equal to 5,000 MW and large hydro generation is greater than 2,000 MW +generation_dt[natural_gas <= 5000 & large_hydro > 2000] + +# Select rows for which coal generation is greater than 10 MW and +#solar generation is greater than the median value of solar generation +generation_dt[coal > 10 & solar > median(soalr)] + +# Column operations +generation_dt[,wind + solar] + +generation_dt[,3*wind + solar*biogas/2] + +# New columns +generation_dt[,newcol := 3*wind + solar*biogas/2] + +generation_dt[,.(newcol = 3*wind + solar*biogas/2)] + +generation_dt[,newcol := NULL] # delete + + +# Add a column called ???total_hydro??? 
that is the sum of the +# small_hydro and large_hydro columns +generation_dt[,.(total_hydro = small_hydro+large_hydro)] # not directly modify the table and print the results +generation_dt[,total_hydro := small_hydro + large_hydro] # directly modify the table but not print the results + +# Find the mean of the nuclear and biogas columns +generation_dt[,.(V1 = mean(nuclear), V2 =mean(biogas))] +# Create a new table: for the hours when solar generation is zero, +# get the datetime and total_thermal (sum of natural gas and coal generation) +generation_dt[solar == 0, .(datetime, total_thermal = natural_gas + coal)] + + +# Group by +generation_dt[,.(mean_nuc = mean(nuclear), mean_wind = mean(wind)), by = mday(datetime)] + +# Find the median solar generation by hour. +generation_dt[, median(solar), by = hour(datetime)] + +# For hours when the solar generation is greater than zero, +# find the maximum natural gas generation by day +generation_dt[solar > 0, max(natural_gas), by = mday(datetime)] + +# Convert this dplyr syntax into data.table syntax +long_ca_energy <- long_ca_energy %>% + mutate(day = as_date(datetime), + log_output = log(output)) %>% + group_by(day) %>% + mutate(total_daily_output = sum(output, na.rm = T)) %>% + ungroup() %>% + mutate(per_output = output/total_daily_output) + +# Answer +all_generation_long[,day := as_date(datetime)] +all_generation_long[,log_output := log(value)] +all_generation_long[,per_output := value/sum(value), by = day] + +# set column names +setnames(dt, "old", "new") + +# set row order +setorder(dt, col1, -col2, ...) + +# set anything +set(dt, i, j) + +# set colum +dt[,col1 := 2*col2] + +# .N: number of rows in the current group +# .I: a vector, 1:nrow(dt), usually used for more advanced operations +generation_dt[,.N] +generation_dt[,.I] + +# keys: one or more columns, pre-sorted index of the table. 
+key(generation_dt) +setkey(generation_dt, datetime) +key(generation_dt) + + diff --git a/submissions/FinalRExercise_ZhuZekai.Rmd b/submissions/FinalRExercise_ZhuZekai.Rmd new file mode 100644 index 0000000..f1cbaa8 --- /dev/null +++ b/submissions/FinalRExercise_ZhuZekai.Rmd @@ -0,0 +1,202 @@ +--- +title: "FinalRExercise_ZhuZekai.Rmd" +output: html_document +--- + + +```{r, echo=FALSE, eval=TRUE} +answers<-params$answers +``` + +```{r global_options, echo = FALSE, include = FALSE} +knitr::opts_chunk$set(echo=answers, eval=answers, + warning = FALSE, message = FALSE, + cache = FALSE, tidy = FALSE) +``` + +## Task 1: Load the data + +Read the data files `nys_schools.csv` and `nys_acs.csv` into R. + + +```{asis} +### Answer +``` + +```{asis} +**`dplyr`** +``` + +```{r 1} +library(dplyr) + +nys_schools <- read.csv(here::here("data/nys_schools.csv"), + stringsAsFactors = F) +nys_acs <- read.csv(here::here("data/nys_acs.csv"), + stringsAsFactors = F) +``` + + + +## Task 2: Explore your data + +Explore the structure of the two dataframes. Detect missing data + +```{asis} +### Answer +``` + +```{asis} +**`dplyr`** +``` + +```{r 2} +# Overall view of the imported datasets + +summary(nys_schools) # Some missing values in nys_schools are coded as '-99' +summary(nys_acs) +str(nys_schools) +str(nys_acs) +# dim(nys_schools) +# dim(nys_acs) + +# Missing values in nys_schools are coded as '-99' or "" +sum (is.na(nys_schools)) +sum(nys_schools == "") +sum(nys_schools == -99) + +# No missing values in nys_acs +sum (is.na(nys_acs)) +sum(nys_acs == "") +sum(nys_acs == -99) +``` + + +## Task 3: Recoding and variable manipulation + +1. Deal with missing values, which are currently coded as `-99`. +2. Create a categorical variable that groups counties into "high", "medium", and "low" poverty groups. Decide how you want to split up the groups and briefly explain your decision. +3. 
The tests that the NYS Department of Education administers change from time to time, so scale scores are not directly comparable year-to-year. Create a new variable that is the standardized z-score for math and English Language Arts (ELA) for each year (hint: group by year and use the `scale()` function)
+
+
+```{asis}
+### Answer for 1
+```
+
+```{r 3}
+nys_schools[nys_schools == -99] <- NA
+
+# Check that no values are still coded as `-99`
+summary(nys_schools)
+
+```
+
+```{asis}
+### Answer for 2
+```
+
+```{r 4}
+
+summary(nys_acs)
+nys_acs$poverty_level <- NA
+tmp1 <- 46347 # 1st Quartile
+tmp2 <- 56448 # 3rd Quartile
+i <- 1
+
+while (i <= nrow(nys_acs)) {
+  tmp <- nys_acs[i,4]
+  if (tmp < tmp1){
+    nys_acs[i,6] <- "high"
+  } else if (tmp >= tmp2){
+    nys_acs[i,6] <- "low"
+  } else{
+    nys_acs[i,6] <- "medium"
+  }
+  i <- i + 1
+  }
+
+
+```
+
+```{asis}
+### Answer for 3
+```
+
+```{r 5}
+
+```
+
+## Task 4: Merge datasets
+
+Create a county-level dataset that merges variables from the schools dataset and the ACS dataset. Remember that you have learned multiple approaches on how to do this, and that you will have to decide how to summarize data when moving from the school to the county level.
+
+```{asis}
+### Answer
+```
+
+```{r 6}
+
+```
+
+## Task 5: Create summary tables
+
+Generate tables showing the following:
+
+1. For each county: total enrollment, percent of students qualifying for free or reduced price lunch, and percent of population in poverty.
+2. For the counties with the top 5 and bottom 5 poverty rate: percent of population in poverty, percent of students qualifying for free or reduced price lunch, mean reading score, and mean math score.
+
+```{asis}
+### Answer for 1
+```
+
+```{r 7}
+
+```
+
+```{asis}
+### Answer for 2
+```
+
+```{r 8}
+
+```
+
+## Task 6: Data visualization
+
+Using `ggplot2`, visualize the following:
+
+1.
The relationship between access to free/reduced price lunch and test performance, at the *school* level. +2. Average test performance across *counties* with high, low, and medium poverty. + +```{asis} +### Answer for 1 +``` + +```{r 9} + +``` + +```{asis} +### Answer for 2 +``` + +```{r 10} + +``` + +## Task 7: Answering questions + +Using the skills you have learned in the past three days, tackle the following question: + +> What can the data tell us about the relationship between poverty and test performance in New York public schools? Has this relationship changed over time? Is this relationship at all moderated by access to free/reduced price lunch? + +You may use summary tables, statistical models, and/or data visualization in pursuing an answer to this question. Feel free to build on the tables and plots you generated above in Tasks 5 and 6. + +```{asis} +### Answer +``` + +```{r 11} + +``` + diff --git a/submissions/day3_notes.R b/submissions/day3_notes.R new file mode 100644 index 0000000..cef2308 --- /dev/null +++ b/submissions/day3_notes.R @@ -0,0 +1,218 @@ +#### ggplot #### + +# Tidtverse package include ggplot2 +library(tidyverse) +library(ggplot2) + +# Data + Coordinate Plane + Geomereical Object + Mapping of Data on to Plane + (X? Y?) 
+
+# Load the data
+source(here::here("data/day3_objects.R"))
+
+# ggplot(data = ) + (mapping = aes())
+# connected with the + operator
+
+p1 <-ggplot(data = gapminder07) +
+  geom_point(mapping = aes(x = log(pop), y = log(gdpPercap))) +
+  labs(title = "Relationship between GDP per capita and population in 2007",
+       x = "Natural log of GDP per capita", y = "Natural log of population")
+p1 + theme_bw()
+
+# dplyr and ggplot2 are designed to work well together
+# EX1: Plot a column chart of total energy generated over time
+p2 <- long_gen %>%
+  group_by(datetime) %>%
+  summarise(total_output=sum(output)) %>%
+  ggplot() +
+  geom_col(aes(x=datetime, y=total_output)) +
+  labs(title="Total energy generated, by hour", x="Hour", y="Output (MW)")
+
+p2 + theme(plot.background = element_rect(fill = "purple"))
+
+#EX2: Plot a column chart of hydroelectric power generated over time
+p3 <- long_gen %>%
+  filter(source == "small_hydro" | source == "large_hydro" ) %>%
+  # filter(source %in% c("large_hydro", "small_hydro"))
+  # filter(str_detect(source, "hydro"))
+  group_by(datetime) %>%
+  summarise(total_output=sum(output)) %>%
+  ggplot() +
+  geom_col(aes(x=datetime, y=total_output)) +
+  labs(title="Total hydro power generated, by hour", x="Hour", y="Output (MW)")
+
+# line
+imports %>%
+ggplot() +
+  geom_line(aes(x=datetime, y=imports), size = 0.8, col = "purple") +
+  labs(title="Energy imports over time", x="Hour", y="Amount imported (MW)")
+
+# area
+generation %>%
+  ggplot() +
+  geom_area(aes(x=datetime, y=wind), fill="pink") +
+  labs(title="Hourly wind power generation, Sept 3-9", x="Hour", y="Output (MW)")
+
+
+# boxplot
+long_gen %>%
+  ggplot() +
+  geom_boxplot(aes(x=source, y=output)) +
+  labs(title="Amount of energy generated by each source, Sept 3-9",
+       x="Source type", y="Output (MW)")
+
+# Plot a line of large hydro generation over time,
+# and a smoothed line of the same relationship on top of it.
+P4 <-long_gen %>% + filter(source == "large_hydro" ) %>% + ggplot() + + geom_line(aes(x=datetime, y=output), size = 0.8, col = "turquoise3") + + geom_smooth(aes(x=datetime, y=output),size = 1.0, col = "purple")+ + labs(title="Hydroelectric (large) generation per hour, Sept 3-9", x="Hour", y="Output (MW)") +P4 + +# EX3 +# Create a column chart that shows the total output per source. +P5 <-long_merged_energy %>% + group_by(source) %>% + summarise(total_output=sum(output)) %>% + ggplot() + + geom_col(aes(x = source, y = total_output), fill = "pink") + + geom_hline(aes(yintercept = mean(total_output))) + + labs(title="Total energy generation per hour, Sept 3-9", x="Hour", y="Output (MW)") +P5 + +# Labels +imports %>% + ggplot() + + geom_line(aes(x=datetime, y=imports), col="red") + + labs(title="Energy imports over time in California", subtitle="Hourly data from September 3-9, 2018", + caption="Source: California Energy Commission", x="Hour", y="Amount imported (MW)") + +# Scales +imports %>% + ggplot() + geom_line(aes(x=datetime, y=imports), col="red") + + scale_x_datetime(date_labels="%H:%M", date_breaks="12 hours") + + labs(title="Energy imports over time in California", subtitle="Hourly data from September 3-9, 2018", + x="Hour", y="Amount imported (MW)") + theme(axis.text.x=element_text(angle=45, hjust=1, size=12)) + +# Cord +long_gen %>% + mutate(date=lubridate::date(datetime)) %>% + group_by(date) %>% summarise(output=sum(output)) %>% + ggplot() + geom_col(aes(x=date, y=output)) + + labs(title="Total energy generated, by day", x="Day", y="Output (MW)") + + coord_flip() + +# Group +long_merged_energy %>% + ggplot() + + geom_line(aes(x=datetime, y=output, group=source, col=source)) + # group=source can be omitted in this case + labs(title="Output by energy source over time", subtitle="Hourly data from September 3-9, 2018", + x="Hour", y="Output (MW)") + +# EX4 +p6 <-long_merged_energy %>% + filter(source %in% c("wind", "solar", "geothermal")) %>% + ggplot() + + 
geom_line(aes(x=datetime, y=output, group=source, col=source), size =1.5) + + # scale_color_brewer(palette="Accent", name="Energy source") + + # scale_color_brewer(palette = "Set1") + labs(title="Output by energy source over time", subtitle="Hourly data from September 3-9, 2018", + x="Hour", y="Output (MW)") +p6 + +# Energy use by day +long_merged_energy %>% + mutate(date=lubridate::date(datetime)) %>% + group_by(date, source) %>% + summarize(output=sum(output)) %>% + ggplot() + + geom_col(aes(x=date, y=output, group=source, fill=source)) + + labs(title="Energy use by day", x="Day", y="Output (MW)") + +# dodge +long_merged_energy %>% + mutate(date=lubridate::date(datetime)) %>% + group_by(date, source) %>% + summarize(output=sum(output)) %>% + ggplot() + + geom_col(aes(x=date, y=output, group=source, fill=source), position="dodge") + + labs(title="Energy use by day", x="Day", y="Output (MW)") + +# group by day +# Prepare data +long_merged_energy_regroup <- long_merged_energy %>% + rename(type = source) %>% + merge(regroup, by = "type") %>% + mutate(date=lubridate::date(datetime)) %>% + group_by(date, group) %>% + summarise(output=sum(output)) + +# Take a look at our prepared data +head(long_merged_energy_regroup) + +# 1 +long_merged_energy_regroup %>% + ggplot() + + geom_line(aes(x=date, y=output, group=group, col=group), size=0.8) + + geom_point(aes(x=date, y=output, group=group, shape=group)) + + labs(title="Output by source group over time", subtitle="Data collected during September 3-9, 2018", + x="Date", y="Output (MW)") + +# 2 +long_merged_energy_regroup %>% + ggplot() + + geom_line(aes(x=date, y=output, group=group, linetype=group), size=1) + + labs(title="Output by source group over time", subtitle="Data collected during September 3-9, 2018", + x="Date", y="Output (MW)") + +# Sizes and alpha +gapminder07 %>% + ggplot() + + geom_point(aes(x=log(gdpPercap), y=lifeExp, size=pop, col=continent)) + + scale_size_continuous(name="Population") + 
scale_color_discrete(name="Continent") + + labs(title="Life expectancy as a function of GDP per capita in 2007", + x="Logged GDP per capita", y="Life expectancy") + +# EX5 +# Visualize the average output for each hour of the day, grouped by source +ex5 <- long_merged_energy %>% + mutate(hour=lubridate::hour(datetime)) %>% + group_by(hour, source) %>% + summarize(output=sum(output)) %>% + ggplot() + + geom_area(aes(x=hour, y=output, fill=factor(source))) + + scale_fill_brewer(palette="Set3", name="Source") + + labs(title="Average hourly output by source", + subtitle="Data collected during September 3-9", + x="Hour of the day", y="Output (MW)") + + theme_bw() + +# Comparing generation patterns +long_gen %>% + ggplot() + + geom_line(aes(x = datetime, y = output)) + + facet_wrap(~source) + + labs(title="Generation over time, by energy source", subtitle="Hourly data from September 3-9, 2018", + x="Hour", y="Output (MW)") + +long_gen %>% + ggplot() + + geom_line(aes(x = datetime, y = output)) + + facet_wrap(~source, scales="free") + + labs(title="Generation over time, by energy source", subtitle="Hourly data from September 3-9, 2018", + x="Hour", y="Output (MW)") + +# Facets +long_gen_regroup <- long_gen %>% + rename(type = source) %>% + merge(regroup, by="type") + +head(long_gen_regroup) + +long_gen_regroup %>% ggplot() + + geom_line(aes(x=datetime, y=output, group=group, col=group), size=1) + + scale_color_brewer(palette="Set1", name="Type of energy source") + + facet_wrap(~type, scales="free") + + labs(title="Generation over time, by energy source", subtitle="Hourly data from September 3-9, 2018", x="Hour", y="Output (MW)") + + theme(legend.position = "bottom")