NUMLDS · XiaolingZhang97 · Sep 17, 2019 · Sep 17, 2019 · Sep 18, 2019 · Sep 19, 2019
diff --git a/exercises/day1part1_R-intro_exercises.R b/exercises/day1part1_R-intro_exercises.R
@@ -29,7 +29,7 @@
 
 
 # Check if it is true that the strings "eat" and "drink" are not equal to each other
-
+"eat" != "drink"  # != tests inequality; TRUE because the two strings differ
 
 # Check if it is true that 1 is equal to 1 *AND* 1 is equal to 2 
 # (Hint: remember what the operators & and | do)
@@ -41,11 +41,11 @@
 #### Packages and Functions ####
 
 # Load the package tidyverse
-
+library(tidyverse)
 
 # Open the help file for the function recode 
 # (Hint: remember what ? does)
-
+?recode  # the exercise asks for the help page of the function recode
 
 #### REVIEW: DATA STRUCTURES ####
 
@@ -57,30 +57,31 @@ x1 <- rnorm(5)
 x2 <- rnorm(20, mean=0.5)
 
 # Select the 3rd element in x1
-
+x1[3]
 
 # Select the elements of x1 that are less than 0
+x1[x1 < 0]
 
 
 # Select the elements of x2 that are greater than 1
-
+x2[x2 > 1]  # compare against the constant 1, not against the (recycled) vector x1
 
 # Create x3 containing the first five elements of x2
-
+x3 <- x2[1:5]
 
 # Select all but the third element of x1
-
+x1[-3]
 
 #### Missing values ####
 
 # Generate a vector
 vec <- c(1, 8, NA, 7, 3)
 
 # Calculate the mean of vec, excluding the NA value
-
+mean(vec, na.rm = TRUE)  # na.rm = TRUE drops the NA before averaging
 
 # Count the number of missing values in vec
-
+sum(is.na(vec))  # is.na() flags each missing element; sum() counts the TRUEs
 
 #### Factors ####
 
@@ -98,33 +99,34 @@ vec <- c(1, 8, NA, 7, 3)
 mat <- matrix(c(1:51, rep(NA,4)), ncol=5)
 
 # Select row 4, column 5
-
+mat[4, 5]  # single cell: row index first, column index second
 
 # Select column 3
-
+mat[, 3]  # empty row index keeps every row
 
 # Bonus: How many NA values are there in this matrix?
-
+sum(is.na(mat))  # counts the TRUE entries of the logical matrix
 
 #### Data frames ####
 
 # Load one of R's example data frames, mtcars
 data(mtcars)
 
 # Identify the number of observations (rows) and number of variables (columns)
-
+nrow(mtcars)
+ncol(mtcars)
 
 # Identify the names of the variables
-
+names(mtcars)
 
 # Select the variable 'mpg'
-
+mtcars$mpg
 
 # Select the 4th row
-
+mtcars[4, ]
 
 # Square the value of the 'cyl' variable and store this as a new variable 'cylsq'
-
+mtcars$cylsq <- mtcars$cyl^2  # stored as a new column of mtcars rather than a loose vector
 
 #### READING FILES ####
 
@@ -135,43 +137,44 @@ data(mtcars)
 gapminder <- read.csv("data/gapminder5.csv", stringsAsFactors=FALSE)
 
 # Load the readr package
-
+library(readr)
 
 # Read gapminder data with read_csv()
-
+read_csv(file = "data/gapminder5.csv")  # result is only printed, not assigned
 
 #### DATA MANIPULATION ####
 
 #### Exploring data frames ####
 
 # Run summary() on the gapminder data
-
+summary(gapminder)
 
 # Find the mean of the variable pop
-
+mean(gapminder[["pop"]])
 
 # Create a frequency table of the variable 'year'
 # Hint: use table()
-
+table(gapminder[["year"]])
 
 # Create a proportion table of the variable 'continent'
 # Hint: use prop.table()
+prop.table(table(gapminder[["continent"]]))
 
 
 #### Subsetting and Sorting ####
 
 # Create a new data frame called gapminder07 contaning only those rows in the gapminder data where year is 2007
-
+gapminder07 <- gapminder[gapminder$year == 2007, ]
 
 # Created a sorted frequency table of the variable continent in gapminder07
-
+sort(table(gapminder07$continent))  # tabulate the 2007 subset, not the full gapminder data
 
 # Print out the population of Mexico in 2007
-
+gapminder07$pop[gapminder07$country == "Mexico"]
 
 # BONUS: Print out the rows represnting the 5 countries with the highest population in 2007
 # Hint: Use order(), which we learned about, and head(), which prints out the first 5 rows of a data frame
-
+head(gapminder07[order(gapminder07$pop, decreasing = TRUE), ], n = 5)  # n = 5: head() defaults to 6 rows
 
 #### Adding and removing columns ####
 
@@ -181,10 +184,10 @@ gapminder <- read.csv("data/gapminder5.csv", stringsAsFactors=FALSE)
 #### Recoding variables ####
 
 # Round the values of the variable `lifeExp` using `round()` and store this as a new variable `lifeExp_round`
-
+gapminder07$lifeExp_round <- round(gapminder07$lifeExp)  # new column of gapminder07, rounded to whole numbers
 
 # Print out the new variable to see what it looks like
-
+gapminder07$lifeExp_round
 
 # This code creates the new variable 'lifeExp_over70'. Try to understand what it does.
 gapminder07$lifeExp_over70 <- NA  # Initialize a variable containing all "NA" values
@@ -195,6 +198,10 @@ table(gapminder07$lifeExp_over70)
 # Try to create a new variable 'lifeExp_highlow' that has the value 
 # "High" when life expectancy is over the mean and the value "Low" 
 # when it is below the mean. When you are done, print a frequency table.
+gapminder07$lifeExp_highlow <- NA  # every row starts unclassified
+gapminder07$lifeExp_highlow[with(gapminder07, lifeExp > mean(lifeExp))] <- "High"
+gapminder07$lifeExp_highlow[with(gapminder07, lifeExp < mean(lifeExp))] <- "Low"
+table(gapminder07$lifeExp_highlow)  # rows exactly at the mean stay NA and are not counted
 
 
 
@@ -204,35 +211,37 @@ table(gapminder07$lifeExp_over70)
 
 # Find the mean of life expectancy in 2007 for each continent
 # Hint: use the aggregate() function
-
+aggregate(lifeExp ~ continent, data = gapminder07, FUN = mean)  # average only lifeExp, one row per continent
 
 #### Statistics, part 1 ####
 
 # Calculate the correlation between 'lifeExp' and 'gdpPercap'.
+with(gapminder, cor(lifeExp, gdpPercap))  # uses the full gapminder data (all years)
 
 
 # Use a t-test to evaluate the difference between 'gdpPercap' in "high" and "low" life expectancy countries. Store the results as t1, and then print out t1.
-
+t1 <- t.test(gdpPercap ~ lifeExp_highlow, data = gapminder07)  # compares gdpPercap between "High" and "Low" rows
+t1
 
 
 #### Statistics, part 2 ####
 
 # Conduct a linear regression predicting 'lifeExp' as a function of 'gdpPercap' and 'pop', and store the results as reg1.
-
+reg1 <- lm(lifeExp ~ gdpPercap + pop, data = gapminder07)  # data = keeps plain column names in the coefficients
 
 # Print out reg1.
-
+reg1
 
 # Run summary() on reg1.
-
+summary(reg1)
 
 #### WRITING FILES ####
 
 #### Writing a data file ####
 
 # Save the gapminder07 data frame as a CSV file using write.csv() in the "data" subfolder within the working directory
 # Set the argument `row.names = FALSE`.
-
+write.csv(x = gapminder07, file = "data/gapminder07.csv", row.names = FALSE)
 
 #### Save R objects ####
 
@@ -244,22 +253,24 @@ table(gapminder07$lifeExp_over70)
 #### Histograms ####
 
 # Create a histogram of the variable 'lifeExp' in gapminder07
-
+hist(gapminder07$lifeExp)
 
 # Re-create the histogram with a title and axis labels
-
+hist(gapminder07$lifeExp, main = "Hist", xlab = "lifeExp", ylab = "Freq")
 
 # Bonus: Change the `breaks = ` argument from its default setting and see what happens.
-
+hist(gapminder07$lifeExp, breaks = 20, main = "Hist", xlab = "lifeExp", ylab = "Freq")  # breaks: requested number of bins
 
 #### Scatterplots ####
 
 # Create a scatterplot with `lifeExp` on the y-axis and `gdpPercap` on the x-axis.
-
+plot(gapminder07$gdpPercap, gapminder07$lifeExp)
 
 # Add a title and axis labels.
-
+plot(gapminder07$gdpPercap, gapminder07$lifeExp, main = "sp", xlab = "gdpPercap", ylab = "lifeExp")
 
 # Bonus: Add a horizontal line indicating the mean of `lifeExp` onto the plot using `abline()`.
+plot(gapminder07$gdpPercap, gapminder07$lifeExp, main = "sp", xlab = "gdpPercap", ylab = "lifeExp")
+abline(h = mean(gapminder07$lifeExp))  # h = draws a horizontal line; lifeExp is on the y-axis
 
 
diff --git a/exercises/day1part2_exercises_no_answers.Rmd b/exercises/day1part2_exercises_no_answers.Rmd
@@ -141,5 +141,22 @@ vector_info(le, include_median = T)
 Use what you've learned so far to answer the following questions using the `gapminder` dataset. Be sure to include some visualizations!
 
 1. What is the relationship between GDP per capita and life expectancy? Does this relationship change over time? (Hint: Use the natural log of both variables.)
+```{r}
+cor(log(gapminder$gdpPercap), log(gapminder$lifeExp))  # pooled over all years
+years <- unique(gapminder$year)
+cor_list <- numeric(length(years))  # preallocated: one correlation per year
+for (k in seq_along(years)) {
+  in_year <- gapminder[gapminder$year == years[k], ]
+  cor_list[k] <- cor(log(in_year$gdpPercap), log(in_year$lifeExp))  # computed once, reused in the print
+  print(paste("corr for", years[k], "is", cor_list[k]))
+}
+plot(years, cor_list)
+abline(lm(cor_list ~ years))  # linear trend of the yearly correlations
+```
+
+
+2. Does the relationship between GDP per capita and life expectancy vary by continent? Make sure you divide the Americas into North and South America.
+```{r}
+gapminder
+```
 
-2. Does the relationship between GDP per capita and life expectacy vary by continent? Make sure you divide the Americas into North and South America.
diff --git a/exercises/dplyr_exercise.Rmd b/exercises/dplyr_exercise.Rmd
@@ -0,0 +1,103 @@
+---
+title: "day2_exercise"
+author: "Xiaoling Zhang"
+date: "9/18/2019"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+```{r}
+library(dplyr)
+library(lubridate)
+generation <- read.csv(here::here("data/ca_energy_generation.csv"),
+                       stringsAsFactors = FALSE)
+imports <- read.csv(here::here("data/ca_energy_imports.csv"),
+                    stringsAsFactors = FALSE)
+# Convert the datetime column of both tables with lubridate's as_datetime()
+generation <- mutate(generation, datetime = as_datetime(datetime))
+imports <- mutate(imports, datetime = as_datetime(datetime))
+```
+
+```{r}
+library(reshape2)
+long_ca_energy <- melt(
+  inner_join(generation, imports, by = "datetime"),
+  id.vars = "datetime",
+  variable.name = "source", value.name = "output"
+)
+long_ca_energy
+```
+```{r}
+long_ca_energy <- long_ca_energy %>%
+  mutate(day = as_date(datetime),
+         log_output = log(output)) %>%
+  group_by(day) %>%
+  mutate(total_daily_output = sum(output, na.rm = TRUE)) %>%  # sum over all rows sharing the same day
+  ungroup() %>%
+  mutate(per_output = output / total_daily_output)  # each row's share of its day's total
+```
+```{r}
+long_ca_energy
+```
+
+
+```{r}
+mean_hourly_by_source <- long_ca_energy %>%
+  group_by(source) %>%
+  summarize(mean_hourly = mean(output, na.rm = TRUE))
+arrange(mean_hourly_by_source, desc(mean_hourly))
+```
+
+
+```{r}
+# Reuse the per-source means computed in the previous chunk,
+# this time sorted from smallest to largest.
+mean_hourly_by_source %>%
+  arrange(mean_hourly)
+```
+
+
+
+```{r}
+mean_daily_by_source <- long_ca_energy %>%
+  group_by(day, source) %>%
+  summarize(mean_daily = mean(output, na.rm = TRUE))
+
+# Largest day/source means first
+arrange(mean_daily_by_source, desc(mean_daily))
+
+# Smallest day/source means first
+arrange(mean_daily_by_source, mean_daily)
+```
+
+
+
+```{r}
+variance_by_source <- long_ca_energy %>%
+  group_by(source) %>%
+  summarize(variance = var(output, na.rm = TRUE))
+
+# Most variable sources first
+arrange(variance_by_source, desc(variance))
+
+# Least variable sources first
+arrange(variance_by_source, variance)
+```
+
+
+```{r}
+regroup <- read.csv(here::here("data/ca_energy_regroup.csv"),
+                    stringsAsFactors = FALSE)
+regroup_energy <- merge(long_ca_energy, regroup, by.x = "source", by.y = "type")  # source values matched against regroup$type
+```
+
+
+```{r}
+regroup_energy %>%
+  group_by(group) %>%
+  summarize(mean_hourly = mean(output, na.rm = TRUE)) %>%
+  arrange(desc(mean_hourly))
+```