Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 50 additions & 39 deletions exercises/day1part1_R-intro_exercises.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@


# Check if it is true that the strings "eat" and "drink" are not equal to each other
# The question is about inequality, so use != (this evaluates to TRUE).
"eat" != "drink"

# Check if it is true that 1 is equal to 1 *AND* 1 is equal to 2
# (Hint: remember what the operators & and | do)
Expand All @@ -41,11 +41,11 @@
#### Packages and Functions ####

# Load the package tidyverse

library(tidyverse)

# Open the help file for the function recode
# (Hint: remember what ? does)
# The exercise asks for the help page of the function recode, not of the
# tidyverse package itself.
?recode

#### REVIEW: DATA STRUCTURES ####

Expand All @@ -57,30 +57,31 @@ x1 <- rnorm(5)
x2 <- rnorm(20, mean = 0.5)

# Select the 3rd element in x1
x1[3]

# Select the elements of x1 that are less than 0
x1[x1 < 0]

# Select the elements of x2 that are greater than 1
# Compare against the constant 1. Comparing against x1 would recycle x1
# across the 20 elements of x2 and answer a different question.
x2[x2 > 1]

# Create x3 containing the first five elements of x2
x3 <- x2[1:5]

# Select all but the third element of x1
x1[-3]

#### Missing values ####

# Generate a vector
vec <- c(1, 8, NA, 7, 3)

# Calculate the mean of vec, excluding the NA value
# Spell out TRUE: T is an ordinary variable that can be reassigned.
mean(vec, na.rm = TRUE)

# Count the number of missing values in vec
# is.na() gives a logical vector; summing it counts the TRUE entries.
sum(is.na(vec))

#### Factors ####

Expand All @@ -98,33 +99,34 @@ vec <- c(1, 8, NA, 7, 3)
# Five-column matrix filled column by column: the integers 1 to 51
# followed by four NA values (55 cells, so 11 rows).
mat <- matrix(c(seq_len(51), rep(NA, 4)), ncol = 5)

# Select row 4, column 5
mat[4, 5]

# Select column 3
mat[, 3]

# Bonus: How many NA values are there in this matrix?
# Summing the logical mask from is.na() counts the missing cells.
sum(is.na(mat))

#### Data frames ####

# Load one of R's example data frames, mtcars
data(mtcars)

# Identify the number of observations (rows) and number of variables (columns)
nrow(mtcars)
ncol(mtcars)

# Identify the names of the variables
colnames(mtcars)

# Select the variable 'mpg'
mtcars$mpg

# Select the 4th row
mtcars[4, ]

# Square the value of the 'cyl' variable and store this as a new variable 'cylsq'
# In this section a "variable" is a column, so the result is stored as a
# column of mtcars rather than as a free-standing vector.
mtcars$cylsq <- mtcars$cyl^2

#### READING FILES ####

Expand All @@ -135,43 +137,44 @@ data(mtcars)
# Read the gapminder CSV with base R, keeping strings as character vectors
gapminder <- read.csv(file = "data/gapminder5.csv", stringsAsFactors = FALSE)

# Load the readr package
library(readr)

# Read gapminder data with read_csv()
# The result is only printed, not assigned, so `gapminder` still holds the
# data frame produced by read.csv() above.
read_csv(file = "data/gapminder5.csv")

#### DATA MANIPULATION ####

#### Exploring data frames ####

# Run summary() on the gapminder data
summary(object = gapminder)

# Find the mean of the variable pop
mean(gapminder[["pop"]])

# Create a frequency table of the variable 'year'
# Hint: use table()
table(gapminder[["year"]])

# Create a proportion table of the variable 'continent'
# Hint: use prop.table()
# prop.table() divides each count by the total, giving shares that sum to 1.
prop.table(table(gapminder[["continent"]]))


#### Subsetting and Sorting ####

# Create a new data frame called gapminder07 containing only those rows in the gapminder data where year is 2007
gapminder07 <- gapminder[gapminder$year == 2007, ]

# Create a sorted frequency table of the variable continent in gapminder07
# Tabulate the 2007 subset, not the full data set, so each country counts once.
sort(table(gapminder07$continent))

# Print out the population of Mexico in 2007
gapminder07$pop[gapminder07$country == "Mexico"]

# BONUS: Print out the rows representing the 5 countries with the highest population in 2007
# Hint: Use order(), which we learned about, and head(), which prints out the first rows of a data frame
# head() shows 6 rows by default, so ask for exactly 5.
head(gapminder07[order(gapminder07$pop, decreasing = TRUE), ], n = 5)

#### Adding and removing columns ####

Expand All @@ -181,10 +184,10 @@ gapminder <- read.csv("data/gapminder5.csv", stringsAsFactors=FALSE)
#### Recoding variables ####

# Round the values of the variable `lifeExp` using `round()` and store this as a new variable `lifeExp_round`
# Stored as a column of gapminder07 so it stays aligned with the other variables.
gapminder07$lifeExp_round <- round(gapminder07$lifeExp)

# Print out the new variable to see what it looks like
gapminder07$lifeExp_round

# This code creates the new variable 'lifeExp_over70'. Try to understand what it does.
gapminder07$lifeExp_over70 <- NA # Initialize a variable containing all "NA" values
Expand All @@ -195,6 +198,10 @@ table(gapminder07$lifeExp_over70)
# Try to create a new variable 'lifeExp_highlow' that has the value
# "High" when life expectancy is over the mean and the value "Low"
# when it is below the mean. When you are done, print a frequency table.
# Start from an all-NA column; rows exactly at the mean keep the NA.
gapminder07[["lifeExp_highlow"]] <- NA
gapminder07[["lifeExp_highlow"]][with(gapminder07, lifeExp > mean(lifeExp))] <- "High"
gapminder07[["lifeExp_highlow"]][with(gapminder07, lifeExp < mean(lifeExp))] <- "Low"
table(gapminder07[["lifeExp_highlow"]])



Expand All @@ -204,35 +211,37 @@ table(gapminder07$lifeExp_over70)

# Find the mean of life expectancy in 2007 for each continent
# Hint: use the aggregate() function
# The formula interface averages only lifeExp within each continent.
# Aggregating the whole data frame would also call mean() on the character
# columns, which yields warnings and NA values.
aggregate(lifeExp ~ continent, data = gapminder07, FUN = mean)

#### Statistics, part 1 ####

# Calculate the correlation between 'lifeExp' and 'gdpPercap'.
cor(gapminder$lifeExp, gapminder$gdpPercap)


# Use a t-test to evaluate the difference between 'gdpPercap' in "high" and "low" life expectancy countries. Store the results as t1, and then print out t1.
# which() drops rows whose lifeExp_highlow is NA instead of passing NA
# values into the test.
t1 <- t.test(
  gapminder07$gdpPercap[which(gapminder07$lifeExp_highlow == "High")],
  gapminder07$gdpPercap[which(gapminder07$lifeExp_highlow == "Low")]
)
t1


#### Statistics, part 2 ####

# Conduct a linear regression predicting 'lifeExp' as a function of 'gdpPercap' and 'pop', and store the results as reg1.
# Bare column names plus `data =` keep the coefficient labels readable and
# let predict() work with new data.
reg1 <- lm(lifeExp ~ gdpPercap + pop, data = gapminder07)

# Print out reg1.
reg1

# Run summary() on reg1.
summary(reg1)

#### WRITING FILES ####

#### Writing a data file ####

# Save the gapminder07 data frame as a CSV file using write.csv() in the "data" subfolder within the working directory
# Set the argument `row.names = FALSE`.
write.csv(
  x = gapminder07,
  file = "data/gapminder07.csv",
  row.names = FALSE
)

#### Save R objects ####

Expand All @@ -244,22 +253,24 @@ table(gapminder07$lifeExp_over70)
#### Histograms ####

# Create a histogram of the variable 'lifeExp' in gapminder07
hist(x = gapminder07$lifeExp)

# Re-create the histogram with a title and axis labels
hist(
  x = gapminder07$lifeExp,
  main = "Hist",
  xlab = "lifeExp",
  ylab = "Freq"
)

# Bonus: Change the `breaks = ` argument from its default setting and see what happens.
# A single number for `breaks` is the number of break points you want.
hist(
  x = gapminder07$lifeExp,
  main = "Hist",
  xlab = "lifeExp",
  ylab = "Freq",
  breaks = 20
)

#### Scatterplots ####

# Create a scatterplot with `lifeExp` on the y-axis and `gdpPercap` on the x-axis.
plot(gapminder07$gdpPercap, gapminder07$lifeExp)

# Add a title and axis labels.
plot(gapminder07$gdpPercap, gapminder07$lifeExp,
     main = "sp", xlab = "gdpPercap", ylab = "lifeExp")

# Bonus: Add a horizontal line indicating the mean of `lifeExp` onto the plot using `abline()`.
plot(gapminder07$gdpPercap, gapminder07$lifeExp,
     main = "sp", xlab = "gdpPercap", ylab = "lifeExp")
# lifeExp is on the y-axis, so its mean is a horizontal line: h =, not v =.
abline(h = mean(gapminder07$lifeExp))


19 changes: 18 additions & 1 deletion exercises/day1part2_exercises_no_answers.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -141,5 +141,22 @@ vector_info(le, include_median = T)
Use what you've learned so far to answer the following questions using the `gapminder` dataset. Be sure to include some visualizations!

1. What is the relationship between GDP per capita and life expectancy? Does this relationship change over time? (Hint: Use the natural log of both variables.)
```{r}
# Overall correlation between log GDP per capita and log life expectancy
cor(log(gapminder$gdpPercap), log(gapminder$lifeExp))

# Correlation within each year, computed once per year with vapply()
# instead of growing a vector with c() inside a loop.
years <- unique(gapminder$year)
cor_list <- vapply(years, function(yr) {
  in_year <- gapminder[gapminder$year == yr, ]
  cor(log(in_year$gdpPercap), log(in_year$lifeExp))
}, numeric(1))

# Report each year's correlation on its own line
for (i in seq_along(years)) {
  print(paste("corr for", years[i], "is", cor_list[i]))
}

# Fit a linear trend of the yearly correlations and overlay it on the plot
l <- lm(cor_list ~ years)
plot(years, cor_list)
abline(l)
```


2. Does the relationship between GDP per capita and life expectancy vary by continent? Make sure you divide the Americas into North and South America.
```{r}
# NOTE(review): this chunk only prints the gapminder data frame; it does not
# yet address the by-continent question stated above it.
gapminder
```

2. Does the relationship between GDP per capita and life expectancy vary by continent? Make sure you divide the Americas into North and South America.
103 changes: 103 additions & 0 deletions exercises/dplyr_exercise.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
---
title: "day2_exercise"
author: "Xiaoling Zhang"
date: "9/18/2019"
output: html_document
---

```{r setup, include=FALSE}
# Set the default chunk option echo = TRUE for the chunks in this document
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library(dplyr)
library(lubridate)

# Read the California energy generation and import tables, keeping strings
# as character vectors. Spell out FALSE: F is a reassignable variable.
generation <- read.csv(here::here("data/ca_energy_generation.csv"),
                       stringsAsFactors = FALSE)
imports <- read.csv(here::here("data/ca_energy_imports.csv"),
                    stringsAsFactors = FALSE)

# Convert the datetime column of both tables to date-time values
generation <- mutate(generation, datetime = as_datetime(datetime))
imports <- mutate(imports, datetime = as_datetime(datetime))
```

```{r}
library(reshape2)

# Join generation and imports on datetime, then reshape to long format:
# one row per datetime/source pair, with the measured value in `output`.
long_ca_energy <- melt(
  inner_join(generation, imports, by = "datetime"),
  id.vars = "datetime",
  variable.name = "source",
  value.name = "output"
)
long_ca_energy
```
```{r}
# Add the calendar day, the log of output, the total output per day, and
# each row's share of that daily total.
long_ca_energy <- long_ca_energy %>%
  mutate(day = as_date(datetime),
         log_output = log(output)) %>%
  group_by(day) %>%
  # Spell out TRUE: T is a reassignable variable.
  mutate(total_daily_output = sum(output, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(per_output = output / total_daily_output)
```
```{r}
# Display the long-format table
long_ca_energy
```


```{r}
# Mean output per source, largest first
long_ca_energy %>%
  group_by(source) %>%
  summarize(mean_hourly = mean(output, na.rm = TRUE)) %>%
  arrange(desc(mean_hourly))
```


```{r}
# Mean output per source, smallest first
long_ca_energy %>%
  group_by(source) %>%
  summarize(mean_hourly = mean(output, na.rm = TRUE)) %>%
  arrange(mean_hourly)
```



```{r}
# Mean output per day and source, largest first
# summarize() drops only the last grouping level, so ungroup() explicitly
# before sorting to return a plain, ungrouped table.
long_ca_energy %>%
  group_by(day, source) %>%
  summarize(mean_daily = mean(output, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(mean_daily))

# Mean output per day and source, smallest first
long_ca_energy %>%
  group_by(day, source) %>%
  summarize(mean_daily = mean(output, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(mean_daily)
```



```{r}
# Variance of output per source, largest first
long_ca_energy %>%
  group_by(source) %>%
  summarize(variance = var(output, na.rm = TRUE)) %>%
  arrange(desc(variance))

# Variance of output per source, smallest first
long_ca_energy %>%
  group_by(source) %>%
  summarize(variance = var(output, na.rm = TRUE)) %>%
  arrange(variance)
```


```{r}
# Read the regroup lookup table and attach it to the long table by matching
# long_ca_energy$source against regroup$type.
# Spell out FALSE: F is a reassignable variable.
regroup <- read.csv(here::here("data/ca_energy_regroup.csv"),
                    stringsAsFactors = FALSE)
regroup_energy <- merge(long_ca_energy, regroup,
                        by.x = "source", by.y = "type")
```


```{r}
# Mean output per regrouped category, largest first
regroup_energy %>%
  group_by(group) %>%
  summarize(mean_hourly = mean(output, na.rm = TRUE)) %>%
  arrange(desc(mean_hourly))
```


Loading