ElementR_3_Seance1_offline.Rmd

---
title: "Extraire des données depuis Internet"
date: "`r paste('Vague 3', 'Séance 1', Sys.Date(), sep=' - ')`"
author: "ElementR - Robin Cura"
output:
  #html_document:
  
  rmdformats::readthedown:
    highlight: kate
    thumbnails: true
    lightbox: true
    gallery: true
    code_folding: hide
    keep_md: TRUE
---


```{r knitr_init, echo=FALSE, cache=FALSE}
# Packages needed to knit this document with the readthedown template.
library(knitr)
library(rmdformats)

# Session-wide setting for the max.print option.
options(max.print = "75")

# Defaults shared by every chunk: code is echoed, results are cached, the
# source is reformatted (tidy), output lines carry no comment prefix, and
# messages and warnings are hidden.
opts_chunk$set(
  echo = TRUE,
  cache = TRUE,
  prompt = FALSE,
  tidy = TRUE,
  comment = NA,
  message = FALSE,
  warning = FALSE
)

# Package-level knitr option: output width.
opts_knit$set(width = 75)
```

# Utiliser des *API* à travers des *packages* dédiés

## Géocoder une liste de lieux

### Requête de récupération

```{r geocodage}
library(dplyr)
library(photon) # devtools::install_github(repo = 'rCarto/photon')

# Places to geocode: a display name and the address string sent to geocode().
geogeoplaces <- data_frame(
  Nom = c(
    "Institut de Géographie",
    "Géographie-cités",
    "PRODIG",
    "Centre PMF",
    "Centre Montreal",
    "Olympe de Gouges"
  ),
  Adresse = c(
    "Institut de Géographie, 75005 Paris, France",
    "Rue du Four, 75006 Paris, France",
    "Rue Valette, 75005 Paris, France",
    "90 rue de Tolbiac, 75013 Paris, France",
    "105 rue de Tolbiac, 75013 Paris, France",
    "Rue Albert Einstein, 75013 Paris, France"
  )
)

# Geocode every address (limit = 1, lang = "fr"). If the call raises an error,
# use the results previously saved on disk instead.
geoCodingResults <- tryCatch(
  expr = geocode(geogeoplaces$Adresse, limit = 1, lang = "fr"),
  error = function(cond) {
    readRDS("data/geoCodingResults.Rds")
  }
)

# Append the lat / lon columns of the geocoding results to the places table.
geogeogeoplaces <- bind_cols(geogeoplaces, select(geoCodingResults, lat, lon))

```

### Conversion en format R

```{r conversion spdf}
library(sp)

# Work on a plain data.frame before handing the table to sp.
geogeogeoplaces <- geogeogeoplaces %>%
  as.data.frame(stringsAsFactors = FALSE)

# Declare lon / lat as the coordinate columns, which turns the table into a
# spatial object, then attach the EPSG:4326 coordinate reference system.
coordinates(geogeogeoplaces) <- ~ lon + lat
proj4string(geogeogeoplaces) <- CRS("+init=epsg:4326")
```

### Cartographie

```{r cartographie rapide}
library(mapview)

# Static map of the geocoded places.
plot(geogeogeoplaces)

# Interactive map. A tryCatch() call without any handler catches nothing, so
# an error handler is supplied: if mapView() fails, fall back to the static
# plot, as the route-mapping chunk of this document does.
tryCatch(
  mapView(geogeogeoplaces),
  error = function(e) plot(geogeogeoplaces)
)
```

## Trouver le plus court chemin entre ces lieux

### Requête de récupération

```{r requete osrm}
library(osrm) # devtools::install_github("rCarto/osrm")

# Ask osrmTripGeom() for a trip through the geocoded places (sp = TRUE). If
# the request raises an error, load the itinerary saved on disk instead.
plusCourtItineraire <- tryCatch(
  expr = osrmTripGeom(geogeogeoplaces, sp = TRUE),
  error = function(cond) {
    readRDS("data/plusCourtItineraire.Rds")
  }
)
```

### Conversion en format R

```{r recup osrm}
# Keep the first element of the returned list.
geoitineraire <- plusCourtItineraire[[1L]]

# Show its structure, then print its `summary` component.
str(object = geoitineraire)
geoitineraire$summary
```

### Cartographie

```{r carto osrm}
# Static plot of the trip geometry.
plot(geoitineraire$trip)

# Interactive map of the same geometry; on error, draw the static plot again.
tryCatch(
  expr = mapView(geoitineraire$trip),
  error = function(cond) {
    plot(geoitineraire$trip)
  }
)
```

## Créer un profil du trajet

### Segmenter le trajet

```{r segmentation lines}
library(magrittr)

# For every Lines object of the trip, take the coordinate matrix of its first
# Line.
coordsLists <- lapply(geoitineraire$trip@lines, function(x) {
  x@Lines[[1]]@coords
})

# Stack the per-segment coordinates into a single two-column table.
# bind_rows() is used instead of rbind_all(), which dplyr deprecated in favour
# of bind_rows().
coordsDF <- lapply(coordsLists, as.data.frame) %>%
  bind_rows() %>%
  set_colnames(c("lon", "lat"))
```

### Récupérer l'altitude

```{r altitude geonames}
library(geonames)

# Number the route vertices from their row names, then keep one vertex in ten
# to limit the number of web-service requests.
coordsDF$index <- as.numeric(row.names(coordsDF))
altitudeDF <- coordsDF[seq(from = 1, to = nrow(coordsDF), by = 10), ]

# Set the geonamesUsername option. Setting an option needs no tryCatch(), and
# a tryCatch() without a handler would catch nothing anyway.
options(geonamesUsername = "parisgeo")

# Query GNsrtm3() once per sampled vertex and keep its srtm3 field. The two
# coordinate columns are iterated directly instead of apply()-ing over the
# data frame, which would first coerce every row through a matrix. If any
# request raises an error, fall back to the values saved on disk.
altitudePoints <- tryCatch(
  mapply(
    function(lat, lon) GNsrtm3(lat = lat, lng = lon)$srtm3,
    altitudeDF$lat,
    altitudeDF$lon
  ),
  error = function(e) readRDS("data/altitudePoints.Rds")
)

altitudeDF$alt <- altitudePoints

```

### Affichage du profil du trajet

```{r plot profil}
library(ggplot2)

# Elevation profile: vertex index on x, altitude on y, drawn as a single line
# with a 1:1 aspect ratio between the two axes.
ggplot(data = altitudeDF, mapping = aes(x = index, y = alt)) +
  geom_line(group = 1) +
  coord_equal(ratio = 1)
```


# Récupérer du contenu HTML formaté

## Récupération d'une page

```{r recup page wikipedia}
library(xml2)

# Download and parse the Wikipedia page; if that raises an error, load the
# copy saved on disk instead.
rawWikipediaPage <- tryCatch(
  expr = read_html(x = "https://fr.wikipedia.org/wiki/Liste_des_communes_de_France_les_plus_peupl%C3%A9es"),
  error = function(cond) {
    readRDS("data/rawWikipediaPage.Rds")
  }
)
```

## Conversion en format R

```{r extraction df}
library(rvest)
library(magrittr)

# Take the first node matching the ".wikitable" selector and convert it to a
# table, trimming cell whitespace and reading "," as the decimal mark.
rawWikiTable <- html_table(
  html_nodes(rawWikipediaPage, ".wikitable")[[1]],
  trim = TRUE,
  dec = ","
)
```

## Nettoyage

```{r nettoyage colonnes}
library(stringr)

communesFr <- rawWikiTable

# Clean the column names step by step.
colnames(communesFr) <- colnames(communesFr) %>%
  # Remove bracketed markers such as "[1]"
  gsub(pattern = "\\[[^\\]]*\\]", replacement = "", x = ., perl = TRUE) %>%
  # Remove whitespace
  str_replace_all(pattern = "\\s", replacement = "") %>%
  # Remove commas
  str_replace_all(pattern = fixed(","), replacement = "") %>%
  # Replace "é" with "e"
  str_replace_all(pattern = fixed("é"), replacement = "e")

# Give the `#o` column the name Rank.
communesFr <- rename(communesFr, Rank = `#o`)
  
```

```{r nettoyage contenu}
library(dplyr)

# Clean the cell contents column by column, then convert every column except
# the first four to numeric.
communesFr <- communesFr %>%
  # Remove bracketed markers such as "[1]"
  mutate_each(funs(gsub(x = ., pattern = "\\[[^\\]]*\\]", replacement = "", perl=TRUE))) %>%
  # Remove commas
  mutate_each(funs(str_replace_all(string = ., pattern = fixed(","), replacement = ""))) %>%
  # Remove parenthesised text. The character class must exclude ")" (it used
  # to exclude "]"); otherwise a cell holding two parenthesised groups loses
  # everything between the first "(" and the last ")".
  mutate_each(funs(gsub(x = ., pattern = "\\([^)]*\\)", replacement = "", perl=TRUE))) %>%
  # Remove "+" signs
  mutate_each(funs(str_replace_all(string = ., pattern = fixed("+"), replacement = ""))) %>%
  # Remove whitespace in every column except the first four
  mutate_each(vars = -c(1:4), funs(str_replace_all( string = ., pattern = "\\s", replacement = ""))) %>%
  # Convert those columns to numeric
  mutate_each(vars = -c(1:4), funs(as.numeric))
```

## Utilisation

```{r analyse communes peuplées}
library(tidyr)
library(ggplot2)

# Sum every column except the first four by Region, reshape to one row per
# (Region, Annee) pair, then compute, within each Annee, the vertical
# midpoint of each stacked segment and a wrapped label for the region name.
communesParRegion <- communesFr %>%
  group_by(Region) %>%
  summarise_each(funs(sum), -c(1:4)) %>%
  gather(key = Annee, value = Pop, -Region) %>%
  group_by(Annee) %>%
  mutate(
    label_y = cumsum(Pop) - .5 * Pop,
    label_name = str_wrap(Region, width = 12)
  )

# Stacked bars of Pop per Annee, filled by Region, with the region name
# written on each segment.
ggplot(
  data = communesParRegion,
  mapping = aes(x = factor(Annee), y = Pop, group = Region, fill = Region),
  col = "black"
) +
  geom_bar(stat = "identity") +
  geom_text(
    mapping = aes(y = label_y, label = label_name),
    col = "black",
    size = 2.5,
    check_overlap = TRUE
  ) +
  scale_fill_manual(values = rainbow(20)) +
  theme_bw() +
  theme(legend.position = "bottom")

```

# Extraire et structurer du contenu Web brut

## Récupérer l'ensemble des liens de billets d'un blog

### On récupère les liens vers le classement chronologique des billets

```{r elementr date_links}

# Download and parse the blog home page; on error, load the saved copy.
home_page <- tryCatch(
  expr = read_html("http://elementr.hypotheses.org/"),
  error = function(cond) {
    readRDS("data/home_page.Rds")
  }
)

# The href attribute of every <a> node of the page.
home_links <- html_attr(html_nodes(home_page, "a"), "href")

# Keep only the links that contain "/date/".
reg_query1 <- "/date/"
dates_links <- home_links[grepl(pattern = reg_query1, x = home_links)]
```

### On parcourt ces pages pour en extraire les articles

```{r elementr billets_links}
# Pattern identifying post URLs: the blog address followed by a digit.
reg_query2 <- "http://elementr.hypotheses.org/\\d"

# Visit every date page and collect the links that match the pattern. The
# whole crawl is guarded: without it, a read_html() error (e.g. when offline)
# would abort the chunk before the fallback below could run. The result is
# built with lapply() rather than by growing a vector inside a loop.
posts_links <- tryCatch(
  unlist(lapply(dates_links, function(this_link) {
    this_page_links <- read_html(this_link) %>%
      html_nodes("a") %>%
      html_attr("href")
    this_page_links[grepl(this_page_links, pattern = reg_query2)]
  })),
  error = function(e) character()
)

# Nothing collected (crawl failed or no link matched): use the saved list.
if (length(posts_links) == 0) {
  posts_links <- readRDS("data/posts_links.Rds")
}
```

## Récupérer leur contenu

### Contenu brut

```{r elementr billets_content}
# Download every post and keep the text of its "article .entry-content" node.
# The downloads are guarded so that a read_html() error (e.g. when offline)
# leads to the fallback below instead of aborting the chunk.
posts_content <- tryCatch(
  vapply(
    posts_links,
    function(this_post) {
      read_html(this_post) %>%
        html_node("article .entry-content") %>%
        html_text()
    },
    character(1),
    USE.NAMES = FALSE
  ),
  error = function(e) character()
)

# Nothing retrieved: load the saved post contents. The previous check tested
# for a length of 1 and reloaded the list of links into posts_content.
# NOTE(review): assumes the saved copy is data/posts_content.Rds, following
# the data/<object>.Rds naming of the other fallbacks - confirm it exists.
if (length(posts_content) == 0) {
  posts_content <- readRDS("data/posts_content.Rds")
}
```

### Nettoyage

```{r}
library(tm.plugin.webmining)

# Run extractHTMLStrip() on every post and flatten the results into a vector.
clean_posts_content <- posts_content %>%
  lapply(extractHTMLStrip) %>%
  unlist()
```


## L'analyser

```{r elementr wordcloud}
library(tm)
library(wordcloud)
library(RColorBrewer)

# Build the corpus from the cleaned posts and normalise it: lower case, no
# punctuation, no French stop words. The corpus is stored on its own; when
# wordcloud() ended the same pipeline, textCorpus received the return value
# of wordcloud() rather than the corpus.
textCorpus <- Corpus(VectorSource(clean_posts_content)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(PlainTextDocument) %>%
  tm_map(removeWords, stopwords('french'))

# Draw the 100 most frequent words, most frequent first, coloured by
# frequency.
wordcloud(
  textCorpus,
  max.words = 100,
  random.order = FALSE,
  random.color = FALSE,
  colors = brewer.pal(6, "Dark2")
)
```