forked from info201a-s18/mini-demos
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebscraper-demo.R
121 lines (68 loc) · 2.93 KB
/
webscraper-demo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
## Let's make a webscraper!
## Sources:
## https://www.analyticsvidhya.com/blog/2017/03/beginners-guide-on-web-scraping-in-r-using-rvest-with-hands-on-knowledge/
## https://www.rdocumentation.org/packages/rvest/versions/0.3.2/topics/html_nodes
## https://www.rdocumentation.org/packages/rvest/versions/0.3.2/topics/html_text
## Uncomment this to install packages
#install.packages('rvest')
# Load in 'rvest' package
library('rvest')
# Specify the URL endpoint we are using.
# The query string asks for count=100 results, release_date=2016,2016 and
# title_type=feature.
url <- 'http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature'

# Download the page and parse it into a document that can be queried.
webpage <- read_html(url)

# html_nodes: extract pieces out of HTML documents using XPath and CSS selectors
# html_text: extract the text of the selected nodes

# Select every node carrying the "text-primary" class and keep its text.
# NOTE(review): presumably these are the ranking labels of the result list
# (going by the variable names) -- confirm against the page markup.
rank_data_html <- html_nodes(webpage, ".text-primary")
rank_data <- html_text(rank_data_html)

# Look at the raw (character) values
head(rank_data)

# Data-Preprocessing: converting the scraped text to numerical.
# Entries that cannot be parsed as numbers become NA (with a warning).
rank_data <- as.numeric(rank_data)

# Look at the converted values
head(rank_data)
# Titles: select the <a> elements nested inside nodes with the
# "lister-item-header" class, then reduce them to their text.
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
title_data <- title_data_html %>% html_text()

# Preview the first few titles
head(title_data)

# Descriptions: select nodes with the "text-muted" class that directly follow
# a sibling with the "ratings-bar" class, then reduce them to their text.
description_data_html <- webpage %>% html_nodes('.ratings-bar+ .text-muted')
description_data <- description_data_html %>% html_text()

# Preview the first few descriptions
head(description_data)

# Data-Preprocessing: strip every newline character from the descriptions
description_data <- gsub("\n", "", description_data, fixed = TRUE)
# ---- Sections still to be written ----
# The runtime, genre, rating, directors and actors steps below are outlined
# as comments only; no code exists for them yet.

#Using CSS selectors to scrape the Movie runtime section
#Converting the movie runtime data to text
#Let's have a look at the movie runtime
#Data-Preprocessing: removing mins and converting it to numerical
#Let's have another look at the runtime data

#Using CSS selectors to scrape the genre section
#Converting the genre data to text
#Let's have a look at the genre
#Data-Preprocessing: removing \n
#Data-Preprocessing: removing excess spaces
#taking only the first genre of each movie
#Converting each genre from text to factor
#Let's have another look at the genre data

#Using CSS selectors to scrape the IMDB rating section
#Converting the ratings data to text
#Let's have a look at the ratings
#Data-Preprocessing: converting ratings to numerical
#Let's have another look at the ratings data

#Using CSS selectors to scrape the directors section
#Converting the directors data to text
#Let's have a look at the directors data
#Data-Preprocessing: converting directors data into factors

#Using CSS selectors to scrape the actors section
#Converting the actors data to text
#Let's have a look at the actors data
#Data-Preprocessing: converting actors data into factors

# ---- Gross revenue ----
# gross_data has to be created before it is inspected: the original script
# called length(gross_data) without ever assigning it, which stops the run
# with "object 'gross_data' not found".

#Using CSS selectors to scrape the gross revenue section
# NOTE(review): this selector comes from the analyticsvidhya tutorial listed
# in the Sources at the top of this file -- confirm it still matches the page.
gross_data_html <- html_nodes(webpage, ".ghost~ .text-muted+ span")

#Converting the gross revenue data to text
gross_data <- html_text(gross_data_html)

#Let's have a look at the gross data
head(gross_data)

#Data-Preprocessing: removing '$' and 'M' signs
gross_data <- gsub("[$M]", "", gross_data)

#Let's check the length of gross data
length(gross_data)

#Filling missing entries with NA
# TODO: not implemented yet.
# NOTE(review): html_nodes() only returns nodes that exist, so gross_data may
# hold fewer entries than title_data and may not line up with it by position
# -- verify before combining the vectors.

#Data-Preprocessing: converting gross to numerical
# Entries that cannot be parsed as numbers become NA (with a warning).
gross_data <- as.numeric(gross_data)

#Let's have another look at the length of gross data
length(gross_data)

#library('ggplot2')
# let's draw some plots!