-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathsimplePageScrape.R
More file actions
84 lines (64 loc) · 2.55 KB
/
simplePageScrape.R
File metadata and controls
84 lines (64 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#
# scraping a simple html page
# Page has 4 people with contact information. We want to get them into a dataframe
#
library(rvest)
# Parse the local HTML file into a document object
pg <- read_html("simplePage.html")

# Names: a leading "#" in a CSS selector matches on the id attribute
name <- pg %>%
  html_nodes("#name") %>%
  html_text()
name

# Same lookup written as XPath: div elements with id 'name' nested under an li
names2 <- pg %>%
  html_nodes(xpath = "//li//div[@id = 'name']") %>%
  html_text()
names2

# Links: the href attribute of every anchor element in the page
link <- pg %>%
  html_nodes("a") %>%
  html_attr("href")
link
# Addresses and cities
# A leading "." in a CSS selector matches on the class attribute
addr <- pg %>%
  html_nodes(".add") %>%
  html_text()
addr

# span elements below an element of class "city" that sits inside an li
city <- pg %>%
  html_nodes("li .city span") %>%
  html_text()
city

# State and zip code: p elements selected by their geo / geocode attributes
state <- pg %>%
  html_nodes("p[geo = 'state']") %>%
  html_text()
state

zipcode <- pg %>%
  html_nodes("p[geocode = 'zip']") %>%
  html_text()
zipcode
# Alternative for pages where the geo and geocode attributes are missing.
# A plain "div p" selector is not enough: it returns both the state and the
# zip code paragraphs together.
state2 <- pg %>%
  html_nodes("div p") %>%
  html_text()
state2

# XPath can pick by position instead: p[1] is the first paragraph of each
# div with class 'city' (the state), p[2] the second (the zip code).
state2 <- pg %>%
  html_nodes(xpath = "//div[@class = 'city']/p[1]") %>%
  html_text()
state2

zip2 <- pg %>%
  html_nodes(xpath = "//div[@class = 'city']/p[2]") %>%
  html_text()
zip2
# Get phone numbers
phone <- pg %>%
  html_nodes(".comm p[type = 'phone']") %>%
  html_text()
# Problem: only 2 addresses have phone numbers, so this vector is shorter than
# the other fields and cannot be aligned with them.
phone

# Workaround: query each <li> entry on its own so every entry yields a result
addrnodes <- pg %>% html_nodes("li")
addrnodes

# Collapse one per-entry result to a single string: NA_character_ when nothing
# matched, otherwise the first match. Always returning exactly one character
# value keeps the final vectors type-stable (never a list, never logical NA).
first_or_na <- function(x) {
  if (length(x) == 0) NA_character_ else x[[1]]
}

# lapply() always returns a list, one element per entry
phone <- lapply(addrnodes, function(x) {
  x %>%
    html_nodes(".comm p[type = 'phone']") %>%
    html_text()
})
phone # has length 0 elements where the phone doesn't exist

# vapply() enforces one character value per entry; missing phones become NA
phone <- vapply(phone, first_or_na, character(1))
phone

# Get email addresses the same way
email <- lapply(addrnodes, function(x) {
  x %>%
    html_nodes(".comm p[type = 'email']") %>%
    html_text()
})
email <- vapply(email, first_or_na, character(1))
email
# Create a data frame with the address data.
# data.frame() silently recycles a shorter column when its length divides the
# longest one, which would pair values that belong to different entries, so
# verify that every scraped field has the same length before combining them.
addr_fields <- list(
  name = name, addr = addr, city = city, state = state, zipcode = zipcode,
  phone = phone, email = email, link = link
)
addr_field_lengths <- vapply(addr_fields, length, integer(1))
if (length(unique(addr_field_lengths)) != 1L) {
  stop(
    "Scraped fields have differing lengths: ",
    paste(
      names(addr_field_lengths), addr_field_lengths,
      sep = "=", collapse = ", "
    ),
    call. = FALSE
  )
}

# stringsAsFactors = FALSE keeps the scraped text as character columns on
# R versions before 4.0, where the default was to convert strings to factors.
addrdf <- data.frame(addr_fields, stringsAsFactors = FALSE)
addrdf
#
# Scraping an HTML table
# Example: the list of packages from the CRAN site
#
url <- "http://cran.us.r-project.org/web/packages/available_packages_by_date.html"
cranpg <- read_html(url)

# Convert every table node found in the page; the first result is previewed
cranlist <- cranpg %>%
  html_nodes("table") %>%
  html_table()
head(cranlist[[1]])

# Used RStudio 0.99.489
sessionInfo()