
Getting Data from the Web discussion March 7 2017

A 23 minute recording of a discussion of getting data from the web from a March 7th meeting: CSVs, HTML tables via rvest, fixed-width files embedded in web pages, and JSON.

You can watch the recording (23 minutes 11 seconds)

or download the recording to your computer (right click and choose the save/download link option) via the link:

march7.m4v (.m4v format, 102.2 MB)

and read the parts of the script below (code available when viewing the full article)

###
# package of the week
# rworldmap http://egallic.fr/maps-with-r/
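# a minimal sketch of the package in action (not from the talk):
# getMap() returns world country polygons that plot() can draw directly
library(rworldmap)
worldmap <- getMap(resolution = "low")
plot(worldmap)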

################################
## They just give you a CSV
### https://earthquake.usgs.gov/earthquakes/search/
library(lubridate)
library(dplyr)
go <- Sys.time() # note the start time; hundreds of downloads take a while
# build week-long query windows covering Sept 2011 to Sept 2016
begin <- as.POSIXct("2011-09-01 00:00:00", tz="UTC")
end <- as.POSIXct("2016-09-15 00:00:00", tz="UTC")
breaks <- seq(from=begin, to=end, by="week")
first <- breaks[-length(breaks)]   # window starts
second <- breaks[-1] - seconds(1)  # window ends, one second before the next start
# %20 encodes the space between date and time in the query string
mid_url <- paste0("starttime=", format(first, "%Y-%m-%d"), "%20", format(first, "%H:%M:%S"),
 "&endtime=", format(second, "%Y-%m-%d"), "%20", format(second, "%H:%M:%S"))
# paste0 (not paste) so no stray spaces break the URLs
full_url <- paste0("http://earthquake.usgs.gov/fdsnws/event/1/query.csv?", mid_url,
 "&maxlatitude=50&minlatitude=24.6&maxlongitude=-65&minlongitude=-125&minmagnitude=0&eventtype=earthquake&orderby=time")
# read each weekly CSV straight off the web, then stack the results
usgs <- lapply(full_url, read.csv, stringsAsFactors=FALSE)
equsa <- bind_rows(usgs)
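# a quick sanity check on the combined data (a sketch assuming the standard
# USGS CSV columns: time, latitude, longitude, mag, ...)
nrow(equsa)
range(ymd_hms(equsa$time))  # lubridate's ymd_hms parses the ISO 8601 times
summary(equsa$mag)
Sys.time() - go             # how long the downloads took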

##########################
## HTML tables
# Example: Wikipedia
library(dplyr)
library(rvest)
url <- "http://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_New_York_City"
# Goal: the restaurant name and borough (the first two columns)
# read_html downloads the page, html_nodes collects every <table>,
# and html_table turns each one into a data frame (fill = TRUE pads ragged rows)
htmltables <- url %>% read_html() %>% html_nodes("table") %>% html_table(fill = TRUE)
length(htmltables)
# check which table has the data
head(htmltables[[1]])
head(htmltables[[2]])
# etc. through the number of tables
data_I_want <- htmltables[[1]]
names(data_I_want) <- make.names(names(data_I_want)) # convert to R-safe names
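# a sketch of the name-and-borough goal above, assuming those really are
# the first two columns (verify with the head() calls first)
name_borough <- data_I_want %>% select(1, 2)
head(name_borough)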

#####################
## embedded fixed-width file
# http://aa.usno.navy.mil/data/index.php
library(readr)
url <- "http://aa.usno.navy.mil/cgi-bin/aa_rstablew.pl?ID=AA&year=2012&task=1&place=Wellington&lon_sign=1&lon_deg=174&lon_min=46&lat_sign=-1&lat_deg=41&lat_min=17&tz=0&tz_sign=0"
# the page embeds a plain-text rise/set table: a 4-character day column,
# then alternating 5- and 6-character columns for the 12 months;
# skip = 31 jumps past the page header, n_max = 31 reads the 31 days
yr <- read_fwf(url, fwf_widths(c(4, rep(c(5, 6), 12))), skip=31, n_max=31)
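# a sketch of labelling the 25 columns, assuming the layout is one day
# column followed by a rise/set pair for each month (check against the page)
names(yr) <- c("day", paste0(rep(month.abb, each = 2), c("_rise", "_set")))
head(yr[, 1:5])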

######################
## JSON
### http://www.xeno-canto.org
library(jsonlite)
# the API serves results a page at a time, so fetch pages 1 and 2
nzurl <- "http://www.xeno-canto.org/api/2/recordings?species_nr=&query=+cnt%3A%22New+Zealand%22"
download.file(nzurl, destfile="p1.JSON")
nzurl <- "http://www.xeno-canto.org/api/2/recordings?species_nr=&query=+cnt%3A%22New+Zealand%22&page=2"
download.file(nzurl, destfile="p2.JSON")
# fromJSON converts the saved JSON into R lists and data frames
p1 <- fromJSON("p1.JSON")
p2 <- fromJSON("p2.JSON")
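# a sketch of combining the two pages, assuming the documented response
# shape: a numPages count plus a recordings data frame per page
p1$numPages  # how many pages the query has in total
nz_birds <- bind_rows(p1$recordings, p2$recordings)  # bind_rows from dplyr, loaded above
nrow(nz_birds)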

## Alternative: there is a package
### WDI
### spend the time working out the codes for the dataset you want
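# a sketch assuming the package meant is WDI (World Bank indicators):
# WDIsearch() finds the indicator codes, WDI() downloads the series
library(WDI)
head(WDIsearch("gdp per capita"))
gdp <- WDI(country = "NZ", indicator = "NY.GDP.PCAP.CD", start = 2000, end = 2015)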

## Alternative curl
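# a sketch with the curl package (an assumed use case: a site that wants
# custom headers such as a User-Agent before it serves the file;
# example.com stands in for a real data URL)
library(curl)
h <- new_handle()
handle_setheaders(h, "User-Agent" = "R (data request)")
curl_download("https://example.com/data.csv", destfile = "data.csv", handle = h)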

## Alternative: Selenium
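# a sketch with RSelenium (assumes a Selenium server already running on
# localhost:4444; for pages that build their content with JavaScript)
library(RSelenium)
remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4444L, browserName = "firefox")
remDr$open()
remDr$navigate("https://example.com")
page <- remDr$getPageSource()[[1]]  # the rendered HTML; parse with rvest::read_html()
remDr$close()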
