library(RCurl)
library("XML")
library("rjson")
# ---- Live fetch ----
# Download the App Annie "Google Play top charts / South Korea / overall" page
# and extract, for each of the five chart columns, the app name, the publisher
# and the id text into game_nm1..5, company1..5 and id1..5.
myurl <- "http://www.appannie.com/apps/google-play/top/south-korea/overall/"
cafile <- system.file("CurlSSL", "cacert.pem", package = "RCurl")

# Register the defaults BEFORE creating the handle: getCurlHandle() picks up
# getOption("RCurlOptions") when the handle is created, so a handle built
# earlier never sees the cookie / user-agent / redirect settings.
options(RCurlOptions = list(
  # `cainfo` takes a CA bundle file; `capath` expects a directory.
  cainfo = cafile,
  # NOTE(review): peer verification is still disabled, as in the original.
  # Consider switching this to TRUE now that a CA bundle is supplied.
  ssl.verifypeer = FALSE,
  verbose = TRUE,
  cookiejar = "my_cookies.txt",
  cookiefile = "my_cookies.txt",
  followlocation = TRUE,
  useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3"
))
# useragent : http://www.whatsmyuseragent.com/
curl <- getCurlHandle()

# NOTE(review): "proxyserver:port" looks like a placeholder; replace it with
# the real proxy (or drop this call) before running.
curlSetOpt(.opts = list(proxy = "proxyserver:port"), curl = curl)

# `.encoding` (with the leading dot) is RCurl's argument for decoding the
# response body; a bare `encoding` would be forwarded to libcurl as a request
# option. No `.opts` is passed here: the handle already carries the options.
mydata <- getURI(myurl, curl = curl, .encoding = "UTF-8")

# asText = TRUE makes the parser treat `mydata` as HTML content even when the
# body does not start with "<" (e.g. leading whitespace).
doc <- htmlParse(mydata, asText = TRUE, useInternalNodes = TRUE,
                 encoding = "UTF-8")

# Current local date as "YYYYMMDD".
yyyymmdd <- format(Sys.time(), "%Y%m%d")

# Column headers of the second table and the cell texts of the first table.
div <- xpathSApply(doc, "//table[2]//thead//tr//th", xmlValue)
ranking <- xpathSApply(doc, "//table[1]//tbody//tr//td", xmlValue)

# The second table has five chart columns; inside each cell the "main-info"
# div holds span[1]/a (app name), span[2]/a (publisher) and span[3] (id).
# Results are stored under numbered names because later code reads
# game_nm1..5, company1..5 and id1..5 directly.
for (col in seq_len(5)) {
  cell_xpath <- sprintf(
    "//table[2]//tbody//tr//td[%d]//div[@class='main-info']", col
  )
  assign(paste0("game_nm", col),
         xpathSApply(doc, paste0(cell_xpath, "//span[1]//a"), xmlValue))
  assign(paste0("company", col),
         xpathSApply(doc, paste0(cell_xpath, "//span[2]//a"), xmlValue))
  assign(paste0("id", col),
         xpathSApply(doc, paste0(cell_xpath, "//span[3]"), xmlValue))
}
# ---- HTML parsing: alternative path that reads a saved copy of the page from disk ----
library("XML")

# Parse a saved copy of the ranking page from disk and extract, for each of
# the five chart columns, the app name, publisher and id text into
# game_nm1..5, company1..5 and id1..5.
u <- "C:\\data\\test.htm"
doc <- htmlParse(u, useInternalNodes = TRUE)
#getNodeSet(doc, "//table[2]//tbody//tr//td[1]//div[@class='main-info']")[1]

# Cell texts of the first table.
ranking <- xpathSApply(doc, "//table[1]//tbody//tr//td", xmlValue)

# Inside each cell of the second table the "main-info" div holds span[1]/a
# (app name), span[2]/a (publisher) and span[3] (id). Results are stored under
# numbered names because the save step reads game_nm1..5, company1..5 and
# id1..5 directly.
for (col in seq_len(5)) {
  cell_xpath <- sprintf(
    "//table[2]//tbody//tr//td[%d]//div[@class='main-info']", col
  )
  assign(paste0("game_nm", col),
         xpathSApply(doc, paste0(cell_xpath, "//span[1]//a"), xmlValue))
  assign(paste0("company", col),
         xpathSApply(doc, paste0(cell_xpath, "//span[2]//a"), xmlValue))
  assign(paste0("id", col),
         xpathSApply(doc, paste0(cell_xpath, "//span[3]"), xmlValue))
}

# The first column used to be stored as `game_nm` while the save step reads
# `game_nm1`; keep the old name as an alias for anything still relying on it.
game_nm <- game_nm1
# ---- Save ----
# Append one batch of rows per chart column to dbo.google_play_ranking.
# Each batch has the columns ranking, game_nm, company, id and div, where
# `div` is a constant label identifying the chart column.

# Neither `conn` nor `sqlSave` is defined in this script; fail early with a
# clear message instead of a cryptic "object not found" in the middle of the
# inserts. (sqlSave is presumably RODBC's, with `conn` an open RODBC channel
# -- TODO confirm.)
if (!exists("conn") || !exists("sqlSave", mode = "function")) {
  stop("An open database connection `conn` and the function `sqlSave` ",
       "must be available before saving the rankings.", call. = FALSE)
}

# NOTE(review): the `div` labels below look mis-encoded (likely non-ASCII text
# damaged by a wrong file encoding). They are kept exactly as found; restore
# the intended labels from the original source before relying on them.
chart_columns <- list(
  list(div = "覓企", game_nm = game_nm1, company = company1, id = id1),
  list(div = "襭", game_nm = game_nm2, company = company2, id = id2),
  list(div = "", game_nm = game_nm3, company = company3, id = id3),
  list(div = " 覓企", game_nm = game_nm4, company = company4, id = id4),
  list(div = " 襭", game_nm = game_nm5, company = company5, id = id5)
)

for (i in seq_along(chart_columns)) {
  chart <- chart_columns[[i]]
  n_rows <- length(chart$game_nm)

  # Nothing was extracted for this column: skip it rather than building a
  # malformed data frame.
  if (n_rows == 0) {
    message("No rows extracted for chart column ", i, "; skipping.")
    next
  }

  rs <- data.frame(
    # Align the rank labels with this column's row count. Previously only the
    # fifth column did this, and via 1:length(), which breaks on empty input.
    ranking = ranking[seq_len(n_rows)],
    game_nm = chart$game_nm,
    company = chart$company,
    id = chart$id,
    div = rep(chart$div, n_rows)
  )
  sqlSave(conn, rs, tablename = "dbo.google_play_ranking",
          append = TRUE, fast = TRUE, rownames = FALSE)
}

# `shell()` only exists in R for Windows; guard it so the script does not fail
# at the very end on other platforms.
if (.Platform$OS.type == "windows") {
  shell("dir")
}