# Crawling

library(RCurl)
library("XML")
library("rjson")

# Download the App Annie Google Play top chart page (South Korea, overall)
# through libcurl and parse the returned HTML into `doc`.
myurl <- "http://www.appannie.com/apps/google-play/top/south-korea/overall/"

# CA certificate bundle shipped with the RCurl package.
cafile <- system.file("CurlSSL", "cacert.pem", package = "RCurl")

# libcurl settings shared by the session defaults and the handle below.
curl_opts <- list(
  # cacert.pem is a single bundle file, so it goes in `cainfo`;
  # `capath` expects a directory of certificates.
  cainfo = cafile,
  # NOTE(review): peer verification is disabled, so HTTPS responses are not
  # authenticated. Kept as in the original; confirm whether this is still
  # required before relying on the downloaded data.
  ssl.verifypeer = FALSE,
  verbose = TRUE,
  # Cookies are read from and written back to the same file.
  cookiejar = "my_cookies.txt",
  cookiefile = "my_cookies.txt",
  followlocation = TRUE,
  useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3"
)
# useragent : http://www.whatsmyuseragent.com/

# Register the settings as session-wide RCurl defaults, then create the handle
# with the same settings so they are applied to it explicitly (the original
# created the handle before the defaults existed).
options(RCurlOptions = curl_opts)
curl <- getCurlHandle(.opts = curl_opts)

# Placeholder proxy address; replace "proxyserver:port" with a real proxy.
curlSetOpt(.opts = list(proxy = "proxyserver:port"), curl = curl)

# `.encoding` tells RCurl how to decode the response text. The handle already
# carries every option, so no `.opts` is passed here.
mydata <- getURI(myurl, curl = curl, .encoding = "UTF-8")
doc <- htmlParse(mydata, useInternalNodes = TRUE, encoding = "UTF-8")


# Extract the chart contents from the parsed page `doc`.

# Today's date in the local time zone, formatted as "yyyymmdd".
yyyymmdd <- format(Sys.time(), "%Y%m%d")

# Header cells of the second table.
div <- xpathSApply(doc, "//table[2]//thead//tr//th", xmlValue)

# Body cells of the first table.
ranking <- xpathSApply(doc, "//table[1]//tbody//tr//td", xmlValue)

# Text of the nodes matched by `tail` inside the 'main-info' div of body
# column `col` of the second table. Reads the global `doc` at call time.
main_info <- function(col, tail) {
  path <- sprintf(
    "//table[2]//tbody//tr//td[%d]//div[@class='main-info']//%s",
    col, tail
  )
  xpathSApply(doc, path, xmlValue)
}

# For each of the five columns: first span link, second span link, third span.
game_nm1 <- main_info(1L, "span[1]//a")
company1 <- main_info(1L, "span[2]//a")
id1 <- main_info(1L, "span[3]")

game_nm2 <- main_info(2L, "span[1]//a")
company2 <- main_info(2L, "span[2]//a")
id2 <- main_info(2L, "span[3]")

game_nm3 <- main_info(3L, "span[1]//a")
company3 <- main_info(3L, "span[2]//a")
id3 <- main_info(3L, "span[3]")

game_nm4 <- main_info(4L, "span[1]//a")
company4 <- main_info(4L, "span[2]//a")
id4 <- main_info(4L, "span[3]")

game_nm5 <- main_info(5L, "span[1]//a")
company5 <- main_info(5L, "span[2]//a")
id5 <- main_info(5L, "span[3]")



# http://stackoverflow.com/questions/22668144/how-correctly-use-request-header-with-api-data-requests
library("XML")
# Scratch example: read the HTML tables of a Wikipedia page by URL.
# NOTE(review): the URL is handed straight to readHTMLTable(); confirm this
# still works if the site redirects to HTTPS.
u <- "http://en.wikipedia.org/wiki/List_of_countries_by_population"

tables <- readHTMLTable(u)

# Inspect the element names, then the first element.
names(tables)
tables[[1L]]

# See --> http://stackoverflow.com/questions/15168970/log-into-a-website-to-grab-the-data-using-rcurl

# HTML parsing
library("XML")
# Parse a saved copy of the chart page from disk and extract the same fields
# as the live crawl.
u <- "C:\\data\\test.htm"
doc <- htmlParse(u, useInternalNodes = TRUE)
#getNodeSet(doc, "//table[2]//tbody//tr//td[1]//div[@class='main-info']")[1]

# Body cells of the first table.
ranking <- xpathSApply(doc, "//table[1]//tbody//tr//td", xmlValue)

# Text of the nodes matched by `tail` inside the 'main-info' div of body
# column `col` of the second table. Reads the global `doc` at call time.
main_info <- function(col, tail) {
  path <- sprintf(
    "//table[2]//tbody//tr//td[%d]//div[@class='main-info']//%s",
    col, tail
  )
  xpathSApply(doc, path, xmlValue)
}

# Column 1 is stored as `game_nm1` because the save code reads `game_nm1`;
# the original stored it only as `game_nm`, leaving `game_nm1` stale or
# undefined. `game_nm` is kept as an alias for anything still using that name.
game_nm1 <- main_info(1L, "span[1]//a")
game_nm <- game_nm1
company1 <- main_info(1L, "span[2]//a")
id1 <- main_info(1L, "span[3]")

game_nm2 <- main_info(2L, "span[1]//a")
company2 <- main_info(2L, "span[2]//a")
id2 <- main_info(2L, "span[3]")

game_nm3 <- main_info(3L, "span[1]//a")
company3 <- main_info(3L, "span[2]//a")
id3 <- main_info(3L, "span[3]")

game_nm4 <- main_info(4L, "span[1]//a")
company4 <- main_info(4L, "span[2]//a")
id4 <- main_info(4L, "span[3]")

game_nm5 <- main_info(5L, "span[1]//a")
company5 <- main_info(5L, "span[2]//a")
id5 <- main_info(5L, "span[3]")

# Append each of the five chart columns to dbo.google_play_ranking.
# `conn` and sqlSave() are not defined in this script; presumably an open
# RODBC connection and RODBC::sqlSave -- confirm they are available before
# running this section.
#
# NOTE(review): the `div` labels below look mis-encoded (one of them is
# empty). They are kept byte-for-byte because they are written to the
# database; restore the intended text from the original source encoding.
charts <- list(
  list(game_nm = game_nm1, company = company1, id = id1, div = "覓企"),
  list(game_nm = game_nm2, company = company2, id = id2, div = "襭"),
  list(game_nm = game_nm3, company = company3, id = id3, div = ""),
  list(game_nm = game_nm4, company = company4, id = id4, div = " 覓企"),
  list(game_nm = game_nm5, company = company5, id = id5, div = " 襭")
)

for (chart in charts) {
  n_rows <- length(chart[["game_nm"]])

  # Nothing was extracted for this chart: skip it rather than build a
  # malformed frame (the original `1:length(x)` yields c(1, 0) when x is empty).
  if (n_rows == 0L) {
    next
  }

  rs <- data.frame(
    # Take only as many rank values as this chart has entries; the original
    # did this for the fifth chart only.
    ranking = ranking[seq_len(n_rows)],
    game_nm = chart[["game_nm"]],
    company = chart[["company"]],
    id = chart[["id"]],
    div = rep(chart[["div"]], n_rows)
  )
  sqlSave(conn, rs, tablename = "dbo.google_play_ranking",
          append = TRUE, fast = TRUE, rownames = FALSE)
}



# Run `dir` through the command shell (shell() is the Windows-only R helper).
shell(cmd = "dir")

# Text of every <span> element carrying itemprop='genre'; the element is
# matched by local name.
xpathSApply(doc = doc,
            path = "//*[local-name() = 'span'][@itemprop='genre']",
            fun = xmlValue)