#title 데이터 수집

==== Crawling ====
{{{
# Download the App Annie Google Play top chart (South Korea, overall) and pull
# the ranking, app name, company and id text out of each chart column.
library(RCurl)
library("XML")
library("rjson")

myurl <- "http://www.appannie.com/apps/google-play/top/south-korea/overall/"

# CA bundle shipped with RCurl. It is a single file, so it belongs in `cainfo`
# (`capath` expects a directory of certificates).
cafile <- system.file("CurlSSL", "cacert.pem", package = "RCurl")

# Request options, built once so the same list can be registered as the
# session default and passed explicitly to the request below.
# NOTE(review): ssl.verifypeer = FALSE disables certificate verification;
# kept as in the original, enable it unless a proxy makes that impossible.
curl_opts <- list(
  cainfo = cafile,
  ssl.verifypeer = FALSE,
  verbose = TRUE,
  cookiejar = "my_cookies.txt",
  cookiefile = "my_cookies.txt",
  followlocation = TRUE,
  useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3"
)
options(RCurlOptions = curl_opts)
# Check your own user agent at: http://www.whatsmyuseragent.com/

curl <- getCurlHandle()

# Placeholder proxy: replace "proxyserver:port" with a real host:port, or
# delete this call when no proxy is needed.
curlSetOpt(.opts = list(proxy = "proxyserver:port"), curl = curl)

# The original passed `.opts = options` (the base R function) instead of the
# option list, and `encoding =`, which is forwarded as a curl option rather
# than used as getURI()'s text-encoding argument `.encoding`.
mydata <- getURI(myurl, .opts = curl_opts, curl = curl, .encoding = "UTF-8")

# `mydata` holds the HTML text itself, so parse it as text, not as a file name.
doc <- htmlParse(mydata, asText = TRUE, useInternalNodes = TRUE,
                 encoding = "UTF-8")

# Today's local date as "YYYYMMDD".
yyyymmdd <- format(Sys.time(), "%Y%m%d")

# Header cells of the second table and all cells of the first table.
div <- xpathSApply(doc, "//table[2]//thead//tr//th", xmlValue)
ranking <- xpathSApply(doc, "//table[1]//tbody//tr//td", xmlValue)

# Columns td[1]..td[5] of the second table; within each "main-info" div,
# span[1] holds the app name link, span[2] the company link, span[3] the id.
game_nm1 <- xpathSApply(doc, "//table[2]//tbody//tr//td[1]//div[@class='main-info']//span[1]//a", xmlValue)
company1 <- xpathSApply(doc, "//table[2]//tbody//tr//td[1]//div[@class='main-info']//span[2]//a", xmlValue)
id1 <- xpathSApply(doc, "//table[2]//tbody//tr//td[1]//div[@class='main-info']//span[3]", xmlValue)

game_nm2 <- xpathSApply(doc, "//table[2]//tbody//tr//td[2]//div[@class='main-info']//span[1]//a", xmlValue)
company2 <- xpathSApply(doc, "//table[2]//tbody//tr//td[2]//div[@class='main-info']//span[2]//a", xmlValue)
id2 <- xpathSApply(doc, "//table[2]//tbody//tr//td[2]//div[@class='main-info']//span[3]", xmlValue)

game_nm3 <- xpathSApply(doc, "//table[2]//tbody//tr//td[3]//div[@class='main-info']//span[1]//a", xmlValue)
company3 <- xpathSApply(doc, "//table[2]//tbody//tr//td[3]//div[@class='main-info']//span[2]//a", xmlValue)
id3 <- xpathSApply(doc, "//table[2]//tbody//tr//td[3]//div[@class='main-info']//span[3]", xmlValue)

game_nm4 <- xpathSApply(doc, "//table[2]//tbody//tr//td[4]//div[@class='main-info']//span[1]//a", xmlValue)
company4 <- xpathSApply(doc, "//table[2]//tbody//tr//td[4]//div[@class='main-info']//span[2]//a", xmlValue)
id4 <- xpathSApply(doc, "//table[2]//tbody//tr//td[4]//div[@class='main-info']//span[3]", xmlValue)

game_nm5 <- xpathSApply(doc, "//table[2]//tbody//tr//td[5]//div[@class='main-info']//span[1]//a", xmlValue)
company5 <- xpathSApply(doc, "//table[2]//tbody//tr//td[5]//div[@class='main-info']//span[2]//a", xmlValue)
id5 <- xpathSApply(doc, "//table[2]//tbody//tr//td[5]//div[@class='main-info']//span[3]", xmlValue)
}}}

http://stackoverflow.com/questions/22668144/how-correctly-use-request-header-with-api-data-requests
{{{
# Read every HTML table on a page into a list of data frames.
library("XML")

u <- "http://en.wikipedia.org/wiki/List_of_countries_by_population"
tables <- readHTMLTable(u)
names(tables)
tables[[1]]
}}}

앱애니 --> http://stackoverflow.com/questions/15168970/log-into-a-website-to-grab-the-data-using-rcurl

html parsing
{{{
# Parse a locally saved copy of the chart page and append one batch of rows
# per chart column to the database table dbo.google_play_ranking.
library("XML")

u <- "C:\\data\\test.htm"
doc <- htmlParse(u, useInternalNodes = TRUE)
#getNodeSet(doc, "//table[2]//tbody//tr//td[1]//div[@class='main-info']")[1]

ranking <- xpathSApply(doc, "//table[1]//tbody//tr//td", xmlValue)

# The first column was assigned to `game_nm` in the original, but the
# data.frame() call below reads `game_nm1`; the name is aligned here.
game_nm1 <- xpathSApply(doc, "//table[2]//tbody//tr//td[1]//div[@class='main-info']//span[1]//a", xmlValue)
company1 <- xpathSApply(doc, "//table[2]//tbody//tr//td[1]//div[@class='main-info']//span[2]//a", xmlValue)
id1 <- xpathSApply(doc, "//table[2]//tbody//tr//td[1]//div[@class='main-info']//span[3]", xmlValue)

game_nm2 <- xpathSApply(doc, "//table[2]//tbody//tr//td[2]//div[@class='main-info']//span[1]//a", xmlValue)
company2 <- xpathSApply(doc, "//table[2]//tbody//tr//td[2]//div[@class='main-info']//span[2]//a", xmlValue)
id2 <- xpathSApply(doc, "//table[2]//tbody//tr//td[2]//div[@class='main-info']//span[3]", xmlValue)

game_nm3 <- xpathSApply(doc, "//table[2]//tbody//tr//td[3]//div[@class='main-info']//span[1]//a", xmlValue)
company3 <- xpathSApply(doc, "//table[2]//tbody//tr//td[3]//div[@class='main-info']//span[2]//a", xmlValue)
id3 <- xpathSApply(doc, "//table[2]//tbody//tr//td[3]//div[@class='main-info']//span[3]", xmlValue)

game_nm4 <- xpathSApply(doc, "//table[2]//tbody//tr//td[4]//div[@class='main-info']//span[1]//a", xmlValue)
company4 <- xpathSApply(doc, "//table[2]//tbody//tr//td[4]//div[@class='main-info']//span[2]//a", xmlValue)
id4 <- xpathSApply(doc, "//table[2]//tbody//tr//td[4]//div[@class='main-info']//span[3]", xmlValue)

game_nm5 <- xpathSApply(doc, "//table[2]//tbody//tr//td[5]//div[@class='main-info']//span[1]//a", xmlValue)
company5 <- xpathSApply(doc, "//table[2]//tbody//tr//td[5]//div[@class='main-info']//span[2]//a", xmlValue)
id5 <- xpathSApply(doc, "//table[2]//tbody//tr//td[5]//div[@class='main-info']//span[3]", xmlValue)

# NOTE(review): `conn` is not created anywhere in this snippet, and sqlSave()
# looks like RODBC::sqlSave -- load RODBC and open the connection first.
# Each batch is tagged with its chart-column label in `div`.
rs <- data.frame(ranking, game_nm = game_nm1, company = company1, id = id1,
                 div = rep("무료", length(game_nm1)))
sqlSave(conn, rs, tablename = "dbo.google_play_ranking",
        append = TRUE, fast = TRUE, rownames = FALSE)

rs <- data.frame(ranking, game_nm = game_nm2, company = company2, id = id2,
                 div = rep("유료", length(game_nm2)))
sqlSave(conn, rs, tablename = "dbo.google_play_ranking",
        append = TRUE, fast = TRUE, rownames = FALSE)

rs <- data.frame(ranking, game_nm = game_nm3, company = company3, id = id3,
                 div = rep("수익", length(game_nm3)))
sqlSave(conn, rs, tablename = "dbo.google_play_ranking",
        append = TRUE, fast = TRUE, rownames = FALSE)

rs <- data.frame(ranking, game_nm = game_nm4, company = company4, id = id4,
                 div = rep("새 무료", length(game_nm4)))
sqlSave(conn, rs, tablename = "dbo.google_play_ranking",
        append = TRUE, fast = TRUE, rownames = FALSE)

# Only this column truncates `ranking` to its own length; seq_along() keeps
# the subset empty when nothing matched (1:0 would select an element).
rs <- data.frame(ranking = ranking[seq_along(game_nm5)], game_nm = game_nm5,
                 company = company5, id = id5,
                 div = rep("새 유료", length(game_nm5)))
sqlSave(conn, rs, tablename = "dbo.google_play_ranking",
        append = TRUE, fast = TRUE, rownames = FALSE)

# shell() exists only in R for Windows.
shell("dir")
}}}

{{{
# Text of every <span itemprop="genre"> element, whatever its namespace.
xpathSApply(doc, "//*[local-name() = 'span'][@itemprop='genre']", xmlValue)
}}}