library("RODBC")
conn <- odbcConnect("sql_server",uid="id", pwd="pw")
x <- sqlQuery(conn, "
select
play_rate1h t01
, play_rate2h t02
, play_rate3h t03
, play_rate4h t04
, play_rate5h t05
, play_rate6h t06
, play_rate7h t07
, play_rate8h t08
, play_rate9h t09
, play_rate10h t10
, play_rate11h t11
, play_rate12h t12
, play_rate13h t13
, play_rate14h t14
, play_rate15h t15
, play_rate16h t16
, play_rate17h t17
, play_rate18h t18
, play_rate19h t19
, play_rate20h t20
, play_rate21h t21
, play_rate22h t22
, play_rate23h t23
, play_rate0h t24
from plays tablesample(10 percent)
")
#kmeans clustering
(cl <- kmeans(x, 8))
summary(cl)
tmp <- data.frame(cl$centers, cluster=rownames(cl$centers))
#企ろ磯 pdf襯 蠏碁る慨.
library(reshape)
library(ggplot2)
tmp <- melt(tmp, id=c("cluster"))
tmp$variable <- as.numeric(gsub("t","", tmp$variable))
head(tmp)
p <- ggplot(tmp, aes(x=variable, y=value, colour=factor(cluster)))
p + geom_line() + geom_text(data=tmp, aes(x=variable, y=value, label=factor(cluster)))
#pdf螳 豎 100企?
library(sqldf)
sqldf("
select
cluster
, sum(value) pdf
, count(*) cnt
from tmp
group by
cluster
order by 1
")
clusters <- data.frame(cl$cluster)
colnames(clusters) <- c("cl")
head(clusters)
cnt <- sqldf("
select
cl
, count(*) cnt
from clusters
group by
cl
order by 1
")
data.frame(cnt=cnt$cnt, prop=cnt$cnt / sum(cnt$cnt))
#test, predic
x1 <- sqlQuery(conn, "
select top 10
play_rate1h t01
, play_rate2h t02
, play_rate3h t03
, play_rate4h t04
, play_rate5h t05
, play_rate6h t06
, play_rate7h t07
, play_rate8h t08
, play_rate9h t09
, play_rate10h t10
, play_rate11h t11
, play_rate12h t12
, play_rate13h t13
, play_rate14h t14
, play_rate15h t15
, play_rate16h t16
, play_rate17h t17
, play_rate18h t18
, play_rate19h t19
, play_rate20h t20
, play_rate21h t21
, play_rate22h t22
, play_rate23h t23
, play_rate0h t24
from plays tablesample(1 percent)
")
#install.packages("DeducerExtras")
library("DeducerExtras")
predict(cl, x1)