#title K-Means Clustering
[[TableOfContents]]

==== k값 결정 ====
{{{
# Simulate g groups of 2-D points so the "how many clusters?" tools below
# have something to work on.
n <- 100
g <- 6
set.seed(g)

# rnorm() silently truncates a fractional count, so the original n / g
# (16.67) produced 16 points per group; integer division states that outright.
per_group <- n %/% g

# One coordinate for every group: group i is centred at runif(1) * i^2 and
# uses rnorm()'s default standard deviation.
draw_coord <- function() {
  unlist(lapply(seq_len(g), function(i) rnorm(per_group, mean = runif(1) * i^2)))
}

# x is drawn before y, which keeps the random-number stream in the same order
d <- data.frame(x = draw_coord(), y = draw_coord())
plot(d)

# Let fpc::pamk() choose the number of clusters for d, then show the
# cluster count it settled on.
library(fpc)
pamk.best <- pamk(data = d)
pamk.best[["nc"]]
}}}

다른 방법: NbClust 패키지로 여러 k 후보를 비교해 k를 결정한다.
{{{
# NbClust() comes from the NbClust package, which has to be attached before
# the call can be resolved.
library(NbClust)
# Compare candidate cluster counts from 2 to 15 using k-means partitions of d
NbClust(d, min.nc = 2, max.nc = 15, method = "kmeans")
}}}
==== 예제 ====
{{{
# Open the "sql_server" ODBC data source and pull the rows to be clustered.
library("RODBC")
conn <- odbcConnect("sql_server",uid="id", pwd="pw")
# odbcConnect() returns -1 with a warning (rather than raising an error) when
# the connection cannot be opened; stop here instead of at the first query.
if (!inherits(conn, "RODBC")) {
  stop("could not open ODBC connection 'sql_server'", call. = FALSE)
}

# The query reads the plays table through tablesample(10 percent) and aliases
# play_rate1h..play_rate23h as t01..t23, with play_rate0h as t24.
x <- sqlQuery(conn, "
select
     play_rate1h	t01
,	play_rate2h	t02
,	play_rate3h	t03
,	play_rate4h	t04
,	play_rate5h	t05
,	play_rate6h	t06
,	play_rate7h	t07
,	play_rate8h	t08
,	play_rate9h	t09
,	play_rate10h	t10
,	play_rate11h	t11
,	play_rate12h	t12
,	play_rate13h	t13
,	play_rate14h	t14
,	play_rate15h	t15
,	play_rate16h	t16
,	play_rate17h	t17
,	play_rate18h	t18
,	play_rate19h	t19
,	play_rate20h	t20
,	play_rate21h	t21
,	play_rate22h	t22
,	play_rate23h	t23
,	play_rate0h	t24
from plays tablesample(10 percent)
")
# sqlQuery() signals failure by returning a character vector of messages, not
# by raising an error; surface it before the result is handed to kmeans().
if (!is.data.frame(x)) {
  stop("training query failed: ", paste(x, collapse = "\n"), call. = FALSE)
}

# kmeans clustering
# kmeans() chooses its starting centres at random. Fixing the seed makes the
# cluster numbering reproducible between runs, and nstart keeps the best of
# several random starts instead of a single, possibly poor, local optimum.
set.seed(1)
(cl <- kmeans(x, centers = 8, nstart = 10))
summary(cl)
# One row per cluster centre, with the cluster id carried along as a column
tmp <- data.frame(cl$centers, cluster = rownames(cl$centers))


# Plot the pdf of each cluster (one line per cluster centre).
library(reshape)
library(ggplot2)

# Long format: one row per (cluster, variable) pair
tmp <- melt(tmp, id.vars = "cluster")
# Drop the "t" from the column labels so they can serve as numeric x positions
tmp$variable <- as.numeric(gsub("t", "", tmp$variable))
head(tmp)

p <- ggplot(tmp, aes(x = variable, y = value, colour = factor(cluster)))
# geom_text() inherits the data and x/y mapping from ggplot(); only the label
# aesthetic has to be added.
p + geom_line() + geom_text(aes(label = factor(cluster)))


# Does each cluster's pdf sum to 100?
# Totals `value` per cluster (column pdf) and counts the rows behind each
# total (column cnt), ordered by cluster.
library(sqldf)
sqldf("
    select
        cluster
    ,   sum(value) pdf
    ,   count(*) cnt
    from tmp
    group by
        cluster
    order by 1
")


# Cluster assignment from the kmeans fit, as a one-column data frame named "cl"
clusters <- data.frame(cl = cl$cluster)
head(clusters)

# Number of rows assigned to each cluster
cnt <- sqldf("
    select
        cl
    ,   count(*) cnt
    from clusters
    group by
        cl
    order by 1
")
# Cluster sizes next to their share of all rows
data.frame(cnt = cnt$cnt, prop = prop.table(cnt$cnt))

# Test / predict: fetch a few more rows and assign them to the fitted clusters.
# Same column aliases (t01..t24) as the training query, limited to the top 10
# rows read through tablesample(1 percent).
x1 <- sqlQuery(conn, "
select top 10
    play_rate1h    t01
,	play_rate2h	t02
,	play_rate3h	t03
,	play_rate4h	t04
,	play_rate5h	t05
,	play_rate6h	t06
,	play_rate7h	t07
,	play_rate8h	t08
,	play_rate9h	t09
,	play_rate10h	t10
,	play_rate11h	t11
,	play_rate12h	t12
,	play_rate13h	t13
,	play_rate14h	t14
,	play_rate15h	t15
,	play_rate16h	t16
,	play_rate17h	t17
,	play_rate18h	t18
,	play_rate19h	t19
,	play_rate20h	t20
,	play_rate21h	t21
,	play_rate22h	t22
,	play_rate23h	t23
,	play_rate0h	t24
from plays tablesample(1 percent)
")
# This is the last query in the script, so release the connection now; it is
# closed before the result check so a failed query does not leak it.
odbcClose(conn)
# sqlQuery() reports failure by returning a character vector of messages
if (!is.data.frame(x1)) {
  stop("test query failed: ", paste(x1, collapse = "\n"), call. = FALSE)
}

# install.packages("DeducerExtras")
# NOTE(review): DeducerExtras is presumably what supplies the predict() method
# for kmeans objects used below - confirm.
library("DeducerExtras")
predict(cl, x1)
}}}

==== 애니메이션(r code) ====
{{{
# Animated k-means demo on 50 points drawn uniformly at random in two
# dimensions, with 3 centres.
library(animation)
kmeans.ani(
  x = cbind(X1 = runif(50), X2 = runif(50)),
  centers = 3,
  hints = c("Move centers!", "Find cluster?"),
  pch = 1:3,
  col = 1:3
)
}}}

==== 애니메이션(gif) ====
출처: http://shabal.in/visuals/kmeans/1.html

Starting with 4 left-most points
attachment:K-MeansClustering/left.gif

Starting with 4 right-most points
attachment:K-MeansClustering/right.gif

Starting with 4 top points
attachment:K-MeansClustering/top.gif