#title K-Means Clustering [[TableOfContents]] ==== k값 결정 ==== {{{ n = 100 g = 6 set.seed(g) d <- data.frame(x = unlist(lapply(1:g, function(i) rnorm(n/g, runif(1)*i^2))), y = unlist(lapply(1:g, function(i) rnorm(n/g, runif(1)*i^2)))) plot(d) library(fpc) pamk.best <- pamk(d) pamk.best$nc }}} 다른 방법: {{{ NbClust(d, min.nc=2, max.nc=15, method="kmeans") }}} ==== 예제 ==== {{{ library("RODBC") conn <- odbcConnect("sql_server",uid="id", pwd="pw") x <- sqlQuery(conn, " select play_rate1h t01 , play_rate2h t02 , play_rate3h t03 , play_rate4h t04 , play_rate5h t05 , play_rate6h t06 , play_rate7h t07 , play_rate8h t08 , play_rate9h t09 , play_rate10h t10 , play_rate11h t11 , play_rate12h t12 , play_rate13h t13 , play_rate14h t14 , play_rate15h t15 , play_rate16h t16 , play_rate17h t17 , play_rate18h t18 , play_rate19h t19 , play_rate20h t20 , play_rate21h t21 , play_rate22h t22 , play_rate23h t23 , play_rate0h t24 from plays tablesample(10 percent) ") #kmeans clustering (cl <- kmeans(x, 8)) summary(cl) tmp <- data.frame(cl$centers, cluster=rownames(cl$centers)) #클러스터별 pdf를 그려보자. library(reshape) library(ggplot2) tmp <- melt(tmp, id=c("cluster")) tmp$variable <- as.numeric(gsub("t","", tmp$variable)) head(tmp) p <- ggplot(tmp, aes(x=variable, y=value, colour=factor(cluster))) p + geom_line() + geom_text(data=tmp, aes(x=variable, y=value, label=factor(cluster))) #pdf가 합쳐서 100이냐? library(sqldf) sqldf(" select cluster , sum(value) pdf , count(*) cnt from tmp group by cluster order by 1 ") clusters <- data.frame(cl$cluster) colnames(clusters) <- c("cl") head(clusters) cnt <- sqldf(" select cl , count(*) cnt from clusters group by cl order by 1 ") data.frame(cnt=cnt$cnt, prop=cnt$cnt / sum(cnt$cnt)) #test, predic x1 <- sqlQuery(conn, " select top 10 play_rate1h t01 , play_rate2h t02 , play_rate3h t03 , play_rate4h t04 , play_rate5h t05 , play_rate6h t06 , play_rate7h t07 , play_rate8h t08 , play_rate9h t09 , play_rate10h t10 , play_rate11h t11 , play_rate12h t12 , play_rate13h t13 , play_rate14h t14 , play_rate15h t15 , play_rate16h t16 , play_rate17h t17 , play_rate18h t18 , play_rate19h t19 , play_rate20h t20 , play_rate21h t21 , play_rate22h t22 , play_rate23h t23 , play_rate0h t24 from plays tablesample(1 percent) ") #install.packages("DeducerExtras") library("DeducerExtras") predict(cl, x1) }}} ==== 애니메이션(r code) ==== {{{ library("animation") kmeans.ani(x = cbind(X1 = runif(50), X2 = runif(50)), centers = 3, hints = c("Move centers!", "Find cluster?"), pch = 1:3, col = 1:3) }}} ==== 애니메이션(gif) ==== 출처: http://shabal.in/visuals/kmeans/1.html Starting with 4 left-most points attachment:K-MeansClustering/left.gif Starting with 4 right-most points attachment:K-MeansClustering/right.gif Starting with 4 top points attachment:K-MeansClustering/top.gif