#title Naive Bayes [[TableOfContents]] ==== 나이브? ==== 나이브로 불리는 이유는 결합하는 확률이 서로 독립(independence)이라는 가정이 있기때문이다. 사실은 미약하게나마 개별 사건에 대한 확률이 다른 확률에 영향을 미칠수 있지만, 무시하고 그냥 단순하게 두 사건이 서로 독립이라고 가정하고(즉, 아무런 영향을 주지 않는다고 가정하고) 두 사건의 결합 확률을 구했을때 그냥 두 확률값의 곱으로만 표현해서 결합 확률(joint probability)를 구하는 것이다. --http://abipictures.tistory.com/716 ==== 예제 ==== 다음과 같은 데이터가 있다. {{{ Admit Gender Dept Freq 1 Admitted Male A 512 2 Rejected Male A 313 3 Admitted Female A 89 4 Rejected Female A 19 }}} 조건부 확률을 계산해 보면.. * P(Admitted | Male, A) = 512 / (512 + 313) = 0.6206061 * P(Admitted | Female, A) = 89 / (89 + 19) = 0.8240741 * ... {{{ library(e1071) classifier <- naiveBayes(Admit ~ Gender + Dept, data=UCBAdmissions) classifier }}} ==== 예제 ==== {{{ install.packages("klaR") library("klaR") data(iris) model<- NaiveBayes(Species ~ ., data = iris) plot(model) }}} {{{ install.packages("e1071") library("e1071") install.packages("mlbench") #HouseVotes84, 참고자료를 참고하라. library("mlbench") data(HouseVotes84) head(HouseVotes84) model <- naiveBayes(Class ~ ., data = HouseVotes84) predict(model, HouseVotes84[1:10,-1]) predict(model, HouseVotes84[1:10,-1], type = "raw") pred <- predict(model, HouseVotes84[,-1]) table(pred, HouseVotes84$Class) ## Example of using a contingency table: data(Titanic) m <- naiveBayes(Survived ~ ., data = Titanic) m predict(m, as.data.frame(Titanic)[,1:3]) ## Example with metric predictors: data(iris) m <- naiveBayes(Species ~ ., data = iris) ## alternatively: m <- naiveBayes(iris[,-5], iris[,5]) m table(predict(m, iris[,-5]), iris[,5]) }}} ==== code ==== https://github.com/baotong/code-recipes/blob/master/R/naiveBayes.R {{{ naiveBayes <- function(x, ...) UseMethod("naiveBayes") naiveBayes.default <- function(x, y, laplace = 0, ...) { call <- match.call() Yname <- deparse(substitute(y)) x <- as.data.frame(x) ## estimation-function est <- function(var) if (is.numeric(var)) { cbind(tapply(var, y, mean, na.rm = TRUE), tapply(var, y, sd, na.rm = TRUE)) } else { tab <- table(y, var) (tab + laplace) / (rowSums(tab) + laplace * nlevels(var)) } ## create tables apriori <- table(y) tables <- lapply(x, est) ## fix dimname names for (i in 1:length(tables)) names(dimnames(tables[[i]])) <- c(Yname, colnames(x)[i]) names(dimnames(apriori)) <- Yname structure(list(apriori = apriori, tables = tables, levels = levels(y), call = call ), class = "naiveBayes" ) } naiveBayes.formula <- function(formula, data, laplace = 0, ..., subset, na.action = na.pass) { call <- match.call() Yname <- as.character(formula[[2]]) if (is.data.frame(data)) { ## handle formula m <- match.call(expand.dots = FALSE) m$... <- NULL m$laplace = NULL m$na.action <- na.action m[[1]] <- as.name("model.frame") m <- eval(m, parent.frame()) Terms <- attr(m, "terms") if (any(attr(Terms, "order") > 1)) stop("naiveBayes cannot handle interaction terms") Y <- model.extract(m, "response") X <- m[,-attr(Terms, "response"), drop = FALSE] return(naiveBayes(X, Y, laplace = laplace, ...)) } else if (is.array(data)) { nam <- names(dimnames(data)) ## Find Class dimension Yind <- which(nam == Yname) ## Create Variable index deps <- strsplit(as.character(formula)[3], ".[+].")[[1]] if (length(deps) == 1 && deps == ".") deps <- nam[-Yind] Vind <- which(nam %in% deps) ## create tables apriori <- margin.table(data, Yind) tables <- lapply(Vind, function(i) (margin.table(data, c(Yind, i)) + laplace) / (as.numeric(apriori) + laplace * dim(data)[i])) names(tables) <- nam[Vind] structure(list(apriori = apriori, tables = tables, levels = names(apriori), call = call ), class = "naiveBayes" ) } else stop("naiveBayes formula interface handles data frames or arrays only") } print.naiveBayes <- function(x, ...) { cat("\nNaive Bayes Classifier for Discrete Predictors\n\n") cat("Call:\n") print(x$call) cat("\nA-priori probabilities:\n") print(x$apriori / sum(x$apriori)) cat("\nConditional probabilities:\n") for (i in x$tables) {print(i); cat("\n")} } predict.naiveBayes <- function(object, newdata, type = c("class", "raw"), threshold = 0.001, ...) { type <- match.arg(type) newdata <- as.data.frame(newdata) attribs <- which(names(object$tables) %in% names(newdata)) isnumeric <- sapply(newdata, is.numeric) newdata <- data.matrix(newdata) L <- sapply(1:nrow(newdata), function(i) { ndata <- newdata[i,] L <- log(object$apriori) + apply(log(sapply(attribs, function(v) { nd <- ndata[v] if(is.na(nd)) rep(1, length(object$apriori)) else { prob <- if (isnumeric[v]) { msd <- object$tables[[v]] msd[,2][msd[,2]==0] <- threshold dnorm(nd, msd[,1], msd[,2]) } else object$tables[[v]][,nd] prob[prob == 0] <- threshold prob } })), 1, sum) if (type == "class") L else { L <- exp(L) L / sum(L) } }) if (type == "class") factor(object$levels[apply(L, 2, which.max)], levels = object$levels) else t(L) } }}} ==== 참고자료 ==== * http://abipictures.tistory.com/716 * http://rgm3.lab.nig.ac.jp/RGM/R_rdfile?f=mlbench/man/HouseVotes84.Rd&d=R_CC --> HouseVotes84 설명