#title Naive Bayes
[[TableOfContents]]

==== 나이브? ====
나이브로 불리는 이유는 결합하는 확률이 서로 독립(independence)이라는 가정이 있기때문이다. 사실은 미약하게나마 개별 사건에 대한 확률이 다른 확률에 영향을 미칠수 있지만, 무시하고 그냥 단순하게 두 사건이 서로 독립이라고 가정하고(즉, 아무런 영향을 주지 않는다고 가정하고) 두 사건의 결합 확률을 구했을때 그냥 두 확률값의 곱으로만 표현해서 결합 확률(joint probability)를 구하는 것이다. --http://abipictures.tistory.com/716

==== 예제 ====
다음과 같은 데이터가 있다.
{{{
Admit Gender Dept Freq
1  Admitted   Male    A  512
2  Rejected   Male    A  313
3  Admitted Female    A   89
4  Rejected Female    A   19
}}}

조건부 확률을 계산해 보면..
 * P(Admitted | Male, A) = 512 / (512 + 313) = 0.6206061
 * P(Admitted | Female, A) = 89 / (89 + 19) = 0.8240741
 * ...

{{{
library(e1071)
classifier <- naiveBayes(Admit ~ Gender + Dept, data=UCBAdmissions)
classifier
}}}
==== 예제 ====
{{{
install.packages("klaR")
library("klaR")
data(iris)
model<- NaiveBayes(Species ~ ., data = iris)
plot(model)
}}}

{{{
install.packages("e1071")
library("e1071")

install.packages("mlbench") #HouseVotes84, 참고자료를 참고하라.
library("mlbench")

data(HouseVotes84)
head(HouseVotes84)

model <- naiveBayes(Class ~ ., data = HouseVotes84)
predict(model, HouseVotes84[1:10,-1])
predict(model, HouseVotes84[1:10,-1], type = "raw")

pred <- predict(model, HouseVotes84[,-1])
table(pred, HouseVotes84$Class)

## Example of using a contingency table:
data(Titanic)
m <- naiveBayes(Survived ~ ., data = Titanic)
m
predict(m, as.data.frame(Titanic)[,1:3])

## Example with metric predictors:
data(iris)
m <- naiveBayes(Species ~ ., data = iris)
## alternatively:
m <- naiveBayes(iris[,-5], iris[,5])
m
table(predict(m, iris[,-5]), iris[,5])
}}}

==== code ====
https://github.com/baotong/code-recipes/blob/master/R/naiveBayes.R
{{{
naiveBayes <- function(x, ...)
    UseMethod("naiveBayes")

naiveBayes.default <- function(x, y, laplace = 0, ...) {
    call <- match.call()
    Yname <- deparse(substitute(y))
    x <- as.data.frame(x)

    ## estimation-function
    est <- function(var)
        if (is.numeric(var)) {
            cbind(tapply(var, y, mean, na.rm = TRUE),
                  tapply(var, y, sd, na.rm = TRUE))
        } else {
            tab <- table(y, var)
            (tab + laplace) / (rowSums(tab) + laplace * nlevels(var))
        }

    ## create tables
    apriori <- table(y)
    tables <- lapply(x, est)

    ## fix dimname names
    for (i in 1:length(tables))
        names(dimnames(tables[[i]])) <- c(Yname, colnames(x)[i])
    names(dimnames(apriori)) <- Yname

    structure(list(apriori = apriori,
                   tables = tables,
                   levels = levels(y),
                   call   = call
                   ),

              class = "naiveBayes"
              )
}

naiveBayes.formula <- function(formula, data, laplace = 0, ...,
                               subset, na.action = na.pass) {
    call <- match.call()
    Yname <- as.character(formula[[2]])

    if (is.data.frame(data)) {
        ## handle formula
        m <- match.call(expand.dots = FALSE)
        m$... <- NULL
        m$laplace = NULL
        m$na.action <- na.action
        m[[1]] <- as.name("model.frame")
        m <- eval(m, parent.frame())
        Terms <- attr(m, "terms")
        if (any(attr(Terms, "order") > 1))
            stop("naiveBayes cannot handle interaction terms")
        Y <- model.extract(m, "response")
        X <- m[,-attr(Terms, "response"), drop = FALSE]

        return(naiveBayes(X, Y, laplace = laplace, ...))
    } else if (is.array(data)) {
        nam <- names(dimnames(data))
        ## Find Class dimension
        Yind <- which(nam == Yname)

        ## Create Variable index
        deps <- strsplit(as.character(formula)[3], ".[+].")[[1]]
        if (length(deps) == 1 && deps == ".")
            deps <- nam[-Yind]
        Vind <- which(nam %in% deps)

        ## create tables
        apriori <- margin.table(data, Yind)
        tables <- lapply(Vind,
                         function(i) (margin.table(data, c(Yind, i)) + laplace) /
                         (as.numeric(apriori) + laplace * dim(data)[i]))
        names(tables) <- nam[Vind]

        structure(list(apriori = apriori,
                       tables = tables,
                       levels = names(apriori),
                       call   = call
                       ),

                  class = "naiveBayes"
                  )
    } else stop("naiveBayes formula interface handles data frames or arrays only")

}


print.naiveBayes <- function(x, ...) {
    cat("\nNaive Bayes Classifier for Discrete Predictors\n\n")
    cat("Call:\n")
    print(x$call)
    cat("\nA-priori probabilities:\n")
    print(x$apriori / sum(x$apriori))

    cat("\nConditional probabilities:\n")
    for (i in x$tables) {print(i); cat("\n")}

}

predict.naiveBayes <- function(object,
                               newdata,
                               type = c("class", "raw"),
                               threshold = 0.001,
                               ...) {
    type <- match.arg(type)
    newdata <- as.data.frame(newdata)
    attribs <- which(names(object$tables) %in% names(newdata))
    isnumeric <- sapply(newdata, is.numeric)
    newdata <- data.matrix(newdata)
    L <- sapply(1:nrow(newdata), function(i) {
        ndata <- newdata[i,]
        L <- log(object$apriori) +
            apply(log(sapply(attribs, function(v) {
                nd <- ndata[v]
                if(is.na(nd))
                    rep(1, length(object$apriori))
                else {
                    prob <- if (isnumeric[v]) {
                        msd <- object$tables[[v]]
                        msd[,2][msd[,2]==0] <- threshold
                        dnorm(nd, msd[,1], msd[,2])
                    } else
                        object$tables[[v]][,nd]
                    prob[prob == 0] <- threshold
                    prob
                }
            })), 1, sum)
        if (type == "class")
            L
        else {
            L <- exp(L)
            L / sum(L)
        }
    })
    if (type == "class")
        factor(object$levels[apply(L, 2, which.max)], levels = object$levels)
    else
        t(L)
}
}}}
==== 참고자료 ====
 * http://abipictures.tistory.com/716
 * http://rgm3.lab.nig.ac.jp/RGM/R_rdfile?f=mlbench/man/HouseVotes84.Rd&d=R_CC --> HouseVotes84 설명