#title Python-Logistic Classification [[TableOfContents]] ==== Binary Classification ==== {{{ #iris 데이터세트 만들기 import numpy as np import pandas as pd from sklearn.datasets import load_iris iris = load_iris() iris.data iris.feature_names iris.target iris.target_names iris_df = pd.DataFrame(iris.data, columns=iris.feature_names) iris_df["target"] = iris.target iris_df["target_names"] = iris.target_names[iris.target] #binary classification이므로 setosa이면 1 아니면 0으로 분류하자. from pandasql import sqldf pysqldf = lambda q: sqldf(q, globals()) iris_df["is_setosa"] = pysqldf(""" select *, case when target_names = 'setosa' then 1 else 0 end is_setosa from iris_df """)["is_setosa"] iris_df[:5] #훈련세트, 테스트세트 나누기 from sklearn.model_selection import train_test_split train_set, test_set = train_test_split(iris_df, test_size = 0.5) train_set.shape test_set.shape #scatter plot import seaborn as sns sns.pairplot(x_vars=["sepal length (cm)"], y_vars=["petal length (cm)"], data=train_set, hue="target_names", size=5) #Logistic Classification from sklearn.linear_model import LogisticRegression model = LogisticRegression(C=10) #C값을 조정하여 over fitting을 막자. C값을 높이면 높일수록 over fitting 될 것임 #훈련 model.fit(X=train_set[["sepal length (cm)", "petal length (cm)"]], y=train_set[["is_setosa"]]) #테스트 pred = model.predict(X=test_set[["sepal length (cm)", "petal length (cm)"]]) # consusion matrix는 다음을 참고 # https://uberpython.wordpress.com/2012/01/01/precision-recall-sensitivity-and-specificity/ # https://stackoverflow.com/questions/31324218/scikit-learn-how-to-obtain-true-positive-true-negative-false-positive-and-fal from pandas_ml import ConfusionMatrix cm = ConfusionMatrix(test_set.is_setosa.values, pred) cm.print_stats() #점수 print(model.score(X=train_set[["sepal length (cm)", "petal length (cm)"]], y=train_set[["is_setosa"]])) print(model.score(X=test_set[["sepal length (cm)", "petal length (cm)"]], y=test_set[["is_setosa"]])) #plot from matplotlib import pyplot as plt fig = plt.figure() plt.scatter(iris_df[iris_df.is_setosa == 0]["sepal length (cm)"], iris_df[iris_df.is_setosa == 0]["petal length (cm)"], marker='+') plt.scatter(iris_df[iris_df.is_setosa == 1]["sepal length (cm)"], iris_df[iris_df.is_setosa == 1]["petal length (cm)"], c= 'green', marker='o') coef = model.coef_ intercept = model.intercept_ ex1 = np.linspace(4, 8.5) ex2 = -(coef[:, 0] * ex1 + intercept) / coef[:,1] plt.plot(ex1, ex2, color='r', label='decision boundary'); plt.legend(); }}} 결과 attachment:Python-LogisticClassification/logistic1.png ==== 참고자료 ==== * https://www.kunxi.org/notes/machine_learning/logistic_regression/