import numpy as np
import pandas as pd


import sklearn
print(sklearn.__version__)

1.0.2


# 데이터 받기
url = "https://raw.githubusercontent.com/inikoreaackr/ml_datasets/main/playgolf.csv"
df = pd.read_csv(url)


# 데이터 첫 다섯 instance 확인
df.head()


# 데이터 타입 확인
df.dtypes

OUTLOOK        object
TEMPERATURE    object
HUMIDITY       object
WINDY            bool
PLAY GOLF      object
dtype: object


# Convert every column (object and bool alike) to the pandas category dtype.
# DataFrame.astype('category') converts column by column, so each column
# keeps only its own observed labels as categories.
df = df.astype('category')


# 변경이 되었는지 확인
df.dtypes

OUTLOOK        category
TEMPERATURE    category
HUMIDITY       category
WINDY          category
PLAY GOLF      category
dtype: object


# PLAY GOLF feature 출력
df['PLAY GOLF']

0      No
1      No
2     Yes
3     Yes
4     Yes
5      No
6     Yes
7      No
8     Yes
9     Yes
10    Yes
11    Yes
12    Yes
13     No
Name: PLAY GOLF, dtype: category
Categories (2, object): ['No', 'Yes']


# PLAY GOLF는 binary 변수입니다. 각 카테고리의 갯수를 세어봅니다.
df['PLAY GOLF'].value_counts(sort = True)

Yes    9
No     5
Name: PLAY GOLF, dtype: int64


# 이 데이터셋에서 "Play Golf = Yes"로 예측하는 ZeroR 모델의 정확도를 계산해봅니다.
9 / (9 + 5)

0.6428571428571429


# OneR pseudo-code implementation: for every feature, count the training
# errors made when each feature value predicts its majority class.
from collections import Counter

total_errors = []

for col in df.columns[:-1]:
  error = 0
  for val in df[col].unique():
    # Labels of the rows where this feature takes the current value.
    labels = df.loc[df[col] == val, 'PLAY GOLF']
    length = len(labels)
    ranked = Counter(labels).most_common()
    print(f"{col} : {val}, length : {length}")
    print(ranked)
    # Every row outside the majority class is a misclassification.
    error += length - ranked[0][1]
  print(f"\nerror of {col}: [{error}] \n")
  total_errors.append(error)

OUTLOOK : Rainy, length : 5
[('No', 3), ('Yes', 2)]
OUTLOOK : Overcast, length : 4
[('Yes', 4)]
OUTLOOK : Sunny, length : 5
[('Yes', 3), ('No', 2)]

error of OUTLOOK: [4] 

TEMPERATURE : Hot, length : 4
[('No', 2), ('Yes', 2)]
TEMPERATURE : Mild, length : 6
[('Yes', 4), ('No', 2)]
TEMPERATURE : Cool, length : 4
[('Yes', 3), ('No', 1)]

error of TEMPERATURE: [5] 

HUMIDITY : High, length : 7
[('No', 4), ('Yes', 3)]
HUMIDITY : Normal, length : 7
[('Yes', 6), ('No', 1)]

error of HUMIDITY: [4] 

WINDY : False, length : 8
[('Yes', 6), ('No', 2)]
WINDY : True, length : 6
[('No', 3), ('Yes', 3)]

error of WINDY: [5]


# 오류가 가장 작은 feature를 고릅니다.

best_feature = df.columns[np.argmin(total_errors)]
print(best_feature)

OUTLOOK


# Build the rule set for the best feature: each of its values predicts the
# majority class among the rows that carry that value.
oneRules = []
for val in df[best_feature].unique():
  majority = Counter(df.loc[df[best_feature] == val, 'PLAY GOLF']).most_common()[0][0]
  print(f"{best_feature} : {val}", "-> ", majority)
  oneRules.append((best_feature, val, majority))

OUTLOOK : Rainy ->  No
OUTLOOK : Overcast ->  Yes
OUTLOOK : Sunny ->  Yes


df.head()


df.describe()


# Encode the categorical data as integers (one category code per label).
df_enc = pd.DataFrame()
for column in ['OUTLOOK', 'TEMPERATURE', 'HUMIDITY', 'WINDY', 'PLAY GOLF']:
  df_enc[column] = df[column].cat.codes


df_enc.head()


# 인코딩된 데이터의 타입을 프린트해봅니다.
df_enc.dtypes

OUTLOOK        int8
TEMPERATURE    int8
HUMIDITY       int8
WINDY          int8
PLAY GOLF      int8
dtype: object


# 분류기에 넣을 feature과 해당 label을 구분합니다.
features = df_enc.drop(columns=['PLAY GOLF'])
label = df_enc['PLAY GOLF']


from sklearn.naive_bayes import CategoricalNB
model = CategoricalNB()


model.fit(features.values, label)

CategoricalNB()


score = model.score(features.values, label)
score

0.9285714285714286


# Print p(x_i | y): one table per feature, converted back from log space.
from pprint import pprint
feature_log_prior = model.feature_log_prob_
for log_table in feature_log_prior:
  likelihood_table = np.exp(log_table)
  pprint(likelihood_table)

array([[0.125     , 0.5       , 0.375     ],
       [0.41666667, 0.25      , 0.33333333]])
array([[0.25      , 0.375     , 0.375     ],
       [0.33333333, 0.25      , 0.41666667]])
array([[0.71428571, 0.28571429],
       [0.36363636, 0.63636364]])
array([[0.42857143, 0.57142857],
       [0.63636364, 0.36363636]])


# p(y_j) 출력
np.exp(model.class_log_prior_)

array([0.35714286, 0.64285714])


# Predict for two hand-encoded instances:
#   ("Sunny", "Hot", "Normal", False) -> [2, 1, 1, 0]
#   ("Rainy", "Mild", "High", False)  -> [1, 2, 0, 0]

for encoded in ([2, 1, 1, 0], [1, 2, 0, 0]):
  print(model.predict_proba([encoded]), model.predict([encoded]))

[[0.22086561 0.77913439]] [1]
[[0.5695011 0.4304989]] [0]


# load data
car_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
balloons_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data'


import numpy as np
import pandas as pd
import sklearn


# Create the data frames. Both files are read with header=None, so the
# column names are supplied explicitly.

car_df = pd.read_csv(
  car_url,
  header = None,
  names = ["buying", "maint", "doors", "persons", "lung_boot", "satefy", "accept"],
)
bal_df = pd.read_csv(
  balloons_url,
  header = None,
  names = ["color", "size", "act", "age", "inflated"],
)


# 데이터의 column name 출력

print(car_df.columns.to_list()[:-1])
print(bal_df.columns.to_list()[:-1])

['buying', 'maint', 'doors', 'persons', 'lung_boot', 'satefy']
['color', 'size', 'act', 'age']


# Car evaluation 'buying' feature 의 카테고리 출력

print(car_df['buying'].unique())

['vhigh' 'high' 'med' 'low']


# Car evaluation 'accept' label 의 각 class 와 instance 개수

car_df['accept'].value_counts(sort = True)

unacc    1210
acc       384
good       69
vgood      65
Name: accept, dtype: int64


# Balloons에서 'color' feature이 yellow인 instance 출력

bal_df['color'].value_counts(sort = True)
print(bal_df.loc[(bal_df['color'] == 'YELLOW')])

    color   size      act    age inflated
0  YELLOW  SMALL  STRETCH  ADULT        T
1  YELLOW  SMALL  STRETCH  ADULT        T
2  YELLOW  SMALL  STRETCH  CHILD        F
3  YELLOW  SMALL      DIP  ADULT        F
4  YELLOW  SMALL      DIP  CHILD        F
5  YELLOW  LARGE  STRETCH  ADULT        T
6  YELLOW  LARGE  STRETCH  ADULT        T
7  YELLOW  LARGE  STRETCH  CHILD        F
8  YELLOW  LARGE      DIP  ADULT        F
9  YELLOW  LARGE      DIP  CHILD        F


# Preprocess the object-typed columns into integer codes.

# Car evaluation: cast each column to category in place, then store its codes.
car_df_enc = pd.DataFrame()
for col in car_df.columns:
  car_df[col] = car_df[col].astype('category')
  car_df_enc[col] = car_df[col].cat.codes

# Balloons: same treatment.
bal_df_enc = pd.DataFrame()
for col in bal_df.columns:
  bal_df[col] = bal_df[col].astype('category')
  bal_df_enc[col] = bal_df[col].cat.codes


from sklearn.naive_bayes import CategoricalNB

car_model = CategoricalNB()
bal_model = CategoricalNB()


# Car evaluation fit & score 출력

car_features = car_df_enc.drop(columns=['accept'])
car_label = car_df_enc['accept']
car_model.fit(car_features.values, car_label)
car_score = car_model.score(car_features.values, car_label)
car_score

0.8715277777777778


# Balloons: fit the model and report its accuracy on the training data.

bal_features = bal_df_enc.drop(columns=['inflated'])
bal_label = bal_df_enc['inflated']
bal_model.fit(bal_features.values, bal_label)
# Score on the same plain ndarray that was used for fitting. Passing the
# DataFrame here makes sklearn warn "X has feature names, but CategoricalNB
# was fitted without feature names".
bal_score = bal_model.score(bal_features.values, bal_label)
bal_score

/usr/local/lib/python3.8/dist-packages/sklearn/base.py:443: UserWarning: X has feature names, but CategoricalNB was fitted without feature names
  warnings.warn(

1.0


# Print the feature probabilities and the class probabilities.

# Car evaluation
from pprint import pprint

car_feature_log_prior = car_model.feature_log_prob_
for log_table in car_feature_log_prior:
  pprint(np.exp(log_table))
car_class_prior = np.exp(car_model.class_log_prior_)
print(car_class_prior)

array([[0.28092784, 0.23195876, 0.29896907, 0.18814433],
       [0.01369863, 0.64383562, 0.32876712, 0.01369863],
       [0.26771005, 0.21334432, 0.22158155, 0.29736409],
       [0.01449275, 0.57971014, 0.39130435, 0.01449275]])
array([[0.27319588, 0.23969072, 0.29896907, 0.18814433],
       [0.01369863, 0.64383562, 0.32876712, 0.01369863],
       [0.25947282, 0.22158155, 0.22158155, 0.29736409],
       [0.20289855, 0.39130435, 0.39130435, 0.01449275]])
array([[0.21134021, 0.25773196, 0.26546392, 0.26546392],
       [0.21917808, 0.26027397, 0.26027397, 0.26027397],
       [0.2693575 , 0.24794069, 0.24135091, 0.24135091],
       [0.15942029, 0.23188406, 0.30434783, 0.30434783]])
array([[0.00258398, 0.51421189, 0.48320413],
       [0.01388889, 0.51388889, 0.47222222],
       [0.47568013, 0.25803792, 0.26628195],
       [0.01470588, 0.45588235, 0.52941176]])
array([[0.374677  , 0.35142119, 0.27390181],
       [0.34722222, 0.34722222, 0.30555556],
       [0.30420445, 0.32399011, 0.37180544],
       [0.60294118, 0.38235294, 0.01470588]])
array([[0.52971576, 0.00258398, 0.46770026],
       [0.43055556, 0.01388889, 0.55555556],
       [0.22918384, 0.47568013, 0.29513603],
       [0.97058824, 0.01470588, 0.01470588]])
[0.22222222 0.03993056 0.70023148 0.03761574]


# Balloons: feature probabilities followed by the class probabilities.

bal_feature_log_prior = bal_model.feature_log_prob_
for log_table in bal_feature_log_prior:
  pprint(np.exp(log_table))
bal_class_prior = np.exp(bal_model.class_log_prior_)
print(bal_class_prior)


# Predict two hand-encoded car evaluation instances:
#   ("vhigh", "vhigh", "2", "2", "small", "high") -> [3, 3, 0, 0, 2, 0]
#   ("low", "low", "5more", "more", "big", "med")  -> [1, 1, 3, 2, 0, 0]
# NOTE(review): the second instance is labelled "med" for the last feature
# but is encoded as 0, the same code the first instance uses for "high" -
# confirm which one was intended.

for car_instance in ([3, 3, 0, 0, 2, 0], [1, 1, 3, 2, 0, 0]):
  print(car_model.predict_proba([car_instance]), car_model.predict([car_instance]))

[[9.21115137e-04 4.43484909e-06 9.99074059e-01 3.90721613e-07]] [2]
[[0.20014669 0.19352277 0.09437552 0.51195502]] [3]


# Predict two hand-encoded balloons instances:
#   ("YELLOW", "LARGE", "STRETCH", "ADULT") -> [1, 0, 1, 0]
#   ("PURPLE", "SMALL", "DIP", "ADULT")     -> [0, 1, 0, 0]

for bal_instance in ([1, 0, 1, 0], [0, 1, 0, 0]):
  print(bal_model.predict_proba([bal_instance]), bal_model.predict([bal_instance]))

[[0.19107307 0.80892693]] [1]
[[0.79281184 0.20718816]] [0]


bal_df_enc.head(15)


import math


import pandas as pd


cdf = pd.read_csv(car_url, header = None)
bdf = pd.read_csv(balloons_url, header = None)
cdf.columns = ["buying", "maint", "doors", "persons", "lung_boot", "satefy", "accept"]
bdf.columns = ["color", "size", "act", "age", "inflated"]


# Conditional likelihoods P(feature value | class). These are plain
# probabilities; naive_bayes_classifier takes the logarithm itself.
def calculate_likelihood(df):
  """Estimate P(feature value | class) from relative frequencies.

  The last column of ``df`` is treated as the class label and every other
  column as a categorical feature.

  Args:
    df: DataFrame whose last column holds the class label.

  Returns:
    dict mapping each feature name to a dict keyed by "<value>_<class>",
    whose entries are count(value within class) / count(class). No
    smoothing is applied.
  """
  label = df[df.columns[-1]]
  # Class sizes do not depend on the feature, so count them once.
  class_counts = label.value_counts()
  likelihood = dict()
  for feature in df.columns[:-1]:
    table = {}
    for categ in label.unique():
      class_count = class_counts[categ]
      # Value frequencies of this feature among the rows of class `categ`.
      feature_count = df.loc[label == categ, feature].value_counts().to_dict()
      for feat_cat, feat_count in feature_count.items():
        # An f-string key also works for non-string values (e.g. ints);
        # for strings it is the same as feat_cat + "_" + categ.
        table[f"{feat_cat}_{categ}"] = feat_count / class_count
    likelihood[feature] = table
  return likelihood

def calc_prior(df):
  """Estimate the marginal probability of every value of every feature.

  The last column of ``df`` is treated as the class label and is skipped.

  Args:
    df: DataFrame whose last column holds the class label.

  Returns:
    dict mapping each feature name to {value: count(value) / number of rows}.
  """
  # Divide by the number of rows. The previous denominator was the element
  # count of the feature sub-frame (rows * features), which made each
  # feature's probabilities sum to 1 / n_features instead of 1.
  n_rows = len(df)
  prior = {}
  for feat in df.columns.to_list()[:-1]:
    counts = df[feat].value_counts().to_dict()
    prior[feat] = {value: count / n_rows for value, count in counts.items()}
  return prior


# Log class prior probability
def calculate_class_prob(y):
  """Return the natural log of each class's relative frequency in ``y``.

  Args:
    y: pandas Series of class labels.

  Returns:
    dict mapping each class, in order of first appearance, to
    log(count(class) / len(y)).
  """
  # Normalised frequencies are computed once, not once per class.
  frequencies = y.value_counts(normalize = True)
  return {categ: math.log(frequencies[categ]) for categ in y.unique()}
calculate_class_prob(cdf[cdf.columns[-1]])['unacc']

-0.3563443107732141


def naive_bayes_classifier(df, inst):
  """Score ``inst`` against every class with naive Bayes and print the scores.

  The score of a class is log P(class) + sum_i log P(inst[i] | class), built
  from calculate_class_prob and calculate_likelihood. The last column of
  ``df`` is the class label; the other columns are the features.

  Args:
    df: training DataFrame whose last column holds the class label.
    inst: feature values of the instance, in the order of df.columns[:-1].

  Returns:
    The class with the highest score, or None when ``df`` has no classes.
    The {class: score} dict is printed as before.
  """
  likelihood = calculate_likelihood(df)
  # The class priors do not change between classes, so compute them once.
  class_prior = calculate_class_prob(df[df.columns[-1]])
  features = df.columns[:-1]
  prob_out = dict()

  for categ in df[df.columns[-1]].unique():
    likesum = 0
    for feature, feature_value in zip(features, inst):
      prob = likelihood[feature].get(f"{feature_value}_{categ}", 0)
      # A value never observed with this class has likelihood 0, so it must
      # rule the class out (log 0 = -inf). Skipping the term, as before,
      # treated it as probability 1 and favoured the least-supported class.
      likesum += math.log(prob) if prob > 0 else -math.inf
    prob_out[categ] = likesum + class_prior[categ]
  print(prob_out)
  # The most probable class has the largest log score (the old, unused
  # `result` picked the smallest).
  return max(prob_out, key = prob_out.get) if prob_out else None


naive_bayes_classifier(cdf, ["vhigh", "vhigh", "2", "2", "small", "high"])

{'unacc': -7.298119948403321, 'acc': -8.33742842300862, 'vgood': -5.152134856369955, 'good': -6.769162938070731}


# "PURPLE" is the spelling used in the balloons data; the misspelt "PULPLE"
# matched no value, so the colour feature was silently ignored.
naive_bayes_classifier(bdf, ["PURPLE", "SMALL", "DIP", "CHILD"])

{'T': -1.6094379124341003, 'F': -2.014903020542265}

실습 1차시 - ZeroR, OneR, Naive Bayes Classifier

실습 내용:

실습 데이터

1. ZeroR

OneR

Naive Bayes Classifier with scikit-learn

1. 기계학습 기본 용어 정의/의미

2. 적용 과제

2.1 pandas

2.2 데이터 이해 및 전처리

2.3 모델 생성, 훈련 및 결과 해석

모델 예측 결과 해설

car evaluation

balloons

3. Naive Bayes Classifier 구현

TAEWON KIM

Recent posts

Comments

	OUTLOOK	TEMPERATURE	HUMIDITY	WINDY	PLAY GOLF
0	Rainy	Hot	High	False	No
1	Rainy	Hot	High	True	No
2	Overcast	Hot	High	False	Yes
3	Sunny	Mild	High	False	Yes
4	Sunny	Cool	Normal	False	Yes

	color	size	act	age	inflated
0	1	1	1	0	1
1	1	1	1	0	1
2	1	1	1	1	0
3	1	1	0	0	0
4	1	1	0	1	0
5	1	0	1	0	1
6	1	0	1	0	1
7	1	0	1	1	0
8	1	0	0	0	0
9	1	0	0	1	0
10	0	1	1	0	1
11	0	1	1	0	1
12	0	1	1	1	0
13	0	1	0	0	0
14	0	1	0	1	0

	color	size	act	age	inflated
0	1	1	1	0	1
1	1	1	1	0	1
2	1	1	1	1	0
3	1	1	0	0	0
4	1	1	0	1	0
5	1	0	1	0	1
6	1	0	1	0	1
7	1	0	1	1	0
8	1	0	0	0	0
9	1	0	0	1	0
10	0	1	1	0	1
11	0	1	1	0	1
12	0	1	1	1	0
13	0	1	0	0	0
14	0	1	0	1	0

실습 1차시 - ZeroR, OneR, Naive Bayes Classifier

실습 내용:

실습 데이터

1. ZeroR

OneR

Naive Bayes Classifier with scikit-learn

1. 기계학습 기본 용어 정의/의미

2. 적용 과제

2.1 pandas

2.2 데이터 이해 및 전처리

2.3 모델 생성, 훈련 및 결과 해석

모델 예측 결과 해설

car evaluation

balloons

3. Naive Bayes Classifier 구현

TAEWON KIM

Recent posts

Newsletter

Comments

	color	size	act	age	inflated
0	1	1	1	0	1
1	1	1	1	0	1
2	1	1	1	1	0
3	1	1	0	0	0
4	1	1	0	1	0
5	1	0	1	0	1
6	1	0	1	0	1
7	1	0	1	1	0
8	1	0	0	0	0
9	1	0	0	1	0
10	0	1	1	0	1
11	0	1	1	0	1
12	0	1	1	1	0
13	0	1	0	0	0
14	0	1	0	1	0