!pip install mglearn
!pip install --upgrade joblib==1.1.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: mglearn in /usr/local/lib/python3.7/dist-packages (0.1.9)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from mglearn) (1.0.2)
Requirement already satisfied: imageio in /usr/local/lib/python3.7/dist-packages (from mglearn) (2.9.0)
Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from mglearn) (1.3.5)
Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from mglearn) (7.1.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from mglearn) (1.21.6)
Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from mglearn) (1.1.0)
Requirement already satisfied: cycler in /usr/local/lib/python3.7/dist-packages (from mglearn) (0.11.0)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from mglearn) (3.2.2)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mglearn) (3.0.9)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mglearn) (2.8.2)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->mglearn) (1.4.4)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from kiwisolver>=1.0.1->matplotlib->mglearn) (4.1.1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib->mglearn) (1.15.0)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->mglearn) (2022.4)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->mglearn) (3.1.0)
Requirement already satisfied: scipy>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->mglearn) (1.7.3)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: joblib==1.1.0 in /usr/local/lib/python3.7/dist-packages (1.1.0)


1 - (4/6)**2 - (2/6)**2

0.4444444444444445


sample_labels = ["unacc", "unacc", "acc", "acc", "good", "good"]
impurity = 1


from collections import Counter
label_counts = Counter(sample_labels)
print(label_counts)

Counter({'unacc': 2, 'acc': 2, 'good': 2})


for label in label_counts:
  print(label)
  prob = label_counts[label]/len(sample_labels)
  print(prob)

unacc
0.3333333333333333
acc
0.3333333333333333
good
0.3333333333333333


for label in label_counts:
  prob = label_counts[label]/len(sample_labels)
  impurity -= prob ** 2
print(impurity)

0.6666666666666665


def gini(dataset):
  """Return the Gini impurity of a collection of class labels.

  The impurity starts at 1 and the squared relative frequency of every
  distinct label is subtracted from it, so a collection holding a single
  distinct label yields 0.

  Args:
    dataset: A sized collection of hashable labels.

  Returns:
    The impurity as a number. An empty collection yields 1, because no
    label frequency is ever subtracted.
  """
  impurity = 1
  for occurrences in Counter(dataset).values():
    # Squared share of this label among all labels.
    impurity -= (occurrences / len(dataset)) ** 2
  return impurity


unsplit_labels = ["unacc", "unacc", "unacc", "unacc", "unacc", "unacc", "good", 
                  "good", "good", "good", "vgood", "vgood", "vgood"]

split_labels_1 = [["unacc", "unacc", "unacc", "unacc", "unacc", "unacc", "good", "good", "vgood"], 
                  [ "good", "good"], 
                  ["vgood", "vgood"]]

split_labels_2 = [["unacc", "unacc", "unacc", "unacc","unacc", "unacc", "good", "good", "good", "good"], 
                  ["vgood", "vgood", "vgood"]]


# unsplit_labels의 지니 불순도를 계산해봅니다.
info_gain_1 = gini(unsplit_labels)
info_gain_1

0.6390532544378698


for subset in split_labels_1:
  info_gain_1 -= gini(subset)
print(info_gain_1)

0.14522609394404257


info_gain_2 = gini(unsplit_labels)
for subset in split_labels_2:
  info_gain_2 -= gini(subset)
print(info_gain_2)

0.15905325443786977


def information_gain(starting_labels, split_labels):
  """Return the unweighted information gain of a split.

  The gain is the Gini impurity of the unsplit labels minus the Gini
  impurity of every subset. Subsets are not weighted by their size.

  Args:
    starting_labels: Labels before the split.
    split_labels: Iterable of label subsets produced by the split.

  Returns:
    The parent impurity minus each subset's impurity.
  """
  gain = gini(starting_labels)
  child_impurities = [gini(child) for child in split_labels]
  for child_impurity in child_impurities:
    gain -= child_impurity
  return gain


# 샘플 데이터
cars = [['med', 'low', '3', '4', 'med', 'med'], 
        ['med', 'vhigh', '4', 'more', 'small', 'high'], 
        ['high', 'med', '3', '2', 'med', 'low'], 
        ['med', 'low', '4', '4', 'med', 'low'], 
        ['med', 'low', '5more', '2', 'big', 'med'],
        ['med', 'med', '2', 'more', 'big', 'high'],
        ['med', 'med', '2', 'more', 'med', 'med'],
        ['vhigh', 'vhigh', '2', '2', 'med', 'low'], 
        ['high', 'med', '4', '2', 'big', 'low'], 
        ['low', 'low', '2', '4', 'big', 'med']]

car_labels = ['acc', 'acc', 'unacc', 'unacc', 'unacc', 'vgood', 'acc', 'unacc', 'unacc', 'good']


def weighted_information_gain(starting_labels, split_labels):
  """Return the size-weighted information gain of a split.

  Each subset's Gini impurity is scaled by the fraction of the starting
  labels that the subset contains before it is subtracted from the parent
  impurity.

  Args:
    starting_labels: Labels before the split.
    split_labels: Iterable of label subsets produced by the split.

  Returns:
    The parent impurity minus the weighted impurity of every subset.
  """
  gain = gini(starting_labels)
  for child in split_labels:
    child_impurity = gini(child)
    # Share of the parent's samples that landed in this subset.
    share = len(child) / len(starting_labels)
    gain -= child_impurity * share
  return gain


def split(dataset, labels, column):
    """Partition rows and their labels by the value found in one column.

    Rows are grouped in a single pass. Groups are returned in the order in
    which each distinct column value is first seen, so the result is the
    same on every run (iterating a ``set`` of strings gives no such
    guarantee).

    Args:
        dataset: Sequence of rows; each row is indexable by ``column``
            and the value at that position must be hashable.
        labels: Sequence of labels aligned with ``dataset`` by position.
        column: Index of the feature to split on.

    Returns:
        A ``(data_subsets, label_subsets)`` tuple of two lists.
        ``data_subsets[k]`` holds the rows sharing the k-th distinct value
        and ``label_subsets[k]`` holds the labels of those same rows.
        Both lists are empty when ``dataset`` is empty.

    Raises:
        IndexError: If ``labels`` is shorter than ``dataset``.
    """
    # Maps each distinct column value to its (rows, labels) pair; dicts keep
    # insertion order, which fixes the order of the returned groups.
    groups = {}
    for i, row in enumerate(dataset):
        rows, row_labels = groups.setdefault(row[column], ([], []))
        rows.append(row)
        row_labels.append(labels[i])

    data_subsets = [rows for rows, _ in groups.values()]
    label_subsets = [row_labels for _, row_labels in groups.values()]
    return data_subsets, label_subsets


# split 함수 호출
split_data, split_labels = split(cars, car_labels, 3)


split_data

[[['med', 'vhigh', '4', 'more', 'small', 'high'],
  ['med', 'med', '2', 'more', 'big', 'high'],
  ['med', 'med', '2', 'more', 'med', 'med']],
 [['med', 'low', '3', '4', 'med', 'med'],
  ['med', 'low', '4', '4', 'med', 'low'],
  ['low', 'low', '2', '4', 'big', 'med']],
 [['high', 'med', '3', '2', 'med', 'low'],
  ['med', 'low', '5more', '2', 'big', 'med'],
  ['vhigh', 'vhigh', '2', '2', 'med', 'low'],
  ['high', 'med', '4', '2', 'big', 'low']]]


len(split_data)

3


# index 3으로 데이터를 분할하였을 때 정보증가량을 출력
weighted_information_gain(car_labels, split_labels)

0.30666666666666675


# For every feature in the data, call `split()` and then `weighted_information_gain()`.
# The 4th feature (index 3, the "persons" feature) yields the largest gain.
for i in range(0,6):
  split_data, split_labels = split(cars, car_labels, i)
  print(weighted_information_gain(car_labels, split_labels))

0.2733333333333334
0.040000000000000036
0.10666666666666666
0.30666666666666675
0.15000000000000002
0.29000000000000004


# 위의 함수들을 종합하여 가장 적합한 분할 feature을 찾는 함수 작성
def find_best_split(dataset, labels):
  """Find the feature whose split gives the largest weighted information gain.

  Every column index of the first row is tried in order. A feature only
  replaces the current best when its gain is strictly greater, so ties
  keep the lowest index.

  Args:
    dataset: Non-empty sequence of rows; the first row's length sets the
      number of features tried.
    labels: Labels aligned with ``dataset`` by position.

  Returns:
    A ``(best_gain, best_feature)`` tuple. When no feature has a positive
    gain the result is ``(0, 0)``.
  """
  top_gain, top_feature = 0, 0
  feature_count = len(dataset[0])
  for column in range(feature_count):
    _, label_groups = split(dataset, labels, column)
    candidate_gain = weighted_information_gain(labels, label_groups)
    if not candidate_gain > top_gain:
      continue
    top_gain = candidate_gain
    top_feature = column
  return top_gain, top_feature


best_gain, best_feature = find_best_split(cars,  car_labels)


best_feature

3


best_gain

0.30666666666666675


def build_tree(data, labels):
  """Recursively build a decision tree as nested lists of label counts.

  Args:
    data: Sequence of feature rows.
    labels: Labels aligned with ``data`` by position.

  Returns:
    A ``Counter`` of the labels when no split has a positive gain (a
    leaf); otherwise a list with one subtree per subset of the best
    split.
  """
  gain, feature = find_best_split(data, labels)
  # No feature improves purity any further: this node is a leaf.
  if gain == 0:
    return Counter(labels)
  row_groups, label_groups = split(data, labels, feature)
  return [
    build_tree(rows, row_labels)
    for rows, row_labels in zip(row_groups, label_groups)
  ]


def print_tree(node, spacing=""):
    """Print a nested tree of branches and ``Counter`` leaves to stdout.

    A ``Counter`` node is treated as a leaf and printed on one line. Any
    other node is treated as a sequence of branches: a "Splitting" header
    is printed, then each branch is printed recursively with two more
    spaces of indentation.

    Args:
        node: Either a ``Counter`` (leaf) or an iterable of child nodes.
        spacing: Prefix prepended to every line printed for this node.
    """
    # Base case: reached a leaf node.
    if isinstance(node, Counter):
        print(spacing + str(node))
        return

    print(spacing + "Splitting")

    # At a split, print every branch recursively one indent level deeper.
    for i, branch in enumerate(node):
        print(spacing + '--> Branch ' + str(i) + ':')
        print_tree(branch, spacing + "  ")


#  `build_tree` 함수와 `print_tree` 함수를 출력해봅니다.
tree = build_tree(cars, car_labels)
print_tree(tree)

Splitting
--> Branch 0:
  Splitting
  --> Branch 0:
    Counter({'acc': 1})
  --> Branch 1:
    Counter({'acc': 1})
  --> Branch 2:
    Counter({'vgood': 1})
--> Branch 1:
  Splitting
  --> Branch 0:
    Counter({'unacc': 1})
  --> Branch 1:
    Counter({'good': 1})
  --> Branch 2:
    Counter({'acc': 1})
--> Branch 2:
  Counter({'unacc': 4})


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_moons

X, y = make_moons(noise=0.32, random_state=42, n_samples=250)
sns.scatterplot(x=X[:, 0], y=X[:, 1], 
                hue=y, marker="o", s=25, 
                edgecolor="k", legend=False).set_title("Moon Data")
plt.show()


from sklearn.tree import DecisionTreeClassifier


dt = DecisionTreeClassifier()


dt.fit(X,y)

DecisionTreeClassifier()


dt.score(X,y)

1.0


# Visualize the fitted decision-tree classifier.
from io import StringIO  # stdlib text buffer; the third-party `six` shim is not needed on Python 3
from sklearn.tree import export_graphviz # drawing graphs specified in DOT language scripts
from IPython.display import Image  
import pydotplus

# Write the DOT description into an in-memory buffer, render it with
# pydotplus, save a PNG copy and display the image inline.
dot_data = StringIO()
export_graphviz(dt, out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('/tree.png')
Image(graph.create_png())


from mglearn import plot_interactive_tree

ax = plot_interactive_tree.plot_tree_partition(X, y, dt)
ax.set_title("first decision tree")

Text(0.5, 1.0, 'first decision tree')


pruned_dt = DecisionTreeClassifier(max_depth = 3)
pruned_dt.fit(X, y)
print(pruned_dt.score(X,y))

0.888


dot_data = StringIO()
export_graphviz( pruned_dt, out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('/tree.png')
Image(graph.create_png())


# Plot the decision regions of the depth-limited tree. The title names the
# pruned model so this figure is not confused with the unpruned tree's plot.
ax = plot_interactive_tree.plot_tree_partition(X, y, pruned_dt)
ax.set_title("pruned decision tree")

Text(0.5, 1.0, 'first decision tree')


import pandas as pd

data_url = "https://raw.githubusercontent.com/inikoreaackr/ml_datasets/main/titanic.csv"
data = pd.read_csv(data_url)
data


data.columns

data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]

data.describe()


data = data.dropna()


data.shape

(714, 7)


data.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
dtype: object


from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# `data` was produced by column selection and dropna(), so pandas may treat it
# as a slice of the original frame. Take an explicit copy and assign by column
# key so the encoded values are written without a SettingWithCopyWarning.
data = data.copy()
data['Sex'] = le.fit_transform(data['Sex'])

/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py:5516: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


data.dtypes

Survived      int64
Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
dtype: object


y = data['Survived']
X = data.drop(columns = ['Survived'])


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)


dt = DecisionTreeClassifier()


dt.fit(X_train, y_train)

DecisionTreeClassifier()


print(dt.score(X_test, y_test))

0.8046511627906977


dot_data = StringIO()
export_graphviz(dt, out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('/tree.png')
Image(graph.create_png())


from sklearn.tree import DecisionTreeClassifier
# Fresh baseline tree with default (unpruned) settings. It is bound to `dt`,
# the name the baseline fit/score cell uses; the previous name `df` was never read.
dt = DecisionTreeClassifier()


dt_mxlf = DecisionTreeClassifier(max_leaf_nodes = 1000)
dt_mxdth = DecisionTreeClassifier(max_depth = 6)
dt_mss = DecisionTreeClassifier(min_samples_split = 8)
dt_msl = DecisionTreeClassifier(min_samples_leaf = 32) 
dt_mid = DecisionTreeClassifier(min_impurity_decrease = 0.02)


# Decision tree with no pruning parameters tuned
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

0.8232558139534883


# `max_leaf_nodes` parameter를 조절한 Decision tree
dt_mxlf.fit(X_train, y_train)
dt_mxlf.score(X_test, y_test)

0.8186046511627907


# `max_depth` parameter를 조절한 Decision tree
dt_mxdth.fit(X_train, y_train)
dt_mxdth.score(X_test, y_test)

0.8372093023255814


# Decision tree with the `min_samples_split` parameter tuned
dt_mss.fit(X_train, y_train)
dt_mss.score(X_test, y_test)

0.8


# Decision tree with the `min_samples_leaf` parameter tuned
dt_msl.fit(X_train, y_train)
dt_msl.score(X_test, y_test)

0.827906976744186


#`min_impurity_decrease` parameter를 조절한 Decision tree
dt_mid.fit(X_train, y_train)
dt_mid.score(X_test, y_test)

0.8


# `max_leaf_nodes` parameter를 조절한 Decision tree
dot_data = StringIO()
export_graphviz(dt_mxlf, out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('/tree.png')
Image(graph.create_png())


# `max_depth` parameter를 조절한 Decision tree
dot_data = StringIO()
export_graphviz(dt_mxdth, out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('/tree.png')
Image(graph.create_png())


# Decision tree with the `min_samples_split` parameter tuned
dot_data = StringIO()
export_graphviz(dt_mss, out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('/tree.png')
Image(graph.create_png())


# Decision tree with the `min_samples_leaf` parameter tuned
dot_data = StringIO()
export_graphviz(dt_msl, out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('/tree.png')
Image(graph.create_png())


#`min_impurity_decrease` parameter를 조절한 Decision tree
dot_data = StringIO()
export_graphviz(dt_mid, out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('/tree.png')
Image(graph.create_png())

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
...	...	...	...	...	...	...	...	...	...	...	...	...
886	887	0	2	Montvila, Rev. Juozas	male	27.0	0	0	211536	13.0000	NaN	S
887	888	1	1	Graham, Miss. Margaret Edith	female	19.0	0	0	112053	30.0000	B42	S
888	889	0	3	Johnston, Miss. Catherine Helen "Carrie"	female	NaN	1	2	W./C. 6607	23.4500	NaN	S
889	890	1	1	Behr, Mr. Karl Howell	male	26.0	0	0	111369	30.0000	C148	C
890	891	0	3	Dooley, Mr. Patrick	male	32.0	0	0	370376	7.7500	NaN	Q

	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

지니 불순도 (Gini Impurity)

실습 1

정보증가량 (Information Gain)

실습 2

가중 정보증가량 (Weighted Information Gain)

실습 3

실습 4: 재귀 트리 만들기 (Recursive Tree Building)

실습 5: scikit-learn으로 구현하는 결정트리

실습 6: 결정 트리 가지치기 (pruning)

과제

TAEWON KIM

Recent posts

Comments

지니 불순도 (Gini Impurity)

실습 1

정보증가량 (Information Gain)

실습 2

가중 정보증가량 (Weighted Information Gain)

실습 3

실습 4: 재귀 트리 만들기 (Recursive Tree Building)

실습 5: scikit-learn으로 구현하는 결정트리

실습 6: 결정 트리 가지치기 (pruning)

과제

TAEWON KIM

Recent posts

Newsletter

Comments