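# What is clustering?
# Clustering is unsupervised learning: it groups data that has no labels
# (no "correct answer") using only the data itself.

# Once the groups exist, a new data point can be assigned to the group it belongs to.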

import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Load the mall-customer dataset and preview its first rows.
df = pd.read_csv("../data_set/7.군집/Mall_Customers.csv")
df.head()

# 시각화 해주는 라이브러리 추가
!pip install yellowbrick

Requirement already satisfied: yellowbrick in c:\users\user\appdata\roaming\python\python311\site-packages (1.5)
Requirement already satisfied: matplotlib!=3.0.0,>=2.0.2 in c:\users\user\anaconda3\lib\site-packages (from yellowbrick) (3.8.0)
Requirement already satisfied: scipy>=1.0.0 in c:\users\user\anaconda3\lib\site-packages (from yellowbrick) (1.11.4)
Requirement already satisfied: scikit-learn>=1.0.0 in c:\users\user\anaconda3\lib\site-packages (from yellowbrick) (1.2.2)
Requirement already satisfied: numpy>=1.16.0 in c:\users\user\anaconda3\lib\site-packages (from yellowbrick) (1.26.4)
Requirement already satisfied: cycler>=0.10.0 in c:\users\user\anaconda3\lib\site-packages (from yellowbrick) (0.11.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\user\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.2.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\user\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\user\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.4.4)
Requirement already satisfied: packaging>=20.0 in c:\users\user\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (23.1)
Requirement already satisfied: pillow>=6.2.0 in c:\users\user\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\user\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\user\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (2.8.2)
Requirement already satisfied: joblib>=1.1.1 in c:\users\user\anaconda3\lib\site-packages (from scikit-learn>=1.0.0->yellowbrick) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\user\appdata\roaming\python\python311\site-packages (from scikit-learn>=1.0.0->yellowbrick) (3.1.0)
Requirement already satisfied: six>=1.5 in c:\users\user\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.16.0)

# Show the column names available in the dataset.
df.columns

Index(['CustomerID', 'Gender', 'Age', 'Annual Income (k$)',
       'Spending Score (1-100)'],
      dtype='object')

from yellowbrick.cluster import KElbowVisualizer

# Cluster on annual income and spending score only.
features = ['Annual Income (k$)', 'Spending Score (1-100)']
model = KMeans()
# Sweep candidate cluster counts and let the visualizer pick the elbow.
# NOTE(review): the original note says "1 to 10 groups", but yellowbrick
# documents a 2-tuple k as a half-open range, so this presumably tries
# k = 1..9 — confirm against the KElbowVisualizer docs.
vis = KElbowVisualizer(model, k=(1, 10))
vis.fit(df[features])
vis.show()
# In the recorded run the black dashed line sits at k = 4, i.e. splitting the
# customers into 4 groups is the best fit by the elbow criterion.

<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>

from sklearn.metrics import silhouette_score

# Score every candidate cluster count with the silhouette coefficient.
# The sweep starts at k = 2 (the smallest count the silhouette score is
# defined for) and ends at k = 10; the previous counter-based loop bumped the
# counter before using it and therefore never evaluated k = 2.
all_scores = []
for i in range(2, 11):
    km = KMeans(n_clusters=i)
    km.fit(df[features])
    sil_score = silhouette_score(df[features], km.labels_)
    all_scores.append({"cluster_num": i, "score": sil_score})
s_df = pd.DataFrame(all_scores)

# Sort ascending by score, so the best cluster count is the last row.
s_df.sort_values(by="score")

# In the recorded run k = 5 scored highest, i.e. 5 groups fit best.

# Distinct cluster labels assigned by the most recently fitted KMeans model.
set(km.labels_)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

# n_clusters=5: group the customers into 5 clusters.
# max_iter=300: iterate up to 300 times while searching for the best centroids.
km = KMeans(n_clusters=5, init="k-means++", max_iter=300)
km.fit(df[features])

KMeans(n_clusters=5)

KMeans(n_clusters=5)

# Distinct cluster labels produced by the current `km` model.
set(km.labels_)

{0, 1, 2, 3, 4}

# Add a 'cluster' column recording which group each row was assigned to,
# then display the frame.
df['cluster'] = km.labels_
df

# Visualize the clustering result: income vs. spending score, colored by cluster.
f = ['Annual Income (k$)', 'Spending Score (1-100)', 'cluster']
plt.figure(figsize=(12,8))
sns.scatterplot( data = df[f], x="Annual Income (k$)", y="Spending Score (1-100)", hue="cluster" )

<Axes: xlabel='Annual Income (k$)', ylabel='Spending Score (1-100)'>

# cluster_centers_ holds the centroid coordinates of each cluster,
# one row per cluster.
km.cluster_centers_

array([[55.2962963 , 49.51851852],
       [25.72727273, 79.36363636],
       [26.30434783, 20.91304348],
       [88.2       , 17.11428571],
       [86.53846154, 82.12820513]])

# Keep the centroids under a short name and show their first column
# (annual income, the first entry of `features`).
cnt = km.cluster_centers_
cnt[:,0]

array([55.2962963 , 25.72727273, 26.30434783, 88.2       , 86.53846154])

# Second centroid column (spending score, the second entry of `features`).
cnt[:,1]

array([49.51851852, 79.36363636, 20.91304348, 17.11428571, 82.12820513])

# Plot the centroid of every cluster.
# c: color of the plotted points; s: size of the plotted points.
plt.scatter(x=cnt[:,0], y=cnt[:,1], c="red", s=200)

<matplotlib.collections.PathCollection at 0x1539bbc9210>

# Draw the clustered samples and the centroids on the same figure.
plt.figure(figsize=(12,8))
sns.scatterplot( data = df[f], x="Annual Income (k$)", y="Spending Score (1-100)", hue="cluster" )

# Overlay the centroids as large red markers.
plt.scatter(x=cnt[:,0], y=cnt[:,1], c="red", s=200)

<matplotlib.collections.PathCollection at 0x15398c23650>

# Inputs are the two clustering features; the target is the assigned cluster.
fe = ['Annual Income (k$)', 'Spending Score (1-100)']
label = "cluster"

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. No random_state is passed, so the
# split is not fixed between runs.
X_train, X_test, y_train, y_test = train_test_split(df[fe], df[label], test_size=0.3)

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# A RandomForestClassifier is trained on the cluster labels so that newly
# arriving data can be assigned to one of the groups.
# Search grid: n_estimators 5, 15, ..., 95; max_depth 4, 6, 8, 10;
# min_samples_leaf 5, 10, 15, 20.
params = {
    "n_estimators" : range(5,100,10),
    "max_depth" : range(4,11,2),
    "min_samples_leaf" : range(5,21,5)
}

rfc = RandomForestClassifier()
# 3-fold cross-validated search over the grid above.
grid_cv = GridSearchCV(rfc, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train, y_train)
# Report the selected parameters and the score on the train and test splits.
print("최적의 파라미터 : ", grid_cv.best_params_)
print("train : ", grid_cv.score(X_train, y_train))
print("test : ", grid_cv.score(X_test, y_test))

최적의 파라미터 :  {'max_depth': 4, 'min_samples_leaf': 5, 'n_estimators': 25}
train :  0.9928571428571429
test :  0.9333333333333333

# Feed a new observation to the classifier to find out which cluster it
# belongs to. The sample is wrapped in a DataFrame with the training column
# names so the model receives the same named features it was fitted on
# (a bare nested list makes scikit-learn emit a feature-name warning).
grid_cv.predict(pd.DataFrame([[20, 20]], columns=fe))

array([2])

	cluster_num	score
7	10	0.452751
5	8	0.454558
6	9	0.458196
0	3	0.467614
1	4	0.493196
4	7	0.528810
3	6	0.539761
2	5	0.553932

	CustomerID	Gender	Age	Annual Income (k$)	Spending Score (1-100)	cluster
0	1	Male	19	15	39	2
1	2	Male	21	15	81	1
2	3	Female	20	16	6	2
3	4	Female	23	16	77	1
4	5	Female	31	17	40	2
...	...	...	...	...	...	...
195	196	Female	35	120	79	4
196	197	Female	45	126	28	3
197	198	Male	32	126	74	4
198	199	Male	32	137	18	3
199	200	Male	30	137	83	4

[머신러닝] 변수 선택법 ( feature selection ) (0)	2024.05.28
[머신러닝] 회귀 및 평가지표 (0)	2024.05.27
[머신러닝] 과적합 및 하이퍼파라미터 (0)	2024.05.27
[머신러닝] 지도학습 ( 분류, 회귀 ), 평가지표 선택하는 방법 (0)	2024.05.24
[머신러닝] 탐색적 데이터분석 ( EDA, 표준화, 가중치 ) (0)	2024.05.24

전영호의 개발 블로그

ml 그룹 분류

[머신러닝] 군집 ( 고객분류 )

군집 ( 고객분류 )

군집(clustering)¶

k-means¶

k-means 파라미터¶

군집화가 완료되면 관련 주요 속성이 있다¶

고객분류¶

'BE > 머신러닝(ML)' 카테고리의 다른 글

+ Recent posts

티스토리툴바