# 단변량 : 비슷한 컬럼들을 확인 후 제거하는 것
# embed : Tree 계열 알고리즘에서만 사용 가능, 각 컬럼의 티어를 알려준다.

from sklearn.ensemble import RandomForestRegressor
import pandas as pd
df = pd.read_csv("../data_set/6.회귀/data_cleaning.csv")
df.head()

df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'year', 'month', 'day', 'hour', 'temp_int'],
      dtype='object')

f = ['season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'hour']
l = "count"
X, y = df[f], df[l]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = RandomForestRegressor()
model.fit(X_train, y_train)
print("train : ", model.score(X_train, y_train))
print("test : ", model.score(X_test, y_test))

train :  0.9916614979489478
test :  0.9410126086888452

# 서로의 상관관계가 높을 수록 그래프가 일정하게 상승 또는 하강한다
import seaborn as sns
sns.scatterplot(data=df, x="temp", y="atemp")

<Axes: xlabel='temp', ylabel='atemp'>

# 각 컬럼 간의 상관관계를 보여준다
# temp 와 atemp 의 상관관계가 1에 가깝기 때문에
# 제거해도 될듯 하다
df.corr(numeric_only=True)

import matplotlib.pyplot as plt
plt.subplots(figsize=(12,12))
sns.heatmap(df.corr(numeric_only=True), annot=True)

<Axes: >

# 상관관계가 비슷한 것들은 / 이 형태나 \ 이 형태로 값이
# 일정하게 줄어들거나 늘어난다
sns.pairplot(df, height=1)

<seaborn.axisgrid.PairGrid at 0x16a895c6710>

# atemp 삭제
f = ['season', 'holiday', 'workingday', 'weather', 'temp',
       'humidity', 'windspeed', 'year', 'month', 'day', 'hour']

# atemp 컬럼을 삭제해도 예측 결과 값에는 영향을 크게 끼치지 않았음
# atemp 와 temp 가 거의 동일한 컬럼이었기 때문...!
X_train, X_test, y_train, y_test = train_test_split(df[f], df[l], test_size=0.3)
model = RandomForestRegressor()
model.fit(X_train, y_train)
print("train : ", model.score(X_train, y_train))
print("test : ", model.score(X_test, y_test))

train :  0.9917278017758199
test :  0.9399504472888517

from itertools import combinations
sample_bag = [1,2,3,4]
for c in combinations(sample_bag, 2) :
    # sample_bag 를 2개씩 뽑으라는 뜻
    print(c, type(c))

(1, 2) <class 'tuple'>
(1, 3) <class 'tuple'>
(1, 4) <class 'tuple'>
(2, 3) <class 'tuple'>
(2, 4) <class 'tuple'>
(3, 4) <class 'tuple'>

all_result = []

# combinations() 가 컬럼들을 2개씩 묶어준다
for c in combinations(f, 2):
    print(c)

('season', 'holiday')
('season', 'workingday')
('season', 'weather')
('season', 'temp')
('season', 'humidity')
('season', 'windspeed')
('season', 'year')
('season', 'month')
('season', 'day')
('season', 'hour')
('holiday', 'workingday')
('holiday', 'weather')
('holiday', 'temp')
('holiday', 'humidity')
('holiday', 'windspeed')
('holiday', 'year')
('holiday', 'month')
('holiday', 'day')
('holiday', 'hour')
('workingday', 'weather')
('workingday', 'temp')
('workingday', 'humidity')
('workingday', 'windspeed')
('workingday', 'year')
('workingday', 'month')
('workingday', 'day')
('workingday', 'hour')
('weather', 'temp')
('weather', 'humidity')
('weather', 'windspeed')
('weather', 'year')
('weather', 'month')
('weather', 'day')
('weather', 'hour')
('temp', 'humidity')
('temp', 'windspeed')
('temp', 'year')
('temp', 'month')
('temp', 'day')
('temp', 'hour')
('humidity', 'windspeed')
('humidity', 'year')
('humidity', 'month')
('humidity', 'day')
('humidity', 'hour')
('windspeed', 'year')
('windspeed', 'month')
('windspeed', 'day')
('windspeed', 'hour')
('year', 'month')
('year', 'day')
('year', 'hour')
('month', 'day')
('month', 'hour')
('day', 'hour')

for c in combinations(f, 2):
    # 컬럼을 list 자료형으로 형변환해서 사용해야 한다
    print( df[list(c)] )

       season  holiday
0           1        0
1           1        0
2           1        0
3           1        0
4           1        0
...       ...      ...
10881       4        0
10882       4        0
10883       4        0
10884       4        0
10885       4        0

[10886 rows x 2 columns]
       season  workingday
0           1           0
1           1           0
2           1           0
3           1           0
4           1           0
...       ...         ...
10881       4           1
10882       4           1
10883       4           1
10884       4           1
10885       4           1

[10886 rows x 2 columns]
       season  weather
0           1        1
1           1        1
2           1        1
3           1        1
4           1        1
...       ...      ...
10881       4        1
10882       4        1
10883       4        1
10884       4        1
10885       4        1

[10886 rows x 2 columns]
       season   temp
0           1   9.84
1           1   9.02
2           1   9.02
3           1   9.84
4           1   9.84
...       ...    ...
10881       4  15.58
10882       4  14.76
10883       4  13.94
10884       4  13.94
10885       4  13.12

[10886 rows x 2 columns]
       season  humidity
0           1        81
1           1        80
2           1        80
3           1        75
4           1        75
...       ...       ...
10881       4        50
10882       4        57
10883       4        61
10884       4        61
10885       4        66

[10886 rows x 2 columns]
       season  windspeed
0           1     0.0000
1           1     0.0000
2           1     0.0000
3           1     0.0000
4           1     0.0000
...       ...        ...
10881       4    26.0027
10882       4    15.0013
10883       4    15.0013
10884       4     6.0032
10885       4     8.9981

[10886 rows x 2 columns]
       season  year
0           1  2011
1           1  2011
2           1  2011
3           1  2011
4           1  2011
...       ...   ...
10881       4  2012
10882       4  2012
10883       4  2012
10884       4  2012
10885       4  2012

[10886 rows x 2 columns]
       season  month
0           1      1
1           1      1
2           1      1
3           1      1
4           1      1
...       ...    ...
10881       4     12
10882       4     12
10883       4     12
10884       4     12
10885       4     12

[10886 rows x 2 columns]
       season  day
0           1    1
1           1    1
2           1    1
3           1    1
4           1    1
...       ...  ...
10881       4   19
10882       4   19
10883       4   19
10884       4   19
10885       4   19

[10886 rows x 2 columns]
       season  hour
0           1     0
1           1     1
2           1     2
3           1     3
4           1     4
...       ...   ...
10881       4    19
10882       4    20
10883       4    21
10884       4    22
10885       4    23

[10886 rows x 2 columns]
       holiday  workingday
0            0           0
1            0           0
2            0           0
3            0           0
4            0           0
...        ...         ...
10881        0           1
10882        0           1
10883        0           1
10884        0           1
10885        0           1

[10886 rows x 2 columns]
       holiday  weather
0            0        1
1            0        1
2            0        1
3            0        1
4            0        1
...        ...      ...
10881        0        1
10882        0        1
10883        0        1
10884        0        1
10885        0        1

[10886 rows x 2 columns]
       holiday   temp
0            0   9.84
1            0   9.02
2            0   9.02
3            0   9.84
4            0   9.84
...        ...    ...
10881        0  15.58
10882        0  14.76
10883        0  13.94
10884        0  13.94
10885        0  13.12

[10886 rows x 2 columns]
       holiday  humidity
0            0        81
1            0        80
2            0        80
3            0        75
4            0        75
...        ...       ...
10881        0        50
10882        0        57
10883        0        61
10884        0        61
10885        0        66

[10886 rows x 2 columns]
       holiday  windspeed
0            0     0.0000
1            0     0.0000
2            0     0.0000
3            0     0.0000
4            0     0.0000
...        ...        ...
10881        0    26.0027
10882        0    15.0013
10883        0    15.0013
10884        0     6.0032
10885        0     8.9981

[10886 rows x 2 columns]
       holiday  year
0            0  2011
1            0  2011
2            0  2011
3            0  2011
4            0  2011
...        ...   ...
10881        0  2012
10882        0  2012
10883        0  2012
10884        0  2012
10885        0  2012

[10886 rows x 2 columns]
       holiday  month
0            0      1
1            0      1
2            0      1
3            0      1
4            0      1
...        ...    ...
10881        0     12
10882        0     12
10883        0     12
10884        0     12
10885        0     12

[10886 rows x 2 columns]
       holiday  day
0            0    1
1            0    1
2            0    1
3            0    1
4            0    1
...        ...  ...
10881        0   19
10882        0   19
10883        0   19
10884        0   19
10885        0   19

[10886 rows x 2 columns]
       holiday  hour
0            0     0
1            0     1
2            0     2
3            0     3
4            0     4
...        ...   ...
10881        0    19
10882        0    20
10883        0    21
10884        0    22
10885        0    23

[10886 rows x 2 columns]
       workingday  weather
0               0        1
1               0        1
2               0        1
3               0        1
4               0        1
...           ...      ...
10881           1        1
10882           1        1
10883           1        1
10884           1        1
10885           1        1

[10886 rows x 2 columns]
       workingday   temp
0               0   9.84
1               0   9.02
2               0   9.02
3               0   9.84
4               0   9.84
...           ...    ...
10881           1  15.58
10882           1  14.76
10883           1  13.94
10884           1  13.94
10885           1  13.12

[10886 rows x 2 columns]
       workingday  humidity
0               0        81
1               0        80
2               0        80
3               0        75
4               0        75
...           ...       ...
10881           1        50
10882           1        57
10883           1        61
10884           1        61
10885           1        66

[10886 rows x 2 columns]
       workingday  windspeed
0               0     0.0000
1               0     0.0000
2               0     0.0000
3               0     0.0000
4               0     0.0000
...           ...        ...
10881           1    26.0027
10882           1    15.0013
10883           1    15.0013
10884           1     6.0032
10885           1     8.9981

[10886 rows x 2 columns]
       workingday  year
0               0  2011
1               0  2011
2               0  2011
3               0  2011
4               0  2011
...           ...   ...
10881           1  2012
10882           1  2012
10883           1  2012
10884           1  2012
10885           1  2012

[10886 rows x 2 columns]
       workingday  month
0               0      1
1               0      1
2               0      1
3               0      1
4               0      1
...           ...    ...
10881           1     12
10882           1     12
10883           1     12
10884           1     12
10885           1     12

[10886 rows x 2 columns]
       workingday  day
0               0    1
1               0    1
2               0    1
3               0    1
4               0    1
...           ...  ...
10881           1   19
10882           1   19
10883           1   19
10884           1   19
10885           1   19

[10886 rows x 2 columns]
       workingday  hour
0               0     0
1               0     1
2               0     2
3               0     3
4               0     4
...           ...   ...
10881           1    19
10882           1    20
10883           1    21
10884           1    22
10885           1    23

[10886 rows x 2 columns]
       weather   temp
0            1   9.84
1            1   9.02
2            1   9.02
3            1   9.84
4            1   9.84
...        ...    ...
10881        1  15.58
10882        1  14.76
10883        1  13.94
10884        1  13.94
10885        1  13.12

[10886 rows x 2 columns]
       weather  humidity
0            1        81
1            1        80
2            1        80
3            1        75
4            1        75
...        ...       ...
10881        1        50
10882        1        57
10883        1        61
10884        1        61
10885        1        66

[10886 rows x 2 columns]
       weather  windspeed
0            1     0.0000
1            1     0.0000
2            1     0.0000
3            1     0.0000
4            1     0.0000
...        ...        ...
10881        1    26.0027
10882        1    15.0013
10883        1    15.0013
10884        1     6.0032
10885        1     8.9981

[10886 rows x 2 columns]
       weather  year
0            1  2011
1            1  2011
2            1  2011
3            1  2011
4            1  2011
...        ...   ...
10881        1  2012
10882        1  2012
10883        1  2012
10884        1  2012
10885        1  2012

[10886 rows x 2 columns]
       weather  month
0            1      1
1            1      1
2            1      1
3            1      1
4            1      1
...        ...    ...
10881        1     12
10882        1     12
10883        1     12
10884        1     12
10885        1     12

[10886 rows x 2 columns]
       weather  day
0            1    1
1            1    1
2            1    1
3            1    1
4            1    1
...        ...  ...
10881        1   19
10882        1   19
10883        1   19
10884        1   19
10885        1   19

[10886 rows x 2 columns]
       weather  hour
0            1     0
1            1     1
2            1     2
3            1     3
4            1     4
...        ...   ...
10881        1    19
10882        1    20
10883        1    21
10884        1    22
10885        1    23

[10886 rows x 2 columns]
        temp  humidity
0       9.84        81
1       9.02        80
2       9.02        80
3       9.84        75
4       9.84        75
...      ...       ...
10881  15.58        50
10882  14.76        57
10883  13.94        61
10884  13.94        61
10885  13.12        66

[10886 rows x 2 columns]
        temp  windspeed
0       9.84     0.0000
1       9.02     0.0000
2       9.02     0.0000
3       9.84     0.0000
4       9.84     0.0000
...      ...        ...
10881  15.58    26.0027
10882  14.76    15.0013
10883  13.94    15.0013
10884  13.94     6.0032
10885  13.12     8.9981

[10886 rows x 2 columns]
        temp  year
0       9.84  2011
1       9.02  2011
2       9.02  2011
3       9.84  2011
4       9.84  2011
...      ...   ...
10881  15.58  2012
10882  14.76  2012
10883  13.94  2012
10884  13.94  2012
10885  13.12  2012

[10886 rows x 2 columns]
        temp  month
0       9.84      1
1       9.02      1
2       9.02      1
3       9.84      1
4       9.84      1
...      ...    ...
10881  15.58     12
10882  14.76     12
10883  13.94     12
10884  13.94     12
10885  13.12     12

[10886 rows x 2 columns]
        temp  day
0       9.84    1
1       9.02    1
2       9.02    1
3       9.84    1
4       9.84    1
...      ...  ...
10881  15.58   19
10882  14.76   19
10883  13.94   19
10884  13.94   19
10885  13.12   19

[10886 rows x 2 columns]
        temp  hour
0       9.84     0
1       9.02     1
2       9.02     2
3       9.84     3
4       9.84     4
...      ...   ...
10881  15.58    19
10882  14.76    20
10883  13.94    21
10884  13.94    22
10885  13.12    23

[10886 rows x 2 columns]
       humidity  windspeed
0            81     0.0000
1            80     0.0000
2            80     0.0000
3            75     0.0000
4            75     0.0000
...         ...        ...
10881        50    26.0027
10882        57    15.0013
10883        61    15.0013
10884        61     6.0032
10885        66     8.9981

[10886 rows x 2 columns]
       humidity  year
0            81  2011
1            80  2011
2            80  2011
3            75  2011
4            75  2011
...         ...   ...
10881        50  2012
10882        57  2012
10883        61  2012
10884        61  2012
10885        66  2012

[10886 rows x 2 columns]
       humidity  month
0            81      1
1            80      1
2            80      1
3            75      1
4            75      1
...         ...    ...
10881        50     12
10882        57     12
10883        61     12
10884        61     12
10885        66     12

[10886 rows x 2 columns]
       humidity  day
0            81    1
1            80    1
2            80    1
3            75    1
4            75    1
...         ...  ...
10881        50   19
10882        57   19
10883        61   19
10884        61   19
10885        66   19

[10886 rows x 2 columns]
       humidity  hour
0            81     0
1            80     1
2            80     2
3            75     3
4            75     4
...         ...   ...
10881        50    19
10882        57    20
10883        61    21
10884        61    22
10885        66    23

[10886 rows x 2 columns]
       windspeed  year
0         0.0000  2011
1         0.0000  2011
2         0.0000  2011
3         0.0000  2011
4         0.0000  2011
...          ...   ...
10881    26.0027  2012
10882    15.0013  2012
10883    15.0013  2012
10884     6.0032  2012
10885     8.9981  2012

[10886 rows x 2 columns]
       windspeed  month
0         0.0000      1
1         0.0000      1
2         0.0000      1
3         0.0000      1
4         0.0000      1
...          ...    ...
10881    26.0027     12
10882    15.0013     12
10883    15.0013     12
10884     6.0032     12
10885     8.9981     12

[10886 rows x 2 columns]
       windspeed  day
0         0.0000    1
1         0.0000    1
2         0.0000    1
3         0.0000    1
4         0.0000    1
...          ...  ...
10881    26.0027   19
10882    15.0013   19
10883    15.0013   19
10884     6.0032   19
10885     8.9981   19

[10886 rows x 2 columns]
       windspeed  hour
0         0.0000     0
1         0.0000     1
2         0.0000     2
3         0.0000     3
4         0.0000     4
...          ...   ...
10881    26.0027    19
10882    15.0013    20
10883    15.0013    21
10884     6.0032    22
10885     8.9981    23

[10886 rows x 2 columns]
       year  month
0      2011      1
1      2011      1
2      2011      1
3      2011      1
4      2011      1
...     ...    ...
10881  2012     12
10882  2012     12
10883  2012     12
10884  2012     12
10885  2012     12

[10886 rows x 2 columns]
       year  day
0      2011    1
1      2011    1
2      2011    1
3      2011    1
4      2011    1
...     ...  ...
10881  2012   19
10882  2012   19
10883  2012   19
10884  2012   19
10885  2012   19

[10886 rows x 2 columns]
       year  hour
0      2011     0
1      2011     1
2      2011     2
3      2011     3
4      2011     4
...     ...   ...
10881  2012    19
10882  2012    20
10883  2012    21
10884  2012    22
10885  2012    23

[10886 rows x 2 columns]
       month  day
0          1    1
1          1    1
2          1    1
3          1    1
4          1    1
...      ...  ...
10881     12   19
10882     12   19
10883     12   19
10884     12   19
10885     12   19

[10886 rows x 2 columns]
       month  hour
0          1     0
1          1     1
2          1     2
3          1     3
4          1     4
...      ...   ...
10881     12    19
10882     12    20
10883     12    21
10884     12    22
10885     12    23

[10886 rows x 2 columns]
       day  hour
0        1     0
1        1     1
2        1     2
3        1     3
4        1     4
...    ...   ...
10881   19    19
10882   19    20
10883   19    21
10884   19    22
10885   19    23

[10886 rows x 2 columns]

for c in combinations(f, 2):
    X_train, X_test, y_train, y_test = train_test_split(df[list(c)], df['count'], test_size=0.3)
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print(str(c))
    print("train : ", train_score)
    print("test : ", test_score)
    print("="*20)

('season', 'holiday')
train :  0.05845819571136379
test :  0.06801029389571611
====================
('season', 'workingday')
train :  0.06119690546536827
test :  0.06356368452321925
====================
('season', 'weather')
train :  0.07576082202281542
test :  0.08724412927488545
====================
('season', 'temp')
train :  0.23284427316676115
test :  0.2126150527730073
====================
('season', 'humidity')
train :  0.28060963515764825
test :  0.2553308376517509
====================
('season', 'windspeed')
train :  0.09695952795617624
test :  0.0870184127764726
====================
('season', 'year')
train :  0.12481734997659888
test :  0.1419045806133954
====================
('season', 'month')
train :  0.07110487449658509
test :  0.07775741473009401
====================
('season', 'day')
train :  0.0730837414840454
test :  0.0508598813046125
====================
('season', 'hour')
train :  0.6119506571560571
test :  0.603707014633353
====================
('holiday', 'workingday')
train :  6.254614111333989e-05
test :  -7.213416623663704e-05
====================
('holiday', 'weather')
train :  0.018203745414681616
test :  0.017385163764621958
====================
('holiday', 'temp')
train :  0.16992644518592026
test :  0.18105496093542683
====================
('holiday', 'humidity')
train :  0.13910601748660945
test :  0.1188547039149569
====================
('holiday', 'windspeed')
train :  0.019516032040894027
test :  0.01644872706450229
====================
('holiday', 'year')
train :  0.07140907585286749
test :  0.05987754943200241
====================
('holiday', 'month')
train :  0.07170470686238317
test :  0.07658428013922136
====================
('holiday', 'day')
train :  0.007090447830976676
test :  -0.002347600544694739
====================
('holiday', 'hour')
train :  0.5203690714412798
test :  0.5147726988065502
====================
('workingday', 'weather')
train :  0.017652970990177486
test :  0.018520313691839374
====================
('workingday', 'temp')
train :  0.17701773948619048
test :  0.1656941296996629
====================
('workingday', 'humidity')
train :  0.1458885900358774
test :  0.12107148034198867
====================
('workingday', 'windspeed')
train :  0.025180747003014092
test :  0.009864892264444092
====================
('workingday', 'year')
train :  0.06463100782531617
test :  0.0758191172864463
====================
('workingday', 'month')
train :  0.07721271823416076
test :  0.0669414583433603
====================
('workingday', 'day')
train :  0.00513734237636676
test :  -0.0019346627158343122
====================
('workingday', 'hour')
train :  0.6538749506801984
test :  0.6533623225870554
====================
('weather', 'temp')
train :  0.19434607518171065
test :  0.17027741524867768
====================
('weather', 'humidity')
train :  0.15828235468992424
test :  0.10505369754010518
====================
('weather', 'windspeed')
train :  0.04406413642063167
test :  0.03658864299539133
====================
('weather', 'year')
train :  0.08756829553309142
test :  0.07928282501967343
====================
('weather', 'month')
train :  0.09587192759658014
test :  0.08846831294974555
====================
('weather', 'day')
train :  0.030841733528468973
test :  0.009799462464378794
====================
('weather', 'hour')
train :  0.5532384362164128
test :  0.5332867151807875
====================
('temp', 'humidity')
train :  0.3686949135390849
test :  0.24043607129139977
====================
('temp', 'windspeed')
train :  0.2590220131987515
test :  0.11573756231088816
====================
('temp', 'year')
train :  0.23753762757820374
test :  0.21815533604779336
====================
('temp', 'month')
train :  0.2880174693849482
test :  0.22970742981217374
====================
('temp', 'day')
train :  0.2663310001326936
test :  0.14210666955745732
====================
('temp', 'hour')
train :  0.679074285859746
test :  0.6037291739012631
====================
('humidity', 'windspeed')
train :  0.25819049064192934
test :  0.036960054751834504
====================
('humidity', 'year')
train :  0.20136505635494273
test :  0.18650522639089195
====================
('humidity', 'month')
train :  0.38909825925915065
test :  0.2496396608260928
====================
('humidity', 'day')
train :  0.3194661339783871
test :  0.06093953967038068
====================
('humidity', 'hour')
train :  0.6346694753826874
test :  0.5050059135565201
====================
('windspeed', 'year')
train :  0.09323464953999572
test :  0.08183108426857966
====================
('windspeed', 'month')
train :  0.13519972851366324
test :  0.06816860927886215
====================
('windspeed', 'day')
train :  0.06735231888334947
test :  -0.012116863497556363
====================
('windspeed', 'hour')
train :  0.5454843269854666
test :  0.4981731533709608
====================
('year', 'month')
train :  0.14949221759923648
test :  0.13777955850772106
====================
('year', 'day')
train :  0.06481644985431456
test :  0.07503715410081335
====================
('year', 'hour')
train :  0.6219452481465291
test :  0.6155966763154723
====================
('month', 'day')
train :  0.10347692407809816
test :  0.0417316959498103
====================
('month', 'hour')
train :  0.6344925826836026
test :  0.6095869000632028
====================
('day', 'hour')
train :  0.5278468147569957
test :  0.47347898020353973
====================

for c in combinations(f, 2):
    X_train, X_test, y_train, y_test = train_test_split(df[list(c)], df['count'], test_size=0.3)
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    result = {"combination" : str(c), "train" : train_score, "test" : test_score}
    all_result.append(result)
all_result

[{'combination': "('season', 'holiday')",
  'train': 0.06429541388996607,
  'test': 0.05439025396453023},
 {'combination': "('season', 'workingday')",
  'train': 0.06534159774564652,
  'test': 0.05273019184958949},
 {'combination': "('season', 'weather')",
  'train': 0.08216391857162053,
  'test': 0.07265795524806051},
 {'combination': "('season', 'temp')",
  'train': 0.2351771066164272,
  'test': 0.20490765892966023},
 {'combination': "('season', 'humidity')",
  'train': 0.2856744999467695,
  'test': 0.24563783000940131},
 {'combination': "('season', 'windspeed')",
  'train': 0.09543727422483361,
  'test': 0.08810482783949436},
 {'combination': "('season', 'year')",
  'train': 0.13202699951491037,
  'test': 0.12640899224946056},
 {'combination': "('season', 'month')",
  'train': 0.0741459578302236,
  'test': 0.06991633720348411},
 {'combination': "('season', 'day')",
  'train': 0.07435037626015506,
  'test': 0.04502838971546008},
 {'combination': "('season', 'hour')",
  'train': 0.6064060557910389,
  'test': 0.6161882221308241},
 {'combination': "('holiday', 'workingday')",
  'train': 9.283914579183428e-05,
  'test': -0.0006753884944799005},
 {'combination': "('holiday', 'weather')",
  'train': 0.01698704128836559,
  'test': 0.020411523484866634},
 {'combination': "('holiday', 'temp')",
  'train': 0.17543847668694112,
  'test': 0.16892160464232586},
 {'combination': "('holiday', 'humidity')",
  'train': 0.1519459899428811,
  'test': 0.08329276861928858},
 {'combination': "('holiday', 'windspeed')",
  'train': 0.022829782512737373,
  'test': 0.008072210959060855},
 {'combination': "('holiday', 'year')",
  'train': 0.07105795444215501,
  'test': 0.06048052370486012},
 {'combination': "('holiday', 'month')",
  'train': 0.07507252836735323,
  'test': 0.06951961287107877},
 {'combination': "('holiday', 'day')",
  'train': 0.0068259510156423175,
  'test': 0.003024065717116664},
 {'combination': "('holiday', 'hour')",
  'train': 0.5228490974196865,
  'test': 0.5080892804225323},
 {'combination': "('workingday', 'weather')",
  'train': 0.020828301449846043,
  'test': 0.011411014014170995},
 {'combination': "('workingday', 'temp')",
  'train': 0.17502869489703998,
  'test': 0.17364869335565458},
 {'combination': "('workingday', 'humidity')",
  'train': 0.14476689394463738,
  'test': 0.12851029339806597},
 {'combination': "('workingday', 'windspeed')",
  'train': 0.023235429159391696,
  'test': 0.01615716563294245},
 {'combination': "('workingday', 'year')",
  'train': 0.07277112893558291,
  'test': 0.05661629910846899},
 {'combination': "('workingday', 'month')",
  'train': 0.07715409638606674,
  'test': 0.06466835772871471},
 {'combination': "('workingday', 'day')",
  'train': 0.005231127635079602,
  'test': -0.0036445773889950406},
 {'combination': "('workingday', 'hour')",
  'train': 0.65163346772907,
  'test': 0.65980453848609},
 {'combination': "('weather', 'temp')",
  'train': 0.1952024653027693,
  'test': 0.16654321420746143},
 {'combination': "('weather', 'humidity')",
  'train': 0.15658685847221743,
  'test': 0.10298867496824105},
 {'combination': "('weather', 'windspeed')",
  'train': 0.0474964097877556,
  'test': 0.02771650824026939},
 {'combination': "('weather', 'year')",
  'train': 0.09116637560345542,
  'test': 0.07078163179731245},
 {'combination': "('weather', 'month')",
  'train': 0.09422581958616993,
  'test': 0.09301492937102684},
 {'combination': "('weather', 'day')",
  'train': 0.02716329585565125,
  'test': 0.019863988688087142},
 {'combination': "('weather', 'hour')",
  'train': 0.5558689455886267,
  'test': 0.5285109716235332},
 {'combination': "('temp', 'humidity')",
  'train': 0.37535892235591173,
  'test': 0.23880588782898804},
 {'combination': "('temp', 'windspeed')",
  'train': 0.25899025839398493,
  'test': 0.12629183224336316},
 {'combination': "('temp', 'year')",
  'train': 0.2356678407766435,
  'test': 0.22465200787573958},
 {'combination': "('temp', 'month')",
  'train': 0.28378494720565084,
  'test': 0.2352749739844313},
 {'combination': "('temp', 'day')",
  'train': 0.2698911501769191,
  'test': 0.13544182234706081},
 {'combination': "('temp', 'hour')",
  'train': 0.6904494294728518,
  'test': 0.5739633601701017},
 {'combination': "('humidity', 'windspeed')",
  'train': 0.2705701220348399,
  'test': 0.01345498635788167},
 {'combination': "('humidity', 'year')",
  'train': 0.20736843182661158,
  'test': 0.17373936030535997},
 {'combination': "('humidity', 'month')",
  'train': 0.386774099530133,
  'test': 0.2711809725326212},
 {'combination': "('humidity', 'day')",
  'train': 0.31456450734972696,
  'test': 0.06626831878009554},
 {'combination': "('humidity', 'hour')",
  'train': 0.6484746860145716,
  'test': 0.4599222799053242},
 {'combination': "('windspeed', 'year')",
  'train': 0.094354903191749,
  'test': 0.0771786967680903},
 {'combination': "('windspeed', 'month')",
  'train': 0.13161596556156407,
  'test': 0.08218792527707564},
 {'combination': "('windspeed', 'day')",
  'train': 0.07047871647514425,
  'test': -0.018641448646160574},
 {'combination': "('windspeed', 'hour')",
  'train': 0.553397149647815,
  'test': 0.48190568850884374},
 {'combination': "('year', 'month')",
  'train': 0.14710504454494133,
  'test': 0.1424721899303245},
 {'combination': "('year', 'day')",
  'train': 0.07342932178624018,
  'test': 0.05346903169933215},
 {'combination': "('year', 'hour')",
  'train': 0.6208178463861078,
  'test': 0.6179964590935694},
 {'combination': "('month', 'day')",
  'train': 0.09841937927428457,
  'test': 0.05207547071903362},
 {'combination': "('month', 'hour')",
  'train': 0.6386774092912783,
  'test': 0.6014056338728897},
 {'combination': "('day', 'hour')",
  'train': 0.5235915799173942,
  'test': 0.48925313013863636}]

# test 를 기준으로 오름차순으로 정렬
result_df = pd.DataFrame(all_result).sort_values(by="test")

# test 의 점수가 가장 높은 5가지를 확인
result_df.tail()

fe = ['season', 'holiday', 'workingday', 'weather', 'temp',
       'humidity', 'windspeed', 'year', 'month', 'day', 'hour']

# 가장 좋은 조합
best_f = ["workingday", "hour"]

all_result = []
for f in fe:
    # 가장 좋은 조합을 확인하기 위해
    # feature 하나씩 추가해서 확인
    best_f.append(f)
    # print(best_f)

    X_train, X_test, y_train, y_test = train_test_split(df[best_f], df['count'], test_size=0.3)
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    train_s = model.score(X_train, y_train)
    test_s = model.score(X_test, y_test)

    result = {"combin" : best_f.copy(), "train" : train_s, "test" : test_s}
    all_result.append(result)

    # 다음 feature를 넣어서 확인하기 위해 마지막에 넣은 feature 제거
    best_f.pop()
    # print(best_f)

all_result

[{'combin': ['workingday', 'hour', 'season'],
  'train': 0.7580913458160565,
  'test': 0.7440944945103845},
 {'combin': ['workingday', 'hour', 'holiday'],
  'train': 0.6650019150696596,
  'test': 0.6345567260841918},
 {'combin': ['workingday', 'hour', 'workingday'],
  'train': 0.6590593073097475,
  'test': 0.6431016423118261},
 {'combin': ['workingday', 'hour', 'weather'],
  'train': 0.6926684259802643,
  'test': 0.6668174416876799},
 {'combin': ['workingday', 'hour', 'temp'],
  'train': 0.8323620614455098,
  'test': 0.7295907373392445},
 {'combin': ['workingday', 'hour', 'humidity'],
  'train': 0.8004993632245669,
  'test': 0.600456524550598},
 {'combin': ['workingday', 'hour', 'windspeed'],
  'train': 0.6990780034197865,
  'test': 0.6266172439536393},
 {'combin': ['workingday', 'hour', 'year'],
  'train': 0.767761505426242,
  'test': 0.7636787124282669},
 {'combin': ['workingday', 'hour', 'month'],
  'train': 0.7863534443871265,
  'test': 0.7419456112733107},
 {'combin': ['workingday', 'hour', 'day'],
  'train': 0.6762676095319997,
  'test': 0.5931861225822459},
 {'combin': ['workingday', 'hour', 'hour'],
  'train': 0.6560251705446849,
  'test': 0.6500550217881965}]

# test 를 기준으로 오름차순으로 정렬
result_df = pd.DataFrame(all_result).sort_values(by="test")

# test 의 점수가 가장 높은 5가지를 확인
result_df.tail()

# 함수로 위 과정 처리
def features_select(feature, best_f):
    all_result = []
    for f in feature:
        # 가장 좋은 조합을 확인하기 위해
        # feature 하나씩 추가해서 확인
        best_f.append(f)
        # print(best_f)
    
        X_train, X_test, y_train, y_test = train_test_split(df[best_f], df['count'], test_size=0.3)
        model = RandomForestRegressor()
        model.fit(X_train, y_train)
        train_s = model.score(X_train, y_train)
        test_s = model.score(X_test, y_test)
    
        result = {"combin" : best_f.copy(), "train" : train_s, "test" : test_s}
        all_result.append(result)
    
        # 다음 feature를 넣어서 확인하기 위해 마지막에 넣은 feature 제거
        best_f.pop()
        # print(best_f)
    return all_result;

fe = ['season', 'holiday', 'workingday', 'weather', 'temp',
       'humidity', 'windspeed', 'month', 'day', 'hour']

# 가장 좋은 조합
best_f = ["workingday", "hour", "year"]

# test 를 기준으로 오름차순으로 정렬
result_df = pd.DataFrame(features_select(fe, best_f)).sort_values(by="test")

# test 의 점수가 가장 높은 5가지를 확인
result_df.tail()

# 이제 후진 선택법을 사용해 볼 것임

fe = ['season', 'holiday', 'workingday', 'weather', 'temp',
       'humidity', 'windspeed', 'month', 'day', 'hour', "workingday", "hour", "year"]

# best_f = []

all_result = []

for c in combinations(fe, len(fe)-1):
    # feature 에서 전체 컬럼의 수 -1 을 해서 하나씩 빼가면서
    # 테스트를 해보겠다는 뜻
    target = list(c)
    # print(target)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    train_s = model.score(X_train, y_train)
    test_s = model.score(X_test, y_test)

    dropped = set(fe) - set(target)
    result = {"dropped":dropped, "train":train_s, "test":test_s}
    all_result.append(result)

result_df = pd.DataFrame(all_result).sort_values(by="test")
result_df

fe = ['season', 'holiday', 'workingday', 'weather', 'temp',
       'humidity', 'windspeed', 'month', 'day', 'hour', "workingday", "hour", "year"]

from sklearn.feature_selection import RFE

model = RandomForestRegressor()

rfe = RFE( estimator = model )
rfe.fit(X_train, y_train)

rfe_df = pd.DataFrame()
rfe_df["ranking"] = rfe.ranking_
rfe_df["features"] = X_train.columns
rfe_df

# ranking 이 높을수록 결과 값에 영향을 크게 미치는
# 중요한 컬럼들
rfe_df.sort_values(by="ranking")

feature = ['workingday', 'temp', 'humidity', 'year', 'month', 'hour', 'atemp']

model = RandomForestRegressor()
model.fit(X_train, y_train)
print("train : ", model.score(X_train, y_train))
print("test : ", model.score(X_test, y_test))
# 모든 컬럼을 사용했을때 평가점수

train :  0.9918053687355339
test :  0.9402065652970045

X_train.columns

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'year', 'month', 'day', 'hour'],
      dtype='object')

X_tr, X_te, y_tr, y_te = train_test_split(df[feature], df[l], test_size=0.3)

model = RandomForestRegressor()
model.fit(X_tr, y_tr)
print("train : ", model.score(X_tr, y_tr))
print("test : ", model.score(X_te, y_te))
# 결과에 영향을 크게 주는 컬럼들을 사용했을때 평가 점수

train :  0.9895003195998577
test :  0.9279685991163032

# embed 방식 변수 선택법을 확인해볼 것임
model = RandomForestRegressor()
model.fit(X_train, y_train)

print("train : ", model.score(X_train, y_train))
print("test : ", model.score(X_test, y_test))
print(model.feature_importances_)
print(X_train.columns)

train :  0.9917659298573632
test :  0.9393860178830495
[0.01127486 0.00352902 0.07385774 0.01356585 0.09019879 0.02357791
 0.02972932 0.00938863 0.08657932 0.03626527 0.01343145 0.60860183]
Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'year', 'month', 'day', 'hour'],
      dtype='object')

df_1 = pd.DataFrame()
# feature_importances 는 Tree 계열 알고리즘에서만 존재한다
# 이 방법이 embed 방법!!
df_1['rank'] = model.feature_importances_
df_1['feature'] = X_train.columns
df_1.sort_values(by="rank", ascending=False)

f = ['hour', 'temp', 'year', 'workingday', 'month', 'humidity']
X_train, X_test, y_train, y_test = train_test_split(df[f], df[l], test_size=0.3)
model = RandomForestRegressor()
model.fit(X_train, y_train)

print("train : ", model.score(X_train, y_train))
print("test : ", model.score(X_test, y_test))

train :  0.9889267902576386
test :  0.9297102519637512

	season	holiday	workingday	weather	temp	atemp	humidity	windspeed	casual	registered	count	year	month	day	hour	temp_int
season	1.000000	0.029368	-0.008126	0.008879	0.258689	0.264744	0.190610	-0.147121	0.096758	0.164011	0.163439	-0.004797	0.971524	0.001729	-0.006546	0.257917
holiday	0.029368	1.000000	-0.250491	-0.007074	0.000295	-0.005215	0.001929	0.008409	0.043799	-0.020956	-0.005393	0.012021	0.001731	-0.015877	-0.000354	0.000192
workingday	-0.008126	-0.250491	1.000000	0.033772	0.029966	0.024660	-0.010880	0.013373	-0.319111	0.119460	0.011594	-0.002482	-0.003394	0.009829	0.002780	0.029603
weather	0.008879	-0.007074	0.033772	1.000000	-0.055035	-0.055376	0.406244	0.007261	-0.135918	-0.109340	-0.128655	-0.012548	0.012144	-0.007890	-0.022740	-0.054556
temp	0.258689	0.000295	0.029966	-0.055035	1.000000	0.984948	-0.064949	-0.017852	0.467097	0.318571	0.394454	0.061226	0.257589	0.015551	0.145430	0.999313
atemp	0.264744	-0.005215	0.024660	-0.055376	0.984948	1.000000	-0.043536	-0.057473	0.462067	0.314635	0.389784	0.058540	0.264173	0.011866	0.140343	0.984431
humidity	0.190610	0.001929	-0.010880	0.406244	-0.064949	-0.043536	1.000000	-0.318607	-0.348187	-0.265458	-0.317371	-0.078606	0.204537	-0.011335	-0.278011	-0.064205
windspeed	-0.147121	0.008409	0.013373	0.007261	-0.017852	-0.057473	-0.318607	1.000000	0.092276	0.091052	0.101369	-0.015221	-0.150192	0.036157	0.146631	-0.017660
casual	0.096758	0.043799	-0.319111	-0.135918	0.467097	0.462067	-0.348187	0.092276	1.000000	0.497250	0.690414	0.145241	0.092722	0.014109	0.302045	0.467047
registered	0.164011	-0.020956	0.119460	-0.109340	0.318571	0.314635	-0.265458	0.091052	0.497250	1.000000	0.970948	0.264265	0.169451	0.019111	0.380540	0.318048
count	0.163439	-0.005393	0.011594	-0.128655	0.394454	0.389784	-0.317371	0.101369	0.690414	0.970948	1.000000	0.260403	0.166862	0.019826	0.400601	0.394003
year	-0.004797	0.012021	-0.002482	-0.012548	0.061226	0.058540	-0.078606	-0.015221	0.145241	0.264265	0.260403	1.000000	-0.004932	0.001800	-0.004234	0.060692
month	0.971524	0.001731	-0.003394	0.012144	0.257589	0.264173	0.204537	-0.150192	0.092722	0.169451	0.166862	-0.004932	1.000000	0.001974	-0.006818	0.256862
day	0.001729	-0.015877	0.009829	-0.007890	0.015551	0.011866	-0.011335	0.036157	0.014109	0.019111	0.019826	0.001800	0.001974	1.000000	0.001132	0.016202
hour	-0.006546	-0.000354	0.002780	-0.022740	0.145430	0.140343	-0.278011	0.146631	0.302045	0.380540	0.400601	-0.004234	-0.006818	0.001132	1.000000	0.145353
temp_int	0.257917	0.000192	0.029603	-0.054556	0.999313	0.984431	-0.064205	-0.017660	0.467047	0.318048	0.394003	0.060692	0.256862	0.016202	0.145353	1.000000

	combin	train	test
2	[workingday, hour, year, workingday]	0.764556	0.771759
3	[workingday, hour, year, weather]	0.802760	0.778615
4	[workingday, hour, year, temp]	0.927649	0.848878
0	[workingday, hour, year, season]	0.874880	0.861009
7	[workingday, hour, year, month]	0.906467	0.881726

	dropped	train	test
5	{month}	0.992111	0.930813
4	{day}	0.991652	0.932279
0	{year}	0.991575	0.936426
6	{windspeed}	0.992017	0.936628
1	{}	0.991685	0.937244
8	{temp}	0.991496	0.937614
11	{holiday}	0.991583	0.939131
10	{}	0.991901	0.939687
12	{season}	0.991995	0.940164
9	{weather}	0.991278	0.941420
3	{}	0.991178	0.942275
7	{humidity}	0.991742	0.943053
2	{}	0.991287	0.944364

[머신러닝] 군집 ( 고객분류 ) (0)	2024.05.28
[머신러닝] 회귀 및 평가지표 (0)	2024.05.27
[머신러닝] 과적합 및 하이퍼파라미터 (0)	2024.05.27
[머신러닝] 지도학습 ( 분류, 회귀 ), 평가지표 선택하는 방법 (0)	2024.05.24
[머신러닝] 탐색적 데이터분석 ( EDA, 표준화, 가중치 ) (0)	2024.05.24

전영호의 개발 블로그

머신러닝 변수 선택

[머신러닝] 변수 선택법 ( feature selection )

변수 선택법 ( feature selection )

feature selection(변수 선택법)¶

wrapper(전진/후진 선택)¶

'BE > 머신러닝(ML)' 카테고리의 다른 글

+ Recent posts

티스토리툴바

	datetime	season	weather	temp	atemp	humidity	casual	registered	count	year	month	day	hour	temp_int
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16	2011	1	1	0	9
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40	2011	1	1	1	9
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32	2011	1	1	2	9
3	2011-01-01 03:00:00	1	1	9.84	14.395	75	3	10	13	2011	1	1	3	9
4	2011-01-01 04:00:00	1	1	9.84	14.395	75	0	1	1	2011	1	1	4	9

	combination	train	test
9	('season', 'hour')	0.606406	0.616188
51	('year', 'hour')	0.620818	0.617996
81	('workingday', 'hour')	0.657728	0.644720
136	('workingday', 'hour')	0.657899	0.644887
26	('workingday', 'hour')	0.651633	0.659805

	combin	train	test
3	[workingday, hour, weather]	0.692668	0.666817
4	[workingday, hour, temp]	0.832362	0.729591
8	[workingday, hour, month]	0.786353	0.741946
0	[workingday, hour, season]	0.758091	0.744094
7	[workingday, hour, year]	0.767762	0.763679

	rank	feature
0	0.617593	hour
1	0.122718	temp
2	0.082577	year
3	0.071475	workingday
4	0.055804	month
5	0.049833	humidity