ColumnTransformer를 사용하여 파이프라인으로 다른 열(숫자, 범주, 텍스트 포함)을 전처리할 때 최종 변환된 데이터의 기능 이름을 가져올 수 없으므로 디버깅이 어렵습니다.
코드는 다음과 같습니다.
# Reproduction script: preprocess groups of Titanic columns with one Pipeline
# per group inside a ColumnTransformer, then ask it for its feature names.
titanic_url = (
    'https://raw.githubusercontent.com/amueller/'
    'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv'
)
data = pd.read_csv(titanic_url)
target = data.pop('survived')  # pop() also removes the column from `data`

# One list of column names per preprocessing strategy.
numeric_columns = ['age', 'sibsp', 'parch']
category_columns = ['pclass', 'sex', 'embarked']
text_columns = ['name', 'home.dest']

# Numeric columns: median imputation, then StandardScaler.
numeric_transformer = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the constant 'missing', then one-hot
# encode (unknown categories are ignored).
category_transformer = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Text column: CountVectorizer only.
text_transformer = Pipeline(
    steps=[
        ('cntvec', CountVectorizer()),
    ]
)

preprocesser = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_columns),
        ('category', category_transformer, category_columns),
        # Only the first text column is passed, as a single column name.
        ('text', text_transformer, text_columns[0]),
    ]
)
preprocesser.fit_transform(data)
# This is the call the report is about: it is stated to fail with
# "AttributeError: Transformer numeric (type Pipeline) does not provide
# get_feature_names."
preprocesser.get_feature_names()
오류가 발생합니다. AttributeError: Transformer numeric (type Pipeline) does not provide get_feature_names.
ColumnTransformer 에서 text_transformer 는 문자열(예: 'Sex')만 처리할 수 있지만 text_columns 와 같은 문자열 목록은 처리할 수 없습니다.
이것은 ColumnTransformer에 대한 문제가 아닙니다.
eli5 는 파이프라인을 지원할 수 있는 기능 이름 기능을 구현합니다.
Re 2. 아마도 각 열에 텍스트 벡터라이저를 적용하는 깔끔한 방법이 없다는 것이 비우호적이라는 말이 맞을 것입니다. 단순히 CountVectorizer 등에서 여러 입력 열을 지원하기 시작하지 않는 한 이것이 어떻게 깔끔하게 달성될 수 있는지 잘 모르겠습니다.
이것은 ColumnTransformer에 대한 문제가 아닙니다. - 파이프라인에 관한 것입니다.
eli5 는 파이프라인을 지원할 수 있는 기능 이름 기능을 구현합니다.
Re 2. 아마도 각 열에 텍스트 벡터라이저를 적용하는 깔끔한 방법이 없다는 것이 비우호적이라는 말이 맞을 것입니다. 단순히 CountVectorizer 등에서 여러 입력 열을 지원하기 시작하지 않는 한 이것이 어떻게 깔끔하게 달성될 수 있는지 잘 모르겠습니다.
친절한 답변 감사합니다!
내가 아는 바와 같이 OneHotEncoder, CountVectorizer 와 같이 하나의 열을 다중 열로 변경할 수 있는 메서드를 사용하여 열을 사전 처리할 때 파이프라인 마지막 단계의 변환기에서 새 데이터 열 이름을 가져올 수 있습니다. get_feature_names 함수는 새 열을 생성하지 않는 메서드를 사용할 때 원시 열 이름만 설정할 수 있습니다.
def get_column_names_from_ColumnTransformer(column_transformer):
    """Return the output column names of a fitted ``ColumnTransformer``.

    Walks ``column_transformer.transformers_`` and, for every entry, calls
    ``get_feature_names()`` on the transformer (or on the last step when the
    transformer is a pipeline). A transformer without that method is assumed
    to keep its input columns, so the entry's own column spec is used.

    Differences from the snippet as originally posted:

    * the ``'remainder'`` entry is skipped by name rather than by dropping
      the last entry, so no real transformer is lost when ``transformers_``
      contains no remainder entry;
    * entries whose transformer is the string ``'drop'`` contribute no names;
    * tuple / list-like column specs are no longer silently ignored;
    * pipelines and arrays are recognised by attribute (``steps`` /
      ``tolist``), so the function does not need ``Pipeline`` or ``np`` to be
      imported by the caller.

    NOTE(review): recent scikit-learn releases expose
    ``get_feature_names_out`` instead of ``get_feature_names`` -- confirm
    against the installed version.

    Parameters
    ----------
    column_transformer : object
        Fitted transformer exposing ``transformers_`` as a sequence of
        ``(name, transformer, columns)`` entries.

    Returns
    -------
    list
        Collected names, in the order of ``transformers_``. A column spec
        that is neither a string nor list-like (e.g. a single integer) is
        appended as-is.
    """
    col_name = []
    for entry in column_transformer.transformers_:
        name, transformer, raw_col_name = entry[0], entry[1], entry[2]

        # The remainder entry is only present when columns were left over,
        # so it has to be recognised by name, not by position.
        if name == 'remainder':
            continue
        # Dropped columns never reach the output.
        if isinstance(transformer, str) and transformer == 'drop':
            continue

        # For a pipeline (anything exposing ``steps``) the last step decides
        # which columns come out; descend through nested pipelines as well.
        steps = getattr(transformer, 'steps', None)
        while steps:
            transformer = steps[-1][1]
            steps = getattr(transformer, 'steps', None)

        try:
            names = transformer.get_feature_names()
        except AttributeError:  # if no 'get_feature_names' function, use raw column name
            names = raw_col_name

        if isinstance(names, str):
            col_name.append(names)
        elif hasattr(names, 'tolist'):  # e.g. numpy array or pandas Index
            col_name.extend(names.tolist())
        elif isinstance(names, (list, tuple)):
            col_name.extend(names)
        else:
            col_name.append(names)
    return col_name
위의 코드를 사용하여 preprocesser 의 열 이름을 얻을 수 있습니다.
이 코드가 이 질문을 해결합니까?
eli5 현재 해당 기능을 찾을 수 없습니다. eli5의 명시적 예제 또는 API에 대한 링크를 제공할 수 있습니까?
eli5와 관련하여 transform_feature_names(explain_weights에서 사용)를 참조하십시오.
1μ #6425μ 볡μ νμ΄μ£ ? λλ κ·Έκ²μ μ μ μ°κ³ μΆλ€.
ColumnTransformer 사용하면 여러 텍스트 열을 지원하는 것이 매우 쉽다고 생각합니다. 가장 예쁜 코드는 아니지만 각 텍스트 열에 대해 CountVectorizer를 추가할 수 있습니다.
그리고 get_feature_names 가 없다고 해서 열 이름만 사용할 수 있다는 의미는 아니기 때문에 스니펫은 실제로 문제를 해결하지 못합니다.
1μ #6425μ 볡μ νμ΄μ£ ? λλ κ·Έκ²μ μ μ μ°κ³ μΆλ€.
ColumnTransformer 사용하면 여러 텍스트 열을 지원하는 것이 매우 쉽다고 생각합니다. 가장 예쁜 코드는 아니지만 각 텍스트 열에 대해 CountVectorizer를 추가할 수 있습니다.
그리고 get_feature_names 가 없다고 해서 열 이름만 사용할 수 있다는 의미는 아니기 때문에 스니펫은 실제로 문제를 해결하지 못합니다.
예, 전처리 파이프라인에서 pandas DataFrame 피드 후 생성된 데이터에서 정확히 무슨 일이 일어났는지 알 수 있도록 기능 이름을 얻는 것이 좋습니다.
알겠습니다. 중복으로 닫습니다.
이것은 ColumnTransformer에 대한 문제가 아닙니다. - 파이프라인에 관한 것입니다.
eli5 는 파이프라인을 지원할 수 있는 기능 이름 기능을 구현합니다.
Re 2. 아마도 각 열에 텍스트 벡터라이저를 적용하는 깔끔한 방법이 없다는 것이 비우호적이라는 말이 맞을 것입니다. 단순히 CountVectorizer 등에서 여러 입력 열을 지원하기 시작하지 않는 한 이것이 어떻게 깔끔하게 달성될 수 있는지 잘 모르겠습니다.
친절한 답변 감사합니다!
내가 아는 바와 같이 OneHotEncoder, CountVectorizer 와 같이 하나의 열을 다중 열로 변경할 수 있는 메서드를 사용하여 열을 사전 처리할 때 파이프라인 마지막 단계의 변환기에서 새 데이터 열 이름을 가져올 수 있습니다. get_feature_names 함수는 새 열을 생성하지 않는 메서드를 사용할 때 원시 열 이름만 설정할 수 있습니다.
def get_column_names_from_ColumnTransformer(column_transformer):
    col_name = []
    for transformer_in_columns in column_transformer.transformers_[:-1]:#the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        if isinstance(transformer_in_columns[1],Pipeline):
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names()
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += names.tolist()
        elif isinstance(names,list):
            col_name += names
        elif isinstance(names,str):
            col_name.append(names)
    return col_name
위의 코드를 사용하여 preprocesser 의 열 이름을 얻을 수 있습니다.
이 코드가 이 질문을 해결합니까?
eli5 현재 해당 기능을 찾을 수 없습니다. eli5의 명시적 예제 또는 API에 대한 링크를 제공할 수 있습니까?
onehot 형식에 대해 rawname_value와 같은 이름을 되돌리기 위해 약간의 개선을 했습니다.
def _as_name_list(names):
    """Return ``names`` (string, array, list, tuple or scalar) as a plain list."""
    if isinstance(names, str):
        return [names]
    if hasattr(names, 'tolist'):  # e.g. numpy array or pandas Index
        return names.tolist()
    if isinstance(names, (list, tuple)):
        return list(names)
    return [names]


def _with_raw_column_prefix(feature_name, raw_columns):
    """Rewrite ``'x<i>_<value>'`` as ``'<raw_columns[i]>_<value>'``.

    Only the first underscore separates prefix and value, so values that
    contain underscores are kept intact. Names that do not follow the
    pattern, or whose index is out of range, are returned unchanged.
    """
    prefix, sep, value = str(feature_name).partition('_')
    index = prefix[1:]
    if sep and prefix[:1] == 'x' and index.isdecimal() and int(index) < len(raw_columns):
        return '{}_{}'.format(raw_columns[int(index)], value)
    return feature_name


def get_column_names_from_ColumnTransformer(column_transformer):
    """Return output column names, naming one-hot columns ``rawname_value``.

    Walks ``column_transformer.transformers_``; for every entry the
    transformer (or the last step of a pipeline) is asked for
    ``get_feature_names()``. When that transformer also exposes
    ``categories_`` (presumably a one-hot encoder -- the only case the
    original snippet targeted), generated names of the form ``x<i>_<value>``
    are rewritten with the i-th raw column name as prefix. Transformers
    without ``get_feature_names`` contribute their raw column names.

    Differences from the snippet as originally posted:

    * it inspects the transformer of the current entry instead of the first
      transformer of an undefined global ``preprocessor``;
    * renamed one-hot columns are returned once, not followed by a second
      copy of the generated ``x<i>_...`` names;
    * values containing ``_`` and names without ``_`` (e.g. vectorizer
      tokens) no longer break the tuple unpacking;
    * the ``'remainder'`` entry is skipped by name, ``'drop'`` entries are
      skipped, and tuple / list-like column specs are accepted.

    Parameters
    ----------
    column_transformer : object
        Fitted transformer exposing ``transformers_`` as a sequence of
        ``(name, transformer, columns)`` entries.

    Returns
    -------
    list
        Collected names, in the order of ``transformers_``.
    """
    col_name = []
    for entry in column_transformer.transformers_:
        name, transformer, raw_col_name = entry[0], entry[1], entry[2]

        # The remainder entry is only present when columns were left over,
        # so it is recognised by name; dropped columns produce no output.
        if name == 'remainder':
            continue
        if isinstance(transformer, str) and transformer == 'drop':
            continue

        # For a pipeline (anything exposing ``steps``) use its last step.
        steps = getattr(transformer, 'steps', None)
        while steps:
            transformer = steps[-1][1]
            steps = getattr(transformer, 'steps', None)

        try:
            names = transformer.get_feature_names()
        except AttributeError:  # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        else:
            if hasattr(transformer, 'categories_'):
                raw_columns = _as_name_list(raw_col_name)
                names = [
                    _with_raw_column_prefix(generated, raw_columns)
                    for generated in _as_name_list(names)
                ]

        col_name.extend(_as_name_list(names))
    return col_name
이것은 ColumnTransformer에 대한 문제가 아닙니다. - 파이프라인에 관한 것입니다.
eli5 는 파이프라인을 지원할 수 있는 기능 이름 기능을 구현합니다.
Re 2. 아마도 각 열에 텍스트 벡터라이저를 적용하는 깔끔한 방법이 없다는 것이 비우호적이라는 말이 맞을 것입니다. 단순히 CountVectorizer 등에서 여러 입력 열을 지원하기 시작하지 않는 한 이것이 어떻게 깔끔하게 달성될 수 있는지 잘 모르겠습니다.
친절한 답변 감사합니다!
내가 아는 바와 같이 OneHotEncoder, CountVectorizer 와 같이 하나의 열을 다중 열로 변경할 수 있는 메서드를 사용하여 열을 사전 처리할 때 파이프라인 마지막 단계의 변환기에서 새 데이터 열 이름을 가져올 수 있습니다. get_feature_names 함수는 새 열을 생성하지 않는 메서드를 사용할 때 원시 열 이름만 설정할 수 있습니다.
def get_column_names_from_ColumnTransformer(column_transformer):
    col_name = []
    for transformer_in_columns in column_transformer.transformers_[:-1]:#the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        if isinstance(transformer_in_columns[1],Pipeline):
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names()
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += names.tolist()
        elif isinstance(names,list):
            col_name += names
        elif isinstance(names,str):
            col_name.append(names)
    return col_name
위의 코드를 사용하여 preprocesser 의 열 이름을 얻을 수 있습니다.
이 코드가 이 질문을 해결합니까?
eli5 현재 해당 기능을 찾을 수 없습니다. eli5의 명시적 예제 또는 API에 대한 링크를 제공할 수 있습니까?
파이프라인에서 add_indicator와 함께 simpleimputer를 적용하면 어떻게 될까요? 이 방법은 작동하지 않습니다.
파이프라인에서 add_indicator와 함께 simpleimputer를 적용하면 어떻게 될까요? 이 방법은 작동하지 않습니다.
이 구성에 대해 get_feature_names 메서드가 있으면 좋을 것입니다.
파이프라인에서 add_indicator와 함께 simpleimputer를 적용하면 어떻게 될까요? 이 방법은 작동하지 않습니다.
다음은 단기 솔루션에 대한 저의 기여입니다. 다른 모든 배열 유형을 목록으로 강제 변환하고 SimpleImputer(add_indicate=True)의 경우를 처리합니다. 또한 조금 더 장황합니다.
def get_column_names_from_ColumnTransformer(column_transformer, verbose=True):
    """Return output column names, including imputer missing-flag columns.

    Walks ``column_transformer.transformers_``. For every entry the
    transformer (or the last step of a pipeline) is handled as follows:

    * if it exposes ``categories_`` (presumably a one-hot encoder), its
      ``get_feature_names`` is called with the raw column names;
    * if its ``add_indicator`` attribute is truthy (an imputer fitted with a
      missing-value indicator), the raw column names are followed by
      ``'<column>_missing_flag'`` for each index in
      ``indicator_.features_``;
    * otherwise ``get_feature_names()`` is called without arguments.

    Any ``AttributeError`` raised along the way falls back to the raw
    column names.

    Differences from the snippet as originally posted:

    * a single string column spec stays one name instead of being split
      into characters by ``list('name')``;
    * the ``'remainder'`` entry is skipped by name rather than by dropping
      the last entry, and ``'drop'`` entries contribute no names;
    * transformers are recognised by attribute, so ``Pipeline``,
      ``OneHotEncoder`` and ``SimpleImputer`` need not be imported;
    * the progress printing can be switched off.

    Parameters
    ----------
    column_transformer : object
        Fitted transformer exposing ``transformers_`` as a sequence of
        ``(name, transformer, columns)`` entries.
    verbose : bool, default True
        When true, print each entry's name and the names collected for it
        (the original, unconditional behaviour).

    Returns
    -------
    list
        Collected names, in the order of ``transformers_``.
    """
    col_name = []
    for entry in column_transformer.transformers_:
        name, transformer, columns = entry[0], entry[1], entry[2]

        # The remainder entry is only present when columns were left over,
        # so it is recognised by name; dropped columns produce no output.
        if name == 'remainder':
            continue
        if isinstance(transformer, str) and transformer == 'drop':
            continue

        if verbose:
            print('\n\ntransformer: ', name)

        # Normalise the column spec to a list without exploding a string.
        if isinstance(columns, str):
            raw_col_name = [columns]
        elif hasattr(columns, 'tolist'):  # e.g. numpy array or pandas Index
            raw_col_name = columns.tolist()
        elif isinstance(columns, (list, tuple)):
            raw_col_name = list(columns)
        else:
            raw_col_name = [columns]

        # if pipeline, get the last transformer
        steps = getattr(transformer, 'steps', None)
        while steps:
            transformer = steps[-1][1]
            steps = getattr(transformer, 'steps', None)

        try:
            if hasattr(transformer, 'categories_'):
                names = list(transformer.get_feature_names(raw_col_name))
            elif getattr(transformer, 'add_indicator', False):
                missing_indicator_indices = transformer.indicator_.features_
                missing_indicators = [
                    '{}_missing_flag'.format(raw_col_name[idx])
                    for idx in missing_indicator_indices
                ]
                names = raw_col_name + missing_indicators
            else:
                names = list(transformer.get_feature_names())
        except AttributeError:
            names = raw_col_name

        if verbose:
            print(names)
        col_name.extend(names)
    return col_name
참고로 저는 복잡한 Pipelines 및 ColumnTransformers에서 기능 이름을 추출하는 방법에 대한 몇 가지 코드와 블로그를 작성했습니다. 코드는 이전 게시물보다 개선된 것입니다. https://towardsdatascience.com/extracting-plotting-feature-names-importance-from-scikit-learn-pipelines-eb5bfa6a31f4
@kylegilde νλ₯ν κΈ°μ¬μ μ½λ κ°μ¬ν©λλ€. 맀λ ₯μ²λΌ μλν©λλ€. κΈλ‘λ² μ€λͺ
μ μν΄ λͺ μκ° λμ KernelSHAP λ° μ리λ°μ΄ μ μ¨λ¦νμ§λ§ handle_unkown='ignore'
μμ΄λ onehot λ³νκΈ°κ° μλνμ§ μμμ΅λλ€.
λ€μμ μλ¦Όμ μ΄μ ν¬ν¨νλ @pjgao μ μ€λν«μ λ€λ₯Έ λ²μ μ λλ€.
def get_columns_from_transformer(column_transformer, input_colums):
    """Return output column names, including passed-through remainder columns.

    Walks ``column_transformer.transformers_``; for every entry the
    transformer (or the last step of a pipeline) is asked for
    ``get_feature_names(raw_columns)``. Transformers whose
    ``get_feature_names`` takes no argument are called without one, and
    transformers without the method contribute their raw column names.
    Names for the ``'remainder'`` entry are appended last.

    Differences from the snippet as originally posted:

    * remainder column names are appended only when a ``'remainder'`` entry
      exists and its transformer is not ``'drop'``; previously the last
      entry was always treated as a kept remainder, which added names for
      columns that are not part of the output;
    * a ``TypeError`` from ``get_feature_names(raw_columns)`` falls back to
      the no-argument call instead of aborting;
    * ``'drop'`` entries contribute no names, and tuple / list-like column
      specs are accepted;
    * pipelines and arrays are recognised by attribute, so ``Pipeline`` and
      ``np`` need not be imported.

    NOTE(review): recent scikit-learn releases expose
    ``get_feature_names_out`` instead of ``get_feature_names`` -- confirm
    against the installed version.

    Parameters
    ----------
    column_transformer : object
        Fitted transformer exposing ``transformers_`` as a sequence of
        ``(name, transformer, columns)`` entries.
    input_colums : sequence
        Column names of the data the transformer was fitted on; used to
        translate positional remainder columns into names.

    Returns
    -------
    list
        Collected names: transformer outputs in the order of
        ``transformers_``, then the remainder columns.
    """
    col_name = []
    remainder_entry = None
    for entry in column_transformer.transformers_:
        name, transformer, raw_col_name = entry[0], entry[1], entry[2]

        # The remainder entry is only present when columns were left over,
        # so it is recognised by name and handled after the loop.
        if name == 'remainder':
            remainder_entry = entry
            continue
        # Dropped columns never reach the output.
        if isinstance(transformer, str) and transformer == 'drop':
            continue

        # For a pipeline (anything exposing ``steps``) use its last step.
        steps = getattr(transformer, 'steps', None)
        while steps:
            transformer = steps[-1][1]
            steps = getattr(transformer, 'steps', None)

        try:
            try:
                names = transformer.get_feature_names(raw_col_name)
            except TypeError:
                # This get_feature_names does not accept input names.
                names = transformer.get_feature_names()
        except AttributeError:  # if no 'get_feature_names' function, use raw column name
            names = raw_col_name

        if isinstance(names, str):
            col_name.append(names)
        elif hasattr(names, 'tolist'):  # e.g. numpy array or pandas Index
            col_name.extend(names.tolist())
        elif isinstance(names, (list, tuple)):
            col_name.extend(names)
        else:
            col_name.append(names)

    if remainder_entry is not None:
        remainder_transformer = remainder_entry[1]
        remainder_columns = remainder_entry[2]
        dropped = (
            isinstance(remainder_transformer, str)
            and remainder_transformer == 'drop'
        )
        if not dropped:
            for col in remainder_columns:
                # Positional entries are looked up in the input column
                # names; entries that are already names are used directly.
                col_name.append(col if isinstance(col, str) else input_colums[col])
    return col_name
핵심 코드베이스에 유사한 기능을 추가하는 것에 대해 어떻게 생각하십니까?
가장 유용한 댓글
친절한 답변 감사합니다!
내가 아는 바와 같이 OneHotEncoder, CountVectorizer 와 같이 하나의 열을 다중 열로 변경할 수 있는 메서드를 사용하여 열을 사전 처리할 때 파이프라인 마지막 단계의 변환기에서 새 데이터 열 이름을 가져올 수 있습니다. get_feature_names 함수는 새 열을 생성하지 않는 메서드를 사용할 때 원시 열 이름만 설정할 수 있습니다.
위의 코드를 사용하여 preprocesser 의 열 이름을 얻을 수 있습니다.
이 코드가 이 질문을 해결합니까?
eli5 현재 해당 기능을 찾을 수 없습니다. eli5의 명시적 예제 또는 API에 대한 링크를 제공할 수 있습니까?