Evalml: ๋ฒ”์ฃผํ˜• ๋˜๋Š” ๋ถ€์šธ ์—ด์— ์—†์Œ์ด ์žˆ๋Š” ๊ฒฝ์šฐ ์ž…๋ ฅ๊ธฐ๋ฅผ ๋งž์ถœ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.

2020년 08월 19일에 작성  ·  3코멘트  ·  출처: alteryx/evalml

재현 방법

from evalml.pipelines.components import Imputer
df = pd.DataFrame({"a": [1, 2, 3], "b": ["1", "2", None]})
imputer = Imputer()
imputer.fit(df)
from evalml.pipelines.components import Imputer
df_with_bool = pd.DataFrame({"a": [1, 2, 3], "b": [True, False, None]})
imputer = Imputer()
imputer.fit(df_with_bool)

둘 다 동일한 스택 추적을 갖습니다.

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-69-9af4cfc17aec> in <module>
      1 df_with_bool = pd.DataFrame({"a": [1, 2, 3], "b": [True, False, None]})
      2 imputer = Imputer()
----> 3 imputer.fit(df_with_bool)

~/sources/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
     12         @wraps(method)
     13         def _set_fit(self, X, y=None):
---> 14             return_value = method(self, X, y)
     15             self._is_fitted = True
     16             return return_value

~/sources/evalml/evalml/pipelines/components/transformers/imputers/imputer.py in fit(self, X, y)
     76         X_categorical = X_null_dropped.select_dtypes(include=categorical_dtypes + boolean)
     77         if len(X_categorical.columns) > 0:
---> 78             self._categorical_imputer.fit(X_categorical, y)
     79             self._categorical_cols = X_categorical.columns
     80         return self

~/sources/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
     12         @wraps(method)
     13         def _set_fit(self, X, y=None):
---> 14             return_value = method(self, X, y)
     15             self._is_fitted = True
     16             return return_value

~/sources/evalml/evalml/pipelines/components/transformers/imputers/simple_imputer.py in fit(self, X, y)
     42         if not isinstance(X, pd.DataFrame):
     43             X = pd.DataFrame(X)
---> 44         self._component_obj.fit(X, y)
     45         self._all_null_cols = set(X.columns) - set(X.dropna(axis=1, how='all').columns)
     46         return self

~/miniconda3/envs/evalml/lib/python3.8/site-packages/sklearn/impute/_base.py in fit(self, X, y)
    300                                                     fill_value)
    301         else:
--> 302             self.statistics_ = self._dense_fit(X,
    303                                                self.strategy,
    304                                                self.missing_values,

~/miniconda3/envs/evalml/lib/python3.8/site-packages/sklearn/impute/_base.py in _dense_fit(self, X, strategy, missing_values, fill_value)
    384                 row_mask = np.logical_not(row_mask).astype(np.bool)
    385                 row = row[row_mask]
--> 386                 most_frequent[i] = _most_frequent(row, np.nan, 0)
    387 
    388             return most_frequent

~/miniconda3/envs/evalml/lib/python3.8/site-packages/sklearn/impute/_base.py in _most_frequent(array, extra_value, n_repeat)
     40             # has already been NaN-masked.
     41             warnings.simplefilter("ignore", RuntimeWarning)
---> 42             mode = stats.mode(array)
     43 
     44         most_frequent_value = mode[0][0]

~/miniconda3/envs/evalml/lib/python3.8/site-packages/scipy/stats/stats.py in mode(a, axis, nan_policy)
    498     counts = np.zeros(a_view.shape[:-1], dtype=np.int)
    499     for ind in inds:
--> 500         modes[ind], counts[ind] = _mode1D(a_view[ind])
    501     newshape = list(a.shape)
    502     newshape[axis] = 1

~/miniconda3/envs/evalml/lib/python3.8/site-packages/scipy/stats/stats.py in _mode1D(a)
    485 
    486     def _mode1D(a):
--> 487         vals, cnts = np.unique(a, return_counts=True)
    488         return vals[cnts.argmax()], cnts.max()
    489 

<__array_function__ internals> in unique(*args, **kwargs)

~/miniconda3/envs/evalml/lib/python3.8/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
    259     ar = np.asanyarray(ar)
    260     if axis is None:
--> 261         ret = _unique1d(ar, return_index, return_inverse, return_counts)
    262         return _unpack_tuple(ret)
    263 

~/miniconda3/envs/evalml/lib/python3.8/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
    320         aux = ar[perm]
    321     else:
--> 322         ar.sort()
    323         aux = ar
    324     mask = np.empty(aux.shape, dtype=np.bool_)

TypeError: '<' not supported between instances of 'NoneType' and 'bool'

๊ทธ๊ฒƒ์ด์ด ์ž‘ํ’ˆ np.nan ๋Œ€์‹  None

전체 댓글 3개

@freddyaboulton 명확한 재현 예제에 감사드립니다! 이것은 다른 버그 #1092도 설명하는 것으로 보입니다.

๋ฌธ์ œ
pandas ๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„์˜ ๊ธฐ๋Šฅ์— object ์œ ํ˜•์ด ์žˆ๊ณ  None ๊ฐ’์ด ํฌํ•จ๋˜์–ด ์žˆ์œผ๋ฉด Imputer ๊ฐ€ ์‹คํŒจํ•ฉ๋‹ˆ๋‹ค.

  1. X = pd.DataFrame({'feature1': [False, True, None, np.nan]}) ๋Š” object ์œ ํ˜•์˜ ํ”ผ์ณ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค. Imputer.fit ์‹คํŒจ.
  2. X = pd.DataFrame({'feature1': [False, True, np.nan]}) ๋Š” object ์œ ํ˜•์˜ ํ”ผ์ณ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค. Imputer.fit ์ž‘๋™ํ•ฉ๋‹ˆ๋‹ค.
  3. X = pd.DataFrame({'feature1': [False, True]}) ๋Š” bool ์œ ํ˜•์˜ ํ”ผ์ณ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค. Imputer.fit ์ž‘๋™ํ•ฉ๋‹ˆ๋‹ค.

category 유형도 마찬가지입니다. 마지막 경우는 적용되지 않지만 유사한 상황이 문자열 유형에 대해 발생합니다.

메모
์—ฌ๊ธฐ์„œ ํ˜ผ๋ž€์Šค๋Ÿฌ์šด ์ ์€ None ๊ฐ€ ๋‹ค๋ฅธ ๊ฒƒ์„ ์˜๋ฏธํ•  ์ˆ˜ ์žˆ๋‹ค๋Š” ๊ฒƒ์ž…๋‹ˆ๋‹ค. nan ์™€ ๊ฐ™์„ ์ˆ˜๋„ ์žˆ๊ณ  ์ž์ฒด ๋ฒ”์ฃผ๋กœ ์‚ฌ์šฉํ•  ์ˆ˜๋„ ์žˆ์Šต๋‹ˆ๋‹ค.

์šฐ๋ฆฌ๊ฐ€ ๊ทธ ๊ทœ์น™์„ ๋ฌธ์„œํ™”ํ•˜๊ณ  ์„ค๋ช…ํ•˜๋Š” ํ•œ nan ๋กœ ์ทจ๊ธ‰ํ•˜๋Š” ๊ฒƒ์ด ์ข‹๋‹ค๊ณ  ์ƒ๊ฐํ•ฉ๋‹ˆ๋‹ค.

해결 방법
부울 / 카테고리 / 문자열 피쳐에서 None 정리: df = df.fillna(value=np.nan)

수정 방안
단기:

  • Imputer ์„ ์—…๋ฐ์ดํŠธํ•˜์—ฌ None ์„ np.nan ๋กœ ๋Œ€์ฒด
  • Imputer API ๋ฌธ์„œ ๋ฐ automl ์‚ฌ์šฉ์ž ๊ฐ€์ด๋“œ๋ฅผ ์—…๋ฐ์ดํŠธํ•˜์—ฌ ์ด์— ๋Œ€ํ•ด ์–ธ๊ธ‰ํ•˜์„ธ์š”.
  • ๋ชจ๋“  ์˜๋„๋œ ๋ฐ์ดํ„ฐ ์œ ํ˜•์— ๋Œ€ํ•ด ๋ฐ์ดํ„ฐ์— None ๋ฅผ ํฌํ•จํ•˜์—ฌ Imputer ์˜ ํ…Œ์ŠคํŠธ ๋ฒ”์œ„๋ฅผ ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค.

๋ฐ์ดํ„ฐ์— None ๊ฐ€ ์žˆ์œผ๋ฉด ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ•˜๋Š” DataCheck ๋ฅผ ๋Œ€์‹  ์ถ”๊ฐ€ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ๊ทธ๋Ÿฌ๋‚˜ ์ด๊ฒƒ์€ None ๊ฐ€ ์‰ฝ๊ฒŒ ๋ณ€ํ™˜๋  ์ˆ˜ ์žˆ๊ธฐ ๋•Œ๋ฌธ์— ๋ถˆํ•„์š”ํ•˜๊ฒŒ ๋Š๊ปด์ง‘๋‹ˆ๋‹ค.

์žฅ๊ธฐ๊ฐ„:
์ƒˆ๋กœ์šด DataTable ๋ฐ์ดํ„ฐ ๊ตฌ์กฐ๋ฅผ ์‚ฌ์šฉํ•˜๋„๋ก evalml์„ ์—…๋ฐ์ดํŠธํ•˜๋ฉด ์‚ฌ์šฉ์ž๋Š” ๋ฏธ๋ฆฌ ๊ฐ ๊ธฐ๋Šฅ์˜ ์œ ํ˜•์„ ๊ตฌ์„ฑํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ์ด๊ฒƒ์ด ํ‘œ์ค€ํ™”๊ฐ€ ์ด๋Ÿฌํ•œ ์ข…๋ฅ˜์˜ ์˜ค๋ฅ˜๋ฅผ ๋ฌด์˜๋ฏธํ•˜๊ฒŒ ๋งŒ๋“ค ๊ฒƒ์ด๋ผ๋Š” ๊ฒƒ์„ ์˜๋ฏธํ•˜๊ธฐ๋ฅผ ๋ฐ”๋ž๋‹ˆ๋‹ค.

#540과 관련이 있습니까?

@angela97lin ๐Ÿคฆ 100% ๊ด€๋ จ... ใ…‹. ์šฐ๋ฆฌ๋Š” imputer๊ฐ€ None s๋ฅผ np.nan s๋กœ ๋ณ€ํ™˜ํ•˜๊ธฐ๋กœ ๊ฒฐ์ •ํ–ˆ์Šต๋‹ˆ๋‹ค.

여기의 기록이 더 최신이기 때문에 이 이슈를 남기고 #540을 닫습니다.

๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค!

이 페이지가 도움이 되었나요?
0 / 5 - 평가 0건