from evalml.pipelines.components import Imputer
df = pd.DataFrame({"a": [1, 2, 3], "b": ["1", "2", None]})
imputer = Imputer()
imputer.fit(df)
from evalml.pipelines.components import Imputer
df_with_bool = pd.DataFrame({"a": [1, 2, 3], "b": [True, False, None]})
imputer = Imputer()
imputer.fit(df_with_bool)
두 경우 모두 동일한 스택 트레이스가 발생합니다:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-69-9af4cfc17aec> in <module>
1 df_with_bool = pd.DataFrame({"a": [1, 2, 3], "b": [True, False, None]})
2 imputer = Imputer()
----> 3 imputer.fit(df_with_bool)
~/sources/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
12 @wraps(method)
13 def _set_fit(self, X, y=None):
---> 14 return_value = method(self, X, y)
15 self._is_fitted = True
16 return return_value
~/sources/evalml/evalml/pipelines/components/transformers/imputers/imputer.py in fit(self, X, y)
76 X_categorical = X_null_dropped.select_dtypes(include=categorical_dtypes + boolean)
77 if len(X_categorical.columns) > 0:
---> 78 self._categorical_imputer.fit(X_categorical, y)
79 self._categorical_cols = X_categorical.columns
80 return self
~/sources/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
12 @wraps(method)
13 def _set_fit(self, X, y=None):
---> 14 return_value = method(self, X, y)
15 self._is_fitted = True
16 return return_value
~/sources/evalml/evalml/pipelines/components/transformers/imputers/simple_imputer.py in fit(self, X, y)
42 if not isinstance(X, pd.DataFrame):
43 X = pd.DataFrame(X)
---> 44 self._component_obj.fit(X, y)
45 self._all_null_cols = set(X.columns) - set(X.dropna(axis=1, how='all').columns)
46 return self
~/miniconda3/envs/evalml/lib/python3.8/site-packages/sklearn/impute/_base.py in fit(self, X, y)
300 fill_value)
301 else:
--> 302 self.statistics_ = self._dense_fit(X,
303 self.strategy,
304 self.missing_values,
~/miniconda3/envs/evalml/lib/python3.8/site-packages/sklearn/impute/_base.py in _dense_fit(self, X, strategy, missing_values, fill_value)
384 row_mask = np.logical_not(row_mask).astype(np.bool)
385 row = row[row_mask]
--> 386 most_frequent[i] = _most_frequent(row, np.nan, 0)
387
388 return most_frequent
~/miniconda3/envs/evalml/lib/python3.8/site-packages/sklearn/impute/_base.py in _most_frequent(array, extra_value, n_repeat)
40 # has already been NaN-masked.
41 warnings.simplefilter("ignore", RuntimeWarning)
---> 42 mode = stats.mode(array)
43
44 most_frequent_value = mode[0][0]
~/miniconda3/envs/evalml/lib/python3.8/site-packages/scipy/stats/stats.py in mode(a, axis, nan_policy)
498 counts = np.zeros(a_view.shape[:-1], dtype=np.int)
499 for ind in inds:
--> 500 modes[ind], counts[ind] = _mode1D(a_view[ind])
501 newshape = list(a.shape)
502 newshape[axis] = 1
~/miniconda3/envs/evalml/lib/python3.8/site-packages/scipy/stats/stats.py in _mode1D(a)
485
486 def _mode1D(a):
--> 487 vals, cnts = np.unique(a, return_counts=True)
488 return vals[cnts.argmax()], cnts.max()
489
<__array_function__ internals> in unique(*args, **kwargs)
~/miniconda3/envs/evalml/lib/python3.8/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
259 ar = np.asanyarray(ar)
260 if axis is None:
--> 261 ret = _unique1d(ar, return_index, return_inverse, return_counts)
262 return _unpack_tuple(ret)
263
~/miniconda3/envs/evalml/lib/python3.8/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
320 aux = ar[perm]
321 else:
--> 322 ar.sort()
323 aux = ar
324 mask = np.empty(aux.shape, dtype=np.bool_)
TypeError: '<' not supported between instances of 'NoneType' and 'bool'
`None` 대신 `np.nan`을 사용하면 정상적으로 동작합니다.
@freddyaboulton 명확한 재현 예제 감사합니다! 이 문제는 다른 버그인 #1092도 설명해 주는 것으로 보입니다.
문제
pandas 데이터프레임의 피처가 `object` 유형이고 `None` 값을 포함하고 있으면 `Imputer`가 실패합니다.
- `X = pd.DataFrame({'feature1': [False, True, None, np.nan]})`는 `object` 유형의 피처를 생성합니다. `Imputer.fit`이 실패합니다.
- `X = pd.DataFrame({'feature1': [False, True, np.nan]})`는 `object` 유형의 피처를 생성합니다. `Imputer.fit`이 동작합니다.
- `X = pd.DataFrame({'feature1': [False, True]})`는 `bool` 유형의 피처를 생성합니다. `Imputer.fit`이 동작합니다.
`category` 유형도 마찬가지입니다. 마지막 경우는 적용되지 않지만, 문자열 유형에 대해서도 유사한 상황이 발생합니다.
메모
여기서 혼란스러운 점은 `None`이 서로 다른 것을 의미할 수 있다는 것입니다. `nan`과 같은 의미일 수도 있고, 그 자체로 하나의 범주로 사용될 수도 있습니다.
우리가 그 규칙을 문서화하고 설명하는 한, `None`을 `nan`으로 취급하는 것이 좋다고 생각합니다.
해결 방법
부울 / 카테고리 / 문자열 피처에서 `None` 정리: `df = df.fillna(value=np.nan)`
수정 방안
단기:
- `Imputer`를 업데이트하여 `None`을 `np.nan`으로 대체합니다.
- `Imputer` API 문서 및 automl 사용자 가이드를 업데이트하여 이에 대해 언급합니다.
- `None`을 포함하는 `Imputer` 테스트 커버리지를 추가합니다.
데이터에 `None` 값이 있으면 오류를 발생시키는 `DataCheck`를 대신 추가할 수도 있습니다. 그러나 `None` 값은 쉽게 변환할 수 있으므로 불필요하게 느껴집니다.
์ฅ๊ธฐ๊ฐ:
์๋ก์ด DataTable
๋ฐ์ดํฐ ๊ตฌ์กฐ๋ฅผ ์ฌ์ฉํ๋๋ก evalml์ ์
๋ฐ์ดํธํ๋ฉด ์ฌ์ฉ์๋ ๋ฏธ๋ฆฌ ๊ฐ ๊ธฐ๋ฅ์ ์ ํ์ ๊ตฌ์ฑํ ์ ์์ต๋๋ค. ์ด๊ฒ์ด ํ์คํ๊ฐ ์ด๋ฌํ ์ข
๋ฅ์ ์ค๋ฅ๋ฅผ ๋ฌด์๋ฏธํ๊ฒ ๋ง๋ค ๊ฒ์ด๋ผ๋ ๊ฒ์ ์๋ฏธํ๊ธฐ๋ฅผ ๋ฐ๋๋๋ค.
#540과 관련이 있습니까?
@angela97lin ๐คฆ 100% ๊ด๋ จ... ใ
. ์ฐ๋ฆฌ๋ imputer๊ฐ None
s๋ฅผ np.nan
s๋ก ๋ณํํ๊ธฐ๋ก ๊ฒฐ์ ํ์ต๋๋ค.
이쪽의 기록이 더 최신이므로, 이 이슈로 통합하고 #540을 닫습니다.
감사합니다!