Kumpulan data: titanic_text.csv
Reproduksi:
import pandas as pd
import woodwork as ww
data_set = pd.read_csv("titanic_alan.csv")
y = data_set['Survived']
X = data_set.drop(['Survived',], axis=1)
automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
automl.search()
Gagal dengan kesalahan berikut:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-9-7e028fe6ad03> in <module>
1 automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
----> 2 automl.search()
~/Desktop/evalml/evalml/automl/automl_search.py in search(self, show_iteration_plot)
598 computation = computations[current_computation_index]
599 if computation.done():
--> 600 evaluation = computation.get_result()
601 data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger")
602 pipeline_id = self._post_evaluation_callback(pipeline, data, job_log)
~/Desktop/evalml/evalml/automl/engine/sequential_engine.py in get_result(self)
33 Raises Exception: If computation fails. Returns traceback.
34 """
---> 35 return self.work(**self.kwargs)
36
37 def cancel(self):
~/Desktop/evalml/evalml/automl/engine/engine_base.py in evaluate_pipeline(pipeline, automl_config, X, y, logger)
216 return train_and_score_pipeline(pipeline, automl_config=automl_config,
217 full_X_train=X, full_y_train=y,
--> 218 logger=logger)
219
220
~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train, logger)
171 if automl_config.error_callback is not None:
172 automl_config.error_callback(exception=e, traceback=traceback.format_tb(sys.exc_info()[2]), automl=automl_config,
--> 173 fold_num=i, pipeline=pipeline)
174 if isinstance(e, PipelineScoreError):
175 nan_scores = {objective: np.nan for objective in e.exceptions}
~/Desktop/evalml/evalml/automl/callbacks.py in raise_error_callback(exception, traceback, automl, **kwargs)
13 logger.error(f'AutoML search raised a fatal exception: {str(exception)}')
14 logger.error("\n".join(traceback))
---> 15 raise exception
16
17
~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train, logger)
160 try:
161 logger.debug(f"\t\t\tFold {i}: starting training")
--> 162 cv_pipeline = train_pipeline(pipeline, X_train, y_train, automl_config.optimize_thresholds, automl_config.objective)
163 logger.debug(f"\t\t\tFold {i}: finished training")
164 if automl_config.optimize_thresholds and pipeline.can_tune_threshold_with_objective(automl_config.objective):
~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_pipeline(pipeline, X, y, optimize_thresholds, objective)
111 test_size=0.2, random_seed=pipeline.random_seed)
112 cv_pipeline = pipeline.clone()
--> 113 cv_pipeline.fit(X, y)
114 tune_binary_threshold(cv_pipeline, objective, cv_pipeline.problem_type,
115 X_threshold_tuning, y_threshold_tuning)
~/Desktop/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
16 @wraps(method)
17 def _set_fit(self, X, y=None):
---> 18 return_value = method(self, X, y)
19 self._is_fitted = True
20 return return_value
~/Desktop/evalml/evalml/pipelines/classification_pipeline.py in fit(self, X, y)
35 self._encoder.fit(y)
36 y = self._encode_targets(y)
---> 37 self._fit(X, y)
38 return self
39
~/Desktop/evalml/evalml/pipelines/pipeline_base.py in _fit(self, X, y)
217 def _fit(self, X, y):
218 self.input_target_name = y.name
--> 219 self._component_graph.fit(X, y)
220 self.input_feature_names = self._component_graph.input_feature_names
221
~/Desktop/evalml/evalml/pipelines/component_graph.py in fit(self, X, y)
122 X = infer_feature_types(X)
123 X = _convert_woodwork_types_wrapper(X.to_dataframe())
--> 124 self._compute_features(self.compute_order, X, y, fit=True)
125 self._feature_provenance = self._get_feature_provenance(X.columns)
126 return self
~/Desktop/evalml/evalml/pipelines/component_graph.py in _compute_features(self, component_list, X, y, fit)
249 else:
250 if fit:
--> 251 component_instance.fit(input_x, input_y)
252 if not (fit and component_name == self.compute_order[-1]): # Don't call predict on the final component during fit
253 output = component_instance.predict(input_x)
~/Desktop/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
16 @wraps(method)
17 def _set_fit(self, X, y=None):
---> 18 return_value = method(self, X, y)
19 self._is_fitted = True
20 return return_value
~/Desktop/evalml/evalml/pipelines/components/estimators/estimator.py in fit(self, X, y)
45 X, y = self._manage_woodwork(X, y)
46 self.input_feature_names = list(X.columns)
---> 47 self._component_obj.fit(X, y)
48 return self
49
~/Desktop/evalml_venv/lib/python3.7/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
439 self._le = LabelEncoder().fit(y)
440 self.classes_ = self._le.classes_
--> 441 return super().fit(X, self._le.transform(y), sample_weight)
442
443 @if_delegate_has_method(delegate='final_estimator_')
~/Desktop/evalml_venv/lib/python3.7/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
182 fit_params=fit_params,
183 verbose=self.verbose)
--> 184 for est, meth in zip(all_estimators, self.stack_method_)
185 if est != 'drop'
186 )
~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~/.pyenv/versions/3.7.9/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~/.pyenv/versions/3.7.9/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
AttributeError: 'WordListCorpusReader' object has no attribute '_LazyCorpusLoader__args'
Menghapus semua kolom teks memungkinkan ensembler berjalan hingga selesai:
import pandas as pd
import woodwork as ww
data_set = pd.read_csv("titanic_alan.csv")
y = data_set['Survived']
X = data_set.drop(['Survived', 'Name', 'Embarked'], axis=1)
automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
automl.search()
Ini berpotensi relevan: https://github.com/nltk/nltk/issues/1576
Saya pikir kita dapat mereproduksi masalah ini tanpa menjalankan pencarian - cukup memanggil fit pada StackedEnsembleClassifier sudah menimbulkan kesalahan. Saya tidak yakin apakah ini sepenuhnya disebabkan oleh multiprocessing, karena menggunakan Parallel(n_jobs=-1) dengan pipeline biasa tetap berfungsi. Saya pikir ada sesuatu yang terjadi di dalam StackedEnsembleClassifier:
from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
from joblib import Parallel, delayed
import pytest
X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')
components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]
pipelines = [BinaryClassificationPipeline(component_graph=components + [est]) for est in estimators]
# Fitting each pipeline individually works
for pl in pipelines:
pl.fit(X, y)
# Wrapping each pipeline in sklearn works
for pl in pipelines:
WrappedSKClassifier(pl).fit(X, y)
def fit_pipeline(pipeline, X, y):
return pipeline.fit(X, y)
# Fitting pipelines in parallel works too
fit_pipelines = Parallel(n_jobs=-1)(delayed(fit_pipeline)(pl, X, y) for pl in pipelines)
fit_sklearn_wrapped = Parallel(n_jobs=-1)(delayed(fit_pipeline)(WrappedSKClassifier(pl), X, y) for pl in pipelines)
# Using ensemble does not work
ensemble = StackedEnsembleClassifier(pipelines)
with pytest.raises(AttributeError,
match="'WordListCorpusReader' object has no attribute '_LazyCorpusLoader__args'"):
ensemble.fit(X, y)
@freddyaboulton luar biasa! Ya saya melihat itu juga.
Saya tahu kumpulan data yang kami terima telah diacak dibandingkan dengan data titanic standar. Saya ingin tahu apakah hal itu ada hubungannya dengan penyebab masalah di sini.
Untuk siapa pun yang mengambil ini, menyetel n_jobs=1 hanya untuk ensembler di automl membuat jejak tumpukan asli hilang. Itu bisa menjadi perbaikan cepat, tetapi masih ada ruang untuk menemukan akar masalahnya dan mengidentifikasi perbaikan yang lebih baik.
from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
import pytest
X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')
X = X.drop(["Embarked"], axis=1)
components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]
pipelines = [BinaryClassificationPipeline(component_graph=components + [est]) for est in estimators]
ensemble = StackedEnsembleClassifier(pipelines, n_jobs=1)
ensemble.fit(X, y)
Selain itu, mohon ada yang memverifikasi, tetapi saya dapat mereproduksi masalah asli dengan evalml 0.23.0, jadi saya tidak yakin apakah perlu menyelidiki perbedaan antara 0.24.1 dan 0.23.0.
@freddyaboulton Saya pikir ini masuk akal mengingat https://github.com/nltk/nltk/issues/1576 , tidak yakin mereka pernah menyelesaikan ini tetapi saya juga dapat melakukan repro dengan 0.23.0 :')
Ini sepertinya sudah mendekati titik
Komentar yang paling membantu
Untuk siapa pun yang mengambil ini, menyetel n_jobs=1 hanya untuk ensembler di automl membuat jejak tumpukan asli hilang. Itu bisa menjadi perbaikan cepat, tetapi masih ada ruang untuk menemukan akar masalahnya dan mengidentifikasi perbaikan yang lebih baik. Selain itu, mohon ada yang memverifikasi, tetapi saya dapat mereproduksi masalah asli dengan evalml 0.23.0, jadi saya tidak yakin apakah perlu menyelidiki perbedaan antara 0.24.1 dan 0.23.0.