Conjunto de datos: titanic_text.csv
Reproducción:
# Reproducer: run AutoMLSearch with ensembling enabled on the Titanic data,
# keeping every feature column (only the target is removed from X).
import pandas as pd
import woodwork as ww
# AutoMLSearch and raise_error_callback are used below but were not imported
# in the original snippet, which made it fail with NameError before reaching
# the reported AttributeError.
from evalml.automl import AutoMLSearch
from evalml.automl.callbacks import raise_error_callback

data_set = pd.read_csv("titanic_alan.csv")
y = data_set['Survived']
X = data_set.drop(['Survived',], axis=1)
# raise_error_callback re-raises the pipeline exception so the search stops
# at the first failure instead of continuing.
automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
automl.search()
Falla con el siguiente error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-9-7e028fe6ad03> in <module>
1 automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
----> 2 automl.search()
~/Desktop/evalml/evalml/automl/automl_search.py in search(self, show_iteration_plot)
598 computation = computations[current_computation_index]
599 if computation.done():
--> 600 evaluation = computation.get_result()
601 data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger")
602 pipeline_id = self._post_evaluation_callback(pipeline, data, job_log)
~/Desktop/evalml/evalml/automl/engine/sequential_engine.py in get_result(self)
33 Raises Exception: If computation fails. Returns traceback.
34 """
---> 35 return self.work(**self.kwargs)
36
37 def cancel(self):
~/Desktop/evalml/evalml/automl/engine/engine_base.py in evaluate_pipeline(pipeline, automl_config, X, y, logger)
216 return train_and_score_pipeline(pipeline, automl_config=automl_config,
217 full_X_train=X, full_y_train=y,
--> 218 logger=logger)
219
220
~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train, logger)
171 if automl_config.error_callback is not None:
172 automl_config.error_callback(exception=e, traceback=traceback.format_tb(sys.exc_info()[2]), automl=automl_config,
--> 173 fold_num=i, pipeline=pipeline)
174 if isinstance(e, PipelineScoreError):
175 nan_scores = {objective: np.nan for objective in e.exceptions}
~/Desktop/evalml/evalml/automl/callbacks.py in raise_error_callback(exception, traceback, automl, **kwargs)
13 logger.error(f'AutoML search raised a fatal exception: {str(exception)}')
14 logger.error("\n".join(traceback))
---> 15 raise exception
16
17
~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train, logger)
160 try:
161 logger.debug(f"\t\t\tFold {i}: starting training")
--> 162 cv_pipeline = train_pipeline(pipeline, X_train, y_train, automl_config.optimize_thresholds, automl_config.objective)
163 logger.debug(f"\t\t\tFold {i}: finished training")
164 if automl_config.optimize_thresholds and pipeline.can_tune_threshold_with_objective(automl_config.objective):
~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_pipeline(pipeline, X, y, optimize_thresholds, objective)
111 test_size=0.2, random_seed=pipeline.random_seed)
112 cv_pipeline = pipeline.clone()
--> 113 cv_pipeline.fit(X, y)
114 tune_binary_threshold(cv_pipeline, objective, cv_pipeline.problem_type,
115 X_threshold_tuning, y_threshold_tuning)
~/Desktop/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
16 @wraps(method)
17 def _set_fit(self, X, y=None):
---> 18 return_value = method(self, X, y)
19 self._is_fitted = True
20 return return_value
~/Desktop/evalml/evalml/pipelines/classification_pipeline.py in fit(self, X, y)
35 self._encoder.fit(y)
36 y = self._encode_targets(y)
---> 37 self._fit(X, y)
38 return self
39
~/Desktop/evalml/evalml/pipelines/pipeline_base.py in _fit(self, X, y)
217 def _fit(self, X, y):
218 self.input_target_name = y.name
--> 219 self._component_graph.fit(X, y)
220 self.input_feature_names = self._component_graph.input_feature_names
221
~/Desktop/evalml/evalml/pipelines/component_graph.py in fit(self, X, y)
122 X = infer_feature_types(X)
123 X = _convert_woodwork_types_wrapper(X.to_dataframe())
--> 124 self._compute_features(self.compute_order, X, y, fit=True)
125 self._feature_provenance = self._get_feature_provenance(X.columns)
126 return self
~/Desktop/evalml/evalml/pipelines/component_graph.py in _compute_features(self, component_list, X, y, fit)
249 else:
250 if fit:
--> 251 component_instance.fit(input_x, input_y)
252 if not (fit and component_name == self.compute_order[-1]): # Don't call predict on the final component during fit
253 output = component_instance.predict(input_x)
~/Desktop/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
16 @wraps(method)
17 def _set_fit(self, X, y=None):
---> 18 return_value = method(self, X, y)
19 self._is_fitted = True
20 return return_value
~/Desktop/evalml/evalml/pipelines/components/estimators/estimator.py in fit(self, X, y)
45 X, y = self._manage_woodwork(X, y)
46 self.input_feature_names = list(X.columns)
---> 47 self._component_obj.fit(X, y)
48 return self
49
~/Desktop/evalml_venv/lib/python3.7/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
439 self._le = LabelEncoder().fit(y)
440 self.classes_ = self._le.classes_
--> 441 return super().fit(X, self._le.transform(y), sample_weight)
442
443 @if_delegate_has_method(delegate='final_estimator_')
~/Desktop/evalml_venv/lib/python3.7/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
182 fit_params=fit_params,
183 verbose=self.verbose)
--> 184 for est, meth in zip(all_estimators, self.stack_method_)
185 if est != 'drop'
186 )
~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~/.pyenv/versions/3.7.9/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~/.pyenv/versions/3.7.9/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
AttributeError: 'WordListCorpusReader' object has no attribute '_LazyCorpusLoader__args'
Eliminar todas las columnas de texto permite que el ensamblado (ensembling) se ejecute hasta completarse:
# Same search as the failing reproducer, but with the 'Name' and 'Embarked'
# columns dropped alongside the target; this variant runs to completion.
import pandas as pd
import woodwork as ww
# AutoMLSearch and raise_error_callback are used below but were not imported
# in the original snippet, which made it fail with NameError.
from evalml.automl import AutoMLSearch
from evalml.automl.callbacks import raise_error_callback

data_set = pd.read_csv("titanic_alan.csv")
y = data_set['Survived']
X = data_set.drop(['Survived', 'Name', 'Embarked'], axis=1)
# raise_error_callback re-raises any pipeline exception instead of continuing.
automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
automl.search()
Esto es potencialmente relevante: https://github.com/nltk/nltk/issues/1576
Creo que podemos reproducirlo sin ejecutar la búsqueda; basta con llamar a fit en StackedEnsembleClassifier
para que se produzca el error. No estoy seguro de que se deba por completo al multiprocesamiento: usar Parallel(n_jobs=-1)
con pipelines normales funciona. Creo que está sucediendo algo dentro de StackedEnsembleClassifier:
# Reproducer that isolates the failure to StackedEnsembleClassifier.fit:
# the same pipelines fit fine individually, wrapped for sklearn, and in
# parallel through joblib; only the stacked ensemble raises.
from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
from joblib import Parallel, delayed
import pytest

X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')

components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]
# One pipeline per estimator, all sharing the same preprocessing components.
pipelines = [BinaryClassificationPipeline(component_graph=components + [est]) for est in estimators]

# Fitting each pipeline individually works
for pl in pipelines:
    pl.fit(X, y)

# Wrapping each pipeline in sklearn works
for pl in pipelines:
    WrappedSKClassifier(pl).fit(X, y)


def fit_pipeline(pipeline, X, y):
    """Fit ``pipeline`` on ``X`` and ``y`` and return whatever ``fit`` returns.

    Defined as a module-level function so it can be passed to
    ``joblib.delayed`` below.
    """
    return pipeline.fit(X, y)


# Fitting pipelines in parallel works too
fit_pipelines = Parallel(n_jobs=-1)(delayed(fit_pipeline)(pl, X, y) for pl in pipelines)
fit_sklearn_wrapped = Parallel(n_jobs=-1)(delayed(fit_pipeline)(WrappedSKClassifier(pl), X, y) for pl in pipelines)

# Using ensemble does not work
ensemble = StackedEnsembleClassifier(pipelines)
with pytest.raises(AttributeError,
                   match="'WordListCorpusReader' object has no attribute '_LazyCorpusLoader__args'"):
    ensemble.fit(X, y)
@freddyaboulton ¡increíble! Sí, yo también veo eso.
Sé que el conjunto de datos que nos proporcionaron se había barajado con respecto al conjunto de datos estándar de Titanic. Me pregunto si eso tiene algo que ver con la causa aquí.
Para cualquiera que retome esto: configurar n_jobs=1
solo para el ensamblador en automl hace que la traza de pila original desaparezca. Puede servir como parche rápido, pero todavía queda por encontrar la causa raíz e identificar una solución mejor.
# Workaround script: build the same three pipelines and fit the stacked
# ensemble with n_jobs=1, after dropping the 'Embarked' column.
from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
import pytest

X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')
X = X.drop(["Embarked"], axis=1)

components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]

# Each pipeline is the shared preprocessing components followed by one estimator.
pipelines = []
for est in estimators:
    graph = components + [est]
    pipelines.append(BinaryClassificationPipeline(component_graph=graph))

# n_jobs=1 is the setting under test for the ensemble.
ensemble = StackedEnsembleClassifier(pipelines, n_jobs=1)
ensemble.fit(X, y)
Además, que alguien lo verifique, pero puedo reproducir el problema original con evalml 0.23.0, así que no estoy seguro de que valga la pena investigar la diferencia entre 0.24.1 y 0.23.0.
@freddyaboulton Creo que esto tiene sentido dado https://github.com/nltk/nltk/issues/1576; no estoy seguro de que lo hayan resuelto, pero yo también puedo reproducirlo con 0.23.0 :')
Esto ciertamente parece estar cerca del punto
Comentario más útil
Para cualquiera que tome esto, configurar
n_jobs=1
solo para el ensamblador en automl hace que la traza de pila original desaparezca. Puede servir como parche rápido, pero todavía queda por encontrar la causa raíz e identificar una solución mejor. Además, que alguien lo verifique, pero puedo reproducir el problema original con evalml 0.23.0, así que no estoy seguro de que valga la pena investigar la diferencia entre 0.24.1 y 0.23.0.