Evalml: El conjunto de datos Titanic con columnas de texto falla en el conjunto

Creado en 20 may. 2021 · 5Comentarios · Fuente: alteryx/evalml

Reproductor:

import pandas as pd
import woodwork as ww
data_set = pd.read_csv("titanic_alan.csv")
y = data_set['Survived']
X = data_set.drop(['Survived',], axis=1)
automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
automl.search()

Errores con:


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-9-7e028fe6ad03> in <module>
      1 automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
----> 2 automl.search()

~/Desktop/evalml/evalml/automl/automl_search.py in search(self, show_iteration_plot)
    598                     computation = computations[current_computation_index]
    599                     if computation.done():
--> 600                         evaluation = computation.get_result()
    601                         data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger")
    602                         pipeline_id = self._post_evaluation_callback(pipeline, data, job_log)

~/Desktop/evalml/evalml/automl/engine/sequential_engine.py in get_result(self)
     33         Raises Exception: If computation fails. Returns traceback.
     34         """
---> 35         return self.work(**self.kwargs)
     36 
     37     def cancel(self):

~/Desktop/evalml/evalml/automl/engine/engine_base.py in evaluate_pipeline(pipeline, automl_config, X, y, logger)
    216     return train_and_score_pipeline(pipeline, automl_config=automl_config,
    217                                     full_X_train=X, full_y_train=y,
--> 218                                     logger=logger)
    219 
    220 

~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train, logger)
    171             if automl_config.error_callback is not None:
    172                 automl_config.error_callback(exception=e, traceback=traceback.format_tb(sys.exc_info()[2]), automl=automl_config,
--> 173                                              fold_num=i, pipeline=pipeline)
    174             if isinstance(e, PipelineScoreError):
    175                 nan_scores = {objective: np.nan for objective in e.exceptions}

~/Desktop/evalml/evalml/automl/callbacks.py in raise_error_callback(exception, traceback, automl, **kwargs)
     13     logger.error(f'AutoML search raised a fatal exception: {str(exception)}')
     14     logger.error("\n".join(traceback))
---> 15     raise exception
     16 
     17 

~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train, logger)
    160         try:
    161             logger.debug(f"\t\t\tFold {i}: starting training")
--> 162             cv_pipeline = train_pipeline(pipeline, X_train, y_train, automl_config.optimize_thresholds, automl_config.objective)
    163             logger.debug(f"\t\t\tFold {i}: finished training")
    164             if automl_config.optimize_thresholds and pipeline.can_tune_threshold_with_objective(automl_config.objective):

~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_pipeline(pipeline, X, y, optimize_thresholds, objective)
    111                                                                   test_size=0.2, random_seed=pipeline.random_seed)
    112     cv_pipeline = pipeline.clone()
--> 113     cv_pipeline.fit(X, y)
    114     tune_binary_threshold(cv_pipeline, objective, cv_pipeline.problem_type,
    115                           X_threshold_tuning, y_threshold_tuning)

~/Desktop/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
     16         @wraps(method)
     17         def _set_fit(self, X, y=None):
---> 18             return_value = method(self, X, y)
     19             self._is_fitted = True
     20             return return_value

~/Desktop/evalml/evalml/pipelines/classification_pipeline.py in fit(self, X, y)
     35         self._encoder.fit(y)
     36         y = self._encode_targets(y)
---> 37         self._fit(X, y)
     38         return self
     39 

~/Desktop/evalml/evalml/pipelines/pipeline_base.py in _fit(self, X, y)
    217     def _fit(self, X, y):
    218         self.input_target_name = y.name
--> 219         self._component_graph.fit(X, y)
    220         self.input_feature_names = self._component_graph.input_feature_names
    221 

~/Desktop/evalml/evalml/pipelines/component_graph.py in fit(self, X, y)
    122         X = infer_feature_types(X)
    123         X = _convert_woodwork_types_wrapper(X.to_dataframe())
--> 124         self._compute_features(self.compute_order, X, y, fit=True)
    125         self._feature_provenance = self._get_feature_provenance(X.columns)
    126         return self

~/Desktop/evalml/evalml/pipelines/component_graph.py in _compute_features(self, component_list, X, y, fit)
    249             else:
    250                 if fit:
--> 251                     component_instance.fit(input_x, input_y)
    252                 if not (fit and component_name == self.compute_order[-1]):  # Don't call predict on the final component during fit
    253                     output = component_instance.predict(input_x)

~/Desktop/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
     16         @wraps(method)
     17         def _set_fit(self, X, y=None):
---> 18             return_value = method(self, X, y)
     19             self._is_fitted = True
     20             return return_value

~/Desktop/evalml/evalml/pipelines/components/estimators/estimator.py in fit(self, X, y)
     45         X, y = self._manage_woodwork(X, y)
     46         self.input_feature_names = list(X.columns)
---> 47         self._component_obj.fit(X, y)
     48         return self
     49 

~/Desktop/evalml_venv/lib/python3.7/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
    439         self._le = LabelEncoder().fit(y)
    440         self.classes_ = self._le.classes_
--> 441         return super().fit(X, self._le.transform(y), sample_weight)
    442 
    443     @if_delegate_has_method(delegate='final_estimator_')

~/Desktop/evalml_venv/lib/python3.7/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
    182                                        fit_params=fit_params,
    183                                        verbose=self.verbose)
--> 184             for est, meth in zip(all_estimators, self.stack_method_)
    185             if est != 'drop'
    186         )

~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
   1052 
   1053             with self._backend.retrieval_context():
-> 1054                 self.retrieve()
   1055             # Make sure that we get a last message telling us we are done
   1056             elapsed_time = time.time() - self._start_time

~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
    931             try:
    932                 if getattr(self._backend, 'supports_timeout', False):
--> 933                     self._output.extend(job.get(timeout=self.timeout))
    934                 else:
    935                     self._output.extend(job.get())

~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
    540         AsyncResults.get from multiprocessing."""
    541         try:
--> 542             return future.result(timeout=timeout)
    543         except CfTimeoutError as e:
    544             raise TimeoutError from e

~/.pyenv/versions/3.7.9/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
    433                 raise CancelledError()
    434             elif self._state == FINISHED:
--> 435                 return self.__get_result()
    436             else:
    437                 raise TimeoutError()

~/.pyenv/versions/3.7.9/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result

AttributeError: 'WordListCorpusReader' object has no attribute '_LazyCorpusLoader__args'

Eliminar todas las columnas de texto permite que el conjunto se ejecute hasta su finalización:

import pandas as pd
import woodwork as ww
data_set = pd.read_csv("titanic_alan.csv")
y = data_set['Survived']
X = data_set.drop(['Survived', 'Name', 'Embarked'], axis=1)
automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
automl.search()

Esto es potencialmente relevante: https://github.com/nltk/nltk/issues/1576

bug

Fuente

angela97lin

Comentario más útil

Para cualquiera que tome esto, configurar n_jobs=1 solo para el ensamblador en automl hace que el seguimiento de la pila original desaparezca. Puede ser un parche rápido, pero todavía hay espacio para encontrar la causa raíz e identificar una solución mejor.

from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
import pytest

X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')
X = X.drop(["Embarked"], axis=1)

components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]
pipelines = [BinaryClassificationPipeline(component_graph=components + [est]) for est in estimators]

ensemble = StackedEnsembleClassifier(pipelines, n_jobs=1)
ensemble.fit(X, y)

Además, alguien me revisa, pero puedo reproducir el problema original con evalml 0.23.0, así que no estoy seguro de que valga la pena investigar la diferencia entre 0.24.1 y 0.23.0.

freddyaboulton en 20 may. 2021

👍2

Todos 5 comentarios

Creo que podemos reproducir sin ejecutar la búsqueda; solo presionar ajustar en StackedEnsemblerClassifier genera el error. No estoy seguro de si se debe completamente al multiprocesamiento, usar Parallel(n_jobs=-1) con pipelines vanilla funciona. Creo que está sucediendo algo dentro de StackedEnsemblerClassifier :

from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
from joblib import Parallel, delayed
import pytest

X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')

components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]
pipelines = [BinaryClassificationPipeline(component_graph=components + [est]) for est in estimators]

# Fitting each pipeline individually works
for pl in pipelines:
    pl.fit(X, y)

# Wrapping each pipeline in sklearn works
for pl in pipelines:
    WrappedSKClassifier(pl).fit(X, y)


def fit_pipeline(pipeline, X, y):
    return pipeline.fit(X, y)

# Fitting pipelines in parallel works too
fit_pipelines = Parallel(n_jobs=-1)(delayed(fit_pipeline)(pl, X, y) for pl in pipelines)
fit_sklearn_wrapped = Parallel(n_jobs=-1)(delayed(fit_pipeline)(WrappedSKClassifier(pl), X, y) for pl in pipelines)


# Using ensemble does not work
ensemble = StackedEnsembleClassifier(pipelines)
with pytest.raises(AttributeError,
                   match="'WordListCorpusReader' object has no attribute '_LazyCorpusLoader__args'"):
    ensemble.fit(X, y)

freddyaboulton en 20 may. 2021

👍1

@freddyaboulton ¡increíble! Sí, yo también veo eso.

Sé que el conjunto de datos que nos proporcionaron se había barajado en relación con los datos titánicos estándar. Me pregunto si eso tiene algo que ver con la causa aquí.

dsherry en 20 may. 2021

from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
import pytest

X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')
X = X.drop(["Embarked"], axis=1)

components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]
pipelines = [BinaryClassificationPipeline(component_graph=components + [est]) for est in estimators]

ensemble = StackedEnsembleClassifier(pipelines, n_jobs=1)
ensemble.fit(X, y)

Además, alguien me revisa, pero puedo reproducir el problema original con evalml 0.23.0, así que no estoy seguro de que valga la pena investigar la diferencia entre 0.24.1 y 0.23.0.

freddyaboulton en 20 may. 2021

👍2

@freddyaboulton Creo que esto tiene sentido dado https://github.com/nltk/nltk/issues/1576, no estoy seguro de que hayan resuelto esto, pero también puedo reproducir con 0.23.0: ')

angela97lin en 20 may. 2021

Esto ciertamente parece estar cerca del punto

ParthivNaresh en 20 may. 2021

¿Fue útil esta página

0 / 5 - 0 calificaciones

Temas relacionados

Error al ejecutar AutoML en el conjunto de datos Iris

SydneyAyx · 3Comentarios

Agregar gráfico de dependencia parcial

dsherry · 3Comentarios

build_conda_pkg falla en main

dsherry · 3Comentarios

Actualice la canalización y los componentes para devolver estructuras de datos de Woodwork

angela97lin · 5Comentarios

Haga que automl ajuste automáticamente la mejor canalización en todos los datos de entrenamiento

dsherry · 3Comentarios