Evalml: Titanic dataset with text columns fails on ensembling

Dataset: titanic_text.csv


import pandas as pd
import woodwork as ww
data_set = pd.read_csv("titanic_alan.csv")
y = data_set['Survived']
X = data_set.drop(['Survived',], axis=1)
automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)

Errors with:

AttributeError                            Traceback (most recent call last)
<ipython-input-9-7e028fe6ad03> in <module>
      1 automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
----> 2 automl.search()

~/Desktop/evalml/evalml/automl/automl_search.py in search(self, show_iteration_plot)
    598                     computation = computations[current_computation_index]
    599                     if computation.done():
--> 600                         evaluation = computation.get_result()
    601                         data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger")
    602                         pipeline_id = self._post_evaluation_callback(pipeline, data, job_log)

~/Desktop/evalml/evalml/automl/engine/sequential_engine.py in get_result(self)
     33         Raises Exception: If computation fails. Returns traceback.
     34         """
---> 35         return self.work(**self.kwargs)
     37     def cancel(self):

~/Desktop/evalml/evalml/automl/engine/engine_base.py in evaluate_pipeline(pipeline, automl_config, X, y, logger)
    216     return train_and_score_pipeline(pipeline, automl_config=automl_config,
    217                                     full_X_train=X, full_y_train=y,
--> 218                                     logger=logger)

~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train, logger)
    171             if automl_config.error_callback is not None:
    172                 automl_config.error_callback(exception=e, traceback=traceback.format_tb(sys.exc_info()[2]), automl=automl_config,
--> 173                                              fold_num=i, pipeline=pipeline)
    174             if isinstance(e, PipelineScoreError):
    175                 nan_scores = {objective: np.nan for objective in e.exceptions}

~/Desktop/evalml/evalml/automl/callbacks.py in raise_error_callback(exception, traceback, automl, **kwargs)
     13     logger.error(f'AutoML search raised a fatal exception: {str(exception)}')
     14     logger.error("\n".join(traceback))
---> 15     raise exception

~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train, logger)
    160         try:
    161             logger.debug(f"\t\t\tFold {i}: starting training")
--> 162             cv_pipeline = train_pipeline(pipeline, X_train, y_train, automl_config.optimize_thresholds, automl_config.objective)
    163             logger.debug(f"\t\t\tFold {i}: finished training")
    164             if automl_config.optimize_thresholds and pipeline.can_tune_threshold_with_objective(automl_config.objective):

~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_pipeline(pipeline, X, y, optimize_thresholds, objective)
    111                                                                   test_size=0.2, random_seed=pipeline.random_seed)
    112     cv_pipeline = pipeline.clone()
--> 113     cv_pipeline.fit(X, y)
    114     tune_binary_threshold(cv_pipeline, objective, cv_pipeline.problem_type,
    115                           X_threshold_tuning, y_threshold_tuning)

~/Desktop/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
     16         @wraps(method)
     17         def _set_fit(self, X, y=None):
---> 18             return_value = method(self, X, y)
     19             self._is_fitted = True
     20             return return_value

~/Desktop/evalml/evalml/pipelines/classification_pipeline.py in fit(self, X, y)
     35         self._encoder.fit(y)
     36         y = self._encode_targets(y)
---> 37         self._fit(X, y)
     38         return self

~/Desktop/evalml/evalml/pipelines/pipeline_base.py in _fit(self, X, y)
    217     def _fit(self, X, y):
    218         self.input_target_name = y.name
--> 219         self._component_graph.fit(X, y)
    220         self.input_feature_names = self._component_graph.input_feature_names

~/Desktop/evalml/evalml/pipelines/component_graph.py in fit(self, X, y)
    122         X = infer_feature_types(X)
    123         X = _convert_woodwork_types_wrapper(X.to_dataframe())
--> 124         self._compute_features(self.compute_order, X, y, fit=True)
    125         self._feature_provenance = self._get_feature_provenance(X.columns)
    126         return self

~/Desktop/evalml/evalml/pipelines/component_graph.py in _compute_features(self, component_list, X, y, fit)
    249             else:
    250                 if fit:
--> 251                     component_instance.fit(input_x, input_y)
    252                 if not (fit and component_name == self.compute_order[-1]):  # Don't call predict on the final component during fit
    253                     output = component_instance.predict(input_x)

~/Desktop/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
     16         @wraps(method)
     17         def _set_fit(self, X, y=None):
---> 18             return_value = method(self, X, y)
     19             self._is_fitted = True
     20             return return_value

~/Desktop/evalml/evalml/pipelines/components/estimators/estimator.py in fit(self, X, y)
     45         X, y = self._manage_woodwork(X, y)
     46         self.input_feature_names = list(X.columns)
---> 47         self._component_obj.fit(X, y)
     48         return self

~/Desktop/evalml_venv/lib/python3.7/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
    439         self._le = LabelEncoder().fit(y)
    440         self.classes_ = self._le.classes_
--> 441         return super().fit(X, self._le.transform(y), sample_weight)
    443     @if_delegate_has_method(delegate='final_estimator_')

~/Desktop/evalml_venv/lib/python3.7/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
    182                                        fit_params=fit_params,
    183                                        verbose=self.verbose)
--> 184             for est, meth in zip(all_estimators, self.stack_method_)
    185             if est != 'drop'
    186         )

~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
   1053             with self._backend.retrieval_context():
-> 1054                 self.retrieve()
   1055             # Make sure that we get a last message telling us we are done
   1056             elapsed_time = time.time() - self._start_time

~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
    931             try:
    932                 if getattr(self._backend, 'supports_timeout', False):
--> 933                     self._output.extend(job.get(timeout=self.timeout))
    934                 else:
    935                     self._output.extend(job.get())

~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
    540         AsyncResults.get from multiprocessing."""
    541         try:
--> 542             return future.result(timeout=timeout)
    543         except CfTimeoutError as e:
    544             raise TimeoutError from e

~/.pyenv/versions/3.7.9/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
    433                 raise CancelledError()
    434             elif self._state == FINISHED:
--> 435                 return self.__get_result()
    436             else:
    437                 raise TimeoutError()

~/.pyenv/versions/3.7.9/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result

AttributeError: 'WordListCorpusReader' object has no attribute '_LazyCorpusLoader__args'

Dropping all text columns allows the ensemble to run to completion:

import pandas as pd
import woodwork as ww
data_set = pd.read_csv("titanic_alan.csv")
y = data_set['Survived']
X = data_set.drop(['Survived', 'Name', 'Embarked'], axis=1)
automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)

This is potentially relevant: https://github.com/nltk/nltk/issues/1576


For whoever picks this up, setting n_jobs=1 for only the ensembler in automl makes the original stack trace go away. That can be a quick patch but there's still room to find the root cause and identify a better fix.

from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
import pytest

X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')
X = X.drop(["Embarked"], axis=1)

components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]
pipelines = [BinaryClassificationPipeline(component_graph=components + [est]) for est in estimators]

ensemble = StackedEnsembleClassifier(pipelines, n_jobs=1)
ensemble.fit(X, y)

Also, someone please double-check me, but I can repro the original issue with evalml 0.23.0, so I'm not sure it's worth investigating the diff between 0.24.1 and 0.23.0.

All 5 comments

I think we can repro without running search - just calling fit on StackedEnsembleClassifier raises the error. I'm not sure it's entirely due to multiprocessing, since using Parallel(n_jobs=-1) with vanilla pipelines works. I think it's something happening within StackedEnsembleClassifier:

from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
from joblib import Parallel, delayed
import pytest

X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')

components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]
pipelines = [BinaryClassificationPipeline(component_graph=components + [est]) for est in estimators]

# Fitting each pipeline individually works
for pl in pipelines:
    pl.fit(X, y)

# Wrapping each pipeline in sklearn works
for pl in pipelines:
    WrappedSKClassifier(pl).fit(X, y)

def fit_pipeline(pipeline, X, y):
    return pipeline.fit(X, y)

# Fitting pipelines in parallel works too
fit_pipelines = Parallel(n_jobs=-1)(delayed(fit_pipeline)(pl, X, y) for pl in pipelines)
fit_sklearn_wrapped = Parallel(n_jobs=-1)(delayed(fit_pipeline)(WrappedSKClassifier(pl), X, y) for pl in pipelines)

# Using ensemble does not work
ensemble = StackedEnsembleClassifier(pipelines)
with pytest.raises(AttributeError,
                   match="'WordListCorpusReader' object has no attribute '_LazyCorpusLoader__args'"):
    ensemble.fit(X, y)

@freddyaboulton awesome! Yeah I see that too.

I know the dataset we were provided had been shuffled relative to the standard titanic data. I wonder if that has something to do with the cause here.

For whoever picks this up, setting n_jobs=1 for only the ensembler in automl makes the original stack trace go away. That can be a quick patch but there's still room to find the root cause and identify a better fix.

from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
import pytest

X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')
X = X.drop(["Embarked"], axis=1)

components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]
pipelines = [BinaryClassificationPipeline(component_graph=components + [est]) for est in estimators]

ensemble = StackedEnsembleClassifier(pipelines, n_jobs=1)
ensemble.fit(X, y)

Also, someone please double-check me, but I can repro the original issue with evalml 0.23.0, so I'm not sure it's worth investigating the diff between 0.24.1 and 0.23.0.

@freddyaboulton I think this makes sense given https://github.com/nltk/nltk/issues/1576, not sure they ever resolved this but I can also repro with 0.23.0 :')

This certainly seems to be on point.

