Evalml: टेक्स्ट कॉलम वाला टाइटैनिक डेटासेट एनसेंबलिंग (ensembling) में विफल रहता है

20 मई 2021 को निर्मित · 5 टिप्पणियाँ · स्रोत: alteryx/evalml

पुनरुत्पादक:

import pandas as pd
import woodwork as ww
data_set = pd.read_csv("titanic_alan.csv")
y = data_set['Survived']
X = data_set.drop(['Survived',], axis=1)
automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
automl.search()

यह निम्नलिखित त्रुटि के साथ विफल होता है:


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-9-7e028fe6ad03> in <module>
      1 automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
----> 2 automl.search()

~/Desktop/evalml/evalml/automl/automl_search.py in search(self, show_iteration_plot)
    598                     computation = computations[current_computation_index]
    599                     if computation.done():
--> 600                         evaluation = computation.get_result()
    601                         data, pipeline, job_log = evaluation.get('scores'), evaluation.get("pipeline"), evaluation.get("logger")
    602                         pipeline_id = self._post_evaluation_callback(pipeline, data, job_log)

~/Desktop/evalml/evalml/automl/engine/sequential_engine.py in get_result(self)
     33         Raises Exception: If computation fails. Returns traceback.
     34         """
---> 35         return self.work(**self.kwargs)
     36 
     37     def cancel(self):

~/Desktop/evalml/evalml/automl/engine/engine_base.py in evaluate_pipeline(pipeline, automl_config, X, y, logger)
    216     return train_and_score_pipeline(pipeline, automl_config=automl_config,
    217                                     full_X_train=X, full_y_train=y,
--> 218                                     logger=logger)
    219 
    220 

~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train, logger)
    171             if automl_config.error_callback is not None:
    172                 automl_config.error_callback(exception=e, traceback=traceback.format_tb(sys.exc_info()[2]), automl=automl_config,
--> 173                                              fold_num=i, pipeline=pipeline)
    174             if isinstance(e, PipelineScoreError):
    175                 nan_scores = {objective: np.nan for objective in e.exceptions}

~/Desktop/evalml/evalml/automl/callbacks.py in raise_error_callback(exception, traceback, automl, **kwargs)
     13     logger.error(f'AutoML search raised a fatal exception: {str(exception)}')
     14     logger.error("\n".join(traceback))
---> 15     raise exception
     16 
     17 

~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_and_score_pipeline(pipeline, automl_config, full_X_train, full_y_train, logger)
    160         try:
    161             logger.debug(f"\t\t\tFold {i}: starting training")
--> 162             cv_pipeline = train_pipeline(pipeline, X_train, y_train, automl_config.optimize_thresholds, automl_config.objective)
    163             logger.debug(f"\t\t\tFold {i}: finished training")
    164             if automl_config.optimize_thresholds and pipeline.can_tune_threshold_with_objective(automl_config.objective):

~/Desktop/evalml/evalml/automl/engine/engine_base.py in train_pipeline(pipeline, X, y, optimize_thresholds, objective)
    111                                                                   test_size=0.2, random_seed=pipeline.random_seed)
    112     cv_pipeline = pipeline.clone()
--> 113     cv_pipeline.fit(X, y)
    114     tune_binary_threshold(cv_pipeline, objective, cv_pipeline.problem_type,
    115                           X_threshold_tuning, y_threshold_tuning)

~/Desktop/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
     16         @wraps(method)
     17         def _set_fit(self, X, y=None):
---> 18             return_value = method(self, X, y)
     19             self._is_fitted = True
     20             return return_value

~/Desktop/evalml/evalml/pipelines/classification_pipeline.py in fit(self, X, y)
     35         self._encoder.fit(y)
     36         y = self._encode_targets(y)
---> 37         self._fit(X, y)
     38         return self
     39 

~/Desktop/evalml/evalml/pipelines/pipeline_base.py in _fit(self, X, y)
    217     def _fit(self, X, y):
    218         self.input_target_name = y.name
--> 219         self._component_graph.fit(X, y)
    220         self.input_feature_names = self._component_graph.input_feature_names
    221 

~/Desktop/evalml/evalml/pipelines/component_graph.py in fit(self, X, y)
    122         X = infer_feature_types(X)
    123         X = _convert_woodwork_types_wrapper(X.to_dataframe())
--> 124         self._compute_features(self.compute_order, X, y, fit=True)
    125         self._feature_provenance = self._get_feature_provenance(X.columns)
    126         return self

~/Desktop/evalml/evalml/pipelines/component_graph.py in _compute_features(self, component_list, X, y, fit)
    249             else:
    250                 if fit:
--> 251                     component_instance.fit(input_x, input_y)
    252                 if not (fit and component_name == self.compute_order[-1]):  # Don't call predict on the final component during fit
    253                     output = component_instance.predict(input_x)

~/Desktop/evalml/evalml/utils/base_meta.py in _set_fit(self, X, y)
     16         @wraps(method)
     17         def _set_fit(self, X, y=None):
---> 18             return_value = method(self, X, y)
     19             self._is_fitted = True
     20             return return_value

~/Desktop/evalml/evalml/pipelines/components/estimators/estimator.py in fit(self, X, y)
     45         X, y = self._manage_woodwork(X, y)
     46         self.input_feature_names = list(X.columns)
---> 47         self._component_obj.fit(X, y)
     48         return self
     49 

~/Desktop/evalml_venv/lib/python3.7/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
    439         self._le = LabelEncoder().fit(y)
    440         self.classes_ = self._le.classes_
--> 441         return super().fit(X, self._le.transform(y), sample_weight)
    442 
    443     @if_delegate_has_method(delegate='final_estimator_')

~/Desktop/evalml_venv/lib/python3.7/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
    182                                        fit_params=fit_params,
    183                                        verbose=self.verbose)
--> 184             for est, meth in zip(all_estimators, self.stack_method_)
    185             if est != 'drop'
    186         )

~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
   1052 
   1053             with self._backend.retrieval_context():
-> 1054                 self.retrieve()
   1055             # Make sure that we get a last message telling us we are done
   1056             elapsed_time = time.time() - self._start_time

~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
    931             try:
    932                 if getattr(self._backend, 'supports_timeout', False):
--> 933                     self._output.extend(job.get(timeout=self.timeout))
    934                 else:
    935                     self._output.extend(job.get())

~/Desktop/evalml_venv/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
    540         AsyncResults.get from multiprocessing."""
    541         try:
--> 542             return future.result(timeout=timeout)
    543         except CfTimeoutError as e:
    544             raise TimeoutError from e

~/.pyenv/versions/3.7.9/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
    433                 raise CancelledError()
    434             elif self._state == FINISHED:
--> 435                 return self.__get_result()
    436             else:
    437                 raise TimeoutError()

~/.pyenv/versions/3.7.9/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result

AttributeError: 'WordListCorpusReader' object has no attribute '_LazyCorpusLoader__args'

सभी टेक्स्ट कॉलम हटा देने पर एनसेंबलिंग बिना त्रुटि के पूरी हो जाती है:

import pandas as pd
import woodwork as ww
data_set = pd.read_csv("titanic_alan.csv")
y = data_set['Survived']
X = data_set.drop(['Survived', 'Name', 'Embarked'], axis=1)
automl = AutoMLSearch(X, y, problem_type="binary", error_callback=raise_error_callback, max_batches=10, ensembling=True)
automl.search()

यह संभावित रूप से प्रासंगिक है: https://github.com/nltk/nltk/issues/1576

bug

स्रोत

angela97lin

सबसे उपयोगी टिप्पणी

जो कोई भी इसे उठाता है, उसके लिए n_jobs=1 को automl में केवल एनसेंबलर के लिए सेट करना मूल स्टैक ट्रेस को दूर कर देता है। यह एक त्वरित पैच हो सकता है लेकिन मूल कारण खोजने और बेहतर समाधान की पहचान करने के लिए अभी भी जगह है।

from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
import pytest

X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')
X = X.drop(["Embarked"], axis=1)

components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]
pipelines = [BinaryClassificationPipeline(component_graph=components + [est]) for est in estimators]

ensemble = StackedEnsembleClassifier(pipelines, n_jobs=1)
ensemble.fit(X, y)

साथ ही, कोई इसकी पुष्टि कर ले, लेकिन मैं मूल समस्या को evalml 0.23.0 के साथ भी पुन: उत्पन्न कर पा रहा हूँ, इसलिए मुझे यकीन नहीं है कि 0.24.1 और 0.23.0 के बीच के अंतर की जाँच करना उपयोगी होगा।

freddyaboulton 20 मई 2021

👍2

सभी 5 टिप्पणियाँ

मुझे लगता है कि हम खोज (search) चलाए बिना भी इसे पुन: उत्पन्न कर सकते हैं - केवल StackedEnsembleClassifier पर fit कॉल करने से ही यह त्रुटि आ जाती है। मुझे यकीन नहीं है कि यह पूरी तरह से मल्टीप्रोसेसिंग के कारण है, क्योंकि सामान्य पाइपलाइनों के साथ Parallel(n_jobs=-1) का उपयोग करना ठीक काम करता है। मुझे लगता है कि समस्या StackedEnsembleClassifier के भीतर ही कहीं है:

from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
from joblib import Parallel, delayed
import pytest

X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')

components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]
pipelines = [BinaryClassificationPipeline(component_graph=components + [est]) for est in estimators]

# Fitting each pipeline individually works
for pl in pipelines:
    pl.fit(X, y)

# Wrapping each pipeline in sklearn works
for pl in pipelines:
    WrappedSKClassifier(pl).fit(X, y)


def fit_pipeline(pipeline, X, y):
    return pipeline.fit(X, y)

# Fitting pipelines in parallel works too
fit_pipelines = Parallel(n_jobs=-1)(delayed(fit_pipeline)(pl, X, y) for pl in pipelines)
fit_sklearn_wrapped = Parallel(n_jobs=-1)(delayed(fit_pipeline)(WrappedSKClassifier(pl), X, y) for pl in pipelines)


# Using ensemble does not work
ensemble = StackedEnsembleClassifier(pipelines)
with pytest.raises(AttributeError,
                   match="'WordListCorpusReader' object has no attribute '_LazyCorpusLoader__args'"):
    ensemble.fit(X, y)

freddyaboulton 20 मई 2021

👍1

@freddyaboulton कमाल! हाँ मैं वह भी देखता हूँ।

मुझे पता है कि हमें जो डेटासेट प्रदान किया गया था, वह मानक टाइटैनिक डेटा के सापेक्ष फेरबदल किया गया था। मुझे आश्चर्य है कि क्या इसका यहां के कारण से कोई लेना-देना है।

dsherry 20 मई 2021

from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import StackedEnsembleClassifier
from evalml.pipelines.components.utils import WrappedSKClassifier
import pandas as pd
import pytest

X = pd.read_csv("/Users/freddy.boulton/Downloads/titanic_text.csv")
y = X.pop('Survived')
X = X.drop(["Embarked"], axis=1)

components = ["Imputer", "Text Featurization Component", "One Hot Encoder"]
estimators = ["Random Forest Classifier", "CatBoost Classifier", "Extra Trees Classifier"]
pipelines = [BinaryClassificationPipeline(component_graph=components + [est]) for est in estimators]

ensemble = StackedEnsembleClassifier(pipelines, n_jobs=1)
ensemble.fit(X, y)

freddyaboulton 20 मई 2021

👍2

@freddyaboulton मुझे लगता है कि https://github.com/nltk/nltk/issues/1576 को देखते हुए यह समझ में आता है। मुझे यकीन नहीं है कि उन्होंने इसे कभी हल किया या नहीं, लेकिन मैं इसे 0.23.0 के साथ भी पुन: उत्पन्न कर सकती हूँ :')

angela97lin 20 मई 2021

यह निश्चित रूप से निकट बिंदु लगता है

ParthivNaresh 20 मई 2021

क्या यह पृष्ठ उपयोगी था?

0 / 5 - 0 रेटिंग्स

Evalml: टेक्स्ट कॉलम वाला टाइटैनिक डेटासेट एनसेंबलिंग (ensembling) में विफल रहता है

सबसे उपयोगी टिप्पणी

सभी 5 टिप्पणियाँ

संबंधित मुद्दे