I have the following code, which works normally, but I got the warning
"UserWarning: One or more of the test scores are non-finite: [nan nan]" (category=UserWarning) when I revised it into a more concise version (shown in the second code snippet below). Is the output of the one-hot encoder the cause of the issue?
import pandas as pd from sklearn.model_selection import StratifiedKFold from sklearn.linear_model import RidgeClassifier from sklearn.compose import ColumnTransformer from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.model_selection import GridSearchCV train = pd.read_csv('/train.csv') test = pd.read_csv('/test.csv') sparse_features = [col for col in train.columns if col.startswith('cat')] dense_features = [col for col in train.columns if col not in sparse_features+['target']] X = train.drop(['target'], axis=1) y = train['target'].values skf = StratifiedKFold(n_splits=5) clf = RidgeClassifier() full_pipeline = ColumnTransformer(transformers=[ ('num', StandardScaler(), dense_features), ('cat', OneHotEncoder(), sparse_features) ]) X_prepared = full_pipeline.fit_transform(X) param_grid = { 'alpha': [ 0.1], 'fit_intercept': [False] } gs = GridSearchCV( estimator=clf, param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=skf ) gs.fit(X_prepared, y) The revision is shown below.
clf2 = RidgeClassifier() preprocess_pipeline2 = ColumnTransformer([ ('num', StandardScaler(), dense_features), ('cat', OneHotEncoder(), sparse_features) ]) from sklearn.pipeline import Pipeline final_pipeline = Pipeline(steps=[ ('p', preprocess_pipeline2), ('c', clf2) ]) param_grid2 = { 'c__alpha': [0.4, 0.1], 'c__fit_intercept': [False] } gs2 = GridSearchCV( estimator=final_pipeline, param_grid=param_grid2, scoring='roc_auc', n_jobs=-1, cv=skf ) gs2.fit(X, y) Can anyone point out which part goes wrong?
EDIT: After setting error_score='raise' in GridSearchCV, I get more feedback about the issue (see the traceback below). It seems to me that I need to fit the one-hot encoder on the merged dataset that combines the training set and the test set. Am I correct? But if that is the case, why doesn't the first version complain about the same issue? BTW, does it make sense to pass the argument handle_unknown='ignore' to OneHotEncoder to handle this issue?
ValueError --------------------------------------------------------------------------- _RemoteTraceback Traceback (most recent call last) _RemoteTraceback: """ Traceback (most recent call last): File "/opt/conda/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 431, in _process_worker r = call_item() File "/opt/conda/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 285, in __call__ return self.fn(*self.args, **self.kwargs) File "/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 595, in __call__ return self.func(*args, **kwargs) File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 263, in __call__ for func, args, kwargs in self.items] File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 263, in <listcomp> for func, args, kwargs in self.items] File "/opt/conda/lib/python3.7/site-packages/sklearn/utils/fixes.py", line 222, in __call__ return self.function(*args, **kwargs) File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 620, in _fit_and_score test_scores = _score(estimator, X_test, y_test, scorer, error_score) File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 674, in _score scores = scorer(estimator, X_test, y_test) File "/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 200, in __call__ sample_weight=sample_weight) File "/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 334, in _score y_pred = method_caller(clf, "decision_function", X) File "/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 53, in _cached_call return getattr(estimator, method)(*args, **kwargs) File "/opt/conda/lib/python3.7/site-packages/sklearn/utils/metaestimators.py", line 120, in <lambda> out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs) File "/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py", line 493, in 
decision_function Xt = transform.transform(Xt) File "/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py", line 565, in transform Xs = self._fit_transform(X, None, _transform_one, fitted=True) File "/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py", line 444, in _fit_transform self._iter(fitted=fitted, replace_strings=True), 1)) File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 1044, in __call__ while self.dispatch_one_batch(iterator): File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch self._dispatch(tasks) File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 777, in _dispatch job = self._backend.apply_async(batch, callback=cb) File "/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 208, in apply_async result = ImmediateResult(func) File "/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 572, in __init__ self.results = batch() File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 263, in __call__ for func, args, kwargs in self.items] File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 263, in <listcomp> for func, args, kwargs in self.items] File "/opt/conda/lib/python3.7/site-packages/sklearn/utils/fixes.py", line 222, in __call__ return self.function(*args, **kwargs) File "/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py", line 733, in _transform_one res = transformer.transform(X) File "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py", line 462, in transform force_all_finite='allow-nan') File "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py", line 136, in _transform raise ValueError(msg) ValueError: Found unknown categories ['MR', 'MW', 'DA'] in column 10 during transform """ The above exception was the direct cause of the following exception: ValueError Traceback (most recent 
call last) <ipython-input-48-b81f3b7b0724> in <module> 21 cv=skf 22 ) ---> 23 gs2.fit(X, y) /opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs) 61 extra_args = len(args) - len(all_args) 62 if extra_args <= 0: ---> 63 return f(*args, **kwargs) 64 65 # extra_args > 0 /opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params) 839 return results 840 --> 841 self._run_search(evaluate_candidates) 842 843 # multimetric is determined here because in the case of a callable /opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates) 1286 def _run_search(self, evaluate_candidates): 1287 """Search all candidates in param_grid""" -> 1288 evaluate_candidates(ParameterGrid(self.param_grid)) 1289 1290 /opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results) 807 (split_idx, (train, test)) in product( 808 enumerate(candidate_params), --> 809 enumerate(cv.split(X, y, groups)))) 810 811 if len(out) < 1: /opt/conda/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable) 1052 1053 with self._backend.retrieval_context(): -> 1054 self.retrieve() 1055 # Make sure that we get a last message telling us we are done 1056 elapsed_time = time.time() - self._start_time /opt/conda/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self) 931 try: 932 if getattr(self._backend, 'supports_timeout', False): --> 933 self._output.extend(job.get(timeout=self.timeout)) 934 else: 935 self._output.extend(job.get()) /opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout) 540 AsyncResults.get from multiprocessing.""" 541 try: --> 542 return future.result(timeout=timeout) 543 except CfTimeoutError as e: 544 raise TimeoutError from e /opt/conda/lib/python3.7/concurrent/futures/_base.py in result(self, 
timeout) 433 raise CancelledError() 434 elif self._state == FINISHED: --> 435 return self.__get_result() 436 else: 437 raise TimeoutError() /opt/conda/lib/python3.7/concurrent/futures/_base.py in __get_result(self) 382 def __get_result(self): 383 if self._exception: --> 384 raise self._exception 385 else: 386 return self._result ValueError: Found unknown categories ['MR', 'MW', 'DA'] in column 10 during transform https://stackoverflow.com/questions/66620269/i-got-the-warning-userwarning-one-or-more-of-the-test-scores-are-non-finite-w March 14, 2021 at 09:37AM
没有评论:
发表评论