ValueError in pipeline - featureHasher not working?
我认为我在使Vectorizer在gridsearch管道中工作时遇到问题:
数据为panda df x_train:
# Preview of x_train (pandas DataFrame). The unlabeled first value on each
# row appears to be the DataFrame index.
#
#         bathrooms  bedrooms  price  building_id                       manager_id
# 10            1.5         3   3000  53a5b119ba8f7b61d4e010512e0dfc85  5ba989232d0489da1b5f2c45f6688adc
# 10000         1.0         2   5465  c5c8a357cba207596b04d1afd1e4f130  7533621a882f71e25173b27e3139d83d
# 100004        1.0         1   2850  c3ba40552e2120b0acfc3cb5730bb2aa  d9039c43983f6e564b1482b273bd7b01
# 100007        1.0         1   3275  28d9ad350afeaab8027513a3e52ac8d5  1067e078446a7897d2da493d2f741316
# 100013        1.0         4   3350  0                                 98e13ad4b495b9613cef886d79a6291f

# Columns routed to the numeric scaler.
numeric_predictors = ['bathrooms', 'bedrooms', 'price']

# Columns routed to the categorical vectorizer.
categorical_predictors = ['building_id', 'manager_id']
使用 MinMaxScaler 进行拟合(fit)和转换(transform)的缩放器:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler


class MyScaler(BaseEstimator, TransformerMixin):
    """Apply min-max scaling to a chosen subset of DataFrame columns."""

    def __init__(self, cols):
        """
        args:
            cols: a list of column names to select from X and scale
        """
        self.cols = cols

    def fit(self, X, y=None):
        """Fit a fresh MinMaxScaler on the selected columns; return self."""
        # A new scaler is created on every fit, so refitting never reuses
        # statistics from a previous call.
        self.scaler = MinMaxScaler()
        self.scaler.fit(X[self.cols])
        return self

    def transform(self, X):
        """Return the selected columns of X scaled by the fitted scaler."""
        return self.scaler.transform(X[self.cols])
我的分类特征哈希向量器:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import HashingVectorizer


class MyVectorizer(BaseEstimator, TransformerMixin):
    """
    Vectorize a set of categorical variables
    """

    def __init__(self, cols, hashing=None):
        """
        args:
            cols: a list of column names of the categorical variables
            hashing:
                If None, then vectorization is a simple one-hot-encoding.
                If an integer, then hashing is the number of features in
                the output.
        """
        self.cols = cols
        self.hashing = hashing

    def _records(self, X):
        """Return the selected columns of X as a list of per-row dicts."""
        return X[self.cols].to_dict(orient='records')

    def fit(self, X, y=None):
        """Choose and fit the underlying vectorizer; return self."""
        if self.hashing is None:
            # One-hot encoding of dict records is DictVectorizer's job;
            # HashingVectorizer expects raw text and has no feature_names_.
            # sparse=False so transform() can wrap the result in a DataFrame.
            self.myvec = DictVectorizer(sparse=False)
        else:
            # NOTE: with its default signed hashing, FeatureHasher can emit
            # negative values, which estimators such as MultinomialNB reject.
            self.myvec = FeatureHasher(n_features=self.hashing)

        self.myvec.fit(self._records(X))
        return self

    def transform(self, X):
        """Return the vectorized categorical columns as a DataFrame."""
        encoded = self.myvec.transform(self._records(X))
        if self.hashing is None:
            return pd.DataFrame(encoded, columns=self.myvec.feature_names_)
        # FeatureHasher returns a sparse matrix; densify it for the DataFrame.
        return pd.DataFrame(encoded.toarray())
GridSearch超参数:
# Hyper-parameter grid for the grid search. Keys follow the
# <step>__<sub-step>__<parameter> naming used for nested pipeline steps.
search_params = {
    'preprocess__vectorize__hashing': [20, 40, 80],
    'predict__alpha': [.01, .1, 1, 2, 10],
}
管道:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LinearRegression
# MultinomialNB is the final estimator below but was never imported.
from sklearn.naive_bayes import MultinomialNB

# Scale the numeric columns and vectorize the categorical ones side by side,
# then pass the combined feature matrix to the classifier.
pipeline = Pipeline([
    ('preprocess', FeatureUnion([
        ('scale', MyScaler(cols=numeric_predictors)),
        ('vectorize', MyVectorizer(cols=categorical_predictors, hashing=5)),
    ])),
    ('predict', MultinomialNB()),
])
最后,使用gridsearchCV分类器调用此方法:
# GridSearchCV is used below but was never imported.
from sklearn.model_selection import GridSearchCV

# Search every combination in search_params, cross-validating the pipeline.
grid_search = GridSearchCV(pipeline, search_params)
grid_search.fit(x_train, y_train)
我收到ValueError:输入X必须为非负数。 我检查了一下,然后我的numeric_predictor列的数据全部为非负数,因此我将其范围缩小到分类预测变量的散列问题。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | ValueError Traceback (most recent call last) <ipython-input-62-50522376d1e5> in <module>() 1 grid_search = GridSearchCV(pipeline, search_params) ----> 2 grid_search.fit(x_train, y_train) 3 grid_search.best_params_ /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc in fit(self, X, y, groups, **fit_params) 636 error_score=self.error_score) 637 for parameters, (train, test) in product(candidate_params, --> 638 cv.split(X, y, groups))) 639 640 # if one choose to see train score,"out" will contain train score info /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable) 777 # was dispatched. In particular this covers the edge 778 # case of Parallel used with an exhausted iterator. 
--> 779 while self.dispatch_one_batch(iterator): 780 self._iterating = True 781 else: /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator) 623 return False 624 else: --> 625 self._dispatch(tasks) 626 return True 627 /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch) 586 dispatch_timestamp = time.time() 587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self) --> 588 job = self._backend.apply_async(batch, callback=cb) 589 self._jobs.append(job) 590 /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback) 109 def apply_async(self, func, callback=None): 110 """Schedule a func to be run""" --> 111 result = ImmediateResult(func) 112 if callback: 113 callback(result) /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch) 330 # Don't delay the application, to avoid keeping the input 331 # arguments in memory --> 332 self.results = batch() 333 334 def get(self): /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self) 129 130 def __call__(self): --> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items] 132 133 def __len__(self): /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score) 435 estimator.fit(X_train, **fit_params) 436 else: --> 437 estimator.fit(X_train, y_train, **fit_params) 438 439 except Exception as e: /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params) 257 Xt, fit_params = self._fit(X, y, **fit_params) 258 if self._final_estimator is not None: --> 
259 self._final_estimator.fit(Xt, y, **fit_params) 260 return self 261 /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in fit(self, X, y, sample_weight) 602 self.feature_count_ = np.zeros((n_effective_classes, n_features), 603 dtype=np.float64) --> 604 self._count(X, Y) 605 alpha = self._check_alpha() 606 self._update_feature_log_prob(alpha) /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _count(self, X, Y) 706 """Count and smooth feature occurrences.""" 707 if np.any((X.data if issparse(X) else X) < 0): --> 708 raise ValueError("Input X must be non-negative") 709 self.feature_count_ += safe_sparse_dot(Y.T, X) 710 self.class_count_ += Y.sum(axis=0) ValueError: Input X must be non-negative > /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.py(708)_count() 706 """Count and smooth feature occurrences.""" 707 if np.any((X.data if issparse(X) else X) < 0): --> 708 raise ValueError("Input X must be non-negative") 709 self.feature_count_ += safe_sparse_dot(Y.T, X) 710 self.class_count_ += Y.sum(axis=0) |
是的,当 hashing 不为 None 时,会选择 FeatureHasher,而它的输出可能包含负值。
但是您可以按照文档中的说明使用 FeatureHasher 的 non_negative 参数:
non_negative : boolean, optional, default False
When True, an absolute value is applied to the features matrix prior
to returning it. When used in conjunction with
alternate_sign=True, this significantly reduces the inner product
preservation property.
因此,请在MyVectorizer中更改此行:
1 | self.myvec = FeatureHasher(n_features = self.hashing) |
对此:
# non_negative was deprecated in scikit-learn 0.19 and removed in 0.21.
# alternate_sign=False (available since 0.19) disables the random sign flip,
# so the hashed string features stay non-negative.
self.myvec = FeatureHasher(n_features=self.hashing, alternate_sign=False)
注意:
- non_negative 参数自版本0.19起已被弃用,并将在0.21中删除;在新版本中请改用 alternate_sign=False。
- 您需要研究此参数将如何影响分类问题。