Python:管道中出现ValueError——FeatureHasher无法正常工作?

ValueError in pipeline - featureHasher not working?

我认为我在使Vectorizer在gridsearch管道中工作时遇到问题:

数据是一个pandas DataFrame,名为x_train:

1
2
3
4
5
6
7
8
9
bathrooms   bedrooms    price   building_id     manager_id
10  1.5     3   3000    53a5b119ba8f7b61d4e010512e0dfc85    5ba989232d0489da1b5f2c45f6688adc
10000   1.0     2   5465    c5c8a357cba207596b04d1afd1e4f130    7533621a882f71e25173b27e3139d83d
100004  1.0     1   2850    c3ba40552e2120b0acfc3cb5730bb2aa    d9039c43983f6e564b1482b273bd7b01
100007  1.0     1   3275    28d9ad350afeaab8027513a3e52ac8d5    1067e078446a7897d2da493d2f741316
100013  1.0     4   3350    0   98e13ad4b495b9613cef886d79a6291f

# Column groups of x_train: numeric columns and categorical (ID) columns.
numeric_predictors = [
    'bathrooms',
    'bedrooms',
    'price',
]
categorical_predictors = [
    'building_id',
    'manager_id',
]

MinMaxScaler的拟合(fit)与转换(transform):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

class MyScaler(BaseEstimator, TransformerMixin):
    """Min-max scale a chosen subset of columns.

    Wraps ``MinMaxScaler`` so that it can be used as a pipeline step
    which selects its own columns from the input.
    """

    def __init__(self, cols):
        """Store the column names to scale.

        Args:
            cols: list of column names, used as ``X[cols]``.
        """
        self.cols = cols

    def fit(self, X, y=None):
        """Fit a fresh ``MinMaxScaler`` on the selected columns.

        Args:
            X: input supporting ``X[self.cols]`` selection.
            y: unused; accepted so the step fits the pipeline API.

        Returns:
            self
        """
        self.scaler = MinMaxScaler()
        self.scaler.fit(X[self.cols])
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` scaled by the fitted scaler."""
        return self.scaler.transform(X[self.cols])

我的分类特征哈希向量器:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import HashingVectorizer

class MyVectorizer(BaseEstimator, TransformerMixin):
    """Vectorize a set of categorical variables."""

    def __init__(self, cols, hashing=None):
        """Store the configuration; no work is done until ``fit``.

        Args:
            cols: a list of column names of the categorical variables
            hashing:
                If None, then vectorization is a simple one-hot-encoding.
                If an integer, then hashing is the number of features in
                the output.
        """
        self.cols = cols
        self.hashing = hashing

    def _records(self, X):
        """Return the selected columns of ``X`` as a list of per-row dicts."""
        return X[self.cols].to_dict(orient='records')

    def fit(self, X, y=None):
        """Choose the vectorizer according to ``hashing`` and fit it.

        Args:
            X: input supporting ``X[self.cols].to_dict(orient='records')``.
            y: unused; accepted so the step fits the pipeline API.

        Returns:
            self
        """
        if self.hashing is None:
            # One-hot encoding of dict records. DictVectorizer (unlike
            # HashingVectorizer, which expects raw text documents) accepts
            # dicts and exposes ``feature_names_`` for the column labels
            # used in transform(). Dense output so it can be wrapped in a
            # DataFrame directly.
            self.myvec = DictVectorizer(sparse=False)
        else:
            # NOTE: FeatureHasher can emit negative feature values, which
            # estimators requiring non-negative input will reject.
            self.myvec = FeatureHasher(n_features=self.hashing)

        self.myvec.fit(self._records(X))
        return self

    def transform(self, X):
        """Vectorize the selected columns of ``X``.

        Returns:
            A ``pd.DataFrame``. In one-hot mode the columns are labelled
            with the fitted feature names; in hashing mode the hashed
            matrix is densified and the columns keep default labels.
        """
        encoded = self.myvec.transform(self._records(X))
        if self.hashing is None:
            return pd.DataFrame(encoded, columns=self.myvec.feature_names_)
        # FeatureHasher returns a sparse matrix; densify before wrapping.
        return pd.DataFrame(encoded.toarray())

GridSearch超参数:

1
2
3
4
# Hyper-parameter grid for the search. Each key addresses a nested
# pipeline parameter as ``<step>__<param>``.
search_params = dict(
    preprocess__vectorize__hashing=[20, 40, 80],
    predict__alpha=[.01, .1, 1, 2, 10],
)

管道:

1
2
3
4
5
6
7
8
9
10
11
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LinearRegression

# Two stages: assemble the feature matrix, then fit the final estimator.
pipeline = Pipeline(steps=[
    # Side-by-side union of the scaled numeric columns and the
    # vectorized categorical columns.
    ('preprocess', FeatureUnion(transformer_list=[
        ('scale', MyScaler(cols=numeric_predictors)),
        ('vectorize', MyVectorizer(cols=categorical_predictors, hashing=5)),
    ])),
    ('predict', MultinomialNB()),
])

最后,使用gridsearchCV分类器调用此方法:

1
2
# Search the pipeline over search_params and fit on the training data.
grid_search = GridSearchCV(estimator=pipeline, param_grid=search_params)
grid_search.fit(x_train, y_train)

我收到了 ValueError: Input X must be non-negative(输入X必须为非负数)。我检查过,numeric_predictors中各列的数据全部为非负数,因此我把问题范围缩小到了分类预测变量的哈希处理上。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
    ValueError                                Traceback (most recent call last)
<ipython-input-62-50522376d1e5> in <module>()
      1 grid_search = GridSearchCV(pipeline, search_params)
----> 2 grid_search.fit(x_train, y_train)
      3 grid_search.best_params_

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc in fit(self, X, y, groups, **fit_params)
    636                                   error_score=self.error_score)
    637           for parameters, (train, test) in product(candidate_params,
--> 638                                                    cv.split(X, y, groups)))
    639
    640         # if one choose to see train score,"out" will contain train score info

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110        """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333
    334     def get(self):

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
    129
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132
    133     def __len__(self):

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    435             estimator.fit(X_train, **fit_params)
    436         else:
--> 437             estimator.fit(X_train, y_train, **fit_params)
    438
    439     except Exception as e:

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params)
    257         Xt, fit_params = self._fit(X, y, **fit_params)
    258         if self._final_estimator is not None:
--> 259             self._final_estimator.fit(Xt, y, **fit_params)
    260         return self
    261

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in fit(self, X, y, sample_weight)
    602         self.feature_count_ = np.zeros((n_effective_classes, n_features),
    603                                        dtype=np.float64)
--> 604         self._count(X, Y)
    605         alpha = self._check_alpha()
    606         self._update_feature_log_prob(alpha)

/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _count(self, X, Y)
    706        """Count and smooth feature occurrences."""
    707         if np.any((X.data if issparse(X) else X) < 0):
--> 708             raise ValueError("Input X must be non-negative")
    709         self.feature_count_ += safe_sparse_dot(Y.T, X)
    710         self.class_count_ += Y.sum(axis=0)

ValueError: Input X must be non-negative

> /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.py(708)_count()
    706        """Count and smooth feature occurrences."""
    707         if np.any((X.data if issparse(X) else X) < 0):
--> 708             raise ValueError("Input X must be non-negative")
    709         self.feature_count_ += safe_sparse_dot(Y.T, X)
    710         self.class_count_ += Y.sum(axis=0)

是的,当hashing不为None时,代码会选择FeatureHasher(),而它可能输出负值。

不过,按照文档中的说明,您可以使用FeatureHasher的non_negative参数把这些负值转换为正值:

non_negative : boolean, optional, default False

When True, an absolute value is applied to the features matrix prior
to returning it. When used in conjunction with
alternate_sign=True, this significantly reduces the inner product
preservation property.

因此,请在MyVectorizer中更改此行:

1
self.myvec = FeatureHasher(n_features = self.hashing)

改为:

1
self.myvec = FeatureHasher(n_features = self.hashing, non_negative=True)

注意:

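  • 此参数自版本0.19起已被弃用,并已在0.21中删除。在较新的版本中可改用FeatureHasher(n_features=self.hashing, alternate_sign=False):关闭符号交替后,只要输入值本身非负,哈希得到的特征值就不会出现负数。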
  • 您需要研究此参数将如何影响分类问题。