Pythonã§ã¹ããŒã¹è¡åã䜿çšããŠxgboostããã¬ãŒãã³ã°ããŠãããšãã«ãValueErrorïŒfeature_namesã®äžäžèŽãçºçããŸãã
xgboostããŒãžã§ã³ã¯gitããææ°ã®ãã®ã§ãã å€ãããŒãžã§ã³ã§ã¯ãã®ãšã©ãŒã¯çºçããŸããã äºæž¬æéäžã«ãšã©ãŒãè¿ãããŸãã
from scipy import sparse
import xgboost as xgb
from random import randint  # explicit import instead of "from random import *"


def randBinList(n):
    """Return a list of n random 0/1 labels."""
    return [randint(0, 1) for _ in range(n)]


# 100 samples x 500 features; sparse.rand very likely leaves the last
# column(s) all-zero, which is what triggers the feature_names mismatch
# in predict_proba() on affected xgboost versions.
train = sparse.rand(100, 500)
test = sparse.rand(10, 500)
y = randBinList(100)
clf = xgb.XGBClassifier()
clf.fit(train, y)
preds = clf.predict_proba(test)  # raises ValueError on broken versions
ValueError Traceback (most recent call last)
<ipython-input-15-e03f10289bf1> in <module>()
----> 1 preds = clf.predict_proba(test)
/usr/local/lib/python2.7/dist-packages/xgboost-0.4-py2.7.egg/xgboost/sklearn.pyc in predict_proba(self, data, output_margin, ntree_limit)
471 class_probs = self.booster().predict(test_dmatrix,
472 output_margin=output_margin,
--> 473 ntree_limit=ntree_limit)
474 if self.objective == "multi:softprob":
475 return class_probs
/usr/local/lib/python2.7/dist-packages/xgboost-0.4-py2.7.egg/xgboost/core.pyc in predict(self, data, output_margin, ntree_limit, pred_leaf)
937 option_mask |= 0x02
938
--> 939 self._validate_features(data)
940
941 length = ctypes.c_ulong()
/usr/local/lib/python2.7/dist-packages/xgboost-0.4-py2.7.egg/xgboost/core.pyc in _validate_features(self, data)
1177
1178 raise ValueError(msg.format(self.feature_names,
-> 1179 data.feature_names))
1180
1181 def get_split_value_histogram(self, feature, fmap='', bins=None, as_pandas=True):
ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f120', 'f121', 'f122', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f134', 'f135', 'f136', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f144', 'f145', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f154', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f176', 'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f185', 'f186', 'f187', 'f188', 'f189', 'f190', 'f191', 'f192', 'f193', 'f194', 'f195', 'f196', 'f197', 'f198', 'f199', 'f200', 'f201', 'f202', 'f203', 'f204', 'f205', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f213', 'f214', 'f215', 'f216', 'f217', 'f218', 'f219', 'f220', 'f221', 'f222', 'f223', 'f224', 'f225', 'f226', 'f227', 'f228', 'f229', 'f230', 'f231', 'f232', 'f233', 'f234', 'f235', 'f236', 'f237', 'f238', 'f239', 'f240', 'f241', 'f242', 'f243', 'f244', 'f245', 'f246', 'f247', 'f248', 'f249', 'f250', 'f251', 'f252', 'f253', 'f254', 'f255', 'f256', 'f257', 'f258', 
'f259', 'f260', 'f261', 'f262', 'f263', 'f264', 'f265', 'f266', 'f267', 'f268', 'f269', 'f270', 'f271', 'f272', 'f273', 'f274', 'f275', 'f276', 'f277', 'f278', 'f279', 'f280', 'f281', 'f282', 'f283', 'f284', 'f285', 'f286', 'f287', 'f288', 'f289', 'f290', 'f291', 'f292', 'f293', 'f294', 'f295', 'f296', 'f297', 'f298', 'f299', 'f300', 'f301', 'f302', 'f303', 'f304', 'f305', 'f306', 'f307', 'f308', 'f309', 'f310', 'f311', 'f312', 'f313', 'f314', 'f315', 'f316', 'f317', 'f318', 'f319', 'f320', 'f321', 'f322', 'f323', 'f324', 'f325', 'f326', 'f327', 'f328', 'f329', 'f330', 'f331', 'f332', 'f333', 'f334', 'f335', 'f336', 'f337', 'f338', 'f339', 'f340', 'f341', 'f342', 'f343', 'f344', 'f345', 'f346', 'f347', 'f348', 'f349', 'f350', 'f351', 'f352', 'f353', 'f354', 'f355', 'f356', 'f357', 'f358', 'f359', 'f360', 'f361', 'f362', 'f363', 'f364', 'f365', 'f366', 'f367', 'f368', 'f369', 'f370', 'f371', 'f372', 'f373', 'f374', 'f375', 'f376', 'f377', 'f378', 'f379', 'f380', 'f381', 'f382', 'f383', 'f384', 'f385', 'f386', 'f387', 'f388', 'f389', 'f390', 'f391', 'f392', 'f393', 'f394', 'f395', 'f396', 'f397', 'f398', 'f399', 'f400', 'f401', 'f402', 'f403', 'f404', 'f405', 'f406', 'f407', 'f408', 'f409', 'f410', 'f411', 'f412', 'f413', 'f414', 'f415', 'f416', 'f417', 'f418', 'f419', 'f420', 'f421', 'f422', 'f423', 'f424', 'f425', 'f426', 'f427', 'f428', 'f429', 'f430', 'f431', 'f432', 'f433', 'f434', 'f435', 'f436', 'f437', 'f438', 'f439', 'f440', 'f441', 'f442', 'f443', 'f444', 'f445', 'f446', 'f447', 'f448', 'f449', 'f450', 'f451', 'f452', 'f453', 'f454', 'f455', 'f456', 'f457', 'f458', 'f459', 'f460', 'f461', 'f462', 'f463', 'f464', 'f465', 'f466', 'f467', 'f468', 'f469', 'f470', 'f471', 'f472', 'f473', 'f474', 'f475', 'f476', 'f477', 'f478', 'f479', 'f480', 'f481', 'f482', 'f483', 'f484', 'f485', 'f486', 'f487', 'f488', 'f489', 'f490', 'f491', 'f492', 'f493', 'f494', 'f495', 'f496', 'f497', 'f498'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 
'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f120', 'f121', 'f122', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f134', 'f135', 'f136', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f144', 'f145', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f154', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f176', 'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f185', 'f186', 'f187', 'f188', 'f189', 'f190', 'f191', 'f192', 'f193', 'f194', 'f195', 'f196', 'f197', 'f198', 'f199', 'f200', 'f201', 'f202', 'f203', 'f204', 'f205', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f213', 'f214', 'f215', 'f216', 'f217', 'f218', 'f219', 'f220', 'f221', 'f222', 'f223', 'f224', 'f225', 'f226', 'f227', 'f228', 'f229', 'f230', 'f231', 'f232', 'f233', 'f234', 'f235', 'f236', 'f237', 'f238', 'f239', 'f240', 'f241', 'f242', 'f243', 'f244', 'f245', 'f246', 'f247', 'f248', 'f249', 'f250', 'f251', 'f252', 'f253', 'f254', 'f255', 'f256', 'f257', 'f258', 'f259', 'f260', 'f261', 'f262', 'f263', 'f264', 'f265', 'f266', 'f267', 'f268', 'f269', 'f270', 'f271', 'f272', 
'f273', 'f274', 'f275', 'f276', 'f277', 'f278', 'f279', 'f280', 'f281', 'f282', 'f283', 'f284', 'f285', 'f286', 'f287', 'f288', 'f289', 'f290', 'f291', 'f292', 'f293', 'f294', 'f295', 'f296', 'f297', 'f298', 'f299', 'f300', 'f301', 'f302', 'f303', 'f304', 'f305', 'f306', 'f307', 'f308', 'f309', 'f310', 'f311', 'f312', 'f313', 'f314', 'f315', 'f316', 'f317', 'f318', 'f319', 'f320', 'f321', 'f322', 'f323', 'f324', 'f325', 'f326', 'f327', 'f328', 'f329', 'f330', 'f331', 'f332', 'f333', 'f334', 'f335', 'f336', 'f337', 'f338', 'f339', 'f340', 'f341', 'f342', 'f343', 'f344', 'f345', 'f346', 'f347', 'f348', 'f349', 'f350', 'f351', 'f352', 'f353', 'f354', 'f355', 'f356', 'f357', 'f358', 'f359', 'f360', 'f361', 'f362', 'f363', 'f364', 'f365', 'f366', 'f367', 'f368', 'f369', 'f370', 'f371', 'f372', 'f373', 'f374', 'f375', 'f376', 'f377', 'f378', 'f379', 'f380', 'f381', 'f382', 'f383', 'f384', 'f385', 'f386', 'f387', 'f388', 'f389', 'f390', 'f391', 'f392', 'f393', 'f394', 'f395', 'f396', 'f397', 'f398', 'f399', 'f400', 'f401', 'f402', 'f403', 'f404', 'f405', 'f406', 'f407', 'f408', 'f409', 'f410', 'f411', 'f412', 'f413', 'f414', 'f415', 'f416', 'f417', 'f418', 'f419', 'f420', 'f421', 'f422', 'f423', 'f424', 'f425', 'f426', 'f427', 'f428', 'f429', 'f430', 'f431', 'f432', 'f433', 'f434', 'f435', 'f436', 'f437', 'f438', 'f439', 'f440', 'f441', 'f442', 'f443', 'f444', 'f445', 'f446', 'f447', 'f448', 'f449', 'f450', 'f451', 'f452', 'f453', 'f454', 'f455', 'f456', 'f457', 'f458', 'f459', 'f460', 'f461', 'f462', 'f463', 'f464', 'f465', 'f466', 'f467', 'f468', 'f469', 'f470', 'f471', 'f472', 'f473', 'f474', 'f475', 'f476', 'f477', 'f478', 'f479', 'f480', 'f481', 'f482', 'f483', 'f484', 'f485', 'f486', 'f487', 'f488', 'f489', 'f490', 'f491', 'f492', 'f493', 'f494', 'f495', 'f496', 'f497', 'f498', 'f499']
training data did not have the following fields: f499
ããã¯ãã¹ããŒã¹è¡åãCSCã®å Žåã«ã®ã¿æ©èœããããã§ãã 以åã®ããŒãžã§ã³ã®ããã«ãCSRãŸãã¯COOãããªãã¯ã¹ã§ã¯æ©èœããŸããã
å³ç«¯ã®åããã¹ãŠ0ãŸãã¯1ã®å Žåãã©ã³ãã ãªåé¡ãçºçããŸãããïŒ ãã¶ã #1091 ãš #1221 ãšåãã§ãã
@sinhrks ïŒç§ã«ãšã£ãŠãããã¯ãã©ã³ãã ãã§ã¯ãããŸããã ç§ã¯é »ç¹ã«XGBoostãéåžžã«ãŸã°ããªããŒã¿ã§ãã¬ãŒãã³ã°ããŸãïŒãããŠããã¯çŽ æŽãããã§ãïŒããã¯éåžžä»ã®ãã¹ãŠã®ã¢ãã«ãããããªãåºãããŒãžã³ã§åã£ãŠããŸãïŒã
次ã«ããã¬ãŒãã³ã°æžã¿ã®ã¢ãã«ãæ¬çªç°å¢ã§å®è¡ã§ããããã«ãªã£ããããã¡ãããæ°ããåä¿¡ããŒã¿ãäºæž¬ããããšæããŸãã ãã¡ããããã®ããŒã¿ã¯ãŸã°ãã§ããå¯èœæ§ãé«ããæåŸã®åã§ããåã®å€ããããŸããã ãã®ãããXGBoostã¯é »ç¹ã«å£ããŠããŸããã¹ããŒã¹ããŒã¿ã®ãµããŒããåäžãããšããçç±ã ãã§ãä»ã®ïŒç²ŸåºŠã®äœãïŒã¢ãã«ã«åãæ¿ããããšã«ãªããŸããã
ãã®ãšã©ãŒãçºçããçç±ãšãã®å¯ŸåŠæ¹æ³ãæ£ç¢ºã«ç¥ã£ãŠãã人ã¯ããŸããïŒ ç§ã®æ¢åã®ã¹ã¯ãªããã倱æããŠããã®ã§ãããã¯ç§ã«ãšã£ãŠã®æ©ã¿ã®çš®ã§ãã
sklearnãã€ãã©ã€ã³ã®äžéšãšããŠxgboostãè©ŠããŠã¿ãŸããããåãåé¡ãçºçããŸããã ä¿®æ£ããããŸã§åé¿çã¯ãããŸããïŒ
ã¯ããpredictãåŒã³åºããšãã¯ãã¹ããŒã¹é åã®toarrayïŒïŒé¢æ°ã䜿çšããŸãã ããã¯ã¡ã¢ãªã§ã¯ã²ã©ãéå¹ççã§ãããå°ããªã¹ã©ã€ã¹ã§ã¯æ©èœããŸãã
ç§ã®iPhoneããéä¿¡ããã
2016幎8æ26æ¥ã«ã¯ã22:44ã§ããããã»ãããªã²ã¹ã®[email protected]ã¯æžããŸããïŒ
sklearnãã€ãã©ã€ã³ã®äžéšãšããŠxgboostãè©ŠããŠã¿ãŸããããåãåé¡ãçºçããŸããã ä¿®æ£ããããŸã§åé¿çã¯ãããŸããïŒ
â
ããªããã³ã¡ã³ãããã®ã§ããªãã¯ãããåãåã£ãŠããŸãã
ãã®ã¡ãŒã«ã«çŽæ¥è¿ä¿¡ããããGitHubã§è¡šç€ºããããã¹ã¬ããããã¥ãŒãããŠãã ããã
ãã¬ãŒãã³ã°æžã¿ã¢ãã«ãä¿åããŠããŒãããŠããäœããã®çç±ã§ãšã©ãŒã¯çºçããŸããã
# Work-around reported in the thread: saving the trained booster and
# loading it back avoids the feature_names mismatch on predict.
# NOTE(review): param, dtrain, dtest, num_round and model_file_name are
# defined elsewhere by the reporter -- not visible here.
bst = xgb.train(param, dtrain, num_round)
# predict is not working without this code
bst.save_model(model_file_name)
bst = xgb.Booster(param)
bst.load_model(model_file_name)
preds = bst.predict(dtest)
@ bryan-woods tocsc
ããè¯ãåé¿çãèŠã€ããããšãã§ããŸããã ããããããã©ãŒãã³ã¹ã®äœäžããããŸãããå¯ãªãããªãã¯ã¹ã«ããã»ã©æªãã¯ãããŸããã
xgboostãæ©èœããçŽåã«ãããsklearnãã€ãã©ã€ã³ã«å«ãã
class CSCTransformer(TransformerMixin):
    """Stateless transformer that converts its input to CSC format.

    Placing this immediately before an XGBoost estimator in an sklearn
    pipeline works around the sparse feature_names mismatch, at the cost
    of the CSC conversion.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op: nothing is learned from the data."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return *X* converted to a scipy CSC sparse matrix."""
        return X.tocsc()

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and transform in a single step."""
        return self.fit(X, y, **fit_params).transform(X)

    def get_params(self, deep=True):
        """No hyper-parameters are exposed."""
        return {}
CSC圢åŒããæåŸã®åã«ãŒã以å€ã®ãšã³ããªãè¿œå ããŠããxgboostã®ææ°ããŒãžã§ã³ã®åé¡ã¯ä¿®æ£ãããŸããã ããŒãžã§ã³0.4a30ã«æ»ãããšã ããããããæ©èœãããããšãã§ããŸããå ã®äŸã§æ¬¡ã®èª¿æŽïŒåçŸå¯èœãªã·ãŒãã䜿çšïŒãæ€èšããŠãã ããã
>>> import xgboost as xgb
>>> import numpy as np
>>> from scipy import sparse
>>>
>>> np.random.seed(10)
>>> X = sparse.rand(100,10).tocsr()
>>> test = sparse.rand(10, 500).tocsr()
>>> y = np.random.randint(2,size=100)
>>>
>>> clf = xgb.XGBClassifier()
>>> clf.fit(X,y)
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
objective='binary:logistic', reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, seed=0, silent=True, subsample=1)
>>>
>>> try:
... pred = clf.predict_proba(test)
... print "Works when csr with version %s" %xgb.__version__
... except ValueError:
... "Broken when csr with version %s" %xgb.__version__
...
'Broken when csr with version 0.6'
>>> try:
... pred = clf.predict_proba(test.tocsc())
... print "Works when csc with version %s" %xgb.__version__
... except ValueError:
... "Still broken when csc with version %s" %xgb.__version__
...
'Still broken when csc with version 0.6'
>>> try:
... test[0,(test.shape[1]-1)] = 1.0
... pred = clf.predict_proba(test)
... print "Works when adding non-zero entries to last column with version %s" %xgb.__version__
... except ValueError:
... "Still broken when adding non-zero entries to last column with version %s" %xgb.__version__
...
/home/david.mcgarry/.conda/envs/ml/lib/python2.7/site-packages/scipy/sparse/compressed.py:730: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
SparseEfficiencyWarning)
'Still broken when adding non-zero entries to last column with version 0.6'
>>> import xgboost as xgb
>>> import numpy as np
>>> from scipy import sparse
>>>
>>> np.random.seed(10)
>>> X = sparse.rand(100,10).tocsr()
>>> test = sparse.rand(10, 500).tocsr()
>>> y = np.random.randint(2,size=100)
>>>
>>> clf = xgb.XGBClassifier()
>>> clf.fit(X,y)
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
objective='binary:logistic', reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, seed=0, silent=True, subsample=1)
>>>
>>> try:
... pred = clf.predict_proba(test)
... print "Works when csr with version %s" %xgb.__version__
... except ValueError:
... "Broken when csr with version %s" %xgb.__version__
...
Works when csr with version 0.4
>>> try:
... pred = clf.predict_proba(test.tocsc())
... print "Works when csc with version %s" %xgb.__version__
... except ValueError:
... "Still broken when csc with version %s" %xgb.__version__
...
Works when csc with version 0.4
>>> try:
... test[0,(test.shape[1]-1)] = 1.0
... pred = clf.predict_proba(test)
... print "Works when adding non-zero entries to last column with version %s" %xgb.__version__
... except ValueError:
... "Still broken when adding non-zero entries to last column with version %s" %xgb.__version__
...
/Users/david.mcgarry/anaconda/envs/ml/lib/python2.7/site-packages/scipy/sparse/compressed.py:739: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
SparseEfficiencyWarning)
Works when adding non-zero entries to last column with version 0.4
ããã§ãåãåé¡ããããŸãããååã®ãªãªãŒã¹ã§äœããééããªãå£ããŠããŸããã åãããŒã¿ã»ãããšåŠçã§ã以åã¯ãã®åé¡ã¯çºçããŠããŸããã§ããã ç§ã¯ééã£ãŠãããããããŸããããçŸåšãsklearnAPIã䜿çšããPythonã®ã¹ããŒã¹csré
åã䜿çšããåäœãã¹ãã¯ãªãããã§ãã äžèšã®@dmcgarryã®äŸãtests/python/tests_with_sklearn.py
ã«è¿œå ããããšã¯å¯èœã§ããããïŒ
CSRã¹ããŒã¹é åã§.toarrayïŒïŒã䜿çšããŠåé¿ããããšããŸããããäœããæ·±å»ã«å£ããŠããŸãã ä¿åããã¢ãã«ãããŒãããããã䜿çšããŠ.toarrayïŒïŒã䜿çšããŠäºæž¬ãå®è¡ããããšãããšããšã©ãŒã¡ãã»ãŒãžã¯è¡šç€ºãããŸããããçµæã¯æ£ãããããŸããã 0.4a30ã«ããŒã«ããã¯ããŸããããæ£åžžã«åäœããŸãã æ ¹æ¬çãªåå ãçªãæ¢ããæéããããŸããã§ããããããã¯è¯ããããŸããã
ãã®åé¡ã¯ãDMatrix..num_colïŒïŒãã¹ããŒã¹è¡åã®ãŒã以å€ã®åã®éã®ã¿ãè¿ãããã«çºçããŸãã ãããã£ãŠããã¬ãŒãã³ã°ããŒã¿ãšãã¹ãããŒã¿ã®äž¡æ¹ã«ãŒã以å€ã®åãåãéããå Žåããã¹ãŠãæ£åžžã«æ©èœããŸãã
ããããªããšãæ€èšŒé¢æ°ã次ã®ããã«åŒã³åºããããç°ãªãæ©èœåãªã¹ããäœæãããŸãã
@property
def feature_names(self):
    """Get feature names (column labels).

    Returns
    -------
    feature_names : list or None
    """
    # NOTE(review): for sparse input _feature_names is None and, as
    # discussed in this thread, num_col() counts only columns holding a
    # non-zero entry -- so train/test matrices whose trailing columns are
    # all-zero produce default name lists of different lengths, which is
    # the root cause of the mismatch ValueError.
    if self._feature_names is None:
        return ['f{0}'.format(i) for i in range(self.num_col())]
    else:
        return self._feature_names
self._feature_names
ã¯ãã¹ããŒã¹è¡åã®å Žåã¯Noneã§ãããŸããself.num_colïŒïŒã¯ãŒã以å€ã®åã®éã®ã¿ãè¿ãããããto-be-ãã®ãŒã以å€ã®åã®éãããã«æ€èšŒã«å€±æããŸããäºæž¬ããããããŒã¿ã¯ããã¬ãŒãã³ã°ããŒã¿ã®ãŒã以å€ã®åã®éãšã¯ç°ãªããŸãã
ãŸã Dunnoã§ããããããä¿®æ£ããã®ãæåã®æ¹æ³ã§ãã
@ bryan-woodsãå ±åããããšã«ãããã¹ããŒã¹è¡åã®åŠçã«æ ¹æ¬çãªåé¡ãããããšãå¿é ã§ããããšãã°ããã¬ã€ã³ãšãã¹ãã®äž¡æ¹ã«xåã®ãŒãåããããŸãããã€ã³ããã¯ã¹ãç°ãªããŸã=> ãfeature_namesïŒselfïŒãã¯äž¡æ¹ã®ã»ããã«å¯ŸããŠåãæ©èœãªã¹ããè¿ãããããšã©ãŒã¯çºçããŸãããããã¬ã€ã³ãšãã¹ãã®éã§ãŒã以å€ã®åã€ã³ããã¯ã¹ãäžèŽããªããããäºæž¬ãééã£ãŠããå¯èœæ§ããããŸãã
誰ãããã®åé¡ã«åãçµãã ããšããããŸããïŒ å°ãªããšããéçºã«äœ¿çšã§ããåäœãã¹ããéçºãã人ã¯ããŸããïŒ
ç§ã¯ããã«åãçµãã§ããŸããããäžèšã®@dmcgarryã®äŸã¯ããŠããããã¹ãã®éå§ãšããŠäœ¿çšã§ãããšæããŸãã
import xgboost as xgb
import numpy as np
import scipy.sparse


def test_xgbclassifier_sklearn_sparse():
    """Fit on a 10-column CSR matrix, then predict on a 500-column one.

    Unit-test seed for the sparse feature_names mismatch reported in
    this thread: predict_proba raises ValueError on affected versions.
    """
    np.random.seed(10)
    features = scipy.sparse.rand(100, 10).tocsr()
    holdout = scipy.sparse.rand(10, 500).tocsr()
    labels = np.random.randint(2, size=100)
    model = xgb.XGBClassifier()
    model.fit(features, labels)
    pred = model.predict_proba(holdout)
ãªããžããªã®ãã©ãŒã¯ã§ãããã€ãã®æ°ããã¹ããŒã¹é
åãã¹ããäœæããŸããã èå³ã®ããæ¹ãžïŒ
https://github.com/bryan-woods/xgboost/blob/sparse_test/tests/python/test_scipy_sparse.py
ãã§ãã¯ã¢ãŠãã®ã«ãŒããã£ã¬ã¯ããªãããã¹ããå®è¡ããã«ã¯ã次ã®æé ã«åŸããŸãã
python -m nose tests/python/test_scipy_sparse.py
äž¡æ¹ã®ãã¹ãã倱æããããšã«æ°ä»ãã§ãããã ããã¯ãå°ãªããšãããã«å¯ŸããŠéçºããããã®ãã¹ããæäŸããŸãã
ç§ããã®åé¡ãçµéšããŠããŸããããããæçµçã«ã©ã€ãã©ãªã§è§£æ±ºããããŸã§ãä¿®æ£ããããã®æè¯ã®æ¹æ³ãç解ããããšãã§ããŸããã
maxidïŒ0ãªã©ã®æ倧æ©èœã€ã³ããã¯ã¹ã䜿çšããŠæ©èœãªã¹ãã«æ©èœãè¿œå ã§ããŸã
ããŒã¿ãã¬ãŒã ãæž¡ãããšã§åé¡ã解決ããŸãã
ã©ãããã°ããŒãžã§ã³0.4ã«æ»ãããšãã§ããŸããïŒ
pip install --upgrade xgboost==0.4a30
ãã¹ãŠã®ã¿ã€ãã®ã¹ããŒã¹è¡åãæ©èœããŸããã§ããïŒç§ã¯tf-idfããŒã¿ã䜿çšããŠããŸãïŒã 以åã®ããŒãžã§ã³ã«æ»ããªããã°ãªããŸããã§ããã ãã³ããããããšãïŒ
ãŸã åé¡ãæ±ããŠããçããïŒäœ¿çšããŠããã³ãŒãã«ã¯ #1606 ã®ä¿®æ£ãå«ãŸããŠããŸããïŒ
ã¯ããxgboostã®æåŸã®ããŒãžã§ã³ãã€ã³ã¹ããŒã«ããŸãããããŸã ãã®åé¡ãçºçããŠããŸãã
ããã¯ãŸã ååšããŠãããç°¡åã«åçŸã§ããŸãã ååãªå€§ããã®ããŒã¿ã»ããã䜿çšããå Žåãããã¯çºçããå¯èœæ§ãäœããªããŸããããããã°ãªããæ€çŽ¢ãªããžã§ã¯ãã§ã©ããããå Žåãtrain / cvãã¹ãã»ããã§äœ¿çšå¯èœãªæ©èœãç°ãªãcvåå²å ã§ã»ãŒç¢ºå®ã«çºçããŸãã
æ£çŽãªãšãããDMatrixãscipyã¹ããŒã¹è¡åã«ãã£ãŠæäŸããã圢ç¶ãã³ããç¡èŠããçç±ãç§ã¯æ¬åœã«ç解ããŠããŸããã ãµã€ãºãèšç®ããã®ã§ã¯ãªãããã®æ å ±ã«åºã¥ããŠèšå®ããå¿ èŠããããŸãã
Xgboost Pythonãã€ãã£ãAPIïŒ0.6ïŒã䜿çšããŠããŸãããå«ãŸããŠããè¡ã®ããããã«æåŸã®åãå®çŸ©ãããŠããå ŽåãLIBSVM [ã¹ããŒã¹]圢åŒãã¡ã€ã«ããDMatrixãããŒããããšãã«åããšã©ãŒãçºçããŸãã ç§ã®åé¿çã¯ãæåã®è¡ã«ãããŒåãå®çŸ©ããããšã§ãã:(
# Work-around: pass explicit feature_names/feature_types so the column
# count does not have to be inferred from the sparse LIBSVM file.
# NOTE(review): feature_vector_labels / feature_vector_types are defined
# elsewhere by the reporter -- not visible here.
train_fv_file = 'train_fv_eval.svm'
dtrain = xgb.DMatrix(train_fv_file, feature_names=feature_vector_labels, feature_types=feature_vector_types)
åçŸãéåžžã«ç°¡åãªå ŽåãåçŸå¯èœãªäŸãæäŸããããšæã人ã¯ããŸããïŒ ã§ããã°ãsklearnã¬ã€ã€ãŒãªãã§ïŒèããããåå ãç¹å®ããããïŒã
@gabrielspmoreira ïŒæåŸã®æ°åãå®å šã«ãŸã°ããªLIBSVMãã¡ã€ã«ããããŒãããããšã«ã€ããŠã®ããªãã®ãã€ã³ããããããŸã... ãã®DMatrixæ§ç¯æ¹æ³ã¯ã num_col ãã³ããæã€ããšãããæ©æµãåããã§ãããã
In [42]: matrix = xgboost.DMatrix(scipy.sparse.csr_matrix([[0, 2, 3, 0], [0, 2, 2, 0], [1, 0, 5, 0], [0, 1, 0, 0]], shape=(4,4)))
In [43]: matrix.num_col()
Out[43]: 3L
è¡/åã®ãµããµã³ãã«ã§æ°ããDMatrixãäœæããããã³ã«ããããçºçããå¯èœæ§ããããŸãïŒDMatrixã«åã®æ°ãæ瀺çã«æ瀺ããå Žåã§ããåã®æ°ãæžå°ããŸãïŒã ããã¯ããµãã»ããããã¹ãŠãŒãã«ãªãå¯èœæ§ãé«ããããå°ããããŒã¿ã»ãããŸãã¯éåžžã«ãŸã°ããªåã§ããçºçããŸãã
ãã¬ã€ã³/ãã¹ãã»ããéã§ãããçºçãããšãã¢ãã«ã¯ç°ãªãæ°ã®æ©èœãäºæããValueErrorãåãåºãããããã¹ãã»ãããè©äŸ¡ã§ããŸããã
äœãèµ·ãã£ãŠããã®ã確信ããŠããã®ã§ãxgboostã³ã¢ãšsklearn-wrapperå ã§ãããæ©èœãã/æ©èœããªããã¹ããèŠã€ããããšããŠããŸãããã©ãã§èµ·ãã£ãŠããã®ãããããŸããã
@ l3link ïŒã³ãŒããå€ããªã£ãŠããããã§ãã ãããç§ãåŸããã®ã§ãïŒ
In [2]: import scipy
...: import xgboost
...: matrix = xgboost.DMatrix(scipy.sparse.csr_matrix([[0, 2, 3, 0], [0, 2, 2, 0], [1, 0, 5, 0], [0, 1, 0, 0]], shape=(4,4)))
...: matrix.num_col()
...:
Out[2]: 4L
In [3]: matrix._init_from_csr??
Signature: matrix._init_from_csr(csr)
Source:
def _init_from_csr(self, csr):
"""
Initialize data from a CSR matrix.
"""
if len(csr.indices) != len(csr.data):
raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGDMatrixCreateFromCSREx(c_array(ctypes.c_size_t, csr.indptr),
c_array(ctypes.c_uint, csr.indices),
c_array(ctypes.c_float, csr.data),
len(csr.indptr), len(csr.data),
csr.shape[1],
ctypes.byref(self.handle)))
File: c:\anaconda2\lib\site-packages\xgboost-0.6-py2.7.egg\xgboost\core.py
Type: instancemethod
ã¯ã
In [64]: xgboost.__version__
Out[64]: '0.6'
Signature: matrix._init_from_csr(csr)
Source:
def _init_from_csr(self, csr):
"""
Initialize data from a CSR matrix.
"""
if len(csr.indices) != len(csr.data):
raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGDMatrixCreateFromCSR(c_array(ctypes.c_ulong, csr.indptr),
c_array(ctypes.c_uint, csr.indices),
c_array(ctypes.c_float, csr.data),
len(csr.indptr), len(csr.data),
ctypes.byref(self.handle)))
File: ~/anaconda/lib/python2.7/site-packages/xgboost/core.py
Type: instancemethod
ç§ã®.6ããŒãžã§ã³ã«XGDMatrixCreateFromCSRExåœä»€ã®ä»£ããã«XGDMatrixCreateFromCSRãå«ãŸããŠããã®ã¯å¥åŠã«æããŸãããããã¯åœ¢ã«ãªããŸããã
osxã®ååžãç°ãªãå¯èœæ§ã¯ãããŸããïŒ
@ bryan-woodsãå ±åããããšã«ãããã¹ããŒã¹è¡åã®åŠçã«æ ¹æ¬çãªåé¡ãããããšãå¿é ã§ããããšãã°ããã¬ã€ã³ãšãã¹ãã®äž¡æ¹ã«xåã®ãŒãåããããŸãããã€ã³ããã¯ã¹ãç°ãªããŸã=> ãfeature_namesïŒselfïŒãã¯äž¡æ¹ã®ã»ããã«å¯ŸããŠåãæ©èœãªã¹ããè¿ãããããšã©ãŒã¯çºçããŸãããããã¬ã€ã³ãšãã¹ãã®éã§ãŒã以å€ã®åã€ã³ããã¯ã¹ãäžèŽããªããããäºæž¬ãééã£ãŠããå¯èœæ§ããããŸãã
誰ãããã®è³ªåã«çããŠããããŸããïŒ 0.4ããŒãžã§ã³ã«æ»ãããšãããåäœããŠããããã«èŠããŸããããŸã æ¬åœã«ã¹ããŒã¹è¡åã䜿çšããŠãããããæ£åžžã«åäœããŠãããã©ããå¿é ã§ãã
@l3link ããã«ã€ããŠå¥åŠãªããšã¯äœããããŸããïŒããŒãžã§ã³çªå·ïŒãŸãã¯pypiããã±ãŒãžïŒãé·æéæŽæ°ãããªãããšããããŸãã ããšãã°ãä»æ¥ã® https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/VERSION ãã¡ã€ã«ã¯7æ29æ¥ã«æåŸã«å€æŽãããæåŸã®pypiããã±ãŒãž https://pypi.python.org/pypi/xgboost/ ã®æ¥ä»ã¯8æ9æ¥ã§ãã ä¿®æ£ã¯9æ23æ¥ã« #1606 ã§æåºãããŸããã githubããææ°ã®ã³ãŒãããã§ãã¯ããŠãã ããã
ãã³ãDataFrame
ïŒéã¹ããŒã¹è¡šçŸïŒã䜿çšãããšãã«ãã®åé¡ãçºçããŸããã
df.as_matrix()
ãä»ããŠnumpy ndarray
ã«å€æãããšã©ãŒãåãé€ããŸããã
ããŒã¿ãã¬ãŒã ãé åã«å€æããåŸãç§ããã®ãšã©ãŒãåãé€ããŸããã
åè»ã»ãããšåãé åºã§ãã¹ãã»ããã®åã䞊ã¹æ¿ãããšããããä¿®æ£ãããŸããã
ç§ã¯ãã³ãã®ããŒã¿ãã¬ãŒã ã䜿çšããŸããã ããããªããšã .as_matrix()
ã䜿çšããŠãåãåé¡ãçºçããŠããŸããã
ãã£ãïŒ
test = test[train.columns]
@warpuvãœãªã¥ãŒã·ã§ã³ãè©ŠããŸãããã
ãã¬ã€ã³/ãã¹ãcsrãããªãã¯ã¹ãcscã«å€æããããšã¯ç§ã®ããã«åããŸãã
Xtrain = scipy.sparse.csc_matrix(Xtrain)
csc_matrix
ãžã®å€æ0.6a2
ã§ãã¹ããããŠããŸãïŒ
X_train = scipy.sparse.csc_matrix(X_train)
X_test = scipy.sparse.csc_matrix(X_test)
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_test = xgb.DMatrix(X_test, label=y_test)
type(X_train) <class 'scipy.sparse.csr.csr_matrix'>
type(X_test) <class 'scipy.sparse.csr.csr_matrix'>
type(X_train) <class 'scipy.sparse.csc.csc_matrix'>
type(X_test) <class 'scipy.sparse.csc.csc_matrix'>
type(xgb_train) <class 'xgboost.core.DMatrix'>
type(xgb_test) <class 'xgboost.core.DMatrix'>
ç§ã®å
ã®ã¹ããŒã¹è¡åã¯ãsklearn tfã®åºåã§ã-idfãã¯ãã©ã€ã¶ãŒã¯csr_matrix
圢åŒã§ããã
ãŸã ä¿®æ£ã¯ãããŸããïŒ
python3ã§ææ°ããŒãžã§ã³ïŒ0.7.post3ïŒããã«ãããã°ããã§ããã®åé¡ããŸã ååšããããšã確èªã§ããŸãã äžèšã®@dmcgarryã®äŸãé©çšããåŸã§ãã csr_matrix
ãšcsc_matrix
äž¡æ¹ã§åé¡ãçºçããŠããŸãã
import xgboost as xgb
import numpy as np
from scipy import sparse

# Reproduction for the sparse feature_names mismatch (still present in
# 0.7.post3): train on 10-column matrices, then predict on 500-column
# ones, for both CSR and CSC input.
np.random.seed(10)
X_csr = sparse.rand(100, 10).tocsr()
test_csr = sparse.rand(10, 500).tocsr()
X_csc = sparse.rand(100, 10).tocsc()
test_csc = sparse.rand(10, 500).tocsc()
y = np.random.randint(2, size=100)

clf_csr = xgb.XGBClassifier()
clf_csr.fit(X_csr, y)
clf_csc = xgb.XGBClassifier()
clf_csc.fit(X_csc, y)

# Try with csr
try:
    pred = clf_csr.predict_proba(test_csr)
    print("Works when csr with version %s" %xgb.__version__)
except ValueError:
    print("Broken when csr with version %s" %xgb.__version__)

try:
    # Force a non-zero entry into the last column so num_col() matches.
    test_csr[0,(test_csr.shape[1]-1)] = 1.0
    pred = clf_csr.predict_proba(test_csr)
    print("Works when adding non-zero entries to last column with version %s" %xgb.__version__)
except ValueError:  # was a bare "except:" -- catch only the mismatch error
    print("Still broken when adding non-zero entries to last column with version %s" %xgb.__version__)

# Try with csc
try:
    pred = clf_csc.predict_proba(test_csc)
    print("Works when csc with version %s" %xgb.__version__)
except ValueError:
    print("Broken when csc with version %s" %xgb.__version__)

try:
    test_csc[0,(test_csc.shape[1]-1)] = 1.0
    pred = clf_csc.predict_proba(test_csc)
    print("Works when adding non-zero entries to last column with version %s" %xgb.__version__)
except ValueError:  # was a bare "except:" -- don't swallow unrelated errors
    print("Still broken when adding non-zero entries to last column with version %s" %xgb.__version__)
äžèšã®ã³ãŒãã«ããã次ã®åºåãåŸãããŸããã
Broken when csr with version 0.7
Still broken when adding non-zero entries to last column with version 0.7
Broken when csc with version 0.7
Still broken when adding non-zero entries to last column with version 0.7
plsãã«ã
ãã®åé¡ã解決ãããã®ã¯ãªãã§ããïŒ
ç§ã¯æè¿ãã®åé¡ã«2åééããŸããã ããã±ãŒã¹ã§ã¯ãå ¥åããŒã¿ãã¬ãŒã ãé åã«å€æŽããã ãã§æ©èœããŸãã 2ã€ç®ã¯ãtest_df = test_df [train_df.columns]ã䜿çšããŠãã¹ãããŒã¿ãã¬ãŒã ã®ååãå調æŽããå¿ èŠããããŸãã ã©ã¡ãã®å Žåããtrain_dfãštest_dfã®ååã¯ãŸã£ããåãã§ãã
@CathyQianã®ã³ã¡ã³ããããããŸãããã train_df
/ test_df
ãŸã°ãã§ããïŒ ãŸãããããã®åé¡ãçºçãããšãã«ãã©ã®ããŒãžã§ã³ã®xgboostãå®è¡ããŠããŸãããïŒ
@CathyQian xgboostã¯åã®_order_ã«äŸåããŠãããããã¯ãã®åé¡ãšã¯é¢ä¿ãããŸããã
@ewellinger WRTããªãã®äŸïŒ10åã®ç¹åŸŽãæã€ããŒã¿ã§ãã¬ãŒãã³ã°ãããã¢ãã«ã¯ãäºæž¬ã®ããã«500åã®ç¹åŸŽãæã€ããŒã¿ãåãå ¥ããã¹ãã§ã¯ãªãããããšã©ãŒãã¹ããŒãããŸãã ãŸãããã¹ãŠã®è¡åããDMatricesãäœæãããããã®num_colãšnum_rowã調ã¹ããšãæåŸ ãããçµæãåŸãããŸãã
ãã¹ããŒã¹æ§ã®åé¡ãã®çŸåšã®ç¶æ ã¯æ¬¡ã®ãšããã§ãã
@warpuvããã¯ç§ã®ããã«åããŸããã©ããããããšãã
å¯åºŠè¡åã§åããšã©ãŒãçºçããŸããã ïŒææ°ã®anacondaã®xgboost v.0.6ïŒ
ãã¬ãŒãã³ã°ãµã³ãã«ã®ããŸããŸãªç¹åŸŽãµãã»ããã§è€æ°ã®ååž°ãå®è¡ãããšããšã©ãŒãçºçããŸããã
次ã®ååž°ããã£ããã£ã³ã°ããåã«æ¯åæ°ããã¢ãã«ã€ã³ã¹ã¿ã³ã¹ãäœæãããšãåé¡ãä¿®æ£ãããŸããã
- libsvmããŒã¿ãDMatrixã«ããŒããããšãã«äºåå®çŸ©ãããåæ°ãæå®ãããã©ã¡ãŒã¿ãŒã¯ãŸã å®è£ ãããŠããŸããã è²¢ç®ãããã©ã³ãã£ã¢ã¯å€§æè¿ã§ãã
0.8ã®æç¹ã§ãããã¯ãŸã ååšããŠããŸããããïŒ
CSCããDMatrixãäœæãããšãæ£ãã次å ã®ãªããžã§ã¯ããçæãããŸãããæåŸã®è¡ãå®å šã«ã¹ããŒã¹ã§ããå Žåããã¬ãŒãã³ã°ãŸãã¯äºæž¬äžã«èª€ã£ãçµæãåŸãããå¯èœæ§ããããŸãïŒ#2630ïŒã ãã®éšåãé©åã«ä¿®æ£ããæéããŸã ãããŸããã§ããã
@khotilov #3553 ã¯ãã®åé¡ãä¿®æ£ããŸããã
libsvmããŒã¿ãDMatrixã«ããŒããããšãã«äºåå®çŸ©ãããåæ°ãæå®ãããã©ã¡ãŒã¿ãŒã¯ãŸã å®è£ ãããŠããŸããã è²¢ç®ãããã©ã³ãã£ã¢ã¯å€§æè¿ã§ãã
@MonsieurWaveãã®æ©èœã§ã¯ãdmlc-coreãžã®å°ããªãã«ãªã¯ãšã¹ãã§ããŸãããã¯ãã§ãã ãããèŠãŠã¿ãŸãããã
@ hcho3ã©ããããããšãã
ä»ã®ãšãããlibsvmã®æåã®è¡ãããã»ã©ãŸã°ãã«ããªãããšã§ãã®åé¡ãåé¿ããŸããã€ãŸããå€0ã®åãä¿åããŸãã
æãåèã«ãªãã³ã¡ã³ã
ãã®åé¡ã¯ãDMatrix..num_colïŒïŒãã¹ããŒã¹è¡åã®ãŒã以å€ã®åã®éã®ã¿ãè¿ãããã«çºçããŸãã ãããã£ãŠããã¬ãŒãã³ã°ããŒã¿ãšãã¹ãããŒã¿ã®äž¡æ¹ã«ãŒã以å€ã®åãåãéããå Žåããã¹ãŠãæ£åžžã«æ©èœããŸãã
ããããªããšãæ€èšŒé¢æ°ã次ã®ããã«åŒã³åºããããç°ãªãæ©èœåãªã¹ããäœæãããŸãã
self._feature_names
ã¯ãã¹ããŒã¹è¡åã®å Žåã¯Noneã§ãããŸããself.num_colïŒïŒã¯ãŒã以å€ã®åã®éã®ã¿ãè¿ãããããto-be-ãã®ãŒã以å€ã®åã®éãããã«æ€èšŒã«å€±æããŸããäºæž¬ããããããŒã¿ã¯ããã¬ãŒãã³ã°ããŒã¿ã®ãŒã以å€ã®åã®éãšã¯ç°ãªããŸãããŸã Dunnoã§ããããããä¿®æ£ããã®ãæåã®æ¹æ³ã§ãã