# best subset regression in python

best subset就是对所有的自变量可能的组合都作为一个模型，然后根据最优（对线性回归，比如用R^2， 对logistic regression，使用AUC或者其他标准 ）选择适当的模型。

import numpy as np
import pandas as pd
import urllib
from itertools import chain, combinations
import statsmodels.api as sm

# read in data from UCLA ATS

# ignore the sas codes and clean data to pandas DataFrame
ericksen = pd.DataFrame([x.replace('\r\n', '').split() for x in f[19:85]])
ericksen.columns = 'area perc_min crimrate poverty diffeng hsgrad housing city countprc undcount'.split()

# convert to numeric values
for i in ericksen.columns[1:]:
ericksen['num_'+i] = ericksen[i] .map(lambda x: float(x) + 0.0)

def best_subset(X, y):
n_features = X.shape[1]
subsets = chain.from_iterable(combinations(xrange(n_features), k+1) for k in xrange(n_features))
best_score = -np.inf
best_subset = None
for subset in subsets:
lin_reg = sm.OLS(y, X.iloc[:, subset]).fit()
if score > best_score:
best_score, best_subset = score, subset
return best_subset, best_score

X = ericksen.ix[:, 10:18]
y = ericksen.ix[:, 18]

           area perc_min crimrate poverty diffeng hsgrad housing city  \
0       Alabama     26.1       49    18.9     0.2   43.5     7.6    0
1        Alaska      5.7       62    10.7     1.7   17.5    23.6    0
2       Arizona     18.9       81    13.2     3.2   27.6     8.1    0
3      Arkansas     16.9       38    19.0     0.2   44.5     7.0    0
4  California_R     24.3       73    10.4     5.0   26.0    11.8    0

countprc undcount  num_perc_min  num_crimrate  num_poverty  num_diffeng  \
0        0    -0.04          26.1            49         18.9          0.2
1      100     3.35           5.7            62         10.7          1.7
2       18     2.48          18.9            81         13.2          3.2
3        0    -0.74          16.9            38         19.0          0.2
4        4     3.60          24.3            73         10.4          5.0


    best_subset(X, y)