variable selection in python — pydata: Huiming's learning notes

1. decision tree to pick top predictable factors

this is to run the regression decision tree first, then get the feature importance. The feature importances. The higher, the more important the feature. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. For regression, it is the mean square error.

import pandas as pd
import numpy as np
import itertools
import pickle
from itertools import chain, combinations
import statsmodels.formula.api as smf
import scipy.stats as stats
from scipy.stats import norm
import statsmodels.api as sm
import matplotlib.pyplot as plt
import copy
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 12)


def importance_dt():
    regr_1 = DecisionTreeRegressor()
    regr_1 = regr_1.fit(cniX, cniY)
    print mean_squared_error(regr_1.predict(cniX), cniY)
    print regr_1.score(cniX, cniY)
    importance_dt = pd.DataFrame([cniX.columns.values, regr_1.feature_importances_]).T.sort(1, ascending = False)
    return importance_dt

2. single factor analysis: by rsquared_adj

# linear regression to get the adj_rsquare
def importance_reg(indata = df):
    scores = {}
    for i in cniX.columns:
        f = 'cum_pd_num ~ ' + i
        reg = smf.ols(formula = str(f), data = indata).fit()
        scores[i] = reg.rsquared_adj
        # scores[i] = reg.fvalue
    return pd.DataFrame.from_dict(scores, orient = 'index').reset_index().sort(0, ascending = False)

xvar = cniX.columns.tolist()
yvar = 'cum_pd_num'

3. this is to mimic the SAS forward variable selection

each step, the variable is selected by their f-test value.

def importance_foreward(indata = df, yVar = yvar, xVar = xvar, stopn = 4):

    flist = []
    nx = min(len(xVar), stopn)

    while len(flist) < nx:
        best_score = -np.inf
        for i in xVar:
            newflist = flist + [i]
            f = yVar + ' ~ ' + '+'.join(newflist)
            reg = smf.ols(formula = str(f), data = indata).fit()
            score = reg.fvalue
            if score > best_score:
                best_score, record_i, record_newflist = score, i, newflist
        flist = record_newflist
        print flist
        xVar.remove(record_i)
        print len(xVar)
    finmodel =  smf.ols(formula = str(yVar + ' ~ ' + '+'.join(flist)), data = indata).fit()
    print finmodel.summary()
    return flist

4. best subset regression

it will loop through all the variables combination of the Xs. So this will be very time consuming: in you have n variables, then you will run n! regressions.

# this one also will take long time to run
def best_subset(X, y, k_features = [3, 4]):
    n_features = X.shape[1]
    subsets = chain.from_iterable(combinations(xrange(n_features), k+1) for k in k_features)
    best_score = -np.inf
    best_subset = None
    for subset in subsets:
        lin_reg = sm.OLS(y, X.iloc[:, subset]).fit()
        pred = lin_reg.predict()
        score = lin_reg.rsquared_adj
        # print score
        if score > best_score:
            best_score, best_subset, best_reg_summary = score, f, reg.summary()
    return best_subset, best_score

5. one variable transformation for all variables

if you have 10 variables, and each variable has different transformation with the same starting prefix name. then you can force one of these transformed variable in the model. The reason to include one kind of transform is to avoid multicollinearity. if you include gdp and ln(gdp) then it is very likely they are high correlated.

# try to pick one var in ech group, too long time to run
def one_var_each(indata = df):
    scores = {}
    bbbx = [x for x in cniX.columns if u'bbb_spread' in x]
    gdpx = [x for x in cniX.columns if u'real_gdp_yoy' in x]
    uemx = [x for x in cniX.columns if u'unemploy_rate' in x]
    cprx = [x for x in cniX.columns if u'corp_profit' in x]
    spix = [x for x in cniX.columns if u'sp_index' in x]
    vixx = [x for x in cniX.columns if u'vix' in x]
    for i in bbbx:
        for j in gdpx:
            for k in uemx:
                for l in cprx:
                    for m in spix:
                        for n in vixx:
                            f = 'cum_pd_num ~ ' + i + ' + ' + j  + ' + ' +  k  + ' + ' +  l  + ' + ' +  m  + ' + ' +  n
                            reg = smf.ols(formula = str(f), data = indata).fit()
                            scores[f] = reg.rsquared_adj
    return scores