frequency |
x.value_counts(dropna = False).sort_index() pd.crosstab(df.A, df.B).apply(lambda r: r/r.sum(), axis = 1)
|
proc freq; |
drop/keep |
-
df.drop(['d1', 'd2', 'd3'], axis = 1) df.loc[:, ['k1', 'k2', 'k3']]
|
data a(drop=d1 d2 d3); data a(keep=k1 k2 k3);
|
rename |
df.columns = ['a1', 'a2', 'a3'] df.rename(columns={'orig1':'new1', 'orig2':'new2'})
|
data a(rename=(orig1=new1 orig2=new2)); |
summarize |
|
proc summary; |
bin |
pd.cut(x, [min, cut1, …, cutk, max]) np.digitize(x, [cut1, cut2, …, cutk])
|
proc rank; |
row select |
df.loc[(cond1) & (cond2), :] df.iloc[[1, 3, 5], :]
|
where cond1 and cond2; |
merge |
pd.merge(df1, df2, on = , how= ) pd.concat([df1, df2, df3], axis = 0, ignore_index = True) -
df1.join(df2, how = )
|
merge df1 df2; by col1; |
sort |
df.sort_values(by = ['sort_by1', 'sort_by2'], ascending = [True, False])
|
proc sort; by sort_by1 descending sort_by2; |
count missing |
df.count() df.isnull().sum()
|
proc means n nmiss min mean median max; |
format |
- 1 <-> 1 mapping:
df.column1.replace(dict(zip(old_value, new_value))) hgc2012.sic_code.replace(dict(zip(sic_indust.sic, sic_indust.indust))) - interval bin/mapping, like from PD to risk ranking:
[ranking[j] for j in np.digitize(x, intervals)] -
pd.cut(vectorx, [-np.inf, .25, .5, 1, 1.5, 2, 3, 3.6, 4.2, np.inf], labels = range(9))
|
proc format; |
columns start with |
[x for x in list(df) if x.startswith('score')]
|
var scores: ; |
index |
df.reset_index(inplace = True) df.set_index('Column_name', inplace = True)
|
|
apply / map / applymap |
pd.Series.map(func, dict, Series) pd.Series.apply() pd.DataFrame.apply(func, axis = 0)
|
data transformation, format, aggregate |