1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
# Stratified train/test split of `housing` on binned median income.
#
# `housing` is expected to be a pandas DataFrame with a "median_income"
# column, and `pd` / `np` to be pandas / numpy (all defined outside this
# snippet). The bare expressions below are for interactive (notebook)
# inspection; they have no effect when run as a plain script.
from sklearn.model_selection import StratifiedShuffleSplit

# Bucket median income into five labelled categories so the split can be
# stratified on a discrete column.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0, 1.5, 3.0, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)
housing["income_cat"].value_counts()
housing["income_cat"].hist()

# The keyword is `n_splits` (plural); `n_split` raises TypeError.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Only one split is requested, so take the single (train, test) index pair.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields positional indices, so select rows by position; `.loc`
# would only be correct when `housing` has a default RangeIndex.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

# Compare category proportions: test set vs. the full dataset. Each count
# is normalised by the length of the frame it was computed from.
strat_test_set["income_cat"].value_counts() / len(strat_test_set)
housing["income_cat"].value_counts() / len(housing)

# The helper column was only needed for stratification; remove it from
# both resulting sets.
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)
|