赞
踩
背景: 在分类问题中,由于各种原因,我们所获取到的数据集很容易出现正负样本的不平衡,或者某些数据特别多,有些数据则特别少,在这样的数据集中,进行训练,我们很难获得较好的训练结果,为此我们引入了上采样与下采样解决样本不平衡的问题。
pip install imblearn
#使用make_classification生成样本数据
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=5000, n_features=2, n_informative=2,
n_redundant=0, n_repeated=0, n_classes=3,
n_clusters_per_class=1,
weights=[0.01, 0.05, 0.94],
class_sep=0.8, random_state=0)
#SMOTE过采样
from imblearn.over_sampling import SMOTE, ADASYN
X_resampled, y_resampled = SMOTE().fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))
#ADASYN过采样
X_resampled, y_resampled = ADASYN().fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))
#欠采样
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))
#随机欠采样
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))
#两种组合采样的方法 from collections import Counter from sklearn.datasets import make_classification X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=3, n_clusters_per_class=1, weights=[0.01, 0.05, 0.94], class_sep=0.8, random_state=0) print(sorted(Counter(y).items())) from imblearn.combine import SMOTEENN smote_enn = SMOTEENN(random_state=0) X_resampled, y_resampled = smote_enn.fit_resample(X, y) print(sorted(Counter(y_resampled).items())) from imblearn.combine import SMOTETomek smote_tomek = SMOTETomek(random_state=0) X_resampled, y_resampled = smote_tomek.fit_resample(X, y) print(sorted(Counter(y_resampled).items()))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。