python机器学习手册——学习代码（导入、处理、时间、图像、降维、sklearn模型、神经网络）_python resample sklearn

作者：你好赵伟 | 2024-03-22 19:07:10

踩

python resample sklearn

一、向量、矩阵、数组

# 1.1 Create vectors with np.array
# One-dimensional and column arrays
import numpy as np
vector_row = np.array([1,2,3])  # row vector
vector_column = np.array([[1],[2],[3]])     # column vector

# 1.2 Create a matrix
matrix = np.array([[1,2],[1,2],[1,2]])

# 1.3 Create a sparse matrix (only the non-zero values are stored, which saves memory and work)
from scipy import sparse
matrix = np.array([[0,0],[0,1],[3,0]])
matrix_sparse = sparse.csr_matrix(matrix)  # compressed sparse row (CSR) matrix
print(matrix_sparse)  # lists the stored entries: coordinate (1, 1) holds 1 and (2, 0) holds 3;
                      # row and column indices start at 0

# 1.4 Select elements (slices exclude the stop index, for vectors and matrices alike)
vector = np.array([1,2,3,4,5,6])  # sample vector used by the selections below
vector[2]     # third element
vector[:]     # every element
vector[:3]    # first three elements (indices 0, 1, 2)
vector[3:]    # everything after the third element
vector[-1]    # last element
matrix[1,1]   # second row, second column
matrix[:2,:]  # first two rows
matrix[:,1:2] # second column, kept as a two-dimensional slice

# 1.5 Inspect matrix attributes (values shown are for a 3x2 matrix)
matrix.shape  # (rows, columns), e.g. (3, 2)
matrix.size   # number of elements (rows * columns), e.g. 6
matrix.ndim   # number of dimensions, e.g. 2

# 1.6 Apply a function to every element with np.vectorize
def add_100(i):
    """Return *i* plus 100."""
    return i + 100
vectorized_add_100 = np.vectorize(add_100)   # wrap the scalar function so it accepts arrays
vectorized_add_100(matrix)    # apply it to every element of the matrix

matrix + 100    # broadcasting gives the same result without np.vectorize

# 1.7 Maximum and minimum
np.max(matrix)
np.min(matrix)
np.max(matrix,axis=0)  # largest element of each column
np.max(matrix,axis=1)  # largest element of each row

# 1.8 Mean, variance and standard deviation
np.mean(matrix)
np.var(matrix)
np.std(matrix)
np.mean(matrix,axis=0)  # mean of each column

# 1.9 Reshape
matrix.reshape(2,3)
matrix.reshape(2,-1)  # -1 lets NumPy work out the number of columns

# 1.10 Transpose
matrix.T

# 1.11 Flatten
matrix.flatten()  # one-dimensional copy, same values as matrix.reshape(-1);
                  # matrix.reshape(1,-1) would instead give a 2-D array with a single row

# 1.12 Rank of a matrix: matrix_rank
np.linalg.matrix_rank(matrix)

# 1.13 Determinant: det
matrix = np.array([[1,-1,3],[1,1,6],[3,8,9]])
np.linalg.det(matrix)

# 1.14 Diagonal elements: diagonal
matrix.diagonal()
matrix.diagonal(offset=1)   # diagonal one step above the main diagonal
matrix.diagonal(offset=-1)  # diagonal one step below the main diagonal

# 1.15 Trace of a matrix: trace
matrix.trace()   # sum of the main-diagonal elements

# 1.16 Eigenvalues and eigenvectors
a,b = np.linalg.eig(matrix)
a   # eigenvalues
b   # eigenvectors

# 1.17 Dot product: dot
a = np.array([1,2,3])  # a and b are rebound here; the eigen-decomposition above is discarded
b = np.array([4,5,6])
np.dot(a,b)  # inner product of the two vectors

# 1.18 Addition and subtraction
np.add(a,b)       # same as a + b
np.subtract(a,b)  # same as a - b

# 1.19 Multiplication (a and b are 1-D here, so dot and @ return their inner product)
np.dot(a,b)
a @ b   # same as np.dot(a,b)
a * b   # element-wise product

# 1.20 Matrix inverse: inv
np.linalg.inv(matrix)

# 1.21 Random numbers: random
np.random.seed(0)
np.random.random(3)        # 3 floats between 0 and 1
np.random.randint(0,11,3)  # 3 integers from 0 to 10
np.random.normal(0,1,3)    # 3 draws from a normal distribution with mean 0 and standard deviation 1
np.random.uniform(1,2,3)   # 3 floats greater than or equal to 1 and less than 2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103

二、加载数据

# 2.1 Load a bundled sample dataset
from sklearn import datasets     # scikit-learn's dataset loaders
digits = datasets.load_digits()  # handwritten-digit images, used for image classification
features = digits.data           # feature matrix
target = digits.target           # target vector
features[0]                      # look at the first observation
# load_boston: Boston house prices (506 observations) for regression;
#              NOTE(review): removed from recent scikit-learn releases — confirm before use
# load_iris: 150 iris flower measurements for classification

# 2.2 Generators for simulated datasets
from sklearn.datasets import make_regression  # data for linear regression
from sklearn.datasets import make_classification  # data for classification
from sklearn.datasets import make_blobs       # data for clustering

# 2.3 Load a CSV file
import pandas as pd
url = 'https://tinyurl.com/simulated_data'  # a local path or a remote URL both work
data = pd.read_csv(url)  # options include sep=... and header=<row number of the column names> (None if absent)
data.head(2)             # first two rows

# 2.4 Load an Excel file
url = 'https://tinyurl.com/simulated_excel'
data = pd.read_excel(url,sheet_name=0,header=1)  # sheet_name accepts a sheet name, a position or a list of either
data.head(2)

# 2.5 Load a JSON file
# NOTE(review): JSON sample URL assumed by analogy with the two URLs above — confirm it resolves
url = 'https://tinyurl.com/simulated_json'
data = pd.read_json(url,orient='columns')  # orient describes how the JSON document is laid out
# pd.json_normalize converts semi-structured JSON into a DataFrame

# 2.6 Query a SQL database: create_engine, read_sql_query
from sqlalchemy import create_engine
database_connection = create_engine('sqlite:///sample.db') # engine for the SQLite database file sample.db
dataframe = pd.read_sql_query('SELECT * FROM data',database_connection)  # load every column of the `data` table
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

三、数据整理

# 3.1 Create a data frame
data = pd.DataFrame()           # start empty
data['name'] = ['aaa','bbb']    # add columns one by one
data['age'] = [38,25]
data['driver'] = [True,False]
new_person = pd.Series(['ccc',40,True],index=['name','age','driver'])  # one new row
# DataFrame.append was removed in pandas 2.0; pd.concat builds the extended frame instead.
# `data` itself is left unchanged, exactly as with the old append call.
data_with_new_person = pd.concat([data,new_person.to_frame().T],ignore_index=True)
data_with_new_person

# 3.2 Describe the data
data.head(2)    # first two rows
data.shape      # (rows, columns)
data.describe() # descriptive statistics of the numeric columns

# 3.3 Navigate the rows of a data frame: iloc
data.iloc[0]     # first row
data.iloc[1:4]   # rows 2, 3 and 4
data.iloc[:4]    # rows 1 to 4
data.loc[:,'name']   # loc selects by label (string), iloc by position

# 3.4 Select rows with a condition
data[data['age'] == 38].head(1)  # first row whose age is 38
data[(data['age'] <= 38) & (data['driver'] == False)]  # several conditions combined

# 3.5 Replace values: replace
data['age'].replace(38,40)              # 38 becomes 40
data['age'].replace([38,40],[25,50])    # replace several values at once
data.replace(1,'one')                   # replace across the whole frame
data.replace(r'1st','First',regex=True) # regular expressions are accepted

# 3.6 Rename columns: rename
data.rename(columns={
   'age':'Age'})                # age becomes Age
data.rename(columns={
   'age':'Age','name':'Name'})  # rename several columns
# None of the replace/rename calls above passes inplace=True or assigns its result,
# so `data` itself keeps its original values and column names.

# 3.7 Summary statistics
print('max:',data['age'].max())
print('min:',data['age'].min())
print('mean:',data['age'].mean())
print('sum:',data['age'].sum())
print('count:',data['age'].count())
# Also available: variance var, standard deviation std, kurtosis kurt, skewness skew,
# standard error of the mean sem, mode mode, median median

# 3.8 Unique values: unique
data['age'].unique()        # the distinct values
data['age'].value_counts()  # each distinct value with its number of occurrences
data['age'].nunique()       # how many distinct values there are

# 3.9 Missing values: isnull, na_values
data[data['age'].isnull()]  # rows whose age is missing (notnull selects the others)
# NOTE(review): this rebinds `data` to whatever `url` currently points at; url must name a CSV file — confirm
data = pd.read_csv(url,na_values=[np.nan,'NONE',-999])  # treat these three values as missing

# 3.10 Drop columns: drop
data.drop('age',axis=1)            # drop one column
data.drop(['age','name'],axis=1)   # several columns go in one list; inplace=True would modify data itself

# 3.11 Drop rows
data[data['age'] != 38]            # keep every row whose age is not 38

# 3.12 Drop duplicate rows: drop_duplicates
data.drop_duplicates()
data.drop_duplicates(subset=['age'])   # rows with a repeated age are dropped; the first occurrence is kept
data.drop_duplicates(subset=['age'],keep='last')  # keep the last occurrence instead

# 3.13 Group rows by value
# NOTE(review): assumes `data` is a frame with name/age/driver columns — confirm against the caller
# numeric_only=True skips the string column 'name'; since pandas 2.0 mean() raises on it otherwise.
data.groupby('age').mean(numeric_only=True)    # group by age and average the numeric columns
data.groupby(['name','age'])['driver'].mean()  # group by name, then age, and average driver

# 3.14 Group rows by time period: resample
import pandas as pd
import numpy as np
time = pd.date_range('06/06/2017',periods=100000,freq='30s')  # 100000 timestamps, 30 seconds apart
data = pd.DataFrame(index=time)   # frame indexed by those timestamps
data['count'] = np.random.randint(1,10,100000)  # random integers from 1 to 9 (the upper bound 10 is excluded)
data.resample('W').sum()    # group by week and sum each week
# '2W' groups by two weeks; monthly grouping is 'ME' on recent pandas ('M' on older releases)
data.resample('W',label='left').sum()   # bins are labelled with their right edge by default; label='left' uses the left edge

# 3.15 Iterate over the values of a column
# NOTE(review): needs a frame with a 'name' column; `data` was rebound to the time-indexed frame in 3.14 — confirm
for name in data['name'][0:2]:
    print(name.upper())         # print the first two names in upper case

# 3.16 Apply a function to every element of a column: apply
def uppercase(x):
    """Return *x* converted to upper case."""
    upper_cased = x.upper()
    return upper_cased
# Run uppercase on every name and show the first two results
data['name'].apply(uppercase)[0:2]

# 3.17 Apply a function to every group
data.groupby('age').apply(lambda x: x.count())  # group the rows by age, then count the values in each group

# 3.18 Concatenate data frames: concat
# Two small sample frames that share the key column 'id'
data_a = pd.DataFrame({'id':[1,2,3],'first':['Alex','Amy','Allen']})
data_b = pd.DataFrame({'id':[2,3,4],'last':['Bonder','Black','Balwner']})
rows_stacked = pd.concat([data_a,data_b],axis=0)    # along the rows: data_b is placed below data_a
columns_joined = pd.concat([data_a,data_b],axis=1)  # along the columns: data_b is placed to the right of data_a

# 3.19 Merge two data frames: merge
inner_merged = pd.merge(data_a,data_b,on='id')              # inner join: ids present in both frames
outer_merged = pd.merge(data_a,data_b,on='id',how='outer')  # outer join: ids present in either frame
left_merged = pd.merge(data_a,data_b,on='id',how='left')    # left join: every id of the left frame
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98

四、处理数值型数据

# 4.1 Min-max feature scaling: MinMaxScaler
import numpy as np
from sklearn import preprocessing
feature = np.array([[-500.5],[-100.1],[0],[100.1],[900.9]])
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1)) # scaler targeting the range 0 to 1
scaled_feature = minmax_scale.fit_transform(feature)  # scale the feature values
scaled_feature

# 4.2 Standardize a feature: StandardScaler
x = np.array([[-1000.1],[-200.2],[500.5],[600.6],[9000.9]])
scaler = preprocessing.StandardScaler()   # create the scaler
standardized = scaler.fit_transform(x)    # transform the feature
standardized
# When outliers are present, scale with the median and interquartile range instead: RobustScaler
robust_scaler = preprocessing.RobustScaler()
robust_scaler.fit_transform(x)

# 4.3 Normalize observations: Normalizer
from sklearn.preprocessing import Normalizer
features = np.array([[0.5,0.5],[1.1,3.4],[1.5,20.2],[1.63,34.4],[10.9,3.3]])
normalizer = Normalizer(norm='l2')  # L2 (Euclidean) norm
normalizer.transform(features)

# 4.4 Generate polynomial and interaction features: PolynomialFeatures
from sklearn.preprocessing import PolynomialFeatures
features = np.array([[2,3],[2,3],[2,3]])         # feature matrix
polynomial_interaction = PolynomialFeatures(degree=2,include_bias=False)  # terms up to degree 2
polynomial_interaction.fit_transform(features)   # build the polynomial features

# 4.5 Transform features with a custom function
from sklearn.preprocessing import FunctionTransformer
def add_ten(x):
    """Return *x* increased by 10."""
    shifted = x + 10
    return shifted
ten_transformer = FunctionTransformer(add_ten)  # transformer that wraps add_ten
ten_transformer.transform(features)             # transform the feature matrix
# pandas apply achieves the same
import pandas as pd
df = pd.DataFrame(features,columns=['feature_1','feature_2'])  # build a data frame
df.apply(add_ten)  # apply the function

# 4.6 Detect outliers: EllipticEnvelope (inliers are labelled 1, outliers -1)
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs
features,_ = make_blobs(n_samples = 10,n_features = 2,centers = 1,random_state = 1)     # simulated data
features[0,0] = 10000   # overwrite the first observation with extreme values
features[0,1] = 10000
outlier_detector = EllipticEnvelope(contamination=.1)    # create the detector
outlier_detector.fit(features)      # fit the detector
outlier_detector.predict(features)  # predict which observations are outliers

# 4.7 Handle outliers
houses = pd.DataFrame()
houses['Price'] = [534433,392333,293222,4322032]
houses['Bathrooms'] = [2,3.5,2,116]
houses['Square_Feet'] = [1500,2500,1500,48000]
# Option 1: drop the outlying rows
houses[houses['Bathrooms'] < 20]
# Option 2: flag them with np.where
houses['Outlier'] = np.where(houses['Bathrooms'] < 20, 0, 1) # outliers are marked 1
houses
# Option 3: take the logarithm of the feature; np.log handles the whole column in one vectorized call
houses['Log_Of_Square_Feet'] = np.log(houses['Square_Feet'])
houses

# 4.8 Discretize features (can be used to encode numeric data)
from sklearn.preprocessing import Binarizer
age = np.array([[6],[12],[20],[36],[65]])
# Binarize against a single threshold: Binarizer
binarizer = Binarizer(threshold=18)   # threshold is passed by keyword; current scikit-learn rejects it positionally
binarizer.fit_transform(age)          # transform the feature
# Discretize against several thresholds: digitize
np.digitize(age,bins=[20,30,64])              # each bin includes its left edge and excludes its right edge
np.digitize(age,bins=[20,30,64],right=True)   # right=True: bins exclude the left edge and include the right edge

# 4.9 Group observations by clustering: KMeans
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
features,_ = make_blobs(n_samples = 50,n_features = 2,centers = 3,random_state = 1)
data = pd.DataFrame(features,columns=['feature_1','feature_2'])
clusterer = KMeans(3,random_state=0)    # k-means clusterer with 3 clusters
clusterer.fit(features)
data['group'] = clusterer.predict(features)    # predicted cluster of each observation
data.head(5)

# 4.10 Drop observations that contain missing values
# NumPy: build a per-row "has NaN" mask and keep the rows where it is False
features = np.array([
    [1.1, 11.1],
    [2.2, 22.2],
    [3.3, 33.3],
    [4.4, 44.4],
    [np.nan, 55],
])
row_has_nan = np.isnan(features).any(axis=1)
features[np.logical_not(row_has_nan)]
# pandas: dropna removes the incomplete rows of a data frame
column_names = ['feature_1', 'feature_2']
data = pd.DataFrame(features, columns=column_names)
data.dropna()

# 4.11 Impute missing values
# With a modest amount of data, predict the missing value with k-nearest neighbours
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
features,_ = make_blobs(n_samples = 1000,n_features = 2,random_state = 1)
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)  # standardize the feature values
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan         # replace the first value of the first observation with a missing value
# fancyimpute solvers expose fit_transform; the older complete() method is no longer available
features_knn_imputed = KNN(k=5,verbose=0).fit_transform(standardized_features)   # predict the missing value
print('TRUE:',true_value)
print('Imputed:',features_knn_imputed[0,0])   # compare the true and the imputed value

# Fill with the mean, median or most frequent value; usually less accurate than KNN
# SimpleImputer replaces the removed sklearn.preprocessing.Imputer and always works column-wise
from sklearn.impute import SimpleImputer
mean_imputer = SimpleImputer(strategy='mean')
# Impute the standardized matrix: it is the one holding the NaN, and true_value is on its scale
features_mean_imputed = mean_imputer.fit_transform(standardized_features)
print('TRUE:',true_value)
print('Imputed:',features_mean_imputed[0,0])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111

五、处理分类数据

# 5.1 Encode nominal (unordered) categorical features
# LabelBinarizer
import numpy as np
from sklearn.preprocessing import LabelBinarizer,MultiLabelBinarizer
feature = np.array([['a'],['b'],['a'],['c'],['a']])
one_hot = LabelBinarizer()       # one-hot encoder
one_hot.fit_transform(feature)
one_hot.classes_        # the classes that were found
one_hot.inverse_transform(one_hot.transform(feature))  # decode the one-hot encoding again
# MultiLabelBinarizer (each observation may carry several classes)
multiclass_feature = [('a','b'),('b','c'),('a','c'),('c',),('a','b')]  # sample observations with one or two classes each
one_hot_multiclass = MultiLabelBinarizer()
one_hot_multiclass.fit_transform(multiclass_feature)
one_hot_multiclass.classes_
# pandas
import pandas as pd
pd.get_dummies(feature[:,0])   # create dummy variables
# (Classes without an intrinsic order should not be encoded as 1, 2, 3;
#  after one-hot encoding, one of the encoded columns should be dropped.)

# 5.2 Encode ordinal (ordered) categorical features: replace
score_levels = ['low', 'low', 'medium', 'medium', 'high']
data = pd.DataFrame({'score': score_levels})
scale_mapper = dict(low=1, medium=2, high=3)     # maps each level to its rank
data['score'].replace(scale_mapper)              # swap the levels for their ranks (data itself is not modified)

# 5.3 对特征字典编码 DictVectorizer
from sklearn.feature_extraction import DictVectorizer
data = [{
   'red':2,'blue':4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/你好赵伟/article/detail/290309