import pandas as pd
s=pd.Series([1,2,3,4],index=['a','b','c','d'])
s.index
s.values
s[1]                 # access by position
s['a']               # access by label
s[[1,2,3]]           # fancy indexing by position
list(s.iteritems())  # iterate over (label, value) pairs
b=pd.Series([1,3,5,7],index=['a','b','c','d'])
b-s                  # arithmetic aligns on the index labels
s=pd.read_csv('data.txt',parse_dates=[0],header=None)
s.dtypes
s.shape
s.columns
s[0]        # [] selects a column by label
s.loc[1:3]  # loc[] selects rows by index label
 | 0 | 1 |
---|---|---|
1 | 2015-01-01 | 13.5 |
2 | 2014-01-01 | 10.5 |
3 | 2013-01-01 | 10.5 |
The [] operator retrieves columns by column label, while .loc[] retrieves rows by row index label.
### Converting in-memory data to DataFrame objects

Calling DataFrame() converts data in many formats into a DataFrame object. Its three parameters data, index, and columns give the data, the row index, and the column index. The data argument can be:

- a 2-D array, or nested lists convertible to one
- a dictionary: each key-value pair becomes a column of the DataFrame, and the values can be 1-D arrays, lists, or Series objects (see the sketch after the table below)

import numpy as np
pd.DataFrame(np.random.randint(1,10,(4,2)),index=['a','b','c','d'],columns=['键','value'])
 | 键 | value |
---|---|---|
a | 3 | 5 |
b | 7 | 9 |
c | 1 | 5 |
d | 3 | 5 |
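The dictionary values can also be Series objects; the rows then align on the Series indexes, and missing cells become NaN. A minimal sketch with made-up data:

pd.DataFrame({'price': pd.Series([1.0, 2.0], index=['a', 'b']),
              'count': pd.Series([3, 4, 5], index=['a', 'b', 'c'])})  # row 'c' gets NaN in 'price'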
DataFrame objects can also be built with the class methods whose names start with from_, which convert data in specific formats. from_dict() converts a dictionary; its orient parameter specifies which axis the dictionary keys map to:
- the default 'columns' turns the keys into column labels, so each dictionary value corresponds to a column
- with 'index', each dictionary value corresponds to a row
- with a nested dictionary, the keys of the inner dictionaries determine the index of the other axis
dict1={'a':[1,2,3],'b':[4,5,6]}
dict2={'a':{'l':1,'o':2,'v':3},'b':{'y':4,'o':5,'u':4}}
pd.DataFrame.from_dict(dict1,orient='index')
 | 0 | 1 | 2 |
---|---|---|---|
a | 1 | 2 | 3 |
b | 4 | 5 | 6 |
pd.DataFrame.from_dict(dict1,orient='columns')
 | a | b |
---|---|---|
0 | 1 | 4 |
1 | 2 | 5 |
2 | 3 | 6 |
pd.DataFrame.from_dict(dict2,orient='index')
 | l | o | v | y | u |
---|---|---|---|---|---|
a | 1.0 | 2 | 3.0 | NaN | NaN |
b | NaN | 5 | NaN | 4.0 | 4.0 |
pd.DataFrame.from_dict(dict2,orient='columns')
 | a | b |
---|---|---|
l | 1.0 | NaN |
o | 2.0 | 5.0 |
u | NaN | 4.0 |
v | 3.0 | NaN |
y | NaN | 4.0 |
from_items() converts a sequence of (key, values) pairs into a DataFrame object, where each values item is a list, array, or Series of one-dimensional data; the orient parameter behaves as in from_dict(). (from_items() is deprecated in newer pandas in favor of the DataFrame constructor.)
items=dict1.items()
pd.DataFrame.from_items(items,orient='columns')
 | a | b |
---|---|---|
0 | 1 | 4 |
1 | 2 | 5 |
2 | 3 | 6 |
pd.DataFrame.from_items(items,orient="index",columns=['1','2','3'])  # when orient='index', the columns argument must be given
 | 1 | 2 | 3 |
---|---|---|---|
a | 1 | 2 | 3 |
b | 4 | 5 | 6 |
df=pd.DataFrame.from_dict(dict2,orient='index')
df.to_dict(orient='records')  # list of dicts, one dict per row
df.to_dict(orient='list')     # dict of lists, keyed by column label
df.to_dict(orient='dict')     # nested dict: outer keys are column labels, inner keys are row labels
df.to_records()
df.to_records(index=False)
index=df.columns
index.values
index[index>'o']              # boolean filtering
index[[1,3]]                  # fancy indexing by position
index[1::2]                   # slicing
index.get_loc('v')            # position of a single label
index.get_indexer(['v','o'])  # positions of several labels
mindex=pd.Index([('a','x'),('a','y'),('b','x'),('b','y')])  # a list of tuples creates a MultiIndex
mindex.levels[0]   # the values of the first level
mindex.levshape    # the number of distinct values in each level
class1=['a','b','c','d']
class2=['1','2','3','4']
pd.MultiIndex.from_arrays([class1,class2])
df=pd.DataFrame(np.random.randint(0,10,(4,3)),index=['a','b','c','d'],columns=['1','2','3'])
df[2:4]  # slicing [] selects rows
 | 1 | 2 | 3 |
---|---|---|---|
c | 5 | 8 | 7 |
d | 8 | 0 | 0 |
df['1']
df[['1','2']]
 | 1 | 2 |
---|---|---|
a | 2 | 2 |
b | 8 | 0 |
c | 5 | 8 |
d | 8 | 0 |
df[df['1']>2]  # a boolean Series selects rows
 | 1 | 2 | 3 |
---|---|---|---|
b | 8 | 0 | 6 |
c | 5 | 8 | 7 |
d | 8 | 0 | 0 |
df[df>2]  # a boolean DataFrame sets the elements that fail the condition to NaN
 | 1 | 2 | 3 |
---|---|---|---|
a | NaN | NaN | NaN |
b | 8.0 | NaN | 6.0 |
c | 5.0 | 8.0 | 7.0 |
d | 8.0 | NaN | NaN |
df.loc['a']      # a row, by label
df.loc['a','1']  # a single element, by row and column label
df.iloc[0]       # a row, by position
df.iloc[[1,3]]   # rows, by a list of positions
 | 1 | 2 | 3 |
---|---|---|---|
b | 8 | 0 | 6 |
d | 8 | 0 | 0 |
df.iloc[[1,3],2]
df.iloc[:,1]
df.ix[1:3,'1']   # .ix mixes labels and positions; deprecated in favor of .loc/.iloc
df.at['a','1']   # fast scalar access by label
df.iat[1,2]      # fast scalar access by position
df.loc[['a','b'],['1','3']]
 | 1 | 3 |
---|---|---|
a | 2 | 0 |
b | 8 | 6 |
df.query('`1` > 2')  # column names that are not valid identifiers need backticks (pandas >= 0.25); on older versions use df[df['1'] > 2] instead
import pandas as pd
df=pd.read_csv('data.txt',encoding='utf-8',parse_dates=[0],na_values=['-',' '],header=0)
df.columns=['time','ratio']
df
 | time | ratio |
---|---|---|
0 | 2015-01-01 | 13.5 |
1 | 2014-01-01 | 10.5 |
2 | 2013-01-01 | 10.5 |
3 | 2012-01-01 | 10.5 |
4 | 2011-01-01 | 10.7 |
5 | 2009-01-01 | 12.6 |
6 | 2008-01-01 | 12.9 |
7 | 2007-01-01 | 12.7 |
8 | 2006-01-01 | 14.3 |
9 | 2005-01-01 | 13.7 |
10 | 2004-01-01 | 14.1 |
11 | 2003-01-01 | 14.3 |
12 | 2002-01-01 | 13.9 |
import tushare as ts
import pandas as pd
from sqlalchemy import create_engine
df = ts.get_hist_data('000875')  # fetch historical quotes as a DataFrame
engine = create_engine('mysql://root:luozong@localhost/luozong?charset=utf8')  # create an engine with SQLAlchemy
df.to_sql('tick_data',engine,if_exists='append')  # write to the database; this sometimes errors on the first run but succeeds on the second, for reasons unknown
df1 = pd.read_sql('tick_data',engine)  # read the table back into a DataFrame
import pandas as pd
import pymysql
a=pymysql.connect(host='localhost',port=3306,user='root',passwd='luozong',db='world',use_unicode=True, charset="utf8")
sql = 'SELECT * FROM city WHERE CountryCode = "CHN"'
d=pd.read_sql(sql,con=a)
a.close()
with pd.ExcelWriter(r'C:\Users\Administrator\Desktop\1.xls') as writer:
d.to_excel(writer)
import matplotlib.pyplot as pl
df1=df1[600:]
y1=df1['high']
y2=df1['close']
y3=df1['low']
pl.figure(figsize=(16,9))
pl.plot(df1['date'],y1,label='high price')
pl.plot(df1['date'],y2,label='close price',color='red')
pl.plot(df1['date'],y3,label='low price')
pl.legend()
pl.show()
df2=df1[:50]
df2.mean()
df2.max()
import numpy as np
t=np.linspace(0,10,40)
x=np.sin(1*np.pi*t)
x[np.random.randint(0,len(t),40)]+= np.random.normal(0,0.3,40)
s=pd.Series(x,index=t)
# pd.rolling_mean()/pd.rolling_median() are deprecated; use the rolling() method
s_mean=s.rolling(window=5,center=True).mean()
s_median=s.rolling(window=5,center=True).median()
pl.plot(t,x,t,s_mean,t,s_median)
The expanding_*() functions compute expanding-window statistics over a sequence; for example, expanding_max() returns the running maximum of all history up to each element.
# pd.expanding_max() and friends are deprecated; wrap the data in a Series and use expanding()
x=pd.Series(np.cumsum(np.random.randn(400)))
x_max=x.expanding().max()
x_min=x.expanding().min()
x_mean=x.expanding().mean()
pl.plot(x)
pl.plot(x_min)
pl.plot(x_max)
pl.plot(x_mean)
pl.show()
x
df_abc=df2.columns
df_abc.str.upper()
s_utf8=pd.Series(['北京','北京市','北京地区'])
s_gb2312=s_utf8.str.encode('gb2312')     # str -> bytes
s_unicode=s_gb2312.str.decode('gb2312')  # bytes -> str; .str.decode() applies to bytes values
s_utf8.str.len()
s_abc=pd.Series(['a','b','c'])
s_utf8+'-'+s_abc*2
now=pd.Timestamp.now()
now_shanghai=now.tz_localize('Asia/Shanghai')
now_shanghai
now_shanghai.tz_convert('Asia/Tokyo')
import pytz
pytz.common_timezones
pd.Period.now(freq='D')
pd.Period.now(freq='H')
from pandas.tseries import frequencies
frequencies._period_code_map.keys()  # private API: the frequency codes pandas recognizes
now_day=pd.Period.now(freq='D')
now_day.start_time
now_day.end_time
now_shanghai.to_period('H')
now_shanghai.to_datetime()
now.day
now.year
now.month
now.dayofweek
now.dayofyear
nation_day=pd.Timestamp('2015-10-1')
td=now-nation_day
nation_day+td
td.days
td.seconds
td.components
tuple(td.components)
def random_timestamps(start, end, freq, count):
    # draw `count` random timestamps from a regular date range
    index = pd.date_range(start, end, freq=freq)
    locations = np.random.choice(np.arange(len(index)), size=count, replace=False)
    locations.sort()
    return index[locations]
np.random.seed(42)
ts_index = random_timestamps("2015-01-01", "2015-10-01", freq="Min", count=5)
pd_index = ts_index.to_period("M")
td_index = pd.TimedeltaIndex(np.diff(ts_index))
print(ts_index, "\n")
print(pd_index, "\n")
print(td_index, "\n")
ts_index.weekday
pd_index.month
td_index.seconds
ts_index.shift(1,freq='H')
ts_index.shift(1,freq='M')
ts_index.normalize()
ts_index.to_period("H").to_timestamp()
ts=pd.Series(range(0,5),index=ts_index)
ts
ts.between_time('9:00','18:00')
ts.tshift(1,freq='M')
pd_series=pd.Series(range(0,5),index=pd_index)
td_series=pd.Series(range(0,4),index=td_index)
pd_series.shift(1,freq='M')
td_series.shift(1,freq='H')
data=np.random.randint(0,10,(10,3))
df_int=pd.DataFrame(data,columns=['A',"B","C"])
dd=df_int.where(df_int>2)  # where() keeps the elements satisfying the condition and sets the rest to NaN
dd.isnull()
 | A | B | C |
---|---|---|---|
0 | False | True | True |
1 | False | True | False |
2 | False | True | True |
3 | False | False | False |
4 | False | False | False |
5 | False | False | False |
6 | False | True | True |
7 | False | True | False |
8 | False | True | False |
9 | False | True | False |
dd.notnull()
 | A | B | C |
---|---|---|---|
0 | True | False | False |
1 | True | False | True |
2 | True | False | False |
3 | True | True | True |
4 | True | True | True |
5 | True | True | True |
6 | True | False | False |
7 | True | False | True |
8 | True | False | True |
9 | True | False | True |
count() returns the number of non-NaN elements in each column, or in each row with axis=1:
dd.count()
dd.count(axis=1)
dd.dropna()
 | A | B | C |
---|---|---|---|
3 | 3 | 3.0 | 3.0 |
4 | 9 | 6.0 | 7.0 |
5 | 5 | 4.0 | 3.0 |
dd.dropna(thresh=4,axis=1)  # thresh=4 keeps only the columns having at least 4 non-NaN values
 | A | C |
---|---|---|
0 | 8 | NaN |
1 | 8 | 3.0 |
2 | 6 | NaN |
3 | 3 | 3.0 |
4 | 9 | 7.0 |
5 | 5 | 3.0 |
6 | 5 | NaN |
7 | 3 | 9.0 |
8 | 6 | 4.0 |
9 | 8 | 7.0 |
dd
 | A | B | C |
---|---|---|---|
0 | 8 | NaN | NaN |
1 | 8 | NaN | 3.0 |
2 | 6 | NaN | NaN |
3 | 3 | 3.0 | 3.0 |
4 | 9 | 6.0 | 7.0 |
5 | 5 | 4.0 | 3.0 |
6 | 5 | NaN | NaN |
7 | 3 | NaN | 9.0 |
8 | 6 | NaN | 4.0 |
9 | 8 | NaN | 7.0 |
When the rows are ordered by some physical quantity, NaN elements can be filled from the data around them: ffill() fills with the preceding values, bfill() fills with the following values, and interpolate() fills by interpolating between the surrounding values.

interpolate() defaults to equidistant linear interpolation; its method parameter selects the algorithm, among 'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh', 'polynomial', 'spline', 'piecewise_polynomial', 'from_derivatives', 'pchip', and 'akima' (one is sketched after the examples below).
dd.ffill()
 | A | B | C |
---|---|---|---|
0 | 8 | NaN | NaN |
1 | 8 | NaN | 3.0 |
2 | 6 | NaN | 3.0 |
3 | 3 | 3.0 | 3.0 |
4 | 9 | 6.0 | 7.0 |
5 | 5 | 4.0 | 3.0 |
6 | 5 | 4.0 | 3.0 |
7 | 3 | 4.0 | 9.0 |
8 | 6 | 4.0 | 4.0 |
9 | 8 | 4.0 | 7.0 |
dd.bfill()
 | A | B | C |
---|---|---|---|
0 | 8 | 3.0 | 3.0 |
1 | 8 | 3.0 | 3.0 |
2 | 6 | 3.0 | 3.0 |
3 | 3 | 3.0 | 3.0 |
4 | 9 | 6.0 | 7.0 |
5 | 5 | 4.0 | 3.0 |
6 | 5 | NaN | 9.0 |
7 | 3 | NaN | 9.0 |
8 | 6 | NaN | 4.0 |
9 | 8 | NaN | 7.0 |
dd.interpolate(method='index')  # interpolate using the index values as x-coordinates
 | A | B | C |
---|---|---|---|
0 | 8 | NaN | NaN |
1 | 8 | NaN | 3.0 |
2 | 6 | NaN | 3.0 |
3 | 3 | 3.0 | 3.0 |
4 | 9 | 6.0 | 7.0 |
5 | 5 | 4.0 | 3.0 |
6 | 5 | 4.0 | 6.0 |
7 | 3 | 4.0 | 9.0 |
8 | 6 | 4.0 | 4.0 |
9 | 8 | 4.0 | 7.0 |
dd.interpolate(method="linear")
 | A | B | C |
---|---|---|---|
0 | 8 | NaN | NaN |
1 | 8 | NaN | 3.0 |
2 | 6 | NaN | 3.0 |
3 | 3 | 3.0 | 3.0 |
4 | 9 | 6.0 | 7.0 |
5 | 5 | 4.0 | 3.0 |
6 | 5 | 4.0 | 6.0 |
7 | 3 | 4.0 | 9.0 |
8 | 6 | 4.0 | 4.0 |
9 | 8 | 4.0 | 7.0 |
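Most non-default methods require SciPy, and some take extra arguments; a hypothetical example of a quadratic fit:

dd.interpolate(method='polynomial', order=2)  # degree-2 polynomial interpolation (requires SciPy)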
fillna() also accepts a dictionary, filling NaN in different columns with different values:
dd.fillna({"B":5,"C":6})
 | A | B | C |
---|---|---|---|
0 | 8 | 5.0 | 6.0 |
1 | 8 | 5.0 | 3.0 |
2 | 6 | 5.0 | 6.0 |
3 | 3 | 3.0 | 3.0 |
4 | 9 | 6.0 | 7.0 |
5 | 5 | 4.0 | 3.0 |
6 | 5 | 5.0 | 6.0 |
7 | 3 | 5.0 | 9.0 |
8 | 6 | 5.0 | 4.0 |
9 | 8 | 5.0 | 7.0 |
The aggregation methods' skipna parameter defaults to True, so NaN elements are skipped during computation; note that each row or column is computed independently. To skip entire rows that contain NaN, call dropna() first, as in the example after the sums below.
dd.sum()
A 61.0
B 13.0
C 36.0
dtype: float64
dd.sum(skipna=False)
A 61.0
B NaN
C NaN
dtype: float64
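To skip the whole rows containing NaN, drop them before aggregating:

dd.dropna().sum()  # sums computed only over the rows without any NaN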
combine_first(other) fills the NaN elements of df using other: each NaN element of df is replaced by the element of other at the corresponding label.
other=pd.DataFrame(np.random.randint(0,5,(10,3)),columns=['A','B','C'])
dd.combine_first(other)
 | A | B | C |
---|---|---|---|
0 | 8 | 3.0 | 1.0 |
1 | 8 | 2.0 | 3.0 |
2 | 6 | 0.0 | 1.0 |
3 | 3 | 3.0 | 3.0 |
4 | 9 | 6.0 | 7.0 |
5 | 5 | 4.0 | 3.0 |
6 | 5 | 4.0 | 4.0 |
7 | 3 | 0.0 | 9.0 |
8 | 6 | 4.0 | 4.0 |
9 | 8 | 4.0 | 7.0 |
A DataFrame's shape attribute is a two-element tuple, just like a 2-D NumPy array's. Since both index and columns support MultiIndex, a DataFrame can also represent higher-dimensional data.
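As a minimal sketch with made-up data, a two-level column index can flatten what is conceptually a 3-D array (observation x variable x trial) into a 2-D table:

cols = pd.MultiIndex.from_arrays([['x', 'x', 'y', 'y'], ['t1', 't2', 't1', 't2']])
df3 = pd.DataFrame(np.random.randn(5, 4), columns=cols)
df3['x']  # the 2-D slice for variable 'x'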
d0=df[:10]
d0=d0[['open', 'high', 'close', 'low', 'volume']]
d0.groupby(['open','volume']).mean()
 | | high | close | low |
---|---|---|---|---|
open | volume | | | |
3.10 | 94721.49 | 3.14 | 3.11 | 3.08 |
 | 111745.67 | 3.13 | 3.12 | 3.09 |
3.11 | 71162.30 | 3.14 | 3.12 | 3.11 |
3.13 | 64214.61 | 3.15 | 3.14 | 3.12 |
 | 98914.75 | 3.14 | 3.08 | 3.07 |
3.14 | 60980.01 | 3.14 | 3.12 | 3.11 |
3.22 | 195015.98 | 3.22 | 3.09 | 3.06 |
3.25 | 113690.00 | 3.27 | 3.25 | 3.22 |
3.27 | 99754.00 | 3.30 | 3.25 | 3.24 |
3.32 | 176011.23 | 3.32 | 3.28 | 3.26 |
Since a DataFrame can be viewed as a dictionary of Series, DataFrame[colname] = values adds a new column. When the new column is computed from existing columns, the eval() method can perform the computation.
import numpy as np
df['new_col']=df.eval('percent/100')
df
 | time | percent | new_col |
---|---|---|---|
0 | 2015-01-01 | 13.5 | 0.135 |
1 | 2014-01-01 | 10.5 | 0.105 |
2 | 2013-01-01 | 10.5 | 0.105 |
3 | 2012-01-01 | 10.5 | 0.105 |
4 | 2011-01-01 | 10.7 | 0.107 |
5 | 2009-01-01 | 12.6 | 0.126 |
6 | 2008-01-01 | 12.9 | 0.129 |
7 | 2007-01-01 | 12.7 | 0.127 |
8 | 2006-01-01 | 14.3 | 0.143 |
9 | 2005-01-01 | 13.7 | 0.137 |
10 | 2004-01-01 | 14.1 | 0.141 |
11 | 2003-01-01 | 14.3 | 0.143 |
12 | 2002-01-01 | 13.9 | 0.139 |
assign() adds the columns given as keyword arguments; it returns a new DataFrame object and leaves the original data unchanged.
df.assign(new_col2=100-df.percent)
 | time | percent | new_col | new_col2 |
---|---|---|---|---|
0 | 2015-01-01 | 13.5 | 0.135 | 86.5 |
1 | 2014-01-01 | 10.5 | 0.105 | 89.5 |
2 | 2013-01-01 | 10.5 | 0.105 | 89.5 |
3 | 2012-01-01 | 10.5 | 0.105 | 89.5 |
4 | 2011-01-01 | 10.7 | 0.107 | 89.3 |
5 | 2009-01-01 | 12.6 | 0.126 | 87.4 |
6 | 2008-01-01 | 12.9 | 0.129 | 87.1 |
7 | 2007-01-01 | 12.7 | 0.127 | 87.3 |
8 | 2006-01-01 | 14.3 | 0.143 | 85.7 |
9 | 2005-01-01 | 13.7 | 0.137 | 86.3 |
10 | 2004-01-01 | 14.1 | 0.141 | 85.9 |
11 | 2003-01-01 | 14.3 | 0.143 | 85.7 |
12 | 2002-01-01 | 13.9 | 0.139 | 86.1 |
append() adds rows; it has no inplace parameter and always returns a brand-new object. Since every append() call copies the existing data, appending rows in a loop hurts performance; instead, cache the chunks in a list and join them along the desired axis with concat().
def random_dataframe(n):
columns = ["A", "B", "C"]
for i in range(n):
nrow = np.random.randint(10, 20)
yield pd.DataFrame(np.random.randint(0, 100, size=(nrow, 3)), columns=columns)
df_list = list(random_dataframe(1000))
%%time
df_res1 = pd.DataFrame([])
for df in df_list:
df_res1 = df_res1.append(df)
Wall time: 574 ms
%%time
df_res2 = pd.concat(df_list, axis=0)
Wall time: 129 ms
drop() removes the rows or columns with the specified labels (dropping rows is sketched after the table below):
df.drop("A",axis=1)
 | B | C |
---|---|---|
0 | 29 | 88 |
1 | 45 | 15 |
2 | 50 | 81 |
3 | 86 | 81 |
4 | 88 | 25 |
5 | 21 | 42 |
6 | 32 | 82 |
7 | 2 | 59 |
8 | 24 | 62 |
9 | 49 | 14 |
10 | 80 | 95 |
11 | 67 | 18 |
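Dropping rows works the same way with axis=0, the default:

df.drop([0, 1])  # drop the rows labeled 0 and 1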
reset_index() converts index levels into columns; the level parameter selects which levels to convert. To simply remove a level from the index, set the drop parameter to True.
df=pd.read_csv('data.txt',header=None)
# note that read_csv() has no `index` keyword; passing index=0 raises
#   TypeError: parser_f() got an unexpected keyword argument 'index'
# use index_col=0 to read a column in as the index instead
df.columns=['time','percent']
df.index=df['time']
df.reset_index(level='time',drop=True)  # drop=True discards the old index instead of turning it into a column
 | time | percent |
---|---|---|
0 | 2016 | 15.2 |
1 | 2015 | 13.5 |
2 | 2014 | 10.5 |
3 | 2013 | 10.5 |
4 | 2012 | 10.5 |
5 | 2011 | 10.7 |
6 | 2009 | 12.6 |
7 | 2008 | 12.9 |
8 | 2007 | 12.7 |
9 | 2006 | 14.3 |
10 | 2005 | 13.7 |
11 | 2004 | 14.1 |
12 | 2003 | 14.3 |
13 | 2002 | 13.9 |
set_index() converts a column into the row index. If the append parameter is False, the current row index is discarded; if True, the column is added as a new level of the current index (a sketch of append=True follows the table below).
df.set_index('time')
 | percent |
---|---|
time | |
2016 | 15.2 |
2015 | 13.5 |
2014 | 10.5 |
2013 | 10.5 |
2012 | 10.5 |
2011 | 10.7 |
2009 | 12.6 |
2008 | 12.9 |
2007 | 12.7 |
2006 | 14.3 |
2005 | 13.7 |
2004 | 14.1 |
2003 | 14.3 |
2002 | 13.9 |
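A sketch of append=True, with reset_index(level=...) undoing a single level:

df2 = df.set_index('percent', append=True)  # keep the current index and add 'percent' as a new level
df2.reset_index(level='percent')            # move only the 'percent' level back into a column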
import matplotlib.pyplot as pl
pl.plot(df['percent'])
pl.show()
stack() moves the specified column-index level into the row index, while unstack() moves a row-index level into the columns.
reorder_levels() and swaplevel() rearrange the index levels of the specified axis.
pivot() takes three columns of scattered data in a DataFrame and uses them as the row index, the column index, and the cell values, turning them into a 2-D table.
pivot()'s three parameters index, columns, and values each accept a single column name; if values is omitted, all remaining columns become value columns, producing a multi-level column index. None of these are demonstrated in this section, so a sketch follows.
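A minimal sketch with made-up data:

long_df = pd.DataFrame({'row': ['r1', 'r1', 'r2', 'r2'],
                        'col': ['c1', 'c2', 'c1', 'c2'],
                        'val': [1, 2, 3, 4]})
wide = long_df.pivot(index='row', columns='col', values='val')  # 2x2 table: rows r1/r2, columns c1/c2
stacked = wide.stack()  # the column level becomes the innermost row level
stacked.unstack()       # unstack() moves it back to the columns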
Group operations split the data into groups by some criterion, apply a computation to each group, and then combine the results. In pandas they are implemented by the groupby() method of DataFrame and Series objects.
groupby()'s axis parameter defaults to 0, meaning the rows of the source data are grouped: each row corresponds to one element of the grouping data, and each unique value in the grouping data defines one group.
When the grouping data is one of the source's own columns, it can be specified by column name:
import pandas as pd
import numpy as np
dose=pd.read_csv('dose.csv')
tmt_groupby=dose.groupby('Tmt')
print(tmt_groupby)
<pandas.core.groupby.DataFrameGroupBy object at 0x000001EB6DB94EF0>
Multiple sets of grouping data can also be passed to groupby() as a list:
double_dose=dose.groupby(['Tmt','Age'])
When the grouping data is not part of the source data, it can be passed to groupby() directly:
randvalue=np.random.randint(0,10,dose.shape[0])
random_group=dose.groupby(randvalue)
When the grouping data can be computed from the source data's row index, the computing function can be passed to groupby():
alternate_group=dose.groupby(lambda n:n%3)
The three kinds of grouping data above can be combined freely:
crazy_group=dose.groupby(['Gender',lambda n:n%2,randvalue])
len() gives the number of groups:
len(tmt_groupby)
4
GroupBy objects support direct iteration, much like a dict's iteritems(): each iteration yields a group key and that group's data. When grouping by multiple columns, each key is a tuple (see the sketch after the loop below).
for key ,value in tmt_groupby:
print(key,value.shape)
A (65, 6)
B (65, 6)
C (65, 6)
D (65, 6)
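With multiple grouping columns, each key is a tuple:

for (tmt, age), group in dose.groupby(['Tmt', 'Age']):
    print(tmt, age, group.shape)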
get_group() retrieves the data corresponding to a given group key:
tmt_groupby.get_group('C')
 | Dose | Response1 | Response2 | Tmt | Age | Gender |
---|---|---|---|---|---|---|
0 | 50.0 | 9.872 | 10.032 | C | 60s | F |
2 | 25.0 | 0.626 | 0.803 | C | 50s | M |
3 | 25.0 | 1.372 | 1.557 | C | 60s | F |
4 | 15.0 | 0.010 | 0.020 | C | 60s | F |
7 | 20.0 | 0.038 | 0.033 | C | 60s | M |
14 | 100.0 | 9.710 | 11.246 | C | 50s | M |
18 | 30.0 | 4.892 | 4.851 | C | 60s | F |
21 | 0.1 | 0.000 | 0.000 | C | 60s | F |
22 | 0.1 | 0.000 | 0.000 | C | 60s | M |
25 | 40.0 | 9.025 | 9.929 | C | 50s | M |
33 | 5.0 | 0.000 | 0.000 | C | 50s | M |
37 | 10.0 | 0.000 | 0.001 | C | 60s | F |
38 | 80.0 | 9.345 | 9.736 | C | 60s | M |
43 | 10.0 | 0.000 | 0.000 | C | 50s | M |
44 | 80.0 | 10.163 | 11.245 | C | 50s | M |
45 | 1.0 | 0.000 | 0.000 | C | 50s | M |
47 | 5.0 | 0.000 | 0.000 | C | 60s | M |
50 | 100.0 | 9.627 | 10.149 | C | 60s | F |
54 | 20.0 | 0.079 | 0.097 | C | 50s | M |
72 | 100.0 | 9.615 | 9.407 | C | 60s | M |
79 | 30.0 | 2.390 | 2.487 | C | 60s | M |
80 | 40.0 | 8.141 | 7.638 | C | 60s | M |
81 | 60.0 | 10.328 | 10.113 | C | 40s | F |
87 | 5.0 | 0.000 | 0.000 | C | 60s | M |
108 | 25.0 | 0.465 | 0.651 | C | 60s | M |
111 | 20.0 | 0.049 | 0.101 | C | 60s | M |
114 | 50.0 | 9.836 | 9.101 | C | 60s | M |
128 | 40.0 | 8.481 | 8.157 | C | 60s | M |
132 | 15.0 | 0.004 | 0.006 | C | 50s | M |
133 | 50.0 | 10.417 | 11.080 | C | 50s | M |
… | … | … | … | … | … | … |
154 | 5.0 | 0.000 | 0.000 | C | 60s | F |
155 | 0.1 | 0.000 | 0.000 | C | 60s | M |
162 | 80.0 | 10.140 | 10.147 | C | 60s | F |
165 | 60.0 | 10.490 | 11.218 | C | 50s | M |
172 | 20.0 | 0.184 | 0.247 | C | 60s | F |
174 | 1.0 | 0.000 | 0.000 | C | 60s | M |
176 | 25.0 | 0.351 | 0.339 | C | 60s | M |
178 | 15.0 | 0.002 | 0.002 | C | 60s | M |
179 | 1.0 | 0.000 | 0.000 | C | 60s | F |
181 | 100.0 | 9.923 | 9.737 | C | 60s | M |
186 | 25.0 | 1.027 | 1.372 | C | 40s | F |
190 | 40.0 | 8.539 | 8.590 | C | 40s | F |
191 | 50.0 | 10.208 | 9.857 | C | 40s | F |
198 | 10.0 | 0.000 | 0.002 | C | 40s | F |
200 | 15.0 | 0.008 | 0.034 | C | 40s | F |
201 | 10.0 | 0.000 | 0.000 | C | 60s | M |
206 | 60.0 | 9.961 | 10.126 | C | 60s | F |
207 | 80.0 | 10.115 | 9.402 | C | 60s | M |
209 | 10.0 | 0.000 | 0.000 | C | 60s | M |
210 | 60.0 | 9.855 | 9.711 | C | 60s | M |
216 | 30.0 | 2.895 | 3.518 | C | 50s | M |
220 | 40.0 | 9.092 | 9.356 | C | 60s | F |
222 | 15.0 | 0.002 | 0.008 | C | 60s | M |
223 | 20.0 | 0.117 | 0.288 | C | 40s | F |
232 | 0.1 | 0.000 | 0.000 | C | 40s | F |
236 | 30.0 | 1.728 | 1.931 | C | 60s | M |
240 | 60.0 | 9.808 | 9.343 | C | 60s | M |
243 | 1.0 | 0.000 | 0.000 | C | 60s | M |
249 | 30.0 | 4.350 | 3.873 | C | 40s | F |
259 | 100.0 | 10.169 | 10.201 | C | 40s | F |
65 rows × 6 columns
Subscripting a GroupBy object yields a new GroupBy object containing only the specified columns of the source data; this way you can group by some columns first and then pick the columns for the subsequent computation:
tmt_groupby['Dose']
<pandas.core.groupby.SeriesGroupBy object at 0x000001EB6DEA3080>
The GroupBy class defines a __getattr__() method, so when an attribute not defined on GroupBy is accessed, the lookup proceeds as follows:
- if the name is a column of the source object, the result is equivalent to GroupBy[name], i.e. a GroupBy object for that column
- if the name is a method of the source object, the call is equivalent to applying that method to every group via apply(); note that pandas keeps a whitelist of methods that may be forwarded to apply(), and only methods in that set are dispatched automatically (a sketch follows the output below)
tmt_groupby.Dose
<pandas.core.groupby.SeriesGroupBy object at 0x000001EB6E835BA8>
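For example, the two lines below should be equivalent, assuming fillna() is on the dispatch whitelist of this pandas version:

tmt_groupby.fillna(0)                       # dispatched through __getattr__()
tmt_groupby.apply(lambda df: df.fillna(0))  # the explicit form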
The agg(), transform(), filter(), and apply() methods of a GroupBy object implement the various group operations. The first argument of each is a callback function that computes on one group's data and returns a result; each method combines the per-group results into the final output in its own way.
agg() performs aggregation on each group. Aggregation means reducing a set of N values to a single value: sums, means, medians, and even random choices are all aggregations. The callback first receives each column of each group as a Series; if it cannot handle a Series, agg() retries by passing the whole group as a DataFrame. The callback aggregates its argument, turning a Series into a scalar or a DataFrame into a Series. agg() returns a DataFrame whose row index consists of the group keys and whose columns come from the source data. It also accepts a per-column mapping of functions, sketched after the tables below.
tmt_groupby.agg(np.mean)
 | Dose | Response1 | Response2 |
---|---|---|---|
Tmt | | | |
A | 33.546154 | 6.728985 | 6.863185 |
B | 33.546154 | 5.573354 | 5.456415 |
C | 33.546154 | 4.040415 | 4.115323 |
D | 33.546154 | 3.320646 | 3.188369 |
tmt_groupby.agg(lambda df:df.loc[df.Response1.idxmax()])
 | Dose | Response1 | Response2 | Age | Gender |
---|---|---|---|---|---|
Tmt | | | | | |
A | 80.0 | 11.226 | 10.132 | 60s | F |
B | 100.0 | 10.824 | 10.158 | 50s | M |
C | 60.0 | 10.490 | 11.218 | 50s | M |
D | 80.0 | 10.911 | 9.854 | 60s | F |
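agg() also accepts a list of functions or a column-to-function mapping, applying different aggregations to different columns:

tmt_groupby.agg({'Dose': 'median', 'Response1': ['mean', 'max']})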
transform() converts each group's data. As with agg(), each column is first passed as a Series; if that fails, the whole group is passed as a DataFrame. The callback returns a result with the same shape as its argument, and transform() merges the results back in the source data's order.
tmt_groupby.transform(lambda s:s-s.mean()).head()
 | Dose | Response1 | Response2 |
---|---|---|---|
0 | 16.453846 | 5.831585 | 5.916677 |
1 | -18.546154 | -3.318646 | -3.184369 |
2 | -8.546154 | -3.414415 | -3.312323 |
3 | -8.546154 | -2.668415 | -2.558323 |
4 | -18.546154 | -4.030415 | -4.095323 |
filter() applies a condition to each group: the callback receives each group's DataFrame and returns True or False, deciding whether the group is kept. filter() returns a DataFrame with the failing groups' rows removed, in the same order as the source data.
tmt_groupby.filter(lambda df : df.Response1.max()<11).head()
 | Dose | Response1 | Response2 | Tmt | Age | Gender |
---|---|---|---|---|---|---|
0 | 50.0 | 9.872 | 10.032 | C | 60s | F |
1 | 15.0 | 0.002 | 0.004 | D | 60s | F |
2 | 25.0 | 0.626 | 0.803 | C | 50s | M |
3 | 25.0 | 1.372 | 1.557 | C | 60s | F |
4 | 15.0 | 0.010 | 0.020 | C | 60s | F |
apply() passes each group's DataFrame to the callback, collects the return values, and merges them by some rule. apply() is flexible enough to reproduce agg(), transform(), and filter(). It picks the merging strategy from the type of the callback's return value, but this automatic choice sometimes produces puzzling results.
tmt_groupby.apply(pd.DataFrame.max)
 | Dose | Response1 | Response2 | Tmt | Age | Gender |
---|---|---|---|---|---|---|
Tmt | | | | | | |
A | 100.0 | 11.226 | 10.745 | A | 60s | M |
B | 100.0 | 10.824 | 10.340 | B | 60s | M |
C | 100.0 | 10.490 | 11.246 | C | 60s | M |
D | 100.0 | 10.911 | 9.863 | D | 60s | M |
tmt_groupby.apply(pd.DataFrame.mean)
 | Dose | Response1 | Response2 |
---|---|---|---|
Tmt | | | |
A | 33.546154 | 6.728985 | 6.863185 |
B | 33.546154 | 5.573354 | 5.456415 |
C | 33.546154 | 4.040415 | 4.115323 |
D | 33.546154 | 3.320646 | 3.188369 |
When the callback returns a DataFrame, the result depends on whether the returned row index is the same object as the argument's row index:
group=tmt_groupby[['Response1','Response1']]
group.apply(lambda df:df-df.mean()).head()
 | Response1 | Response1 |
---|---|---|
0 | 5.831585 | 5.831585 |
1 | -3.318646 | -3.318646 |
2 | -3.414415 | -3.414415 |
3 | -2.668415 | -2.668415 |
4 | -4.030415 | -4.030415 |
group.apply(lambda df:(df-df.mean())[:]).head()
 | | Response1 | Response1 |
---|---|---|---|
Tmt | | | |
A | 6 | -6.728985 | -6.728985 |
 | 10 | -1.503985 | -1.503985 |
 | 12 | -6.728985 | -6.728985 |
 | 17 | -6.728985 | -6.728985 |
 | 32 | 2.566015 | 2.566015 |
When the callback returns None, that value is ignored, so apply() can also act as a filter:
tmt_groupby.apply(lambda df:None if df.Response1.mean()<5 else df.sample(2))
 | | Dose | Response1 | Response2 | Tmt | Age | Gender |
---|---|---|---|---|---|---|---|
Tmt | | | | | | | |
A | 93 | 40.0 | 9.787 | 10.103 | A | 60s | F |
 | 157 | 50.0 | 10.317 | 10.465 | A | 50s | M |
B | 170 | 0.1 | 0.000 | 0.000 | B | 40s | F |
 | 89 | 20.0 | 4.199 | 4.733 | B | 60s | F |
def read_git_log(log_fn):
    import io
    with io.open(log_fn, "r", encoding="utf8") as f:
        author = datetime = None
        message = []
        message_start = False
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith("commit"):
                if author is not None:  # emit the previous commit before starting a new one
                    yield author, datetime, u"\n".join(message)
                del message[:]
                message_start = False
            elif line.startswith("Author:"):
                author = line[line.index(":")+1 : line.index("<")].strip()
            elif line.startswith("Date:"):
                datetime = line[line.index(":")+1 :].strip()
                message_start = True
            elif message_start:
                message.append(line)
        if author is not None:  # the loop above never emits the final commit
            yield author, datetime, u"\n".join(message)
df_commit=pd.DataFrame(read_git_log('pandas.log'),columns=['Author','DateString','Message'])
df_commit.shape
(17260, 3)
df_commit.DateString.head()
0 Sat Jun 9 02:40:03 2018 +0300
1 Fri Jun 8 18:32:20 2018 -0500
2 Fri Jun 8 19:44:17 2018 +0200
3 Fri Jun 8 18:39:11 2018 +0100
4 Fri Jun 8 11:54:36 2018 -0500
Name: DateString, dtype: object
df_commit['Date']=pd.to_datetime(df_commit.DateString)
print(df_commit.head())
Author DateString \
0 Pyry Kovanen Sat Jun 9 02:40:03 2018 +0300
1 chris-b1 Fri Jun 8 18:32:20 2018 -0500
2 Joris Van den Bossche Fri Jun 8 19:44:17 2018 +0200
3 Fábio Rosado Fri Jun 8 18:39:11 2018 +0100
4 Tom Augspurger Fri Jun 8 11:54:36 2018 -0500
Message Date
0 BUG: Fix empty Data frames to JSON round-tripp... 2018-06-08 23:40:03
1 BLD: include dll in package_data on Windows (#... 2018-06-08 23:32:20
2 REGR: allow merging on object boolean columns ... 2018-06-08 17:44:17
3 DOC: update multi-index term with MultiIndex (... 2018-06-08 17:39:11
4 Revert change to comparison op with datetime.d... 2018-06-08 16:54:36
df_commit['TimeZone']=df_commit.DateString.str[-5:]
import re
df_commit["Type"] = df_commit.Message.str.extract(r"^([A-Z/]{2,12})", flags=re.MULTILINE)
<div class="se-preview-section-delimiter"></div>
C:\Users\Administrator\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
This is separate from the ipykernel package so we can avoid doing imports until
df_commit.head(10)
 | Author | DateString | Message | Date | TimeZone | Type |
---|---|---|---|---|---|---|
0 | Pyry Kovanen | Sat Jun 9 02:40:03 2018 +0300 | BUG: Fix empty Data frames to JSON round-tripp… | 2018-06-08 23:40:03 | +0300 | BUG |
1 | chris-b1 | Fri Jun 8 18:32:20 2018 -0500 | BLD: include dll in package_data on Windows (#… | 2018-06-08 23:32:20 | -0500 | BLD |
2 | Joris Van den Bossche | Fri Jun 8 19:44:17 2018 +0200 | REGR: allow merging on object boolean columns … | 2018-06-08 17:44:17 | +0200 | REGR |
3 | Fábio Rosado | Fri Jun 8 18:39:11 2018 +0100 | DOC: update multi-index term with MultiIndex (… | 2018-06-08 17:39:11 | +0100 | DOC |
4 | Tom Augspurger | Fri Jun 8 11:54:36 2018 -0500 | Revert change to comparison op with datetime.d… | 2018-06-08 16:54:36 | -0500 | NaN |
5 | Damini Satya | Fri Jun 8 09:50:20 2018 -0700 | Fix #21356: JSON nested_to_record Silently Dro… | 2018-06-08 16:50:20 | -0700 | NaN |
6 | Joris Van den Bossche | Fri Jun 8 18:41:49 2018 +0200 | DOC: update whatsnew 0.23.1 (#21387) | 2018-06-08 16:41:49 | +0200 | DOC |
7 | Tom Augspurger | Fri Jun 8 11:27:13 2018 -0500 | REGR: NA-values in ctors with string dtype (#2… | 2018-06-08 16:27:13 | -0500 | REGR |
8 | Uddeshya Singh | Fri Jun 8 21:55:51 2018 +0530 | BUG: invalid rolling window on empty input (#2… | 2018-06-08 16:25:51 | +0530 | BUG |
9 | Dr. Irv | Fri Jun 8 07:34:33 2018 -0400 | BUG: Series.combine() fails with ExtensionArra… | 2018-06-08 11:34:33 | -0400 | BUG |
df_commit.set_index('Date',drop=False,inplace=True)
df_commit.sort_index(inplace=True)
df_commit.head()
 | Author | DateString | Message | Date | TimeZone | Type |
---|---|---|---|---|---|---|
Date | | | | | | |
2009-08-05 02:32:49 | Wes McKinney | Wed Aug 5 02:32:49 2009 +0000 | adding trunk\ngit-svn-id: http://pandas.google… | 2009-08-05 02:32:49 | +0000 | NaN |
2009-08-05 02:33:13 | Wes McKinney | Wed Aug 5 02:33:13 2009 +0000 | oops\ngit-svn-id: http://pandas.googlecode.com… | 2009-08-05 02:33:13 | +0000 | NaN |
2009-08-05 03:17:29 | Wes McKinney | Wed Aug 5 03:17:29 2009 +0000 | added svn:ignore\ngit-svn-id: http://pandas.go… | 2009-08-05 03:17:29 | +0000 | NaN |
2009-08-05 03:30:16 | Wes McKinney | Wed Aug 5 03:30:16 2009 +0000 | first commit with cleaned up code\ngit-svn-id:… | 2009-08-05 03:30:16 | +0000 | NaN |
2009-08-05 03:40:05 | Wes McKinney | Wed Aug 5 03:40:05 2009 +0000 | minor edit\ngit-svn-id: http://pandas.googleco… | 2009-08-05 03:40:05 | +0000 | NaN |
timedelta=df_commit.Date.diff(1).dropna()
hour_delta=timedelta.dt.days*24+timedelta.dt.seconds/3600.0  # equivalently: timedelta.dt.total_seconds()/3600
ax=hour_delta.plot(kind='hist',figsize=(16,9),bins=100,histtype='step',lw=4,range=(0,5))
ax.set_xlabel('Hours')
Text(0.5,0,'Hours')
ax=df_commit.Author.resample('W').count().plot(kind='area',figsize=(16,9),color='red',lw=4)  # how='count' is deprecated; call .count() after resample()
ax.set_xlabel('date')
Text(0.5,0,'date')
ax=df_commit.Author.resample('M').count().plot(kind='area',figsize=(16,9),color='red',lw=4)
ax.set_xlabel('Month')
Text(0.5,0,'Month')
df_commit['Period']=df_commit.index.to_period('M')
df_commit.groupby('Period').count().plot(kind='area',figsize=(16,9),color='red',lw=4)
df_commit.groupby('Author').count()['Message']
Author
3553x                     1
A. Flaxman                1
ARF                       1
Aaron Barber              1
Aaron Critchley           9
Aaron Schumacher          1
Aaron Staple              2
Aaron Toth                1
AbdealiJK                 1
Abraham Flaxman          19
Acanthostega              1
Adam Chainz               1
Adam Gleave               1
Adam Greenhall            4
Adam Hooper               1
Adam J. Stewart           3
Adam Kim                  1
Adam Klein              327
Adam Marcus               2
Adam Obeng                5
Adam Smith                1
AdamShamlian              1
Adrian                    1
Adrian Liaw               1
Adrien Emery              1
Agustín Herranz           1
Ajay Saxena               5
Akash Tandon              1
Alan Du                   1
Alan Velasco              5
                       ...
tv3141                    1
tworec                    1
tzinckgraf                2
unknown                   3
unutbu                   18
verakai                   1
vytas                     1
waitingkuo                4
wandersoncferreira        1
watercrossing             1
wavedatalab               1
wcwagner                  4
westurner                 2
willayd                   1
xgdgsc                    1
xpvpc                     2
y-p                     943
yelite                    1
ysau                      1
yui-knk                   5
zach powers               3
zachcp                    1
zertrin                   1
zhanghui                  1
zhangjinjie               1
znmean                    1
zzgao                     1
Åsmund Hjulstad           1
Øystein S. Haaland        1
颜发才(Yan Facai)            1
Name: Message, Length: 1379, dtype: int64
s_Author=df_commit.Author.value_counts()  # an alternative way to count commits per author
crosstab() below counts each author's commits per month; the result df_counts has months as the row index and authors as the column index:
df_counts=pd.crosstab(df_commit.index.to_period('M'),df_commit.Author)
df_counts.index.name='Month'
df_counts.shape
(107, 1379)
df_counts[s_Author.head(5).index].plot(kind='area',subplots=True,figsize=(8,6),sharex=True,sharey=True)
array([<matplotlib.axes._subplots.AxesSubplot object at 0x000001EB75238898>,
<matplotlib.axes._subplots.AxesSubplot object at 0x000001EB7556CB00>,
<matplotlib.axes._subplots.AxesSubplot object at 0x000001EB75596B70>,
<matplotlib.axes._subplots.AxesSubplot object at 0x000001EB755C1BE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x000001EB755EBC50>],
dtype=object)
daily_commit = df_commit.index.to_period("D").value_counts()  # commits per day
daily_commit.index = pd.MultiIndex.from_arrays([daily_commit.index.asfreq("W"),  # (week, weekday) MultiIndex
                                                daily_commit.index.weekday])
daily_commit = daily_commit.sort_index()
active_data = daily_commit.unstack(0).iloc[:, -60:].fillna(0)  # weekday x week grid for the last 60 weeks
#%fig=activity chart of the pandas project
import matplotlib.pyplot as pl
fig, ax = pl.subplots(figsize=(15, 4))
ax.set_aspect("equal")
ax.pcolormesh(active_data.values, cmap="Greens",
              vmin=0, vmax=active_data.values.max() * 0.75)  # cap the color scale so busy weeks don't wash out the rest
tick_locs = np.arange(3, 60, 10)
ax.set_xticks(tick_locs + 0.5)
ax.set_xticklabels(active_data.columns[tick_locs].to_timestamp(how="start").format())
ax.set_yticks(np.arange(7) + 0.5)
from pandas.tseries.frequencies import DAYS
ax.set_yticklabels(DAYS)

store=pd.HDFStore('aqi.hdf5')
df_aqi=store.select('aqi')
df_aqi.head()
 | Time | City | Position | AQI | Level | PM2_5 | PM10 | CO | NO2 | O3 | SO2 |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2014-04-11 15:00:00 | 上海 | 普陀 | 76.0 | 良 | 49.0 | 101.0 | 0.000 | 0.0 | 0.0 | 0.0 |
2 | 2014-04-11 15:00:00 | 上海 | 十五厂 | 72.0 | 良 | 52.0 | 94.0 | 0.479 | 53.0 | 124.0 | 9.0 |
3 | 2014-04-11 15:00:00 | 上海 | 虹口 | 80.0 | 良 | 59.0 | 98.0 | 0.612 | 52.0 | 115.0 | 11.0 |
4 | 2014-04-11 15:00:00 | 上海 | 徐汇上师大 | 74.0 | 良 | 54.0 | 87.0 | 0.706 | 43.0 | 113.0 | 14.0 |
5 | 2014-04-11 15:00:00 | 上海 | 杨浦四漂 | 84.0 | 良 | 62.0 | 99.0 | 0.456 | 43.0 | 82.0 | 9.0 |
df_aqi['Date']=pd.to_datetime(df_aqi.Time)
df_aqi.set_index(df_aqi.index.to_period('D'),drop=False,inplace=True)  # inplace=True is required for df_aqi itself to end up with the PeriodIndex
df_aqi.head()
 | Time | City | Position | AQI | Level | PM2_5 | PM10 | CO | NO2 | O3 | SO2 | Date |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | | | | | | | | | | | | |
2014-04-11 | 2014-04-11 15:00:00 | 上海 | 普陀 | 76.0 | 良 | 49.0 | 101.0 | 0.000 | 0.0 | 0.0 | 0.0 | 2014-04-11 15:00:00 |
2014-04-11 | 2014-04-11 15:00:00 | 上海 | 十五厂 | 72.0 | 良 | 52.0 | 94.0 | 0.479 | 53.0 | 124.0 | 9.0 | 2014-04-11 15:00:00 |
2014-04-11 | 2014-04-11 15:00:00 | 上海 | 虹口 | 80.0 | 良 | 59.0 | 98.0 | 0.612 | 52.0 | 115.0 | 11.0 | 2014-04-11 15:00:00 |
2014-04-11 | 2014-04-11 15:00:00 | 上海 | 徐汇上师大 | 74.0 | 良 | 54.0 | 87.0 | 0.706 | 43.0 | 113.0 | 14.0 | 2014-04-11 15:00:00 |
2014-04-11 | 2014-04-11 15:00:00 | 上海 | 杨浦四漂 | 84.0 | 良 | 62.0 | 99.0 | 0.456 | 43.0 | 82.0 | 9.0 | 2014-04-11 15:00:00 |
df_aqi.City.value_counts()
天津 134471
北京 109999
上海 92745
天津市 13
北京市 12
上海市 10
Name: City, dtype: int64
df_aqi['City']=df_aqi.City.str.replace('市','')  # drop the trailing '市' to merge the duplicate city names
df_aqi.City.value_counts()
天津 134484
北京 110011
上海 92755
Name: City, dtype: int64
df_aqi.corr()
 | AQI | PM2_5 | PM10 | CO | NO2 | O3 | SO2 |
---|---|---|---|---|---|---|---|
AQI | 1.000000 | 0.943671 | 0.694076 | 0.610657 | 0.533769 | -0.136260 | 0.419722 |
PM2_5 | 0.943671 | 1.000000 | 0.569205 | 0.632692 | 0.555882 | -0.168933 | 0.425680 |
PM10 | 0.694076 | 0.569205 | 1.000000 | 0.460423 | 0.471571 | -0.136218 | 0.414261 |
CO | 0.610657 | 0.632692 | 0.460423 | 1.000000 | 0.565252 | -0.233338 | 0.537967 |
NO2 | 0.533769 | 0.555882 | 0.471571 | 0.565252 | 1.000000 | -0.439186 | 0.448082 |
O3 | -0.136260 | -0.168933 | -0.136218 | -0.233338 | -0.439186 | 1.000000 | -0.197512 |
SO2 | 0.419722 | 0.425680 | 0.414261 | 0.537967 | 0.448082 | -0.197512 | 1.000000 |
fig,ax=pl.subplots(figsize=(16,9))
ax.set_aspect('equal')
ax.pcolormesh(df_aqi.corr(),cmap='Greens',vmin=-0.9,vmax=1)
<matplotlib.collections.QuadMesh at 0x1eb1af6ff60>
fig,ax=pl.subplots(figsize=(16,9))
plot_dataframe_as_colormesh(df_aqi.corr(),ax=ax,colorbar=True,xtick_rot=90)
<matplotlib.axes._subplots.AxesSubplot at 0x1eb1b0c44a8>
ax.pcolormesh?
import pylab as pl
import numpy as np
def plot_dataframe_as_colormesh(df, ax=None, inverse_yaxis=False, colorbar=False, xtick_rot=0,
xtick_start=0, xtick_step=1, ytick_start=0, ytick_step=1,
xtick_format=None, ytick_format=None,
**kw):
nrow, ncol = df.shape
if ax is None:
fig_width = 10.0
fig_height = fig_width / ncol * nrow
fig, ax = pl.subplots(figsize=(fig_width, fig_height))
ax.set_aspect("equal")
if inverse_yaxis:
ax.invert_yaxis()
mesh = ax.pcolormesh(df.values, **kw)
if colorbar:
pl.colorbar(ax=ax, mappable=mesh)
xticks_loc = np.arange(xtick_start, ncol, xtick_step)
yticks_loc = np.arange(ytick_start, nrow, ytick_step)
xlabels = df.columns.tolist()
if xtick_format is not None:
xlabels = [xtick_format(label) for label in xlabels]
ylabels = df.index.tolist()
if ytick_format is not None:
ylabels = [ytick_format(label) for label in ylabels]
ax.set_xticks(xticks_loc + 0.5)
ax.set_xticklabels([xlabels[idx] for idx in xticks_loc], rotation=xtick_rot)
ax.set_yticks(yticks_loc + 0.5)
ax.set_yticklabels([ylabels[idx] for idx in yticks_loc])
return ax
daily_city_groupby=df_aqi.groupby(['Date','City'])
C:\Users\Administrator\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: 'Date' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
"""Entry point for launching an IPython kernel.
pl.rcParams['font.family']='STSong'  # set a CJK-capable font before plotting so the city names render
mean_PM=daily_city_groupby.PM2_5.mean().unstack(1)
mean_PM.plot(kind='hist',histtype='step',bins=20,density=True,lw=2)  # 'normed' is deprecated in favor of 'density'
pl.grid()
mean_PM.plot(kind='kde',lw=2)
pl.xlim(0,800)
(0, 800)
mean_PM.corr()
City | 上海 | 北京 | 天津 |
---|---|---|---|
City | | | |
上海 | 1.000000 | -0.131391 | 0.032613 |
北京 | -0.131391 | 1.000000 | 0.583861 |
天津 | 0.032613 | 0.583861 | 1.000000 |
fig,ax=pl.subplots(figsize=(4,3))
plot_dataframe_as_colormesh(mean_PM.corr(),ax=ax,colorbar=True,xtick_rot=90)
<matplotlib.axes._subplots.AxesSubplot at 0x1eb1b6894e0>
week_mean=df_aqi.groupby([df_aqi.Time.dt.dayofweek,'City']).PM2_5.mean()
ax=week_mean.unstack(1).plot(kind='bar')  # plot kinds are lowercase; 'Bar' raises an error
from pandas.tseries.frequencies import DAYS
ax.set_xticklabels(DAYS)
ax.legend(loc='best',mode='expand',bbox_to_anchor=(0,1.2,1,0.102))
ax.grid(True)
hour_mean=df_aqi.groupby([df_aqi.Time.dt.hour,'City']).PM2_5.mean()
ax=hour_mean.unstack(1).plot(kind='bar',figsize=(16,9))
ax.legend(loc='best',mode='expand',bbox_to_anchor=(0,1.,1,0.102),ncol=3)
ax.grid(True)
bj_weather=df_aqi.query('City=="北京"')
month_place_PM=bj_weather.groupby([bj_weather.Time.dt.to_period('M'),'Position']).PM2_5.mean()
place_mean=month_place_PM.mean(level=1).sort_values()
place_mean.plot(kind='bar',color='green',figsize=(16,9))
<matplotlib.axes._subplots.AxesSubplot at 0x1eb219d69b0>
place=place_mean.iloc[[0,1,-2,-1]].index
ax=month_place_PM.unstack(1).loc[:,place].plot(kind='bar',figsize=(16,9),width=0.8)
ax.grid()