# [residue from the source web page: "like" / "dislike" buttons]
import numpy as np
import pandas as pd
# Signature reference (not executable code):
# pd.merge(left, right, how: str = 'inner', on=None, left_on=None, right_on=None, left_index: bool = False,
#          right_index: bool = False, sort: bool = False, suffixes=('_x', '_y'), copy: bool = True,
#          indicator: bool = False, validate=None)
# Left table: one row per user with the user's age.
df_1 = pd.DataFrame(dict(
    userid=list("abcd"),
    age=[23, 46, 32, 19],
))
df_1
#   userid  age
# 0      a   23
# 1      b   46
# 2      c   32
# 3      d   19

# Right table: payments, present for only some of the users.
df_2 = pd.DataFrame(dict(
    userid=list("ac"),
    payment=[2000, 3500],
))
df_2
#   userid  payment
# 0      a     2000
# 1      c     3500

# Method form: the default how='inner' keeps only userids found in both tables.
df_1.merge(df_2, on="userid")
#   userid  age  payment
# 0      a   23     2000
# 1      c   32     3500

# Function form: same join, same result.
pd.merge(df_1, df_2, on="userid")
#   userid  age  payment
# 0      a   23     2000
# 1      c   32     3500
# One-to-many join: user 'a' appears twice in the payments table,
# so the matching left-hand row is repeated once per payment.
df_1 = pd.DataFrame(dict(
    userid=list("abcd"),
    age=[23, 46, 32, 19],
))
df_2 = pd.DataFrame(dict(
    userid=list("acad"),
    payment=[2000, 3500, 500, 1000],
))
df_1.merge(df_2, on="userid")
#   userid  age  payment
# 0      a   23     2000
# 1      a   23      500
# 2      c   32     3500
# 3      d   19     1000
# Tables for the left / right / outer join examples: 'b' and 'd' exist only
# on the left, 'e' exists only on the right.
df_1 = pd.DataFrame(dict(
    userid=list("abcd"),
    age=[23, 46, 32, 19],
))
df_2 = pd.DataFrame(dict(
    userid=list("ace"),
    payment=[2000, 3500, 600],
))

# Left join: every left row is kept; a missing payment becomes NaN.
df_1.merge(df_2, how="left", on="userid")
#   userid  age  payment
# 0      a   23   2000.0
# 1      b   46      NaN
# 2      c   32   3500.0
# 3      d   19      NaN

# Right join: every right row is kept; a missing age becomes NaN.
df_1.merge(df_2, how="right", on="userid")
#   userid   age  payment
# 0      a  23.0     2000
# 1      c  32.0     3500
# 2      e   NaN      600

# Outer join: union of the keys from both tables.
df_1.merge(df_2, how="outer", on="userid")
#   userid   age  payment
# 0      a  23.0   2000.0
# 1      b  46.0      NaN
# 2      c  32.0   3500.0
# 3      d  19.0      NaN
# 4      e   NaN    600.0
# Signature reference (not executable code):
# DataFrame.set_index(keys, drop=True, append=False, inplace=False, verify_integrity=False)
# Sample sales table shared by the set_index examples below.
df = pd.DataFrame({'month': [1, 4, 7, 10],
                   'year': [2012, 2014, 2013, 2014],
                   'sale': [55, 40, 84, 31]})
df
#    month  year  sale
# 0      1  2012    55
# 1      4  2014    40
# 2      7  2013    84
# 3     10  2014    31

# Use the 'month' column as the index; by default the column is removed
# from the data (drop=True) and a new frame is returned.
df.set_index('month')
#        year  sale
# month
# 1      2012    55
# 4      2014    40
# 7      2013    84
# 10     2014    31

# drop=False keeps 'month' as a regular column as well as the index.
df.set_index('month', drop=False)
#        month  year  sale
# month
# 1          1  2012    55
# 4          4  2014    40
# 7          7  2013    84
# 10        10  2014    31

# append=True adds 'month' to the existing index instead of replacing it.
df.set_index('month', append=True)

# None of the calls above modified df, so label 0 is still the first row.
df.loc[0]
# month       1
# year     2012
# sale       55
# Name: 0, dtype: int64

# inplace=True modifies df itself instead of returning a new frame.
df.set_index('month', inplace=True)
df
#        year  sale
# month
# 1      2012    55
# 4      2014    40
# 7      2013    84
# 10     2014    31

# A Series can also be passed as the new index (returns a new frame).
df.set_index(pd.Series(range(4)))
#    year  sale
# 0  2012    55
# 1  2014    40
# 2  2013    84
# 3  2014    31
# Signature reference (not executable code):
# DataFrame.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
# Sample ratings table shared by the drop_duplicates examples below.
# Rows 0 and 1 are identical in every column.
df = pd.DataFrame({
    'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
    'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
    'rating': [4, 4, 3.5, 15, 5]
})
df
#      brand style  rating
# 0  Yum Yum   cup     4.0
# 1  Yum Yum   cup     4.0
# 2  Indomie   cup     3.5
# 3  Indomie  pack    15.0
# 4  Indomie  pack     5.0

# Default: compare all columns and keep the first row of each duplicate group.
df.drop_duplicates()
#      brand style  rating
# 0  Yum Yum   cup     4.0
# 2  Indomie   cup     3.5
# 3  Indomie  pack    15.0
# 4  Indomie  pack     5.0

# Compare on 'brand' only: the first row of each brand is kept.
df.drop_duplicates(subset=['brand'])
#      brand style  rating
# 0  Yum Yum   cup     4.0
# 2  Indomie   cup     3.5

# Compare on (brand, style) and keep the last row of each group.
df.drop_duplicates(subset=['brand', 'style'], keep='last')
#      brand style  rating
# 1  Yum Yum   cup     4.0
# 2  Indomie   cup     3.5
# 4  Indomie  pack     5.0

# df itself is unchanged (inplace=False), so it still has its original index.
df.index
# RangeIndex(start=0, stop=5, step=1)
df.index.tolist()
# [0, 1, 2, 3, 4]
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# Load the video dataset; the file is decoded as GBK.
data = pd.read_csv('爱奇艺视频数据.csv', encoding="gbk")
# Print column names, non-null counts and dtypes.
data.info()
#<class 'pandas.core.frame.DataFrame'>
#RangeIndex: 99999 entries, 0 to 99998
#Data columns (total 24 columns):
# #   Column   Non-Null Count  Dtype
#---  ------   --------------  -----
# 0   数据获取日期   99999 non-null  object
# 1   演员       97981 non-null  object
# 2   视频ID     99999 non-null  object
# 3   详细链接     99998 non-null  object
# 4   剧名       99999 non-null  object
# 5   状态       99158 non-null  object
# 6   类型       99999 non-null  object
# 7   来源平台     99999 non-null  object
# 8   整理后剧名    99999 non-null  object
# 9   更新时间     644 non-null    object
# 10  上映时间     78755 non-null  float64
# 11  语言       85926 non-null  object
# 12  评分       99970 non-null  float64
# 13  地区       98728 non-null  object
# 14  上映年份     78755 non-null  float64
# 15  简介       99970 non-null  object
# 16  导演       97614 non-null  object
# 17  差评数      99970 non-null  float64
# 18  评分人数     99970 non-null  float64
# 19  播放量      99453 non-null  float64
# 20  更新至      1272 non-null   float64
# 21  总集数      98871 non-null  float64
# 22  第几季      99999 non-null  int64
# 23  好评数      99970 non-null  float64
#dtypes: float64(9), int64(1), object(14)
#memory usage: 18.3+ MB
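# Illustrative read_csv options (`sio` is a placeholder input that is not defined in this file):
# pd.read_csv(sio, dtype={"user_id": int, "username": object})
# pd.read_csv(sio, low_memory=False)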
# Peek at the first three rows and list the column names.
data.head(3)
data.columns
#Index(['数据获取日期', '演员', '视频ID', '详细链接', '剧名', '状态', '类型', '来源平台', '整理后剧名',
#       '更新时间', '上映时间', '语言', '评分', '地区', '上映年份', '简介', '导演', '差评数', '评分人数',
#       '播放量', '更新至', '总集数', '第几季', '好评数'],
#      dtype='object')

# Per-director totals of positive ratings ('好评数') and number of raters ('评分人数').
data.groupby('导演')[['好评数','评分人数']].sum()
#                                          好评数       评分人数
#导演
#Exact                                375172.0   458543.0
#John Fawcett Steve Dimarco Paul Fox 1477942.0  1729878.0
#Michael Cuesta                       527348.0   604104.0
#Michael Dinner                      1032245.0  1312847.0
#Michael Engler                        47804.0    61844.0
#...                                       ...        ...
#龚朝                                    4634.0     8620.0
#龚朝/杨巧文/王伟仁                       676160.0   964912.0
#龚朝晖                                4044245.0  5941895.0
#龚艺群                                 194079.0   290358.0
#龚若飞                                  29126.0    43151.0
#1196 rows × 2 columns

# Select the two numeric columns BEFORE aggregating. Summing every column and
# selecting afterwards also tries to aggregate the text columns, which is
# wasteful and can fail on pandas >= 2.0 (non-numeric columns are no longer
# dropped silently).
df_q1 = data.groupby('导演')[['好评数', '评分人数']].sum()
# Positive-rating rate ('好评率') = positive ratings / number of raters.
df_q1['好评率'] = df_q1['好评数'] / df_q1['评分人数']
df_q1
#                                          好评数       评分人数       好评率
#导演
#Exact                                375172.0   458543.0  0.818183
#John Fawcett Steve Dimarco Paul Fox 1477942.0  1729878.0  0.854362
#Michael Cuesta                       527348.0   604104.0  0.872942
#Michael Dinner                      1032245.0  1312847.0  0.786265
#Michael Engler                        47804.0    61844.0  0.772977
#...                                       ...        ...       ...
#龚朝                                    4634.0     8620.0  0.537587
#龚朝/杨巧文/王伟仁                       676160.0   964912.0  0.700748
#龚朝晖                                4044245.0  5941895.0  0.680632
#龚艺群                                 194079.0   290358.0  0.668413
#龚若飞                                  29126.0    43151.0  0.674979
#1196 rows × 3 columns

# The 20 directors with the highest positive-rating rate.
result_q1 = df_q1.sort_values('好评率', ascending=False).head(20)
result_q1
# Matplotlib version of the director bar chart.
# Use a font that contains Chinese glyphs so the labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign rendering correctly with that font.
plt.rcParams['axes.unicode_minus'] = False
# Figure resolution: 100 dpi.
plt.rcParams['figure.dpi'] = 100
# Figure size.
plt.rcParams['figure.figsize'] = (10, 3)
# Draw one bar per director.
plt.bar(result_q1.index, result_q1['好评率'])
# Restrict the y-axis range.
plt.ylim(0.98, 1)
# Slant the x-axis tick labels.
plt.xticks(rotation=70)
# Dashed grid lines.
plt.grid(True, linestyle='--')
# Pandas version of the director chart: bar plot of the positive-rating
# rate column of result_q1, one bar per director.
result_q1['好评率'].plot.bar(
    title='不同导演电影的好评率',
    figsize=(12, 4),
    ylim=[0.98, 1],
    grid=True,
    rot=45,
    alpha=0.4,
    width=0.8,
    color='b',
)
# Total number of raters ('评分人数') per release year ('上映年份').
movie_year = data[['上映年份', '评分人数']].groupby('上映年份').sum()
# Keep only the release years from 2000 onwards (label-based slice).
movie_year_2000 = movie_year.loc[2000:]
# Area chart of raters over release year.
plt.stackplot(
    movie_year_2000.index,
    movie_year_2000['评分人数'],
)
# Per-title totals of raters ('评分人数') and positive ratings ('好评数').
movie_title_group = (
    data[['整理后剧名', '评分人数', '好评数']]
    .groupby('整理后剧名')
    .sum()
)
# The 20 titles with the most positive ratings.
result_title = movie_title_group.sort_values(by='好评数', ascending=False).head(20)
result_title
# Bar chart of the positive-rating counts for those titles.
result_title['好评数'].plot.bar(
    title='不同剧的好评数',
    figsize=(12, 4),
    ylim=[1.3e+08, 1.1e+09],
    grid=True,
    rot=45,
    alpha=0.4,
    width=0.8,
    color='b',
)
# Copyright © 2003-2013 www.wpsshop.cn. All rights reserved.