赞
踩
目录
上机练习 12---使用爬虫+pandas+os 获取处理导入数据
- import pandas as pd
- import os
- import json
- import pymysql
- # 1. 创建以日期为时间为名字的日志文件,格式如 20230303102030.log# os.system("nowaday='$(date +%Y%m%d%H%M%S)' && touch
- /root/$nowaday.log")
- # 2. 字典{1: "Google", 2: "Runoob", 3: "Wiki"},转成 series 数据,
- 打印数据,提
- # 取"Runoob"打印
- # sites = {"a": "Google", "b": "Runoob", 3: "Wiki"}
- # print(pd.Series(sites))
- # print(sites["b"])
- # 3. 列表[["Google",10],["Runoob",12],["Wiki",13]],转成 DataFrame
- 数据打印
- # data = [["Google",10],["Runoob",12],["Wiki",13]]
- # print(pd.DataFrame(data))
- # 4. 将[{"a": 1, "b": 2},{"a": 5, "b": 10, "c": 20}]转成 DataFrame
- 数据,提取第 2 行 a 那一列的
- # 数据
- # data = [{"a": 1, "b": 2},{"a": 5, "b": 10, "c": 20}]
- # df=pd.DataFrame(data)
- # print(df.loc[1,"a"])
- # 5. 将 data = {"语文": [89,97,68,56,88,77],"数学":
- [99,67,100,78,89,66],"英语":
- # [73,57,89,90,82,55]}转成 DataFrame 数据,提取三科分数都及格的数据
- # data = {"语文": [89,97,68,56,88,77],"数学":
- [99,67,100,78,89,66],"英语":[73,57,89,90,82,55]}
- # df=pd.DataFrame(data)
- # for i in df.index:
- #
- if df.loc[i,"语文"]>=60 and df.loc[i,"数学"]>=60 and
- df.loc[i,"英语"]>=60:
- #
- print(df.loc[i])
- # 6. 使用 pandas 处理 douban.txt,提取列 id,title,rate,并且提取 rate
- 大于 7.5 的行导出
- # douban1.csv,使用 os 库调用 shell 脚本 mysqlcsv.sh 自动导入
- douban1.csv 到 mysql 数
- # 据库,表名 douban1
- # dict1=[]
- # with open("/root/douban.txt", "r", encoding="utf-8") as f:
- #
- result=f.read()
- #
- dict1=json.loads(result)
- #
- f.close()
- # df=pd.DataFrame(dict1["subjects"])
- # df=df[df.rate.astype(float) > 7.5 ][["id","title","rate"]] #
- 改变顺序和列数
- # print(df)
- # df.to_csv("/root/douban1.csv",index=False,header=False)# os.system(" /root/shell/mysqlcsv.sh ")
- # 7. 把 order.xlsx 使用 mobox 传入到/root/python 中,根据文档做如下操
- 作
- # 把文档数据(从列名开始)转换为 dataframe 输出
- # 在产品后面增加一列采购人,内容为 ["坤坤","杰杰","坤坤","丽丽","丽
- 丽","坤坤"]
- # 查看金额大于 100 的坤坤的订单列表
- # 查看金额大于 100 的坤坤的订单列表的产品及金额列
- # 把上一步的结果另存为 order_kunkun.xlsx,表单名称为坤坤,不显示索引,
- 如下:
- # 产品 金额
- # 投影仪 2000
- # 打印机 298
- # df =
- pd.read_excel("/root/python/order.xlsx",sheet_name="Sheet1",he
- ader=1)
- # print(df)
- # df.insert(2,"采购人",["坤坤","杰杰","坤坤","丽丽","丽丽","坤坤
- "])
- # print(df[df["采购人"]=="坤坤"][ df["金额"]>100])
- # df_kunkun = df[df["采购人"]=="坤坤" ][ df["金额"]>100][["产品","
- 金额"]]
- #
- df_kunkun.to_excel("order_kunkun.xlsx",sheet_name="xs",index=F
- alse)
- # mf=pd.read_excel("order_kunkun.xlsx",sheet_name="坤坤")
- # print(mf)

- import requests
- import pandas as pd
- import os
- # 伪装网站
- headers1={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64)
- AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71
- Safari/537.36"}
- url1="https://movie.douban.com/j/chart/top_list" #爬取网页
- # 爬取参数
- params1={"type":"25","interval_id":"100:90","action":"","start
- ":"0","limit":"20"}
- response=requests.get(url=url1,headers=headers1,params=params1
- )
- # 获得前 20 个电影的 json 数据
- GetData=response.json()
- # print(GetData)
- print("爬取中......")
- # 将后续电影的 json 数据追加到 GetData 中
- while True:
- params1["start"]=str(int(params1["start"])+20)
- response=requests.get(url=url1,headers=headers1,params=para
- ms1)NewData=response.json()
- if NewData!=[]:
- GetData+=NewData
- # print(GetData)
- print("爬取中......")
- else:
- break
- print("数据爬取完毕,开始解析数据......")
- for i in GetData:
- i["rating"]=i["rating"][1]
- # 数据存入 excel 文件
- print("正在更新 excel 文件......")
- DisposalData=pd.DataFrame(GetData)[["id","title","release_date
- ","score","rating"]]
- DisposalData.to_csv("/root/python/WedDouban/reallydouban.csv",
- index=False,header=False,encoding="utf-8")
- # 存入数据库
- print("正在更新数据库......")
- os.system("cp /root/python/WedDouban/reallydouban.csv
- /usr/local/mysql/data/")
- os.system(" /root/python/WedDouban/doubancsv.sh")

- host="127.0.0.1"
- port="3306"
- user="root"
- passwd="root123456"
- dbname="test"
- # 编写 shell 脚本/root/shell/mysqlcsv.sh
- # 1.如果 douban 表存在则删除
- mysql1="drop table if exists DoubanMovieType"
- mysql -h$host -P$port -u$user -p$passwd $dbname -e "$mysql1"
- # 2.如果 douban 表不存在则新建
- mysql2="
- create table if not exists DoubanMovieType(
- id varchar(20) primary key,
- title varchar(20),
- release_date varchar(50),
- score varchar(20),rating varchar(40)
- )"
- mysql -h$host -P$port -u$user -p$passwd $dbname -e "$mysql2"
- # # 3.导入 douban.csv 数据到 douban 表中,建表过程为根据导入的 csv 文件
- 自动创建表
- mysql3="LOAD DATA INFILE
- '/usr/local/mysql/data/reallydouban.csv' INTO TABLE
- DoubanMovieType
- CHARACTER SET utf8
- FIELDS TERMINATED BY ','
- LINES TERMINATED BY '\n'
- IGNORE 1 LINES"
- mysql -h$host -P$port -u$user -p$passwd $dbname -e "$mysql3"
- # # 4.查询 douban 表验证结果
- # mysql4="select * from douban1"
- # mysql -h$host -P$port -u$user -p$passwd $dbname -e "$mysql4"

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。