赞
踩
requests作为爬虫的基础库,在我们快速爬取和反爬破解中起到很重要的作用, 其中的知识点大概有以下几个方面:
1- requests.get
… get请求获取数据
2- requests.post
…post请求获取数据
1- response.text
响应体str类型
2- response.encoding
从HTTP header中获取响应内容的编码方式
3- response.content
响应体bytes类型
4- response.status_code
响应状态码
5- response.request.headers
响应对应的请求头
6- response.headers
响应头
7- response.request._cookies
响应对应请求的cookie
8- response.cookies
响应的cookie(经过了set-cookie动作)
9- response.url
获取访问的url
10- response.json()
获取json数据 得到的内容为字典(如果接口响应体的格式是json格式时)
11- response.ok
如果status_code 小于400则返回True, 否则返回False
1- 使用requests获取json数据。 配置cookie和user-agent
import requests
import json
from lxml import etree
import pandas as pd


def get_pro(url, timeout=10):
    """Fetch a parametric-search JSON document and flatten it into rows.

    Sends a GET request with a fixed set of browser-like headers (including
    a captured ``Cookie`` header), decodes the body as UTF-8 JSON and reads
    the ``"ParametricResults"`` list from it.

    Args:
        url: Address of the JSON endpoint to query.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        A list with one ``[name, features, svg, description]`` list per
        product, taken from the ``"o3"``, ``"o7"``, ``"o10"`` and
        ``"p2192"`` keys of each result (``None`` where a key is absent).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        KeyError: If the response has no ``"ParametricResults"`` entry.
    """
    headers = {
        "User-Agent": "PostmanRuntime",
        "Host": "www.ti.com.cn",
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.9",
        # NOTE(review): "br" is presumably only decoded transparently when a
        # brotli package is installed -- confirm, or drop it from this list.
        "Accept-Encoding": "gzip, deflate, br",
        # NOTE(review): this looks like a session cookie captured from a
        # browser; it has presumably expired and must be refreshed before use.
        "Cookie": (
            "user_pref_language='zh-CN'; "
            "user_pref_currency='USD' ; "
            "last-domain=www.ti.com.cn; "
            "CONSENTMGR=ts:1663841657843%7Cconsent:true; "
            "tiSessionID=018364b0e3f4001314def085217c0506d001606500bd0; "
            "Qs_lvt_470423=1663841658; "
            "_gid=GA1.3.938983574.1663841661; "
            "_ga=GA1.3.1093917085.1663841661; "
            "_pxvid=53ce593f-3a5f-11ed-9086-4a5457546470; "
            "_gcl_au=1.1.1202026322.1663841671; "
            "__adroll_fpc=98c78cff267aae66ab4a0d83ec58a750-1663841671904; "
            "ELOQUA=GUID=4BEDAA278D4541A38864991D291B83A8; "
            "__ar_v4=QFXRHQEHOJDMLHSLFIWCLO%3A20220922%3A4%7C2XNKMR6P4VGD5MD3ZP4SQR%3A20220922%3A4%7CG3YHLXUICZC3XKDXYVVLO4%3A20220922%3A4; "
            "bm_sz=D41B60D0ECA1409E6269C65B0E1D8A7D~YAAQe0InO1HJUV2DAQAAlX7GZBEy8cD3cfIKOSojZGgbg/3W35pnD/IimJLSz2DPb7V4nP2QVr7dSbziplcts1ToAzEN/hKl3KP3iRIEhDrbuzwG09lnQJKByFgRMRcffhpJwlCavomYd+qST6ol+2b0i3yJ66K31G7ZcHnVKbOFttH7IPmHWgC57Nez5vjYNuyrA3SFHcPfDsM9qQNyGrF4XXGzZ5AJmFHPtm2mtD2G3dpvpQwoIjXMAfMf3c2X/ZlUxmwf3O16rYbZFr7fyIemhONdLmWmd4zCGJjoO6u65bfeO26XuSsSgF1gvPdKDs1CZ9L+xLO/GKuVQdRmLC9gDfnw0HfpoZCLcDKCW1aSCqCy2kcZuIaHzVU42qXUC1kwLKgNGS9R3jYKWI1nM6X9I0B1Q2UGqpxXN1sVMoyucRK5Fw==~3424563~3617078; "
            "ti_geo=country=CN|city=GUANGZHOU|continent=AS|tc_ip=121.35.2.123; "
            "ti_ua=Mozilla%2f5.0%20(Windows%20NT%2010.0%3b%20WOW64)%20AppleWebKit%2f537.36%20(KHTML,%20like%20Gecko)%20Chrome%2f96.0.4664.93%20Safari%2f537.36; "
            "ti_bm=; "
            "userType=Anonymous; "
            "gpn=Non-Product; "
            "mediav=%7B%22eid%22%3A%221171998%22%2C%22ep%22%3A%22%22%2C%22vid%22%3A%22V%3EV(o0%3EVk*9X'pHa%5E5wL%22%2C%22ctn%22%3A%22%22%2C%22vvid%22%3A%22V%3EV(o0%3EVk*9X'pHa%5E5wL%22%2C%22_mvnf%22%3A1%2C%22_mvctn%22%3A0%2C%22_mvck%22%3A0%2C%22_refnf%22%3A1%7D; "
            "_gat_ga_main_tracker=1; "
            "pxcts=84a68a88-3a70-11ed-98a4-4b4e476b5057; "
            "_pxff_cc=U2FtZVNpdGU9TGF4Ow==; "
            "bm_mi=E72E8E2998A099B60812D95CC458923B~YAAQZ8U8t1xK22GDAQAACK8hZRFEG3hOh/qD2QOoKt5PooKubQFRUJwuU41ZsXh5AxXscvosxmKQUwVzWiXMd3GpsKORnhGzlpqE3eoqYFIvpIRH+rmKPRVmJ5IAzT3q1UShVrDcNrLIrHA16d1iyZUOMuhpkGVlsWwU8UlswgIQYaKb0DSqoF7a3C1gbgZIQgs/pPqQx0K9GnFZgXFbm8yXck1FKFMZ8ijjhO020Auuaty+Yg88DUFThnVAl6nYI+GxNvUkTbAKD0lY5T3AQtZpPtOkcfbHmqC0gk+f2wBCGe1YlACR3G6zaVTcjLtjLK4eDBuOoUlHwtGjTsTRLGNHUrIfM6uPJdEPl0eLqGoLDafDRqgnV7o3nll4mmLpBLUtTTc=~1; "
            "ticontent=%2Fanalog%20and%20mixed-signal%2Fclocks%20%26%20timing%2Freal-time%20clocks%20(rtcs)%20%26%20timers; "
            "ga_content_cookie=%2Fanalog%20and%20mixed-signal%2Fclocks%20%26%20timing%2Freal-time%20clocks%20(rtcs)%20%26%20timers; "
            "da_lid=BCDD00D89A72EA16BCE2BB99F530830CBE|0|0|0; "
            "da_sid=8FEE33EB8E32AE8D29B3AA13B732C9070D|4|0|3; "
            "da_intState=; "
            "ak_bmsc=7976EE9DF88BE800A430E7FF6F5C60CB~000000000000000000000000000000~YAAQZ8U8t3tL22GDAQAANLUhZRHMmX+f/4J2VKsWUd57hlQx2SIQlCFMxujU0UFRHEsYIHVWbs99CiJl1Mf66ONAzkD42jge+OPSQtMOPwrthxtHjt+ioAy+xT7+wJBPVuJZ/ONitbO0Sdbb5YPTtOmVs/yKk3aJuF5Gyk3kf7cDo5Q3PUQBREpgVqMZRlszTzB6T3V0Fzv+vtB9BKDlNX9T24GfOal8SW6OOdLMtbx/BIMKflbTbTAsPFllIvEdhU97Su9U9T+X44IO0C1yGm4xUOffCB+20fnKxxa0fB0OC71r6snUflU57iw1m93MrgYtET7SSKVE5hdjtL4JMPQaoPLN0URPPV3wnh0K662XTZ10JHOdfUmFSFwcqLMR1KAZaryVN6deYbhqJ5rQXw3MEbzlNLY3P9kjLYorpc1CjQske9Yx57Qaq8a3EG8DLZY6VK/XFyfKSDlFglPFXYULEHs=; "
            "Qs_pv_470423=2745392645362471000%2C4285938632797885400%2C2810462465018459600%2C4189095345180553000%2C1243611918178687700; "
            "tipage=%2Fanalog%20and%20mixed-signal%2Fclocks%20%26%20timing%2Freal-time%20clocks%20(rtcs)%20%26%20timers%2Freal-time%20clocks%20(rtcs)%20%26%20timers%20products-cn; "
            "tipageshort=real-time%20clocks%20(rtcs)%20%26%20timers%20products-cn; "
            "ga_page_cookie=real-time%20clocks%20(rtcs)%20%26%20timers%20products-cn; "
            "ABTasty=uid=0cjpavvk9wnrrrwk&fst=1663841660388&pst=1663841660388&cst=1663847478322&ns=2&pvt=22&pvis=4&th=; "
            "ABTastySession=mrasn=&sen=3&lp=https%253A%252F%252Fwww.ti.com.cn%252Fzh-cn%252Famplifier-circuit%252Finstrumentation%252Fproducts.html%2523p358max%253D24%253B50; "
            "_px3=3dc91279d0f8be4de87bd7429a62ac3a4e47bb76cd18485670dd2d24a9a77bb3:YbVcwdKJ6VKjcXrCrW1deAz4l6c6cwWvF5v68WZ9NctY3yZTFx5cuS0OSmEymhyMARn06B+Se8hjomWxt7FA/A==:1000:Ac5xSCH0xsEmWWCdY8cmnbVlJ5Y8PSSQmfJX+GuwEBt2B46eXGMFTN4I3xo85r8BPsghdKd2Qv+wnrdc2YAL1QQ+edMB0ZVsZP2gIOY9nqz1sBuKUNKY0V8tYF7ku5xY3yXJEUaQlZ11GZj9ruyK5G6YI54os3F5xLyDfw1+F0a3mXS+wfavzGz6IKYN1Gz1khRoKEozKIFLbaAkX0p+Pg==; "
            "_pxde=5ac0c31fbd77e81c5a08c6e00a7b31ef5f69788bfc933f05dd1f64d7d9d8b47b:eyJ0aW1lc3RhbXAiOjE2NjM4NDkwNjE5NDcsImZfa2IiOjAsImlwY19pZCI6W10sImluY19pZCI6WyJiMzEyMWZmM2FlNGQwNDU2ZmQwMDI2YTUzNjJjMjY5ZiJdfQ==; "
            "utag_main=v_id:018364b0e3f4001314def085217c0506d001606500bd0$_sn:1$_ss:0$_pn:23%3Bexp-session$_st:1663850864569$ses_id:1663841657844%3Bexp-session$free_trial:false$dc_visit:1$dc_event:24%3Bexp-session$dc_region:ap-east-1%3Bexp-session; "
            "bm_sv=F594272FBFDA970ADD8EF7B1B88DD629~YAAQZ8U8txZS22GDAQAAMuYhZRELQhQDFi5uMH/2zp9+VsYXi0Ajoa7XmnX/ccNwG5d1kIzyQt3pP/0lXgKgzFAk6icZgG6SmTIaTAhoLPsLHtlxWTTYdXkHFeTsMm0V2OO71WkxgjjnUOmBREOoTGRMSiGvLT06Ekn0agWPTWiLbfoz9BA5A/j39lg5NLjEXo0jovQAkeDBdai/mdTgAx6/h2PkzHu3gPT/+PBgVRo4ETvS5GJ7akA0HdAFdbkq~1; "
            "ti_rid=13e3642; "
            "_abck=73D9ACE1E904F6A0BB0164BA18B5C3E2~-1~YAAQZ8U8tzNS22GDAQAA1OYhZQgXu78fU4MwKONm0GsNW6XirzDPEKofuo4B0UxCXafv6PuRRNIM+/GpL3aKFUEPTeTILYmxbt8hVMHXxWaAuVyVbnawuJOUsjJxItYm4DkvA94tcTLKJ1okIX2AMsUlvRo12cIZm+TtT/tW/rYYZQHIXPJVLOBq5hF/E1ZqmZryLVJ9lmWp8C2btvZRANjkL/zNMus+JiIh6IqQf+rQko7S3peCv+9KC3u4zWBgP7MYY9HQJZBgvKRU1O8OgQFlyJoDBDApxAJrVRuSH1RvZj80PCSujR6h47j3mcf/g7dOpwasWLafGDmbq+kvyuGxzYID5jgUzVnmEL4qYyycP33Am8a/vwDhD8ovQWthlqBrC+PlcXncTqIvqej6MSDg9h1gElU1Yttm5b3NwPGXqSHkf2EThlZmqHqbtc8Caacuijb90Up3BS1UnMSQYm7q~-1~-1~-1"
        ),
        "Referer": "https://www.ti.com.cn/zh-cn/amplifier-circuit/instrumentation/products.html",
    }
    res_html = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    res_html.raise_for_status()
    res = res_html.content.decode("utf-8")
    json_res = json.loads(res)
    # Product list returned by the endpoint.
    products_list = json_res["ParametricResults"]
    # One row per product: name, type/features, svg, description.
    return [
        [
            pro_dict.get("o3"),
            pro_dict.get("o7"),
            pro_dict.get("o10"),
            pro_dict.get("p2192"),
        ]
        for pro_dict in products_list
    ]


if __name__ == '__main__':
    url = "https://www.ti.com.cn/selectiontool/paramdata/family/500/results?lang=cn&output=json"
    res_to_excel = get_pro(url)
    print(res_to_excel)
    # Save the rows to an Excel workbook.
    df = pd.DataFrame(data=res_to_excel, columns=["商品名称", "商品类型", "商品svg", "商品描述"])
    df.to_excel("./ti商品信息.xlsx", index=False)
import requests
from lxml import etree
import pandas as pd


def get_pro(url, timeout=10):
    """Download one catalog page and extract its product rows.

    Fetches ``url`` with a browser-like ``User-Agent``, parses the HTML and
    reads every ``<ul class="l02-zb">`` found under a ``<tbody>``.

    Args:
        url: Address of the catalog page to scrape.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl.

    Returns:
        A list of ``[model, brand, package, description]`` lists, one per
        product entry that exposes all four fields. Entries missing any
        field are skipped.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36",
    }
    res_html = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    res_html.raise_for_status()
    html_str = res_html.content.decode("utf-8")
    # Parse the page into an element tree.
    html = etree.HTML(html_str)
    if html is None:
        # Nothing parsable came back, so there are no rows to extract.
        return []
    pro_info = html.xpath("//tbody//ul[@class='l02-zb']")
    pro_res_list = []
    for pri_i in pro_info:
        try:
            pro_xinghao = pri_i.xpath("./li[1]/*/@title")[0]
            pro_pingpai = pri_i.xpath("./li[2]/a/text()")[0].strip()
            pro_fengzhuang = pri_i.xpath("./li[3]/*/@title")[0]
            pro_desc = pri_i.xpath("./li[4]/*/@title")[0]
        except IndexError:
            # An empty XPath result means this entry lacks a field; skip it.
            # Only IndexError is caught so real bugs are not hidden.
            continue
        pro_res_list.append([pro_xinghao, pro_pingpai, pro_fengzhuang, pro_desc])
    return pro_res_list


if __name__ == '__main__':
    res_to_excel = []
    for page_num in range(300, 333):
        url = f"https://list.szlcsc.com/catalog/{page_num}.html"
        try:
            res_list = get_pro(url)
        except requests.RequestException as exc:
            # One bad page should not throw away the rows already collected.
            print(f"skipping {url}: {exc}")
            continue
        res_to_excel.extend(res_list)
    print(res_to_excel)
    # Save the rows to an Excel workbook.
    df = pd.DataFrame(data=res_to_excel, columns=["商品型号", "商品品牌", "商品封装", "商品描述"])
    df.to_excel("./立创商品信息.xlsx", index=False)
import requests

# Minimal POST example: send a form-encoded body and read the reply as text.

# Placeholder -- replace with the real endpoint before running.
url = "posturl"
headers = {
    # Header name and value must be separate strings with straight quotes;
    # with curly quotes the braces hold a single string and build a set.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}
form_data = {
    "from": "en",
    "to": "zh",
    "q": "lucky boy"
}
# data= puts the fields in the request body as a form; params= would append
# them to the URL query string instead.
response = requests.post(url, data=form_data, headers=headers, timeout=10)
res = response.content.decode('utf-8')
import requests
import urllib.parse

# POST example that sends both query-string parameters and a form body.

# The query string is supplied through `params` below, so it is left off the
# URL here; keeping it in both places would send from/to twice.
url = 'https://fanyi.baidu.com/v2transapi'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
}
params = {
    "from": "zh",
    "to": "en"
}
# NOTE(review): sign and token look like values captured from a browser
# session; presumably they must be regenerated for each query -- confirm.
data = {
    "from": "zh",
    "to": "en",
    "query": "你好",
    "transtype": "translang",
    "simple_means_flag": "3",
    "sign": "232427.485594",
    "token": "fa3f170535ad8b9d05540a6c20471a59",
    "domain": "common"
}
# Show the encoded forms for inspection only.
print(urllib.parse.urlencode(data))
print(urllib.parse.urlencode(params))
# Pass the dicts themselves: requests encodes them, and only sets the
# form Content-Type header when data is a dict rather than a string.
response = requests.post(url, params=params, data=data, headers=headers, timeout=10)
print(response.status_code)
# NOTE(review): the other examples decode as UTF-8; confirm this endpoint
# really answers in GBK.
print(response.content.decode('GBK'))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。