赞
踩
1.接到需求需要对指定的微博账号进行微博内容抓取
这个任务也比较常见
那么拿到需求后,我们先进到微博网站对接口进行查找;如果能直接对接口进行抓取并获得数据,无疑是最省事安心的方式。
2.在 GitHub 上看到了一个写得十分好用的案例,本着互联网开源的思想,对代码进行学习和修改。
废话不多说,直接上代码
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import codecs
import csv
import json
import math
import os
import random
import sys
import traceback
from collections import OrderedDict
from datetime import datetime, timedelta
from time import sleep

import requests
from lxml import etree
from requests.adapters import HTTPAdapter
from tqdm import tqdm


class Weibo(object):
    """Crawler for the posts of a single Weibo account via m.weibo.cn."""

    # Seconds to wait for an HTTP response; without a timeout a stalled
    # connection would block the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, user_id, filter=0, since_date='1900-01-01',
                 pic_download=0, video_download=0):
        """Validate the crawl options and initialise an empty result state.

        The process exits (``sys.exit``) with a message when an argument is
        invalid.

        Args:
            user_id: numeric id of the target account (must be an ``int``).
            filter: 0 to crawl every post, 1 to crawl original posts only.
            since_date: start date in ``yyyy-mm-dd`` form; posts published
                from this date until now are crawled.
            pic_download: 1 to download original pictures, 0 to skip them.
            video_download: 1 to download videos, 0 to skip them.
        """
        if not isinstance(user_id, int):
            sys.exit(u'user_id值应为一串数字形式,请重新输入')
        if filter not in (0, 1):
            sys.exit(u'filter值应为数字0或1,请重新输入')
        if not self.is_date(since_date):
            sys.exit(u'since_date值应为yyyy-mm-dd形式,请重新输入')
        if pic_download not in (0, 1):
            sys.exit(u'pic_download值应为数字0或1,请重新输入')
        if video_download not in (0, 1):
            sys.exit(u'video_download值应为0或1,请重新输入')
        # Numeric user id, e.g. 1669879400 for the account "Dear-迪丽热巴".
        self.user_id = user_id
        # 0 (default): crawl all posts; 1: crawl original posts only.
        self.filter = filter
        # Start date (yyyy-mm-dd); posts from this date until now are crawled.
        self.since_date = since_date
        # 0 (default): do not download original pictures; 1: download them.
        self.pic_download = pic_download
        # 0 (default): do not download videos; 1: download them.
        self.video_download = video_download
        self.weibo = []  # every post crawled so far
        self.user = {}  # profile information of the target user
        self.got_count = 0  # number of posts crawled so far

    def is_date(self, since_date):
        """Return True when ``since_date`` is a ``yyyy-mm-dd`` date string.

        Non-string input (for example ``None``) yields False instead of
        raising, so the caller can report a friendly validation message.
        """
        try:
            datetime.strptime(since_date, "%Y-%m-%d")
        except (ValueError, TypeError):
            return False
        return True

    def get_json(self, params):
        """Query the m.weibo.cn container API and return the decoded JSON.

        Args:
            params: query-string parameters passed to ``requests.get``.

        Raises:
            requests.exceptions.RequestException: on network failure or when
                no response arrives within ``REQUEST_TIMEOUT`` seconds.
        """
        url = 'https://m.weibo.cn/api/container/getIndex?'
        r = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        return r.json()

    def get_weibo_json(self, page):
        """Return the JSON of one page of the user's post list."""
        params = {
            'containerid': '107603' + str(self.user_id),
            'page': page,
        }
        return self.get_json(params)

    def get_user_info(self):
        """Fetch the user's profile, store it in ``self.user`` and return it.

        Returns None when the API response is not flagged ``ok``. The profile
        is normalised by ``self.standardize_info``, which is defined outside
        the visible part of this file.
        """
        params = {'containerid': '100505' + str(self.user_id)}
        js = self.get_json(params)
        if not js.get('ok'):
            return None
        info = js['data']['userInfo']
        if info.get('toolbar_menus'):
            del info['toolbar_menus']
        user_info = self.standardize_info(info)
        self.user = user_info
        return user_info

    def get_long_weibo(self, id):
        """Fetch the full content of a long post from its detail page.

        The detail page embeds the post as JSON between the ``"status":`` and
        ``"hotScheme"`` markers; that fragment is cut out and parsed.

        Args:
            id: id of the post whose detail page is requested.

        Returns:
            The result of ``self.parse_weibo`` (defined outside the visible
            part of this file) for the embedded post, or None when the page
            does not contain a parsable post.
        """
        url = 'https://m.weibo.cn/detail/%s' % id
        html = requests.get(url, timeout=self.REQUEST_TIMEOUT).text
        start = html.find('"status":')
        if start == -1:
            # Unexpected page (login wall, deleted post, ...): nothing to parse.
            return None
        html = html[start:]
        end = html.rfind('"hotScheme"')
        if end == -1:
            return None
        html = html[:end]
        # Drop the trailing comma that preceded the "hotScheme" key.
        html = html[:html.rfind(',')]
        html = '{' + html + '}'
        try:
            js = json.loads(html, strict=False)
        except ValueError:
            # Covers json.JSONDecodeError; the fragment was not valid JSON.
            return None
        weibo_info = js.get('status')
        if weibo_info:
            return self.parse_weibo(weibo_info)
        return None

    def get_pics(self, weibo_info):
        """Return the post's original picture URLs joined by commas.

        An empty string is returned when the post has no pictures.
        """
        pic_info = weibo_info.get('pics')
        if not pic_info:
            return ''
        return ','.join(pic['large']['url'] for pic in pic_info)

    def get_video_url(self, weibo_info):
        """Return the best available video URL of a post, or ''.

        The 720p URL is preferred, then the HD one, then the SD one.
        """
        page_info = weibo_info.get('page_info')
        if not page_info:
            return ''
        media_info = page_info.get('media_info')
        if not media_info:
            return ''
        for key in ('mp4_720p_mp4', 'mp4_hd_url', 'mp4_sd_url'):
            video_url = media_info.get(key)
            if video_url:
                return video_url
        return ''

    # NOTE(review): the source is cut off at this point. The next method
    # begins as shown below; the rest of its body is not visible, so it is
    # left unreconstructed rather than guessed at.
    #
    #     def download_one_file(self, url, file_path, type, weibo_id):
    #         """Download a single file (picture/video)."""
    #         try:
    #             if not os.path.isfile(file_path):
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。