python爬取新浪微博（爬取微博帐号所发内容，不爬取历史内容）_python爬取微博

作者：程序语言诗人 | 2024-02-03 14:10:12

踩

python爬取微博

实现：

跟踪比较活跃的微博号所发的微博内容，每隔3-5分钟刷新（爬取）一次，只有发布了新微博才爬得到，不爬取历史微博内容。爬取的字段包括：正文、文中图片、所属微博昵称、发布时间（时间戳格式）。

python_service.py

import win32serviceutil
import win32service
import win32event
import re,requests,time,datetime,pymysql,random
'''
遇到不懂的问题？Python学习交流群：821460695满足你的需求，资料都已经上传群文件，可以自行下载！
'''
# Module-wide state shared with the crawler through ``global`` statements.
WB_count1 = 8   # declared global by getcontent
WB_count = 9    # declared global by getcontent
w1 = 0          # running count printed by inserttable
Stop_py = 1     # Selectcookie sets this to 0 when no valid cookie is left
w2 = 0          # running count printed by inserttable_logging
get_status = 0  # 1 once a post was scraped in the current iteration
class SmallestPythonService(win32serviceutil.ServiceFramework):
    """Windows service that periodically scrapes Weibo accounts into MySQL.

    ``SvcDoRun`` polls every URL listed in ``weibo_id_list`` and stores new
    posts; ``SvcStop`` signals ``hWaitStop``, which the polling loop checks
    between requests so that the service can actually be stopped.
    """
    _svc_name_ = "SmallestPythonService"
    _svc_display_name_ = "The smallest possible Python Service"

    def __init__(self, args):
        win32serviceutil.ServiceFramework.__init__(self, args)
        # Create an event which we will use to wait on.
        # The "service stop" request will set this event.
        self.hWaitStop = win32event.CreateEvent(None, 0, 0, None)

    def SvcStop(self):
        """Report STOP_PENDING to the SCM and signal the stop event."""
        # Before we do anything, tell the SCM we are starting the stop process.
        self.ReportServiceStatus(win32service.SERVICE_STOP_PENDING)
        # And set my event.
        win32event.SetEvent(self.hWaitStop)

    def SvcDoRun(self):
        """Run the crawl loop until a stop is requested or no cookie is left."""
        global get_status

        class crawl1:
            """Weibo scraper: fetch an account's newest post and store it in MySQL.

            The class keeps no instance state.  ``getcontent`` reads the names
            ``cookie`` and ``header`` from the enclosing ``SvcDoRun`` scope and
            the module global ``I_D`` set by ``Selectcookie``.
            """

            def __init__(self):
                pass

            def getcontent(self, start_url1):
                """Fetch the profile page *start_url1* and scrape one post from it.

                Returns the 6-tuple ``(cont, picture_name, cont_soure,
                time_int, time_send, get_status)``.  For an original post
                these are: the text with all whitespace removed, the list of
                image file names, the ``nick-name`` attribute, the publish
                time as a Unix timestamp, the publish time as scraped, and
                ``1``.  In every other case (cookie rejected, no post
                matched, post is a repost) six zeros are returned.

                Raises whatever ``requests`` raises, ``ValueError`` when the
                scraped time does not parse and ``IndexError`` when the post
                body cannot be found on the detail page.
                """
                # Kept from the original: these names stay module globals.
                global WB_count1, time_int, picture_name, cont_soure, cont, WB_count
                # Always hand back six values; the caller unpacks six.
                nothing = (0, 0, 0, 0, 0, 0)

                html = requests.get(start_url1, cookies=cookie, headers=header, timeout=15).content
                reg = r't2\\">粉丝<\\/sp.*?ass=\\"S_line1\\">.*?ong clas.*?">(.*?)<\\/strong><span class=\\"S_txt2\\">微博'
                WB_count = re.findall(reg, html)  # number of posts shown on the profile
                if not WB_count:
                    # No post counter on the page: treat the cookie as frozen.
                    self.GetBadCookie(I_D)
                    print('cookie 被冻结')
                    return nothing

                reg1 = r'<a name=.*? target=\\"_blank\\" href=\\"\\(.*?)" title=\\"(.*?)\\" date.*?a> 来自 <a'
                # Only the second match is used (the page carries about 15).
                WB_url = re.findall(reg1, html)[1:2]
                if not WB_url:
                    return nothing
                j = WB_url[0]

                time_send = j[1]
                time_int = int(time.mktime(time.strptime(time_send, "%Y-%m-%d %H:%M")))  # publish time
                fin_url = 'http://weibo.com' + j[0].replace('\\', '')
                html1 = requests.get(fin_url, cookies=cookie, headers=header, timeout=15).content

                reg_time = r'<div class=\\"WB_from S_txt2\\">.*?ass=\\"S_txt2\\" target=\\"_blank\\" href=\\"(.*?)" title=\\".*?" date=\\".*?" node-type=\\"feed_list_item_d.*?来自'
                if re.findall(reg_time, html1):
                    # Reposts are recognised but not stored yet.
                    print('%s %s' % ('转发微博', datetime.datetime.now()))
                    return nothing

                reg3 = r'<div class=\\"WB_text W_f14\\" node-type=\\"feed_list_content\\" nick-name=\\"(.*?)\\">(.*?)<!-- 引用文件时'
                nick_name, body = re.findall(reg3, html1)[0]
                body = re.compile(r'<[^>]+>', re.S).sub('', body).replace('\\n', ' ')
                cont = "".join(body.split())   # post text
                cont_soure = nick_name         # nickname of the posting account

                regimg = r'<!-- 引用文件时，必须对midia_info赋值 -->(.*?)<!-- super card-->'
                regimg1 = r'<img src=\\"(.*?)\\">'
                media = re.findall(regimg, html1)
                img = re.findall(regimg1, media[0]) if media else []
                picture_name = []
                for item in img:
                    item_ringht = item.replace('\\', '').replace('thumb150', 'mw690').replace('orj360', 'mw690')
                    picture_name.append(item[-15:])  # image file name
                    try:
                        picture = requests.get(item_ringht, timeout=15)
                        # ``with`` closes the file even when the write fails.
                        with open('E:\\sina_image\\' + item[-15:], 'wb') as f:
                            f.write(picture.content)
                    except Exception as e:
                        # Best effort: a failed image must not lose the post.
                        print('%s picture' % e)
                return cont, picture_name, cont_soure, time_int, time_send, 1

            def connectDB(self):
                """Open and return a new connection to the local ``sina`` database."""
                # NOTE(review): credentials are hard-coded.
                host = "localhost"
                dbName = "sina"
                user = "root"
                password = "root"
                # Keyword arguments: recent PyMySQL releases reject positional ones.
                return pymysql.connect(host=host, user=user, password=password,
                                       database=dbName, charset='utf8')

            def creatTable(self, createTableName):
                """Create the result table if it is missing and return its name.

                Errors are printed, not raised; the name is returned either way.
                *createTableName* is spliced into the SQL text and must
                therefore be a trusted identifier.
                """
                createTableSql = ("CREATE TABLE IF NOT EXISTS "+ createTableName+
                                  "(id int(11) NOT NULL AUTO_INCREMENT,content TEXT,photo TEXT,"
                                  "url_name VARCHAR(255),time_in int,PRIMARY KEY (`id`))"
                                  "ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='微博信息'")
                DB_create = None
                try:
                    DB_create = self.connectDB()
                    DB_create.cursor().execute(createTableSql)
                except Exception as e:
                    print('%s ------creatTable_def' % e)
                else:
                    print('create_ table '+createTableName+' successfully ------- start spider,now!')
                finally:
                    # The connection may never have been opened.
                    if DB_create is not None:
                        DB_create.close()
                return createTableName

            def SelectUrl(self):
                """Load the ``url`` column of ``weibo_id_list`` into the global ``start_url``.

                The list is also returned for convenience.
                """
                global start_url
                DB_select = self.connectDB()
                try:
                    cursor_select = DB_select.cursor()
                    cursor_select.execute("select url from weibo_id_list")
                    start_url = [row[0] for row in cursor_select.fetchall()]
                finally:
                    DB_select.close()
                return start_url

            def GetBadCookie(self, ID):
                """Mark the cookie row *ID* as unusable (``valid=0``)."""
                DB_update = self.connectDB()
                try:
                    DB_update.cursor().execute(
                        "update cookies_list set valid=0 where id=%s", (ID,))
                    DB_update.commit()
                finally:
                    DB_update.close()

            def Selectcookie(self):
                """Return one randomly chosen valid cookie and remember its id in ``I_D``.

                Raises ``RuntimeError`` (after setting the global ``Stop_py``
                to 0) when ``cookies_list`` has no valid row left.
                """
                global I_D, Stop_py
                DB_select = self.connectDB()
                try:
                    cursor_select = DB_select.cursor()
                    cursor_select.execute("select id,cookie,valid from cookies_list where valid=1")
                    results = cursor_select.fetchall()
                finally:
                    DB_select.close()
                if not results:
                    Stop_py = 0   # tells the main loop to terminate
                    raise RuntimeError('no valid cookie left in cookies_list')
                row = random.choice(results)
                I_D = row[0]
                # NOTE(review): eval() runs whatever is stored in the table;
                # consider ast.literal_eval or json once the format is confirmed.
                return eval(row[1])

            def SelectUA(self):
                """Return one randomly chosen valid header value from ``headers_list``.

                Raises ``RuntimeError`` when the table has no valid row.
                """
                DB_select = self.connectDB()
                try:
                    cursor_select = DB_select.cursor()
                    cursor_select.execute("select header,valid from headers_list where valid=1")
                    results = cursor_select.fetchall()
                finally:
                    DB_select.close()
                if not results:
                    raise RuntimeError('no valid header left in headers_list')
                # NOTE(review): eval() on database content, see Selectcookie.
                return eval(random.choice(results)[0])

            def inserttable_logging(self, insert1, insert2, insert3, insert4, insert5):
                """Append one monitoring row to ``weibo_logging``.

                The five values are stored as strings in the columns
                ``weibo_id, spider_time, get_status, content, time_send``.
                """
                global w2
                insertLoggingSql = "insert into weibo_logging"+"(weibo_id,spider_time,get_status,content,time_send)values(%s,%s,%s,%s,%s)"
                values = tuple(str(v) for v in (insert1, insert2, insert3, insert4, insert5))
                DB_insert = self.connectDB()
                try:
                    DB_insert.cursor().execute(insertLoggingSql, values)
                    DB_insert.commit()
                finally:
                    DB_insert.close()
                w2 = w2 + 1
                print('logging successfully %s records' % w2)

            def inserttable(self, insertTable, insert1, insert2, insert3, insert4):
                """Insert a scraped post into *insertTable* unless its text is already there.

                ``insert1``..``insert4`` map to ``content, photo, url_name,
                time_in``.  Errors are printed, not raised.  *insertTable* is
                spliced into the SQL text and must be a trusted identifier;
                the values are passed as query parameters.
                """
                global w1
                # Scraped text is passed as a parameter, never concatenated.
                select_sql = "select content from "+insertTable+" where content=%s"   # duplicate check
                insert_sql = "INSERT INTO "+insertTable+"(content,photo,url_name,time_in)VALUES(%s,%s,%s,%s)"
                DB_insert = None
                try:
                    DB_insert = self.connectDB()
                    cursor_insert = DB_insert.cursor()
                    # One connection serves both the check and the insert.
                    if cursor_insert.execute(select_sql, (insert1,)):
                        print('repetition! ######## %s' % datetime.datetime.now())
                    else:
                        cursor_insert.execute(insert_sql, (insert1, insert2, insert3, insert4))
                        w1 = w1 + 1
                        print('inert contents to %s successfully crawling number %s ------%s------ %s'
                              % (insertTable, w1, insert3, datetime.datetime.now()))
                    DB_insert.commit()
                except Exception as e:
                    print('%s inserttable_def' % e)
                finally:
                    if DB_insert is not None:
                        DB_insert.close()

        def stop_requested(seconds):
            """Wait up to *seconds* on the stop event; True when it was signalled."""
            rc = win32event.WaitForSingleObject(self.hWaitStop, int(seconds * 1000))
            return rc == win32event.WAIT_OBJECT_0

        crawl = crawl1()
        table = crawl.creatTable('weibo_result')  # change the table name here
        while 1:
            crawl.SelectUrl()
            for start_url1 in start_url:
                pause = 10
                try:
                    # ``cookie`` and ``header`` are read by getcontent via closure.
                    cookie = crawl.Selectcookie()
                    header = crawl.SelectUA()
                    cont, picture_name, cont_soure, time_int, time_send, get_status = crawl.getcontent(start_url1)
                    crawl.inserttable_logging(cont_soure, str(datetime.datetime.now()), get_status, cont, time_send)
                    get_status = 0
                    if cont != 0:
                        crawl.inserttable(table, cont, '@'.join(picture_name), cont_soure, time_int)
                        pause = 5
                except Exception as e:
                    print('%s 调用函数' % e)
                    try:
                        crawl.inserttable_logging('error', str(datetime.datetime.now()), 0, 'error', 'error')
                    except Exception as log_error:
                        # The log table may be the thing that is broken.
                        print('%s inserttable_logging' % log_error)
                    pause = 0   # as before: no delay after a failure
                if Stop_py == 0:
                    break
                if pause and stop_requested(pause):
                    # hWaitStop is created auto-reset (second CreateEvent
                    # argument is 0), so the signal is consumed here; return
                    # instead of waiting on it again.
                    return
            if Stop_py == 0:
                break
            if stop_requested(120):
                return
        # Out of cookies: stay alive until the SCM asks the service to stop.
        win32event.WaitForSingleObject(self.hWaitStop, win32event.INFINITE)
# Script entry point: hand the command line to pywin32's service helper.
if __name__ == '__main__':
    win32serviceutil.HandleCommandLine(SmallestPythonService)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236

sina.py

import win32serviceutil
import win32service
import win32event
import re,requests,time,datetime,pymysql,random
'''
遇到不懂的问题？Python学习交流群：821460695满足你的需求，资料都已经上传群文件，可以自行下载！
'''
# Module-wide state shared with crawl1 through ``global`` statements.
WB_count1 = 8   # declared global by getcontent
WB_count = 9    # declared global by getcontent
w1 = 0          # running count printed by inserttable
Stop_py = 1     # Selectcookie sets this to 0 when no valid cookie is left
w2 = 0          # running count printed by inserttable_logging
get_status = 0  # 1 once a post was scraped in the current iteration
class crawl1:
    """Weibo scraper: fetch an account's newest post and store it in MySQL.

    The class keeps no instance state.  ``getcontent`` reads the module
    globals ``cookie`` and ``header`` (bound by the main script) and ``I_D``
    (set by ``Selectcookie``), so those must exist before it is called.
    """

    def __init__(self):
        pass

    def getcontent(self, start_url1):
        """Fetch the profile page *start_url1* and scrape one post from it.

        Returns the 6-tuple ``(cont, picture_name, cont_soure, time_int,
        time_send, get_status)``.  For an original post these are: the text
        with all whitespace removed, the list of image file names, the
        ``nick-name`` attribute, the publish time as a Unix timestamp, the
        publish time as scraped, and ``1``.  In every other case (cookie
        rejected, no post matched, post is a repost) six zeros are returned.

        Raises whatever ``requests`` raises, ``ValueError`` when the scraped
        time does not parse and ``IndexError`` when the post body cannot be
        found on the detail page.
        """
        # Kept from the original: these names stay module globals.
        global WB_count1, time_int, picture_name, cont_soure, cont, WB_count
        # Always hand back six values; the caller unpacks six.
        nothing = (0, 0, 0, 0, 0, 0)

        html = requests.get(start_url1, cookies=cookie, headers=header, timeout=15).content
        reg = r't2\\">粉丝<\\/sp.*?ass=\\"S_line1\\">.*?ong clas.*?">(.*?)<\\/strong><span class=\\"S_txt2\\">微博'
        WB_count = re.findall(reg, html)  # number of posts shown on the profile
        if not WB_count:
            # No post counter on the page: treat the cookie as frozen.
            self.GetBadCookie(I_D)
            print('cookie 被冻结')
            return nothing

        reg1 = r'<a name=.*? target=\\"_blank\\" href=\\"\\(.*?)" title=\\"(.*?)\\" date.*?a> 来自 <a'
        # Only the second match is used (the page carries about 15).
        WB_url = re.findall(reg1, html)[1:2]
        if not WB_url:
            return nothing
        j = WB_url[0]

        time_send = j[1]
        time_int = int(time.mktime(time.strptime(time_send, "%Y-%m-%d %H:%M")))  # publish time
        fin_url = 'http://weibo.com' + j[0].replace('\\', '')
        html1 = requests.get(fin_url, cookies=cookie, headers=header, timeout=15).content

        reg_time = r'<div class=\\"WB_from S_txt2\\">.*?ass=\\"S_txt2\\" target=\\"_blank\\" href=\\"(.*?)" title=\\".*?" date=\\".*?" node-type=\\"feed_list_item_d.*?来自'
        if re.findall(reg_time, html1):
            # Reposts are recognised but not stored yet.
            print('%s %s' % ('转发微博', datetime.datetime.now()))
            return nothing

        reg3 = r'<div class=\\"WB_text W_f14\\" node-type=\\"feed_list_content\\" nick-name=\\"(.*?)\\">(.*?)<!-- 引用文件时'
        nick_name, body = re.findall(reg3, html1)[0]
        body = re.compile(r'<[^>]+>', re.S).sub('', body).replace('\\n', ' ')
        cont = "".join(body.split())   # post text
        cont_soure = nick_name         # nickname of the posting account

        regimg = r'<!-- 引用文件时，必须对midia_info赋值 -->(.*?)<!-- super card-->'
        regimg1 = r'<img src=\\"(.*?)\\">'
        media = re.findall(regimg, html1)
        img = re.findall(regimg1, media[0]) if media else []
        picture_name = []
        for item in img:
            item_ringht = item.replace('\\', '').replace('thumb150', 'mw690').replace('orj360', 'mw690')
            picture_name.append(item[-15:])  # image file name
            try:
                picture = requests.get(item_ringht, timeout=15)
                # ``with`` closes the file even when the write fails.
                with open('E:\\sina_image\\' + item[-15:], 'wb') as f:
                    f.write(picture.content)
            except Exception as e:
                # Best effort: a failed image must not lose the post.
                print('%s picture' % e)
        return cont, picture_name, cont_soure, time_int, time_send, 1

    def connectDB(self):
        """Open and return a new connection to the local ``sina`` database."""
        # NOTE(review): credentials are hard-coded.
        host = "localhost"
        dbName = "sina"
        user = "root"
        password = "root"
        # Keyword arguments: recent PyMySQL releases reject positional ones.
        return pymysql.connect(host=host, user=user, password=password,
                               database=dbName, charset='utf8')

    def creatTable(self, createTableName):
        """Create the result table if it is missing and return its name.

        Errors are printed, not raised; the name is returned either way.
        *createTableName* is spliced into the SQL text and must therefore be
        a trusted identifier.
        """
        createTableSql = ("CREATE TABLE IF NOT EXISTS "+ createTableName+
                          "(id int(11) NOT NULL AUTO_INCREMENT,content TEXT,photo TEXT,"
                          "url_name VARCHAR(255),time_in int,PRIMARY KEY (`id`))"
                          "ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='微博信息'")
        DB_create = None
        try:
            DB_create = self.connectDB()
            DB_create.cursor().execute(createTableSql)
        except Exception as e:
            print('%s ------creatTable_def' % e)
        else:
            print('create_ table '+createTableName+' successfully ------- start spider,now!')
        finally:
            # The connection may never have been opened.
            if DB_create is not None:
                DB_create.close()
        return createTableName

    def SelectUrl(self):
        """Load the ``url`` column of ``weibo_id_list`` into the global ``start_url``.

        The list is also returned for convenience.
        """
        global start_url
        DB_select = self.connectDB()
        try:
            cursor_select = DB_select.cursor()
            cursor_select.execute("select url from weibo_id_list")
            start_url = [row[0] for row in cursor_select.fetchall()]
        finally:
            DB_select.close()
        return start_url

    def GetBadCookie(self, ID):
        """Mark the cookie row *ID* as unusable (``valid=0``)."""
        DB_update = self.connectDB()
        try:
            DB_update.cursor().execute(
                "update cookies_list set valid=0 where id=%s", (ID,))
            DB_update.commit()
        finally:
            DB_update.close()

    def Selectcookie(self):
        """Return one randomly chosen valid cookie and remember its id in ``I_D``.

        Raises ``RuntimeError`` (after setting the global ``Stop_py`` to 0)
        when ``cookies_list`` has no valid row left.
        """
        global I_D, Stop_py
        DB_select = self.connectDB()
        try:
            cursor_select = DB_select.cursor()
            cursor_select.execute("select id,cookie,valid from cookies_list where valid=1")
            results = cursor_select.fetchall()
        finally:
            DB_select.close()
        if not results:
            Stop_py = 0   # tells the main loop to terminate
            raise RuntimeError('no valid cookie left in cookies_list')
        row = random.choice(results)
        I_D = row[0]
        # NOTE(review): eval() runs whatever is stored in the table; consider
        # ast.literal_eval or json once the stored format is confirmed.
        return eval(row[1])

    def SelectUA(self):
        """Return one randomly chosen valid header value from ``headers_list``.

        Raises ``RuntimeError`` when the table has no valid row.
        """
        DB_select = self.connectDB()
        try:
            cursor_select = DB_select.cursor()
            cursor_select.execute("select header,valid from headers_list where valid=1")
            results = cursor_select.fetchall()
        finally:
            DB_select.close()
        if not results:
            raise RuntimeError('no valid header left in headers_list')
        # NOTE(review): eval() on database content, see Selectcookie.
        return eval(random.choice(results)[0])

    def inserttable_logging(self, insert1, insert2, insert3, insert4, insert5):
        """Append one monitoring row to ``weibo_logging``.

        The five values are stored as strings in the columns ``weibo_id,
        spider_time, get_status, content, time_send``.
        """
        global w2
        insertLoggingSql = "insert into weibo_logging"+"(weibo_id,spider_time,get_status,content,time_send)values(%s,%s,%s,%s,%s)"
        values = tuple(str(v) for v in (insert1, insert2, insert3, insert4, insert5))
        DB_insert = self.connectDB()
        try:
            DB_insert.cursor().execute(insertLoggingSql, values)
            DB_insert.commit()
        finally:
            DB_insert.close()
        w2 = w2 + 1
        print('logging successfully %s records' % w2)

    def inserttable(self, insertTable, insert1, insert2, insert3, insert4):
        """Insert a scraped post into *insertTable* unless its text is already there.

        ``insert1``..``insert4`` map to ``content, photo, url_name, time_in``.
        Errors are printed, not raised.  *insertTable* is spliced into the SQL
        text and must be a trusted identifier; the values are passed as query
        parameters.
        """
        global w1
        # Scraped text is passed as a parameter, never concatenated.
        select_sql = "select content from "+insertTable+" where content=%s"   # duplicate check
        insert_sql = "INSERT INTO "+insertTable+"(content,photo,url_name,time_in)VALUES(%s,%s,%s,%s)"
        DB_insert = None
        try:
            DB_insert = self.connectDB()
            cursor_insert = DB_insert.cursor()
            # One connection serves both the check and the insert.
            if cursor_insert.execute(select_sql, (insert1,)):
                print('repetition! ######## %s' % datetime.datetime.now())
            else:
                cursor_insert.execute(insert_sql, (insert1, insert2, insert3, insert4))
                w1 = w1 + 1
                print('inert contents to %s successfully crawling number %s ------%s------ %s'
                      % (insertTable, w1, insert3, datetime.datetime.now()))
            DB_insert.commit()
        except Exception as e:
            print('%s inserttable_def' % e)
        finally:
            if DB_insert is not None:
                DB_insert.close()

# Script entry point: poll every URL from weibo_id_list, store new posts,
# then pause for three minutes and start over.  The loop ends once
# Selectcookie has set Stop_py to 0.
# The code stays at module level on purpose: getcontent reads ``cookie`` and
# ``header`` as module globals.
if __name__ == '__main__':
    crawl = crawl1()
    table = crawl.creatTable('weibo_result')  # change the table name here
    while 1:
        crawl.SelectUrl()
        for start_url1 in start_url:
            try:
                cookie = crawl.Selectcookie()
                header = crawl.SelectUA()
                cont, picture_name, cont_soure, time_int, time_send, get_status = crawl.getcontent(start_url1)
                crawl.inserttable_logging(cont_soure, str(datetime.datetime.now()), get_status, cont, time_send)
                get_status = 0
                if cont != 0:
                    crawl.inserttable(table, cont, '@'.join(picture_name), cont_soure, time_int)
                    time.sleep(5)
                else:
                    time.sleep(10)
            except Exception as e:
                print('%s 调用函数' % e)
                try:
                    crawl.inserttable_logging('error', str(datetime.datetime.now()), 0, 'error', 'error')
                except Exception as log_error:
                    # The log table may be the thing that is broken; do not
                    # let a second failure terminate the crawler.
                    print('%s inserttable_logging' % log_error)
            if Stop_py == 0:
                break
        if Stop_py == 0:
            break
        time.sleep(180)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/article/detail/56691