赞
踩
跟踪比较活跃的微博号所发的微博内容,隔3-5分钟刷新(爬取)一次,只有更新了才爬的到,不爬取历史微博内容哦,爬取正文、文中图片、所属微博昵称、发布时间(时间戳格式)。
python_service.py
import win32serviceutil import win32service import win32event import re,requests,time,datetime,pymysql,random ''' 遇到不懂的问题?Python学习交流群:821460695满足你的需求,资料都已经上传群文件,可以自行下载! ''' WB_count1=8;WB_count=9;w1=0;Stop_py=1;w2=0 get_status=0 class SmallestPythonService(win32serviceutil.ServiceFramework): _svc_name_ = "SmallestPythonService" _svc_display_name_ = "The smallest possible Python Service" def __init__(self, args): win32serviceutil.ServiceFramework.__init__(self, args) # Create an event which we will use to wait on. # The "service stop" request will set this event. self.hWaitStop = win32event.CreateEvent(None, 0, 0, None) def SvcStop(self): # Before we do anything, tell the SCM we are starting the stop process. self.ReportServiceStatus(win32service.SERVICE_STOP_PENDING) # And set my event. win32event.SetEvent(self.hWaitStop) def SvcDoRun(self): global get_status class crawl1: def __init__(self): pass def getcontent(self,start_url1): '''获取信息''' global WB_count1,time_int,picture_name,cont_soure,cont,WB_count html=requests.get(start_url1,cookies=cookie,headers=header,timeout=15) html=html.content reg=r't2\\">粉丝<\\/sp.*?ass=\\"S_line1\\">.*?ong clas.*?">(.*?)<\\/strong><span class=\\"S_txt2\\">微博' WB_count=re.findall(re.compile(reg),html) #发微博数量 if WB_count==[]: self.GetBadCookie(I_D) print 'cookie 被冻结' cont,picture_name,cont_soure,time_int='1','1','1','1' return cont,picture_name,cont_soure,time_int else: reg1=r'<a name=.*? target=\\"_blank\\" href=\\"\\(.*?)" title=\\"(.*?)\\" date.*?a> 来自 <a' WB_url=re.findall(re.compile(reg1),html)[1:2] #真实数据有15条 for j in WB_url: time_send=j[1] timeArray = time.strptime(j[1], "%Y-%m-%d %H:%M") time_int=int(time.mktime(timeArray)) #发表时间 fin_url='http://weibo.com'+j[0].replace('\\','') html1=requests.get(fin_url,cookies=cookie,headers=header,timeout=15) html1=html1.content reg_time=r'<div class=\\"WB_from S_txt2\\">.*?ass=\\"S_txt2\\" target=\\"_blank\\" href=\\"(.*?)" title=\\".*?" date=\\".*?" node-type=\\"feed_list_item_d.*?来自' zhuan_url=re.findall(re.compile(reg_time),html1) if zhuan_url!=[]: #zhuan_fin_url='http://weibo.com'+zhuan_url[0].replace('\\','') #留着处理转发的微博 #logging_a='转发微博' print '转发微博',datetime.datetime.now() #logging.debug(logging_a) else: reg3=r'<div class=\\"WB_text W_f14\\" node-type=\\"feed_list_content\\" nick-name=\\"(.*?)\\">(.*?)<!-- 引用文件时' cont=re.findall(re.compile(reg3),html1)[0][1] regg=re.compile(r'<[^>]+>',re.S) cont=regg.sub('',cont).replace('\\n',' ') cont="".join(cont.split()) #正文 cont_soure=re.findall(re.compile(reg3),html1)[0][0] #来源 regimg=r'<!-- 引用文件时,必须对midia_info赋值 -->(.*?)<!-- super card-->' img=re.findall(re.compile(regimg),html1)[0] regimg1=r'<img src=\\"(.*?)\\">' img=re.findall(re.compile(regimg1),img) picture_name=[] if img!=[]: for item in img: item_ringht=item.replace('\\','').replace('thumb150','mw690').replace('orj360','mw690') picture=requests.get(item_ringht) name=str('E:\\sina_image\\'+item[-15:]) picture_name.append(item[-15:]) #图片名称 try: f=open(name,'wb') f.write(picture.content) except Exception,e: print e,'picture' finally: f.close() get_status=1 return cont,picture_name,cont_soure,time_int,time_send,get_status cont,picture_name,cont_soure,time_int,time_send,get_status=0,0,0,0,0,0 return cont,picture_name,cont_soure,time_int,time_send,get_status def connectDB(self): '''链接数据库''' host="localhost" dbName="sina" user="root" password="root" db=pymysql.connect(host,user,password,dbName,charset='utf8') return db cursorDB = db.cursor() return cursorDB def creatTable(self,createTableName): '''创建数据库''' try: createTableSql="CREATE TABLE IF NOT EXISTS "+ createTableName+"(id int(11) NOT NULL AUTO_INCREMENT,content TEXT,photo TEXT,url_name VARCHAR(255),time_in int,PRIMARY KEY (`id`))ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='微博信息'" DB_create=self.connectDB() cursor_create=DB_create.cursor() cursor_create.execute(createTableSql) except Exception,e: print e,'------creatTable_def' finally: DB_create.close() #logging_b='create_ table '+createTableName+' successfully -------','start spider,now!' print 'create_ table '+createTableName+' successfully -------','start spider,now!' #logging.debug(logging_b) return createTableName def SelectUrl(self): ''' 读取weibo_id_list中url字段 把所需要爬取的微博url,添加到start_url(列表)中,供爬取 ''' global start_url start_url=[] selectUrlSql="select url from weibo_id_list" DB_select=self.connectDB() cursor_select=DB_select.cursor() cursor_select.execute(selectUrlSql) results = cursor_select.fetchall() for i in results: start_url.append(i[0]) DB_select.commit() DB_select.close() def GetBadCookie(self,ID): ''' 记录没用的cookie 默认valid有用为1,但是被禁就变成0 ''' updateContentSql="update cookies_list set valid=0 where id="+str(ID) DB_update=self.connectDB() cursor_uodate=DB_update.cursor() cursor_uodate.execute(updateContentSql) DB_update.commit() DB_update.close() def Selectcookie(self): ''' 提取出cookies 返回:一条cookies''' global I_D,Stop_py selectContentSql="select id,cookie,valid from cookies_list where valid=1" DB_select=self.connectDB() cursor_select=DB_select.cursor() cursor_select.execute(selectContentSql) results = cursor_select.fetchall() if results==(): Stop_py=0 #标记结束py程序 random_index_ck=random.randint(0,int(len(results)-1)) I_D = results[random_index_ck][0] results = eval(results[random_index_ck][1]) #随机提取cookie DB_select.commit() DB_select.close() return results def SelectUA(self): ''' 提取出User-Agent 返回:一个User-Agent ''' selectUA_sql="select header,valid from headers_list where valid=1" DB_select=self.connectDB() cursor_select=DB_select.cursor() cursor_select.execute(selectUA_sql) results = cursor_select.fetchall() random_index_UA=random.randint(0,int(len(results)-1)) results = eval(results[random_index_UA][0]) #随机提取header DB_select.commit() DB_select.close() return results def inserttable_logging(self,insert1,insert2,insert3,insert4,insert5): '''监控数据插入表中''' global w2 insertLoggingSql="insert into weibo_logging"+"(weibo_id,spider_time,get_status,content,time_send)values(%s,%s,%s,%s,%s)" DB_insert=self.connectDB() cursor_insert=DB_insert.cursor() cursor_insert.execute(insertLoggingSql,(str(insert1),str(insert2),str(insert3),str(insert4),str(insert5))) w2=w2+1 print 'logging successfully %s records'%(w2) DB_insert.commit() DB_insert.close() def inserttable(self,insertTable,insert1,insert2,insert3,insert4): '''有效数据插入表中''' global w1 try: insertContentSql_0="select content from "+insertTable+" where content="+"\'"+insert1+"\'" #去重复 insertContentSql="INSERT INTO "+insertTable+"(content,photo,url_name,time_in)VALUES(%s,%s,%s,%s)" DB_insert=self.connectDB() cursor_insert=DB_insert.cursor() a=self.connectDB().cursor().execute(insertContentSql_0) if a: #logging_c='repetition!' print 'repetition!','########',datetime.datetime.now() #logging.debug(logging_c) else: cursor_insert.execute(insertContentSql,(insert1,insert2,insert3,insert4)) w1=w1+1 #logging_d='inert contents to '+insertTable+' successfully crawling number '+str(w1)+' ------ '+insert3+' ------' print 'inert contents to '+insertTable+' successfully'+' crawling number %s'% (w1),'------'+insert3+'------',datetime.datetime.now() #logging.debug(logging_d) DB_insert.commit() except Exception,e: print e,'inserttable_def' finally: DB_insert.close() crawl=crawl1() table=crawl.creatTable('weibo_result') #更改表名 while 1: crawl.SelectUrl() for start_url1 in start_url: try: cookie=crawl.Selectcookie() header=crawl.SelectUA() cont,picture_name,cont_soure,time_int,time_send,get_status=crawl.getcontent(start_url1) crawl.inserttable_logging(cont_soure,str(datetime.datetime.now()),get_status,cont,time_send) get_status=0 if cont!=0: crawl.inserttable(table,cont,'@'.join(picture_name),cont_soure,time_int) time.sleep(5) else:time.sleep(10) except Exception,e: a,b,c,d='error',0,'error','error', crawl.inserttable_logging(a,str(datetime.datetime.now()),b,c,d) print e,'调用函数' #logging.debug(e) continue finally: if Stop_py==0:break if Stop_py==0:break time.sleep(120) win32event.WaitForSingleObject(self.hWaitStop, win32event.INFINITE) if __name__=='__main__': win32serviceutil.HandleCommandLine(SmallestPythonService)
sina.py
import win32serviceutil import win32service import win32event import re,requests,time,datetime,pymysql,random ''' 遇到不懂的问题?Python学习交流群:821460695满足你的需求,资料都已经上传群文件,可以自行下载! ''' WB_count1=8;WB_count=9;w1=0;Stop_py=1;w2=0 get_status=0 class crawl1: def __init__(self): pass def getcontent(self,start_url1): '''获取信息''' global WB_count1,time_int,picture_name,cont_soure,cont,WB_count html=requests.get(start_url1,cookies=cookie,headers=header,timeout=15) html=html.content reg=r't2\\">粉丝<\\/sp.*?ass=\\"S_line1\\">.*?ong clas.*?">(.*?)<\\/strong><span class=\\"S_txt2\\">微博' WB_count=re.findall(re.compile(reg),html) #发微博数量 if WB_count==[]: self.GetBadCookie(I_D) print 'cookie 被冻结' cont,picture_name,cont_soure,time_int='1','1','1','1' return cont,picture_name,cont_soure,time_int else: reg1=r'<a name=.*? target=\\"_blank\\" href=\\"\\(.*?)" title=\\"(.*?)\\" date.*?a> 来自 <a' WB_url=re.findall(re.compile(reg1),html)[1:2] #真实数据有15条 for j in WB_url: time_send=j[1] timeArray = time.strptime(j[1], "%Y-%m-%d %H:%M") time_int=int(time.mktime(timeArray)) #发表时间 fin_url='http://weibo.com'+j[0].replace('\\','') html1=requests.get(fin_url,cookies=cookie,headers=header,timeout=15) html1=html1.content reg_time=r'<div class=\\"WB_from S_txt2\\">.*?ass=\\"S_txt2\\" target=\\"_blank\\" href=\\"(.*?)" title=\\".*?" date=\\".*?" node-type=\\"feed_list_item_d.*?来自' zhuan_url=re.findall(re.compile(reg_time),html1) if zhuan_url!=[]: #zhuan_fin_url='http://weibo.com'+zhuan_url[0].replace('\\','') #留着处理转发的微博 #logging_a='转发微博' print '转发微博',datetime.datetime.now() #logging.debug(logging_a) else: reg3=r'<div class=\\"WB_text W_f14\\" node-type=\\"feed_list_content\\" nick-name=\\"(.*?)\\">(.*?)<!-- 引用文件时' cont=re.findall(re.compile(reg3),html1)[0][1] regg=re.compile(r'<[^>]+>',re.S) cont=regg.sub('',cont).replace('\\n',' ') cont="".join(cont.split()) #正文 cont_soure=re.findall(re.compile(reg3),html1)[0][0] #来源 regimg=r'<!-- 引用文件时,必须对midia_info赋值 -->(.*?)<!-- super card-->' img=re.findall(re.compile(regimg),html1)[0] regimg1=r'<img src=\\"(.*?)\\">' img=re.findall(re.compile(regimg1),img) picture_name=[] if img!=[]: for item in img: item_ringht=item.replace('\\','').replace('thumb150','mw690').replace('orj360','mw690') picture=requests.get(item_ringht) name=str('E:\\sina_image\\'+item[-15:]) picture_name.append(item[-15:]) #图片名称 try: f=open(name,'wb') f.write(picture.content) except Exception,e: print e,'picture' finally: f.close() get_status=1 return cont,picture_name,cont_soure,time_int,time_send,get_status cont,picture_name,cont_soure,time_int,time_send,get_status=0,0,0,0,0,0 return cont,picture_name,cont_soure,time_int,time_send,get_status def connectDB(self): '''链接数据库''' host="localhost" dbName="sina" user="root" password="root" db=pymysql.connect(host,user,password,dbName,charset='utf8') return db cursorDB = db.cursor() return cursorDB def creatTable(self,createTableName): '''创建数据库''' try: createTableSql="CREATE TABLE IF NOT EXISTS "+ createTableName+"(id int(11) NOT NULL AUTO_INCREMENT,content TEXT,photo TEXT,url_name VARCHAR(255),time_in int,PRIMARY KEY (`id`))ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='微博信息'" DB_create=self.connectDB() cursor_create=DB_create.cursor() cursor_create.execute(createTableSql) except Exception,e: print e,'------creatTable_def' finally: DB_create.close() #logging_b='create_ table '+createTableName+' successfully -------','start spider,now!' print 'create_ table '+createTableName+' successfully -------','start spider,now!' #logging.debug(logging_b) return createTableName def SelectUrl(self): ''' 读取weibo_id_list中url字段 把所需要爬取的微博url,添加到start_url(列表)中,供爬取 ''' global start_url start_url=[] selectUrlSql="select url from weibo_id_list" DB_select=self.connectDB() cursor_select=DB_select.cursor() cursor_select.execute(selectUrlSql) results = cursor_select.fetchall() for i in results: start_url.append(i[0]) DB_select.commit() DB_select.close() def GetBadCookie(self,ID): ''' 记录没用的cookie 默认valid有用为1,但是被禁就变成0 ''' updateContentSql="update cookies_list set valid=0 where id="+str(ID) DB_update=self.connectDB() cursor_uodate=DB_update.cursor() cursor_uodate.execute(updateContentSql) DB_update.commit() DB_update.close() def Selectcookie(self): ''' 提取出cookies 返回:一条cookies''' global I_D,Stop_py selectContentSql="select id,cookie,valid from cookies_list where valid=1" DB_select=self.connectDB() cursor_select=DB_select.cursor() cursor_select.execute(selectContentSql) results = cursor_select.fetchall() if results==(): Stop_py=0 #标记结束py程序 random_index_ck=random.randint(0,int(len(results)-1)) I_D = results[random_index_ck][0] results = eval(results[random_index_ck][1]) #随机提取cookie DB_select.commit() DB_select.close() return results def SelectUA(self): ''' 提取出User-Agent 返回:一个User-Agent ''' selectUA_sql="select header,valid from headers_list where valid=1" DB_select=self.connectDB() cursor_select=DB_select.cursor() cursor_select.execute(selectUA_sql) results = cursor_select.fetchall() random_index_UA=random.randint(0,int(len(results)-1)) results = eval(results[random_index_UA][0]) #随机提取header DB_select.commit() DB_select.close() return results def inserttable_logging(self,insert1,insert2,insert3,insert4,insert5): '''监控数据插入表中''' global w2 insertLoggingSql="insert into weibo_logging"+"(weibo_id,spider_time,get_status,content,time_send)values(%s,%s,%s,%s,%s)" DB_insert=self.connectDB() cursor_insert=DB_insert.cursor() cursor_insert.execute(insertLoggingSql,(str(insert1),str(insert2),str(insert3),str(insert4),str(insert5))) w2=w2+1 print 'logging successfully %s records'%(w2) DB_insert.commit() DB_insert.close() def inserttable(self,insertTable,insert1,insert2,insert3,insert4): '''有效数据插入表中''' global w1 try: insertContentSql_0="select content from "+insertTable+" where content="+"\'"+insert1+"\'" #去重复 insertContentSql="INSERT INTO "+insertTable+"(content,photo,url_name,time_in)VALUES(%s,%s,%s,%s)" DB_insert=self.connectDB() cursor_insert=DB_insert.cursor() a=self.connectDB().cursor().execute(insertContentSql_0) if a: #logging_c='repetition!' print 'repetition!','########',datetime.datetime.now() #logging.debug(logging_c) else: cursor_insert.execute(insertContentSql,(insert1,insert2,insert3,insert4)) w1=w1+1 #logging_d='inert contents to '+insertTable+' successfully crawling number '+str(w1)+' ------ '+insert3+' ------' print 'inert contents to '+insertTable+' successfully'+' crawling number %s'% (w1),'------'+insert3+'------',datetime.datetime.now() #logging.debug(logging_d) DB_insert.commit() except Exception,e: print e,'inserttable_def' finally: DB_insert.close() if __name__=='__main__': crawl=crawl1() table=crawl.creatTable('weibo_result') #更改表名 while 1: crawl.SelectUrl() for start_url1 in start_url: try: cookie=crawl.Selectcookie() header=crawl.SelectUA() cont,picture_name,cont_soure,time_int,time_send,get_status=crawl.getcontent(start_url1) crawl.inserttable_logging(cont_soure,str(datetime.datetime.now()),get_status,cont,time_send) get_status=0 if cont!=0: crawl.inserttable(table,cont,'@'.join(picture_name),cont_soure,time_int) time.sleep(5) else:time.sleep(10) except Exception,e: a,b,c,d='error',0,'error','error', crawl.inserttable_logging(a,str(datetime.datetime.now()),b,c,d) print e,'调用函数' #logging.debug(e) continue finally: if Stop_py==0:break if Stop_py==0:break time.sleep(180)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。