#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
1.对百度贴吧的任意帖子进行抓取
2.指定是否只抓取楼主发帖内容
3.将抓取到的内容分析并保存到文件
'''
import urllib2
import re
class BDTB(object):
    """Download a Baidu Tieba thread and save the text of its posts to a file.

    The thread is fetched page by page from ``url``, the post bodies are
    extracted with regular expressions, HTML tags are stripped, and the
    resulting text is written to ``<thread title>.txt`` in the current
    working directory.

    Attributes:
        url: Base URL of the thread.
        seelz: Query-string fragment (``'?see_lz=<value>'``) that selects
            whether only the original poster's posts are requested.
        file: Output file object; ``None`` while no file is open.
        floor: Number of the next post (floor) to be written, starting at 1.
        defaultTitle: File name stem used when the title cannot be extracted.
        floorTag: ``True`` when a separator line is written before each post.
    """

    # Patterns that locate data inside the downloaded thread HTML.
    _TITLE_RE = re.compile('<h3 class="core_title.*?">(.*?)</h3>', re.S)
    _PAGE_COUNT_RE = re.compile(
        '<li class="l_reply_num".*?<span class="red">(.*?)</span>', re.S)
    _CONTENT_RE = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)

    # Clean-up rules applied by replace(), in this exact order. The order
    # matters: specific tags are rewritten first and the final catch-all rule
    # removes whatever tags are left.
    _TAG_RULES = (
        (re.compile('<img.*?>| {7}'), ""),             # images, 7-space runs
        (re.compile('<a.*?>|</a>'), ""),               # hyperlink tags
        (re.compile('<tr>|<div>|</div>|</p>'), "\n"),  # block ends -> newline
        (re.compile('<td>'), "\t"),                    # table cell -> tab
        (re.compile('<p.*?>'), "\n "),                 # paragraph start
        (re.compile('<br><br>|<br>'), "\n"),           # line breaks
        (re.compile('<.*?>'), ""),                     # any remaining tag
    )

    # Characters that cannot safely appear in a file name.
    _UNSAFE_NAME_RE = re.compile(r'[\\/:*?"<>|\r\n\t]')

    def __init__(self, url, seelz, floorTag):
        """Store the thread URL and the download options.

        Args:
            url: Base URL of the thread.
            seelz: Value appended to the ``see_lz`` query parameter
                (whether to fetch only the original poster's posts).
            floorTag: Whether to write a separator line before each post.
                Text answers are accepted: ``''`` and ``'0'`` mean "no".
        """
        self.url = url
        self.seelz = '?see_lz=' + str(seelz)
        self.file = None
        self.floor = 1
        self.defaultTitle = u"百度贴吧"
        self.floorTag = self._toFlag(floorTag)

    @staticmethod
    def _toFlag(value):
        """Convert a flag given as text or as any other value to a bool."""
        # Interactive input arrives as text, and the text '0' is truthy, so
        # string-like values need an explicit comparison.
        if hasattr(value, 'strip'):
            return value.strip() not in ('', '0')
        return bool(value)

    def getHtml(self, pageNum):
        """Download one page of the thread.

        Args:
            pageNum: Page number placed in the ``pn`` query parameter.

        Returns:
            The response body as returned by ``response.read()``, or ``None``
            when the request fails with ``urllib2.URLError`` (the reason, if
            available, is printed).
        """
        url = self.url + self.seelz + '&pn=' + str(pageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                print (e.reason)
            return None

    def getTitle(self, pageIndex):
        """Extract the thread title from page HTML.

        Args:
            pageIndex: HTML of a thread page, or ``None``.

        Returns:
            The title text, or ``None`` if it cannot be found.
        """
        if pageIndex is None:
            return None
        match = self._TITLE_RE.search(pageIndex)
        return match.group(1) if match else None

    def pageNum(self, html=None):
        """Extract the thread's page count.

        Args:
            html: HTML of the first page. When omitted, page 1 is downloaded.

        Returns:
            The page count as captured from the HTML (a string), or ``None``
            if the page could not be downloaded or the count is not present.
        """
        if html is None:
            html = self.getHtml(1)
        if html is None:
            return None
        match = self._PAGE_COUNT_RE.search(html)
        return match.group(1) if match else None

    def getContent(self, page):
        """Extract the cleaned text of every post on a page.

        Args:
            page: HTML of a thread page, or ``None``.

        Returns:
            A list with one cleaned string per post; empty if ``page`` is
            ``None`` or contains no posts.
        """
        if page is None:
            return []
        return [self.replace(item) for item in self._CONTENT_RE.findall(page)]

    def replace(self, x):
        """Strip HTML markup from a post body.

        Images, hyperlink tags and runs of seven spaces are removed; row,
        division, paragraph and line-break tags become newlines; table cells
        become tabs; every other tag is dropped.

        Args:
            x: Raw HTML of one post.

        Returns:
            The text with leading and trailing whitespace removed.
        """
        for pattern, replacement in self._TAG_RULES:
            x = pattern.sub(replacement, x)
        return x.strip()

    def setTitleFile(self, title):
        """Open the output file, named after the thread title.

        Args:
            title: Thread title; when empty or ``None`` the default title is
                used. Characters that are unsafe in file names are replaced
                with ``_``.
        """
        if self.file is not None:
            # Do not leak a previously opened output file.
            self.file.close()
        if title:
            name = self._UNSAFE_NAME_RE.sub('_', title.strip())
        else:
            name = self.defaultTitle
        self.file = open(name + '.txt', 'w+')

    def writeData(self, contents):
        """Write the posts of one page to the output file.

        The file is left open so that following pages can be appended;
        ``start`` closes it once all pages are written.

        Args:
            contents: Iterable of cleaned post texts.
        """
        for content in contents:
            if self.floorTag:
                floorLine = '\n楼层' + str(self.floor) + '-' * 50 + '\n'
                self.file.write(floorLine)
            self.file.write(content)
            self.floor += 1

    def start(self):
        """Download the whole thread and write it to the output file.

        Prints a message and returns without creating a file when the first
        page cannot be downloaded or contains no usable page count. Pages
        that fail to download are skipped. The output file is always closed
        before returning.
        """
        indexPage = self.getHtml(1)
        try:
            totalPages = int(self.pageNum(indexPage) or 0)
        except ValueError:
            totalPages = 0
        if indexPage is None or totalPages < 1:
            print ('帖子已失效!')
            return None
        title = self.getTitle(indexPage)
        # Every run rewrites the file from scratch, so numbering restarts.
        self.floor = 1
        try:
            self.setTitleFile(title)
            print ('该帖子共有%s页' % (totalPages))
            # Thread pages are numbered from 1.
            for pageIndex in range(1, totalPages + 1):
                print ("正在写入第" + str(pageIndex) + "页数据……")
                # Page 1 is already in memory; do not download it again.
                if pageIndex == 1:
                    page = indexPage
                else:
                    page = self.getHtml(pageIndex)
                if page is None:
                    # getHtml has already reported the failure.
                    continue
                self.writeData(self.getContent(page))
        except IOError as e:
            print (e)
        else:
            print ('写入完成')
        finally:
            if self.file is not None:
                self.file.close()
                self.file = None
# Interactive entry point: ask for the download options, then fetch the
# thread and write it to a text file.
baseUrl = 'https://tieba.baidu.com/p/5074437851'
# The answer is passed through as the see_lz query value; surrounding
# whitespace is dropped so it does not end up in the URL.
seelz = raw_input('是否只查看楼主,输入0为否:\n').strip()
# raw_input returns text and the text "0" is truthy, so turn the answer into
# a real boolean: floor separators are written only when the user enters 1.
floorTag = raw_input("是否写入楼层信息,是输入1,否输入0\n").strip() == '1'
bdtb = BDTB(baseUrl, seelz, floorTag)
bdtb.start()
# Baidu Tieba post scraper
# Reposted from: https://www.cnblogs.com/Emai76/p/6716072.html