
Scraping Weibo with Scrapy (Part 5) [Detailed Walkthrough]: Crawling Weibo Posts

1 Reading the configuration parameters

To crawl Weibo posts, the spider first reads the configuration variables defined in settings.py and builds itself around them. The main variables are the search keywords, the time range, and the region to crawl.

import os
import re
import sys
from datetime import datetime, timedelta

import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.utils.project import get_project_settings

from weibo.items import WeiboItem        # project-specific import paths;
import weibo.utils.util as utils         # adjust to your project layout


class WeiboSearchSpider(scrapy.Spider):
    name = 'weibo_search'
    allowed_domains = ['weibo.com']
    settings = get_project_settings()
    # KEYWORD_LIST is either a list of keywords or the path of a text file
    # containing one keyword per line
    keyword_list = settings.get('KEYWORD_LIST')
    if not isinstance(keyword_list, list):
        if not os.path.isabs(keyword_list):
            keyword_list = os.getcwd() + os.sep + keyword_list
        if not os.path.isfile(keyword_list):
            sys.exit('File %s does not exist' % keyword_list)
        keyword_list = utils.get_keyword_list(keyword_list)
    # URL-encode topic keywords of the form #topic# as %23topic%23
    for i, keyword in enumerate(keyword_list):
        if len(keyword) > 2 and keyword[0] == '#' and keyword[-1] == '#':
            keyword_list[i] = '%23' + keyword[1:-1] + '%23'
    weibo_type = utils.convert_weibo_type(settings.get('WEIBO_TYPE'))
    contain_type = utils.convert_contain_type(settings.get('CONTAIN_TYPE'))
    regions = utils.get_regions(settings.get('REGION'))
    base_url = 'https://s.weibo.com'
    start_date = settings.get('START_DATE',
                              datetime.now().strftime('%Y-%m-%d'))
    end_date = settings.get('END_DATE', datetime.now().strftime('%Y-%m-%d'))
    if utils.str_to_time(start_date) > utils.str_to_time(end_date):
        sys.exit('settings.py misconfigured: START_DATE must be earlier than '
                 'or equal to END_DATE; please fix settings.py')
    further_threshold = settings.get('FURTHER_THRESHOLD', 46)
    mysql_error = False
    pymysql_error = False
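For reference, here is a minimal sketch of the settings.py entries the spider reads. The option names come from the code above; the example values and the meanings noted for the numeric options follow the dataabc/weibo-search documentation and should be treated as illustrative assumptions:

# settings.py -- illustrative values only
KEYWORD_LIST = ['华为', '苹果公司']  # or the path of a txt file, one keyword per line
WEIBO_TYPE = 1           # post type to search (in weibo-search, 1 = original posts only)
CONTAIN_TYPE = 0         # content filter (in weibo-search, 0 = no filter)
REGION = ['全部']        # publishing region; '全部' means all regions
START_DATE = '2024-09-01'
END_DATE = '2024-09-30'
FURTHER_THRESHOLD = 46   # result-page count above which a query is split into finer time windows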

2 start_requests

The start_requests method is responsible for issuing the initial search requests. For each keyword it builds a search URL: a single nationwide query, or one query per region:

    def start_requests(self):
        start_date = datetime.strptime(self.start_date, '%Y-%m-%d')
        end_date = datetime.strptime(self.end_date,
                                     '%Y-%m-%d') + timedelta(days=1)
        start_str = start_date.strftime('%Y-%m-%d') + '-0'
        end_str = end_date.strftime('%Y-%m-%d') + '-0'
        for keyword in self.keyword_list:
            if not self.settings.get('REGION') or '全部' in self.settings.get(
                    'REGION'):
                # '全部' means all regions: issue one nationwide query
                base_url = 'https://s.weibo.com/weibo?q=%s' % keyword
                url = base_url + self.weibo_type
                url += self.contain_type
                url += '&timescope=custom:{}:{}'.format(start_str, end_str)
                print(f'url:{url}')
                yield scrapy.Request(url=url,
                                     callback=self.parse,
                                     meta={
                                         'base_url': base_url,
                                         'keyword': keyword
                                     })
            else:
                for region in self.regions.values():
                    base_url = ('https://s.weibo.com/weibo?q={}&region=custom:{}:1000'
                                ).format(keyword, region['code'])
                    url = base_url + self.weibo_type
                    url += self.contain_type
                    url += '&timescope=custom:{}:{}'.format(start_str, end_str)
                    # fetch the search results for one province
                    yield scrapy.Request(url=url,
                                         callback=self.parse,
                                         meta={
                                             'base_url': base_url,
                                             'keyword': keyword,
                                             'province': region
                                         })
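self.weibo_type and self.contain_type are query-string fragments produced by the utils converters. Assuming they follow the dataabc/weibo-search conventions (WEIBO_TYPE = 1 becoming &scope=ori and CONTAIN_TYPE = 0 becoming &suball=1 — an assumption based on that project, not verified here), a generated search URL would look roughly like:

https://s.weibo.com/weibo?q=华为&scope=ori&suball=1&timescope=custom:2024-09-01-0:2024-10-01-0

Note that one day is added to end_date and both endpoints carry a '-0' hour suffix, so the timescope window covers the whole of the last day.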

3 parse

The parse method is responsible for parsing the elements on each result page. It handles three cases: an empty result set, a result set small enough to page through directly, and a result set so large that the query must be split by day:

    def parse(self, response):
        base_url = response.meta.get('base_url')
        keyword = response.meta.get('keyword')
        province = response.meta.get('province')
        is_empty = response.xpath(
            '//div[@class="card card-no-result s-pt20b40"]')
        page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
        if is_empty:
            print('No search results on the current page')
        elif page_count < self.further_threshold:
            # few enough result pages: parse this page and follow "next"
            for weibo in self.parse_weibo(response):
                self.check_environment()
                yield weibo
            next_url = response.xpath(
                '//a[@class="next"]/@href').extract_first()
            if next_url:
                next_url = self.base_url + next_url
                yield scrapy.Request(url=next_url,
                                     callback=self.parse_page,
                                     meta={'keyword': keyword})
        else:
            # too many result pages: split the query into one request per day
            start_date = datetime.strptime(self.start_date, '%Y-%m-%d')
            end_date = datetime.strptime(self.end_date, '%Y-%m-%d')
            while start_date <= end_date:
                start_str = start_date.strftime('%Y-%m-%d') + '-0'
                start_date = start_date + timedelta(days=1)
                end_str = start_date.strftime('%Y-%m-%d') + '-0'
                url = base_url + self.weibo_type
                url += self.contain_type
                url += '&timescope=custom:{}:{}&page=1'.format(
                    start_str, end_str)
                # fetch one day's search results
                yield scrapy.Request(url=url,
                                     callback=self.parse_by_day,
                                     meta={
                                         'base_url': base_url,
                                         'keyword': keyword,
                                         'province': province,
                                         'date': start_str[:-2]
                                     })
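The else branch exists because Weibo search shows only a limited number of result pages per query (the FURTHER_THRESHOLD default of 46 sits just under the roughly 50-page cap), so an over-broad query is re-issued one day at a time. A minimal standalone sketch of that windowing logic, extracted for clarity:

from datetime import datetime, timedelta

def day_windows(start, end):
    """Yield (start_str, end_str) pairs covering [start, end] one day at a
    time, in the 'YYYY-MM-DD-H' format Weibo's timescope parameter expects."""
    cur = datetime.strptime(start, '%Y-%m-%d')
    last = datetime.strptime(end, '%Y-%m-%d')
    while cur <= last:
        start_str = cur.strftime('%Y-%m-%d') + '-0'
        cur += timedelta(days=1)   # advance first, just as the spider does
        end_str = cur.strftime('%Y-%m-%d') + '-0'
        yield start_str, end_str

# list(day_windows('2024-09-01', '2024-09-03')) ->
# [('2024-09-01-0', '2024-09-02-0'),
#  ('2024-09-02-0', '2024-09-03-0'),
#  ('2024-09-03-0', '2024-09-04-0')]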

4 parse_weibo

The parse_weibo method extracts the fields of each weibo post (and, when present, of the retweeted original post):

    def parse_weibo(self, response):
        """Parse the weibo posts on the page."""
        keyword = response.meta.get('keyword')
        for sel in response.xpath("//div[@class='card-wrap']"):
            info = sel.xpath(
                "div[@class='card']/div[@class='card-feed']/div[@class='content']/div[@class='info']"
            )
            if info:
                weibo = WeiboItem()
                weibo['mid'] = sel.xpath('@mid').extract_first()
                bid = sel.xpath(
                    './/div[@class="from"]/a[1]/@href').extract_first(
                    ).split('/')[-1].split('?')[0]
                weibo['bid'] = bid
                weibo['user_id'] = info[0].xpath(
                    'div[2]/a/@href').extract_first().split('?')[0].split('/')[-1]
                weibo['screen_name'] = info[0].xpath(
                    'div[2]/a/@nick-name').extract_first()
                txt_sel = sel.xpath('.//p[@class="txt"]')[0]
                retweet_sel = sel.xpath('.//div[@class="card-comment"]')
                retweet_txt_sel = ''
                if retweet_sel and retweet_sel[0].xpath('.//p[@class="txt"]'):
                    retweet_txt_sel = retweet_sel[0].xpath('.//p[@class="txt"]')[0]
                # long posts carry their full text in a separate node
                content_full = sel.xpath('.//p[@node-type="feed_list_content_full"]')
                is_long_weibo = False
                is_long_retweet = False
                if content_full:
                    if not retweet_sel:
                        txt_sel = content_full[0]
                        is_long_weibo = True
                    elif len(content_full) == 2:
                        txt_sel = content_full[0]
                        retweet_txt_sel = content_full[1]
                        is_long_weibo = True
                        is_long_retweet = True
                    elif retweet_sel[0].xpath('.//p[@node-type="feed_list_content_full"]'):
                        retweet_txt_sel = retweet_sel[0].xpath(
                            './/p[@node-type="feed_list_content_full"]')[0]
                        is_long_retweet = True
                    else:
                        txt_sel = content_full[0]
                        is_long_weibo = True
                weibo['text'] = txt_sel.xpath(
                    'string(.)').extract_first().replace('\u200b', '').replace('\ue627', '')
                weibo['article_url'] = self.get_article_url(txt_sel)
                weibo['location'] = self.get_location(txt_sel)
                if weibo['location']:
                    weibo['text'] = weibo['text'].replace('2' + weibo['location'], '')
                weibo['text'] = weibo['text'][2:].replace(' ', '')
                if is_long_weibo:
                    weibo['text'] = weibo['text'][:-4]
                weibo['at_users'] = self.get_at_users(txt_sel)
                weibo['topics'] = self.get_topics(txt_sel)
                reposts_count = sel.xpath(
                    './/a[@action-type="feed_list_forward"]/text()').extract()
                reposts_count = "".join(reposts_count)
                try:
                    reposts_count = re.findall(r'\d+.*', reposts_count)
                except TypeError:
                    print("Could not parse the repost button; possible causes: "
                          "1) the page layout has changed, 2) the cookie is invalid or expired.\n"
                          "See the docs at https://github.com/dataabc/weibo-search to resolve this.")
                    raise CloseSpider()
                weibo['reposts_count'] = reposts_count[0] if reposts_count else '0'
                comments_count = sel.xpath(
                    './/a[@action-type="feed_list_comment"]/text()').extract_first()
                comments_count = re.findall(r'\d+.*', comments_count)
                weibo['comments_count'] = comments_count[0] if comments_count else '0'
                attitudes_count = sel.xpath(
                    './/a[@action-type="feed_list_like"]/button/span[2]/text()').extract_first()
                attitudes_count = re.findall(r'\d+.*', attitudes_count)
                weibo['attitudes_count'] = attitudes_count[0] if attitudes_count else '0'
                # strip the '前' ("ago") suffix from relative timestamps
                created_at = sel.xpath(
                    './/div[@class="from"]/a[1]/text()').extract_first(
                    ).replace(' ', '').replace('\n', '').split('前')[0]
                weibo['created_at'] = utils.standardize_date(created_at)
                source = sel.xpath('.//div[@class="from"]/a[2]/text()').extract_first()
                weibo['source'] = source if source else ''
                pics = ''
                is_exist_pic = sel.xpath('.//div[@class="media media-piclist"]')
                if is_exist_pic:
                    pics = is_exist_pic[0].xpath('ul[1]/li/img/@src').extract()
                    pics = [pic[8:] for pic in pics]
                    pics = [re.sub(r'/.*?/', '/large/', pic, 1) for pic in pics]
                    pics = ['https://' + pic for pic in pics]
                video_url = ''
                is_exist_video = sel.xpath(
                    './/div[@class="thumbnail"]//video-player').extract_first()
                if is_exist_video:
                    video_url = re.findall(r'src:\'(.*?)\'', is_exist_video)[0]
                    video_url = video_url.replace('&amp;', '&')
                    video_url = 'http:' + video_url
                # if not retweet_sel:
                #     weibo['pics'] = pics
                #     weibo['video_url'] = video_url
                # else:
                #     weibo['pics'] = ''
                #     weibo['video_url'] = ''
                weibo['retweet_id'] = ''
                if retweet_sel and retweet_sel[0].xpath(
                        './/div[@node-type="feed_list_forwardContent"]/a[1]'):
                    retweet = WeiboItem()
                    retweet['id'] = retweet_sel[0].xpath(
                        './/a[@action-type="feed_list_like"]/@action-data'
                    ).extract_first()[4:]
                    retweet['bid'] = retweet_sel[0].xpath(
                        './/p[@class="from"]/a/@href').extract_first(
                        ).split('/')[-1].split('?')[0]
                    info = retweet_sel[0].xpath(
                        './/div[@node-type="feed_list_forwardContent"]/a[1]')[0]
                    retweet['user_id'] = info.xpath('@href').extract_first().split('/')[-1]
                    retweet['screen_name'] = info.xpath('@nick-name').extract_first()
                    retweet['text'] = retweet_txt_sel.xpath(
                        'string(.)').extract_first().replace('\u200b', '').replace('\ue627', '')
                    retweet['article_url'] = self.get_article_url(retweet_txt_sel)
                    retweet['location'] = self.get_location(retweet_txt_sel)
                    if retweet['location']:
                        retweet['text'] = retweet['text'].replace(
                            '2' + retweet['location'], '')
                    retweet['text'] = retweet['text'][2:].replace(' ', '')
                    if is_long_retweet:
                        retweet['text'] = retweet['text'][:-4]
                    retweet['at_users'] = self.get_at_users(retweet_txt_sel)
                    retweet['topics'] = self.get_topics(retweet_txt_sel)
                    reposts_count = retweet_sel[0].xpath(
                        './/ul[@class="act s-fr"]/li[1]/a[1]/text()').extract_first()
                    reposts_count = re.findall(r'\d+.*', reposts_count)
                    retweet['reposts_count'] = reposts_count[0] if reposts_count else '0'
                    comments_count = retweet_sel[0].xpath(
                        './/ul[@class="act s-fr"]/li[2]/a[1]/text()').extract_first()
                    comments_count = re.findall(r'\d+.*', comments_count)
                    retweet['comments_count'] = comments_count[0] if comments_count else '0'
                    attitudes_count = retweet_sel[0].xpath(
                        './/a[@class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter"]'
                        '//span[@class="woo-like-count"]/text()').extract_first()
                    attitudes_count = re.findall(r'\d+.*', attitudes_count)
                    retweet['attitudes_count'] = attitudes_count[0] if attitudes_count else '0'
                    created_at = retweet_sel[0].xpath(
                        './/p[@class="from"]/a[1]/text()').extract_first(
                        ).replace(' ', '').replace('\n', '').split('前')[0]
                    retweet['created_at'] = utils.standardize_date(created_at)
                    source = retweet_sel[0].xpath(
                        './/p[@class="from"]/a[2]/text()').extract_first()
                    retweet['source'] = source if source else ''
                    # retweet['pics'] = pics
                    # retweet['video_url'] = video_url
                    retweet['retweet_id'] = ''
                    yield {'weibo': retweet, 'keyword': keyword}
                    weibo['retweet_id'] = retweet['id']
                weibo["ip"] = self.get_ip(bid)
                # the avatar badge's svg id encodes the verification tier
                avator = sel.xpath(
                    "div[@class='card']/div[@class='card-feed']/div[@class='avator']")
                if avator:
                    user_auth = avator.xpath('.//svg/@id').extract_first()
                    print(user_auth)
                    if user_auth == 'woo_svg_vblue':
                        weibo['user_authentication'] = '蓝V'      # blue V
                    elif user_auth == 'woo_svg_vyellow':
                        weibo['user_authentication'] = '黄V'      # yellow V
                    elif user_auth == 'woo_svg_vorange':
                        weibo['user_authentication'] = '红V'      # red V
                    elif user_auth == 'woo_svg_vgold':
                        weibo['user_authentication'] = '金V'      # gold V
                    else:
                        weibo['user_authentication'] = '普通用户'  # ordinary user
                print(weibo)
                weibo['keywords'] = keyword
                yield {'weibo': weibo, 'keyword': keyword}
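parse_weibo fills a WeiboItem whose definition this post does not show. Reconstructed from the fields assigned above, the items.py definition would look something like this sketch (field set inferred from usage, not copied from the project):

import scrapy

class WeiboItem(scrapy.Item):
    id = scrapy.Field()
    mid = scrapy.Field()
    bid = scrapy.Field()
    user_id = scrapy.Field()
    screen_name = scrapy.Field()
    text = scrapy.Field()
    article_url = scrapy.Field()
    location = scrapy.Field()
    at_users = scrapy.Field()
    topics = scrapy.Field()
    reposts_count = scrapy.Field()
    comments_count = scrapy.Field()
    attitudes_count = scrapy.Field()
    created_at = scrapy.Field()
    source = scrapy.Field()
    pics = scrapy.Field()
    video_url = scrapy.Field()
    retweet_id = scrapy.Field()
    ip = scrapy.Field()
    user_authentication = scrapy.Field()
    keywords = scrapy.Field()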

5 Run results

The keywords we crawl are ['华为', '苹果公司'] (Huawei and Apple Inc.).
The crawl covers the time range September 1-30, 2024.
The run looks like this:
[Screenshots of the crawler's run output]
