当前位置: 首页 > news > 正文

Python爬虫-爬小说

# 导入BeautifulSoup
from bs4 import BeautifulSoup as bf
from fastapi import FastAPI,Form,File
import time
import random
import requests
import traceback

# FastAPI application object; no routes are registered in this script.
app = FastAPI(title='爬虫',description='regex web: https://regexr-cn.com/  \n  eg : <a href="https://www.zbytb.com/s-zb-.*?</a>  \n eg : <a href="[./].*?</a>',version='1.0.0')

# Pool of request headers; getHeader() picks one at random per request.
headers = [
    {"User-Agent":"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"},
    {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"},
    {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"},
    {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14"},
    {"User-Agent":"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"},
    {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"},
    {"User-Agent":"Opera/9.25 (Windows NT 5.1; U; en)"},
    {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"},
    {"User-Agent":"Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)"},
    {"User-Agent":"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12"},
    {"User-Agent":"Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9"},
    {"User-Agent":"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7"},
    {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "},
]

# Proxy pool; empty in this script.
proxys = []

# Seconds to wait for the server before a request is abandoned, so a
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30


def wait():
    """Sleep for 0.2 seconds (not called by start() in this script)."""
    time.sleep(0.2)


def getHeader():
    """Return one randomly chosen entry of ``headers``."""
    return random.choice(headers)


def getProxy():
    """Return a random entry of ``proxys``, or None when the pool is empty."""
    # random.choice raises IndexError on an empty sequence.
    if not proxys:
        return None
    return random.choice(proxys)


def parseUrl(url):
    """Return ``url`` without a leading './'; other URLs are returned as-is."""
    if url.startswith('./'):
        # Drop only the prefix; './' sequences later in the URL are kept.
        url = url[2:]
    return url


def _fetch_soup(url):
    """Download ``url`` and return it parsed with the 'html.parser' backend."""
    response = requests.get(url, headers=getHeader(), timeout=REQUEST_TIMEOUT)
    # Decode using the encoding detected from the body.
    response.encoding = response.apparent_encoding
    return bf(response.text, 'html.parser')


def _write_paragraphs(out, url):
    """Write the text of every <p> in the first div#booktxt of ``url``.

    Each paragraph is followed by a newline. Raises IndexError when the page
    has no div with id 'booktxt'.
    """
    page = _fetch_soup(url)
    for ptag in page.find_all('div', id='booktxt')[0].find_all('p'):
        out.write(ptag.text)
        out.write("\n")


def start():
    """Append every chapter linked from the index page to the output file.

    For each <a> inside the first div#content_1 of the index page, the link
    text is printed and written as a title line, then the paragraphs of the
    chapter page and of its '_2.html' continuation page are written.
    Any exception stops the run and its traceback is printed.
    """
    try:
        list_obj = _fetch_soup('https://www.xjwxsw.com/xsmulu/27614204/')
        atags = list_obj.find_all('div', id='content_1')[0].find_all('a')
        # The context manager closes the file on every exit path, and only
        # if it was actually opened.
        with open('C://Users//admin//Desktop//777.txt', "a", encoding='utf-8') as f:
            for atag in atags:
                title = atag.text
                print(title)
                f.write(title)
                f.write("\n")
                href1 = 'https://www.xjwxsw.com'+atag.get('href')
                # Second page of the same chapter.
                href2 = href1.split('.html')[0]+'_2.html'
                for page_url in (href1, href2):
                    _write_paragraphs(f, page_url)
    except Exception:
        traceback.print_exc()


if __name__ == '__main__':
    start()
# 导入BeautifulSoup
from bs4 import BeautifulSoup as bf
from fastapi import FastAPI,Form,File
import time
import random
import requests
import traceback

# FastAPI application object; no routes are registered in this script.
app = FastAPI(title='爬虫',description='regex web: https://regexr-cn.com/  \n  eg : <a href="https://www.zbytb.com/s-zb-.*?</a>  \n eg : <a href="[./].*?</a>',version='1.0.0')

# Pool of request headers; getHeader() picks one at random per request.
headers = [
    {"User-Agent":"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"},
    {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"},
    {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"},
    {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14"},
    {"User-Agent":"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"},
    {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"},
    {"User-Agent":"Opera/9.25 (Windows NT 5.1; U; en)"},
    {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"},
    {"User-Agent":"Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)"},
    {"User-Agent":"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12"},
    {"User-Agent":"Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9"},
    {"User-Agent":"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7"},
    {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "},
]

# Proxy pool; empty in this script.
proxys = []

# Seconds to wait for the server before a request is abandoned, so a
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30


def wait():
    """Sleep for 0.2 seconds (not called by start() in this script)."""
    time.sleep(0.2)


def getHeader():
    """Return one randomly chosen entry of ``headers``."""
    return random.choice(headers)


def getProxy():
    """Return a random entry of ``proxys``, or None when the pool is empty."""
    # random.choice raises IndexError on an empty sequence.
    if not proxys:
        return None
    return random.choice(proxys)


def parseUrl(url):
    """Return ``url`` without a leading './'; other URLs are returned as-is."""
    if url.startswith('./'):
        # Drop only the prefix; './' sequences later in the URL are kept.
        url = url[2:]
    return url


def _fetch_soup(url):
    """Download ``url`` and return it parsed with the 'html.parser' backend."""
    response = requests.get(url, headers=getHeader(), timeout=REQUEST_TIMEOUT)
    # Decode using the encoding detected from the body.
    response.encoding = response.apparent_encoding
    return bf(response.text, 'html.parser')


def start():
    """Append every chapter linked from the index page to the output file.

    For each <a> inside the first ul#chapterList of the index page, the link
    text is printed and written as a title line, then the text of every <p>
    inside the first div#TextContent of the chapter page is written, one
    paragraph per line. Any exception stops the run and its traceback is
    printed.
    """
    try:
        list_obj = _fetch_soup('https://www.uuks5.com/book/766295/')
        atags = list_obj.find_all('ul', id='chapterList')[0].find_all('a')
        # The context manager closes the file on every exit path, and only
        # if it was actually opened.
        with open('C://Users//admin//Desktop//123.txt', "a", encoding='utf-8') as f:
            for atag in atags:
                title = atag.text
                print(title)
                f.write(title)
                f.write("\n")
                href1 = 'https://www.uuks5.com/'+atag.get('href')
                chapter = _fetch_soup(href1)
                for ptag in chapter.find_all('div', id='TextContent')[0].find_all('p'):
                    f.write(ptag.text)
                    f.write("\n")
    except Exception:
        traceback.print_exc()


if __name__ == '__main__':
    start()

相关文章:

  • PTT票据传递攻击
  • 基于单片机电梯控制系统设计与实现
  • Vitis HLS 学习笔记--抽象并行编程模型-控制驱动与数据驱动
  • VBA读取文本文件数据
  • Docker(一) Docker概述
  • 【全部更新完毕】2024电工杯B题详细思路代码成品文章教学:大学生平衡膳食食谱的优化设计及评价
  • 政策及需求多因素驱动下 中国适老化改造市场空间大
  • Web3 游戏平台 Creo Engine 销毁代币总量的20%,以促进长远发展
  • 2413. 最小偶倍数
  • 《TCP/IP网络编程》(第八章)域名及网络地址
  • 项目日记(3) boost搜索引擎
  • LabVIEW如何确保自动化设备的稳定性和可靠性?
  • RocketMQ使用(3):消息重复
  • 其它高阶数据结构⑦_Skiplist跳表_概念+实现+对比
  • 人工智能应用-实验8-用生成对抗网络生成数字图像
  • Android 架构优化~MVP 架构改造
  • Apache Spark Streaming 使用实例
  • canvas 绘制双线技巧
  • Codepen 每日精选(2018-3-25)
  • iOS 颜色设置看我就够了
  • JavaScript/HTML5图表开发工具JavaScript Charts v3.19.6发布【附下载】
  • Javascript编码规范
  • k8s如何管理Pod
  • React Transition Group -- Transition 组件
  • vue-cli3搭建项目
  • zookeeper系列(七)实战分布式命名服务
  • 测试开发系类之接口自动化测试
  • 初识 webpack
  • 关于 Linux 进程的 UID、EUID、GID 和 EGID
  • 技术:超级实用的电脑小技巧
  • 它承受着该等级不该有的简单, leetcode 564 寻找最近的回文数
  • 微信开放平台全网发布【失败】的几点排查方法
  • 我的业余项目总结
  • 小而合理的前端理论:rscss和rsjs
  • 7行Python代码的人脸识别
  • 策略 : 一文教你成为人工智能(AI)领域专家
  • ​LeetCode解法汇总2808. 使循环数组所有元素相等的最少秒数
  • # Panda3d 碰撞检测系统介绍
  • #我与Java虚拟机的故事#连载05:Java虚拟机的修炼之道
  • (delphi11最新学习资料) Object Pascal 学习笔记---第2章第五节(日期和时间)
  • (Redis使用系列) SpringBoot中Redis的RedisConfig 二
  • (分类)KNN算法- 参数调优
  • (附源码)spring boot车辆管理系统 毕业设计 031034
  • (译)2019年前端性能优化清单 — 下篇
  • (原創) 是否该学PetShop将Model和BLL分开? (.NET) (N-Tier) (PetShop) (OO)
  • (转)菜鸟学数据库(三)——存储过程
  • (转)程序员疫苗:代码注入
  • (转)甲方乙方——赵民谈找工作
  • (转)总结使用Unity 3D优化游戏运行性能的经验
  • (转载)虚函数剖析
  • *_zh_CN.properties 国际化资源文件 struts 防乱码等
  • .FileZilla的使用和主动模式被动模式介绍
  • .Net Core/.Net6/.Net8 ,启动配置/Program.cs 配置
  • @ModelAttribute 注解
  • [AI]文心一言出圈的同时,NLP处理下的ChatGPT-4.5最新资讯