当前位置：首页 > news >正文

爬虫入门（四）

news 来源：原创 2024/4/29 15:53:55

1.移动端数据

配置fiddler
    tools->options->connection->allow remote computer to connect
    fiddler port: xxxx
移动端安装fiddler的证书：
    保证移动端和fiddler所在的pc的网络在同一个网段下
    在移动端的浏览器中访问：fiddler所在机器的ip地址:fiddler的端口号
    证书下载完毕后进行安装并信任
配置手机的网络：
    给手机设置一个代理：ip 填写 fiddler 所在机器的 ip 地址，port 填写 fiddler 的端口号

2.scrapy的初步使用

settings

# Scrapy setting: do not obey robots.txt rules for this project.
ROBOTSTXT_OBEY = False

# Browser-like User-Agent string used instead of Scrapy's default one.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/69.0.3497.81 Safari/537.36'
)

# Enabled item pipelines, keyed by import path; the integer is the order value.
ITEM_PIPELINES = {
    'firstblood.pipelines.FirstbloodPipeline': 300,
}

# -*- coding: utf-8 -*-
import scrapy


class FirstSpider(scrapy.Spider):
    """Spider that collects author/content pairs from the qiushibaike text page.

    ``parse`` returns its results as a list of dicts, which is what the
    command-line feed export (``scrapy crawl first -o xxx.csv``) persists.
    """

    # Name of the spider.
    name = 'first'
    # Allowed domains (disabled).
    # allowed_domains = ['www.xxx.com']
    # List of start URLs.
    start_urls = ['https://www.qiushibaike.com/text/']

    # Earlier print-only version of parse, kept for reference:
    #
    # def parse(self, response):
    #     div_list = response.xpath('//div[@id="content-left"]/div')
    #     for div in div_list:
    #         # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
    #         # .extract_first() can be used when the xpath result list is
    #         # guaranteed to contain exactly one element
    #         author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
    #         content = div.xpath('./a[1]/div/span//text()').extract()
    #         content = ''.join(content)
    #
    #         print(author, content)
    #
    # Parsing + persistent storage can be done in two ways:
    #   1. via the command line -- only the return value of parse can be
    #      persisted to a local file this way
    #   2. via item pipelines

    # 1. Command-line based persistence: scrapy crawl first -o xxx.csv
    def parse(self, response):
        """Return one ``{'author', 'content'}`` dict per post on the page.

        :param response: the downloaded page for one of ``start_urls``.
        :return: list of dicts; ``author`` is the first matching text node
            (or ``None`` when nothing matched), ``content`` is all matching
            text nodes joined together.
        """
        return [
            {
                # extract_first() returns the first matched text node.
                'author': post.xpath('./div[1]/a[2]/h2/text()').extract_first(),
                # The content is spread over several text nodes; join them.
                'content': ''.join(post.xpath('./a[1]/div/span//text()').extract()),
            }
            for post in response.xpath('//div[@id="content-left"]/div')
        ]

解析数据+管道持久化存储

settings

# Enabled item pipelines, keyed by import path; the integer is the order value
# (file output, then MySQL, then Redis when sorted ascending).
ITEM_PIPELINES = {
    'boosPro.pipelines.BoosproPipeline': 300,
    'boosPro.pipelines.MysqlPipeline': 301,
    'boosPro.pipelines.RedisPipeline': 302,
}

# -*- coding: utf-8 -*-
import scrapy
from boosPro.items import BoosproItem


class BossSpider(scrapy.Spider):
    """Spider that scrapes job listings from zhipin.com search result pages.

    Each listing becomes a ``BoosproItem`` handed to the item pipelines.
    After a page is parsed, the next page is requested manually for as long
    as ``page`` is still <= 3 before being incremented.
    """

    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&scity=101010100&industry=&position=']

    # URL template for follow-up pages; %d receives the page number.
    url = 'https://www.zhipin.com/c101010100/?query=python爬虫&page=%d&ka=page-2'
    # Page counter, incremented before each follow-up request.
    page = 1

    # Parsing + pipeline-based persistence
    def parse(self, response):
        """Yield one item per job listing, then the request for the next page.

        :param response: a downloaded search result page.
        :return: generator of ``BoosproItem`` objects, optionally followed
            by one ``scrapy.Request`` whose callback is this method.
        """
        for listing in response.xpath('//div[@class="job-list"]/ul/li'):
            # Wrap the parsed values in an item and submit it to the pipelines.
            yield BoosproItem(
                job_name=listing.xpath('.//div[@class="info-primary"]/h3/a/div/text()').extract_first(),
                salary=listing.xpath('.//div[@class="info-primary"]/h3/a/span/text()').extract_first(),
                company=listing.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first(),
            )

        if self.page > 3:
            return

        print("执行！！！")
        self.page += 1
        next_page_url = self.url % self.page
        print(next_page_url)
        # Manually issue the request for the next page.
        yield scrapy.Request(url=next_page_url, callback=self.parse)

items

class BoosproItem(scrapy.Item):
    """Container for one scraped job listing.

    The three fields are filled in by ``BossSpider.parse`` and read by the
    pipelines via ``item["job_name"]``, ``item["salary"]`` and
    ``item["company"]``.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Job title text of the listing.
    job_name = scrapy.Field()
    # Salary text of the listing.
    salary = scrapy.Field()
    # Company name text of the listing.
    company = scrapy.Field()

pipelines

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json

import pymysql
from redis import Redis


class BoosproPipeline(object):
    """Item pipeline that writes every item as one line of ``./boss.txt``.

    Each line has the form ``job_name:salary:company``.
    """

    # Output file handle; opened in open_spider, closed in close_spider.
    fp = None

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts."""
        print("开始爬虫......")
        self.fp = open('./boss.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        Safe to call even if ``open_spider`` never ran or failed.
        """
        print("结束爬虫.......")
        if self.fp is not None:
            self.fp.close()
            self.fp = None

    def process_item(self, item, spider):
        """Append one item to the output file.

        Called once for every item the spider submits to the pipelines.

        :param item: the item received by the pipeline; must provide the
            keys ``job_name``, ``salary`` and ``company``.
        :param spider: the spider that produced the item (unused).
        :return: the same item, so that later pipelines receive it too.
        """
        values = (item["job_name"], item["salary"], item["company"])
        # A field is None when its xpath matched nothing; write it as an
        # empty column instead of failing on str + None.
        line = ":".join("" if value is None else str(value) for value in values)
        self.fp.write(line + "\n")
        return item


class MysqlPipeline(object):
    """Item pipeline that inserts every item as a row of the ``boss`` table."""

    # Database connection; created in open_spider.
    conn = None
    # Cursor reused for all inserts; created on first use.
    cursor = None

    def open_spider(self, spider):
        """Connect to the local MySQL server when the spider starts."""
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='scrapy', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        """Insert one item into the ``boss`` table.

        The insert is committed on success; on any error the error is
        printed and the transaction is rolled back, so a single bad row
        does not stop the crawl.

        :param item: the item received by the pipeline; must provide the
            keys ``job_name``, ``salary`` and ``company``.
        :param spider: the spider that produced the item (unused).
        :return: the same item, so that later pipelines receive it too.
        """
        # Reuse one cursor rather than creating a new, never-closed cursor
        # for every item.
        if self.cursor is None:
            self.cursor = self.conn.cursor()
        try:
            # Let the driver bind the values: scraped text may contain
            # quotes, which would break (or inject into) a string-built query.
            self.cursor.execute(
                'insert into boss values (%s, %s, %s)',
                (item["job_name"], item["salary"], item["company"]),
            )
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and then the connection.

        Safe to call when no item was processed or the connection was
        never opened.
        """
        # Close the cursor before the connection it belongs to.
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None


class RedisPipeline(object):
    """Item pipeline that pushes every item onto the Redis list ``boss``."""

    # Redis client; created in open_spider.
    conn = None

    def open_spider(self, spider):
        """Create the client for the local Redis server when the spider starts."""
        self.conn = Redis(host='127.0.0.1', port=6379)
        print(self.conn)

    def process_item(self, item, spider):
        """Push one item, serialized as JSON, onto the ``boss`` list.

        :param item: the item received by the pipeline; must provide the
            keys ``job_name``, ``salary`` and ``company``.
        :param spider: the spider that produced the item (unused).
        :return: the same item, so that later pipelines receive it too.
        """
        dic = {
            'name': item["job_name"],
            'salary': item["salary"],
            'company': item["company"]
        }

        # redis-py only accepts bytes/str/int/float values and rejects a
        # dict, so store the record as a JSON string. ensure_ascii=False
        # keeps non-ASCII text readable in Redis.
        self.conn.lpush('boss', json.dumps(dic, ensure_ascii=False))
        # Return the item so any pipeline that runs after this one gets it.
        return item