当前位置: 首页 > news >正文

3.4 爬虫实战-爬取智联招聘职位信息

课程目标

爬取智联招聘

课程内容

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import time
def tran_salary(ori_salary):
    """Convert a salary figure with an optional Chinese unit into a number.

    Args:
        ori_salary: Salary text such as "1.5万" (ten-thousands), "8千"
            (thousands) or a bare number such as "5000".

    Returns:
        The amount as a float, scaled by 10000 for "万" and by 1000 for "千".
        Text without a unit is converted as-is, so the result is always
        numeric and can safely be used in arithmetic by the caller.

    Raises:
        ValueError: If the text left after removing the unit is not a number.
    """
    text = ori_salary.strip()
    for unit, factor in (("万", 10000), ("千", 1000)):
        if unit in text:
            return float(text.replace(unit, "")) * factor
    # No unit suffix: still return a number rather than the raw string.
    return float(text)
def get_page(page):
    """Download one page of Zhaopin search results and return its HTML.

    Args:
        page: Page number substituted into the search URL (``.../p{page}``).

    Returns:
        The response body decoded as text.

    Raises:
        requests.exceptions.RequestException: On connection problems, when
            the server does not answer within the timeout, or when it
            replies with an HTTP error status.
    """
    # Browser-like request headers copied from a Chrome session.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "priority": "u=0, i",
        "referer": "https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1",
        "sec-ch-ua": "\"Google Chrome\";v=\"129\", \"Not=A?Brand\";v=\"8\", \"Chromium\";v=\"129\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
    }
    # NOTE(review): these cookie values look like they were captured from one
    # browser session and presumably expire -- refresh them if the site starts
    # returning pages without job listings.
    cookies = {
        '_uab_collina': '172727354143424658574824',
        'acw_tc': '1a0c638e17272735400066445e0053005d54c1b918e525ad699ab6092e7e02',
        'acw_sc__v2': '66f41a3eacd51f5c8bb71f5793f12066ce7ab07c',
        'x-zp-client-id': 'bb03355b-599f-495e-8d2b-30a4f57b7584',
        'FSSBBIl1UgzbN7NS': '5iZQG1DC.WA2czpgKmafwzsAdzR.QEOHg8HC8skKuYfgXHOVdgLCPwc7y8ZlgriWSib.caw32rM0w0pfv0PBV9G',
        'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%2219229869c96ab7-04ec55af3b58d84-26001151-921600-19229869c9713fe%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTkyMjk4NjljOTZhYjctMDRlYzU1YWYzYjU4ZDg0LTI2MDAxMTUxLTkyMTYwMC0xOTIyOTg2OWM5NzEzZmUifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%2219229869c96ab7-04ec55af3b58d84-26001151-921600-19229869c9713fe%22%7D',
        'sajssdk_2015_cross_new_user': '1',
        'HMACCOUNT_BFESS': '3A069830089BCDB2',
        'Hm_lvt_21a348fada873bdc2f7f75015beeefeb': '1727273541',
        'Hm_lpvt_21a348fada873bdc2f7f75015beeefeb': '1727273541',
        'HMACCOUNT': '3A069830089BCDB2',
        'locationInfo_search': '{%22code%22:%22635%22%2C%22name%22:%22%E5%8D%97%E4%BA%AC%22%2C%22message%22:%22%E5%8C%B9%E9%85%8D%E5%88%B0%E5%B8%82%E7%BA%A7%E7%BC%96%E7%A0%81%22}',
        'FSSBBIl1UgzbN7NT': '5RBAFHbM0DOZqqqD1t3F9WG8co7bIo4rV6.nD4kr2dtixybv9BNj2CJSBoTS6tURMFwXclovVDDk_XXjinHgWwlAwiipd_yC9AZ3c7InbwyLyhfAZTH_vbrvOZ1x2kRsF.RbKexulxkWEG.GqrbeUedQMWVLHIeOa2CoNwYTdTGUm5_Nv6RqouNuFlzPykLCfXUuOFagtyYYE5hZ9WwOOl9WDpo378yA.WI.SlXZe5Hh.Nhm_tajQ5lufEmxXLurZC_ephouMjVXf4fav7tqBmB',
        '1420ba6bb40c9512e9642a1f8c243891': '3f4dcf67-7cd0-40ea-9af4-9dd3ea9acb1e',
    }
    url = f"https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p{page}"
    # Without a timeout requests can wait forever on a stalled connection.
    response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text


# Accumulates one dict per scraped job posting.
infos = []
def _parse_average_salary(salary_text):
    """Return the midpoint of a salary range such as "1万-1.5万·13薪".

    "面议" (negotiable) yields 0. Anything after "·" is dropped before the
    range is split on "-" and each end is converted with ``tran_salary``.
    """
    if salary_text == '面议':
        return 0
    if "·" in salary_text:
        salary_text = salary_text.split("·")[0]
    low_text, high_text = salary_text.split("-")
    return (tran_salary(low_text) + tran_salary(high_text)) / 2


def _parse_min_experience(experience_text):
    """Return the leading number of an experience requirement as an int.

    "1-3年" gives 1 and "3年" gives 3. Text that does not start with a digit
    (for example "经验不限") gives 0, and trailing non-digit text is ignored
    instead of making ``int()`` fail.
    """
    digits = ""
    for ch in experience_text:
        if not ch.isdecimal():
            break
        digits += ch
    return int(digits) if digits else 0


def _parse_job(job_item):
    """Build the record dict for one ``joblist-box__item`` element."""
    job_name = job_item.find("a", class_="jobinfo__name").text.strip()
    company_name = job_item.find("a", class_="companyinfo__name").text.strip()
    salary_text = job_item.find("p", class_="jobinfo__salary").text.strip()
    salary = _parse_average_salary(salary_text)

    # Skill tags are optional on a posting.
    tag_box = job_item.find("div", class_="jobinfo__tag")
    skills = [] if tag_box is None else [tag.text for tag in tag_box.find_all("div")]

    other_infos = job_item.find("div", class_="jobinfo__other-info").find_all("div")
    # The location reads "city·district·area"; missing parts stay empty.
    area_parts = other_infos[0].text.strip().split("·")
    city, classify, region = (area_parts + ["", "", ""])[:3]
    experience_requirement = _parse_min_experience(other_infos[1].text.strip())
    education_background_requirement = other_infos[2].text.strip()

    company_items = job_item.find("div", class_="companyinfo__tag").find_all("div")
    finance_info = company_items[0].text.strip()
    scale = company_items[1].text.strip()
    # The third company tag is not always present.
    company_type = company_items[2].text.strip() if len(company_items) > 2 else ""

    return {
        "公司名字": company_name,
        "薪资": salary,
        "技能要求": skills,
        "市": city,
        "区": classify,
        "区域": region,
        "经验要求": experience_requirement,
        "学历要求": education_background_requirement,
        "融资信息": finance_info,
        "规模": scale,
        "公司类型": company_type,
        # Added last so the positions of the existing columns do not change.
        "职位名称": job_name,
    }


for i in range(1, 6):
    print(f"正在爬取第{i}页")
    # Request the page for this iteration (previously page 1 was fetched
    # every time, so the same listings were collected five times).
    html_str = get_page(i)
    soup = BeautifulSoup(html_str, "html.parser")
    joblist = soup.find_all("div", class_="joblist-box__item")
    for job_item in tqdm(joblist):
        infos.append(_parse_job(job_item))
    # Pause between pages to avoid hammering the site.
    time.sleep(2)

# Convert the collected records to a DataFrame and save it as an Excel file.
df = pd.DataFrame(infos)
df.to_excel("智联职位信息.xlsx", index=False)

相关文章:

  • 演示:基于WPF的DrawingVisual开发的频谱图和律动图
  • 【分布式微服务云原生】10分钟打造坚不可摧的系统:深入探索系统的鲁棒性
  • 在树莓派上基于 LNMP 搭建 Nextcloud
  • 图灵完备-奇数个信号
  • 百度智能体创建:情感领域的创新力量
  • 【大模型对话 的界面搭建-Open WebUI】
  • 【C++算法】5.双指针_乘最多水的容器
  • OIDC9-OIDC集成登录功能(SpringBoot3.0)
  • 【Linux网络】详解TCP协议(3)
  • GitLab CI/CD脚本入门
  • JAVA工具类——Collections
  • AI学习指南深度学习篇-丢弃法Python实践
  • FTP访问方式详解
  • 【JVM】JVM执行流程和内存区域划分
  • 04_OpenCV图片缩放
  • 【Leetcode】101. 对称二叉树
  • 11111111
  • CentOS 7 修改主机名
  • Git学习与使用心得(1)—— 初始化
  • Hexo+码云+git快速搭建免费的静态Blog
  • Java 11 发布计划来了,已确定 3个 新特性!!
  • javascript从右向左截取指定位数字符的3种方法
  • JAVA并发编程--1.基础概念
  • learning koa2.x
  • MySQL几个简单SQL的优化
  • nginx 配置多 域名 + 多 https
  • php ci框架整合银盛支付
  • Phpstorm怎样批量删除空行?
  • react 代码优化(一) ——事件处理
  • SQLServer插入数据
  • vue脚手架vue-cli
  • vue--为什么data属性必须是一个函数
  • Vultr 教程目录
  • WePY 在小程序性能调优上做出的探究
  • 半理解系列--Promise的进化史
  • 初识 webpack
  • 短视频宝贝=慢?阿里巴巴工程师这样秒开短视频
  • zabbix3.2监控linux磁盘IO
  • 阿里云服务器如何修改远程端口?
  • ​LeetCode解法汇总518. 零钱兑换 II
  • # 利刃出鞘_Tomcat 核心原理解析(八)-- Tomcat 集群
  • #Datawhale AI夏令营第4期#AIGC方向 文生图 Task2
  • #我与Java虚拟机的故事#连载07:我放弃了对JVM的进一步学习
  • (1/2)敏捷实践指南 Agile Practice Guide ([美] Project Management institute 著)
  • (LeetCode C++)盛最多水的容器
  • (PySpark)RDD实验实战——取一个数组的中间值
  • (pytorch进阶之路)扩散概率模型
  • (接口封装)
  • (十)c52学习之旅-定时器实验
  • (学习日记)2024.03.12:UCOSIII第十四节:时基列表
  • (一)Thymeleaf用法——Thymeleaf简介
  • (一)插入排序
  • (转)重识new
  • .net core 实现redis分片_基于 Redis 的分布式任务调度框架 earth-frost
  • .NET Framework与.NET Framework SDK有什么不同?