第四节:电商信息爬取项目实战
课程目标
- 学习如何使用 Playwright 完成某宝商品信息爬取
课程内容
编码实现
import json
import time

import pandas as pd
from playwright.sync_api import Error as PlaywrightError
from playwright.sync_api import Playwright, sync_playwright
from tqdm import tqdm
def _visible_text(locator) -> str:
    """Wait for *locator* to become visible and return its inner text."""
    locator.wait_for()
    return locator.inner_text()


def _scrape_card(card) -> dict:
    """Extract one product record from a single search-result card.

    Args:
        card: Playwright locator pointing at the card's container element.

    Returns:
        A dict with the keys ``title``, ``img``, ``price``,
        ``deliver_place`` and ``brand``.

    Raises:
        ValueError: If the concatenated price text cannot be parsed as a float.
    """
    title = _visible_text(card.locator(".Title--title--jCOPvpf"))

    # The picture is optional: give it only a short grace period and fall
    # back to an empty string instead of failing the whole card.
    image = card.locator(".MainPic--mainPic--rcLNaCv")
    try:
        image.wait_for(timeout=500, state="attached")
        img = image.get_attribute("src") or ""
    except PlaywrightError:
        img = ""

    # The price is rendered as two separate elements (integer part and
    # fractional part); join their text before converting.
    price = float(
        _visible_text(card.locator(".Price--priceInt--ZlsSi_M"))
        + _visible_text(card.locator(".Price--priceFloat--h2RR0RK"))
    )

    # A card carries zero, one or two shipping-origin elements. Only the
    # first two are used; a card with none yields an empty string rather
    # than raising IndexError.
    place_parts = card.locator(".Price--procity--_7Vt3mX").all()
    deliver_place = "".join(_visible_text(part) for part in place_parts[:2])

    brand = _visible_text(card.locator(".ShopInfo--shopName--rg6mGmy"))

    return {
        "title": title,
        "img": img,
        "price": price,
        "deliver_place": deliver_place,
        "brand": brand,
    }


def run(
    playwright: Playwright,
    keyword: str = "工装裤",
    max_pages: int = 5,
    output_path: str = "商品7.csv",
) -> None:
    """Search Taobao for *keyword*, scrape the result cards and save them as CSV.

    Launches a headed Chromium browser, opens the search page, submits the
    keyword, then walks through up to ``max_pages`` result pages collecting
    one record per product card. The records are written to ``output_path``
    as a GBK-encoded CSV file without an index column.

    Args:
        playwright: Active Playwright driver from ``sync_playwright()``.
        keyword: Text typed into the search box.
        max_pages: Number of result pages to scrape.
        output_path: Destination path of the CSV file.
    """
    browser = playwright.chromium.launch(headless=False)
    try:
        context = browser.new_context()
        page = context.new_page()
        page.goto("https://uland.taobao.com/sem/tbsearch")

        page.get_by_label("请输入搜索文字").fill(keyword)
        page.get_by_role("button", name="搜索").click()

        commodity_infos = []
        for page_index in range(max_pages):
            print(f"第{page_index + 1}页")

            # Scroll down before collecting the cards on this page.
            page.mouse.wheel(0, 8000)
            # NOTE(review): right after clicking "next page" this wait may be
            # satisfied by the previous page's cards — confirm the result
            # list has actually refreshed before relying on it.
            page.wait_for_selector(".Card--doubleCard--wznk5U4")

            # Each matched element's parent is the container that holds the
            # title, picture, price and shop information.
            cards = page.locator(".Card--doubleCard--wznk5U4").locator("..").all()
            for card in tqdm(cards):
                commodity_infos.append(_scrape_card(card))

            # Do not click "next page" after the final page: it is wasted
            # work and would time out if the button is unavailable.
            if page_index < max_pages - 1:
                page.get_by_label("下一页").click()
    finally:
        # Closing the browser also closes every context and page it owns.
        browser.close()

    # errors="replace" keeps a single non-GBK character (e.g. an emoji in a
    # title) from aborting the export after the whole scrape has finished.
    pd.DataFrame(commodity_infos).to_csv(
        output_path, encoding="gbk", errors="replace", index=False
    )


# Script entry: start the Playwright driver and run the scraper.
with sync_playwright() as playwright:
    run(playwright)
测试与调试