当前位置: 首页 > news >正文

python网页爬虫菜鸟教程_python菜鸟 想做一个简单的爬虫 求教程

贴一个爬虫给你:

因为要做观点,观点的屋子类似于知乎的话题,所以得想办法把它爬下来,搞了半天最终还是妥妥地搞定了。代码是 python 写的,不懂的麻烦自学哈!懂的直接看代码,绝对可用。

#coding:utf-8

"""

@author:haoning

@create time:2015.8.5

"""

from __future__ import division # 精确除法

from Queue import Queue

from __builtin__ import False

import json

import os

import re

import platform

import uuid

import urllib

import urllib2

import sys

import time

import MySQLdb as mdb

from bs4 import BeautifulSoup

reload(sys)

sys.setdefaultencoding( "utf-8" )

# Request headers used for zhihu's ajax endpoints.
# NOTE(review): the Cookie below is a captured login session (dated 2016) and is
# almost certainly expired — replace it with a fresh one before running.
headers = {
    'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With':'XMLHttpRequest',
    'Referer':'https://www.zhihu.com/topics',
    'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}

# MySQL connection settings for the local `zhihu` database.
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

queue= Queue()  # work queue of (topic_token, topic_name, parent_name) tuples

# NOTE(review): the following globals (nodeSet, keywordSet, stop, offset,
# level, maxLevel, base) appear unused in this file — possibly leftovers from
# an earlier version; confirm before deleting.
nodeSet=set()
keywordSet=set()
stop=0
offset=-20
level=0
maxLevel=7
counter=0      # number of rooms inserted so far (progress reporting)
base=""

# Autocommit is off: commits are issued explicitly right after each insert so
# child topics can immediately look up their parent row.
conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    """Fetch *url* and return the raw response body, or None on any failure.

    Best-effort by design: network errors are swallowed so the crawler keeps
    going; callers must handle a None return.
    """
    try:
        req = urllib2.Request(url)
        # 3-second timeout; a proxy should probably be plugged in here
        response = urllib2.urlopen(req, None, 3)
        html = response.read()
        return html
    except Exception:
        # was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; narrowed so the crawler can still be interrupted
        pass
    return None

def getTopics():

url = 'https://www.zhihu.com/topics'

print url

try:

req = urllib2.Request(url)

response = urllib2.urlopen(req) #鍦ㄨ繖閲屽簲璇ュ姞鍏ヤ唬鐞�

html = response.read().decode('utf-8')

print html

soup = BeautifulSoup(html)

lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})

for li in lis:

data_id=li.get('data-id')

name=li.text

curr.execute('select id from classify_new where name=%s',(name))

y= curr.fetchone()

if not y:

curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name))

conn.commit()

except Exception as e:

print "get topic error",e

def get_extension(name):
    """Return the file extension of *name*, dot included (e.g. '.jpg'),
    or None when *name* contains no dot."""
    dot = name.rfind('.')
    if dot == -1:
        return None
    return name[dot:]

def which_platform():
    """Name of the current OS, e.g. 'Linux', 'Windows' or 'Darwin'."""
    return platform.system()

def GetDateString():
    """Today's local date as 'YYYY-MM-DD'; used as a per-day folder name."""
    return time.strftime('%Y-%m-%d', time.localtime(time.time()))

def makeDateFolder(par,classify):

try:

if os.path.isdir(par):

newFolderName=par + '//' + GetDateString() + '//' +str(classify)

if which_platform()=="Linux":

newFolderName=par + '/' + GetDateString() + "/" +str(classify)

if not os.path.isdir( newFolderName ):

os.makedirs( newFolderName )

return newFolderName

else:

return None

except Exception,e:

print "kk",e

return None

def download_img(url,classify):

try:

extention=get_extension(url)

if(extention is None):

return None

req = urllib2.Request(url)

resp = urllib2.urlopen(req,None,3)

dataimg=resp.read()

name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention

top="E://topic_pic"

folder=makeDateFolder(top, classify)

filename=None

if folder is not None:

filename =folder+"//"+name

try:

if "e82bab09c_m" in str(url):

return True

if not os.path.exists(filename):

file_object = open(filename,'w+b')

file_object.write(dataimg)

file_object.close()

return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name

else:

print "file exist"

return None

except IOError,e1:

print "e1=",e1

pass

except Exception as e:

print "eee",e

pass

return None #如果没有下载下来就利用原来网站的链接

def getChildren(node,name):

global queue,nodeSet

try:

url="https://www.zhihu.com/topic/"+str(node)+"/hot"

html=get_html(url)

if html is None:

return

soup = BeautifulSoup(html)

p_ch='父话题'

node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text

topic_cla=soup.find('div', {'class' : 'child-topic'})

if topic_cla is not None:

try:

p_ch=str(topic_cla.text)

aList = soup.find_all('a', {'class' : 'zm-item-tag'}) #获取所有子节点

if u'子话题' in p_ch:

for a in aList:

token=a.get('data-token')

a=str(a).replace('\n','').replace('\t','').replace('\r','')

start=str(a).find('>')

end=str(a).rfind('')

new_node=str(str(a)[start+1:end])

curr.execute('select id from rooms where name=%s',(new_node)) #先保证名字绝不相同

y= curr.fetchone()

if not y:

print "y=",y,"new_node=",new_node,"token=",token

queue.put((token,new_node,node_name))

except Exception as e:

print "add queue error",e

except Exception as e:

print "get html error",e

def getContent(n,name,p,top_id):

try:

global counter

curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同

y= curr.fetchone()

print "exist?? ",y,"n=",n

if not y:

url="https://www.zhihu.com/topic/"+str(n)+"/hot"

html=get_html(url)

if html is None:

return

soup = BeautifulSoup(html)

title=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text

pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src')

description=soup.find('div',{'class':'zm-editable-content'})

if description is not None:

description=description.text

if (u"未归类" in title or u"根话题" in title): #允许入库,避免死循环

description=None

tag_path=download_img(pic_path,top_id)

print "tag_path=",tag_path

if (tag_path is not None) or tag_path==True:

if tag_path==True:

tag_path=None

father_id=2 #默认为杂谈

curr.execute('select id from rooms where name=%s',(p))

results = curr.fetchall()

for r in results:

father_id=r[0]

name=title

curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同

y= curr.fetchone()

print "store see..",y

if not y:

friends_num=0

temp = time.time()

x = time.localtime(float(temp))

create_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now

create_time

creater_id=None

room_avatar=tag_path

is_pass=1

has_index=0

reason_id=None

#print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id

######################有资格入库的内容

counter=counter+1

curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id))

conn.commit() #必须时时进入数据库,不然找不到父节点

if counter % 200==0:

print "current node",name,"num",counter

except Exception as e:

print "get content error",e

def work():

global queue

curr.execute('select id,node,parent,name from classify where status=1')

results = curr.fetchall()

for r in results:

top_id=r[0]

node=r[1]

parent=r[2]

name=r[3]

try:

queue.put((node,name,parent)) #首先放入队列

while queue.qsize() >0:

n,p=queue.get() #顶节点出队

getContent(n,p,top_id)

getChildren(n,name) #出队内容的子节点

conn.commit()

except Exception as e:

print "what's wrong",e

def new_work():
    """Iterate the seed categories in `classify_new_copy` and crawl each one."""
    global queue
    curr.execute('select id,data_id,name from classify_new_copy where status=1')
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        data_id = r[1]
        name = r[2]
        try:
            get_topis(data_id, name, top_id)
        except Exception:
            # was a bare `except: pass`, which also trapped KeyboardInterrupt —
            # stay best-effort per category but let Ctrl-C stop the 400-pass
            # main loop
            pass

def get_topis(data_id,name,top_id):

global queue

url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'

isGet = True;

offset = -20;

data_id=str(data_id)

while isGet:

offset = offset + 20

values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'}

try:

msg=None

try:

data = urllib.urlencode(values)

request = urllib2.Request(url,data,headers)

response = urllib2.urlopen(request,None,5)

html=response.read().decode('utf-8')

json_str = json.loads(html)

ms=json_str['msg']

if len(ms) <5:

break

msg=ms[0]

except Exception as e:

print "eeeee",e

#print msg

if msg is not None:

soup = BeautifulSoup(str(msg))

blks = soup.find_all('div', {'class' : 'blk'})

for blk in blks:

page=blk.find('a').get('href')

if page is not None:

node=page.replace("/topic/","") #将更多的种子入库

parent=name

ne=blk.find('strong').text

try:

queue.put((node,ne,parent)) #首先放入队列

while queue.qsize() >0:

n,name,p=queue.get() #顶节点出队

size=queue.qsize()

if size > 0:

print size

getContent(n,name,p,top_id)

getChildren(n,name) #出队内容的子节点

conn.commit()

except Exception as e:

print "what's wrong",e

except urllib2.URLError, e:

print "error is",e

pass

if __name__ == '__main__':
    # Sweep the seed table 400 times. NOTE(review): already-inserted rooms are
    # skipped by the name-uniqueness checks, so the repetition presumably acts
    # as a retry loop for transient network failures — confirm intent.
    for _ in range(400):
        new_work()

说下数据库的问题,我这里就不传附件了,看字段自己建立,因为这确实太简单了,我是用的mysql,你看自己的需求自己建。

有什么不懂的麻烦去“去转盘网”找我,因为这个网站也是我开发的,上面会及时更新 QQ 群号。这里不留 QQ 号什么的,以免被系统给 K 了。

相关文章:

  • python编程控制机器人_python人工智能机器人工具书籍: Learn Robotics Programming 2018
  • python numpy库作用_Python NumPy库学习
  • python中pow_Python中float的内置pow()和math.pow()之间的区别?
  • python3.6标准库pdf_python3.6下载|python3.6官方版-520下载站
  • python常用单词有多少_在python中,如何找出一个单词中有多少个单独的字母?
  • wireshark抓取dns_利用Scapy打造简单的DNS监测脚本
  • flash动画在新媒体中的应用_2020年宁德市新媒体新技术创新应用课堂教学研讨活动(高中组)在宁德市高级中学举行(二)...
  • python语言流程控制语句的格式_慢步学python,编程基础知识,流程控制语句if
  • 输变电设备物联网传感器数据通信规约_物联网大潮来袭,无线通讯模块如何连接未来?...
  • python怎么爬取app数据_python高级教程 爬虫抓取App数据
  • python中loop函数_为何GAMLOOP中函数输出的Python Scopage不计算?
  • springboot 源码_SpringBoot是如何实现自动配置的? SpringBoot源码(四)
  • etw系统provider事件较多_使用Spring Gateway和KeyCloak构建一个OIDC认证系统
  • @data注解_一枚 架构师 也不会用的Lombok注解,相见恨晚
  • java中int和integer的区别_Java中关于强、软、弱、虚引用的区别
  • [译] React v16.8: 含有Hooks的版本
  • iOS动画编程-View动画[ 1 ] 基础View动画
  • JavaScript/HTML5图表开发工具JavaScript Charts v3.19.6发布【附下载】
  • Nginx 通过 Lua + Redis 实现动态封禁 IP
  • node入门
  • PAT A1017 优先队列
  • Python_OOP
  • session共享问题解决方案
  • Swoft 源码剖析 - 代码自动更新机制
  • vue的全局变量和全局拦截请求器
  • 从setTimeout-setInterval看JS线程
  • 对象管理器(defineProperty)学习笔记
  • 聚类分析——Kmeans
  • 坑!为什么View.startAnimation不起作用?
  • 快速体验 Sentinel 集群限流功能,只需简单几步
  • 排序(1):冒泡排序
  • 如何优雅地使用 Sublime Text
  • 设计模式走一遍---观察者模式
  • 系统认识JavaScript正则表达式
  • 一、python与pycharm的安装
  • 06-01 点餐小程序前台界面搭建
  • LIGO、Virgo第三轮探测告捷,同时探测到一对黑洞合并产生的引力波事件 ...
  • mysql面试题分组并合并列
  • scrapy中间件源码分析及常用中间件大全
  • Unity3D - 异步加载游戏场景与异步加载游戏资源进度条 ...
  • 浅谈sql中的in与not in,exists与not exists的区别
  • 如何在招聘中考核.NET架构师
  • ​queue --- 一个同步的队列类​
  • #100天计划# 2013年9月29日
  • #我与Java虚拟机的故事#连载18:JAVA成长之路
  • (PWM呼吸灯)合泰开发板HT66F2390-----点灯大师
  • (Redis使用系列) Springboot 整合Redisson 实现分布式锁 七
  • (八)光盘的挂载与解挂、挂载CentOS镜像、rpm安装软件详细学习笔记
  • (免费领源码)Java#Springboot#mysql农产品销售管理系统47627-计算机毕业设计项目选题推荐
  • (三)centos7案例实战—vmware虚拟机硬盘挂载与卸载
  • (实战篇)如何缓存数据
  • (转)c++ std::pair 与 std::make
  • (转)Java socket中关闭IO流后,发生什么事?(以关闭输出流为例) .
  • (转)JVM内存分配 -Xms128m -Xmx512m -XX:PermSize=128m -XX:MaxPermSize=512m
  • .MyFile@waifu.club.wis.mkp勒索病毒数据怎么处理|数据解密恢复