from urllib.parse import urljoin
import urllib.request
from bs4 import BeautifulSoup
import time
import os
import re
import errno
def mkdir_p(path):
    """Create *path* and any missing parent directories.

    An already-existing directory at *path* is not an error. Any other
    ``OSError`` raised by ``os.makedirs`` (including *path* existing as
    something that is not a directory) is propagated to the caller.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        # Only "already exists, and it really is a directory" is tolerated.
        already_a_directory = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise
def get_link(page):
    """Collect the ``href`` of every ``<a>`` located inside a ``<td>``.

    *page* must provide ``find_all('td')``; each returned cell must provide
    ``select("a")``, and each anchor ``get('href')``. The values are returned
    in document order exactly as ``get('href')`` yields them -- nothing is
    filtered or de-duplicated.
    """
    hrefs = []
    for cell in page.find_all('td'):
        hrefs.extend(anchor.get('href') for anchor in cell.select("a"))
    return hrefs
def gain(url):
    """Fetch *url* and return the link targets found in its table cells.

    The page is downloaded with ``urllib.request.urlopen``, parsed with
    BeautifulSoup using the ``lxml`` parser, and handed to ``get_link``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The list produced by ``get_link`` on success. If fetching or parsing
        fails, a message is printed and the integer ``1`` is returned (this
        sentinel is kept so existing callers see the same failure value).
    """
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as response:
            page = response.read()
        soup = BeautifulSoup(page, 'lxml')
        return get_link(soup)
    except Exception:
        # Exception instead of a bare except, so that Ctrl+C
        # (KeyboardInterrupt) and SystemExit are not swallowed.
        print('无法获取该链接:' + url)
        return 1
def _page_links(page_url):
    """Return the string hrefs that gain() finds on *page_url*.

    gain() reports a failed fetch by returning the integer 1 instead of a
    list; that case is mapped to an empty list so callers can always iterate.
    Non-string entries (presumably None for an anchor without an href) are
    dropped because every later step splits or joins the link as text.
    """
    found = gain(page_url)
    if not isinstance(found, list):
        return []
    return [link for link in found if isinstance(link, str)]


def _without_gifs(links):
    """Return *links* minus those ending in ``.gif`` (case-insensitive)."""
    return [link for link in links if not link.lower().endswith('.gif')]


def _download_file(data_url, target_dir, file_name):
    """Save the body of *data_url* as *file_name* inside *target_dir*.

    The directory is created if needed. A file that already exists is left
    untouched. Download or write errors are reported and swallowed so that
    one bad link does not stop the whole run.
    """
    mkdir_p(target_dir)
    print(data_url)
    target = os.path.join(target_dir, file_name)
    if os.path.isfile(target):
        print('文件已存在')
        return
    try:
        with urllib.request.urlopen(data_url) as response:
            content = response.read()
        # Open the output only after the download succeeded, so a failed
        # request does not leave an empty file that later looks "done".
        with open(target, "wb") as out:
            out.write(content)
    except Exception:
        # Best effort: report and move on. Exception (not a bare except)
        # lets Ctrl+C interrupt a long scrape.
        print('无法获取该链接:' + data_url)


def _download_listing(links, listing_url, target_dir):
    """Download every link in *links*, resolved relative to *listing_url*.

    Each file is named after the first path segment of its link plus
    ``.json``. One second is slept before each link is processed.
    """
    for link in links:
        data_name = link.split("/")[0]
        data_url = urljoin(listing_url, link)
        time.sleep(1)
        _download_file(data_url, target_dir, data_name + '.json')


def main():
    """Crawl the Unisys hurricane index and download the linked data files.

    Each link on the index page is split on ``/`` into a folder (first
    segment) and a second segment. The linked page is fetched and its
    non-GIF links collected. When the second segment is not ``index.php``
    those links are downloaded straight into
    ``<download dir>/<folder>/<second segment>``. Otherwise every link on
    that page is itself fetched as a further listing, and its non-GIF links
    are downloaded into ``<download dir>/<folder>/<second segment of the
    listing link>``.

    Links with fewer than two ``/``-separated segments are skipped, and a
    page that cannot be fetched is treated as having no links.
    """
    url = 'http://weather.unisys.com/hurricane/index.php'
    base_url = 'http://weather.unisys.com/hurricane/'
    download_dir = 'E:\\Typhoon_data\\Data'  # download path

    for link in _page_links(url):
        parts = link.split("/")
        if len(parts) < 2:
            # Folder and second segment are both required below.
            continue
        ocean_folder, ocean_time = parts[0], parts[1]

        sub_links = _without_gifs(_page_links(urljoin(base_url, link)))
        time.sleep(3)

        if ocean_time != 'index.php':
            _download_listing(
                sub_links,
                base_url + ocean_folder + "/" + ocean_time + "/",
                os.path.join(download_dir, ocean_folder, ocean_time),
            )
            continue

        # The page is an index of further listings: descend one more level.
        folder_url = base_url + ocean_folder + "/"
        for listing_link in sub_links:
            listing_parts = listing_link.split("/")
            if len(listing_parts) < 2:
                continue
            listing_time = listing_parts[1]

            data_links = _without_gifs(
                _page_links(urljoin(folder_url, listing_link))
            )
            time.sleep(3)

            # Iterate the links of the listing just fetched (not the
            # top-level index links).
            _download_listing(
                data_links,
                folder_url + listing_time + "/",
                os.path.join(download_dir, ocean_folder, listing_time),
            )
# Start the crawl only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()