【Science Data Bank】数据集下载脚本
下载脚本
由于频繁下载会出现 429 错误,所以脚本中加了延时和重试,但错误仍然不可避免;如有文件下载失败,可以反复运行代码进行下载。
import requests
import os
from urllib.parse import urlparse, parse_qs
import time

# Browser-like User-Agent header sent with every download request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}


def download_file_from_link(url, output_dir, max_retries=5, delay=15, timeout=60):
    """Download ``url`` into ``output_dir``, retrying on request errors.

    The local file name is the value of the URL's ``fileName`` query
    parameter, or ``'unknown'`` when the parameter is absent.

    The response body is streamed into ``<name>.part`` and only renamed to
    its final name once the transfer has completed, so an interrupted
    download never leaves a truncated file under the final name.

    Args:
        url: Download link to fetch.
        output_dir: Existing directory the file is written to.
        max_retries: Maximum number of attempts.
        delay: Seconds to sleep before every attempt (including the first),
            to reduce the chance of HTTP 429 responses.
        timeout: Value passed to ``requests.get(timeout=...)`` so that a
            stalled connection fails (and is retried) instead of hanging.

    Returns:
        None. Progress and errors are reported with ``print``; after the
        last failed attempt the function gives up without raising.
    """
    # Parse the URL and take the file name from its query string.
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)
    file_name = query_params.get('fileName', ['unknown'])[0]

    # Final destination and the temporary file used while streaming.
    local_file_path = os.path.join(output_dir, file_name)
    partial_file_path = local_file_path + '.part'

    for attempt in range(max_retries):
        # Wait before every request to stay under the server's rate limit.
        time.sleep(delay)
        try:
            with requests.get(url, stream=True, headers=headers, timeout=timeout) as r:
                r.raise_for_status()  # raises HTTPError for 4xx/5xx responses
                with open(partial_file_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            # Publish the file under its final name only after a full transfer.
            os.replace(partial_file_path, local_file_path)
            print(f"Downloaded {local_file_path} (attempt {attempt+1}/{max_retries})")
            return  # success: stop retrying
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {url} (attempt {attempt+1}/{max_retries}): {e}")
            # Discard whatever was written before the failure.
            if os.path.exists(partial_file_path):
                os.remove(partial_file_path)
            if attempt == max_retries - 1:
                print(f"Max retries reached for {url}. Giving up.")
                # An exception could be raised or the failure logged here.


# Read the links from a text file, one link per line.
# Text file holding the download links, one URL per line; a relative path
# is resolved against the current working directory.
file_path = 'data.txt'
# Directory the downloaded files are written to.
output_dir = 'data'

# Create the output directory if it does not exist yet.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# The link file is assumed to be UTF-8 encoded.
with open(file_path, 'r', encoding='utf-8') as file:
    links = list(file)

# Walk the links in file order, skipping blank lines.
for link in map(str.strip, links):
    if not link:
        continue
    download_file_from_link(link, output_dir)

print("All files have been processed.")
找出未成功下载的文件脚本
如果最后仅有个位数文件未被下载,可以先找出未被成功下载的文件,然后手动下载。
import os
from urllib.parse import parse_qs, urlparse

# Paths of the link list and of the download directory.
text_file_path = 'data.txt'  # text file listing the URLs to check, one per line
download_dir = 'data'  # directory the files were downloaded into

# Read the links from the text file.
with open(text_file_path, 'r', encoding='utf-8') as file:
    links = file.readlines()

# Report every link whose target file is not present in download_dir.
for link in links:
    link = link.strip()  # drop the trailing newline and surrounding whitespace
    if not link:
        continue

    # Derive the file name exactly as the download script does: the
    # URL-decoded 'fileName' query parameter, or 'unknown' when it is absent.
    # Slicing the raw URL would keep percent-encoding and return garbage
    # when the parameter is missing.
    query_params = parse_qs(urlparse(link).query)
    file_name = query_params.get('fileName', ['unknown'])[0]

    # Build the expected local path and check whether the file exists.
    local_file_path = os.path.join(download_dir, file_name)
    if not os.path.exists(local_file_path):
        print(f"{file_name} 未下载。")
上述脚本请自行修改文件路径。