from urllib.request import urlopen
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import re
# Fetch the NOAA PSD dataset listing page (20th Century Reanalysis V2 dailies)
# and decode it; the listing is plain ASCII.
html = urlopen(
    'https://www.esrl.noaa.gov/psd/cgi-bin/db_search/DBListFiles.pl?did=118&tid=40290&vid=2227'
).read().decode('ascii')
soup = BeautifulSoup(html, 'lxml')

# Locate every <a> tag whose href points at a dataset file on the NOAA FTP
# server; these anchors hold the direct download URLs.
list_urls = soup.find_all(
    "a",
    {"href": re.compile(r"ftp://ftp.cdc.noaa.gov/Datasets/20thC_ReanV2/Dailies.*")},
)
urls = [a['href'] for a in list_urls]

# Download each file sequentially, reporting progress. The remaining-file
# count is derived from the listing instead of the old hard-coded 142.
total = len(urls)
for i, url in enumerate(urls):
    print("This is file" + str(i + 1) + " downloading! You still have "
          + str(total - i - 1) + " files waiting for downloading!!")
    # Destination: target directory + the remote file's basename.
    file_name = "/media/wzc/OS/Edesk/Meteorological-data/NECP/NOAA-air2m/" + url.split('/')[-1]
    urlretrieve(url, file_name)