- 积分
- 1626
- 贡献
-
- 精华
- 在线时间
- 小时
- 注册时间
- 2016-1-5
- 最后登录
- 1970-1-1
|
发表于 2020-9-15 16:13:55
|
显示全部楼层
# A crawler written in Python that downloads sounding text data from
# weather.uwyo.edu.
#
# NOTE: http://weather.uwyo.edu/robots.txt declares
#     User-agent: *
#     Disallow: /cgi-bin
# so requesting /cgi-bin/sounding does not comply with the site's robots
# protocol, and not complying with that protocol carries legal risk.
import os
import re
from pathlib import Path
from urllib.parse import unquote

# Column header that precedes the numeric rows inside each <PRE> block.
_SOUNDING_HEADER = '''----------------------------------------------------------------------------- PRES HGHT TEMP DWPT RELH MIXR DRCT SKNT THTA THTE THTV hPa m C C % g/kg deg knot K K K -----------------------------------------------------------------------------'''

# The header is matched token by token with arbitrary whitespace (including
# newlines) between tokens, and with rule lines of any length, so removal does
# not depend on the exact spacing of the served page or of the literal above.
_SOUNDING_HEADER_RE = re.compile(
    r'\s+'.join(
        '-+' if set(token) == {'-'} else re.escape(token)
        for token in _SOUNDING_HEADER.split()
    )
)


def getHmtlText(url, param, timeout=30):
    """Fetch *url* with query parameters *param* and return the body text.

    Example of a resulting request:
    http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=09&FROM=1212&TO=1212&STNM=47600

    Args:
        url: Address to request.
        param: Mapping passed to ``requests.get`` as ``params``.
        timeout: Seconds to wait before giving up, so a stalled server
            cannot block the run forever.

    Returns:
        The decoded response text, or ``""`` when the request fails or the
        server answers with an error status.
    """
    # Third-party dependency, imported here so the parsing helpers in this
    # script stay importable without it.
    import requests

    try:
        r = requests.get(url, params=param, timeout=timeout)
        print("开始下载......")
        r.raise_for_status()
        # Decode with the encoding detected from the content itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def dataClean(rawText, stnm):
    """Extract the title and numeric rows for station *stnm* from a page.

    The column header is removed first; afterwards every
    ``<H2>stnm ...</H2>`` heading that is directly followed by a ``<PRE>``
    block made only of digits, dots, minus signs, spaces and newlines is
    collected, and the HTML tags are stripped from the result.

    Args:
        rawText: HTML text of the page.
        stnm: Station identifier that the ``<H2>`` heading must start with.

    Returns:
        The concatenated cleaned sections, or ``""`` when nothing matches
        or *rawText* is not a string.
    """
    try:
        body = _SOUNDING_HEADER_RE.sub('', rawText)
        # Escape the station id so it is always matched literally.
        pattern = r'<H2>{}.*</H2>\n<PRE>[0-9\.\- \n]+</PRE>'.format(
            re.escape(str(stnm))
        )
        text = ''.join(re.findall(pattern, body))
    except TypeError:
        print("数据清洗出现错误!")
        return ""

    for tag in ('<H2>', '</H2>', '<PRE>\n', '</PRE>'):
        text = text.replace(tag, '')
    return text


def getParam(region, stnm, yr, mon, fromTime, toTime):
    """Build the query parameters for one sounding request.

    Args:
        region: Value sent as ``region``.
        stnm: Station identifier, sent as ``STNM``.
        yr: Year as an integer; zero-padded to four digits.
        mon: Month as an integer; zero-padded to two digits.
        fromTime: Value sent as ``FROM`` (e.g. ``'0100'``; presumably
            day + hour -- TODO confirm against the site).
        toTime: Value sent as ``TO``, same format as *fromTime*.

    Returns:
        A dict with the keys ``region``, ``TYPE``, ``YEAR``, ``MONTH``,
        ``FROM``, ``TO`` and ``STNM``, in that order.
    """
    return {
        'region': region,
        # Kept unquoted here; the HTTP client encodes it when building the URL.
        'TYPE': unquote('TEXT%3ALIST', encoding='utf-8'),
        'YEAR': '{:04d}'.format(yr),
        'MONTH': '{:02d}'.format(mon),
        'FROM': fromTime,
        'TO': toTime,
        'STNM': stnm,
    }


def writeFile(path, content):
    """Write *content* to *path*, replacing any existing file.

    The file is closed even when the write raises.
    """
    with open(path, 'w', encoding='utf-8') as fw:
        fw.write(content)


def _readStations(csvPath):
    """Return the ``(stnm, region)`` pairs listed in *csvPath*.

    Each non-blank line must hold exactly two comma-separated fields.

    Raises:
        OSError: The file cannot be opened or read.
        ValueError: A line does not contain exactly two fields.
    """
    stations = []
    with open(csvPath) as fo:
        for line in fo:
            line = line.strip()
            if not line:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            stnm, region = (field.strip() for field in line.split(","))
            stations.append((stnm, region))
    return stations


def main():
    """Download and save the sounding text for every station in param.csv.

    For each station one request is made per (year, month) in the fixed
    ranges below; non-empty cleaned text is written to
    ``./站号_<stnm>/<mon>月<from>至<to>.txt``.
    """
    url = "http://weather.uwyo.edu/cgi-bin/sounding"
    try:
        stations = _readStations("param.csv")
    except (OSError, ValueError):
        print("读入参数错误!")
        return

    fromTime = '0100'
    toTime = '0412'
    for stnm, region in stations:
        for yr in range(2020, 2021):
            for mon in range(9, 10):
                param = getParam(region, stnm, yr, mon, fromTime, toTime)
                content = dataClean(getHmtlText(url, param), param['STNM'])
                if not content:
                    print("获取网页错误!")
                    continue

                path = Path("./站号_{}/".format(stnm))
                filename = '{0}月{1}至{2}'.format(mon, fromTime, toTime)
                try:
                    path.mkdir(parents=True, exist_ok=True)
                    writeFile(path / (filename + '.txt'), content)
                except OSError as err:
                    # Report disk problems as such instead of as a
                    # parameter error, then carry on with the next item.
                    print("写入文件错误!", err)
                    continue
                print(filename+'.txt----'+'下载完毕')
|
|