Python爬取国外天气预报网站的方法
更新时间:2015年07月10日 11:32:18 作者:speedmancs
这篇文章主要介绍了Python爬取国外天气预报网站的方法,可实现抓取国外天气预报信息的相关技巧,具有一定参考借鉴价值,需要的朋友可以参考下
本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:
crawl_weather.py如下:
#encoding=utf-8 import httplib import urllib2 import time from threading import Thread import threading from Queue import Queue from time import sleep import re import copy lang = "fr" count = 0 class Location: # Location(False, "中国", "北京", "zh") # Location(True, "", "亚洲", "zh") def __init__(self, is_beyond_country, country_name, loc_name, lang): self.country_name = country_name self.loc_name = loc_name self.lang = lang self.is_beyond_country = is_beyond_country prn_lock = threading.RLock() def GetLocationURLs(url, recursive): global count if url.find("weather-forecast") != -1: count = count + 1 if count % 500 == 0: prn_lock.acquire() print "count:%d" % (count) prn_lock.release() return [url] page = urllib2.urlopen(url).read() time.sleep(0.01) #"<h6><a href=\"http://www.accuweather.com/zh/browse-locations/afr\"><em>Africa</em></a></h6>" pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>" locs = re.findall(pattern, page) locs = [(url, name) for url, name in locs if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1] if not recursive: urls = [url for url, name in locs] return urls urls = [] for _url, _name in locs: lst = GetLocationURLs(_url, True) urls.extend(lst) return urls #entry_url = "http://www.accuweather.com/zh/browse-locations" entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang) #regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"] #regions = ["eur"] #region_urls = [ "%s/%s" % (entry_url, reg) for reg in regions] #region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"] sub_urls = GetLocationURLs(entry_url, False) print len(sub_urls) print sub_urls q = Queue() location_urls = [] ThreadNum = 5 lock = threading.RLock() for url in sub_urls: q.put(url) def working(): while True: url = q.get() lst = GetLocationURLs(url, True) print "%s %d urls " % (url, len(lst)) lock.acquire() location_urls.extend(lst) lock.release() q.task_done() for i in range(ThreadNum): t = 
Thread(target=working) t.setDaemon(True) t.start() q.join() fp = open('locations.txt', "w") fp.write("\n".join(location_urls)) fp.close() #for url in location_urls: # print url #location_urls = GetLocationURLs(entry_url) ''' def Fetch(url): try: print url web_path = url[0] local_name = url[1] print "web_path:", web_path print "local_name:", local_name sContent = urllib2.urlopen(web_path).read() savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name) print savePath file = open(savePath,'wb') file.write(sContent) file.close() print savePath + " saved"; except: pass; def working(): while True: url = q.get() Fetch(url) sleep(10) q.task_done() #root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash" root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash" page = urllib2.urlopen(root_url).read() for i in range(NUM): t = Thread(target=working) t.setDaemon(True) t.start() urls = copy.deepcopy(ppt_urls) urls.extend(srt_urls) urls.extend(video_urls) print len(ppt_urls) print len(srt_urls) print len(video_urls) print len(urls) for url in urls: q.put(url) q.join() ''' ''' root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494" page = urllib2.urlopen(root_url).read() print page '''
FetchLocation.py如下:
#encoding=utf-8 import sys import httplib import urllib2 import time from threading import Thread import threading from Queue import Queue from time import sleep import re import copy from xml.dom import minidom import HTMLParser import datetime q = Queue() locks = [threading.RLock() for i in range(2)] ThreadNumber = 20 locations = {} conds = {} def FindCountryBreadCrumbs(page): lines = page.splitlines() count = 0 start = -1 opened = False for line in lines: if line.find("<ul id=\"country-breadcrumbs\">") != -1: start = count opened = True if opened and line.find("</ul>") != -1: end = count opened = False count = count + 1 return "\n".join(lines[start: (end + 1)]) def GetText(nodelist): rc = [] for node in nodelist: if node.nodeType == node.TEXT_NODE: rc.append(HTMLParser.HTMLParser().unescape(node.data)) return ''.join(rc) def FindCondition(page): pat = "<span class=\"cond\">(.*?)</span>" cds = re.findall(pat, page) cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds] return cds def ExtractInfo(url): try: page = urllib2.urlopen(url).read() except Exception, e: return [] text = FindCountryBreadCrumbs(page) text = HTMLParser.HTMLParser().unescape(text) dom = minidom.parseString(text.encode("utf-8")) locs = [] lis = dom.getElementsByTagName("li") for li in lis: adr_list = li.getElementsByTagName("a") if adr_list: locs.append(GetText(adr_list[0].childNodes).encode("utf-8")) strs = li.getElementsByTagName("strong") if strs: locs.append(GetText(strs[0].childNodes).encode("utf-8")) cds = FindCondition(page) return locs, cds def AddMap(lst, m): for x in lst: if m.get(x) == None: m[x] = 1 def working(): while True: urls = q.get() #print len(urls) m = {} m2 = {} count = 0 for url in urls: count = count + 1 #print "%d/%d" % (count, len(urls)) locs, cds = ExtractInfo(url) AddMap(locs, m) AddMap(cds, m2) locks[1].acquire() AddMap(m.keys(), locations) AddMap(m2.keys(), conds) locks[1].release() q.task_done() def main(): if len(sys.argv) < 2: exit() 
loc_path = sys.argv[1] fp = open(loc_path, "r") urls = [line.strip() for line in fp] fp.close() #urls = urls[0:1000] blocks = len(urls) / ThreadNumber + 1 for start in range(0, len(urls), blocks): end = start + blocks if end > len(urls): end = len(urls) q.put(urls[start:end]) for i in range(ThreadNumber): t = Thread(target=working) t.setDaemon(True) t.start() q.join() fp = open("location_name.fr", "w") fp.write("\n".join(locations.keys())) fp.close() fp = open("conditions.fr", "w") fp.write("\n".join(conds.keys())) fp.close() if __name__ == '__main__': main()
希望本文所述对大家的python程序设计有所帮助。
相关文章
使用pandas模块读取csv文件和excel表格,并用matplotlib画图的方法
今天小编就为大家分享一篇使用pandas模块读取csv文件和excel表格,并用matplotlib画图的方法,具有很好的参考价值,希望对大家有所帮助。一起跟随小编过来看看吧。2018-06-06
python中数组array和列表list的基本用法及区别解析
大家都知道数组array是同类型数据的有限集合,列表list是一系列按特定顺序排列的元素组成,可以将任何数据放入列表,且其中元素之间没有任何关系,本文介绍python中数组array和列表list的基本用法及区别,感兴趣的朋友一起看看吧。2022-05-05
最新评论