python爬虫抓取天气

一个很小的爬虫。
由于天气 API 需要付费,就自己写了个爬虫把天气数据抓取下来,
然后部署在 OpenShift 上。
http://i.zxc.science/weatherapi?citycode=101190102&type=today

爬虫代码如下:

使用简单的正则匹配,效率应该比较低;代码基于 Python 3。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#coding:utf-8
import urllib.request
import re

__author__ = 'Taosky'

def now(citycode, *, html=None):
    """Return the current weather conditions for a city as a flat dict.

    The values are pulled with regular expressions out of the
    ``todayweather.do`` page of m.weathercn.com.

    Args:
        citycode: City identifier placed in the ``id`` query parameter. It is
            converted with ``str()`` before being inserted into the URL.
        html: Optional markup that has already been fetched. When given, no
            HTTP request is made and ``citycode`` is not used.

    Returns:
        A dict whose keys are ``today_date``, ``today_weekday``,
        ``today_solarTerm``, ``today_curtime``, ``today_curtemp``,
        ``today_description``, ``today_sunrise``, ``today_sunset``,
        ``today_wind-direction``, ``today_wind_level``,
        ``today_air_pressure``, ``today_humidity``, ``today_wind_speed`` and
        ``today_visibility``. Each value is the text captured from the page,
        or ``None`` when the matching element is absent from the markup.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    if html is None:
        url = ('http://m.weathercn.com/todayweather.do?id='
               + str(citycode) + '&partner=m')
        # The context manager closes the HTTP response even if read/decode fails.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('UTF-8')

    def first_group(pattern):
        # A missing element yields None instead of aborting the whole scrape.
        match = re.search(pattern, html)
        return match.group(1) if match else None

    def item_at(values, index):
        # Same tolerance for the positional lists below.
        return values[index] if index < len(values) else None

    cur_temp = first_group(r'<span class="cur-temp">(.*?)</sup>')
    if cur_temp is not None:
        # The capture still contains the opening <sup> of the unit marker.
        cur_temp = cur_temp.replace('<sup>', '')

    text_descs = re.findall(r'<p\s*class="textdesc">(.*?)</p>', html)
    nums = re.findall(r'<p class="num">(.*?)</p>', html)

    return {
        'today_date': first_group(r'<span class="date">(.*?)</span>'),
        'today_weekday': first_group(r'<span class="weekday">(.*?)</span>'),
        # Capture group 1 (the text), not the whole matched tag.
        'today_solarTerm': first_group(r'<span class="solarTerm">(.*?)</span>'),
        'today_curtime': first_group(
            r'<div class="curtime">\s*?<span>(.*?)</span>'),
        'today_curtemp': cur_temp,
        'today_description': first_group(
            r'<span class="description">(.*?)</span>'),
        'today_sunrise': first_group(
            r'<div class="sunrise"><i></i><span>.*?</span><span>(.*?)</span></div>'),
        'today_sunset': first_group(
            r'<div class="sunset"><i></i><span>.*?</span><span>(.*?)</span></div>'),
        # The key keeps its original hyphen so existing consumers still work.
        'today_wind-direction': item_at(text_descs, 0),
        'today_wind_level': item_at(text_descs, 1),
        'today_air_pressure': item_at(nums, 0),
        'today_humidity': item_at(nums, 1),
        'today_wind_speed': item_at(nums, 2),
        'today_visibility': item_at(nums, 3),
    }


def days(citycode, *, html=None):
    """Return the multi-day forecast for a city as a flat dict.

    The values are pulled with regular expressions out of the ``index.do``
    page of m.weathercn.com.

    Args:
        citycode: City identifier placed in the ``id`` query parameter. It is
            converted with ``str()`` before being inserted into the URL.
        html: Optional markup that has already been fetched. When given, no
            HTTP request is made and ``citycode`` is not used.

    Returns:
        A dict with the keys ``days_date<N>``, ``days_tem<N>`` and
        ``days_desc<N>`` for N = 1, 2, ... in page order. Tabs, newlines,
        carriage returns and spaces are removed from the temperature and
        description values. If the page yields different numbers of date,
        temperature and description cells, only the leading entries that
        have all three are returned.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    if html is None:
        url = ('http://m.weathercn.com/index.do?id='
               + str(citycode) + '&partner=m')
        # The context manager closes the HTTP response even if read/decode fails.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('UTF-8')

    dates = re.findall(r'<td class="date">(.*?)</td>', html)
    temps = re.findall(r'<td class="temp">\s*?(.*?)\s*?</td>', html)
    descs = re.findall(r'<td class="desc">\s*?(.*?)\s*?</td>', html)

    # One pass that deletes the layout whitespace left inside the captures.
    strip_layout = str.maketrans('', '', '\t\n\r ')

    forecast = {}
    # zip() stops at the shortest list, so a malformed page cannot raise
    # IndexError part-way through building the result.
    for index, (date, temp, desc) in enumerate(zip(dates, temps, descs), start=1):
        suffix = str(index)
        forecast['days_date' + suffix] = date
        forecast['days_tem' + suffix] = temp.translate(strip_layout)
        forecast['days_desc' + suffix] = desc.translate(strip_layout)
    return forecast

def hours(citycode, *, html=None):
    """Return the hourly forecast for a city as a flat dict.

    The values are pulled with regular expressions out of the
    ``eachhours.do`` page of m.weathercn.com.

    Args:
        citycode: City identifier placed in the ``id`` query parameter. It is
            converted with ``str()`` before being inserted into the URL.
        html: Optional markup that has already been fetched. When given, no
            HTTP request is made and ``citycode`` is not used.

    Returns:
        A dict with the keys ``hours_time<N>``, ``hours_tem<N>`` and
        ``hours_desc<N>`` for N = 1, 2, ... in page order. If the page yields
        different numbers of time, temperature and description elements,
        only the leading entries that have all three are returned.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    if html is None:
        # NOTE(review): ``partner`` is left empty here while the other
        # endpoints send ``partner=m``; kept as-is -- confirm it is intended.
        url = ('http://m.weathercn.com/eachhours.do?id='
               + str(citycode) + '&partner=')
        # The context manager closes the HTTP response even if read/decode fails.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('UTF-8')

    times = re.findall(r'<div class="time">(.*?)</div>', html)
    temps = re.findall(r'<div class="temp">(.*?)</div>', html)
    descs = re.findall(r'<div class="desc">(.*?)</div>', html)

    forecast = {}
    # zip() stops at the shortest list, so a malformed page cannot raise
    # IndexError part-way through building the result.
    for index, (time_text, temp, desc) in enumerate(zip(times, temps, descs), start=1):
        suffix = str(index)
        forecast['hours_time' + suffix] = time_text
        forecast['hours_tem' + suffix] = temp
        forecast['hours_desc' + suffix] = desc
    return forecast