# Fetch each website's page title and meta description
# -*- coding: utf-8 -*-
"""
"""
import gzip, cStringIO
import HTMLParser
import threading
import re
import urllib2
def check_url(idx):
urls = ['http://www.weibo.com', 'http://www.taobao.com', 'http://www.jd.com']
url = urls[idx]
if idx == 2:
req = urllib2.Request(url);
req.add_header('Accept-Encoding', 'gzip, deflate');
f = urllib2.urlopen(req, timeout=30)
html = f.read()
#gzip解压缩
if html[:6] == '\x1f\x8b\x08\x00\x00\x00':
html = gzip.GzipFile(fileobj = cStringIO.StringIO(html)).read()
txt = html.decode('gbk')
title_pattern = re.compile('<title>.+</title>')
title = title_pattern.findall(txt)[0]
title = title.replace('<title>','')
title = title.replace('</title>','')
content_pattern = re.compile('"description" content=.+>')
content= content_pattern.findall(txt)[0]
content = content.replace('"description" content=','')
content = content.replace('/','')
content = content.replace('>','')
print url
print title
print content
return
web = urllib2.urlopen(url)
txt = web.read()
if idx==1:
httpParser = HTMLParser.HTMLParser()
txt = httpParser.unescape(txt).encode("utf-8")
title_pattern = re.compile('<title>.+</title>')
title = title_pattern.findall(txt)[0]
title = title.replace('<title>','')
title = title.replace('</title>','')
content_pattern = re.compile('"description" content=.+>')
content= content_pattern.findall(txt)[0]
content = content.replace('"description" content=','')
content = content.replace('/','')
content = content.replace('>','')
print url
print title
print content
# Scrape all three sites concurrently: one worker thread per URL index,
# started as soon as it is created, then wait for every worker to finish.
workers = []
for i in xrange(3):
    worker = threading.Thread(target=check_url, args=[i])
    workers.append(worker)
    worker.start()
for worker in workers:
    worker.join()
# Fetch each website's HTTP status code
import urllib2
import threading
# Target sites whose HTTP status codes are checked by the threads below.
urls = ['http://www.weibo.com', 'http://www.taobao.com', 'http://www.jd.com']
def check_response(url):
response = None
try:
response = urllib2.urlopen(url,timeout=5)
print url,response.getcode()
except urllib2.URLError as e:
if hasattr(e, 'code'):
print 'Error code:',e.code
elif hasattr(e, 'reason'):
print 'Reason:',e.reason
finally:
if response:
response.close()
# Check every URL in parallel — one thread per URL, started immediately —
# then block until all checks have completed.
threads = []
for target_url in urls:
    checker = threading.Thread(target=check_response, args=[target_url])
    threads.append(checker)
    checker.start()
for checker in threads:
    checker.join()