# Fetch the title and meta description of each website from its page

# -*- coding: utf-8 -*-
"""



"""
import gzip, cStringIO
import HTMLParser 
import threading
import re
import urllib2

def check_url(idx):
    urls  = ['http://www.weibo.com', 'http://www.taobao.com', 'http://www.jd.com']
    url = urls[idx]
    
    if idx == 2:
        req = urllib2.Request(url);
        req.add_header('Accept-Encoding', 'gzip, deflate');
        f = urllib2.urlopen(req, timeout=30)
        html = f.read()
 
#gzip解压缩
        if html[:6] == '\x1f\x8b\x08\x00\x00\x00':
            html = gzip.GzipFile(fileobj = cStringIO.StringIO(html)).read()
 
        txt = html.decode('gbk')
        title_pattern = re.compile('<title>.+</title>')
        title = title_pattern.findall(txt)[0]
        title = title.replace('<title>','')
        title = title.replace('</title>','')
        content_pattern = re.compile('"description" content=.+>')
        content= content_pattern.findall(txt)[0]
        content = content.replace('"description" content=','')
        content = content.replace('/','')
        content = content.replace('>','')
        print url
        print title
        print content
        return 
        
    web = urllib2.urlopen(url)
    txt = web.read()    
    if idx==1:
        httpParser = HTMLParser.HTMLParser() 
        txt =  httpParser.unescape(txt).encode("utf-8") 
    
    title_pattern = re.compile('<title>.+</title>')
    
    title = title_pattern.findall(txt)[0]
    
    
    title = title.replace('<title>','')
    title = title.replace('</title>','')
    
    content_pattern = re.compile('"description" content=.+>')
    content= content_pattern.findall(txt)[0]
    content = content.replace('"description" content=','')
    content = content.replace('/','')
    content = content.replace('>','')
    
    print url
    print title
    print content

# Fan the three title/description checks out to worker threads, then wait
# for every one of them to finish before moving on.
workers = []
for task_id in xrange(3):
    worker = threading.Thread(target=check_url, args=(task_id,))
    worker.start()
    workers.append(worker)
for worker in workers:
    worker.join()
# Fetch the HTTP status code of each website
import urllib2
import threading

# Target sites whose HTTP status codes are probed by the threads below.
urls  = ['http://www.weibo.com', 'http://www.taobao.com', 'http://www.jd.com']
def check_response(url):
    
    response = None
    try:
        response = urllib2.urlopen(url,timeout=5) 
        print url,response.getcode()
    except urllib2.URLError as e:
        if hasattr(e, 'code'):
            print 'Error code:',e.code
        elif hasattr(e, 'reason'):
            print 'Reason:',e.reason 
    finally:
        if response:
            response.close()

# Probe every URL on its own thread, then block until all probes finish.
probes = []
for target_url in urls:
    probe = threading.Thread(target=check_response, args=(target_url,))
    probe.start()
    probes.append(probe)
for probe in probes:
    probe.join()