学习python中,写个爬虫小程序,基于2.7版本
代码源码贴在我的Github:https://github.com/qqxx6661/python/blob/master/gamerskyPic1.0.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import urllib
import urllib2
import re
import sys
import socket
class Tool:
    """Cleans scraped image URLs by stripping CKEditor artifacts."""

    # Matches a leftover '_cke_saved_src="http...jpg' editor attribute fragment.
    # The final dot is escaped so ".jpg" only matches a literal extension
    # (the original unescaped '.' would also match e.g. "xjpg").
    removeExtraTag = re.compile(r'_cke_saved_src="http.+?\.jpg')

    def replace(self, x):
        """Remove the editor fragment from *x* and strip surrounding whitespace."""
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()
class Spider:
def __init__(self,siteURL):
self.siteURL = siteURL
def getPage(self,pageIndex):
if pageIndex == 1:
url = self.siteURL + ".shtml"
else:
url = self.siteURL + "_" + str(pageIndex) + ".shtml"
print '准备抓取:' + url
request = urllib2.Request(url)
response = urllib2.urlopen(request)
#print response.read()
return response.read()
def getPic(self,pageIndex):
page = self.getPage(pageIndex)
#imgre = re.compile(r'src="(http.+?.jpg)">')适合正常抓取
imgre = re.compile(r'src="(http.+?.jpg)">')
imglist = re.findall(imgre,page)
print imglist
imglist_clr = []
for imgurl in imglist:
imgurl_clr = tool.replace(imgurl)
imglist_clr.append(imgurl_clr.encode('utf-8')) #去除u'
print '清洗多余字符完成'
print imglist_clr
x = 0
for imgurl_clr in imglist_clr:
print '正在保存第%s页的第%s张'%(pageIndex,x+1)
urllib.urlretrieve(imgurl_clr,'picture_%s_%s.jpg' % (pageIndex,x+1))
x+=1
socket.setdefaulttimeout(5.0) #设置全局超时5秒
tool=Tool()
print '请输入游民星空网址:'
inURL = raw_input()
inURL = inURL[:-6] #去除.shtml
spider = Spider(inURL)
for x in range(1,20):
try:
spider.getPic(x)
except urllib2.URLError,e:
print e.code
print '已经没有下一页了'
break;
print '所有图片保存完毕'
之后会逐渐完善,比如保存至文件夹,超时自动重试,多线程等。