Step 1. Create the project: >> scrapy startproject xici_proxyip_project
Step 2. Create the spider:
>> cd xici_proxyip_project
>> scrapy genspider xicispider xicidaili.com
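For reference, these two commands generate roughly the following layout (the exact files vary a little by Scrapy version):

xici_proxyip_project/
    scrapy.cfg
    xici_proxyip_project/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            xicispider.py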
Step 3. Write the Item (items.py)
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class XiciProxyipProjectItem(scrapy.Item):
    # define the fields for your item here like:
    IP = scrapy.Field()
    PORT = scrapy.Field()
    ADDRESS = scrapy.Field()          # location
    ANONY = scrapy.Field()            # anonymity level
    TYPE = scrapy.Field()             # type
    SPEED = scrapy.Field()            # speed
    ACTIVE_TIME = scrapy.Field()      # time alive
    LAST_CHECK_TIME = scrapy.Field()  # last verification time
Step 4. Write the spider (xicispider.py)
# -*- coding: utf-8 -*-
import scrapy
import logging
from xici_proxyip_project.items import XiciProxyipProjectItem

log = logging.getLogger()


class XicispiderSpider(scrapy.Spider):
    name = 'xicispider'
    allowed_domains = ['xicidaili.com']
    start_urls = [
        'http://xicidaili.com/nn/1',
    ]

    def start_requests(self):
        reqs = []
        for i in range(1, 3):
            # send explicit headers, otherwise the site returns 503
            req = scrapy.Request(
                "http://www.xicidaili.com/nn/{}".format(i),
                headers={
                    'Host': 'www.xicidaili.com',
                    'Referer': 'http://www.xicidaili.com/nn/{}'.format(i),
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
                }
            )
            reqs.append(req)
        return reqs

    def parse(self, response):
        trlist = response.css("table tr")[1:]  # skip the table header row
        # total page count: the <a> tags under the div with class=pagination
        # that are not class=next_page; a bit rough
        # totalPage = response.xpath("//div[@class='pagination']/a[not(@class='next_page')]/text()").extract()[-1]
        for i in trlist:
            # create a fresh item per row; reusing a single item (or appending it
            # to a list) would just repeat the last row's data -- yield each one
            xc_items = XiciProxyipProjectItem()
            tds = i.xpath("td/text()").extract()
            xc_items["IP"] = tds[0]
            xc_items["PORT"] = tds[1]
            xc_items["ADDRESS"] = i.xpath('string(td[4])').extract()[0].strip()
            xc_items["ANONY"] = tds[4]
            xc_items["TYPE"] = tds[5]
            xc_items["SPEED"] = i.xpath("td/div/@title").extract()[0]
            xc_items["ACTIVE_TIME"] = tds[-2]
            xc_items["LAST_CHECK_TIME"] = tds[-1]
            # log.info("scraped item: %s", dict(xc_items))
            yield xc_items
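If the positional XPath indexes above look fragile, they are easy to sanity-check interactively before running the spider. A minimal sketch using scrapy shell (the User-Agent header matters here too, otherwise the site answers 503; the selectors are the same ones parse() uses):

>> scrapy shell
>>> from scrapy import Request
>>> ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
>>> fetch(Request('http://www.xicidaili.com/nn/1', headers={'User-Agent': ua}))
>>> row = response.css("table tr")[1:][0]
>>> row.xpath("td/text()").extract()[:2]              # expect [IP, PORT]
>>> row.xpath("string(td[4])").extract()[0].strip()   # ADDRESS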
Step 5. Configure settings
1. Disable robots.txt compliance:
ROBOTSTXT_OBEY = False
2. DEFAULT_REQUEST_HEADERS can be left disabled, since each request in the spider already sets its own headers; see the sketch below.
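After this step the relevant part of settings.py looks like this (the commented block is Scrapy's generated default, left disabled):

# settings.py
ROBOTSTXT_OBEY = False

# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }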
Step 6. Write the pipeline (pipelines.py)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class XiciProxyipProjectPipeline(object):
    def __init__(self):
        self.filename = open(r"E:\0802\data\xici.txt", "wb")

    def process_item(self, item, spider):
        # one JSON object per line; ensure_ascii=False keeps Chinese text readable
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(text.encode("utf-8"))
        return item  # return the item so any later pipelines can process it too

    def close_spider(self, spider):
        self.filename.close()
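With this pipeline, each item becomes one JSON object per line in xici.txt, looking like the sample record quoted in the MySQL addendum below. If a JSON file is all you need, Scrapy's built-in feed export does the same job without a custom pipeline:

>> scrapy crawl xicispider -o xici.json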
Step 7. Enable the pipeline in settings
ITEM_PIPELINES = {
    'xici_proxyip_project.pipelines.XiciProxyipProjectPipeline': 300,
}
Step 8. Run the spider
>> scrapy crawl xicispider
Reference article (this post builds on it with some upgrades): 西刺网站爬虫代码解读
Addendum: saving to MySQL
1. >> conda install pymysql (for Python 3.x; pip install pymysql also works)
2. Rewrite pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import pymysql


class XiciProxyipProjectPipeline(object):
    def __init__(self):
        # self.filename = open(r"E:\0802\data\xici.txt", "wb")
        self.client = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            passwd='XXXX',
            db='test',
            # charset='utf-8'  # wrong name for pymysql -- see note 2 at the end
        )
        self.cur = self.client.cursor()

    def process_item(self, item, spider):
        # text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        # self.filename.write(text.encode("utf-8"))
        # a scraped item looks like:
        # {"IP": "121.31.100.28", "PORT": "8123", "ADDRESS": "广西防城港", "ANONY": "高匿",
        #  "TYPE": "HTTP", "SPEED": "7.502秒", "ACTIVE_TIME": "827天", "LAST_CHECK_TIME": "18-09-26 12:32"}
        sql = 'insert into xicidb(IP,PORT,ADDRESS,ANONY,TYPE,SPEED,ACTIVE_TIME,LAST_CHECK_TIME) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)'
        params = (item['IP'], item['PORT'], item['ADDRESS'], item['ANONY'],
                  item['TYPE'], item['SPEED'], item['ACTIVE_TIME'], item['LAST_CHECK_TIME'])
        self.cur.execute(sql, params)  # parameterized query; pymysql escapes the values
        self.client.commit()
        return item

    def close_spider(self, spider):
        # self.filename.close()
        self.client.close()  # close the MySQL connection when the spider finishes
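The INSERT above assumes a table named xicidb already exists in the test database. The post does not show its schema, so here is a minimal sketch with guessed column types (everything is stored as text, matching the scraped strings; the DEFAULT CHARSET matters for note 2 below):

CREATE TABLE xicidb (
    IP              VARCHAR(20),
    PORT            VARCHAR(10),
    ADDRESS         VARCHAR(100),
    ANONY           VARCHAR(20),
    TYPE            VARCHAR(20),
    SPEED           VARCHAR(20),
    ACTIVE_TIME     VARCHAR(20),
    LAST_CHECK_TIME VARCHAR(30)
) DEFAULT CHARSET = utf8mb4;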
3. Run the spider: >> scrapy crawl xicispider
Result screenshot:
Addendum (2018-09-28 01:05):
Notes:
1. "check the manual that corresponds to your MySQL server version for the right syntax"
This error usually means the SQL statement itself is malformed, so double-check your SQL. In my case a stray comma after the last %s placeholder cost me some time to track down.
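Concretely, the failing and working versions differ by one easy-to-miss character:

# wrong: a stray comma after the last %s -> MySQL syntax error
sql = 'insert into xicidb(IP,PORT,ADDRESS,ANONY,TYPE,SPEED,ACTIVE_TIME,LAST_CHECK_TIME) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,)'
# right:
sql = 'insert into xicidb(IP,PORT,ADDRESS,ANONY,TYPE,SPEED,ACTIVE_TIME,LAST_CHECK_TIME) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)'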
2. pymysql.err.InternalError: (1366, "Incorrect string value: '\\xE6\\xB5\\x99\\xE6\\xB1\\x9F...' for column 'ADDRESS' at row 1")
Solution: the connection and the table cannot store the Chinese ADDRESS text with their current charset. The usual fix is twofold: pass charset='utf8' when connecting (pymysql spells it without a hyphen, which is why the commented-out charset='utf-8' above fails) and make sure the xicidb table itself uses a utf8/utf8mb4 charset.
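A sketch of the two changes, assuming the test database and the xicidb table from above:

# in pipelines.py -- pymysql wants 'utf8' (no hyphen); 'utf-8' raises at connect time
self.client = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    passwd='XXXX',
    db='test',
    charset='utf8',  # or 'utf8mb4'
)

And if the table was created with a non-UTF-8 default charset, convert it once in MySQL:

ALTER TABLE xicidb CONVERT TO CHARACTER SET utf8mb4;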