Environment: Python 3.7.0

Required library: requests

Step 1: fetch the page's HTML source (test URL: http://quanben5.com/n/yuzui/xiaoshuo.html)

import requests
from requests.exceptions import RequestException

# Fetch the page's HTML source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def main():
    url = 'http://quanben5.com/n/yuzui/xiaoshuo.html'
    html = get_one_page(url)
    print(html)

main()
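
One thing to watch with Chinese sites: requests sometimes guesses the wrong encoding and response.text comes back garbled. Here is a minimal, more defensive variant of the fetch, assuming the same kind of page (the timeout and the apparent_encoding fallback are my additions, not part of the original code):

import requests
from requests.exceptions import RequestException

def get_one_page_safe(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # A timeout keeps a stalled connection from hanging the script forever
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            # Fall back to the encoding requests detects from the body,
            # in case the server sends no (or the wrong) charset header
            response.encoding = response.apparent_encoding
            return response.text
        return None
    except RequestException:
        return None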

Screenshot:

Step 2: get the URLs of all chapters

Additional library needed for this step: re

import re
import requests
from requests.exceptions import RequestException

# Fetch the page's HTML source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

# Get the URL of every chapter
def get_url(html):
    # Use a regex to pull out the parts we need
    URL = re.findall('<li class="c3"><a href="(.*?)"><span>', html, re.S)
    list_url = []   # a list to hold every chapter's full URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
    for url_ in list_url:
        print(url_)

def main():
    url = 'http://quanben5.com/n/yuzui/xiaoshuo.html'
    html = get_one_page(url)
    get_url(html)

main()
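
To see what that regex actually matches, here is a standalone demo against a hypothetical fragment of the catalog page (the HTML snippet is invented for illustration; the real page may differ):

import re

sample = '''
<li class="c3"><a href="/n/yuzui/41934.html"><span>第一章</span></a></li>
<li class="c3"><a href="/n/yuzui/41935.html"><span>第二章</span></a></li>
'''
# re.S lets . match newlines as well; (.*?) is non-greedy, so each match
# stops at the first "><span>" after its own href
print(re.findall('<li class="c3"><a href="(.*?)"><span>', sample, re.S))
# -> ['/n/yuzui/41934.html', '/n/yuzui/41935.html']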

Result:

Fetching a single chapter's content:

import re
import requests
from requests.exceptions import RequestException

# Fetch the page's HTML source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
'''
# Get the URL of every chapter
def get_url(html):
    # Use a regex to pull out the parts we need
    URL = re.findall('<li class="c3"><a href="(.*?)"><span>', html, re.S)
    list_url = []   # a list to hold every chapter's full URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
    for url_ in list_url:
        print(url_)
'''
# Get the content of a single chapter
def get_content(html):
    title = re.findall('<h1 class="title1">(.*?)</h1>',html,re.S)
    title = title[0]
    print(title)
    content = re.findall('<p>(.*?)</p>',html,re.S)
    for sentence in content:
        print(sentence)

def main():
    url = 'http://quanben5.com/n/yuzui/41935.html'
    html = get_one_page(url)
    #get_url(html)
    get_content(html)

main()
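
Note that text pulled straight out of <p> tags can still contain HTML entities such as &nbsp; or &amp;. If that shows up in practice, the standard library's html.unescape will decode them; a small optional helper (my addition, not part of the original code):

import html

def clean_sentence(sentence):
    # Decode entities like &nbsp; and &amp; into plain characters
    return html.unescape(sentence).strip()

print(clean_sentence('&nbsp;&nbsp;他点了点头&amp;笑了。'))  # 他点了点头&笑了。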

Result:

Downloading a single chapter:

import re
import requests
from requests.exceptions import RequestException

# Fetch the page's HTML source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
'''
# Get the URL of every chapter
def get_url(html):
    # Use a regex to pull out the parts we need
    URL = re.findall('<li class="c3"><a href="(.*?)"><span>', html, re.S)
    list_url = []   # a list to hold every chapter's full URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
    for url_ in list_url:
        print(url_)
'''
# Get the content of a single chapter
def get_content(html):
    title = re.findall('<h1 class="title1">(.*?)</h1>',html,re.S)
    title = title[0]
    print(title)
    write_to_file(title)
    content = re.findall('<p>(.*?)</p>',html,re.S)
    for sentence in content:
        print(sentence)
        write_to_file(sentence)
    write_to_file('\n')

# Save the content to a local file
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')

def main():
    url = 'http://quanben5.com/n/yuzui/41935.html'
    html = get_one_page(url)
    #get_url(html)
    get_content(html)

main()
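
Because write_to_file opens the file in append mode ('a'), running the script twice appends the chapter twice. If that matters, one option is to truncate the output file once at startup (a sketch, using the same result.txt filename as above):

# Truncate the output file once before downloading, so reruns start clean
open('result.txt', 'w', encoding='utf-8').close()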

Result:

Downloading all chapters:

import re
import requests
from requests.exceptions import RequestException

# Fetch the page's HTML source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

# Get the URL of every chapter
def get_url(html):
    # Use a regex to pull out the parts we need
    URL = re.findall('<li class="c3"><a href="(.*?)"><span>', html, re.S)
    list_url = []   # a list to hold every chapter's full URL
    for url_ in URL:
        list_url.append( 'http://quanben5.com' + url_ )
    return list_url

# Get the content of a single chapter
def get_content(html):
    title = re.findall('<h1 class="title1">(.*?)</h1>', html, re.S)
    title = title[0]
    print(title + ' - downloading')
    write_to_file(title)
    content = re.findall('<p>(.*?)</p>',html,re.S)
    for sentence in content:
        write_to_file(sentence)
    write_to_file('\n')

# Save the content to a local file
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')

# Save every chapter to the local file
def save_content(list_url):
    for url_ in list_url:
        html_ = get_one_page(url_)
        get_content(html_)

def main():
    url = 'http://quanben5.com/n/yuzui/xiaoshuo.html'
    html = get_one_page(url)
    list_url = get_url(html)
    save_content(list_url)

main()
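
As written, save_content fires requests at the site as fast as it can, which is an easy way to get rate-limited (as happens at the end of this post). A throttled sketch, assuming a one-second pause is acceptable (the delay value and the None check are my additions):

import time

def save_content(list_url):
    for url_ in list_url:
        html_ = get_one_page(url_)
        if html_ is not None:   # skip chapters whose download failed
            get_content(html_)
        time.sleep(1)           # pause between requests to stay polite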

Result:

Improving it with a search feature:

After searching:

Use a regex to extract the target URL we are after:

import re
import requests
from requests.exceptions import RequestException

# Define the global variable keyword; handy for creating the text file later
keyword = input('Enter the novel you want to download: ')

# Fetch the page's HTML source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def main():
    url = 'http://quanben5.com/index.php?c=book&a=search&keywords='+keyword
    html = get_one_page(url)
    url1 = re.findall(r'<h3><a href="(.*?)">',html,re.S)
    url1 = url1[0]
    print(url1)
    url2 = 'http://quanben5.com'+url1+'/xiaoshuo.html'
    print(url2)
    
main()
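
A side note on the URL building above: the keyword is concatenated into the URL as raw Chinese text. requests generally percent-encodes it for you, but the explicit route is to pass the query through params and let requests build the URL (a sketch of the same search request):

import requests

# requests assembles and percent-encodes the query string itself
response = requests.get(
    'http://quanben5.com/index.php',
    params={'c': 'book', 'a': 'search', 'keywords': keyword},
)
print(response.url)  # the fully encoded URL that was actually requested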

 

Running it:

Sometimes a search returns no results; url1[0] then indexes an empty list and the program crashes with an IndexError.

Next step: add handling for that case:

import re
import requests
from requests.exceptions import RequestException

# Define the global variable keyword; handy for creating the text file later
keyword = input('Enter the novel you want to download: ')

# Fetch the page's HTML source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def main():
    url = 'http://quanben5.com/index.php?c=book&a=search&keywords='+keyword
    html = get_one_page(url)
    url1 = re.findall(r'<h3><a href="(.*?)">',html,re.S)
    if not url1:
        print('No results found!!!')
        
    else:
        url1 = url1[0]
        url2 = 'http://quanben5.com'+url1+'/xiaoshuo.html'
        print(url2)
    
main()
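
Rather than just printing a message, the check could loop and prompt for a new keyword until something is found. A sketch of that pattern (the helper name search_novel is mine, and the "if html else []" guard covers a failed request):

def search_novel():
    # Keep prompting until the search regex finds at least one match
    while True:
        kw = input('Enter the novel you want to download: ')
        html = get_one_page(
            'http://quanben5.com/index.php?c=book&a=search&keywords=' + kw)
        urls = re.findall(r'<h3><a href="(.*?)">', html, re.S) if html else []
        if urls:
            return 'http://quanben5.com' + urls[0] + '/xiaoshuo.html'
        print('No results found, try again!')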

 

Result:

---------------------------------------------------------------- Divider ----------------------------------------------------------------

Here is the complete code:

import re
import sys
import requests
from requests.exceptions import RequestException

# Define keyword as a global; it doubles as the output filename
keyword = input('Enter the novel you want to download: ')
name = keyword + '.txt'

# Fetch the page's HTML source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

# Get the URL of every chapter
def get_url(html):
    # Use a regex to pull out the parts we need
    URL = re.findall('<li class="c3"><a href="(.*?)"><span>', html, re.S)
    list_url = []   # a list to hold every chapter's full URL
    for url_ in URL:
        list_url.append( 'http://quanben5.com' + url_ )
    return list_url

# Get the content of a single chapter
def get_content(html):
    title = re.findall('<h1 class="title1">(.*?)</h1>', html, re.S)
    title = title[0]
    print(title + ' - downloading')
    write_to_file(title)
    content = re.findall('<p>(.*?)</p>',html,re.S)
    for sentence in content:
        write_to_file(sentence)
    write_to_file('\n')

# Save the content to a local file
def write_to_file(content):
    with open(name, 'a', encoding='utf-8') as f:
        f.write(content + '\n')

# Save every chapter to the local file
def save_content(list_url):
    for url_ in list_url:
        html_ = get_one_page(url_)
        if html_ is not None:   # skip chapters whose download failed
            get_content(html_)


def main():
    url = 'http://quanben5.com/index.php?c=book&a=search&keywords=' + keyword
    html = get_one_page(url)
    # If the request itself failed, treat it the same as an empty search result
    url1 = re.findall(r'<h3><a href="(.*?)">', html, re.S) if html else []
    if not url1:
        print('No results found!!!')
        flag = input('Quit? (Y or N): ')
        if flag in ('Y', 'y'):
            sys.exit()
        else:
            print("Nothing I can do if it can't be found -.-||")
    else:
        url1 = url1[0]  # the novel's base URL path
        url2 = 'http://quanben5.com' + url1 + '/xiaoshuo.html'  # the catalog page URL
        html2 = get_one_page(url2)
        list_url = get_url(html2)
        print(name + ' download starting!!!')
        save_content(list_url)

main()

Finally, run it!!! (I hit the site a bit too often while writing this post and got rate-limited, so the later chapters aren't shown here.)