Python之协程爬虫 小说网协程爬虫案例

在《Gevent协程的使用》一文中,我们已经学会了协程的简单用法,这篇文章我们通过协程爬虫来测试一下具体的效果。调用 monkey.patch_all() 打上补丁之后,Gevent遇到IO阻塞时会自动切换任务:

from gevent import monkey
monkey.patch_all()  # 

import gevent
from urllib.request import urlopen


def f(url):
    """Download *url* and report how many bytes were received.

    Args:
        url: Address to fetch with ``urlopen``.

    Returns:
        None. Progress is reported with ``print``.
    """
    print('GET: %s' % url)
    # Use the response as a context manager so the underlying connection is
    # closed even if reading fails part-way through.
    with urlopen(url) as resp:
        data = resp.read()
    print('%d bytes received from %s.' % (len(data), url))


# Spawn one greenlet per URL and block until all of them have finished.
gevent.joinall([
    gevent.spawn(f, target)
    for target in (
        'https://www.python.org/',
        'https://www.baidu.com/',
        'https://www.e1yu.com/',
    )
])

基于以上代码,我们使用协程来抓取小说网:https://www.17k.com/all 。网页的分析这里不做过多介绍,重点是演示如何使用协程:

from gevent import monkey
monkey.patch_all()

from lxml import etree
import gevent
import requests
import time
def get_all_url(url, headers):
    """Fetch one listing page and parse its book rows.

    Args:
        url: Address of the listing page to download.
        headers: Mapping of HTTP request headers to send (e.g. a User-Agent).

    Returns:
        A list with one dict per ``<tr class="bg0">`` row, each holding the
        keys ``cate``, ``title``, ``href``, ``count``, ``author``, ``date``
        and ``fz``. Callers that only need the side effect of fetching may
        ignore it.

    Raises:
        IndexError: If a row is missing one of the expected cells (the
            corresponding xpath query returns an empty list).
    """
    # The second positional parameter of requests.get() is ``params``, so the
    # headers must be passed by keyword; otherwise they are sent as query
    # string values and the User-Agent is never applied.
    res = requests.get(url, headers=headers)
    tree = etree.HTML(res.text)
    items = []
    for row in tree.xpath('//tr[@class="bg0"]'):
        item = {}
        item['cate'] = row.xpath('./td[2]/a/text()')[0]
        item['title'] = row.xpath('./td[3]/span/a/text()')[0]
        # lstrip() takes a set of characters, so '/' strips every leading
        # slash of the protocol-relative link.
        item['href'] = row.xpath('./td[3]/span/a/@href')[0].lstrip('/')
        item['count'] = row.xpath('./td[5]/text()')[0]
        item['author'] = row.xpath('./td[6]/a/text()')[0]
        item['date'] = row.xpath('./td[7]/text()')[0]
        # Keep only the first whitespace-separated token of the cell text.
        item['fz'] = row.xpath('./td[8]/em/text()')[0].split()[0]
        items.append(item)
    return items
# Build the paginated request URLs and return them as a list.
def get_url_list():
    """Return the listing-page URLs for pages 1 through 329, in order."""
    template = 'https://www.17k.com/all/book/2_0_0_0_0_0_0_0_{0}.html'
    return [template.format(page) for page in range(1, 330)]
# Time the coroutine-based requests.
def gevent_get():
    """Fetch every listing page in its own greenlet and print the elapsed time.

    Reads the module-level ``headers`` name, which must be defined before
    this function is called.
    """
    started = time.time()
    greenlets = [
        gevent.spawn(get_all_url, page_url, headers)
        for page_url in get_url_list()
    ]
    # Block until every spawned greenlet has finished.
    gevent.joinall(greenlets)
    elapsed = time.time() - started
    print('gevent cost:%s' % str(elapsed))
# Time the plain sequential requests.
def ordinary_get():
    """Fetch every listing page one after another and print the elapsed time.

    Reads the module-level ``headers`` name, which must be defined before
    this function is called.
    """
    started = time.time()
    page_urls = get_url_list()
    for page_url in page_urls:
        get_all_url(page_url, headers)
    elapsed = time.time() - started
    print('ordinary cost:%s' % str(elapsed))
if __name__ == '__main__':
    # Module-level request headers. gevent_get() and ordinary_get() read this
    # global name, so it must be assigned before either of them is called.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"
    }
    # Run the coroutine benchmark first, then the sequential one.
    gevent_get()
    ordinary_get()


# gevent cost:17.533002614974976
# ordinary cost:47.7977340221405

从以上运行结果可以看出,协程方式耗时约 17.5 秒,普通顺序请求耗时约 47.8 秒,协程的效率大约是普通请求的 2.7 倍。本文主要是测试一下协程的效率,没有抓取小说的详情页,大家可以自己试着修改一下,练习一下!

发表评论