一 . 线程池和进程池
可以适当地使用;但在存在大量IO的情况下,有更好的方法(见下文的单线程+异步协程)
import time
from multiprocessing.dummy import Pool


def request(url):
    """Simulate a blocking download of *url*.

    Prints a start message, blocks the calling thread for two seconds,
    then prints a completion message. Returns None.
    """
    print('正在下载->', url)
    time.sleep(2)
    print('下载完毕->', url)


start = time.time()
urls = [
    'www.baidu.com',
    'www.taobao.com',
    'www.sougou.com',
]

# multiprocessing.dummy.Pool is a pool of threads, not processes. The
# ``with`` block releases the worker threads once the (blocking) map call
# has returned, instead of leaving the pool open.
with Pool(3) as pool:
    pool.map(request, urls)

# Elapsed wall-clock time for all three simulated downloads.
print('总耗时->', time.time() - start)
二 . 单线程+异步协程(高性能的异步爬虫)
event_loop:事件循环,相当于一个无限循环,我们可以把特殊的函数注册到这个事件循环上,由它异步执行
coroutine:协程,就是被async修饰的函数
task:任务,它是对协程的进一步封装,包含了协程的各个状态
future:将要执行的任务
async/await:这两个关键字是需要重点了解的
事件循环
import asyncio


async def hello(name):
    """Print a greeting for *name*."""
    print('hello->' + name)


# Calling an async function does not run it; it only returns a coroutine
# object.
c = hello('attila')

# Create the event loop explicitly: asyncio.get_event_loop() is deprecated
# when no loop is running and raises RuntimeError on recent Python versions.
loop = asyncio.new_event_loop()
try:
    # Register the coroutine with the loop and run the loop until the
    # coroutine has finished.
    loop.run_until_complete(c)
finally:
    # Release the loop's resources even if the coroutine raised.
    loop.close()
task
import asyncio


async def hello(name):
    """Print a greeting for *name*."""
    print('hello->' + name)


# Calling an async function does not run it; it only returns a coroutine
# object.
c = hello('attila')

# Create the event loop explicitly: asyncio.get_event_loop() is deprecated
# when no loop is running and raises RuntimeError on recent Python versions.
loop = asyncio.new_event_loop()
try:
    # Wrap the coroutine in a Task so its state can be inspected.
    task = loop.create_task(c)
    print(task)  # Task pending

    # Run the loop until the task has finished.
    loop.run_until_complete(task)
    print(task)  # Task finished
finally:
    # Release the loop's resources even if the task raised.
    loop.close()
future
import asyncio


async def hello(name):
    """Print a greeting for *name*."""
    print('hello->' + name)


# Calling an async function does not run it; it only returns a coroutine
# object.
c = hello('attila')

# The loop must exist before it is used below; without this line the
# snippet fails with NameError on ``loop``.
loop = asyncio.new_event_loop()
try:
    # Wrap the coroutine in a future/task bound to this loop. Passing
    # ``loop`` explicitly avoids relying on an implicit current event loop.
    task = asyncio.ensure_future(c, loop=loop)

    # Run the loop until the task has finished.
    loop.run_until_complete(task)
finally:
    # Release the loop's resources even if the task raised.
    loop.close()
绑定回调
import asyncio


def call_back(task):
    """Done-callback: print the result of the finished *task*.

    The argument is the task the callback was attached to.
    """
    print('---->', task.result())


async def hello(name):
    """Print a greeting for *name* and return *name* as the task result."""
    print('hello->' + name)
    return name


# Calling an async function does not run it; it only returns a coroutine
# object.
c = hello('attila')

# The loop must exist before it is used below; without this line the
# snippet fails with NameError on ``loop``.
loop = asyncio.new_event_loop()
try:
    # Wrap the coroutine in a future/task bound to this loop. Passing
    # ``loop`` explicitly avoids relying on an implicit current event loop.
    task = asyncio.ensure_future(c, loop=loop)

    # Attach the callback; it is invoked with this task once it completes.
    task.add_done_callback(call_back)

    # Run the loop until the task has finished.
    loop.run_until_complete(task)
finally:
    # Release the loop's resources even if the task raised.
    loop.close()
多任务异步协程(这里需要用到一个新模块aiohttp,一定不能是requests,因为requests是一个非异步(阻塞)模块,会阻塞整个事件循环)
pip install aiohttp
import asyncio
import time

import aiohttp


async def get_page(url):
    """Fetch *url* with aiohttp and print the response body as text."""
    async with aiohttp.ClientSession() as session:
        # session.get() is itself an async context manager, so no extra
        # ``await`` is needed in front of it.
        async with session.get(url=url) as response:
            # Every I/O operation has to be awaited so the coroutine is
            # suspended while waiting.
            page_text = await response.text()
            print(page_text)


# ``time`` must be imported for this measurement; without the import the
# script fails with NameError.
start = time.time()

# These URLs point to a server the author set up locally; according to the
# original note, each endpoint sleeps for two seconds (time.sleep(2)).
urls = [
    'http://127.0.0.1:5000/cat',
    'http://127.0.0.1:5000/dog',
    'http://127.0.0.1:5000/monkey',
    'http://127.0.0.1:5000/cat',
    'http://127.0.0.1:5000/dog',
    'http://127.0.0.1:5000/monkey',
    'http://127.0.0.1:5000/cat',
    'http://127.0.0.1:5000/dog',
    'http://127.0.0.1:5000/monkey',
]

# Create the event loop explicitly: asyncio.get_event_loop() is deprecated
# when no loop is running and raises RuntimeError on recent Python versions.
loop = asyncio.new_event_loop()
try:
    tasks = []
    for url in urls:
        c = get_page(url)
        # Bind each task to this loop explicitly.
        task = asyncio.ensure_future(c, loop=loop)
        tasks.append(task)

    # Run all tasks on the single loop until every one has finished.
    loop.run_until_complete(asyncio.wait(tasks))
finally:
    # Release the loop's resources even if a request failed.
    loop.close()

# Sample figure from the original write-up: 总耗时-> 2.053046464920044
print('总耗时->', time.time() - start)