import asyncio
import os
import uuid

import aiohttp
from lxml import etree

# Directory images are saved into. os.path.join (with a trailing '' to keep
# the trailing separator) replaces the original "+ '\\img\\\\'" concatenation,
# which hard-coded Windows separators and broke on other platforms.
BASE_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'img', ''
)

page_size = 20  # total number of index pages to crawl
url = 'http://www.netbian.com/meinv/index_{}.htm'  # paginated index URL template
concurrency = asyncio.Semaphore(500)  # cap on concurrent requests
session = None  # shared aiohttp.ClientSession, created in main()
async def scrape_api(url, m=None):
    """Fetch *url* with the shared module-level session.

    Args:
        url: absolute URL to request.
        m: when truthy, return the raw response body as bytes
           (used for image downloads); otherwise return decoded text.

    Returns:
        bytes or str on success, or ``None`` if the request failed —
        the original relied on an implicit ``None`` fall-through, which
        is now explicit. The per-request debug ``print(res)`` was a
        leftover and has been removed.
    """
    try:
        async with session.get(url=url) as res:
            if m:
                return await res.content.read()
            return await res.text()
    except Exception as e:
        # Best-effort crawler: log the error and let callers skip this URL.
        print(e, '出错')
        return None
async def get_index(page):
    """Download one index page, throttled by the shared semaphore.

    Page 1 on this site has no numeric suffix, so it gets a dedicated URL;
    every other page is formatted from the module-level template.
    """
    target = (
        'http://www.netbian.com/meinv/index.htm'
        if page == 1
        else url.format(page)
    )
    async with concurrency:
        return await scrape_api(target)
async def resolver(results):
    """Extract every image URL from the fetched index pages.

    Args:
        results: list of index-page HTML strings; entries may be ``None``
                 when ``scrape_api`` failed for that page.

    Returns:
        Flat list of image ``src`` URLs.

    Fix: the original passed failed (``None``) pages straight to
    ``etree.HTML``, which raises — failed pages are now skipped.
    """
    img_url = []
    for html in results:
        if html is None:  # index fetch failed; nothing to parse
            continue
        tree = etree.HTML(html)
        img_url += tree.xpath('//*[@id="main"]/div[3]/ul/li/a/img/@src')
    return img_url
async def get_img(url):
    """Download one image and save it under BASE_DIR with a fresh uuid name.

    Fix: the original compared with ``== None`` and, worse, fell through
    after printing the failure message and called ``f.write(None)``,
    raising TypeError — it now returns early on a failed download.
    """
    img_b = await scrape_api(url, m=True)
    if img_b is None:
        print('保存失败')
        return
    img_name = BASE_DIR + str(uuid.uuid1()) + '.jpg'
    with open(img_name, 'wb') as f:
        f.write(img_b)
    print(img_name, '保存成功')
async def main():
    """Crawl all index pages, then download every image found.

    Fixes: the session is now closed in a ``finally`` so it is not
    leaked when any step raises, and ``asyncio.wait`` — which raises
    ValueError on an empty task set (e.g. when parsing found no
    images) — is replaced with ``asyncio.gather``.
    """
    global session
    headers = {
        # Cookie copied straight from the browser; the JS that generates
        # it is deliberately not reverse-engineered here.
        'Cookie':'__yjs_duid=1_6028e87cf1710e7222fbc8dbafd10b691642162867700; yjs_js_security_passport=420f238c83d202e7a4c4c1a12f537338531e91fb_1642162870_js',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    session = aiohttp.ClientSession(headers=headers)
    try:
        tasks = [
            asyncio.ensure_future(get_index(page))
            for page in range(1, page_size + 1)
        ]
        results = await asyncio.gather(*tasks)
        url_list = await resolver(results)
        # gather (unlike wait) is a no-op on an empty argument list.
        await asyncio.gather(*(get_img(img) for img in url_list))
    finally:
        await session.close()
if __name__ == '__main__':
    # asyncio.run creates and closes the event loop itself; the original
    # bound run_until_complete's return value (None) to a variable
    # misleadingly named ``loop``.
    asyncio.run(main())
# 注意控制最大并发量,不要爬太快,影响别人网站正常运行
# (NOTE: keep the concurrency cap reasonable — don't crawl so fast that
# you disrupt the target site's normal operation.)