Netbian (彼岸桌面) Desktop Wallpaper Downloader (aiohttp async crawler)

海绵
2022-01-14 / 4 comments / 3 likes / 1,717 views / 1,742 characters
import asyncio
import os
import uuid

import aiohttp
from lxml import etree

# Directory used to save images (created if it does not exist)
BASE_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'img')
os.makedirs(BASE_DIR, exist_ok=True)

page_size = 20  # total number of list pages to crawl
url = 'http://www.netbian.com/meinv/index_{}.htm'  # list-page URL template
concurrency = asyncio.Semaphore(500)  # cap on in-flight requests
session = None

async def scrape_api(url, m=None):
    # Generic request helper: returns raw bytes when m is truthy, text otherwise
    try:
        async with session.get(url=url) as res:
            print(res.status, url)  # simple progress log
            if m:
                return await res.content.read()
            return await res.text()
    except Exception as e:
        print(e, 'request failed')

async def get_index(page):
    # Fetch one list page; page 1 has no numeric suffix in its URL
    if page == 1:
        new_url = 'http://www.netbian.com/meinv/index.htm'
    else:
        new_url = url.format(page)
    async with concurrency:
        return await scrape_api(new_url)

async def resolver(results):
    # Collect the thumbnail URL of every wallpaper on each list page
    img_url = []
    for html in results:
        if html is None:  # a list-page request may have failed
            continue
        tree = etree.HTML(html)
        url_list = tree.xpath('//*[@id="main"]/div[3]/ul/li/a/img/@src')
        img_url += url_list
    return img_url

async def get_img(url):
    async with concurrency:  # keep downloads under the same concurrency cap
        img_b = await scrape_api(url, m=True)
    if img_b is None:
        print(url, 'download failed')
        return
    img_name = os.path.join(BASE_DIR, str(uuid.uuid1()) + '.jpg')
    with open(img_name, 'wb') as f:
        f.write(img_b)
    print(img_name, 'saved')

async def main():
    global session
    headers = {
        # The JS that generates this cookie is not reverse-engineered here; just
        # copy the cookie from your browser. For reverse-engineering questions,
        # you can reach the blogger on WeChat.
        'Cookie': '__yjs_duid=1_6028e87cf1710e7222fbc8dbafd10b691642162867700; yjs_js_security_passport=420f238c83d202e7a4c4c1a12f537338531e91fb_1642162870_js',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    session = aiohttp.ClientSession(headers=headers)
    # Fetch every list page concurrently, then parse out the image URLs
    tasks = [asyncio.ensure_future(get_index(_)) for _ in range(1, page_size + 1)]
    results = await asyncio.gather(*tasks)
    url_list = await resolver(results)
    # Download all images concurrently
    scrape_detail_tasks = [asyncio.ensure_future(get_img(img)) for img in url_list]
    if scrape_detail_tasks:  # asyncio.wait raises on an empty task set
        await asyncio.wait(scrape_detail_tasks)
    await session.close()

if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())
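
On newer Python versions, calling asyncio.get_event_loop() outside a running loop is discouraged; since the session is created inside main(), the script can just as well be started with asyncio.run:

if __name__ == '__main__':
    asyncio.run(main())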

Keep the maximum concurrency under control and don't crawl too fast, or you'll interfere with the normal operation of someone else's site.
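
A semaphore of 500 barely throttles anything at this scale. As a gentler alternative (a minimal sketch; the limit values and the make_session helper are illustrative, not from the original post), you can shrink the semaphore and additionally cap simultaneous TCP connections with aiohttp's TCPConnector:

concurrency = asyncio.Semaphore(10)  # far fewer coroutines in flight at once

async def make_session(headers):
    # hypothetical helper: TCPConnector(limit=...) caps concurrent connections
    # per session at the transport level, independently of the semaphore
    connector = aiohttp.TCPConnector(limit=10)
    return aiohttp.ClientSession(headers=headers, connector=connector)

The two mechanisms compose: the semaphore bounds how many download coroutines run at once, while the connector bounds how many sockets aiohttp actually opens.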

Result

(image: screenshot of the downloaded wallpapers)
