# Python-xpath学习——爬取美女图片 (Learning XPath with Python: scraping photo galleries)

import requests
from lxml import etree
import os
import threading

def _fetch(url, headers, timeout=15):
    """GET *url* and return the response.

    A timeout is always applied so a stalled connection cannot hang the
    calling thread indefinitely, and a 4xx/5xx status raises instead of
    handing an error page back to the caller.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response


def get_img(idx):
    """Download every gallery linked from listing page *idx*.

    The listing page ``index-{idx}.html`` is fetched and two parallel lists
    are read from its ``ul.detail-list`` element: gallery titles (the ``alt``
    attribute of each thumbnail) and gallery URLs. Each gallery is saved to
    ``img/page{idx}/{position}_{title}/`` as ``1.jpg``, ``2.jpg``, ...

    Args:
        idx: Number of the listing page to scrape.

    Returns:
        None. Progress and failures are reported with ``print``; any error
        aborts the rest of this page but is not propagated to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        listing_url = f"https://www.92meinv.com/index-{idx}.html"
        listing = etree.HTML(_fetch(listing_url, headers).text)
        names = listing.xpath('//ul[@class="detail-list"]//a/img/@alt')
        real = listing.xpath('//ul[@class="detail-list"]//div/a/@href')

        for index, src in enumerate(real):
            name = names[index]
            target_dir = f"img/page{idx}/{index + 1}_{name}"

            # An existing directory is treated as "already downloaded". Check
            # this before touching the network so a re-run costs no requests
            # for finished galleries.
            # NOTE: the directory is created before any image is saved, so a
            # gallery interrupted midway is also skipped on later runs.
            if os.path.exists(target_dir):
                continue
            os.makedirs(target_dir, exist_ok=True)

            tree = etree.HTML(_fetch(src, headers).text)
            # The counter text is split on " / " after stripping parentheses;
            # presumably it looks like "(1 / 20)", the second part being the
            # number of images in the gallery.
            counter = tree.xpath('//div[@class="des"]/h1/span/text()')[0]
            num = int(counter.strip("()").split(" / ")[1])

            print(f"page {idx} -> 正在下载:{name}...")
            for i in range(1, num + 1):
                if i > 1:
                    # Image i (i >= 2) lives on its own page "<gallery>-i.html";
                    # the first image is on the gallery page already parsed.
                    page_url = src.replace(".html", "-" + str(i) + ".html")
                    tree = etree.HTML(_fetch(page_url, headers).text)
                link = tree.xpath('//div[@class="pp hh"]/a/img/@src')[0]
                img_data = _fetch(link, headers).content
                with open(f"{target_dir}/{i}.jpg", "wb") as f:
                    f.write(img_data)
        print(f"page {idx}所有图片下载完成")
    except Exception as exc:
        # Best-effort boundary: report why the page failed instead of hiding
        # it, while letting KeyboardInterrupt/SystemExit propagate.
        print(f"page {idx} 下载失败: {exc!r}")
        
if __name__ == "__main__":
    # Ask for the inclusive range of listing pages to scrape.
    first_page = int(input("请输入开始页数:"))
    last_page = int(input("请输入结束页数:"))

    # One worker thread per listing page; all are built before any is started.
    workers = [
        threading.Thread(target=get_img, args=(page,))
        for page in range(first_page, last_page + 1)
    ]
    for worker in workers:
        worker.start()
    # Block until every page worker has finished.
    for worker in workers:
        worker.join()