利用 Python 爬虫爬取 Pixiv 上的小姐姐

本文最后更新于：2020-09-12 22:50

Pixiv 上的小姐姐那么多，手动保存到手断了也保存不完，为什么不利用 Python 爬虫自动帮我们下载呢？

使用本教程前先确定你的本地网络是否能访问 Pixiv

项目的文件目录

.                       # 项目根目录
|-- Folder1             # 文件夹名字可以自定义 一般为画师的名字
    |-- 6957790.txt     # 文件名为画师的主页 ID
|-- Folder2             # 文件夹名字可以自定义 一般为画师的名字
    |-- 1480420.txt     # 文件名为画师的主页 ID
|-- Folder3             # 文件夹名字可以自定义 一般为画师的名字
    |-- 28440744.txt    # 文件名为画师的主页 ID
|-- pixiv.py            # Python 爬虫脚本

爬虫代码

pixiv.py 的内容如下

# -*- coding: utf-8 -*-


'''
请先安装好相应的 Python 模块
pip install requests bs4 lxml
'''


import requests
from bs4 import BeautifulSoup
import lxml
import re
from glob import glob
import os


# HTTP headers sent with the profile and artwork-page requests below.
headers = {
    # Replace the empty string with your own Cookie.
    'cookie': '',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/83.0.4103.116 Safari/537.36'
    ),
}


def get_pic_list(num):
    """Return the IDs of all works listed on an artist's profile.

    Requests ``https://www.pixiv.net/ajax/user/<num>/profile/all`` with the
    module-level ``headers`` and collects every numeric key whose value is
    ``null`` from the raw response text.

    Args:
        num: The artist's user ID; it is interpolated into the URL with ``%s``.

    Returns:
        A list of work IDs as strings, in the order they occur in the
        response text. Empty when nothing matches.
    """
    url = 'https://www.pixiv.net/ajax/user/%s/profile/all?lang=zh' % num
    r = requests.get(url, headers=headers)
    # Raw string: '\d' inside a normal string literal is an invalid escape
    # sequence, which Python reports with a warning at compile time.
    pic_list = re.findall(r'"(\d+?)":null', r.text)
    return pic_list


def get_file_list():
    """Return the base names of the non-``.txt`` entries in the current directory.

    Every entry matched by ``glob('*')`` whose name contains ``'.txt'`` is
    left out. For each remaining entry, the part of the name before the first
    ``'.'`` is returned.

    Returns:
        A list of base-name strings (one per remaining entry), in the order
        ``glob`` yields them.
    """
    # Build a new list instead of calling remove() on the list being
    # iterated: removing during iteration skips the entry that follows each
    # removed one, so some '.txt' names used to survive the filter.
    return [name.split('.')[0] for name in glob('*') if '.txt' not in name]


def download(num):
    """Download an artist's works that are not yet in the current directory.

    The work IDs come from ``get_pic_list(num)``. Any ID that already appears
    in ``get_file_list()`` is skipped. For every other ID the artwork page is
    fetched, the first ``"original":"..."`` URL found in the page text is
    downloaded, and the bytes are saved as ``<id>.<extension>`` in the current
    working directory. The ID is printed after each successful save.

    Works whose page contains no ``"original"`` URL are skipped silently.
    Image requests that do not return HTTP 200 are reported and skipped, so
    no file is created for them.

    Args:
        num: The artist's user ID, passed through to ``get_pic_list``.
    """
    pic_list = get_pic_list(num)
    # A set gives constant-time membership tests inside the loop.
    existing = set(get_file_list())
    for pic in pic_list:
        if pic in existing:
            continue

        url = 'https://www.pixiv.net/artworks/%s' % pic
        r = requests.get(url, headers=headers)
        result = re.search('"original":"(.+?)"', r.text)
        if not result:
            continue

        download_url = result.group(1)
        # Extension = text after the last '.' of the URL's final path segment.
        suffix = download_url.rsplit('/', 1)[-1].rsplit('.', 1)[-1]
        # No 'if-modified-since' header: nothing is cached locally, and a
        # conditional request may be answered with an empty "304 Not Modified".
        h = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'dnt': '1',
            # Use the page this image was found on rather than a fixed,
            # unrelated artwork URL.
            'referer': url,
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        }
        s = requests.get(download_url, headers=h)
        if s.status_code != 200:
            # Writing an error body to disk would leave a broken "image" that
            # later runs treat as already downloaded.
            print('%s 下载失败 (HTTP %s)' % (pic, s.status_code))
            continue

        with open(pic + '.' + suffix, 'wb') as f:
            f.write(s.content)
        print(pic)


def main():
    """Run ``download`` once for every artist folder in the working directory.

    Each sub-directory of the current working directory is entered in turn.
    The artist ID is the name, without its extension, of the first ``*.txt``
    file found in that folder. Entries that are not directories (including
    this script itself) are ignored, and folders without a ``.txt`` file are
    reported and skipped.
    """
    cwd = os.getcwd()
    for filename in os.listdir(cwd):
        # os.path.join keeps the path valid on every platform, unlike a
        # hard-coded '\\' separator.
        folder = os.path.join(cwd, filename)
        # Only directories can be entered; this also skips the script file
        # whatever it is named, so no name needs to be removed from the list.
        if not os.path.isdir(folder):
            continue

        print('切换到%s' % filename)
        os.chdir(folder)

        id_files = glob('*.txt')
        if not id_files:
            # Indexing [0] on an empty result would raise IndexError.
            print('%s 中没有找到 .txt 文件，跳过' % filename)
            continue

        # splitext removes only the trailing extension; the previous
        # re.sub('.txt', ...) had an unescaped dot and matched anywhere.
        num = os.path.splitext(id_files[0])[0]
        download(num)


# Start the crawler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()