爬虫(二)
  pfl0iGEgXbv6 2023年11月02日 37 0

模拟登录进行爬虫:

抓包分析:

如果显示您的连接不是私密连接,右键图标,属性,在位置那里打一个空格。添加 --test-type --ignore-certificate-errors

我们看到,马上开搞!

爬虫(二)_Chrome

爬一下书架的藏书:

分析观察一下:

爬虫(二)_html_02

效果:

爬虫(二)_Chrome_03

code:

import requests

# Target site; mobile Chrome UA so the site serves its lightweight pages.
url = "https://www.17k.com/"
headers = {
    'User-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36',
}

# A Session keeps the cookies returned by the login reply, so the
# bookshelf request below is already authenticated.
session = requests.session()

# POST the credentials to the login route.
credentials = {
    "loginName": "18086822387",
    "password": "admin123"
}
session.post("https://passport.17k.com/ck/user/login", data=credentials, headers=headers)

# Fetch the logged-in user's bookshelf (JSON API) and print the payload.
res = session.get(url="https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919")
res.encoding = "utf8"
data = res.json().get("data")
print(data)

etree解析:

爬虫(二)_html_04

爬虫(二)_HTML_05

import requests
from lxml import etree

url = "https://www.17k.com/"
headers = {
    'User-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36',
}

# Session stores the login cookies for the bookshelf request.
session = requests.session()

# Send the credentials to the login route first.
session.post("https://passport.17k.com/ck/user/login", data={
    "loginName": "18086822387",
    "password": "admin123"
}, headers=headers)

# Bookshelf API: one JSON record per book on the shelf.
res = session.get(url="https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919")
res.encoding = "utf8"
data = res.json().get("data")

# For every book on the shelf, fetch its chapter-list page and print
# each chapter link and title found under the volume list.
for book in data:
    book_id = book.get('bookId')
    page = requests.get("https://www.17k.com/list/{}.html".format(book_id))
    page.encoding = 'utf-8'

    tree = etree.HTML(page.text)
    for anchor in tree.xpath('//dl[@class="Volume"]/dd/a'):
        href = anchor.xpath("./@href")[0]
        print(href)
        title = anchor.xpath("./span/text()")[0].strip()
        print(title)

最终版本:

import requests
from lxml import etree

url = "https://www.17k.com/"
headers = {
    'User-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36',
}

# Session keeps the login cookies for the authenticated shelf request.
session = requests.session()

# Log in so the shelf endpoint returns this user's books.
session.post("https://passport.17k.com/ck/user/login", data={
    "loginName": "18086822387",
    "password": "admin123"
}, headers=headers)

res = session.get(url="https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919")
res.encoding = "utf8"
data = res.json().get("data")

# Walk every book on the shelf, then every chapter link inside it, and
# print that chapter's paragraphs. The last <p> of a chapter appears to
# be site boilerplate, hence position()<last().
# Fixes vs. original: removed the dead local `each_html`, and stopped
# re-using/shadowing `res`, `selector` and the module-level `url`.
for book in data:
    list_page = requests.get("https://www.17k.com/list/{}.html".format(book.get('bookId')))
    list_page.encoding = 'utf-8'

    chapter_links = etree.HTML(list_page.text).xpath('//dl[@class="Volume"]/dd/a')
    for link in chapter_links:
        each_href = link.xpath("./@href")[0]
        each_title = link.xpath("./span/text()")[0].strip()

        chapter_page = requests.get("https://www.17k.com" + each_href)
        chapter_page.encoding = 'utf-8'
        text = etree.HTML(chapter_page.text).xpath('//div[contains(@class,"content")]/div[@class="p"]/p[position()<last()]/text()')
        print(text)

写入本地的txt:

import requests
from lxml import etree
import os

# Shared request headers: a mobile Chrome UA so the site treats the
# scripted client like an ordinary mobile browser.
headers = {
    'User-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36',
}
# One shared session: login() stores the auth cookies here and every
# later request in this script reuses them.
session = requests.session()

def login():
    """POST the account credentials so the shared `session` holds the auth cookies."""
    url = "https://www.17k.com/"  # kept from the original; not used in this function

    # Hit the login route; the module-level session stores the cookies
    # from the response, which authenticates all later requests.
    payload = {
        "loginName": "18086822387",
        "password": "admin123"
    }
    session.post("https://passport.17k.com/ck/user/login", data=payload, headers=headers)
# Fetch the data returned by the bookshelf API.
def get_books():
    """Return the list of book records from the shelf endpoint (JSON "data" field)."""
    response = session.get(url="https://user.17k.com/ck/author2/shelf?page=1&appKey=2406394919")
    response.encoding = "utf8"
    return response.json().get("data")

# Give each book its own folder, then download its chapters.
def get_each_book(data):
    """For every shelf record in `data`, create 书房/<bookName> and
    download all of that book's chapters into it.

    Fix: use os.makedirs(..., exist_ok=True) instead of an
    exists-check + os.mkdir — it also creates the parent "书房" folder
    when missing and cannot fail on a re-run race.
    """
    for bookDict in data:
        bookId = bookDict.get('bookId')
        bookName = bookDict.get('bookName')
        book_path = os.path.join("书房", bookName)
        os.makedirs(book_path, exist_ok=True)
        get_chapter(bookName, bookId, book_path)

# Crawl one whole book, chapter by chapter.
def get_chapter(bookName, bookId, book_path):
    """Scrape the chapter list of book `bookId`, fetch every chapter's
    text and hand it to download() to be written under `book_path`."""
    res = requests.get("https://www.17k.com/list/{}.html".format(bookId))
    res.encoding = "utf8"
    selector = etree.HTML(res.text)
    # Every chapter link sits under the volume list on the book page.
    chapter_links = selector.xpath('//dl[@class="Volume"]/dd/a')
    for link in chapter_links:
        each_href = link.xpath("./@href")[0]
        each_title = link.xpath("./span/text()")[0].strip()
        chapter_res = requests.get("https://www.17k.com" + each_href)
        chapter_res.encoding = 'utf-8'
        # Last <p> appears to be site boilerplate, so exclude it.
        each_text = etree.HTML(chapter_res.text).xpath('//div[contains(@class,"content")]/div[@class="p"]/p[position()<last()]/text()')
        download(book_path, each_title, each_text)
        # Bug fix: report the chapter title, not the numeric book id.
        print("{}书的{}章节下载完成".format(bookName, each_title))

# Download: write one chapter to its own .txt file
def download(book_path, each_title, each_text):
    """Write one chapter's lines to <book_path>/<each_title>.txt, one per line."""
    file_name = each_title + '.txt'
    target = os.path.join(book_path, file_name)
    with open(target, "w", encoding='utf-8') as out:
        out.writelines(line + "\n" for line in each_text)

# Script entry point: log in, read the shelf, make sure the top-level
# "书房" (bookroom) folder exists, then download every book into it.
login()
data = get_books()
folder_path = "书房"

if not os.path.exists(folder_path):
    os.mkdir(folder_path)

get_each_book(data)
【版权声明】本文内容来自摩杜云社区用户原创、第三方投稿、转载,内容版权归原作者所有。本网站的目的在于传递更多信息,不拥有版权,亦不承担相应法律责任。如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱: cloudbbs@moduyun.com

  1. 分享:
最后一次编辑于 2023年11月08日 0

暂无评论

推荐阅读
  bPGrimWRoKNu   2023年11月02日   76   0   0 GoogleChrome
  1BVmdlLr07sm   2023年11月30日   73   0   0 HTMLcss
  w9E0Skq8oBje   2023年11月02日   90   0   0 解压缩Chrome
pfl0iGEgXbv6
最新推荐 更多