# 调用 getOneBlogDetails() 函数可以获取目标网页的博主姓名, 个人主页网址, 原创文章、粉丝、喜欢、评论数量, 等级、访问量、积分、排名。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Scrape the basic profile information of a single blog.
# For learning purposes only — please do not abuse this simple crawler.
import requests
from bs4 import BeautifulSoup
# Request headers: a desktop-browser User-Agent so the blog host serves the normal page.
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'
}
# Address of the blog page to scrape — fill in before running.
url = ""
#爬取博客基本信息的函数
def getOneBlogDetails(url):
    """Fetch a blog profile page and print the blogger's basic statistics.

    Prints the blogger's name, homepage URL, the original-post / fan / like /
    comment counters, and the grade-box figures (level, visits, points, rank)
    scraped from the page markup.

    Parameters:
        url: Address of the blog profile page to scrape.

    Raises:
        requests.RequestException: if the HTTP request fails or returns an
            error status code.
        AttributeError: if the page layout differs from the expected markup.
    """
    response = requests.get(url, headers=headers, timeout=100)
    # Fail fast on HTTP errors instead of parsing an error page and dying
    # later with a confusing AttributeError.
    response.raise_for_status()
    # BeautifulSoup docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
    soup = BeautifulSoup(response.text, "html.parser")

    # The <a id="uid"> anchor carries both the display name and the homepage link;
    # look it up once instead of twice.
    uid_anchor = soup.find('a', id="uid")
    name = uid_anchor.text
    homepage = uid_anchor.get('href')

    # Labels (原创/粉丝/喜欢/评论) and their counters are parallel lists, e.g.
    #   dts:    [<dt><a href="?t=1">原创</a></dt>, <dt>粉丝</dt>, ...]
    #   counts: [<span class="count">77</span>, <span class="count" id="fan">0</span>, ...]
    info = soup.find('div', class_="data-info d-flex item-tiling")
    labels = info.find_all('dt')
    counts = info.find_all('span', class_='count')
    stats = {label.text: count.text for label, count in zip(labels, counts)}

    # Grade box (等级/访问/积分/排名). Some rows keep the value in the <dd> text;
    # the level row keeps it only in the <a> title attribute (first two chars).
    grade_box = soup.find('div', class_="grade-box clearfix")
    grades = {}
    for row in grade_box.find_all('dl'):
        key = row.find('dt').text
        dd_text = row.find('dd').text.strip()
        if dd_text:
            grades[key] = dd_text
        else:
            grades[key] = row.find('a').get('title')[0:2]

    print("博主姓名: " + name)
    print("个人主页: " + homepage)
    for key, value in stats.items():
        print(key + ": " + value)
    for key, value in grades.items():
        print(key + value)
# Run the scraper only when executed as a script, not when this module is
# imported — importing should never trigger a network request.
if __name__ == "__main__":
    getOneBlogDetails(url)