The code is as follows:
from bs4 import BeautifulSoup  # HTML parsing / data extraction
import re  # regular expressions
import urllib.request, urllib.error  # build the URL and fetch the page
import xlwt  # write .xls spreadsheets

# regular expressions for each field
findnum = re.compile(r'<div class="num">(\d*)</div> ')
findlink = re.compile(r'<a href="(.*?)"')
findTitle = re.compile(r'<a href=.*>(.*?)</a>')
findplayer = re.compile(r'<span class="data-box"><i class="b-icon play"></i>\n (.*)')
findview = re.compile(r'<span class="data-box"><i class="b-icon view"></i>\n (.*)')
findauthor = re.compile(r'<span class="data-box up-name"><i class="b-icon author"></i>\n (.*)')
findauthorlink = re.compile(r'</span> <a href="(.*?)" target="_blank">')
findgrade = re.compile(r'<div class="pts"><div>(\d*)</div>')
findothertitle = re.compile(r'<span class="title">(.*?)</span>')
findotherlink = re.compile(r'<a class="other-link" href="(.*?)" target="_blank">')
findothergrade = re.compile(r'<strong>(\d*)</strong>')
def main():
    url = "https://www.bilibili.com/v/popular/rank/all"
    datalist = getData(url)
    savepath = "bilibili排行榜top100.xls"
    saveData(datalist, savepath)
    # askurl(url)
def getData(url):
    datalist = []
    html = askurl(url)  # save the fetched page source
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all('li', class_="rank-item"):
        data = []
        item = str(item)
        # rank
        num = re.findall(findnum, item)[0]
        data.append(num)
        # video link
        link = re.findall(findlink, item)[0]
        data.append(link)
        # links of up to four other videos; pad with blanks so every row
        # has the same 20 columns (the original if/elif chain raised an
        # IndexError when exactly two or three links were found)
        otherlink = re.findall(findotherlink, item)
        data.extend((otherlink + [' '] * 4)[:4])
        # play count
        player = re.findall(findplayer, item)[0]
        data.append(player)
        # comment count
        view = re.findall(findview, item)[0]
        data.append(view)
        # uploader
        author = re.findall(findauthor, item)[0]
        data.append(author)
        # uploader link
        authorlink = re.findall(findauthorlink, item)[0]
        data.append(authorlink)
        # overall score
        grade = re.findall(findgrade, item)[0]
        data.append(grade)
        # scores of the other videos, padded to four columns the same way
        othergrade = re.findall(findothergrade, item)
        data.extend((othergrade + [' '] * 4)[:4])
        # title
        Title = re.findall(findTitle, item)[0]
        data.append(Title)
        # titles of the other videos, padded to four columns
        othertitle = re.findall(findothertitle, item)
        data.extend((othertitle + [' '] * 4)[:4])
        datalist.append(data)
    return datalist
def askurl(url):
    head = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36",
        "cookie": "buvid3=86DCEB74-1812-40F1-A402-A78C125699DD155842infoc; LIVE_BUVID=AUTO3815678380423091; rpdid=|(um~JJRl~uu0J'ulY~)J)|~l; _uuid=F1BE07A1-FA93-F3C9-BA4A-DE21B24D0FE618333infoc; CURRENT_FNVAL=80; blackside_state=1; sid=m42m6s9k; fingerprint=8f329827e01c369fcb046dae85f5ab06; buvid_fp=86DCEB74-1812-40F1-A402-A78C125699DD155842infoc; buvid_fp_plain=2BC9F6BD-497A-4DF3-8548-D198377A77AA143077infoc; DedeUserID=1861718315; DedeUserID__ckMd5=6d788333269d864b; SESSDATA=c3ca45a0%2C1636698446%2Cddf66*51; bili_jct=bff9eb63f6c3c34181f39d401f410062; PVID=2"
    }
    request = urllib.request.Request(url, headers=head)
    response = urllib.request.urlopen(request)
    html = response.read().decode("utf-8")
    return html
def saveData(datalist, savepath):
    print("Saving...")
    book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet("bilibilitop100", cell_overwrite_ok=True)
    col = ("Rank", "Video link", "Other video link 1", "Other video link 2",
           "Other video link 3", "Other video link 4", "Play count",
           "Comment count", "Uploader", "Uploader link", "Overall score",
           "Other score 1", "Other score 2", "Other score 3", "Other score 4",
           "Title", "Subtitle 1", "Subtitle 2", "Subtitle 3", "Subtitle 4")
    for i in range(0, 20):
        sheet.write(0, i, col[i])
    # iterate over whatever was actually scraped instead of assuming
    # exactly 100 rows
    for i, data in enumerate(datalist):
        print("Row %d" % (i + 1))
        for j in range(0, 20):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)


if __name__ == "__main__":  # program entry point
    main()
    print("Crawl finished")
Update, July 12:
The code above no longer works, most likely because the regular expressions broke. The page itself is dynamic, so this approach will not stay valid and is not a good choice. Instead, you can use soup.find_all() from bs4 to locate tags and classes directly and extract what you want; a rough sketch follows below.
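As an illustration only, here is a minimal sketch of that find_all() approach, reusing askurl() from above. The class names rank-item, title, and up-name are assumptions carried over from the regexes in the original code and may no longer match the live page:

def getData_bs4(url):
    # hypothetical variant; all class names below are assumptions
    html = askurl(url)
    soup = BeautifulSoup(html, "html.parser")
    datalist = []
    for item in soup.find_all('li', class_="rank-item"):
        title_tag = item.find('a', class_="title")        # assumed class name
        author_tag = item.find('span', class_="up-name")  # assumed class name
        datalist.append([
            title_tag.get_text(strip=True) if title_tag else ' ',
            title_tag.get('href', ' ') if title_tag else ' ',
            author_tag.get_text(strip=True) if author_tag else ' ',
        ])
    return datalist

Working on parsed tags this way is more resilient than running regexes over str(item), because it tolerates attribute reordering and whitespace changes in the markup.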