Python Crawler in Practice (2): bilibili Top 100 Ranking

The code is as follows:

from bs4 import BeautifulSoup  # HTML parsing and data extraction
import re   # regular expressions
import urllib.request, urllib.error    # build the request and fetch the page
import xlwt   # write the results to an .xls file

# Regular expressions for each field
findnum = re.compile(r'<div class="num">(\d*)</div> ')
findlink = re.compile(r'<a href="(.*?)"')
findTitle = re.compile(r'<a href=.*>(.*?)</a>')
findplayer = re.compile(r'<span class="data-box"><i class="b-icon play"></i>\n              (.*)')
findview = re.compile(r'<span class="data-box"><i class="b-icon view"></i>\n              (.*)')
findauthor = re.compile(r'<span class="data-box up-name"><i class="b-icon author"></i>\n                (.*)')
findauthorlink = re.compile(r'</span> <a href="(.*?)" target="_blank">')
findgrade = re.compile(r'<div class="pts"><div>(\d*)</div>')
findothertitle = re.compile(r'<span class="title">(.*?)</span>')
findotherlink = re.compile(r'<a class="other-link" href="(.*?)" target="_blank">')
findothergrade = re.compile(r'<strong>(\d*)</strong>')


def main():
    url="https://www.bilibili.com/v/popular/rank/all"
    datalist = getData(url)
    savepath = "bilibili排行榜top100.xls"
    saveData(datalist,savepath)
    # askurl(url)


def getData(url):
    datalist = []
    html = askurl(url)          # fetch and hold the raw page source
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all('li', class_="rank-item"):
        # print(item)
        data = []
        item = str(item)
        # Rank
        num = re.findall(findnum, item)[0]
        # print(num)
        data.append(num)
        # Video link
        link = re.findall(findlink, item)[0]
        data.append(link)
        # Links to other videos by the same uploader (padded to four columns)
        otherlink = re.findall(findotherlink, item)
        # print(otherlink)
        otherlink = (otherlink + [' '] * 4)[:4]   # pad with blanks; also avoids an IndexError when two or three links are found
        data.extend(otherlink)
        # Play count
        player = re.findall(findplayer, item)[0]
        data.append(player)
        # Comment count
        view = re.findall(findview, item)[0]
        data.append(view)
        # Author
        author = re.findall(findauthor, item)[0]
        data.append(author)
        # Author link
        authorlink = re.findall(findauthorlink, item)[0]
        data.append(authorlink)
        # Overall score
        grade = re.findall(findgrade, item)[0]
        data.append(grade)
        # Scores of the other videos (padded to four columns)
        othergrade = re.findall(findothergrade, item)
        # print(othergrade)
        othergrade = (othergrade + [' '] * 4)[:4]
        data.extend(othergrade)
        # Title
        Title = re.findall(findTitle, item)[0]
        data.append(Title)
        # Titles of the other videos (padded to four columns)
        othertitle = re.findall(findothertitle, item)
        othertitle = (othertitle + [' '] * 4)[:4]
        data.extend(othertitle)

        datalist.append(data)
    # print(datalist)

    return datalist

def askurl(url):
    # Browser User-Agent plus the author's cookie so the request passes as a normal logged-in session
    head = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"
        ,"cookie": "buvid3=86DCEB74-1812-40F1-A402-A78C125699DD155842infoc; LIVE_BUVID=AUTO3815678380423091; rpdid=|(um~JJRl~uu0J'ulY~)J)|~l; _uuid=F1BE07A1-FA93-F3C9-BA4A-DE21B24D0FE618333infoc; CURRENT_FNVAL=80; blackside_state=1; sid=m42m6s9k; fingerprint=8f329827e01c369fcb046dae85f5ab06; buvid_fp=86DCEB74-1812-40F1-A402-A78C125699DD155842infoc; buvid_fp_plain=2BC9F6BD-497A-4DF3-8548-D198377A77AA143077infoc; DedeUserID=1861718315; DedeUserID__ckMd5=6d788333269d864b; SESSDATA=c3ca45a0%2C1636698446%2Cddf66*51; bili_jct=bff9eb63f6c3c34181f39d401f410062; PVID=2"
    }
    request = urllib.request.Request(url, headers=head)
    response = urllib.request.urlopen(request)
    html = response.read().decode("utf-8")
    # print(html)
    return html

def saveData(datalist, savepath):
    print("Saving...")
    book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet("bilibilitop100", cell_overwrite_ok=True)   # sheet names should not carry a file extension
    col = ("Rank", "Video link", "Other video link 1", "Other video link 2", "Other video link 3", "Other video link 4", "Play count", "Comment count", "Author", "Author link", "Overall score", "Other video score 1", "Other video score 2", "Other video score 3", "Other video score 4", "Title", "Subtitle 1", "Subtitle 2", "Subtitle 3", "Subtitle 4")
    for i in range(0, 20):
        sheet.write(0, i, col[i])       # header row
    for i in range(0, 100):
        print("Row %d" % (i + 1))
        data = datalist[i]
        for j in range(0, 20):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)



if __name__ == "__main__":   # program entry point
    main()
    print("Scraping finished")

 

Addendum (July 12):

The code above has stopped working, most likely because the regular expressions broke: the page is rendered dynamically, so patterns tied to the exact markup string will not stay valid, and this approach is best avoided. A more robust route is to locate the tags and classes with bs4's soup.find_all() and extract the fields from there, as sketched below.
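
For example, a minimal sketch of that approach; the rank-item and num class names are taken from the code above, while the title class name is an assumption and may differ on the live page:

from bs4 import BeautifulSoup

def getdata_bs4(html):
    # Walk the parsed tree instead of matching raw HTML with regexes
    soup = BeautifulSoup(html, "html.parser")
    datalist = []
    for item in soup.find_all('li', class_="rank-item"):
        data = []
        num = item.find('div', class_="num")            # rank
        data.append(num.get_text(strip=True) if num else ' ')
        link = item.find('a')                           # first link in the entry: the video link
        data.append(link.get('href', ' ') if link else ' ')
        title = item.find('a', class_="title")          # assumed class name for the title link
        data.append(title.get_text(strip=True) if title else ' ')
        datalist.append(data)
    return datalist

Because this works on the parsed tree rather than the exact markup string, small changes in whitespace or attribute order no longer break the extraction; a fully dynamic page would still need the site's API or a headless browser.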
