Python Web Scraping in Practice (4): Batch Downloading PPT Templates

Using the 1PPT site (第一PPT, www.1ppt.com) as an example, we scrape the download links for all free PPT templates on the first 5 pages of the template list.
Download link for the collected templates: https://pan.baidu.com/s/15aI821rEogJXKAw2hoozPA  Extraction code: ridk
The code is as follows:

import re
import urllib.request
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def main():
    baseurl = "http://www.1ppt.com/moban/"
    downlink(baseurl)
    print("All downloads finished")




def askurl(url):
    # Request headers are optional here; the site has no anti-scraping
    # measures, but a User-Agent makes the request look like a browser
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    response = urllib.request.urlopen(request)
    html = response.read().decode("gb2312")  # the site serves GB2312-encoded pages
    return html
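
# (Added sketch, not part of the original post.) An equivalent fetch using the
# requests library imported above; requests can detect the page encoding, which
# avoids hard-coding "gb2312" and tolerates stray GBK-only characters:
def askurl_requests(url):
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
    resp.encoding = resp.apparent_encoding  # detected charset (GB2312/GBK here)
    return resp.text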

def downlink(baseurl):
    print("Extracting download links")
    for i in range(1, 6):  # change the range here to scrape more or fewer pages
        if i == 1:
            url = baseurl  # the first list page has no page number in its URL
        else:
            url = baseurl + "ppt_moban_" + str(i) + ".html"
        html = askurl(url)
        onelink(html)

def onelink(html):
    soup = BeautifulSoup(html, "html.parser")
    tplist = soup.find_all("ul", class_="tplist")[0]
    items = tplist.find_all("li")  # one <li> per template

    for item in items:
        a = item.find_all("a")[0]["href"]
        # The hrefs are relative paths, so they look the same on every list
        # page; urljoin resolves them against the site root
        urlone = urljoin("http://www.1ppt.com/", a)
        try:  # skip the occasional template page that fails to load
            urltwo = urlone + "#xiazai"  # jump straight to the download section
            html = askurl(urltwo)
        except Exception:
            continue
        soup_a = BeautifulSoup(html, "html.parser")
        ul = soup_a.find_all("ul", class_="downurllist")[0]
        li = ul.find_all("li")[0]
        a_href = li.find("a")["href"]
        urlthree = urljoin("http://www.1ppt.com/", a_href)
        downhtml = askurl(urlthree)
        down = BeautifulSoup(downhtml, "html.parser")
        down_li = down.find("li", class_="c1")
        download_a = down_li.find("a")["href"]
        down_title = down.find("h1").string
        down_title = re.sub("免费下载", "", down_title)  # strip the "free download" suffix
        download(download_a, down_title)

def download(download_a, down_title):
    savepath = "../download/ppt/"
    response = requests.get(download_a)
    downloadpath = savepath + down_title + ".zip"
    with open(downloadpath, "wb") as f:
        f.write(response.content)
    print("%s downloaded" % down_title)




if __name__ == "__main__":  # program entry point
    main()
    print("Scraping finished")