以第一PPT网站为例,爬取前5页的所有免费PPT模板。
下载链接: https://pan.baidu.com/s/15aI821rEogJXKAw2hoozPA 提取码: ridk
代码如下:
import requests
import urllib.request,urllib.error
from bs4 import BeautifulSoup
import re
def main():
    """Entry point: fetch the template listing and download every free PPT template."""
    base_url = "http://www.1ppt.com/moban/"
    first_page = askurl(base_url)
    # Hand both the base URL and the already-fetched first page to the crawler.
    downlink(base_url, first_page)
    print("下载完成")
def askurl(url):
    """Fetch *url* and return the page HTML as text.

    The site serves GB-encoded pages; decode with "gbk" — a backward-compatible
    superset of gb2312 — so occasional characters outside the gb2312 table do
    not raise UnicodeDecodeError mid-crawl.
    """
    # Headers are optional for this site (no anti-crawler checks), but a
    # realistic User-Agent keeps the requests unremarkable.
    head = {
        "Cookie": "bdshare_firstime=1569077373112; UM_distinctid=17a943c15833c6-0b686423039b4d-d7e1938-e1000-17a943c1584b5e; CNZZDATA5092133=cnzz_eid%3D1945787715-1625979200-null%26ntime%3D1625984600; acw_tc=249ecd1a16259846297303806e92510c7b47991f277b2afcb64dcc1e75",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    # Context manager closes the connection deterministically; the original
    # never closed the response and leaked the socket on every request.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("gbk")
    return html
def downlink(baseurl, html):
    """Walk listing pages 1-5 and download every template found on each.

    Parameters:
        baseurl: listing root URL ("http://www.1ppt.com/moban/").
        html:    HTML of page 1, already fetched by the caller — reused here
                 instead of being discarded and re-fetched (the original
                 ignored this argument and requested page 1 a second time).

    Pages 2+ follow the "ppt_moban_<n>.html" naming pattern; page 1 is the
    bare base URL.
    """
    print("提取下载链接")
    for page in range(1, 6):  # adjust the upper bound to crawl more pages
        if page == 1:
            page_html = html  # reuse the caller's fetch of the first page
        else:
            page_html = askurl(baseurl + "ppt_moban_" + str(page) + ".html")
        onelink(page_html)
    return 0
def onelink(html):
    """Extract every template entry from one listing page and download it.

    For each <li> under the "tplist" <ul>: follow the template's detail page,
    find the download-page link in the "downurllist" list, fetch the download
    page, and pass the final file URL plus the cleaned title to download().
    """
    soup = BeautifulSoup(html, "html.parser")
    tplist = soup.find_all("ul", class_="tplist")[0]
    entries = tplist.find_all('li')  # one <li> per template
    for entry in entries:
        href = entry.find_all('a')[0]["href"]
        # hrefs on the listing page are site-relative paths.
        detail_url = "http://www.1ppt.com/" + href
        try:
            # "#xiazai" anchors to the download section of the detail page.
            detail_html = askurl(detail_url + "#xiazai")
        except Exception:
            # Skip entries that fail to fetch. The original bare
            # `except: pass` fell through and re-parsed the STALE html from
            # the previous iteration, causing duplicate downloads.
            continue
        detail_soup = BeautifulSoup(detail_html, "html.parser")
        down_ul = detail_soup.find_all("ul", class_="downurllist")[0]
        down_li = down_ul.find_all('li')[0]
        a_href = down_li.find("a")["href"]
        # Follow the intermediate download page to reach the real file link.
        down_page = askurl("http://www.1ppt.com/" + a_href)
        down_soup = BeautifulSoup(down_page, "html.parser")
        link_li = down_soup.find("li", class_='c1')
        download_a = link_li.find("a")["href"]
        down_title = down_soup.find("h1").string
        # Strip the "免费下载" ("free download") suffix from the page title.
        down_title = re.sub("免费下载", "", down_title)
        download(download_a, down_title)
    return 0
def download(download_a, down_title):
    """Download the archive at *download_a* and save it as "<down_title>.zip".

    Parameters:
        download_a: direct URL of the .zip archive.
        down_title: template title (already stripped of the "免费下载" suffix),
                    used as the file name.

    Creates the target directory first — the original crashed with
    FileNotFoundError on a fresh checkout where ../download/ppt/ did not exist.
    """
    import os  # local import keeps the file's top-level import block untouched
    savepath = "../download/ppt/"
    os.makedirs(savepath, exist_ok=True)
    response = requests.get(download_a)
    downloadpath = savepath + down_title + ".zip"
    with open(downloadpath, "wb") as f:
        f.write(response.content)
    print("%s下载完成" % down_title)
    return 0
if __name__ == "__main__":  # script entry point
    main()
    print("爬取完毕")
版权属于:DATA
本文链接:https://www.zhlblog.cn/%E5%8E%9F%E5%88%9B/28.html
转载时须注明出处及本声明
OωO