A few years ago I wrote a Python crawler script to download audio from Ximalaya FM. I never backed up the files, and when I reinstalled my operating system a while ago they were all lost. My first thought was to reuse the old code and download everything again, but it failed to run: Ximalaya FM had changed its pages and some of the tags were different. So the code had to be rewritten...
After poking around Ximalaya FM's new pages, I ended up with the code below. For each page of the album it fetches two things: the album page itself (to get the track titles) and the revision/play/album API response (to get the audio src URLs).
import re
import os
import json
import requests
from bs4 import BeautifulSoup
class Ximalaya():
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/64.0.3282.167 Chrome/64.0.3282.167 Safari/537.36'
        }

    def track_html(self):
        # Fetch both the album pages (for track titles) and the
        # revision/play/album API responses (for audio src URLs).
        tracks_info_htmls = []
        tracks_download_html = []
        for n in range(1, 9):
            url = 'https://www.ximalaya.com/yinyue/6233649/p{}/'.format(n)
            # print('The website is ' + url)
            html = requests.get(url, headers=self.headers).text
            tracks_info_htmls.append(html)

            url = 'https://www.ximalaya.com/revision/play/album?albumId=6233649&pageNum={}&sort=-1&pageSize=30'.format(n)
            # print('The website is ' + url)
            html = requests.get(url, headers=self.headers).text
            tracks_download_html.append(html)
        return tracks_info_htmls, tracks_download_html

    def collect_track_info(self, htmls):
        # Parse the album pages and pair each track id with its title.
        track_title_set = []
        track_id_set = []
        track_info_set = []
        for html in htmls:
            soup = BeautifulSoup(html, 'lxml')
            content = soup.find_all('div', attrs={'class': 'text _OO'})
            for track_info in content:
                track_title = track_info.text.strip()
                track_id = re.findall(r'<div class="text _OO"><a href="/yinyue/6233649/(\d*)', str(track_info))
                track_title_set.append(track_title)
                track_id_set.append(track_id)
        track_info_set = list(zip(track_id_set, track_title_set))
        return track_info_set

    def get_track_url(self, htmls):
        # Pull the trackId / src pairs out of the JSON responses with regexes.
        track_id_set = []
        track_url_set = []
        id_set = []
        url_set = []
        for html in htmls:
            track_id = re.findall(r'"trackId":(.\d*)', html)
            track_id_set.append(track_id)
            track_url = re.findall(r'"src":"(.*?)"', html)
            track_url_set.append(track_url)
        # Flatten the per-page lists before zipping ids with urls.
        for i in range(len(track_id_set)):
            for j in range(len(track_id_set[i])):
                id_set.append(track_id_set[i][j])
        for i in range(len(track_url_set)):
            for j in range(len(track_url_set[i])):
                url_set.append(track_url_set[i][j])
        track_url_set = list(zip(id_set, url_set))
        return track_url_set

    def download_track(self, info, url):
        # Pair each title with its audio URL (both lists are in page order),
        # then download every track as an .m4a file.
        extract_info = []
        extract_url = []
        for i in info:
            extract_info.append(i[1])
        for j in url:
            extract_url.append(j[1])
        collection = list(zip(extract_info, extract_url))

        os.chdir('../../Music/ximalaya/')
        for n in range(len(collection)):
            try:
                track = requests.get(collection[n][1]).content
                with open('{}.m4a'.format(collection[n][0]), 'wb') as f:
                    f.write(track)
                print(str(collection[n][0]), '...successed')
            except:
                print(str(collection[n][0]) + '...failed')


if __name__ == "__main__":
    go = Ximalaya()
    info_page, download_page = go.track_html()
    track_info_set = go.collect_track_info(info_page)
    track_url_set = go.get_track_url(download_page)
    go.download_track(track_info_set, track_url_set)
The scrape succeeded!
【维瓦尔第】小提琴协奏曲“四季”之《春》I ...successed
【维瓦尔第】小提琴协奏曲“四季”之《春》II ...successed
【维瓦尔第】小提琴协奏曲“四季”之《春》III ...successed
【柴可夫斯基】钢琴套曲“四季”之《一月》 ...successed
......
《黄河》钢琴协奏曲-黄河愤 ...successed
《黄河》钢琴协奏曲-保卫黄河 ...successed
【贝多芬】《f小调钢琴奏鸣曲"热情"》III ...successed
【比才】《 卡门序曲》 ...successed
【比才】《卡门-"哈巴涅拉舞曲"》) ...successed
【莫扎特】《G大调小夜曲》 ...successed
【贝多芬】第五钢琴协奏曲I ...successed
【贝多芬】第五钢琴协奏曲II ...successed
【贝多芬】第五钢琴协奏曲III ...successed
【莫扎特】A大调钢琴协奏曲II ...successed
【贝多芬】钢琴奏鸣曲"悲怆"I ...successed
【贝多芬】钢琴奏鸣曲"悲怆"II ...successed
【贝多芬】钢琴奏鸣曲"悲怆"III ...successed
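As an aside, the revision/play/album endpoint returns JSON, so the trackId/src pairs could be parsed with json.loads instead of regular expressions. Below is a minimal sketch under that assumption: trackId and src are the same fields the regexes above already rely on, but the nesting path ('data' -> 'tracksAudioPlay') and the helper name get_tracks_json are assumptions, not something guaranteed by the current API.

def get_tracks_json(album_id, page_num, headers):
    # Sketch: parse the revision/play/album response as JSON rather than
    # regex-scraping the raw text. The 'data' -> 'tracksAudioPlay' nesting
    # is an assumption about the payload layout; 'trackId' and 'src' match
    # the fields the regexes above extract.
    url = ('https://www.ximalaya.com/revision/play/album'
           '?albumId={}&pageNum={}&sort=-1&pageSize=30').format(album_id, page_num)
    payload = json.loads(requests.get(url, headers=headers).text)
    tracks = payload.get('data', {}).get('tracksAudioPlay', [])
    return [(t.get('trackId'), t.get('src')) for t in tracks]

If the payload really is shaped like that, this would replace get_track_url and the order-based zipping in download_track, since each (trackId, src) pair comes back already matched.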