重寫一段爬蟲腳本

2019-03-17

幾年前用 Python 寫了段爬蟲腳本去抓取 喜馬拉雅 FM 上面的音頻。當時音頻下下來後,沒有備份,前些日子電腦重裝系統,文件全部丟失了。一開始打算復用舊代碼重新下載,運行失敗後發現喜馬拉雅 FM 的頁面做了改動,一些標籤更改了。於是又重寫了代碼……

搗騰了喜馬拉雅 FM 的新頁面後,有了以下代碼。


import re
import os
import json
import requests
from bs4 import BeautifulSoup

class Ximalaya:
    """Scraper for an album on ximalaya.com.

    Fetches the album's listing pages (for track titles) and the revision
    play API (for audio URLs), then downloads each track as an ``.m4a`` file.
    ``album_id`` and ``pages`` default to the original hard-coded album, so
    existing ``Ximalaya()`` callers behave exactly as before.
    """

    # Compiled once; raw strings avoid invalid "\d" escape warnings.
    # (\d+) replaces the original sloppy (.\d*), which relied on "." matching
    # the first digit.
    _TRACK_ID_RE = re.compile(r'"trackId":(\d+)')
    _SRC_RE = re.compile(r'"src":"(.*?)"')

    def __init__(self, album_id='6233649', pages=8):
        self.album_id = str(album_id)
        self.pages = pages
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/64.0.3282.167 Chrome/64.0.3282.167 Safari/537.36'
        }

    def track_html(self):
        """Fetch every album page.

        Returns:
            (info_htmls, download_htmls): two parallel lists of raw response
            text, one entry per page — the HTML listing page and the JSON
            play-API page respectively.
        """
        tracks_info_htmls = []
        tracks_download_html = []
        for n in range(1, self.pages + 1):
            url = 'https://www.ximalaya.com/yinyue/{}/p{}/'.format(self.album_id, n)
            tracks_info_htmls.append(requests.get(url, headers=self.headers).text)

            url = ('https://www.ximalaya.com/revision/play/album'
                   '?albumId={}&pageNum={}&sort=-1&pageSize=30').format(self.album_id, n)
            tracks_download_html.append(requests.get(url, headers=self.headers).text)
        return tracks_info_htmls, tracks_download_html

    def collect_track_info(self, htmls):
        """Extract ``(track_id, title)`` pairs from the listing pages.

        Args:
            htmls: raw HTML pages as returned by :meth:`track_html`.

        Returns:
            List of ``(track_id, title)`` tuples in page order. ``track_id``
            is a string, or ``None`` when no id could be parsed (the original
            appended the whole ``findall`` list here — a bug).
        """
        id_pattern = re.compile(
            r'<div class="text _OO"><a href="/yinyue/{}/(\d*)'.format(self.album_id))
        track_info_set = []
        for html in htmls:
            soup = BeautifulSoup(html, 'lxml')
            for track_info in soup.find_all('div', attrs={'class': 'text _OO'}):
                title = track_info.text.strip()
                ids = id_pattern.findall(str(track_info))
                track_info_set.append((ids[0] if ids else None, title))
        return track_info_set

    def get_track_url(self, htmls):
        """Extract ``(track_id, audio_url)`` pairs from the play-API pages.

        Args:
            htmls: raw JSON response texts as returned by :meth:`track_html`.

        Returns:
            List of ``(track_id, url)`` string tuples, flattened across pages
            in order (ids and urls are matched positionally, as before).
        """
        ids = []
        urls = []
        for html in htmls:
            ids.extend(self._TRACK_ID_RE.findall(html))
            urls.extend(self._SRC_RE.findall(html))
        return list(zip(ids, urls))

    def download_track(self, info, url, out_dir='../../Music/ximalaya/'):
        """Download every track as ``<title>.m4a`` into ``out_dir``.

        Args:
            info: ``(track_id, title)`` pairs from :meth:`collect_track_info`.
            url:  ``(track_id, audio_url)`` pairs from :meth:`get_track_url`.
            out_dir: destination directory; defaults to the original
                hard-coded path. Created if missing (the original ``os.chdir``
                crashed when it did not exist).

        The two lists are matched by position, as the original did.
        """
        titles = [pair[1] for pair in info]
        audio_urls = [pair[1] for pair in url]

        os.makedirs(out_dir, exist_ok=True)

        for title, audio_url in zip(titles, audio_urls):
            path = os.path.join(out_dir, '{}.m4a'.format(title))
            try:
                resp = requests.get(audio_url)
                resp.raise_for_status()  # treat HTTP errors as failures too
                with open(path, 'wb') as f:
                    f.write(resp.content)
                print(title, '...succeeded')
            # Narrowed from a bare except:, which also swallowed
            # KeyboardInterrupt/SystemExit and hid real bugs.
            except (requests.RequestException, OSError) as e:
                print('{}...failed: {}'.format(title, e))

if __name__ == "__main__":

    scraper = Ximalaya()

    # Fetch the album listing pages and the play-API pages in one pass.
    info_pages, download_pages = scraper.track_html()

    # Parse (id, title) pairs and (id, url) pairs from the fetched pages,
    # then download every track.
    titles = scraper.collect_track_info(info_pages)
    audio_urls = scraper.get_track_url(download_pages)
    scraper.download_track(titles, audio_urls)

成功抓取!


【维瓦尔第】小提琴协奏曲“四季”之《春》I ...successed
【维瓦尔第】小提琴协奏曲“四季”之《春》II ...successed
【维瓦尔第】小提琴协奏曲“四季”之《春》III ...successed
【柴可夫斯基】钢琴套曲“四季”之《一月》 ...successed
......
《黄河》钢琴协奏曲-黄河愤 ...successed
《黄河》钢琴协奏曲-保卫黄河 ...successed
【贝多芬】《f小调钢琴奏鸣曲"热情"》III ...successed
【比才】《 卡门序曲》 ...successed
【比才】《卡门-"哈巴涅拉舞曲"》) ...successed
【莫扎特】《G大调小夜曲》 ...successed
【贝多芬】第五钢琴协奏曲I ...successed
【贝多芬】第五钢琴协奏曲II ...successed
【贝多芬】第五钢琴协奏曲III ...successed
【莫扎特】A大调钢琴协奏曲II ...successed
【贝多芬】钢琴奏鸣曲"悲怆"I ...successed
【贝多芬】钢琴奏鸣曲"悲怆"II ...successed
【贝多芬】钢琴奏鸣曲"悲怆"III ...successed