分享一个我自己写的python B站视频爬虫,写得比较粗糙(大佬们可以帮我添加一些其他爬虫功能吗,如模拟登录,数据分析等;求求啦(>W<))
当然网上一堆B站视频获取的工具,也不差我这个粗糙的python脚本,就是分享出来大家一起讨论学习,如果大家有什么好的想法和功能我们可以一起聊聊。
这里分享一个我自己用的B站视频下载的工具BBDown,很好用,作者也是在一直更新。
必要工具ffmpeg,建议把 ffmpeg 可执行文件直接放在你的python项目目录下(我配置了环境变量但不知道为什么没有生效)
这个如果想爬取高清视频就把自己的cookie加到api_headers。这里进度条加载有点问题,就是视频太小了进度条可能加载不完全,还有就是视频合成也有点问题,有时视频合成不了(我推测可能是特殊字符没有过滤完全,当然也可能是其他原因)
python源码:
- import argparse
- import requests, re, sys, os, time
- from contextlib import closing
- from urllib import parse
- from lxml import etree
- import subprocess
- from tqdm import tqdm
class BiliBili:
    """Scraper/downloader for Bilibili videos.

    Downloads the separate DASH video (.mp4) and audio (.mp3) streams for a
    video and muxes them with ffmpeg. Titles are sanitized before being used
    as filenames so the ffmpeg merge does not choke on special characters.
    """

    # Characters that are illegal in Windows filenames or that broke the
    # previous shell-string ffmpeg command; stripped from every title.
    # NOTE: the original literal used '\/' (non-raw), i.e. a backslash AND
    # a forward slash — both are kept here.
    _BAD_CHARS = '´★☆❤◦\\/:*?"<>|'

    @staticmethod
    def _sanitize(title):
        """Return *title* with filesystem-unsafe characters removed."""
        return ''.join(ch for ch in title if ch not in BiliBili._BAD_CHARS)

    def __init__(self, dirname):
        """
        Parameters:
            dirname: download directory (created on demand by downloads)
        """
        self.search_headers = {
            'authority': 'search.bilibili.com',
            'Accept': '*/*',
            'Referer': 'https://www.bilibili.com/',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61',
        }
        self.video_headers = {
            'authority': 'www.bilibili.com',
            'Referer': 'https://www.bilibili.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/118.0.0.0 Safari/537.36'
        }
        self.api_headers = {
            'authority': 'api.bilibili.com',
            'Accept': '*/*',
            'Referer': 'https://www.bilibili.com/',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            # 'cookie': "",  # add your own cookie here to fetch HD streams
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/119.0.0.0 Safari/537.36'
        }
        self.sess = requests.Session()
        self.dir = dirname

    def downloader(self, data_url, title):
        """
        Stream a video/audio URL to <dir>/<title> with a progress bar.

        Parameters:
            data_url: direct stream URL
            title: target filename (already sanitized by the caller)
        """
        os.makedirs(self.dir, exist_ok=True)  # race-free vs. os.listdir() check
        size = 0
        with closing(self.sess.get(data_url, headers=self.video_headers, stream=True)) as response:
            content_size = int(response.headers['content-length'])
            content_mb = content_size / 1000 / 1000
            if response.status_code == 200:
                sys.stdout.write(' [开始下载]\n')
                sys.stdout.write(' [文件大小]: %0.2f MB\n' % content_mb)
                video_name = os.path.join(self.dir, title)
                # Always stream in chunks. The old "< 3 MB" shortcut wrote the
                # whole body at once but never updated `size`, so the integrity
                # check below reported a bogus "链接异常" for small files and
                # the bar could render incompletely.
                with tqdm(total=content_size, desc=' [下载进度]', leave=False,
                          ncols=100, unit='B', unit_scale=True) as pbar:
                    with open(video_name, 'wb') as file:
                        for data in response.iter_content(chunk_size=1000):
                            file.write(data)
                            pbar.update(len(data))
                            size += len(data)
                sys.stdout.write('\n')
                sys.stdout.write(' [下载完成]' + '\r')
                sys.stdout.flush()
            if size == content_size:
                print('\n')
            else:
                print('~~~链接异常~~~' + '\r')
                time.sleep(1)

    def search_video(self, keyword, page=1):
        """
        Scrape one search-result page.

        Parameters:
            keyword: search keyword
            page: page number (1-based)
        Returns:
            list of [title, url] pairs (top 3 results), titles sanitized
        """
        url = f'https://search.bilibili.com/all?keyword={parse.quote(keyword)}&page={page}&o=30'
        req = self.sess.get(url=url, headers=self.search_headers)
        html = etree.fromstring(req.text, etree.HTMLParser())
        bvs = html.xpath('//div[@class="bili-video-card__info--right"]/a/@href')[:3]
        titles = html.xpath('//div[@class="bili-video-card__info--right"]/a/h3/@title')[:3]
        videos = [[self._sanitize(t), b] for t, b in zip(titles, bvs)]
        # echo titles and urls found on this page
        print(videos)
        return videos

    def get_download_url(self, arcurl):
        """
        Resolve a video page URL (or bare BV id) into stream URLs.

        Parameters:
            arcurl: video page URL or BV id
        Returns:
            [accept_description, video_urls, audio_urls, title]
            where title is already sanitized for filesystem use
        """
        match = re.search(r'BV\d.{9}', arcurl)
        if not match:
            print('视频BV号解析失败,请检查输入的bv号是否正确')
            exit(0)
        bv = match.group(0)
        url = f'https://api.bilibili.com/x/web-interface/view?bvid={bv}'  # avid&cid
        ac_json = self.sess.get(url=url, headers=self.video_headers).json()
        avid = ac_json['data']['aid']
        cid = ac_json['data']['cid']
        # sanitize here so every caller (incl. a_video_download, which used the
        # raw title before) gets a merge-safe filename
        title = self._sanitize(ac_json['data']['title'])
        url2 = f'https://api.bilibili.com/x/player/wbi/playurl?avid={avid}&cid={cid}&fnval=4048'  # playurl
        playinfo_dict = self.sess.get(url=url2, headers=self.api_headers).json()
        accept_description = playinfo_dict["data"]["accept_description"]  # available qualities
        audio_data = [playinfo_dict["data"]["dash"]["audio"][0]["baseUrl"]]
        video_data = [playinfo_dict["data"]["dash"]["video"][0]["baseUrl"]]
        if not audio_data and not video_data:
            print('视频解析失败')
            exit(0)
        return [accept_description, video_data, audio_data, title]

    def merge_data(self, dir, video_name):
        """
        Mux <name>.mp4 + <name>.mp3 into <name>_2.mp4 with ffmpeg, then
        delete the two source streams on success.

        Parameters:
            dir: working directory holding the streams
            video_name: base filename (no extension)
        """
        time.sleep(0.1)
        # the original looked for `name_2` but the actual output is `name_2.mp4`
        if video_name + '_2.mp4' in os.listdir(self.dir):
            print('合成视频已存在')
            exit(0)
        print('视频合成开始:', video_name)
        # Argument list + cwd instead of a `cd {dir} & ffmpeg ...` shell string:
        # titles containing spaces/quotes no longer break the command (the
        # previously reported "merge sometimes fails" bug), and it is portable.
        cmd = ['ffmpeg', '-y', '-i', video_name + '.mp4', '-i', video_name + '.mp3',
               '-c:v', 'copy', '-c:a', 'aac', '-strict', 'experimental',
               '-map', '0:0', '-map', '1:0', video_name + '_2.mp4']
        result = subprocess.run(cmd, cwd=dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode == 0:
            # portable replacement for the Windows-only `del`
            os.remove(os.path.join(dir, video_name + '.mp4'))
            os.remove(os.path.join(dir, video_name + '.mp3'))
        print('视频合成结束:', video_name + '\r')

    def search_downloader(self, keyword, page=1):
        """
        Batch-download every result of the first *page* search pages.

        Parameters:
            keyword: search keyword
            page: number of pages to crawl
        """
        os.makedirs(self.dir, exist_ok=True)
        for j in range(page):
            for title, arcurl in self.search_video(keyword, j + 1):
                if title in os.listdir(self.dir):
                    continue  # already downloaded
                # resolve once — the original hit the API twice per video
                info = self.get_download_url(arcurl)
                videos_data, audio_data = info[1], info[2]
                if not videos_data[0] or not audio_data[0]:
                    print('第[ %d ]页:%s视频或音频解析失败,跳过下载:' % (1 + j, title))
                    continue  # skip this video if either stream failed to parse
                fname = title + '.mp4'
                print('第[ %d ]页:视频[ %s ]下载中:' % (1 + j, fname))
                self.downloader(videos_data[0], fname)
                print('视频下载完成!')
                fname = title + '.mp3'
                print('第[ %d ]页:音频[ %s ]下载中:' % (1 + j, fname))
                self.downloader(audio_data[0], fname)
                print('音频下载完成!')
                try:
                    self.merge_data(self.dir, title)
                except OSError:
                    # ffmpeg binary not found / not executable
                    print('请安装FFmpeg,并配置环境变量 http://ffmpeg.org/')

    def a_video_download(self, bv):
        """
        Download a single video by BV id (or page URL containing one).

        Parameters:
            bv: BV id or video page URL
        """
        video_info = self.get_download_url(bv)
        title = video_info[3]
        fname = "{0}.mp4".format(title)
        print('视频[ %s ]下载中:' % fname)
        self.downloader(video_info[1][0], fname)
        print('视频下载完成!')
        fname = '{0}.mp3'.format(title)
        print('音频[ %s ]下载中:' % fname)
        self.downloader(video_info[2][0], fname)
        print('音频下载完成!')
        self.merge_data(self.dir, title)
if __name__ == '__main__':
    # show --help instead of an argparse error when run with no arguments
    if len(sys.argv) == 1:
        sys.argv.append('--help')
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dir', required=True, help='必要,下载路径')
    parser.add_argument('-bv', '--bvid', required=False, help='下载指定bv视频')
    parser.add_argument('-s', '--search', required=False, action='store_true', help='批量下载搜索页视频')
    parser.add_argument('-k', '--keyword', required=False, help='搜索关键词内容')
    parser.add_argument('-p', '--pages', required=False, help='需要下载页码数', type=int)
    args = parser.parse_args()
    B = BiliBili(args.dir)
    if args.search:
        # The original check (`args.keyword and args.pages is None`) let a
        # missing keyword slip through and crash later; require BOTH a
        # keyword and a page count before searching.
        if not args.keyword or args.pages is None:
            print('请输入搜索关键词和页码')
            exit(0)
        B.search_downloader(args.keyword, args.pages)
    if args.bvid:
        # single-video mode is mutually exclusive with search options
        if args.search or args.keyword or args.pages:
            print('下载单个视频请只输入BV号')
            exit(0)
        B.a_video_download(args.bvid)
复制代码
|