• LangChain支持哔哩哔哩视频总结


    本文的实现基于LangChain框架,所以开始之前请先安装以下依赖:

    1. pip install Langchain
    2. pip install bilibili-api-python

    技术要点:

    1. 使用Langchain框架自带的Document loaders

    2. 修改BiliBiliLoader的源码,自带的并不支持当前b站的视频加载

    源码文件修改:

    import json
    import re
    import warnings
    from typing import List, Tuple

    import requests
    from bilibili_api import sync, video
    from langchain_community.document_loaders.base import BaseLoader
    from langchain_core.documents import Document

    # Pre-compile regular expressions for video ID extraction
    BV_PATTERN = re.compile(r"BV\w+")
    AV_PATTERN = re.compile(r"av[0-9]+")

    # Upper bound (in seconds) on the subtitle download, so that an
    # unresponsive subtitle server cannot block load() indefinitely.
    SUBTITLE_REQUEST_TIMEOUT = 10


    class BiliBiliLoader(BaseLoader):
        """Loader for fetching transcripts from BiliBili videos.

        Every URL produces one ``Document``: ``page_content`` holds the video
        title, description and transcript (or an empty string when no subtitle
        could be fetched), and ``metadata`` holds the video info dictionary
        returned by ``get_info()`` with an extra ``"url"`` entry.
        """

        def __init__(
            self, video_urls: List[str], sessdata: str, bili_jct: str, buvid3: str
        ):
            """Initialize with bilibili url.

            Args:
                video_urls (List[str]): List of BiliBili video URLs.
                sessdata (str): SESSDATA cookie value for authentication.
                bili_jct (str): BILI_JCT cookie value for authentication.
                buvid3 (str): BUVID3 cookie value for authentication.
            """
            self.video_urls = video_urls
            self.credential = video.Credential(
                sessdata=sessdata, bili_jct=bili_jct, buvid3=buvid3
            )

        def load(self) -> List[Document]:
            """Load and return a list of documents containing video transcripts.

            Returns:
                List[Document]: One Document per URL in ``self.video_urls``,
                in the same order, carrying the transcript and metadata.

            Raises:
                ValueError: If a URL contains neither a BV id nor an av id.
            """
            results = []
            for url in self.video_urls:
                transcript, video_info = self._get_bilibili_subs_and_info(url)
                results.append(Document(page_content=transcript, metadata=video_info))
            return results

        def _get_bilibili_subs_and_info(self, url: str) -> Tuple[str, dict]:
            """Retrieve video information and transcript for a given BiliBili URL.

            Args:
                url (str): BiliBili video URL.

            Returns:
                Tuple[str, dict]: A tuple containing the transcript and the video
                information. The transcript is ``""`` (and a warning is issued)
                when the video has no subtitles or the subtitle download does
                not return HTTP 200.

            Raises:
                ValueError: If no BV id or av id can be found in ``url``.
            """
            bvid = BV_PATTERN.search(url)
            if bvid:
                v = video.Video(bvid=bvid.group(), credential=self.credential)
            else:
                aid = AV_PATTERN.search(url)
                if not aid:
                    raise ValueError(f"Unable to find a valid video ID in URL: {url}")
                # Drop the leading "av" so only the numeric id is passed on.
                v = video.Video(aid=int(aid.group()[2:]), credential=self.credential)

            video_info = sync(v.get_info())
            video_info["url"] = url
            sub = sync(v.get_subtitle(video_info["cid"]))

            # Retrieve and process subtitle content
            sub_list = sub["subtitles"]
            if not sub_list:
                warnings.warn(
                    f"No subtitles found for video: {url}. Returning empty transcript."
                )
                return "", video_info

            # Only the first subtitle track is used.
            sub_url = sub_list[0]["subtitle_url"]
            if not sub_url.startswith("http"):
                # The URL may come without a scheme; prepend one.
                sub_url = "https:" + sub_url

            # Always pass a timeout: without one, requests waits forever on a
            # server that accepts the connection but never answers.
            response = requests.get(sub_url, timeout=SUBTITLE_REQUEST_TIMEOUT)
            if response.status_code != 200:
                warnings.warn(
                    f"Failed to fetch subtitles for {url}. "
                    f"HTTP Status Code: {response.status_code}"
                )
                return "", video_info

            raw_sub_titles = json.loads(response.content)["body"]
            raw_transcript = " ".join(c["content"] for c in raw_sub_titles)
            raw_transcript_with_meta_info = (
                f"Video Title: {video_info['title']}, "
                f"description: {video_info['desc']}\n\n"
                f"Transcript: {raw_transcript}"
            )
            return raw_transcript_with_meta_info, video_info

    其中SESSDATA、BUVID3、BILI_JCT三个参数需要登录B站后获取。这三个值都是必填项,获取后即为固定值,不需要刷新,永久有效;但是如果该账户访问次数过多或过于频繁,存在被封禁的可能,目前还不清楚b站那边的封禁策略。

    获取方法:打开b站网页,按F12打开开发者工具,在 应用程序 -> Cookies -> www.bilibili.com 下找到对应的条目并复制其值。

    from langchain_community.document_loaders import BiliBiliLoader

    # Cookie values taken from a logged-in bilibili.com browser session.
    SESSDATA = "***************************************"
    BUVID3 = "**************************************"
    BILI_JCT = "******************************************"

    # Videos whose transcripts should be loaded.
    video_urls = [
        "https://www.bilibili.com/video/BV1PZ421S7VF/?spm_id_from=333.1007.tianma.1-2-2.click",
    ]

    loader = BiliBiliLoader(
        video_urls,
        sessdata=SESSDATA,
        bili_jct=BILI_JCT,
        buvid3=BUVID3,
    )

    # One Document per URL.
    docs = loader.load()
    print(docs)

    源码解析:

    _get_bilibili_subs_and_info:

    它是一个用于检索给定 BiliBili URL 的视频信息和文字记录(字幕)的方法。

    获取到视频信息后,可以找到字幕URL的获取路径,访问该路径可以获取到字幕信息:

    1. sub_list = sub["subtitles"]
    2. if sub_list:
    3. sub_url = sub_list[0]["subtitle_url"]
    4. if not sub_url.startswith("http"):
    5. sub_url = "https:" + sub_url
    6. response = requests.get(sub_url)

    例子:https://aisubtitle.hdslb.com/bfs/ai_subtitle/prod/125040837614317115816310f6f57f99190f192792b6f2d98ac0?auth_key=1708498531-6e1797becb564b90a29714989167da05-0-e9073436bc93efbbb4f87a3b0c3f7b3f

    如果请求字幕接口成功,那么对于所有的返回的语音字幕文字进行处理:

    1. response = requests.get(sub_url)
    2. if response.status_code == 200:
    3. raw_sub_titles = json.loads(response.content)["body"]
    4. raw_transcript = " ".join([c["content"] for c in raw_sub_titles])
    5. raw_transcript_with_meta_info = (
    6. f"Video Title: {video_info['title']}, "
    7. f"description: {video_info['desc']}\n\n"
    8. f"Transcript: {raw_transcript}"
    9. )

    问题:

    1. 部分b站视频不支持语音文字字幕获取,获取时会给出错误提示;从目前的测试情况来看,95%的视频都可以获取到字幕。

    2. 关于字幕的语言:目前测试只能抓取到中文字幕,抓取到的数据目测是b站提供的字幕数据;其他国家的语言不提供原文字幕,一律转为中文。

    3. AI 字幕需要使用登录账号的cookie进行请求,请求频繁或者过多会出现封禁,体现为接口正常返回内容,但是subtitle列表为空(即使实际上有ai字幕),无法获取subtitle_url。建议准备多个账号备用。

    如果你有什么疑问就加入我们的公众号吧,我们能回答你的问题:

  • 相关阅读:
    Triple协议 和dubbo协议
    知识管理系统如何提升企业核心竞争力
    VSCode:使用CMakeLists.txt构建C++项目
    我的sql没问题为什么还是这么慢|MySQL加锁规则
    Linux C++,使用log4cpp记录日志示例详解
    基于PHP+MySQL学生信息管理系统的设计与实现
    网络原理---封装和分用
    物联网实训室解决方案2024
    吴恩达ChatGPT《Finetuning Large Language Models》笔记
    java计算机毕业设计个人博客MyBatis+系统+LW文档+源码+调试部署
  • 原文地址:https://blog.csdn.net/weixin_41227420/article/details/136238039