• Huggingface网页解析和下载爬虫


    解析网页:

    import requests
    from bs4 import BeautifulSoup
    
    # Target page: the file listing ("tree/main") of the model repository.
    url = 'https://huggingface.co/internlm/internlm-20b/tree/main'

    # Send the GET request. requests applies no timeout unless one is given,
    # so pass one explicitly to avoid hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)

    # Only parse the body when the server answered 200 OK.
    if response.status_code == 200:
        # Parse the HTML with BeautifulSoup using the built-in html.parser.
        soup = BeautifulSoup(response.text, 'html.parser')

        # Walk every <a> tag and print its href exactly as written in the
        # page (site-relative paths are printed as-is, not resolved).
        for link in soup.find_all('a'):
            href = link.get('href')
            if href:  # skip anchors with a missing or empty href
                print(href)
    else:
        print("网页请求失败,状态码:", response.status_code)
    
    
    运行上面的脚本后,打印出的链接如下:
    /
    /models
    /datasets
    /spaces
    /docs
    /pricing
    /login
    /join
    /internlm
    /internlm/internlm-20b
    /models?pipeline_tag=text-generation
    /models?library=transformers
    /models?library=pytorch
    /models?other=internlm
    /models?other=feature-extraction
    /models?other=custom_code
    /models?license=license%3Aapache-2.0
    /internlm/internlm-20b
    /internlm/internlm-20b/tree/main
    /internlm/internlm-20b/discussions
    /internlm/internlm-20b/tree/main
    /internlm/internlm-20b/commits/main
    /internlm/internlm-20b/commits/main
    /x54-729
    /internlm/internlm-20b/commit/2d83118d863d24565da1f9c6c0fe99d3e882f25c
    /internlm/internlm-20b/blob/main/.gitattributes
    /internlm/internlm-20b/resolve/main/.gitattributes?download=true
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/blob/main/README.md
    /internlm/internlm-20b/resolve/main/README.md?download=true
    /internlm/internlm-20b/commit/509b748b2160d0571d067d85f8a21df018cdee29
    /internlm/internlm-20b/commit/509b748b2160d0571d067d85f8a21df018cdee29
    /internlm/internlm-20b/blob/main/config.json
    /internlm/internlm-20b/resolve/main/config.json?download=true
    /internlm/internlm-20b/commit/2d83118d863d24565da1f9c6c0fe99d3e882f25c
    /internlm/internlm-20b/commit/2d83118d863d24565da1f9c6c0fe99d3e882f25c
    /internlm/internlm-20b/blob/main/configuration_internlm.py
    /internlm/internlm-20b/resolve/main/configuration_internlm.py?download=true
    /internlm/internlm-20b/commit/53d4840ed4326a633e59501ba4ac3342757fed34
    /internlm/internlm-20b/commit/53d4840ed4326a633e59501ba4ac3342757fed34
    /internlm/internlm-20b/blob/main/generation_config.json
    /internlm/internlm-20b/resolve/main/generation_config.json?download=true
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/blob/main/modeling_internlm.py
    /internlm/internlm-20b/resolve/main/modeling_internlm.py?download=true
    /internlm/internlm-20b/commit/c8f2f9979075c3ccd0399d042823ac719d545840
    /internlm/internlm-20b/commit/c8f2f9979075c3ccd0399d042823ac719d545840
    /internlm/internlm-20b/blob/main/pytorch_model-00001-of-00005.bin
    /docs/hub/security-pickle
    /internlm/internlm-20b/resolve/main/pytorch_model-00001-of-00005.bin?download=true
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/blob/main/pytorch_model-00002-of-00005.bin
    /docs/hub/security-pickle
    /internlm/internlm-20b/resolve/main/pytorch_model-00002-of-00005.bin?download=true
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/blob/main/pytorch_model-00003-of-00005.bin
    /docs/hub/security-pickle
    /internlm/internlm-20b/resolve/main/pytorch_model-00003-of-00005.bin?download=true
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/blob/main/pytorch_model-00004-of-00005.bin
    /docs/hub/security-pickle
    /internlm/internlm-20b/resolve/main/pytorch_model-00004-of-00005.bin?download=true
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/blob/main/pytorch_model-00005-of-00005.bin
    /docs/hub/security-pickle
    /internlm/internlm-20b/resolve/main/pytorch_model-00005-of-00005.bin?download=true
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/blob/main/pytorch_model.bin.index.json
    /internlm/internlm-20b/resolve/main/pytorch_model.bin.index.json?download=true
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/blob/main/special_tokens_map.json
    /internlm/internlm-20b/resolve/main/special_tokens_map.json?download=true
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/blob/main/tokenization_internlm.py
    /internlm/internlm-20b/resolve/main/tokenization_internlm.py?download=true
    /internlm/internlm-20b/commit/632df84a18d93aa5b40238a1472a8ffb38e2611c
    /internlm/internlm-20b/commit/632df84a18d93aa5b40238a1472a8ffb38e2611c
    /internlm/internlm-20b/blob/main/tokenizer.model
    /internlm/internlm-20b/resolve/main/tokenizer.model?download=true
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/blob/main/tokenizer_config.json
    /internlm/internlm-20b/resolve/main/tokenizer_config.json?download=true
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    /internlm/internlm-20b/commit/b8825fe3394608fe84f0f5eb6471454384fb83aa
    
    (以上为页面中全部 a 标签的 href;其中带有 /resolve/main/ 且以 ?download=true 结尾的链接即为文件下载地址。)

    下载代码:

    import requests
    from tqdm.auto import tqdm
    
    file_url = 'https://huggingface.co/internlm/internlm-20b/resolve/main/pytorch_model-00001-of-00005.bin?download=true'
    file_path = 'pytorch_model-00001-of-00005.bin'

    # Download in streaming mode so the file is never held in memory at once.
    # The response is used as a context manager so the connection is released
    # even if writing fails; the timeout guards against a stalled connection.
    with requests.get(file_url, stream=True, timeout=30) as response:
        # Check whether the request succeeded.
        if response.status_code == 200:
            # Read the size from this GET response rather than from a separate
            # HEAD request: requests.head() does not follow redirects by
            # default, so if the URL redirects (NOTE(review): "resolve" links
            # appear to redirect to a storage host - confirm) its
            # content-length would not be the file's size.
            total_size = int(response.headers.get('content-length', 0))

            # Open the file in its own `with` so it is closed deterministically;
            # tqdm.wrapattr only manages the progress bar around write().
            with open(file_path, "wb") as raw_file, \
                    tqdm.wrapattr(raw_file, "write", miniters=1,
                                  total=total_size, desc=file_path) as fout:
                for chunk in response.iter_content(chunk_size=4096):
                    fout.write(chunk)
            print("文件下载完成")
        else:
            print("下载失败,状态码:", response.status_code)
    
    
    运行后,tqdm 会在终端显示以文件名为标题的下载进度条;下载成功时打印“文件下载完成”,否则打印失败的状态码。
  • 相关阅读:
    力扣labuladong一刷day11拿下打家劫舍问题共3题
    第5章-宏观业务分析方法-5.4-因子分析
    Unity架构之域重新加载
    梳理RWKV 4,5(Eagle),6(Finch)架构的区别以及个人理解和建议
    java面试题之 int和Integer的区别
    【笑小枫的SpringBoot系列】【十四】SpringBoot发送邮件
    KUKA机器人与Insight 配合
    arudino 知识整理
    如何高效获取电商数据
    Zeet构建多云战略充分发挥云的优势
  • 原文地址:https://blog.csdn.net/weixin_41967600/article/details/134279512