• BaiChuan13B GPTQ Quantization Explained


    Key points:
    1. With much of the code found online, e.g. post-training quantization using the native auto_gptq library, the quantization itself may finish normally, but online inference then fails with errors such as "cannot find the bin file" or "cannot find the tf file", i.e. the model weight file is missing. So the key difference from most code on the web is that the corresponding model weight file needs to be saved in advance: for BaiChuan13B, save it before quantizing the model.
    The code is as follows:

    def save_bin(pretrained_model_dir, quantized_model_dir):
        from transformers import AutoModelForCausalLM
        import torch
        import os
        
        original_model = AutoModelForCausalLM.from_pretrained(
                pretrained_model_dir, 
                trust_remote_code=True,
                torch_dtype=torch.float16,      # without this the saved bin file becomes huge, roughly 50+ GB
            )
        print("Saving bin file...")
        os.makedirs(quantized_model_dir, exist_ok=True)  # make sure the output directory exists
        model_path = os.path.join(quantized_model_dir, "pytorch_model.bin")
        torch.save(original_model.state_dict(), model_path)
        print("Finished saving bin file...")
    

    Quantization code, using the native auto_gptq library:

    def from_authority_autogptq(pretrained_model_dir, quantized_model_dir):
        from transformers import AutoTokenizer, AutoModelForCausalLM
        from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
        import logging
        import torch
        import os
        
        logging.basicConfig(
            format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
        )
    
    
        # Load the tokenizer used for quantization
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_dir, 
            use_fast=False, 
            trust_remote_code=True
        )
        
        examples = [
            tokenizer(
                "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
            )
        ]
        
        # Quantization settings
        quantize_config = BaseQuantizeConfig(
            bits=4,             # quantize the model to 4-bit
            group_size=128,     # 128 is the generally recommended value
            desc_act=False,     # False significantly speeds up inference, at a small cost in perplexity
        )
    
        # load un-quantized model, by default, the model will always be loaded into CPU memory
        quantize_model = AutoGPTQForCausalLM.from_pretrained(
            pretrained_model_dir, 
            quantize_config=quantize_config, 
            trust_remote_code=True,
            device_map="auto",
        )
        
        
        print("开始量化模型.......")
        quantize_model.quantize(examples)
        
        # save model weights
        print("保存量化文件...")
        quantize_model.save_quantized(quantized_model_dir)
        print("保存量化文件完成...")
        
        print("保存tokenizer...")
        tokenizer.save_pretrained(quantized_model_dir)
        print("保存tokenizer完成...")
    

    Following the steps above, the quantized model files are now saved successfully. The next step is online inference with the model:

    def get_baichuan2_autogptq(quantized_model_dir):
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from transformers.generation.utils import GenerationConfig
        import torch
        # Model path
        model_id = quantized_model_dir
        
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, 
            trust_remote_code=True,
            use_fast=False
        )
        
        '''
        warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default).
        This will lead to slow inference or training speed
        '''
        
        print("加载量化model...")
        quantized_model_4bit = AutoModelForCausalLM.from_pretrained(
            # 要载入的模型名称
            model_id, 
            load_in_4bit=True,
            # 仅使用本地模型,不通过网络下载模型
            local_files_only=True,
            # 指定模型精度
            torch_dtype=torch.float16,
            trust_remote_code=True,
            safetensors=True
        )
        
        print("加载config...")
        quantized_model_4bit.generation_config = GenerationConfig.from_pretrained(
            model_id
        )
    
        # Quick test
        print("Generating...")
        messages = []
        messages.append({"role": "user", "content": "Why was Alexander the Great so formidable?"})
        response = quantized_model_4bit.chat(tokenizer, messages)
        print(response)
        return response 
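
    The warning quoted inside the function above comes from bitsandbytes: with a bare load_in_4bit=True flag, the 4-bit layers compute in float32 by default, which slows inference. One way to avoid it, shown here as a hedged sketch rather than something the original post does, is to pass an explicit quantization config:

    from transformers import BitsAndBytesConfig
    import torch

    # Set the 4-bit compute dtype explicitly so Linear4bit computes in fp16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    # Then pass quantization_config=bnb_config to AutoModelForCausalLM.from_pretrained(...)
    # instead of the bare load_in_4bit=True argument.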
    

    Finally, the full integrated code:

    '''The bin file stores the original model weights as loaded; it is not part of the quantization process. Otherwise an error is raised or the model cannot be loaded!!!'''
    def save_bin(pretrained_model_dir, quantized_model_dir):
        from transformers import AutoModelForCausalLM
        import torch
        import os
        
        original_model = AutoModelForCausalLM.from_pretrained(
                pretrained_model_dir, 
                trust_remote_code=True,
                torch_dtype=torch.float16,      # without this the saved bin file becomes huge, roughly 50+ GB
            )
        print("Saving bin file...")
        os.makedirs(quantized_model_dir, exist_ok=True)  # make sure the output directory exists
        model_path = os.path.join(quantized_model_dir, "pytorch_model.bin")
        torch.save(original_model.state_dict(), model_path)
        print("Finished saving bin file...")
    
    
    
    # Native auto_gptq library: quantization uses about 7-10 GB of GPU memory and takes ~23 minutes; inference uses about 18 GB
    def from_authority_autogptq(pretrained_model_dir, quantized_model_dir):
        from transformers import AutoTokenizer, AutoModelForCausalLM
        from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
        import logging
        import torch
        import os
        
        logging.basicConfig(
            format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
        )
    
    
        # Load the tokenizer used for quantization
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_dir, 
            use_fast=False, 
            trust_remote_code=True
        )
        
        examples = [
            tokenizer(
                "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
            )
        ]
        
        # Quantization settings
        quantize_config = BaseQuantizeConfig(
            bits=4,             # quantize the model to 4-bit
            group_size=128,     # 128 is the generally recommended value
            desc_act=False,     # False significantly speeds up inference, at a small cost in perplexity
        )
    
        # load un-quantized model, by default, the model will always be loaded into CPU memory
        quantize_model = AutoGPTQForCausalLM.from_pretrained(
            pretrained_model_dir, 
            quantize_config=quantize_config, 
            trust_remote_code=True,
            device_map="auto",
        )
        
        
        print("开始量化模型.......")
        quantize_model.quantize(examples)
        
        # save model weights
        print("保存量化文件...")
        quantize_model.save_quantized(quantized_model_dir)
        print("保存量化文件完成...")
        
        print("保存tokenizer...")
        tokenizer.save_pretrained(quantized_model_dir)
        print("保存tokenizer完成...")
    
    
    
    # Load the quantized model for inference
    def get_baichuan2_autogptq(quantized_model_dir):
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from transformers.generation.utils import GenerationConfig
        import torch
        # Model path
        model_id = quantized_model_dir
        
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, 
            trust_remote_code=True,
            use_fast=False
        )
        
        '''
        warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default).
        This will lead to slow inference or training speed
        '''
        
        print("加载量化model...")
        quantized_model_4bit = AutoModelForCausalLM.from_pretrained(
            # 要载入的模型名称
            model_id, 
            load_in_4bit=True,
            # 仅使用本地模型,不通过网络下载模型
            local_files_only=True,
            # 指定模型精度
            torch_dtype=torch.float16,
            trust_remote_code=True,
            safetensors=True
        )
        
        print("加载config...")
        quantized_model_4bit.generation_config = GenerationConfig.from_pretrained(
            model_id
        )
    
        # Quick test: an information-extraction prompt
        print("Generating...")
        messages = []
        messages.append({"role": "user", "content": "```Cable tray\n1. Name: equipment-room cable runway (aluminum alloy) 2. Specification: 300mm*100mm 3. Includes fabrication and installation of supports and hangers 4. Other: see the drawings, technical specifications, atlases, bidding documents, bid clarifications, relevant government documents, codes and other materials for details; must meet acceptance requirements```\nPlease read the text above carefully and extract each entity in the entity list. Answer in JSON dictionary format, where each key is an entity name and each value is the content extracted from the text (use 'None' if an entity is absent).\nThe entity list is as follows (target entities separated by ';'): ```Name;Model;Material;Type;Specification;Grounding method```"})
        response = quantized_model_4bit.chat(tokenizer, messages)
        print(response)
        return response 
    
    
    
    
    
    if __name__ == "__main__":
        # Quantize the model with the from_transformers_autogptq approach
        # pretrained_model_dir = "/root/lk/big_model/Baichuan2-13B-Chat"
        # quantized_model_dir = "/root/lk/big_model/baichuan2_autogptq"
        # from_transformers_autogptq(pretrained_model_dir, quantized_model_dir)
        
        import datetime
        print("Program start time ------->>>>>>", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        
        
        # Paths
        pretrained_model_dir = "/root/lk/big_model/Baichuan2-13B-Chat"
        quantized_model_dir = "/root/lk/big_model/baichuan2_autogptq"
        
        # Step 1: save the original model's bin file first, then quantize (crucial)
        # save_bin(pretrained_model_dir, quantized_model_dir)
        
        # Step 2: run quantization with the native auto_gptq package
        # from_authority_autogptq(pretrained_model_dir, quantized_model_dir)
        
        # Step 3: run inference with the quantized model (the corresponding files need to be added)
        get_baichuan2_autogptq(quantized_model_dir)
        
        
        print("Program end time ------->>>>>>", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        
    

    Corresponding package versions:

    auto-gptq==0.6.0
    transformers==4.39.2
    torch==2.0.1
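
    A quick way to confirm the installed versions match (a minimal sketch):

    from importlib.metadata import version
    print(version("auto-gptq"), version("transformers"), version("torch"))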
    
  • Original article: https://blog.csdn.net/weixin_42225889/article/details/137909910