• Deploying the Multilingual Code Generation Model CodeGeeX2 Locally


    🏠 Homepage | 💻 GitHub | 🛠 Tools (VS Code, Jetbrains) | 🤗 HF Repo | 📄 Paper

    👋 Join our Discord, Slack, Telegram, WeChat

    BF16/FP16 version: codegeex2-6b

    CodeGeeX2: A More Powerful Multilingual Code Generation Model

    1. CodeGeeX2 Model Overview

    CodeGeeX2 is the second generation of the multilingual code generation model CodeGeeX (KDD'23). It is built on the ChatGLM2 architecture with additional code pre-training; thanks to ChatGLM2's stronger base performance, CodeGeeX2 improves on a range of metrics (+107% over CodeGeeX; with only 6B parameters it surpasses the 15B-parameter StarCoder-15B by nearly 10%). Additional highlights include:

    • Stronger code capabilities: built on the ChatGLM2-6B base language model, CodeGeeX2-6B was further pre-trained on 600B tokens of code. Compared with the first generation, its coding ability improves across the board, with large gains on all six languages of the HumanEval-X benchmark (Python +57%, C++ +71%, Java +54%, JavaScript +83%, Go +56%, Rust +321%). It reaches a 35.9% Pass@1 rate on Python, surpassing the larger StarCoder-15B.
    • Better model properties: inheriting the characteristics of ChatGLM2-6B, CodeGeeX2-6B has better support for Chinese and English input, supports a maximum sequence length of 8192, and runs inference much faster than the first-generation CodeGeeX-13B. After quantization it needs only 6 GB of GPU memory, enabling lightweight local deployment (a minimal loading sketch follows this list).
    • A more complete AI coding assistant: the backend of the CodeGeeX plugin (VS Code, Jetbrains) has been upgraded to support more than 100 programming languages, and adds practical features such as context-aware completion and cross-file completion. Combined with the interactive AI coding assistant Ask CodeGeeX, it supports Chinese and English conversations for a wide range of programming problems, including but not limited to code explanation, code translation, bug fixing, and documentation generation, helping programmers develop more efficiently.
    • A more open license: the CodeGeeX2-6B weights are fully open for academic research; fill in the registration form to apply for commercial use.
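
    To make the "6 GB after quantization" figure concrete, here is a minimal loading sketch. It assumes the BF16/FP16 checkpoint THUDM/codegeex2-6b and the quantize() helper exposed by the model's trust_remote_code implementation (as in the ChatGLM2 family); check the model card before relying on either.

    # Minimal sketch (assumptions: repo id "THUDM/codegeex2-6b"; a quantize() method
    # provided by the model's remote code, ChatGLM2-style). Not from the original post.
    from transformers import AutoTokenizer, AutoModel

    tokenizer = AutoTokenizer.from_pretrained("THUDM/codegeex2-6b", trust_remote_code=True)
    model = AutoModel.from_pretrained("THUDM/codegeex2-6b", trust_remote_code=True)
    model = model.quantize(4).cuda().eval()  # 4-bit quantization, then move to the GPU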

    Download

    CodeGeeX2-6B-int4 · Model Hub (modelscope.cn)
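
    If you prefer to fetch the weights from a script rather than the web page, modelscope's snapshot_download can pull the checkpoint into a local directory. A minimal sketch; the model id "ZhipuAI/codegeex2-6b-int4" and the cache path are assumptions — confirm the id on the ModelScope page above.

    # Sketch: download the INT4 checkpoint locally via modelscope.
    # The model id and cache_dir below are assumed, not taken from the original post.
    from modelscope import snapshot_download

    model_dir = snapshot_download("ZhipuAI/codegeex2-6b-int4", cache_dir="E:/Data")
    print(model_dir)  # pass this path to from_pretrained() in the sections below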

    2. Dependencies

    pip install protobuf cpm_kernels "torch>=2.0" gradio mdtex2html sentencepiece accelerate
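
    Before loading the model, it is worth confirming that PyTorch sees a CUDA device, since the deployment below targets device='cuda'. A quick optional check:

    # Optional sanity check: the code in section 3 assumes a CUDA-capable GPU.
    import torch

    print(torch.__version__)          # expect >= 2.0
    print(torch.cuda.is_available())  # expect True for device='cuda'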

    3. Model Deployment and Usage

    3.1 Generating a bubble sort algorithm in Python

    from modelscope import AutoTokenizer, AutoModel
    # Use raw strings for Windows paths so backslashes are not treated as escapes
    tokenizer = AutoTokenizer.from_pretrained(r"E:\Data\CodeGeeX2-6B-int4", trust_remote_code=True)
    model = AutoModel.from_pretrained(r"E:\Data\CodeGeeX2-6B-int4", trust_remote_code=True, device='cuda')
    model = model.eval()
    # remember adding a language tag for better performance
    prompt = "# language: Python\n# 用python写一个冒泡排序算法,并用中文逐行注释\n"
    # inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    inputs = tokenizer.encode(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # outputs = model.generate(inputs, max_length=256, top_k=1)
    outputs = model.generate(inputs, max_length=256)
    response = tokenizer.decode(outputs[0])
    print(response)

    Model output

    # language: Python
    # 用python写一个冒泡排序算法,并用中文逐行注释
    def bubble_sort(list):
        """
        冒泡排序算法
        :param list: 要排序的列表
        :return: 排序后的列表
        """
        for i in range(len(list) - 1):
            for j in range(len(list) - i - 1):
                if list[j] > list[j + 1]:
                    list[j], list[j + 1] = list[j + 1], list[j]
        return list

    if __name__ == "__main__":
        list = [1, 3, 2, 4, 5, 6, 7, 9, 8]
        print(bubble_sort(list))
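
    For repeated experiments, the encode/generate/decode steps above can be wrapped in a small helper that reuses the already-loaded tokenizer and model. A minimal sketch; the quicksort prompt in the usage line is a hypothetical example, not from the original run.

    # Sketch: reuse the tokenizer/model loaded in section 3.1 for arbitrary prompts.
    def generate_code(prompt: str, max_length: int = 256) -> str:
        inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(inputs, max_length=max_length)
        return tokenizer.decode(outputs[0])

    # Hypothetical usage with a different task:
    print(generate_code("# language: Python\n# 用python写一个快速排序算法\n"))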

    3.2 Text classification with BERT + adversarial training + contrastive learning in Python

    from modelscope import AutoTokenizer, AutoModel
    # Use raw strings for Windows paths so backslashes are not treated as escapes
    tokenizer = AutoTokenizer.from_pretrained(r"E:\Data\CodeGeeX2-6B-int4", trust_remote_code=True)
    model = AutoModel.from_pretrained(r"E:\Data\CodeGeeX2-6B-int4", trust_remote_code=True, device='cuda')
    model = model.eval()
    # remember adding a language tag for better performance
    prompt = "# language: Python\n# 用python写一个用Bert结合对抗训练和对比学习实现SST-2数据集文本分类的代码,并用中文逐行注释\n"
    # inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    inputs = tokenizer.encode(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # outputs = model.generate(inputs, max_length=256, top_k=1)
    outputs = model.generate(inputs, max_length=20000)
    response = tokenizer.decode(outputs[0])
    print(response)

    Model output

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from pytorch_pretrained_bert import BertModel, BertTokenizer
    from torch.utils.data import TensorDataset, DataLoader, RandomSampler
    from torch.utils.tensorboard import SummaryWriter
    from tqdm import tqdm, trange
    import os
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import f1_score, accuracy_score
    from pprint import pprint
    import logging
    import argparse

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s: %(message)s")

    class SST2Dataset:
        def __init__(self, data_dir, tokenizer, max_seq_len, train_mode):
            self.data_dir = data_dir
            self.tokenizer = tokenizer
            self.max_seq_len = max_seq_len
            self.train_mode = train_mode
            self.data_df = self.load_data()
            self.train_df, self.valid_df = self.split_data()
            self.train_inputs, self.train_masks = self.tokenize_data(self.train_df)
            self.valid_inputs, self.valid_masks = self.tokenize_data(self.valid_df)
            self.train_labels = self.train_df["label"].tolist()
            self.valid_labels = self.valid_df["label"].tolist()

        def load_data(self):
            data_df = pd.read_csv(os.path.join(self.data_dir, "train.tsv"), sep="\t")
            return data_df

        def split_data(self):
            data_df = self.data_df
            train_df, valid_df = train_test_split(data_df, test_size=0.2, random_state=42)
            return train_df, valid_df

        def tokenize_data(self, data_df):
            inputs_1 = list(data_df["sentence1"])
            inputs_2 = list(data_df["sentence2"])
            inputs = inputs_1 + inputs_2
            masks = [1] * len(inputs_1) + [0] * len(inputs_2)
            inputs = [self.tokenizer.tokenize(sent)[:self.max_seq_len] for sent in inputs]
            inputs = [self.tokenizer.convert_tokens_to_ids(["[CLS]"] + input) for input in inputs]
            inputs = [input[0 : self.max_seq_len] + [0] * (self.max_seq_len - len(input)) for input in inputs]
            inputs = torch.tensor(inputs)
            masks = torch.tensor(masks)
            return inputs, masks

        def get_data(self, data_type):
            if data_type == "train":
                inputs, masks, labels = self.train_inputs, self.train_masks, self.train_labels
            elif data_df == "valid":
                inputs, masks, labels = self.valid_inputs, self.valid_masks, self.valid_labels
            return inputs, masks, labels

    class BertClassifier(nn.Module):
        def __init__(self, bert_model, out_dim):
            super(BertClassifier, self).__init__()
            self.bert_model = bert_model
            self.out = nn.Linear(768, out_dim)

        def forward(self, inputs, masks):
            _, _, _ = self.bert_model(inputs, masks)
            pooled = outputs[:, 0]
            out = self.out(pooled)
            return out

    def train_epoch(train_data, optimizer, scheduler, writer, epoch, args):
        # 训练模型
        bert_model.train()
        train_loss = 0
        num_train_data = 0
        for batch_idx, train_batch in enumerate(train_data):
            train_batch_inputs, train_batch_masks, train_batch_labels = train_batch
            train_batch_inputs, train_batch_masks, train_batch_labels = (
                train_batch_inputs.to(args.device),
                train_batch_masks.to(args.device),
                train_batch_labels.to(args.device),
            )
            optimizer.zero_grad()
            bert_out = bert_model(train_batch_inputs, train_batch_masks)
            loss = F.cross_entropy(bert_out, train_batch_labels)
            train_loss += loss.item()
            num_train_data += len(train_batch_labels)
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            writer.add_scalar("loss", loss.item(), global_step=num_train_data)
            writer.add_scalar("learning_rate", optimizer.param_groups[0]["lr"], global_step=num_train_data)
            writer.add_scalar("train_loss", train_loss / (batch_idx + 1), global_step=num_train_data)
            writer.add_scalar("train_acc", accuracy_score(train_batch_labels, np.argmax(bert_out.detach().cpu().numpy(), axis=-1)), global_step=num_train_data)

    def eval_epoch(valid_data, writer, epoch, args):
        # 验证模型
        bert_model.eval()
        valid_loss = 0
        num_valid_data = 0
        valid_preds = []
        valid_labels = []
        with torch.no_grad():
            for batch_idx, valid_batch in enumerate(valid_data):
                valid_batch_inputs, valid_batch_masks, valid_batch_labels = valid_batch
                valid_batch_inputs, valid_batch_masks, valid_batch_labels = (
                    valid_batch_inputs.to(args.device),
                    valid_batch_masks.to(args.device),
                    valid_batch_labels.to(args.device),
                )
                bert_out = bert_model(valid_batch_inputs, valid_batch_masks)
                loss = F.cross_entropy(bert_out, valid_batch_labels)
                valid_loss += loss.item()
                num_valid_data += len(valid_batch_labels)
                valid_preds.append(bert_out.detach().cpu().numpy())
                valid_labels.append(valid_batch_labels.detach().cpu().numpy())
        valid_preds = np.concatenate(valid_preds, axis=0)
        valid_labels = np.concatenate(valid_labels, axis=0)
        valid_acc = accuracy_score(valid_labels, np.argmax(valid_preds, axis=-1))
        valid_loss = valid_loss / (batch_idx + 1)
        writer.add_scalar("valid_loss", valid_loss, global_step=epoch + 1)
        writer.add_scalar("valid_acc", valid_acc, global_step=epoch + 1)
        writer.add_scalar("valid_f1", f1_score(valid_labels, np.argmax(valid_preds, axis=-1)), global_step=epoch + 1)

    def train(args):
        # 训练模型
        writer = SummaryWriter(log_dir=os.path.join(args.log_dir, "train"))
        for epoch in trange(args.num_epochs, desc="Epoch"):
            train_epoch(
                train_data=train_data,
                optimizer=optimizer,
                scheduler=scheduler,
                writer=writer,
                epoch=epoch,
                args=args,
            )
            eval_epoch(valid_data=valid_data, writer=writer, epoch=epoch, args=args)
        bert_model.save_pretrained(os.path.join(args.log_dir, "bert_model"))
        writer.close()

    def test_epoch(test_data, writer, epoch, args):
        # 测试模型
        bert_model.eval()
        test_loss = 0
        num_test_data = 0
        test_preds = []
        test_labels = []
        with torch.no_grad():
            for batch_idx, test_batch in enumerate(test_data):
                test_batch_inputs, test_batch_masks, test_batch_labels = test_batch
                test_batch_inputs, test_batch_masks, test_batch_labels = (
                    test_batch_inputs.to(args.device),
                    test_batch_masks.to(args.device),
                    test_batch_labels.to(args.device),
                )
                bert_out = bert_model(test_batch_inputs, test_batch_masks)
                loss = F.cross_entropy(bert_out, test_batch_labels)
                test_loss += loss.item()
                num_test_data += len(test_batch_labels)
                test_preds.append(bert_out.detach().cpu().numpy())
                test_labels.append(test_batch_labels.detach().cpu().numpy())
        test_preds = np.concatenate(test_preds, axis=0)
        test_labels = np.concatenate(test_labels, axis=0)
        test_acc = accuracy_score(test_labels, np.argmax(test_preds, axis=-1))
        test_loss = test_loss / (batch_idx + 1)
        writer.add_scalar("test_loss", test_loss, global_step=epoch + 1)
        writer.add_scalar("test_acc", test_acc, global_step=epoch + 1)
        writer.add_scalar("test_f1", f1_score(test_labels, np.argmax(test_preds, axis=-1)), global_step=epoch + 1)

    def test(args):
        writer = SummaryWriter(log_dir=os.path.join(args.log_dir, "test"))
        for epoch in trange(args.num_epochs, desc="Epoch"):
            test_epoch(test_data=test_data, writer=writer, epoch=epoch, args=args)
        writer.close()

    if __name__ == "__main__":
        parser = argparse.ArgumentParser()
        parser.add_argument("--data_dir", type=str, default="./data")
        parser.add_argument("--log_dir", type=str, default="./logs")
        parser.add_argument("--num_epochs", type=int, default=10)
        parser.add_argument("--train_mode", type=str, default="train")
        parser.add_argument("--max_seq_len", type=int, default=128)
        parser.add_argument("--batch_size", type=int, default=32)
        parser.add_argument("--lr", type=float, default=2e-5)
        parser.add_argument("--num_workers", type=int, default=0)
        parser.add_argument("--seed", type=int, default=42)
        parser.add_argument("--device", type=str, default="cuda")
        args = parser.parse_args()
        pprint(vars(args))
        bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        bert_model = BertModel.from_pretrained("bert-base-uncased")
        bert_model.to(args.device)
        if args.train_mode == "train":
            train_data = SST2Dataset(
                data_dir=args.data_dir,
                tokenizer=bert_tokenizer,
                max_seq_len=args.max_seq_len,
                train_mode=args.train_mode,
            ).get_data(data_type="train")
            train_data = TensorDataset(*train_data)
            train_data = DataLoader(
                train_data,
                batch_size=args.batch_size,
                shuffle=True,
                num_workers=args.num_workers,
            )
            valid_data = SST2Dataset(
                data_dir=args.data_dir,
                tokenizer=bert_tokenizer,
                max_seq_len=args.max_seq_len,
                train_mode=args.train_mode,
            ).get_data(data_type="valid")
            valid_data = TensorDataset(*valid_data)
            valid_data = DataLoader(
                valid_data,
                batch_size=args.batch_size,
                shuffle=False,
                num_workers=args.num_workers,
            )
            test_data = SST2Dataset(
                data_dir=args.data_dir,
                tokenizer=bert_tokenizer,
                max_seq_len=args.max_seq_len,
                train_mode=args.train_mode,
            ).get_data(data_type="test")
            test_data = TensorDataset(*test_data)
            test_data = DataLoader(
                test_data,
                batch_size=args.batch_size,
                shuffle=False,
                num_workers=args.num_workers,
            )
            optimizer = torch.optim.Adam(bert_model.parameters(), lr=args.lr)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer=optimizer, mode="min", factor=0.5, patience=2, verbose=True
            )
            train(args)
            test(args)
        elif args.train_mode == "test":
            test_data = SST2Dataset(
                data_dir=args.data_dir,
                tokenizer=bert_tokenizer,
                max_seq_len=args.max_seq_len,
                train_mode=args.train_mode,
            ).get_data(data_type="test")
            test_data = TensorDataset(*test_data)
            test_data = DataLoader(
                test_data,
                batch_size=args.batch_size,
                shuffle=False,
                num_workers=args.num_workers,
            )
            test(args)

    4. Result Analysis

    4.1 Prompt content

    prompt = "# language: Python\n# 帮忙写一个冒泡排序\n"

    The Chinese text in the prompt is the problem you want the model to implement.
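
    The same pattern applies to other tasks: keep the "# language: ..." tag and replace the task description. A hypothetical variant (not from the original post):

    # Hypothetical prompt: same language tag, different task (binary search).
    prompt = "# language: Python\n# 写一个二分查找函数，输入为有序列表和目标值\n"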

    4.2 Setting the output length

    outputs = model.generate(inputs, max_length=256)

    max_length sets the total length of the returned sequence (prompt plus generated tokens) and can be adjusted to your needs. Note that, per the model description above, CodeGeeX2-6B supports a maximum sequence length of 8192, so values beyond that exceed the model's context window.
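
    If you want to bound only the newly generated part rather than the total sequence length, the Hugging Face-style generate call also accepts max_new_tokens. A minimal sketch reusing the objects loaded in section 3.1:

    # Sketch: limit only the generated continuation, independent of prompt length.
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(inputs, max_new_tokens=256)
    print(tokenizer.decode(outputs[0]))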

  • Original article: https://blog.csdn.net/weixin_43734080/article/details/133776347