• [pai-diffusion]pai的easynlp的diffusion模型训练


    PAI-Diffusion模型来了!阿里云机器学习团队带您徜徉中文艺术海洋 - 知乎(作者:汪诚愚、段忠杰、朱祥茹、黄俊)导读:近年来,随着海量多模态数据在互联网的爆炸性增长和训练深度学习大模型的算力大幅提升,AI生成内容(AI Generated Content,AIGC)的应用呈现出爆发性增长趋势。参考链接:https://zhuanlan.zhihu.com/p/590020134 。这里和sd 1.5保持了一样的架构,训练也是通用的diffusers的train_text_to_image.py,只是加载的权重不一样而已。在这里,其实倒没有必要一定使用easynlp中的clip进行训练,实际上可以用transformers中的ChineseCLIP进行训练得到clip模型,但是大部分情况下clip模型是不需要重新训练的,可以基于已有的clip来训练diffusion模型,已经训练好的clip可以直接替换掉sd中的text_encoder权重。其次对于中文diffusion的训练,使用diffusers只需要将tokenizer换成BertTokenizer,模型还是用CLIPModel进行加载即可,也就是说只将clip模块进行替换,即可重新训练中文diffusion模型。

    PAI-Diffusion使用Wukong数据集中的两千万中文图文数据对进行了20天的预训练,随后在多个下游数据集上进行了微调。

    训练:diffusers -> train_text_to_image_lora.py

    diffusers通用

    分析一下 pai-diffusion-general-large-zh 权重:

    feature_extractor、safety_checker不影响训练和推理,加不加都行

    scheduler->scheduler_config.json

    1. {
    2. "_class_name": "DPMSolverMultistepScheduler",
    3. "_diffusers_version": "0.15.0.dev0",
    4. "algorithm_type": "dpmsolver++",
    5. "beta_end": 0.012,
    6. "beta_schedule": "scaled_linear", # beta_schedule:beta的调度方式,scaled_linear:缩放线性调度方式
    7. "beta_start": 0.00085,
    8. "clip_sample": false,
    9. "dynamic_thresholding_ratio": 0.995,
    10. "lower_order_final": true,
    11. "num_train_timesteps": 1000,
    12. "prediction_type": "epsilon",
    13. "sample_max_value": 1.0,
    14. "set_alpha_to_one": false,
    15. "skip_prk_steps": true,
    16. "solver_order": 2,
    17. "solver_type": "midpoint",
    18. "steps_offset": 1,
    19. "thresholding": false,
    20. "trained_betas": null
    21. }

    和正常的sd是没什么区别的。

    1. noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
    2. diffusers.schedulers.scheduling_utils.SchedulerMixin.from_pretrained()
    3. diffusers.schedulers.scheduling_ddpm.DDPMScheduler->ConfigMixin->load_config
    4. DDPMScheduler.from_config->
    5. model = cls(**init_dict) ->参数完成初始化

    text_encoder->config.json

    1. {
    2. "_name_or_path": "models/sdm1.4_with_ChTextEncoder/text_encoder",
    3. "architectures": [
    4. "CLIPTextModel"
    5. ],
    6. "attention_dropout": 0.0,
    7. "bos_token_id": 0,
    8. "dropout": 0.0,
    9. "eos_token_id": 2,
    10. "hidden_act": "quick_gelu", # 激活函数
    11. "hidden_size": 768, # encoder layers和pooler layer的维度
    12. "initializer_factor": 1.0,
    13. "initializer_range": 0.02,
    14. "intermediate_size": 3072, # transformer encoder中feed-forward层的维度
    15. "layer_norm_eps": 1e-05,
    16. "max_position_embeddings": 32, # 模型处理的最大序列长度
    17. "model_type": "clip_text_model",
    18. "num_attention_heads": 12, # 每个encoder layer中attention heads(注意力头)的个数
    19. "num_hidden_layers": 12, # transformer encoder hidden layers的数量
    20. "pad_token_id": 1,
    21. "projection_dim": 512,
    22. "torch_dtype": "float32",
    23. "transformers_version": "4.25.1",
    24. "vocab_size": 21128 # clip文本模型词汇表大小
    25. }

    注意到中文模型的词汇表大小是21128,而英文的sd 1.5词汇表大小是49408。

    1. text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder",...)
    2. transformers.modeling_utils.PreTrainedModel.from_pretrained()
    3. transformers.models.clip.configuration_clip.CLIPTextConfig->
    4. transformers.models.clip.modeling_clip.CLIPTextModel.forward->
    5. transformers.models.clip.modeling_clip.CLIPTextTransformer.forward->

    tokenizer->special_tokens_map.json/tokenizer_config.json/vocab.txt

    1. {
    2. "cls_token": "[CLS]",
    3. "do_basic_tokenize": true,
    4. "do_lower_case": true,
    5. "mask_token": "[MASK]",
    6. "model_max_length": 32,
    7. "name_or_path": "models/release_20230316/512/tokenizer",
    8. "never_split": null,
    9. "pad_token": "[PAD]",
    10. "sep_token": "[SEP]",
    11. "special_tokens_map_file": null,
    12. "strip_accents": null,
    13. "tokenize_chinese_chars": true,
    14. "tokenizer_class": "BertTokenizer",
    15. "unk_token": "[UNK]"
    16. }
    1. tokenizer = BertTokenizer.from_pretrained()->
    2. transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained->
    3. transformers.models.bert.tokenization_bert.BertTokenizer

    unet->config.json

    1. {
    2. "_class_name": "UNet2DConditionModel",
    3. "_diffusers_version": "0.14.0.dev0",
    4. "_name_or_path": "models/20230321_512_openjourney/checkpoint-30000/unet_ema",
    5. "act_fn": "silu",
    6. "attention_head_dim": 8,
    7. "block_out_channels": [
    8. 320,
    9. 640,
    10. 1280,
    11. 1280
    12. ],
    13. "center_input_sample": false,
    14. "class_embed_type": null,
    15. "conv_in_kernel": 3,
    16. "conv_out_kernel": 3,
    17. "cross_attention_dim": 768,
    18. "decay": 0.9999,
    19. "down_block_types": [
    20. "CrossAttnDownBlock2D",
    21. "CrossAttnDownBlock2D",
    22. "CrossAttnDownBlock2D",
    23. "DownBlock2D"
    24. ],
    25. "downsample_padding": 1,
    26. "dual_cross_attention": false,
    27. "flip_sin_to_cos": true,
    28. "freq_shift": 0,
    29. "in_channels": 4,
    30. "inv_gamma": 1.0,
    31. "layers_per_block": 2,
    32. "mid_block_scale_factor": 1,
    33. "mid_block_type": "UNetMidBlock2DCrossAttn",
    34. "min_decay": 0.0,
    35. "norm_eps": 1e-05,
    36. "norm_num_groups": 32,
    37. "num_class_embeds": null,
    38. "only_cross_attention": false,
    39. "optimization_step": 30000,
    40. "out_channels": 4,
    41. "power": 0.6666666666666666,
    42. "projection_class_embeddings_input_dim": null,
    43. "resnet_time_scale_shift": "default",
    44. "sample_size": 64,
    45. "time_cond_proj_dim": null,
    46. "time_embedding_type": "positional",
    47. "timestep_post_act": null,
    48. "up_block_types": [
    49. "UpBlock2D",
    50. "CrossAttnUpBlock2D",
    51. "CrossAttnUpBlock2D",
    52. "CrossAttnUpBlock2D"
    53. ],
    54. "upcast_attention": false,
    55. "update_after_step": 0,
    56. "use_ema_warmup": false,
    57. "use_linear_projection": false
    58. }

    vae->config.json

    1. {
    2. "_class_name": "AutoencoderKL",
    3. "_diffusers_version": "0.14.0.dev0",
    4. "_name_or_path": "models/release_20230316/512/vae",
    5. "act_fn": "silu",
    6. "block_out_channels": [
    7. 128,
    8. 256,
    9. 512,
    10. 512
    11. ],
    12. "down_block_types": [
    13. "DownEncoderBlock2D",
    14. "DownEncoderBlock2D",
    15. "DownEncoderBlock2D",
    16. "DownEncoderBlock2D"
    17. ],
    18. "in_channels": 3,
    19. "latent_channels": 4,
    20. "layers_per_block": 2,
    21. "norm_num_groups": 32,
    22. "out_channels": 3,
    23. "sample_size": 512,
    24. "scaling_factor": 0.18215,
    25. "up_block_types": [
    26. "UpDecoderBlock2D",
    27. "UpDecoderBlock2D",
    28. "UpDecoderBlock2D",
    29. "UpDecoderBlock2D"
    30. ]
    31. }

  • 相关阅读:
    如何确保亚马逊、速卖通等平台测评补单的环境稳定性和安全性?
    卷起来了!熬夜学习阿里P8全彩版并发编程图册,涨薪就在眼前
    45从零开始用Rust编写nginx,静态文件服务器竟然还有这些细节
    基于JavaWeb技术的在线考试系统设计与实现
    数据库系统原理与应用教程(045)—— MySQL 查询(七):聚合函数
    【人工智能基础】人工神经网络
    第三十九基础:JavaScript在自动化测试中的应用
    邸老师的时序分析笔记
    反编译软件库源码附带独立后台教程
    探索前沿:云计算环境下的网络安全策略与技术实践
  • 原文地址:https://blog.csdn.net/u012193416/article/details/133145321