• stable-diffusion-webui中stability的sdv1.5和sdxl模型结构config对比


    sdv1.5 v1-inference.yaml

    1. model:
    2. base_learning_rate: 1.0e-04
    3. target: ldm.models.diffusion.ddpm.LatentDiffusion
    4. params:
    5. linear_start: 0.00085
    6. linear_end: 0.0120
    7. num_timesteps_cond: 1
    8. log_every_t: 200
    9. timesteps: 1000
    10. first_stage_key: "jpg"
    11. cond_stage_key: "txt"
    12. image_size: 64
    13. channels: 4
    14. cond_stage_trainable: false # Note: different from the one we trained before
    15. conditioning_key: crossattn
    16. monitor: val/loss_simple_ema
    17. scale_factor: 0.18215
    18. use_ema: False
    19. scheduler_config: # 10000 warmup steps
    20. target: ldm.lr_scheduler.LambdaLinearScheduler
    21. params:
    22. warm_up_steps: [ 10000 ]
    23. cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
    24. f_start: [ 1.e-6 ]
    25. f_max: [ 1. ]
    26. f_min: [ 1. ]
    27. unet_config:
    28. target: ldm.modules.diffusionmodules.openaimodel.UNetModel
    29. params:
    30. image_size: 32 # unused
    31. in_channels: 4
    32. out_channels: 4
    33. model_channels: 320
    34. attention_resolutions: [ 4, 2, 1 ]
    35. num_res_blocks: 2
    36. channel_mult: [ 1, 2, 4, 4 ]
    37. num_heads: 8
    38. use_spatial_transformer: True
    39. transformer_depth: 1
    40. context_dim: 768
    41. use_checkpoint: True
    42. legacy: False
    43. first_stage_config:
    44. target: ldm.models.autoencoder.AutoencoderKL
    45. params:
    46. embed_dim: 4
    47. monitor: val/rec_loss
    48. ddconfig:
    49. double_z: true
    50. z_channels: 4
    51. resolution: 256
    52. in_channels: 3
    53. out_ch: 3
    54. ch: 128
    55. ch_mult:
    56. - 1
    57. - 2
    58. - 4
    59. - 4
    60. num_res_blocks: 2
    61. attn_resolutions: []
    62. dropout: 0.0
    63. lossconfig:
    64. target: torch.nn.Identity
    65. cond_stage_config:
    66. target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

    modules/initialize.py

    1. Thread(target=load_model).start()
    2. load_model->shared.sd_model

    modules/shared_items.py

    1. Shared()->
    2. sd_model()->modules.sd_models.model_data.get_sd_model()

    sd_models.py

    1. SdModelData:
    2. get_sd_model()->load_model()
    3. model_data = SdModelData()

    sd_models.py load_model()

    1. load_model(checkpoint_info,already_loaded_state_dict)->
    2. state_dict = get_checkpoint_state_dict(checkpoint_info,..)
    3. - torch.load()
    4. checkpoint_config = sd_models_config.find_checkpoint_config(state_dict,checkpoint_info)
    5. # state_dict 权重已经加载上来了,类似下面这种
    6. 'model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm3.weight': tensor([0.8882, 0.9307, 0.8149, 0.8799, 0.8374, 0.8779, 0.8208, 0.7705, 0.7871,
    7. 0.6953, 0.8354, 0.8594, 0.7881, 0.8018, 0.8442, 0.7744, 0.7969, 0.7715,

    sd_models_config.py 

    1. find_checkpoint_config(state_dict,info)
    2. guess_model_config_from_state_dict(state_dict,info.filename)
    3. - config_default # 根据权重的关键key从开头的config中选出来符合要求的yaml

    sd_models.py load_model()

    1. sd_config = OmegaConf.load(checkpoint_config)
    2. Creating model from config: /root/autodl-tmp/stable-diffusion-webui/configs/v1-inference.yaml
    3. sd_model = instantiate_from_config(sd_config.model)

    简单分析下ldm下的代码:

    models是串起全流程的代码,比如DDPM,modules下的是具体的模块代码

    repositories/stable-diffusion-stability-ai/ldm/util.py 

    1. get_obj_from_str(config["target"])(**config.get("params", dict()))
    2. module:ldm.models.diffusion.ddpm,cls:LatentDiffusion
    3. importlib.import_module(module, package=None)->
    4. <module 'ldm.models.diffusion.ddpm' from '/root/autodl-tmp/stable-diffusion-webui/repositories/stable-diffusion-stability-ai/ldm/models/diffusion/ddpm.py'>

    sd_models.py load_model()

    1. sd_model = instantiate_from_config(sd_config.model)
    2. # sd_model = LatentDiffusion

    repositories/stable-diffusion-stability-ai/ldm/models/diffusion/ddpm.py LatentDiffusion()

    1. self.instantiate_first_stage()
    2. - model = instantiate_from_config(config)
    3. - self.first_stage_model = model.eval()
    4. self.instantiate_cond_stage()
    5. - model = instantiate_from_config(config)
    6. - self.cond_stage_model = model.eval()
    7. self.model = DiffusionWrapper(unet_config,..)
    8. - self.diffusion_model = instantiate_from_config(diff_model_config)

    sd_model.first_stage_model:

    1. AutoencoderKL(
    2. (encoder): Encoder(
    3. (conv_in): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    4. (down): ModuleList(
    5. (0): Module(
    6. (block): ModuleList(
    7. (0-1): 2 x ResnetBlock(
    8. (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
    9. (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    10. (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
    11. (dropout): Dropout(p=0.0, inplace=False)
    12. (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    13. )
    14. )
    15. (attn): ModuleList()
    16. (downsample): Downsample(
    17. (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2))
    18. )
    19. )
    20. (1): Module(
    21. (block): ModuleList(
    22. (0): ResnetBlock(
    23. (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
    24. (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    25. (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
    26. (dropout): Dropout(p=0.0, inplace=False)
    27. (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    28. (nin_shortcut): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1))
    29. )
    30. (1): ResnetBlock(
    31. (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
    32. (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    33. (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
    34. (dropout): Dropout(p=0.0, inplace=False)
    35. (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    36. )
    37. )
    38. (attn): ModuleList()
    39. (downsample): Downsample(
    40. (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2))
    41. )
    42. )
    43. (2): Module(
    44. (block): ModuleList(
    45. (0): ResnetBlock(
    46. (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
    47. (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    48. (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
    49. (dropout): Dropout(p=0.0, inplace=False)
    50. (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    51. (nin_shortcut): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1))
    52. )
    53. (1): ResnetBlock(
    54. (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
    55. (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    56. (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
    57. (dropout): Dropout(p=0.0, inplace=False)
    58. (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    59. )
    60. )
    61. (attn): ModuleList()
    62. (downsample): Downsample(
    63. (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2))
    64. )
    65. )
    66. (3): Module(
    67. (block): ModuleList(
    68. (0-1): 2 x ResnetBlock(
    69. (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
    70. (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    71. (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
    72. (dropout): Dropout(p=0.0, inplace=False)
    73. (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    74. )
    75. )
    76. (attn): ModuleList()
    77. )
    78. )
    79. (mid): Module(
    80. (block_1): ResnetBlock(
    81. (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
    82. (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    83. (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
    84. (dropout): Dropout(p=0.0, inplace=False)
    85. (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    86. )
    87. (attn_1): AttnBlock(
    88. (norm): GroupNorm(32, 512, eps=1e-06, affine=True)
    89. (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
    90. (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
    91. (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
    92. (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
    93. )
    94. (block_2): ResnetBlock(
    95. (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
    96. (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    97. (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
    98. (dropout): Dropout(p=0.0, inplace=False)
    99. (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    100. )
    101. )
    102. (norm_out): GroupNorm(32, 512, eps=1e-06, affine=True)
    103. (conv_out): Conv2d(512, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    104. )
    105. (decoder): Decoder(
    106. (conv_in): Conv2d(4, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    107. (mid): Module(
    108. (block_1): ResnetBlock(
    109. (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
    110. (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    111. (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
    112. (dropout): Dropout(p=0.0, inplace=False)
    113. (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    114. )
    115. (attn_1): AttnBlock(
    116. (norm): GroupNorm(32, 512, eps=1e-06, affine=True)
    117. (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
    118. (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
    119. (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
    120. (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
    121. )
    122. (block_2): ResnetBlock(
    123. (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
    124. (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    125. (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
    126. (dropout): Dropout(p=0.0, inplace=False)
    127. (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    128. )
    129. )
    130. (up): ModuleList(
    131. (0): Module(
    132. (block): ModuleList(
    133. (0): ResnetBlock(
    134. (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
    135. (conv1): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    136. (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
    137. (dropout): Dropout(p=0.0, inplace=False)
    138. (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    139. (nin_shortcut): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
    140. )
    141. (1-2): 2 x ResnetBlock(
    142. (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
    143. (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    144. (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
    145. (dropout): Dropout(p=0.0, inplace=False)
    146. (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    147. )
    148. )
    149. (attn): ModuleList()
    150. )
    151. (1): Module(
    152. (block): ModuleList(
    153. (0): ResnetBlock(
    154. (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
    155. (conv1): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    156. (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
    157. (dropout): Dropout(p=0.0, inplace=False)
    158. (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    159. (nin_shortcut): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    160. )
    161. (1-2): 2 x ResnetBlock(
    162. (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
    163. (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    164. (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
    165. (dropout): Dropout(p=0.0, inplace=False)
    166. (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    167. )
    168. )
    169. (attn): ModuleList()
    170. (upsample): Upsample(
    171. (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    172. )
    173. )
    174. (2-3): 2 x Module(
    175. (block): ModuleList(
    176. (0-2): 3 x ResnetBlock(
    177. (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
    178. (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    179. (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
    180. (dropout): Dropout(p=0.0, inplace=False)
    181. (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    182. )
    183. )
    184. (attn): ModuleList()
    185. (upsample): Upsample(
    186. (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    187. )
    188. )
    189. )
    190. (norm_out): GroupNorm(32, 128, eps=1e-06, affine=True)
    191. (conv_out): Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    192. )
    193. (loss): Identity()
    194. (quant_conv): Conv2d(8, 8, kernel_size=(1, 1), stride=(1, 1))
    195. (post_quant_conv): Conv2d(4, 4, kernel_size=(1, 1), stride=(1, 1))
    196. )

    sd_model.cond_stage_model:

    1. FrozenCLIPEmbedder(
    2. (transformer): CLIPTextModel(
    3. (text_model): CLIPTextTransformer(
    4. (embeddings): CLIPTextEmbeddings(
    5. (token_embedding): Embedding(49408, 768)
    6. (position_embedding): Embedding(77, 768)
    7. )
    8. (encoder): CLIPEncoder(
    9. (layers): ModuleList(
    10. (0-11): 12 x CLIPEncoderLayer(
    11. (self_attn): CLIPAttention(
    12. (k_proj): Linear(in_features=768, out_features=768, bias=True)
    13. (v_proj): Linear(in_features=768, out_features=768, bias=True)
    14. (q_proj): Linear(in_features=768, out_features=768, bias=True)
    15. (out_proj): Linear(in_features=768, out_features=768, bias=True)
    16. )
    17. (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    18. (mlp): CLIPMLP(
    19. (activation_fn): QuickGELUActivation()
    20. (fc1): Linear(in_features=768, out_features=3072, bias=True)
    21. (fc2): Linear(in_features=3072, out_features=768, bias=True)
    22. )
    23. (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    24. )
    25. )
    26. )
    27. (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    28. )
    29. )
    30. )

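    sd_model.model -> diffusionModel(注:下方贴出的结构与上一节 sd_model.cond_stage_model 的 FrozenCLIPEmbedder 完全重复,应为误贴;按前文 ddpm.py 的流程,sd_model.model 实际是 DiffusionWrapper,其 diffusion_model 为 unet_config 指定的 UNetModel)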

    1. FrozenCLIPEmbedder(
    2. (transformer): CLIPTextModel(
    3. (text_model): CLIPTextTransformer(
    4. (embeddings): CLIPTextEmbeddings(
    5. (token_embedding): Embedding(49408, 768)
    6. (position_embedding): Embedding(77, 768)
    7. )
    8. (encoder): CLIPEncoder(
    9. (layers): ModuleList(
    10. (0-11): 12 x CLIPEncoderLayer(
    11. (self_attn): CLIPAttention(
    12. (k_proj): Linear(in_features=768, out_features=768, bias=True)
    13. (v_proj): Linear(in_features=768, out_features=768, bias=True)
    14. (q_proj): Linear(in_features=768, out_features=768, bias=True)
    15. (out_proj): Linear(in_features=768, out_features=768, bias=True)
    16. )
    17. (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    18. (mlp): CLIPMLP(
    19. (activation_fn): QuickGELUActivation()
    20. (fc1): Linear(in_features=768, out_features=3072, bias=True)
    21. (fc2): Linear(in_features=3072, out_features=768, bias=True)
    22. )
    23. (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    24. )
    25. )
    26. )
    27. (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    28. )
    29. )
    30. )

    sd_models.py load_model_weights

    1. load_model_weights(sd_model,checkpoint_info,state_dict,...)->
    2. model.is_sdxl
    3. model.is_sd1
    4. model.is_sd2
    5. model.load_state_dict(state_dict,strict=False)
    6. vae = model.first_stage_model
    7. model.first_stage_model = None
    8. model.half()
    9. model.first_stage_model = vae
    10. sd_vae.load_vae(model,vae_file,vae_source)

    sd_models.py load_model

    1. send_model_to_device(sd_model)
    2. sd_hijack.model_hijack.hijack(sd_model)

    modules/sd_hijack.py

    1. StableDiffusionModelHijack->hijack(,m)-> m=sd_model
    2. type(m.cond_stage_model) == ldm.modules.encoders.modules.FrozenCLIPEmbedder:
    3. model_embeddings = m.cond_stage_model.transformer.text_model.embeddings
    4. model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.token_embedding,self) # 49408,768
    5. m.cond_stage_model = sd_hijack_clip.FrozenCLIPEmbedderWithCustomWords(m.cond_stage_model,self)
    6. apply_weighted_forward(m)
    7. self.apply_optimizations()
    8. self.clip = m.cond_stage_model
    9. self.layers = flatten(m)
    10. ldm.modules.diffusionmodules.openaimodel.copy_of_UNetModel_forward_for_webui = ldm.modules.diffusionmodules.openaimodel.UNetModel.forward
    11. ldm.modules.diffusionmodules.openaimodel.UNetModel.forward = sd_unet.UNetModel_forward

    modules/sd_hijack_clip.py

    1. FrozenCLIPEmbedderWithCustomWords()->
    2. self.tokenizer = wrapped.tokenizer
    3. vocab = self.tokenizer.get_vocab()

    sd_models.py load_model

    1. sd_model.eval()
    2. model_data.set_sd_model(sd_model)
    3. sd_hijack.model_hijack.embedding_db.load_textual_inversion_embeddings(force_reload=True)
    4. script_callbacks.model_loaded_callback(sd_model)
    5. sd_model.cond_stage_model_empty_prompt = get_empty_cond(sd_model)
    6. Model loaded in 3004.5s (calculate hash: 175.0s, load weights from disk: 0.2s, find config: 13.4s, create model: 0.4s, apply weights to model: 667.5s, apply half(): 298.5s, apply dtype to VAE: 15.6s, load VAE: 101.6s, load weights from state dict: 69.7s, move model to device: 21.8s, hijack: 1429.6s, load textual inversion embeddings: 114.8s, scripts callbacks: 53.8s, calculate empty prompt: 42.5s).

    sdxl sd_xl_base.yaml

    1. model:
    2. target: sgm.models.diffusion.DiffusionEngine
    3. params:
    4. scale_factor: 0.13025
    5. disable_first_stage_autocast: True
    6. denoiser_config:
    7. target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    8. params:
    9. num_idx: 1000
    10. weighting_config:
    11. target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
    12. scaling_config:
    13. target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
    14. discretization_config:
    15. target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
    16. network_config:
    17. target: sgm.modules.diffusionmodules.openaimodel.UNetModel
    18. params:
    19. adm_in_channels: 2816
    20. num_classes: sequential
    21. use_checkpoint: True
    22. in_channels: 4
    23. out_channels: 4
    24. model_channels: 320
    25. attention_resolutions: [4, 2]
    26. num_res_blocks: 2
    27. channel_mult: [1, 2, 4]
    28. num_head_channels: 64
    29. use_spatial_transformer: True
    30. use_linear_in_transformer: True
    31. transformer_depth: [1, 2, 10] # note: the first is unused (due to attn_res starting at 2) 32, 16, 8 --> 64, 32, 16
    32. context_dim: 2048
    33. spatial_transformer_attn_type: softmax-xformers
    34. legacy: False
    35. conditioner_config:
    36. target: sgm.modules.GeneralConditioner
    37. params:
    38. emb_models:
    39. # crossattn cond
    40. - is_trainable: False
    41. input_key: txt
    42. target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
    43. params:
    44. layer: hidden
    45. layer_idx: 11
    46. # crossattn and vector cond
    47. - is_trainable: False
    48. input_key: txt
    49. target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
    50. params:
    51. arch: ViT-bigG-14
    52. version: laion2b_s39b_b160k
    53. freeze: True
    54. layer: penultimate
    55. always_return_pooled: True
    56. legacy: False
    57. # vector cond
    58. - is_trainable: False
    59. input_key: original_size_as_tuple
    60. target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
    61. params:
    62. outdim: 256 # multiplied by two
    63. # vector cond
    64. - is_trainable: False
    65. input_key: crop_coords_top_left
    66. target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
    67. params:
    68. outdim: 256 # multiplied by two
    69. # vector cond
    70. - is_trainable: False
    71. input_key: target_size_as_tuple
    72. target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
    73. params:
    74. outdim: 256 # multiplied by two
    75. first_stage_config:
    76. target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
    77. params:
    78. embed_dim: 4
    79. monitor: val/rec_loss
    80. ddconfig:
    81. attn_type: vanilla-xformers
    82. double_z: true
    83. z_channels: 4
    84. resolution: 256
    85. in_channels: 3
    86. out_ch: 3
    87. ch: 128
    88. ch_mult: [1, 2, 4, 4]
    89. num_res_blocks: 2
    90. attn_resolutions: []
    91. dropout: 0.0
    92. lossconfig:
    93. target: torch.nn.Identity

    sd_models.py load_model_weights()

    sd_models_xl.extend_sdxl(model)
    

    sd_models_xl.py

    1. model.model.conditioning_key = "crossattn"
    2. discretization = sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization()
    3. sgm.models.diffusion.DiffusionEngine.get_learned_conditioning = get_learned_conditioning
    4. sgm.models.diffusion.DiffusionEngine.apply_model = apply_model
    5. sgm.models.diffusion.DiffusionEngine.get_first_stage_encoding = get_first_stage_encoding

    generative-models中sgm代码结构和ldm一致,models下面是整体代码流程,modules下是具体的模块代码。

    repositories/generative-models/sgm/models/diffusion.py

    1. model = instantiate_from_config(network_config)
    2. self.model = get_obj_from_str(model)->
    3. self.denoiser = instantiate_from_config(denoiser_config)
    4. self.conditioner = instantiate_from_config(conditioner_config)
    5. self.first_stage_model = instantiate_from_config(first_stage_config).eval()

    model.conditioner

    1. GeneralConditioner(
    2. (embedders): ModuleList(
    3. (0): FrozenCLIPEmbedder(
    4. (transformer): CLIPTextModel(
    5. (text_model): CLIPTextTransformer(
    6. (embeddings): CLIPTextEmbeddings(
    7. (token_embedding): Embedding(49408, 768)
    8. (position_embedding): Embedding(77, 768)
    9. )
    10. (encoder): CLIPEncoder(
    11. (layers): ModuleList(
    12. (0-11): 12 x CLIPEncoderLayer(
    13. (self_attn): CLIPAttention(
    14. (k_proj): Linear(in_features=768, out_features=768, bias=True)
    15. (v_proj): Linear(in_features=768, out_features=768, bias=True)
    16. (q_proj): Linear(in_features=768, out_features=768, bias=True)
    17. (out_proj): Linear(in_features=768, out_features=768, bias=True)
    18. )
    19. (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    20. (mlp): CLIPMLP(
    21. (activation_fn): QuickGELUActivation()
    22. (fc1): Linear(in_features=768, out_features=3072, bias=True)
    23. (fc2): Linear(in_features=3072, out_features=768, bias=True)
    24. )
    25. (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    26. )
    27. )
    28. )
    29. (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    30. )
    31. )
    32. )
    33. (1): FrozenOpenCLIPEmbedder2(
    34. (model): CLIP(
    35. (transformer): Transformer(
    36. (resblocks): ModuleList(
    37. (0-31): 32 x ResidualAttentionBlock(
    38. (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    39. (attn): MultiheadAttention(
    40. (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
    41. )
    42. (ls_1): Identity()
    43. (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    44. (mlp): Sequential(
    45. (c_fc): Linear(in_features=1280, out_features=5120, bias=True)
    46. (gelu): GELUHijack(approximate='none')
    47. (c_proj): Linear(in_features=5120, out_features=1280, bias=True)
    48. )
    49. (ls_2): Identity()
    50. )
    51. )
    52. )
    53. (token_embedding): Embedding(49408, 1280)
    54. (ln_final): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    55. )
    56. )
    57. (2-4): 3 x ConcatTimestepEmbedderND(
    58. (timestep): Timestep()
    59. )
    60. )
    61. (wrapped): Module()
    62. )

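    model.first_stage_model:(注:下方贴出的结构与上一节 model.conditioner 的 GeneralConditioner 完全重复,应为误贴;按 sd_xl_base.yaml 中的 first_stage_config,此处实际应为 AutoencoderKLInferenceWrapper)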

    1. GeneralConditioner(
    2. (embedders): ModuleList(
    3. (0): FrozenCLIPEmbedder(
    4. (transformer): CLIPTextModel(
    5. (text_model): CLIPTextTransformer(
    6. (embeddings): CLIPTextEmbeddings(
    7. (token_embedding): Embedding(49408, 768)
    8. (position_embedding): Embedding(77, 768)
    9. )
    10. (encoder): CLIPEncoder(
    11. (layers): ModuleList(
    12. (0-11): 12 x CLIPEncoderLayer(
    13. (self_attn): CLIPAttention(
    14. (k_proj): Linear(in_features=768, out_features=768, bias=True)
    15. (v_proj): Linear(in_features=768, out_features=768, bias=True)
    16. (q_proj): Linear(in_features=768, out_features=768, bias=True)
    17. (out_proj): Linear(in_features=768, out_features=768, bias=True)
    18. )
    19. (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    20. (mlp): CLIPMLP(
    21. (activation_fn): QuickGELUActivation()
    22. (fc1): Linear(in_features=768, out_features=3072, bias=True)
    23. (fc2): Linear(in_features=3072, out_features=768, bias=True)
    24. )
    25. (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    26. )
    27. )
    28. )
    29. (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    30. )
    31. )
    32. )
    33. (1): FrozenOpenCLIPEmbedder2(
    34. (model): CLIP(
    35. (transformer): Transformer(
    36. (resblocks): ModuleList(
    37. (0-31): 32 x ResidualAttentionBlock(
    38. (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    39. (attn): MultiheadAttention(
    40. (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
    41. )
    42. (ls_1): Identity()
    43. (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    44. (mlp): Sequential(
    45. (c_fc): Linear(in_features=1280, out_features=5120, bias=True)
    46. (gelu): GELUHijack(approximate='none')
    47. (c_proj): Linear(in_features=5120, out_features=1280, bias=True)
    48. )
    49. (ls_2): Identity()
    50. )
    51. )
    52. )
    53. (token_embedding): Embedding(49408, 1280)
    54. (ln_final): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    55. )
    56. )
    57. (2-4): 3 x ConcatTimestepEmbedderND(
    58. (timestep): Timestep()
    59. )
    60. )
    61. (wrapped): Module()
    62. )

    model.model:

    1. OpenAIWrapper(
    2. (diffusion_model): UNetModel(
    3. (time_embed): Sequential(
    4. (0): Linear(in_features=320, out_features=1280, bias=True)
    5. (1): SiLU()
    6. (2): Linear(in_features=1280, out_features=1280, bias=True)
    7. )
    8. (label_emb): Sequential(
    9. (0): Sequential(
    10. (0): Linear(in_features=2816, out_features=1280, bias=True)
    11. (1): SiLU()
    12. (2): Linear(in_features=1280, out_features=1280, bias=True)
    13. )
    14. )
    15. (input_blocks): ModuleList(
    16. (0): TimestepEmbedSequential(
    17. (0): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    18. )
    19. (1-2): 2 x TimestepEmbedSequential(
    20. (0): ResBlock(
    21. (in_layers): Sequential(
    22. (0): GroupNorm32(32, 320, eps=1e-05, affine=True)
    23. (1): SiLU()
    24. (2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    25. )
    26. (h_upd): Identity()
    27. (x_upd): Identity()
    28. (emb_layers): Sequential(
    29. (0): SiLU()
    30. (1): Linear(in_features=1280, out_features=320, bias=True)
    31. )
    32. (out_layers): Sequential(
    33. (0): GroupNorm32(32, 320, eps=1e-05, affine=True)
    34. (1): SiLU()
    35. (2): Dropout(p=0, inplace=False)
    36. (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    37. )
    38. (skip_connection): Identity()
    39. )
    40. )
    41. (3): TimestepEmbedSequential(
    42. (0): Downsample(
    43. (op): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    44. )
    45. )
    46. (4): TimestepEmbedSequential(
    47. (0): ResBlock(
    48. (in_layers): Sequential(
    49. (0): GroupNorm32(32, 320, eps=1e-05, affine=True)
    50. (1): SiLU()
    51. (2): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    52. )
    53. (h_upd): Identity()
    54. (x_upd): Identity()
    55. (emb_layers): Sequential(
    56. (0): SiLU()
    57. (1): Linear(in_features=1280, out_features=640, bias=True)
    58. )
    59. (out_layers): Sequential(
    60. (0): GroupNorm32(32, 640, eps=1e-05, affine=True)
    61. (1): SiLU()
    62. (2): Dropout(p=0, inplace=False)
    63. (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    64. )
    65. (skip_connection): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1))
    66. )
    67. (1): SpatialTransformer(
    68. (norm): GroupNorm(32, 640, eps=1e-06, affine=True)
    69. (proj_in): Linear(in_features=640, out_features=640, bias=True)
    70. (transformer_blocks): ModuleList(
    71. (0-1): 2 x BasicTransformerBlock(
    72. (attn1): CrossAttention(
    73. (to_q): Linear(in_features=640, out_features=640, bias=False)
    74. (to_k): Linear(in_features=640, out_features=640, bias=False)
    75. (to_v): Linear(in_features=640, out_features=640, bias=False)
    76. (to_out): Sequential(
    77. (0): Linear(in_features=640, out_features=640, bias=True)
    78. (1): Dropout(p=0.0, inplace=False)
    79. )
    80. )
    81. (ff): FeedForward(
    82. (net): Sequential(
    83. (0): GEGLU(
    84. (proj): Linear(in_features=640, out_features=5120, bias=True)
    85. )
    86. (1): Dropout(p=0.0, inplace=False)
    87. (2): Linear(in_features=2560, out_features=640, bias=True)
    88. )
    89. )
    90. (attn2): CrossAttention(
    91. (to_q): Linear(in_features=640, out_features=640, bias=False)
    92. (to_k): Linear(in_features=2048, out_features=640, bias=False)
    93. (to_v): Linear(in_features=2048, out_features=640, bias=False)
    94. (to_out): Sequential(
    95. (0): Linear(in_features=640, out_features=640, bias=True)
    96. (1): Dropout(p=0.0, inplace=False)
    97. )
    98. )
    99. (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    100. (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    101. (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    102. )
    103. )
    104. (proj_out): Linear(in_features=640, out_features=640, bias=True)
    105. )
    106. )
    107. (5): TimestepEmbedSequential(
    108. (0): ResBlock(
    109. (in_layers): Sequential(
    110. (0): GroupNorm32(32, 640, eps=1e-05, affine=True)
    111. (1): SiLU()
    112. (2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    113. )
    114. (h_upd): Identity()
    115. (x_upd): Identity()
    116. (emb_layers): Sequential(
    117. (0): SiLU()
    118. (1): Linear(in_features=1280, out_features=640, bias=True)
    119. )
    120. (out_layers): Sequential(
    121. (0): GroupNorm32(32, 640, eps=1e-05, affine=True)
    122. (1): SiLU()
    123. (2): Dropout(p=0, inplace=False)
    124. (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    125. )
    126. (skip_connection): Identity()
    127. )
    128. (1): SpatialTransformer(
    129. (norm): GroupNorm(32, 640, eps=1e-06, affine=True)
    130. (proj_in): Linear(in_features=640, out_features=640, bias=True)
    131. (transformer_blocks): ModuleList(
    132. (0-1): 2 x BasicTransformerBlock(
    133. (attn1): CrossAttention(
    134. (to_q): Linear(in_features=640, out_features=640, bias=False)
    135. (to_k): Linear(in_features=640, out_features=640, bias=False)
    136. (to_v): Linear(in_features=640, out_features=640, bias=False)
    137. (to_out): Sequential(
    138. (0): Linear(in_features=640, out_features=640, bias=True)
    139. (1): Dropout(p=0.0, inplace=False)
    140. )
    141. )
    142. (ff): FeedForward(
    143. (net): Sequential(
    144. (0): GEGLU(
    145. (proj): Linear(in_features=640, out_features=5120, bias=True)
    146. )
    147. (1): Dropout(p=0.0, inplace=False)
    148. (2): Linear(in_features=2560, out_features=640, bias=True)
    149. )
    150. )
    151. (attn2): CrossAttention(
    152. (to_q): Linear(in_features=640, out_features=640, bias=False)
    153. (to_k): Linear(in_features=2048, out_features=640, bias=False)
    154. (to_v): Linear(in_features=2048, out_features=640, bias=False)
    155. (to_out): Sequential(
    156. (0): Linear(in_features=640, out_features=640, bias=True)
    157. (1): Dropout(p=0.0, inplace=False)
    158. )
    159. )
    160. (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    161. (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    162. (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    163. )
    164. )
    165. (proj_out): Linear(in_features=640, out_features=640, bias=True)
    166. )
    167. )
    168. (6): TimestepEmbedSequential(
    169. (0): Downsample(
    170. (op): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    171. )
    172. )
    173. (7): TimestepEmbedSequential(
    174. (0): ResBlock(
    175. (in_layers): Sequential(
    176. (0): GroupNorm32(32, 640, eps=1e-05, affine=True)
    177. (1): SiLU()
    178. (2): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    179. )
    180. (h_upd): Identity()
    181. (x_upd): Identity()
    182. (emb_layers): Sequential(
    183. (0): SiLU()
    184. (1): Linear(in_features=1280, out_features=1280, bias=True)
    185. )
    186. (out_layers): Sequential(
    187. (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
    188. (1): SiLU()
    189. (2): Dropout(p=0, inplace=False)
    190. (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    191. )
    192. (skip_connection): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1))
    193. )
    194. (1): SpatialTransformer(
    195. (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
    196. (proj_in): Linear(in_features=1280, out_features=1280, bias=True)
    197. (transformer_blocks): ModuleList(
    198. (0-9): 10 x BasicTransformerBlock(
    199. (attn1): CrossAttention(
    200. (to_q): Linear(in_features=1280, out_features=1280, bias=False)
    201. (to_k): Linear(in_features=1280, out_features=1280, bias=False)
    202. (to_v): Linear(in_features=1280, out_features=1280, bias=False)
    203. (to_out): Sequential(
    204. (0): Linear(in_features=1280, out_features=1280, bias=True)
    205. (1): Dropout(p=0.0, inplace=False)
    206. )
    207. )
    208. (ff): FeedForward(
    209. (net): Sequential(
    210. (0): GEGLU(
    211. (proj): Linear(in_features=1280, out_features=10240, bias=True)
    212. )
    213. (1): Dropout(p=0.0, inplace=False)
    214. (2): Linear(in_features=5120, out_features=1280, bias=True)
    215. )
    216. )
    217. (attn2): CrossAttention(
    218. (to_q): Linear(in_features=1280, out_features=1280, bias=False)
    219. (to_k): Linear(in_features=2048, out_features=1280, bias=False)
    220. (to_v): Linear(in_features=2048, out_features=1280, bias=False)
    221. (to_out): Sequential(
    222. (0): Linear(in_features=1280, out_features=1280, bias=True)
    223. (1): Dropout(p=0.0, inplace=False)
    224. )
    225. )
    226. (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    227. (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    228. (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    229. )
    230. )
    231. (proj_out): Linear(in_features=1280, out_features=1280, bias=True)
    232. )
    233. )
    234. (8): TimestepEmbedSequential(
    235. (0): ResBlock(
    236. (in_layers): Sequential(
    237. (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
    238. (1): SiLU()
    239. (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    240. )
    241. (h_upd): Identity()
    242. (x_upd): Identity()
    243. (emb_layers): Sequential(
    244. (0): SiLU()
    245. (1): Linear(in_features=1280, out_features=1280, bias=True)
    246. )
    247. (out_layers): Sequential(
    248. (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
    249. (1): SiLU()
    250. (2): Dropout(p=0, inplace=False)
    251. (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    252. )
    253. (skip_connection): Identity()
    254. )
    255. (1): SpatialTransformer(
    256. (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
    257. (proj_in): Linear(in_features=1280, out_features=1280, bias=True)
    258. (transformer_blocks): ModuleList(
    259. (0-9): 10 x BasicTransformerBlock(
    260. (attn1): CrossAttention(
    261. (to_q): Linear(in_features=1280, out_features=1280, bias=False)
    262. (to_k): Linear(in_features=1280, out_features=1280, bias=False)
    263. (to_v): Linear(in_features=1280, out_features=1280, bias=False)
    264. (to_out): Sequential(
    265. (0): Linear(in_features=1280, out_features=1280, bias=True)
    266. (1): Dropout(p=0.0, inplace=False)
    267. )
    268. )
    269. (ff): FeedForward(
    270. (net): Sequential(
    271. (0): GEGLU(
    272. (proj): Linear(in_features=1280, out_features=10240, bias=True)
    273. )
    274. (1): Dropout(p=0.0, inplace=False)
    275. (2): Linear(in_features=5120, out_features=1280, bias=True)
    276. )
    277. )
    278. (attn2): CrossAttention(
    279. (to_q): Linear(in_features=1280, out_features=1280, bias=False)
    280. (to_k): Linear(in_features=2048, out_features=1280, bias=False)
    281. (to_v): Linear(in_features=2048, out_features=1280, bias=False)
    282. (to_out): Sequential(
    283. (0): Linear(in_features=1280, out_features=1280, bias=True)
    284. (1): Dropout(p=0.0, inplace=False)
    285. )
    286. )
    287. (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    288. (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    289. (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    290. )
    291. )
    292. (proj_out): Linear(in_features=1280, out_features=1280, bias=True)
    293. )
    294. )
    295. )
    296. (middle_block): TimestepEmbedSequential(
    297. (0): ResBlock(
    298. (in_layers): Sequential(
    299. (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
    300. (1): SiLU()
    301. (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    302. )
    303. (h_upd): Identity()
    304. (x_upd): Identity()
    305. (emb_layers): Sequential(
    306. (0): SiLU()
    307. (1): Linear(in_features=1280, out_features=1280, bias=True)
    308. )
    309. (out_layers): Sequential(
    310. (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
    311. (1): SiLU()
    312. (2): Dropout(p=0, inplace=False)
    313. (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    314. )
    315. (skip_connection): Identity()
    316. )
    317. (1): SpatialTransformer(
    318. (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
    319. (proj_in): Linear(in_features=1280, out_features=1280, bias=True)
    320. (transformer_blocks): ModuleList(
    321. (0-9): 10 x BasicTransformerBlock(
    322. (attn1): CrossAttention(
    323. (to_q): Linear(in_features=1280, out_features=1280, bias=False)
    324. (to_k): Linear(in_features=1280, out_features=1280, bias=False)
    325. (to_v): Linear(in_features=1280, out_features=1280, bias=False)
    326. (to_out): Sequential(
    327. (0): Linear(in_features=1280, out_features=1280, bias=True)
    328. (1): Dropout(p=0.0, inplace=False)
    329. )
    330. )
    331. (ff): FeedForward(
    332. (net): Sequential(
    333. (0): GEGLU(
    334. (proj): Linear(in_features=1280, out_features=10240, bias=True)
    335. )
    336. (1): Dropout(p=0.0, inplace=False)
    337. (2): Linear(in_features=5120, out_features=1280, bias=True)
    338. )
    339. )
    340. (attn2): CrossAttention(
    341. (to_q): Linear(in_features=1280, out_features=1280, bias=False)
    342. (to_k): Linear(in_features=2048, out_features=1280, bias=False)
    343. (to_v): Linear(in_features=2048, out_features=1280, bias=False)
    344. (to_out): Sequential(
    345. (0): Linear(in_features=1280, out_features=1280, bias=True)
    346. (1): Dropout(p=0.0, inplace=False)
    347. )
    348. )
    349. (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    350. (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    351. (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    352. )
    353. )
    354. (proj_out): Linear(in_features=1280, out_features=1280, bias=True)
    355. )
    356. (2): ResBlock(
    357. (in_layers): Sequential(
    358. (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
    359. (1): SiLU()
    360. (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    361. )
    362. (h_upd): Identity()
    363. (x_upd): Identity()
    364. (emb_layers): Sequential(
    365. (0): SiLU()
    366. (1): Linear(in_features=1280, out_features=1280, bias=True)
    367. )
    368. (out_layers): Sequential(
    369. (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
    370. (1): SiLU()
    371. (2): Dropout(p=0, inplace=False)
    372. (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    373. )
    374. (skip_connection): Identity()
    375. )
    376. )
    377. (output_blocks): ModuleList(
    378. (0-1): 2 x TimestepEmbedSequential(
    379. (0): ResBlock(
    380. (in_layers): Sequential(
    381. (0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
    382. (1): SiLU()
    383. (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    384. )
    385. (h_upd): Identity()
    386. (x_upd): Identity()
    387. (emb_layers): Sequential(
    388. (0): SiLU()
    389. (1): Linear(in_features=1280, out_features=1280, bias=True)
    390. )
    391. (out_layers): Sequential(
    392. (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
    393. (1): SiLU()
    394. (2): Dropout(p=0, inplace=False)
    395. (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    396. )
    397. (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
    398. )
    399. (1): SpatialTransformer(
    400. (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
    401. (proj_in): Linear(in_features=1280, out_features=1280, bias=True)
    402. (transformer_blocks): ModuleList(
    403. (0-9): 10 x BasicTransformerBlock(
    404. (attn1): CrossAttention(
    405. (to_q): Linear(in_features=1280, out_features=1280, bias=False)
    406. (to_k): Linear(in_features=1280, out_features=1280, bias=False)
    407. (to_v): Linear(in_features=1280, out_features=1280, bias=False)
    408. (to_out): Sequential(
    409. (0): Linear(in_features=1280, out_features=1280, bias=True)
    410. (1): Dropout(p=0.0, inplace=False)
    411. )
    412. )
    413. (ff): FeedForward(
    414. (net): Sequential(
    415. (0): GEGLU(
    416. (proj): Linear(in_features=1280, out_features=10240, bias=True)
    417. )
    418. (1): Dropout(p=0.0, inplace=False)
    419. (2): Linear(in_features=5120, out_features=1280, bias=True)
    420. )
    421. )
    422. (attn2): CrossAttention(
    423. (to_q): Linear(in_features=1280, out_features=1280, bias=False)
    424. (to_k): Linear(in_features=2048, out_features=1280, bias=False)
    425. (to_v): Linear(in_features=2048, out_features=1280, bias=False)
    426. (to_out): Sequential(
    427. (0): Linear(in_features=1280, out_features=1280, bias=True)
    428. (1): Dropout(p=0.0, inplace=False)
    429. )
    430. )
    431. (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    432. (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    433. (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    434. )
    435. )
    436. (proj_out): Linear(in_features=1280, out_features=1280, bias=True)
    437. )
    438. )
    439. (2): TimestepEmbedSequential(
    440. (0): ResBlock(
    441. (in_layers): Sequential(
    442. (0): GroupNorm32(32, 1920, eps=1e-05, affine=True)
    443. (1): SiLU()
    444. (2): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    445. )
    446. (h_upd): Identity()
    447. (x_upd): Identity()
    448. (emb_layers): Sequential(
    449. (0): SiLU()
    450. (1): Linear(in_features=1280, out_features=1280, bias=True)
    451. )
    452. (out_layers): Sequential(
    453. (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
    454. (1): SiLU()
    455. (2): Dropout(p=0, inplace=False)
    456. (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    457. )
    458. (skip_connection): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1))
    459. )
    460. (1): SpatialTransformer(
    461. (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
    462. (proj_in): Linear(in_features=1280, out_features=1280, bias=True)
    463. (transformer_blocks): ModuleList(
    464. (0-9): 10 x BasicTransformerBlock(
    465. (attn1): CrossAttention(
    466. (to_q): Linear(in_features=1280, out_features=1280, bias=False)
    467. (to_k): Linear(in_features=1280, out_features=1280, bias=False)
    468. (to_v): Linear(in_features=1280, out_features=1280, bias=False)
    469. (to_out): Sequential(
    470. (0): Linear(in_features=1280, out_features=1280, bias=True)
    471. (1): Dropout(p=0.0, inplace=False)
    472. )
    473. )
    474. (ff): FeedForward(
    475. (net): Sequential(
    476. (0): GEGLU(
    477. (proj): Linear(in_features=1280, out_features=10240, bias=True)
    478. )
    479. (1): Dropout(p=0.0, inplace=False)
    480. (2): Linear(in_features=5120, out_features=1280, bias=True)
    481. )
    482. )
    483. (attn2): CrossAttention(
    484. (to_q): Linear(in_features=1280, out_features=1280, bias=False)
    485. (to_k): Linear(in_features=2048, out_features=1280, bias=False)
    486. (to_v): Linear(in_features=2048, out_features=1280, bias=False)
    487. (to_out): Sequential(
    488. (0): Linear(in_features=1280, out_features=1280, bias=True)
    489. (1): Dropout(p=0.0, inplace=False)
    490. )
    491. )
    492. (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    493. (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    494. (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    495. )
    496. )
    497. (proj_out): Linear(in_features=1280, out_features=1280, bias=True)
    498. )
    499. (2): Upsample(
    500. (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    501. )
    502. )
    503. (3): TimestepEmbedSequential(
    504. (0): ResBlock(
    505. (in_layers): Sequential(
    506. (0): GroupNorm32(32, 1920, eps=1e-05, affine=True)
    507. (1): SiLU()
    508. (2): Conv2d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    509. )
    510. (h_upd): Identity()
    511. (x_upd): Identity()
    512. (emb_layers): Sequential(
    513. (0): SiLU()
    514. (1): Linear(in_features=1280, out_features=640, bias=True)
    515. )
    516. (out_layers): Sequential(
    517. (0): GroupNorm32(32, 640, eps=1e-05, affine=True)
    518. (1): SiLU()
    519. (2): Dropout(p=0, inplace=False)
    520. (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    521. )
    522. (skip_connection): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1))
    523. )
    524. (1): SpatialTransformer(
    525. (norm): GroupNorm(32, 640, eps=1e-06, affine=True)
    526. (proj_in): Linear(in_features=640, out_features=640, bias=True)
    527. (transformer_blocks): ModuleList(
    528. (0-1): 2 x BasicTransformerBlock(
    529. (attn1): CrossAttention(
    530. (to_q): Linear(in_features=640, out_features=640, bias=False)
    531. (to_k): Linear(in_features=640, out_features=640, bias=False)
    532. (to_v): Linear(in_features=640, out_features=640, bias=False)
    533. (to_out): Sequential(
    534. (0): Linear(in_features=640, out_features=640, bias=True)
    535. (1): Dropout(p=0.0, inplace=False)
    536. )
    537. )
    538. (ff): FeedForward(
    539. (net): Sequential(
    540. (0): GEGLU(
    541. (proj): Linear(in_features=640, out_features=5120, bias=True)
    542. )
    543. (1): Dropout(p=0.0, inplace=False)
    544. (2): Linear(in_features=2560, out_features=640, bias=True)
    545. )
    546. )
    547. (attn2): CrossAttention(
    548. (to_q): Linear(in_features=640, out_features=640, bias=False)
    549. (to_k): Linear(in_features=2048, out_features=640, bias=False)
    550. (to_v): Linear(in_features=2048, out_features=640, bias=False)
    551. (to_out): Sequential(
    552. (0): Linear(in_features=640, out_features=640, bias=True)
    553. (1): Dropout(p=0.0, inplace=False)
    554. )
    555. )
    556. (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    557. (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    558. (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    559. )
    560. )
    561. (proj_out): Linear(in_features=640, out_features=640, bias=True)
    562. )
    563. )
    564. (4): TimestepEmbedSequential(
    565. (0): ResBlock(
    566. (in_layers): Sequential(
    567. (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
    568. (1): SiLU()
    569. (2): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    570. )
    571. (h_upd): Identity()
    572. (x_upd): Identity()
    573. (emb_layers): Sequential(
    574. (0): SiLU()
    575. (1): Linear(in_features=1280, out_features=640, bias=True)
    576. )
    577. (out_layers): Sequential(
    578. (0): GroupNorm32(32, 640, eps=1e-05, affine=True)
    579. (1): SiLU()
    580. (2): Dropout(p=0, inplace=False)
    581. (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    582. )
    583. (skip_connection): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1))
    584. )
    585. (1): SpatialTransformer(
    586. (norm): GroupNorm(32, 640, eps=1e-06, affine=True)
    587. (proj_in): Linear(in_features=640, out_features=640, bias=True)
    588. (transformer_blocks): ModuleList(
    589. (0-1): 2 x BasicTransformerBlock(
    590. (attn1): CrossAttention(
    591. (to_q): Linear(in_features=640, out_features=640, bias=False)
    592. (to_k): Linear(in_features=640, out_features=640, bias=False)
    593. (to_v): Linear(in_features=640, out_features=640, bias=False)
    594. (to_out): Sequential(
    595. (0): Linear(in_features=640, out_features=640, bias=True)
    596. (1): Dropout(p=0.0, inplace=False)
    597. )
    598. )
    599. (ff): FeedForward(
    600. (net): Sequential(
    601. (0): GEGLU(
    602. (proj): Linear(in_features=640, out_features=5120, bias=True)
    603. )
    604. (1): Dropout(p=0.0, inplace=False)
    605. (2): Linear(in_features=2560, out_features=640, bias=True)
    606. )
    607. )
    608. (attn2): CrossAttention(
    609. (to_q): Linear(in_features=640, out_features=640, bias=False)
    610. (to_k): Linear(in_features=2048, out_features=640, bias=False)
    611. (to_v): Linear(in_features=2048, out_features=640, bias=False)
    612. (to_out): Sequential(
    613. (0): Linear(in_features=640, out_features=640, bias=True)
    614. (1): Dropout(p=0.0, inplace=False)
    615. )
    616. )
    617. (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    618. (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    619. (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    620. )
    621. )
    622. (proj_out): Linear(in_features=640, out_features=640, bias=True)
    623. )
    624. )
    625. (5): TimestepEmbedSequential(
    626. (0): ResBlock(
    627. (in_layers): Sequential(
    628. (0): GroupNorm32(32, 960, eps=1e-05, affine=True)
    629. (1): SiLU()
    630. (2): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    631. )
    632. (h_upd): Identity()
    633. (x_upd): Identity()
    634. (emb_layers): Sequential(
    635. (0): SiLU()
    636. (1): Linear(in_features=1280, out_features=640, bias=True)
    637. )
    638. (out_layers): Sequential(
    639. (0): GroupNorm32(32, 640, eps=1e-05, affine=True)
    640. (1): SiLU()
    641. (2): Dropout(p=0, inplace=False)
    642. (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    643. )
    644. (skip_connection): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1))
    645. )
    646. (1): SpatialTransformer(
    647. (norm): GroupNorm(32, 640, eps=1e-06, affine=True)
    648. (proj_in): Linear(in_features=640, out_features=640, bias=True)
    649. (transformer_blocks): ModuleList(
    650. (0-1): 2 x BasicTransformerBlock(
    651. (attn1): CrossAttention(
    652. (to_q): Linear(in_features=640, out_features=640, bias=False)
    653. (to_k): Linear(in_features=640, out_features=640, bias=False)
    654. (to_v): Linear(in_features=640, out_features=640, bias=False)
    655. (to_out): Sequential(
    656. (0): Linear(in_features=640, out_features=640, bias=True)
    657. (1): Dropout(p=0.0, inplace=False)
    658. )
    659. )
    660. (ff): FeedForward(
    661. (net): Sequential(
    662. (0): GEGLU(
    663. (proj): Linear(in_features=640, out_features=5120, bias=True)
    664. )
    665. (1): Dropout(p=0.0, inplace=False)
    666. (2): Linear(in_features=2560, out_features=640, bias=True)
    667. )
    668. )
    669. (attn2): CrossAttention(
    670. (to_q): Linear(in_features=640, out_features=640, bias=False)
    671. (to_k): Linear(in_features=2048, out_features=640, bias=False)
    672. (to_v): Linear(in_features=2048, out_features=640, bias=False)
    673. (to_out): Sequential(
    674. (0): Linear(in_features=640, out_features=640, bias=True)
    675. (1): Dropout(p=0.0, inplace=False)
    676. )
    677. )
    678. (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    679. (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    680. (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
    681. )
    682. )
    683. (proj_out): Linear(in_features=640, out_features=640, bias=True)
    684. )
    685. (2): Upsample(
    686. (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    687. )
    688. )
    689. (6): TimestepEmbedSequential(
    690. (0): ResBlock(
    691. (in_layers): Sequential(
    692. (0): GroupNorm32(32, 960, eps=1e-05, affine=True)
    693. (1): SiLU()
    694. (2): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    695. )
    696. (h_upd): Identity()
    697. (x_upd): Identity()
    698. (emb_layers): Sequential(
    699. (0): SiLU()
    700. (1): Linear(in_features=1280, out_features=320, bias=True)
    701. )
    702. (out_layers): Sequential(
    703. (0): GroupNorm32(32, 320, eps=1e-05, affine=True)
    704. (1): SiLU()
    705. (2): Dropout(p=0, inplace=False)
    706. (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    707. )
    708. (skip_connection): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1))
    709. )
    710. )
    711. (7-8): 2 x TimestepEmbedSequential(
    712. (0): ResBlock(
    713. (in_layers): Sequential(
    714. (0): GroupNorm32(32, 640, eps=1e-05, affine=True)
    715. (1): SiLU()
    716. (2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    717. )
    718. (h_upd): Identity()
    719. (x_upd): Identity()
    720. (emb_layers): Sequential(
    721. (0): SiLU()
    722. (1): Linear(in_features=1280, out_features=320, bias=True)
    723. )
    724. (out_layers): Sequential(
    725. (0): GroupNorm32(32, 320, eps=1e-05, affine=True)
    726. (1): SiLU()
    727. (2): Dropout(p=0, inplace=False)
    728. (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    729. )
    730. (skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1))
    731. )
    732. )
    733. )
    734. (out): Sequential(
    735. (0): GroupNorm32(32, 320, eps=1e-05, affine=True)
    736. (1): SiLU()
    737. (2): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    738. )
    739. )
    740. )

  • 相关阅读:
    计算机竞赛 题目:基于深度学习的图像风格迁移 - [ 卷积神经网络 机器视觉 ]
    Lecture 14 IO System(IO系统)
    zxing详细使用说明 java生成二维码、条形码
    合作技术保密协议
    汽车SOA-AUTOSAR-IOS架构分析
    MyBatis select标签
    代码随想录Day61 | 503. 下一个更大元素 II | 42. 接雨水
    Shell之练习一
    Python函数定义中的:必选参数、可选参数、可变参数
    索引的创建与设计原则
  • 原文地址:https://blog.csdn.net/u012193416/article/details/133876234