model:
  #base_learning_rate: 1.0e-05
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "image"
    cond_stage_key: "caption"
    cond_stage_trainable: False  # TODO: allow config easily
    image_size: 64
    channels: 4
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    #use_ema: False  # TODO: set in args
    use_fp16: True
    parameterization: "eps"  # default, original ldm
    #parameterization: "velocity"

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32  # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 1
        channel_mult: [ 1, 1, 1, 1 ]
        #num_heads: 8  # wukong
        num_head_channels: 64  # SD_VERSION v2.0
        use_spatial_transformer: True
        enable_flash_attention: True
        use_linear_in_transformer: True  # SD_VERSION v2.0
        transformer_depth: 1
        #context_dim: 768
        context_dim: 1024  # SD_VERSION v2.0
        use_checkpoint: True
        legacy: False
        use_fp16: True
        dropout: 0.1

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        use_fp16: True
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          attn_resolutions: []

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
      params:
        use_fp16: True
        tokenizer_name: "BpeTokenizer"
        context_length: 77
        vocab_size: 49408
        output_dim: 1024
        width: 1024
        layers: 23
        heads: 16
        epsilon: 1e-5
        use_quick_gelu: False
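
# ----------------------------------------------------------------------
# Usage sketch (an assumption, not part of this config): in the original
# CompVis ldm codebase, a config like this is loaded with OmegaConf and
# each module is built from its `target`/`params` pair via
# ldm.util.instantiate_from_config. A port of the codebase may expose a
# different loader, so treat the lines below as illustrative only.
#
#   from omegaconf import OmegaConf
#   from ldm.util import instantiate_from_config
#
#   config = OmegaConf.load("v2-train.yaml")  # hypothetical filename
#   model = instantiate_from_config(config.model)
# ----------------------------------------------------------------------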