{
  "backbone": {
    "d_model": 2048,
    "d_intermediate": 0,
    "attn_mlp_d_intermediate": 8192,
    "n_layer": 26,
    "ssm_cfg": {},
    "attn_layer_idx": [
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
    ],
    "attn_cfg": {
      "causal": true,
      "num_heads": 16,
      "num_heads_kv": 4,
      "rotary_emb_dim": 128,
      "rotary_emb_interleaved": true,
      "qkv_proj_bias": false,
      "out_proj_bias": false
    },
    "rms_norm": false,
    "residual_in_fp32": false,
    "norm_epsilon": 1e-05
  },
  "prefix_conditioner": {
    "conditioners": [
      {
        "type": "EspeakPhonemeConditioner",
        "name": "espeak"
      },
      {
        "cond_dim": 128,
        "uncond_type": "learned",
        "projection": "linear",
        "type": "PassthroughConditioner",
        "name": "speaker"
      },
      {
        "input_dim": 8,
        "uncond_type": "learned",
        "type": "FourierConditioner",
        "name": "emotion"
      },
      {
        "min_val": 0,
        "max_val": 24000,
        "uncond_type": "learned",
        "type": "FourierConditioner",
        "name": "fmax"
      },
      {
        "min_val": 0,
        "max_val": 400,
        "uncond_type": "learned",
        "type": "FourierConditioner",
        "name": "pitch_std"
      },
      {
        "min_val": 0,
        "max_val": 40,
        "uncond_type": "learned",
        "type": "FourierConditioner",
        "name": "speaking_rate"
      },
      {
        "min_val": -1,
        "max_val": 126,
        "uncond_type": "learned",
        "type": "IntegerConditioner",
        "name": "language_id"
      }
    ],
    "projection": "linear"
  },
  "eos_token_id": 1024,
  "masked_token_id": 1025
}