{
  "backbone": {
    "d_model": 2048,
    "d_intermediate": 0,
    "attn_mlp_d_intermediate": 8192,
    "n_layer": 46,
    "ssm_cfg": {
      "layer": "Mamba2"
    },
    "attn_layer_idx": [0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44],
    "attn_cfg": {
      "causal": true,
      "num_heads": 16,
      "num_heads_kv": 4,
      "rotary_emb_dim": 128,
      "qkv_proj_bias": false,
      "out_proj_bias": false
    },
    "rms_norm": false,
    "residual_in_fp32": false,
    "norm_epsilon": 1e-05
  },
  "prefix_conditioner": {
    "conditioners": [
      {
        "type": "EspeakPhonemeConditioner",
        "name": "espeak"
      },
      {
        "cond_dim": 128,
        "uncond_type": "learned",
        "projection": "linear",
        "type": "PassthroughConditioner",
        "name": "speaker"
      },
      {
        "input_dim": 8,
        "uncond_type": "learned",
        "type": "FourierConditioner",
        "name": "emotion"
      },
      {
        "min_val": 0,
        "max_val": 24000,
        "uncond_type": "learned",
        "type": "FourierConditioner",
        "name": "fmax"
      },
      {
        "min_val": 0,
        "max_val": 400,
        "uncond_type": "learned",
        "type": "FourierConditioner",
        "name": "pitch_std"
      },
      {
        "min_val": 0,
        "max_val": 40,
        "uncond_type": "learned",
        "type": "FourierConditioner",
        "name": "speaking_rate"
      },
      {
        "min_val": -1,
        "max_val": 126,
        "uncond_type": "learned",
        "type": "IntegerConditioner",
        "name": "language_id"
      },
      {
        "input_dim": 8,
        "min_val": 0.5,
        "max_val": 0.8,
        "uncond_type": "learned",
        "type": "FourierConditioner",
        "name": "vqscore_8"
      },
      {
        "min_val": -1.0,
        "max_val": 1000,
        "uncond_type": "learned",
        "type": "FourierConditioner",
        "name": "ctc_loss"
      },
      {
        "min_val": 1,
        "max_val": 5,
        "uncond_type": "learned",
        "type": "FourierConditioner",
        "name": "dnsmos_ovrl"
      },
      {
        "min_val": 0,
        "max_val": 1,
        "uncond_type": "learned",
        "type": "IntegerConditioner",
        "name": "speaker_noised"
      }
    ],
    "projection": "linear"
  },
  "eos_token_id": 1024,
  "masked_token_id": 1025
}