Zyphra
/

Zonos-v0.1-hybrid

Text-to-Speech

Safetensors

Model card | Files and versions | Community

darios committed 4 days ago

Commit

341d98d

verified ·

1 Parent(s): 398cfcf

Upload config.json with huggingface_hub

Browse files

Files changed (1) hide show

config.json +117 -0

config.json ADDED Viewed

	@@ -0,0 +1,117 @@

+{
+    "backbone": {
+        "d_model": 2048,
+        "d_intermediate": 0,
+        "attn_mlp_d_intermediate": 8192,
+        "n_layer": 46,
+        "ssm_cfg": {
+            "layer": "Mamba2"
+        },
+        "attn_layer_idx": [
+            0,
+            4,
+            8,
+            12,
+            16,
+            20,
+            24,
+            28,
+            32,
+            36,
+            40,
+            44
+        ],
+        "attn_cfg": {
+            "causal": true,
+            "num_heads": 16,
+            "num_heads_kv": 4,
+            "rotary_emb_dim": 128,
+            "qkv_proj_bias": false,
+            "out_proj_bias": false
+        },
+        "rms_norm": false,
+        "residual_in_fp32": false,
+        "norm_epsilon": 1e-05
+    },
+    "prefix_conditioner": {
+        "conditioners": [
+            {
+                "type": "EspeakPhonemeConditioner",
+                "name": "espeak"
+            },
+            {
+                "cond_dim": 128,
+                "uncond_type": "learned",
+                "projection": "linear",
+                "type": "PassthroughConditioner",
+                "name": "speaker"
+            },
+            {
+                "input_dim": 8,
+                "uncond_type": "learned",
+                "type": "FourierConditioner",
+                "name": "emotion"
+            },
+            {
+                "min_val": 0,
+                "max_val": 24000,
+                "uncond_type": "learned",
+                "type": "FourierConditioner",
+                "name": "fmax"
+            },
+            {
+                "min_val": 0,
+                "max_val": 400,
+                "uncond_type": "learned",
+                "type": "FourierConditioner",
+                "name": "pitch_std"
+            },
+            {
+                "min_val": 0,
+                "max_val": 40,
+                "uncond_type": "learned",
+                "type": "FourierConditioner",
+                "name": "speaking_rate"
+            },
+            {
+                "min_val": -1,
+                "max_val": 126,
+                "uncond_type": "learned",
+                "type": "IntegerConditioner",
+                "name": "language_id"
+            },
+            {
+                "input_dim": 8,
+                "min_val": 0.5,
+                "max_val": 0.8,
+                "uncond_type": "learned",
+                "type": "FourierConditioner",
+                "name": "vqscore_8"
+            },
+            {
+                "min_val": -1.0,
+                "max_val": 1000,
+                "uncond_type": "learned",
+                "type": "FourierConditioner",
+                "name": "ctc_loss"
+            },
+            {
+                "min_val": 1,
+                "max_val": 5,
+                "uncond_type": "learned",
+                "type": "FourierConditioner",
+                "name": "dnsmos_ovrl"
+            },
+            {
+                "min_val": 0,
+                "max_val": 1,
+                "uncond_type": "learned",
+                "type": "IntegerConditioner",
+                "name": "speaker_noised"
+            }
+        ],
+        "projection": "linear"
+    },
+    "eos_token_id": 1024,
+    "masked_token_id": 1025
+}