Upload EcapaTdnnForSequenceClassification
- angular_loss.py +68 -0
- audio_processing.py +413 -0
- config.json +6 -2
- conv_asr.py +189 -0
- features.py +560 -0
- model.safetensors +1 -1
- modeling_ecapa_tdnn.py +150 -0
- module.py +105 -0
- spectrogram_augment.py +223 -0
- tdnn_attention.py +620 -0
angular_loss.py
ADDED
@@ -0,0 +1,68 @@
import torch
import torch.nn as nn


class Loss(nn.modules.loss._Loss):
    """Inherit this class to implement custom loss."""

    def __init__(self, **kwargs):
        super(Loss, self).__init__(**kwargs)


class AdditiveMarginSoftmaxLoss(Loss):
    """Computes Additive Margin Softmax (CosFace) Loss

    Paper: CosFace: Large Margin Cosine Loss for Deep Face Recognition

    Args:
        scale: scale value for cosine angle
        margin: margin value added to cosine angle
    """

    def __init__(self, scale=30.0, margin=0.2):
        super().__init__()

        self.eps = 1e-7
        self.scale = scale
        self.margin = margin

    def forward(self, logits: torch.Tensor, labels: torch.Tensor):
        # Extract the logits corresponding to the true class
        logits_target = logits[torch.arange(logits.size(0)), labels]  # Faster indexing
        numerator = self.scale * (logits_target - self.margin)  # Apply additive margin
        # Exclude the target logits from the denominator calculation.
        # Note: scatter_ is in-place, so the caller's logits tensor is modified.
        logits.scatter_(1, labels.unsqueeze(1), float('-inf'))  # Mask target class
        denominator = torch.exp(numerator) + torch.sum(torch.exp(self.scale * logits), dim=1)
        # Compute final loss
        loss = -torch.log(torch.exp(numerator) / denominator)
        return loss.mean()


class AdditiveAngularMarginSoftmaxLoss(Loss):
    """Computes Additive Angular Margin Softmax (ArcFace) Loss

    Paper: ArcFace: Additive Angular Margin Loss for Deep Face Recognition

    Args:
        scale: scale value for cosine angle
        margin: margin value added to cosine angle
    """

    def __init__(self, scale=20.0, margin=1.35):
        super().__init__()

        self.eps = 1e-7
        self.scale = scale
        self.margin = margin

    def forward(self, logits: torch.Tensor, labels: torch.Tensor):
        numerator = self.scale * torch.cos(
            torch.acos(torch.clamp(torch.diagonal(logits.transpose(0, 1)[labels]), -1.0 + self.eps, 1 - self.eps))
            + self.margin
        )
        excl = torch.cat(
            [torch.cat((logits[i, :y], logits[i, y + 1 :])).unsqueeze(0) for i, y in enumerate(labels)], dim=0
        )
        denominator = torch.exp(numerator) + torch.sum(torch.exp(self.scale * excl), dim=1)
        L = numerator - torch.log(denominator)
        return -torch.mean(L)
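For reference, a minimal usage sketch (not part of the commit): both losses expect `logits` to be cosine similarities in [-1, 1], i.e. the product of L2-normalized embeddings and L2-normalized class weights, as produced by `SpeakerDecoder` with `angular=True`. Shapes and values below are illustrative.

import torch
import torch.nn.functional as F
from angular_loss import AdditiveAngularMarginSoftmaxLoss  # assumes this module is importable

batch_size, num_classes, emb_dim = 4, 10, 128
# Cosine-similarity "logits" in [-1, 1]: normalized embeddings times normalized class weights
embeddings = F.normalize(torch.randn(batch_size, emb_dim), dim=1)
weights = F.normalize(torch.randn(num_classes, emb_dim), dim=1)
logits = embeddings @ weights.t()
labels = torch.randint(0, num_classes, (batch_size,))

loss_fn = AdditiveAngularMarginSoftmaxLoss(scale=30.0, margin=0.2)
loss = loss_fn(logits, labels)  # scalar tensor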
audio_processing.py
ADDED
@@ -0,0 +1,413 @@
import math
from packaging import version
from dataclasses import dataclass
from abc import ABC, abstractmethod

import torch

try:
    import torchaudio
    import torchaudio.functional
    import torchaudio.transforms

    TORCHAUDIO_VERSION = version.parse(torchaudio.__version__)
    TORCHAUDIO_VERSION_MIN = version.parse('0.5')

    HAVE_TORCHAUDIO = True
except ModuleNotFoundError:
    HAVE_TORCHAUDIO = False

from .module import NeuralModule
from .features import FilterbankFeatures, FilterbankFeaturesTA
from .spectrogram_augment import SpecCutout, SpecAugment


class AudioPreprocessor(NeuralModule, ABC):
    """
    An interface for neural modules that perform audio pre-processing,
    transforming wav files to features.
    """

    def __init__(self, win_length, hop_length):
        super().__init__()

        self.win_length = win_length
        self.hop_length = hop_length

        self.torch_windows = {
            'hann': torch.hann_window,
            'hamming': torch.hamming_window,
            'blackman': torch.blackman_window,
            'bartlett': torch.bartlett_window,
            'ones': torch.ones,
            None: torch.ones,
        }

        # Normally, when you call to(dtype) on a torch.nn.Module, all
        # floating point parameters and buffers will change to that
        # dtype, rather than staying float32. The AudioPreprocessor
        # classes, uniquely, don't actually have any parameters or
        # buffers from what I see. In addition, we want the input to
        # the preprocessor to be float32, but need to create the
        # output in the appropriate precision. This empty tensor is
        # here just to detect which dtype this module should output
        # at the end of execution.
        self.register_buffer("dtype_sentinel_tensor", torch.tensor((), dtype=torch.float32), persistent=False)

    @torch.no_grad()
    def forward(self, input_signal, length):
        processed_signal, processed_length = self.get_features(input_signal.to(torch.float32), length)
        processed_signal = processed_signal.to(self.dtype_sentinel_tensor.dtype)
        return processed_signal, processed_length

    @abstractmethod
    def get_features(self, input_signal, length):
        # Called by forward(). Subclasses should implement this.
        pass


class AudioToMelSpectrogramPreprocessor(AudioPreprocessor):
    """Featurizer module that converts wavs to mel spectrograms.

    Args:
        sample_rate (int): Sample rate of the input audio data.
            Defaults to 16000
        window_size (float): Size of window for fft in seconds
            Defaults to 0.02
        window_stride (float): Stride of window for fft in seconds
            Defaults to 0.01
        n_window_size (int): Size of window for fft in samples
            Defaults to None. Use one of window_size or n_window_size.
        n_window_stride (int): Stride of window for fft in samples
            Defaults to None. Use one of window_stride or n_window_stride.
        window (str): Windowing function for fft. Can be one of ['hann',
            'hamming', 'blackman', 'bartlett']
            Defaults to "hann"
        normalize (str): Can be one of ['per_feature', 'all_features']; all
            other options disable feature normalization. 'all_features'
            normalizes the entire spectrogram to be mean 0 with std 1.
            'per_feature' normalizes per channel / freq instead.
            Defaults to "per_feature"
        n_fft (int): Length of FT window. If None, it uses the smallest power
            of 2 that is larger than n_window_size.
            Defaults to None
        preemph (float): Amount of pre emphasis to add to audio. Can be
            disabled by passing None.
            Defaults to 0.97
        features (int): Number of mel spectrogram freq bins to output.
            Defaults to 64
        lowfreq (int): Lower bound on mel basis in Hz.
            Defaults to 0
        highfreq (int): Upper bound on mel basis in Hz.
            Defaults to None
        log (bool): Log features.
            Defaults to True
        log_zero_guard_type (str): Need to avoid taking the log of zero. There
            are two options: "add" or "clamp".
            Defaults to "add".
        log_zero_guard_value (float, or str): Add or clamp requires the number
            to add with or clamp to. log_zero_guard_value can either be a float
            or "tiny" or "eps". torch.finfo is used if "tiny" or "eps" is
            passed.
            Defaults to 2**-24.
        dither (float): Amount of white-noise dithering.
            Defaults to 1e-5
        pad_to (int): Ensures that the output size of the time dimension is
            a multiple of pad_to.
            Defaults to 16
        frame_splicing (int): Defaults to 1
        exact_pad (bool): If True, sets stft center to False and adds padding, such that num_frames = audio_length
            // hop_length. Defaults to False.
        pad_value (float): The value that shorter mels are padded with.
            Defaults to 0
        mag_power (float): The power that the linear spectrogram is raised to
            prior to multiplication with mel basis.
            Defaults to 2 for a power spec
        rng: Random number generator
        nb_augmentation_prob (float): Probability with which narrowband augmentation would be applied to
            samples in the batch.
            Defaults to 0.0
        nb_max_freq (int): Frequency above which all frequencies will be masked for narrowband augmentation.
            Defaults to 4000
        use_torchaudio: Whether to use the `torchaudio` implementation.
        mel_norm: Normalization used for mel filterbank weights.
            Defaults to 'slaney' (area normalization)
        stft_exact_pad: Deprecated argument, kept for compatibility with older checkpoints.
        stft_conv: Deprecated argument, kept for compatibility with older checkpoints.
    """

    def __init__(
        self,
        sample_rate=16000,
        window_size=0.02,
        window_stride=0.01,
        n_window_size=None,
        n_window_stride=None,
        window="hann",
        normalize="per_feature",
        n_fft=None,
        preemph=0.97,
        features=64,
        lowfreq=0,
        highfreq=None,
        log=True,
        log_zero_guard_type="add",
        log_zero_guard_value=2**-24,
        dither=1e-5,
        pad_to=16,
        frame_splicing=1,
        exact_pad=False,
        pad_value=0,
        mag_power=2.0,
        rng=None,
        nb_augmentation_prob=0.0,
        nb_max_freq=4000,
        use_torchaudio: bool = False,
        mel_norm="slaney",
        stft_exact_pad=False,  # Deprecated arguments; kept for config compatibility
        stft_conv=False,  # Deprecated arguments; kept for config compatibility
    ):
        super().__init__(n_window_size, n_window_stride)

        self._sample_rate = sample_rate
        if window_size and n_window_size:
            raise ValueError(f"{self} received both window_size and n_window_size. Only one should be specified.")
        if window_stride and n_window_stride:
            raise ValueError(
                f"{self} received both window_stride and n_window_stride. Only one should be specified."
            )
        if window_size:
            n_window_size = int(window_size * self._sample_rate)
        if window_stride:
            n_window_stride = int(window_stride * self._sample_rate)

        # Given the long and similar argument list, point to the class and instantiate it by reference
        if not use_torchaudio:
            featurizer_class = FilterbankFeatures
        else:
            featurizer_class = FilterbankFeaturesTA
        self.featurizer = featurizer_class(
            sample_rate=self._sample_rate,
            n_window_size=n_window_size,
            n_window_stride=n_window_stride,
            window=window,
            normalize=normalize,
            n_fft=n_fft,
            preemph=preemph,
            nfilt=features,
            lowfreq=lowfreq,
            highfreq=highfreq,
            log=log,
            log_zero_guard_type=log_zero_guard_type,
            log_zero_guard_value=log_zero_guard_value,
            dither=dither,
            pad_to=pad_to,
            frame_splicing=frame_splicing,
            exact_pad=exact_pad,
            pad_value=pad_value,
            mag_power=mag_power,
            rng=rng,
            nb_augmentation_prob=nb_augmentation_prob,
            nb_max_freq=nb_max_freq,
            mel_norm=mel_norm,
            stft_exact_pad=stft_exact_pad,  # Deprecated arguments; kept for config compatibility
            stft_conv=stft_conv,  # Deprecated arguments; kept for config compatibility
        )

    def get_features(self, input_signal, length):
        return self.featurizer(input_signal, length)

    @property
    def filter_banks(self):
        return self.featurizer.filter_banks


class AudioToMFCCPreprocessor(AudioPreprocessor):
    """Preprocessor that converts wavs to MFCCs.
    Uses torchaudio.transforms.MFCC.

    Args:
        sample_rate: The sample rate of the audio.
            Defaults to 16000.
        window_size: Size of window for fft in seconds. Used to calculate the
            win_length arg for mel spectrogram.
            Defaults to 0.02
        window_stride: Stride of window for fft in seconds. Used to calculate
            the hop_length arg for mel spect.
            Defaults to 0.01
        n_window_size: Size of window for fft in samples
            Defaults to None. Use one of window_size or n_window_size.
        n_window_stride: Stride of window for fft in samples
            Defaults to None. Use one of window_stride or n_window_stride.
        window: Windowing function for fft. Can be one of ['hann',
            'hamming', 'blackman', 'bartlett', 'none', 'null'].
            Defaults to 'hann'
        n_fft: Length of FT window. If None, it uses the smallest power of 2
            that is larger than n_window_size.
            Defaults to None
        lowfreq (int): Lower bound on mel basis in Hz.
            Defaults to 0
        highfreq (int): Upper bound on mel basis in Hz.
            Defaults to None
        n_mels: Number of mel filterbanks.
            Defaults to 64
        n_mfcc: Number of coefficients to retain
            Defaults to 64
        dct_type: Type of discrete cosine transform to use
        norm: Type of norm to use
        log: Whether to use log-mel spectrograms instead of db-scaled.
            Defaults to True.
    """

    def __init__(
        self,
        sample_rate=16000,
        window_size=0.02,
        window_stride=0.01,
        n_window_size=None,
        n_window_stride=None,
        window='hann',
        n_fft=None,
        lowfreq=0.0,
        highfreq=None,
        n_mels=64,
        n_mfcc=64,
        dct_type=2,
        norm='ortho',
        log=True,
    ):
        self._sample_rate = sample_rate
        if not HAVE_TORCHAUDIO:
            raise ModuleNotFoundError(
                "torchaudio is not installed but is necessary for "
                "AudioToMFCCPreprocessor. We recommend you try "
                "building it from source for the PyTorch version you have."
            )
        if window_size and n_window_size:
            raise ValueError(f"{self} received both window_size and n_window_size. Only one should be specified.")
        if window_stride and n_window_stride:
            raise ValueError(
                f"{self} received both window_stride and n_window_stride. Only one should be specified."
            )
        # Get win_length (n_window_size) and hop_length (n_window_stride)
        if window_size:
            n_window_size = int(window_size * self._sample_rate)
        if window_stride:
            n_window_stride = int(window_stride * self._sample_rate)

        super().__init__(n_window_size, n_window_stride)

        mel_kwargs = {}

        mel_kwargs['f_min'] = lowfreq
        mel_kwargs['f_max'] = highfreq
        mel_kwargs['n_mels'] = n_mels

        mel_kwargs['n_fft'] = n_fft or 2 ** math.ceil(math.log2(n_window_size))

        mel_kwargs['win_length'] = n_window_size
        mel_kwargs['hop_length'] = n_window_stride

        # Set window_fn. None defaults to torch.ones.
        window_fn = self.torch_windows.get(window, None)
        if window_fn is None:
            raise ValueError(
                f"Window argument for AudioProcessor is invalid: {window}. "
                f"For no window function, use 'ones' or None."
            )
        mel_kwargs['window_fn'] = window_fn

        # Use torchaudio's implementation of MFCCs as featurizer
        self.featurizer = torchaudio.transforms.MFCC(
            sample_rate=self._sample_rate,
            n_mfcc=n_mfcc,
            dct_type=dct_type,
            norm=norm,
            log_mels=log,
            melkwargs=mel_kwargs,
        )

    def get_features(self, input_signal, length):
        features = self.featurizer(input_signal)
        seq_len = torch.ceil(length.to(torch.float32) / self.hop_length).to(dtype=torch.long)
        return features, seq_len


class SpectrogramAugmentation(NeuralModule):
    """
    Performs time and freq cuts in one of two ways.
    SpecAugment zeroes out vertical and horizontal sections as described in
    SpecAugment (https://arxiv.org/abs/1904.08779). Arguments for use with
    SpecAugment are `freq_masks`, `time_masks`, `freq_width`, and `time_width`.
    SpecCutout zeroes out rectangular regions as described in Cutout
    (https://arxiv.org/abs/1708.04552). Arguments for use with Cutout are
    `rect_masks`, `rect_freq`, and `rect_time`.

    Args:
        freq_masks (int): how many frequency segments should be cut.
            Defaults to 0.
        time_masks (int): how many time segments should be cut.
            Defaults to 0.
        freq_width (int): maximum number of frequencies to be cut in one
            segment.
            Defaults to 10.
        time_width (int): maximum number of time steps to be cut in one
            segment.
            Defaults to 10.
        rect_masks (int): how many rectangular masks should be cut.
            Defaults to 0.
        rect_time (int): maximum size of cut rectangles along the time
            dimension.
            Defaults to 5.
        rect_freq (int): maximum size of cut rectangles along the frequency
            dimension.
            Defaults to 20.
        use_vectorized_spec_augment: use vectorized code for Spectrogram augmentation
    """

    def __init__(
        self,
        freq_masks=0,
        time_masks=0,
        freq_width=10,
        time_width=10,
        rect_masks=0,
        rect_time=5,
        rect_freq=20,
        rng=None,
        mask_value=0.0,
        use_vectorized_spec_augment: bool = True,
    ):
        super().__init__()

        if rect_masks > 0:
            self.spec_cutout = SpecCutout(
                rect_masks=rect_masks,
                rect_time=rect_time,
                rect_freq=rect_freq,
                rng=rng,
            )
        else:
            self.spec_cutout = lambda input_spec: input_spec
        if freq_masks + time_masks > 0:
            self.spec_augment = SpecAugment(
                freq_masks=freq_masks,
                time_masks=time_masks,
                freq_width=freq_width,
                time_width=time_width,
                rng=rng,
                mask_value=mask_value,
                use_vectorized_code=use_vectorized_spec_augment,
            )
        else:
            self.spec_augment = lambda input_spec, length: input_spec

    def forward(self, input_spec, length):
        augmented_spec = self.spec_cutout(input_spec=input_spec)
        augmented_spec = self.spec_augment(input_spec=augmented_spec, length=length)
        return augmented_spec
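A minimal usage sketch (not part of the commit, and assuming the repository files are importable as a package so the relative imports resolve): the preprocessor maps a batch of raw waveforms and their sample counts to log-mel features and per-element frame counts.

import torch
from audio_processing import AudioToMelSpectrogramPreprocessor  # assumes package imports resolve

preprocessor = AudioToMelSpectrogramPreprocessor(sample_rate=16000, features=80, use_torchaudio=True)
preprocessor.eval()  # dithering is only applied in training mode

waveforms = torch.randn(2, 16000)        # two 1-second signals, padded to the same length
lengths = torch.tensor([16000, 12000])   # true sample counts per signal
features, feature_lengths = preprocessor(waveforms, lengths)
# features: [batch, n_mels, frames]; feature_lengths: valid frames per element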
config.json
CHANGED
@@ -1,11 +1,14 @@
 {
-  "_attn_implementation_autoset": true,
   "angular": true,
   "angular_margin": 0.2,
   "angular_scale": 30,
+  "architectures": [
+    "EcapaTdnnForSequenceClassification"
+  ],
   "attention_channels": 128,
   "auto_map": {
-    "AutoConfig": "configuration_ecapa_tdnn.EcapaTdnnConfig"
+    "AutoConfig": "configuration_ecapa_tdnn.EcapaTdnnConfig",
+    "AutoModelForAudioClassification": "modeling_ecapa_tdnn.EcapaTdnnForSequenceClassification"
   },
   "bos_token_id": 1,
   "decoder_config": {
@@ -2577,6 +2580,7 @@
   },
   "time_masks": 5,
   "time_width": 0.03,
+  "torch_dtype": "float32",
   "transformers_version": "4.48.3",
   "use_torchaudio": true,
   "use_vectorized_spec_augment": true,
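The new `architectures` and `auto_map` entries are what let `transformers` resolve the custom class. A loading sketch (the repo id below is a placeholder, not the real one; `trust_remote_code=True` is required because the modeling code lives inside the repo):

from transformers import AutoConfig, AutoModelForAudioClassification

repo_id = "user/ecapa-tdnn"  # placeholder repo id
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForAudioClassification.from_pretrained(repo_id, trust_remote_code=True)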
conv_asr.py
ADDED
@@ -0,0 +1,189 @@
from typing import Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from .module import NeuralModule
from .tdnn_attention import (
    StatsPoolLayer,
    AttentivePoolLayer,
    TdnnModule,
    TdnnSeModule,
    TdnnSeRes2NetModule,
    init_weights,
)


class EcapaTdnnEncoder(NeuralModule):
    """
    Modified ECAPA encoder layer without the Res2Net module, for faster training and inference, which achieves
    better numbers on speaker diarization tasks.
    Reference: ECAPA-TDNN Embeddings for Speaker Diarization (https://arxiv.org/pdf/2104.01466.pdf)

    input:
        feat_in: input feature shape (mel spec feature shape)
        filters: list of filter shapes for SE_TDNN modules
        kernel_sizes: list of kernel shapes for SE_TDNN modules
        dilations: list of dilations for group conv se layer
        scale: scale value to group wider conv channels (default: 8)

    output:
        outputs: encoded output
        output_length: masked output lengths
    """

    def __init__(
        self,
        feat_in: int,
        filters: list,
        kernel_sizes: list,
        dilations: list,
        scale: int = 8,
        res2net: bool = False,
        res2net_scale: int = 8,
        init_mode: str = 'xavier_uniform',
    ):
        super().__init__()
        self.layers = nn.ModuleList()
        self.layers.append(TdnnModule(feat_in, filters[0], kernel_size=kernel_sizes[0], dilation=dilations[0]))

        for i in range(len(filters) - 2):
            if res2net:
                self.layers.append(
                    TdnnSeRes2NetModule(
                        filters[i],
                        filters[i + 1],
                        group_scale=scale,
                        se_channels=128,
                        kernel_size=kernel_sizes[i + 1],
                        dilation=dilations[i + 1],
                        res2net_scale=res2net_scale,
                    )
                )
            else:
                self.layers.append(
                    TdnnSeModule(
                        filters[i],
                        filters[i + 1],
                        group_scale=scale,
                        se_channels=128,
                        kernel_size=kernel_sizes[i + 1],
                        dilation=dilations[i + 1],
                    )
                )
        self.feature_agg = TdnnModule(filters[-1], filters[-1], kernel_sizes[-1], dilations[-1])
        self.apply(lambda x: init_weights(x, mode=init_mode))

    def forward(self, audio_signal, length=None):
        x = audio_signal
        outputs = []

        for layer in self.layers:
            x = layer(x, length=length)
            outputs.append(x)

        x = torch.cat(outputs[1:], dim=1)
        x = self.feature_agg(x)
        return x, length


class SpeakerDecoder(NeuralModule):
    """
    Speaker Decoder creates the final neural layers that map from the outputs
    of the Jasper encoder to the embedding layer, followed by a speaker-based softmax loss.

    Args:
        feat_in (int): Number of channels being input to this module
        num_classes (int): Number of unique speakers in the dataset
        emb_sizes (list): shapes of intermediate embedding layers (speaker embeddings are
            taken from the first of these layers). Defaults to 256
        pool_mode (str): Pooling strategy type. Options are 'xvector', 'tap', 'attention'.
            Defaults to 'xvector'.
            xvector (mean and variance statistics pooling)
            tap (temporal average pooling: just the mean)
            attention (attention-based pooling)
        init_mode (str): Describes how neural network parameters are
            initialized. Options are ['xavier_uniform', 'xavier_normal',
            'kaiming_uniform', 'kaiming_normal'].
            Defaults to "xavier_uniform".
    """

    def __init__(
        self,
        feat_in: int,
        num_classes: int,
        emb_sizes: Optional[Union[int, list]] = 256,
        pool_mode: str = 'xvector',
        angular: bool = False,
        attention_channels: int = 128,
        init_mode: str = "xavier_uniform",
    ):
        super().__init__()
        self.angular = angular
        self.emb_id = 2
        bias = False if self.angular else True
        emb_sizes = [emb_sizes] if type(emb_sizes) is int else emb_sizes

        self._num_classes = num_classes
        self.pool_mode = pool_mode.lower()
        if self.pool_mode == 'xvector' or self.pool_mode == 'tap':
            self._pooling = StatsPoolLayer(feat_in=feat_in, pool_mode=self.pool_mode)
            affine_type = 'linear'
        elif self.pool_mode == 'attention':
            self._pooling = AttentivePoolLayer(inp_filters=feat_in, attention_channels=attention_channels)
            affine_type = 'conv'

        shapes = [self._pooling.feat_in]
        for size in emb_sizes:
            shapes.append(int(size))

        emb_layers = []
        for shape_in, shape_out in zip(shapes[:-1], shapes[1:]):
            layer = self.affine_layer(shape_in, shape_out, learn_mean=False, affine_type=affine_type)
            emb_layers.append(layer)

        self.emb_layers = nn.ModuleList(emb_layers)

        self.final = nn.Linear(shapes[-1], self._num_classes, bias=bias)

        self.apply(lambda x: init_weights(x, mode=init_mode))

    def affine_layer(
        self,
        inp_shape,
        out_shape,
        learn_mean=True,
        affine_type='conv',
    ):
        if affine_type == 'conv':
            layer = nn.Sequential(
                nn.BatchNorm1d(inp_shape, affine=True, track_running_stats=True),
                nn.Conv1d(inp_shape, out_shape, kernel_size=1),
            )

        else:
            layer = nn.Sequential(
                nn.Linear(inp_shape, out_shape),
                nn.BatchNorm1d(out_shape, affine=learn_mean, track_running_stats=True),
                nn.ReLU(),
            )

        return layer

    def forward(self, encoder_output, length=None):
        pool = self._pooling(encoder_output, length)
        embs = []

        for layer in self.emb_layers:
            pool, emb = layer(pool), layer[: self.emb_id](pool)
            embs.append(emb)

        pool = pool.squeeze(-1)
        if self.angular:
            # Note: F.normalize is not in-place; rebinding W leaves self.final's
            # weights unchanged. Only the pooled embedding is normalized here.
            for W in self.final.parameters():
                W = F.normalize(W, p=2, dim=1)
            pool = F.normalize(pool, p=2, dim=1)

        out = self.final(pool)

        return out, embs[-1].squeeze(-1)
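An illustrative shape walk-through (a sketch, not part of the commit; the filter, kernel, and dilation values are made up, not the checkpoint's actual config, and tdnn_attention.py from this commit must be importable):

import torch
from conv_asr import EcapaTdnnEncoder, SpeakerDecoder  # assumes package imports resolve

encoder = EcapaTdnnEncoder(
    feat_in=80,
    filters=[512, 512, 512, 512, 1536],  # concat of the three SE-TDNN outputs must equal filters[-1]
    kernel_sizes=[5, 3, 3, 3, 1],
    dilations=[1, 2, 3, 4, 1],
)
decoder = SpeakerDecoder(feat_in=1536, num_classes=100, emb_sizes=192, pool_mode='attention', angular=True)

feats = torch.randn(2, 80, 300)               # [batch, mels, frames]
lengths = torch.tensor([300, 250])
encoded, _ = encoder(feats, length=lengths)   # [batch, 1536, frames]
logits, embeddings = decoder(encoded, lengths)  # [batch, 100], [batch, 192]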
features.py
ADDED
@@ -0,0 +1,560 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import random
|
3 |
+
from typing import Optional, Union, Tuple
|
4 |
+
|
5 |
+
import librosa
|
6 |
+
import torchaudio
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
|
11 |
+
try:
|
12 |
+
import torchaudio
|
13 |
+
|
14 |
+
HAVE_TORCHAUDIO = True
|
15 |
+
except ModuleNotFoundError:
|
16 |
+
HAVE_TORCHAUDIO = False
|
17 |
+
|
18 |
+
CONSTANT = 1e-5
|
19 |
+
|
20 |
+
|
21 |
+
def normalize_batch(x, seq_len, normalize_type):
|
22 |
+
x_mean = None
|
23 |
+
x_std = None
|
24 |
+
if normalize_type == "per_feature":
|
25 |
+
batch_size = x.shape[0]
|
26 |
+
max_time = x.shape[2]
|
27 |
+
|
28 |
+
# When doing stream capture to a graph, item() is not allowed
|
29 |
+
# becuase it calls cudaStreamSynchronize(). Therefore, we are
|
30 |
+
# sacrificing some error checking when running with cuda graphs.
|
31 |
+
if (
|
32 |
+
torch.cuda.is_available()
|
33 |
+
and not torch.cuda.is_current_stream_capturing()
|
34 |
+
and torch.any(seq_len == 1).item()
|
35 |
+
):
|
36 |
+
raise ValueError(
|
37 |
+
"normalize_batch with `per_feature` normalize_type received a tensor of length 1. This will result "
|
38 |
+
"in torch.std() returning nan. Make sure your audio length has enough samples for a single "
|
39 |
+
"feature (ex. at least `hop_length` for Mel Spectrograms)."
|
40 |
+
)
|
41 |
+
time_steps = torch.arange(max_time, device=x.device).unsqueeze(0).expand(batch_size, max_time)
|
42 |
+
valid_mask = time_steps < seq_len.unsqueeze(1)
|
43 |
+
x_mean_numerator = torch.where(valid_mask.unsqueeze(1), x, 0.0).sum(axis=2)
|
44 |
+
x_mean_denominator = valid_mask.sum(axis=1)
|
45 |
+
x_mean = x_mean_numerator / x_mean_denominator.unsqueeze(1)
|
46 |
+
|
47 |
+
# Subtract 1 in the denominator to correct for the bias.
|
48 |
+
x_std = torch.sqrt(
|
49 |
+
torch.sum(torch.where(valid_mask.unsqueeze(1), x - x_mean.unsqueeze(2), 0.0) ** 2, axis=2)
|
50 |
+
/ (x_mean_denominator.unsqueeze(1) - 1.0)
|
51 |
+
)
|
52 |
+
# make sure x_std is not zero
|
53 |
+
x_std += CONSTANT
|
54 |
+
return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2), x_mean, x_std
|
55 |
+
elif normalize_type == "all_features":
|
56 |
+
x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
|
57 |
+
x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
|
58 |
+
for i in range(x.shape[0]):
|
59 |
+
x_mean[i] = x[i, :, : seq_len[i].item()].mean()
|
60 |
+
x_std[i] = x[i, :, : seq_len[i].item()].std()
|
61 |
+
# make sure x_std is not zero
|
62 |
+
x_std += CONSTANT
|
63 |
+
return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1), x_mean, x_std
|
64 |
+
elif "fixed_mean" in normalize_type and "fixed_std" in normalize_type:
|
65 |
+
x_mean = torch.tensor(normalize_type["fixed_mean"], device=x.device)
|
66 |
+
x_std = torch.tensor(normalize_type["fixed_std"], device=x.device)
|
67 |
+
return (
|
68 |
+
(x - x_mean.view(x.shape[0], x.shape[1]).unsqueeze(2)) / x_std.view(x.shape[0], x.shape[1]).unsqueeze(2),
|
69 |
+
x_mean,
|
70 |
+
x_std,
|
71 |
+
)
|
72 |
+
else:
|
73 |
+
return x, x_mean, x_std
|
74 |
+
|
75 |
+
|
76 |
+
def clean_spectrogram_batch(spectrogram: torch.Tensor, spectrogram_len: torch.Tensor, fill_value=0.0) -> torch.Tensor:
|
77 |
+
"""
|
78 |
+
Fill spectrogram values outside the length with `fill_value`
|
79 |
+
|
80 |
+
Args:
|
81 |
+
spectrogram: Tensor with shape [B, C, L] containing batched spectrograms
|
82 |
+
spectrogram_len: Tensor with shape [B] containing the sequence length of each batch element
|
83 |
+
fill_value: value to fill with, 0.0 by default
|
84 |
+
|
85 |
+
Returns:
|
86 |
+
cleaned spectrogram, tensor with shape equal to `spectrogram`
|
87 |
+
"""
|
88 |
+
device = spectrogram.device
|
89 |
+
batch_size, _, max_len = spectrogram.shape
|
90 |
+
mask = torch.arange(max_len, device=device)[None, :] >= spectrogram_len[:, None]
|
91 |
+
mask = mask.unsqueeze(1).expand_as(spectrogram)
|
92 |
+
return spectrogram.masked_fill(mask, fill_value)
|
93 |
+
|
94 |
+
|
95 |
+
def splice_frames(x, frame_splicing):
|
96 |
+
"""Stacks frames together across feature dim
|
97 |
+
|
98 |
+
input is batch_size, feature_dim, num_frames
|
99 |
+
output is batch_size, feature_dim*frame_splicing, num_frames
|
100 |
+
|
101 |
+
"""
|
102 |
+
seq = [x]
|
103 |
+
for n in range(1, frame_splicing):
|
104 |
+
seq.append(torch.cat([x[:, :, :n], x[:, :, n:]], dim=2))
|
105 |
+
return torch.cat(seq, dim=1)
|
106 |
+
|
107 |
+
|
108 |
+
@torch.jit.script_if_tracing
|
109 |
+
def make_seq_mask_like(
|
110 |
+
lengths: torch.Tensor, like: torch.Tensor, time_dim: int = -1, valid_ones: bool = True
|
111 |
+
) -> torch.Tensor:
|
112 |
+
"""
|
113 |
+
|
114 |
+
Args:
|
115 |
+
lengths: Tensor with shape [B] containing the sequence length of each batch element
|
116 |
+
like: The mask will contain the same number of dimensions as this Tensor, and will have the same max
|
117 |
+
length in the time dimension of this Tensor.
|
118 |
+
time_dim: Time dimension of the `shape_tensor` and the resulting mask. Zero-based.
|
119 |
+
valid_ones: If True, valid tokens will contain value `1` and padding will be `0`. Else, invert.
|
120 |
+
|
121 |
+
Returns:
|
122 |
+
A :class:`torch.Tensor` containing 1's and 0's for valid and invalid tokens, respectively, if `valid_ones`, else
|
123 |
+
vice-versa. Mask will have the same number of dimensions as `like`. Batch and time dimensions will match
|
124 |
+
the `like`. All other dimensions will be singletons. E.g., if `like.shape == [3, 4, 5]` and
|
125 |
+
`time_dim == -1', mask will have shape `[3, 1, 5]`.
|
126 |
+
"""
|
127 |
+
# Mask with shape [B, T]
|
128 |
+
mask = torch.arange(like.shape[time_dim], device=like.device).repeat(lengths.shape[0], 1).lt(lengths.view(-1, 1))
|
129 |
+
# [B, T] -> [B, *, T] where * is any number of singleton dimensions to expand to like tensor
|
130 |
+
for _ in range(like.dim() - mask.dim()):
|
131 |
+
mask = mask.unsqueeze(1)
|
132 |
+
# If needed, transpose time dim
|
133 |
+
if time_dim != -1 and time_dim != mask.dim() - 1:
|
134 |
+
mask = mask.transpose(-1, time_dim)
|
135 |
+
# Maybe invert the padded vs. valid token values
|
136 |
+
if not valid_ones:
|
137 |
+
mask = ~mask
|
138 |
+
return mask
|
139 |
+
|
140 |
+
|
141 |
+
class FilterbankFeatures(nn.Module):
|
142 |
+
"""Featurizer that converts wavs to Mel Spectrograms.
|
143 |
+
See AudioToMelSpectrogramPreprocessor for args.
|
144 |
+
"""
|
145 |
+
|
146 |
+
def __init__(
|
147 |
+
self,
|
148 |
+
sample_rate=16000,
|
149 |
+
n_window_size=320,
|
150 |
+
n_window_stride=160,
|
151 |
+
window="hann",
|
152 |
+
normalize="per_feature",
|
153 |
+
n_fft=None,
|
154 |
+
preemph=0.97,
|
155 |
+
nfilt=64,
|
156 |
+
lowfreq=0,
|
157 |
+
highfreq=None,
|
158 |
+
log=True,
|
159 |
+
log_zero_guard_type="add",
|
160 |
+
log_zero_guard_value=2**-24,
|
161 |
+
dither=CONSTANT,
|
162 |
+
pad_to=16,
|
163 |
+
max_duration=16.7,
|
164 |
+
frame_splicing=1,
|
165 |
+
exact_pad=False,
|
166 |
+
pad_value=0,
|
167 |
+
mag_power=2.0,
|
168 |
+
use_grads=False,
|
169 |
+
rng=None,
|
170 |
+
nb_augmentation_prob=0.0,
|
171 |
+
nb_max_freq=4000,
|
172 |
+
mel_norm="slaney",
|
173 |
+
stft_exact_pad=False, # Deprecated arguments; kept for config compatibility
|
174 |
+
stft_conv=False, # Deprecated arguments; kept for config compatibility
|
175 |
+
):
|
176 |
+
super().__init__()
|
177 |
+
if stft_conv or stft_exact_pad:
|
178 |
+
print(
|
179 |
+
"Using torch_stft is deprecated and has been removed. The values have been forcibly set to False "
|
180 |
+
"for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True "
|
181 |
+
"as needed."
|
182 |
+
)
|
183 |
+
if exact_pad and n_window_stride % 2 == 1:
|
184 |
+
raise NotImplementedError(
|
185 |
+
f"{self} received exact_pad == True, but hop_size was odd. If audio_length % hop_size == 0. Then the "
|
186 |
+
"returned spectrogram would not be of length audio_length // hop_size. Please use an even hop_size."
|
187 |
+
)
|
188 |
+
self.log_zero_guard_value = log_zero_guard_value
|
189 |
+
if (
|
190 |
+
n_window_size is None
|
191 |
+
or n_window_stride is None
|
192 |
+
or not isinstance(n_window_size, int)
|
193 |
+
or not isinstance(n_window_stride, int)
|
194 |
+
or n_window_size <= 0
|
195 |
+
or n_window_stride <= 0
|
196 |
+
):
|
197 |
+
raise ValueError(
|
198 |
+
f"{self} got an invalid value for either n_window_size or "
|
199 |
+
f"n_window_stride. Both must be positive ints."
|
200 |
+
)
|
201 |
+
|
202 |
+
self.win_length = n_window_size
|
203 |
+
self.hop_length = n_window_stride
|
204 |
+
self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))
|
205 |
+
self.stft_pad_amount = (self.n_fft - self.hop_length) // 2 if exact_pad else None
|
206 |
+
self.exact_pad = exact_pad
|
207 |
+
|
208 |
+
if exact_pad:
|
209 |
+
print("STFT using exact pad")
|
210 |
+
torch_windows = {
|
211 |
+
'hann': torch.hann_window,
|
212 |
+
'hamming': torch.hamming_window,
|
213 |
+
'blackman': torch.blackman_window,
|
214 |
+
'bartlett': torch.bartlett_window,
|
215 |
+
'none': None,
|
216 |
+
}
|
217 |
+
window_fn = torch_windows.get(window, None)
|
218 |
+
window_tensor = window_fn(self.win_length, periodic=False) if window_fn else None
|
219 |
+
self.register_buffer("window", window_tensor)
|
220 |
+
|
221 |
+
self.normalize = normalize
|
222 |
+
self.log = log
|
223 |
+
self.dither = dither
|
224 |
+
self.frame_splicing = frame_splicing
|
225 |
+
self.nfilt = nfilt
|
226 |
+
self.preemph = preemph
|
227 |
+
self.pad_to = pad_to
|
228 |
+
highfreq = highfreq or sample_rate / 2
|
229 |
+
|
230 |
+
filterbanks = torch.tensor(
|
231 |
+
librosa.filters.mel(
|
232 |
+
sr=sample_rate, n_fft=self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq, norm=mel_norm
|
233 |
+
),
|
234 |
+
dtype=torch.float,
|
235 |
+
).unsqueeze(0)
|
236 |
+
self.register_buffer("fb", filterbanks)
|
237 |
+
|
238 |
+
# Calculate maximum sequence length
|
239 |
+
max_length = self.get_seq_len(torch.tensor(max_duration * sample_rate, dtype=torch.float))
|
240 |
+
max_pad = pad_to - (max_length % pad_to) if pad_to > 0 else 0
|
241 |
+
self.max_length = max_length + max_pad
|
242 |
+
self.pad_value = pad_value
|
243 |
+
self.mag_power = mag_power
|
244 |
+
|
245 |
+
# We want to avoid taking the log of zero
|
246 |
+
# There are two options: either adding or clamping to a small value
|
247 |
+
if log_zero_guard_type not in ["add", "clamp"]:
|
248 |
+
raise ValueError(
|
249 |
+
f"{self} received {log_zero_guard_type} for the "
|
250 |
+
f"log_zero_guard_type parameter. It must be either 'add' or "
|
251 |
+
f"'clamp'."
|
252 |
+
)
|
253 |
+
|
254 |
+
self.use_grads = use_grads
|
255 |
+
if not use_grads:
|
256 |
+
self.forward = torch.no_grad()(self.forward)
|
257 |
+
self._rng = random.Random() if rng is None else rng
|
258 |
+
self.nb_augmentation_prob = nb_augmentation_prob
|
259 |
+
if self.nb_augmentation_prob > 0.0:
|
260 |
+
if nb_max_freq >= sample_rate / 2:
|
261 |
+
self.nb_augmentation_prob = 0.0
|
262 |
+
else:
|
263 |
+
self._nb_max_fft_bin = int((nb_max_freq / sample_rate) * n_fft)
|
264 |
+
|
265 |
+
# log_zero_guard_value is the the small we want to use, we support
|
266 |
+
# an actual number, or "tiny", or "eps"
|
267 |
+
self.log_zero_guard_type = log_zero_guard_type
|
268 |
+
|
269 |
+
def stft(self, x):
|
270 |
+
return torch.stft(
|
271 |
+
x,
|
272 |
+
n_fft=self.n_fft,
|
273 |
+
hop_length=self.hop_length,
|
274 |
+
win_length=self.win_length,
|
275 |
+
center=False if self.exact_pad else True,
|
276 |
+
window=self.window.to(dtype=torch.float),
|
277 |
+
return_complex=True,
|
278 |
+
)
|
279 |
+
|
280 |
+
def log_zero_guard_value_fn(self, x):
|
281 |
+
if isinstance(self.log_zero_guard_value, str):
|
282 |
+
if self.log_zero_guard_value == "tiny":
|
283 |
+
return torch.finfo(x.dtype).tiny
|
284 |
+
elif self.log_zero_guard_value == "eps":
|
285 |
+
return torch.finfo(x.dtype).eps
|
286 |
+
else:
|
287 |
+
raise ValueError(
|
288 |
+
f"{self} received {self.log_zero_guard_value} for the "
|
289 |
+
f"log_zero_guard_type parameter. It must be either a "
|
290 |
+
f"number, 'tiny', or 'eps'"
|
291 |
+
)
|
292 |
+
else:
|
293 |
+
return self.log_zero_guard_value
|
294 |
+
|
295 |
+
def get_seq_len(self, seq_len):
|
296 |
+
# Assuming that center is True is stft_pad_amount = 0
|
297 |
+
pad_amount = self.stft_pad_amount * 2 if self.stft_pad_amount is not None else self.n_fft // 2 * 2
|
298 |
+
seq_len = torch.floor_divide((seq_len + pad_amount - self.n_fft), self.hop_length) + 1
|
299 |
+
return seq_len.to(dtype=torch.long)
|
300 |
+
|
301 |
+
@property
|
302 |
+
def filter_banks(self):
|
303 |
+
return self.fb
|
304 |
+
|
305 |
+
def forward(self, x, seq_len, linear_spec=False):
|
306 |
+
seq_len = self.get_seq_len(seq_len)
|
307 |
+
|
308 |
+
if self.stft_pad_amount is not None:
|
309 |
+
x = torch.nn.functional.pad(
|
310 |
+
x.unsqueeze(1), (self.stft_pad_amount, self.stft_pad_amount), "reflect"
|
311 |
+
).squeeze(1)
|
312 |
+
|
313 |
+
# dither (only in training mode for eval determinism)
|
314 |
+
if self.training and self.dither > 0:
|
315 |
+
x += self.dither * torch.randn_like(x)
|
316 |
+
|
317 |
+
# do preemphasis
|
318 |
+
if self.preemph is not None:
|
319 |
+
x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1)
|
320 |
+
|
321 |
+
# disable autocast to get full range of stft values
|
322 |
+
with torch.amp.autocast(x.device.type, enabled=False):
|
323 |
+
x = self.stft(x)
|
324 |
+
|
325 |
+
# torch stft returns complex tensor (of shape [B,N,T]); so convert to magnitude
|
326 |
+
# guard is needed for sqrt if grads are passed through
|
327 |
+
guard = 0 if not self.use_grads else CONSTANT
|
328 |
+
x = torch.view_as_real(x)
|
329 |
+
x = torch.sqrt(x.pow(2).sum(-1) + guard)
|
330 |
+
|
331 |
+
if self.training and self.nb_augmentation_prob > 0.0:
|
332 |
+
for idx in range(x.shape[0]):
|
333 |
+
if self._rng.random() < self.nb_augmentation_prob:
|
334 |
+
x[idx, self._nb_max_fft_bin :, :] = 0.0
|
335 |
+
|
336 |
+
# get power spectrum
|
337 |
+
if self.mag_power != 1.0:
|
338 |
+
x = x.pow(self.mag_power)
|
339 |
+
|
340 |
+
# return plain spectrogram if required
|
341 |
+
if linear_spec:
|
342 |
+
return x, seq_len
|
343 |
+
|
344 |
+
# dot with filterbank energies
|
345 |
+
x = torch.matmul(self.fb.to(x.dtype), x)
|
346 |
+
# log features if required
|
347 |
+
if self.log:
|
348 |
+
if self.log_zero_guard_type == "add":
|
349 |
+
x = torch.log(x + self.log_zero_guard_value_fn(x))
|
350 |
+
elif self.log_zero_guard_type == "clamp":
|
351 |
+
x = torch.log(torch.clamp(x, min=self.log_zero_guard_value_fn(x)))
|
352 |
+
else:
|
353 |
+
raise ValueError("log_zero_guard_type was not understood")
|
354 |
+
|
355 |
+
# frame splicing if required
|
356 |
+
if self.frame_splicing > 1:
|
357 |
+
x = splice_frames(x, self.frame_splicing)
|
358 |
+
|
359 |
+
# normalize if required
|
360 |
+
if self.normalize:
|
361 |
+
x, _, _ = normalize_batch(x, seq_len, normalize_type=self.normalize)
|
362 |
+
|
363 |
+
# mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency)
|
364 |
+
max_len = x.size(-1)
|
365 |
+
mask = torch.arange(max_len, device=x.device)
|
366 |
+
mask = mask.repeat(x.size(0), 1) >= seq_len.unsqueeze(1)
|
367 |
+
x = x.masked_fill(mask.unsqueeze(1).type(torch.bool).to(device=x.device), self.pad_value)
|
368 |
+
del mask
|
369 |
+
pad_to = self.pad_to
|
370 |
+
if pad_to == "max":
|
371 |
+
x = nn.functional.pad(x, (0, self.max_length - x.size(-1)), value=self.pad_value)
|
372 |
+
elif pad_to > 0:
|
373 |
+
pad_amt = x.size(-1) % pad_to
|
374 |
+
if pad_amt != 0:
|
375 |
+
x = nn.functional.pad(x, (0, pad_to - pad_amt), value=self.pad_value)
|
376 |
+
return x, seq_len
|
377 |
+
|
378 |
+
|
379 |
+
class FilterbankFeaturesTA(nn.Module):
|
380 |
+
"""
|
381 |
+
Exportable, `torchaudio`-based implementation of Mel Spectrogram extraction.
|
382 |
+
|
383 |
+
See `AudioToMelSpectrogramPreprocessor` for args.
|
384 |
+
|
385 |
+
"""
|
386 |
+
|
387 |
+
def __init__(
|
388 |
+
self,
|
389 |
+
sample_rate: int = 16000,
|
390 |
+
n_window_size: int = 320,
|
391 |
+
n_window_stride: int = 160,
|
392 |
+
normalize: Optional[str] = "per_feature",
|
393 |
+
nfilt: int = 64,
|
394 |
+
n_fft: Optional[int] = None,
|
395 |
+
preemph: float = 0.97,
|
396 |
+
lowfreq: float = 0,
|
397 |
+
highfreq: Optional[float] = None,
|
398 |
+
log: bool = True,
|
399 |
+
log_zero_guard_type: str = "add",
|
400 |
+
log_zero_guard_value: Union[float, str] = 2**-24,
|
401 |
+
dither: float = 1e-5,
|
402 |
+
window: str = "hann",
|
403 |
+
pad_to: int = 0,
|
404 |
+
pad_value: float = 0.0,
|
405 |
+
mel_norm="slaney",
|
406 |
+
# Seems like no one uses these options anymore. Don't convolute the code by supporting thm.
|
407 |
+
use_grads: bool = False, # Deprecated arguments; kept for config compatibility
|
408 |
+
max_duration: float = 16.7, # Deprecated arguments; kept for config compatibility
|
409 |
+
frame_splicing: int = 1, # Deprecated arguments; kept for config compatibility
|
410 |
+
exact_pad: bool = False, # Deprecated arguments; kept for config compatibility
|
411 |
+
nb_augmentation_prob: float = 0.0, # Deprecated arguments; kept for config compatibility
|
412 |
+
nb_max_freq: int = 4000, # Deprecated arguments; kept for config compatibility
|
413 |
+
mag_power: float = 2.0, # Deprecated arguments; kept for config compatibility
|
414 |
+
rng: Optional[random.Random] = None, # Deprecated arguments; kept for config compatibility
|
415 |
+
stft_exact_pad: bool = False, # Deprecated arguments; kept for config compatibility
|
416 |
+
stft_conv: bool = False, # Deprecated arguments; kept for config compatibility
|
417 |
+
):
|
418 |
+
super().__init__()
|
419 |
+
if not HAVE_TORCHAUDIO:
|
420 |
+
raise ValueError(f"Need to install torchaudio to instantiate a {self.__class__.__name__}")
|
421 |
+
|
422 |
+
# Make sure log zero guard is supported, if given as a string
|
423 |
+
supported_log_zero_guard_strings = {"eps", "tiny"}
|
424 |
+
if isinstance(log_zero_guard_value, str) and log_zero_guard_value not in supported_log_zero_guard_strings:
|
425 |
+
raise ValueError(
|
426 |
+
f"Log zero guard value must either be a float or a member of {supported_log_zero_guard_strings}"
|
427 |
+
)
|
428 |
+
|
429 |
+
# Copied from `AudioPreprocessor` due to the ad-hoc structuring of the Mel Spec extractor class
|
430 |
+
self.torch_windows = {
|
431 |
+
'hann': torch.hann_window,
|
432 |
+
'hamming': torch.hamming_window,
|
433 |
+
'blackman': torch.blackman_window,
|
434 |
+
'bartlett': torch.bartlett_window,
|
435 |
+
'ones': torch.ones,
|
436 |
+
None: torch.ones,
|
437 |
+
}
|
438 |
+
|
439 |
+
# Ensure we can look up the window function
|
440 |
+
if window not in self.torch_windows:
|
441 |
+
raise ValueError(f"Got window value '{window}' but expected a member of {self.torch_windows.keys()}")
|
442 |
+
|
443 |
+
self.win_length = n_window_size
|
444 |
+
self.hop_length = n_window_stride
|
445 |
+
self._sample_rate = sample_rate
|
446 |
+
self._normalize_strategy = normalize
|
447 |
+
self._use_log = log
|
448 |
+
self._preemphasis_value = preemph
|
449 |
+
self.log_zero_guard_type = log_zero_guard_type
|
450 |
+
self.log_zero_guard_value: Union[str, float] = log_zero_guard_value
|
451 |
+
self.dither = dither
|
452 |
+
self.pad_to = pad_to
|
453 |
+
self.pad_value = pad_value
|
454 |
+
self.n_fft = n_fft
|
455 |
+
self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = torchaudio.transforms.MelSpectrogram(
|
456 |
+
sample_rate=self._sample_rate,
|
457 |
+
win_length=self.win_length,
|
458 |
+
hop_length=self.hop_length,
|
459 |
+
n_mels=nfilt,
|
460 |
+
window_fn=self.torch_windows[window],
|
461 |
+
mel_scale="slaney",
|
462 |
+
norm=mel_norm,
|
463 |
+
n_fft=n_fft,
|
464 |
+
f_max=highfreq,
|
465 |
+
f_min=lowfreq,
|
466 |
+
wkwargs={"periodic": False},
|
467 |
+
)
|
468 |
+
|
469 |
+
@property
|
470 |
+
def filter_banks(self):
|
471 |
+
"""Matches the analogous class"""
|
472 |
+
return self._mel_spec_extractor.mel_scale.fb
|
473 |
+
|
474 |
+
def _resolve_log_zero_guard_value(self, dtype: torch.dtype) -> float:
|
475 |
+
if isinstance(self.log_zero_guard_value, float):
|
476 |
+
return self.log_zero_guard_value
|
477 |
+
return getattr(torch.finfo(dtype), self.log_zero_guard_value)
|
478 |
+
|
479 |
+
def _apply_dithering(self, signals: torch.Tensor) -> torch.Tensor:
|
480 |
+
if self.training and self.dither > 0.0:
|
481 |
+
noise = torch.randn_like(signals) * self.dither
|
482 |
+
signals = signals + noise
|
483 |
+
return signals
|
484 |
+
|
485 |
+
def _apply_preemphasis(self, signals: torch.Tensor) -> torch.Tensor:
|
        if self._preemphasis_value is not None:
            padded = torch.nn.functional.pad(signals, (1, 0))
            signals = signals - self._preemphasis_value * padded[:, :-1]
        return signals

    def _compute_output_lengths(self, input_lengths: torch.Tensor) -> torch.Tensor:
        out_lengths = input_lengths.div(self.hop_length, rounding_mode="floor").add(1).long()
        return out_lengths

    def _apply_pad_to(self, features: torch.Tensor) -> torch.Tensor:
        # Only apply during training; else need to capture dynamic shape for exported models
        if not self.training or self.pad_to == 0 or features.shape[-1] % self.pad_to == 0:
            return features
        pad_length = self.pad_to - (features.shape[-1] % self.pad_to)
        return torch.nn.functional.pad(features, pad=(0, pad_length), value=self.pad_value)

    def _apply_log(self, features: torch.Tensor) -> torch.Tensor:
        if self._use_log:
            zero_guard = self._resolve_log_zero_guard_value(features.dtype)
            if self.log_zero_guard_type == "add":
                features = features + zero_guard
            elif self.log_zero_guard_type == "clamp":
                features = features.clamp(min=zero_guard)
            else:
                raise ValueError(f"Unsupported log zero guard type: '{self.log_zero_guard_type}'")
            features = features.log()
        return features

    def _extract_spectrograms(self, signals: torch.Tensor) -> torch.Tensor:
        # Complex FFT needs to be done in single precision
        with torch.amp.autocast('cuda', enabled=False):
            features = self._mel_spec_extractor(waveform=signals)
        return features

    def _apply_normalization(self, features: torch.Tensor, lengths: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
        # For consistency, this function always does a masked fill even if not normalizing.
        mask: torch.Tensor = make_seq_mask_like(lengths=lengths, like=features, time_dim=-1, valid_ones=False)
        features = features.masked_fill(mask, 0.0)
        # Maybe don't normalize
        if self._normalize_strategy is None:
            return features
        # Use the log zero guard for the sqrt zero guard
        guard_value = self._resolve_log_zero_guard_value(features.dtype)
        if self._normalize_strategy == "per_feature" or self._normalize_strategy == "all_features":
            # 'all_features' reduces over each sample; 'per_feature' reduces over each channel
            reduce_dim = 2
            if self._normalize_strategy == "all_features":
                reduce_dim = [1, 2]
            # [B, D, T] -> [B, D, 1] or [B, 1, 1]
            means = features.sum(dim=reduce_dim, keepdim=True).div(lengths.view(-1, 1, 1))
            stds = (
                features.sub(means)
                .masked_fill(mask, 0.0)
                .pow(2.0)
                .sum(dim=reduce_dim, keepdim=True)  # [B, D, T] -> [B, D, 1] or [B, 1, 1]
                .div(lengths.view(-1, 1, 1) - 1)  # divide by N - 1: unbiased estimator
                .clamp(min=guard_value)  # avoid sqrt(0)
                .sqrt()
            )
            features = (features - means) / (stds + eps)
        else:
            # Deprecating constant std/mean
            raise ValueError(f"Unsupported norm type: '{self._normalize_strategy}'")
        features = features.masked_fill(mask, 0.0)
        return features

    def forward(self, input_signal: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        feature_lengths = self._compute_output_lengths(input_lengths=length)
        signals = self._apply_dithering(signals=input_signal)
        signals = self._apply_preemphasis(signals=signals)
        features = self._extract_spectrograms(signals=signals)
        features = self._apply_log(features=features)
        features = self._apply_normalization(features=features, lengths=feature_lengths)
        features = self._apply_pad_to(features=features)
        return features, feature_lengths
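A minimal sketch of driving this preprocessor end to end; `mel_cfg` stands for the dict this repo stores under `mel_spectrogram_config` in config.json (the constructor arguments themselves are defined earlier in this file), and the shapes are illustrative:

```python
import torch

# Assumption: mel_cfg mirrors config.json's "mel_spectrogram_config" entry.
preprocessor = AudioToMelSpectrogramPreprocessor(**mel_cfg)
preprocessor.eval()

waveforms = torch.randn(2, 16000)        # [B, T] raw audio samples
lengths = torch.tensor([16000, 12000])   # valid samples per batch element

with torch.no_grad():
    features, feature_lengths = preprocessor(input_signal=waveforms, length=lengths)

# features: [B, n_mels, T'] log-mel features, zero-filled past each valid length
# feature_lengths: [B], computed as floor(length / hop_length) + 1
```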
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:4da89b0b6d405974f1e332bdc9945fae76222d7ddf0f955653fba9a00cca0339
 size 26039912
modeling_ecapa_tdnn.py
ADDED
@@ -0,0 +1,150 @@
from dataclasses import dataclass
from typing import Optional, Union, Tuple

import torch
import torch.nn as nn

from transformers import PreTrainedModel
from transformers.utils import ModelOutput

from .configuration_ecapa_tdnn import EcapaTdnnConfig
from .audio_processing import AudioToMelSpectrogramPreprocessor
from .audio_processing import SpectrogramAugmentation
from .conv_asr import EcapaTdnnEncoder, SpeakerDecoder
from .angular_loss import AdditiveMarginSoftmaxLoss, AdditiveAngularMarginSoftmaxLoss


@dataclass
class EcapaTdnnBaseModelOutput(ModelOutput):

    encoder_outputs: torch.FloatTensor = None
    extract_features: torch.FloatTensor = None
    output_lengths: torch.FloatTensor = None


@dataclass
class EcapaTdnnSequenceClassifierOutput(ModelOutput):

    loss: torch.FloatTensor = None
    logits: torch.FloatTensor = None
    embeddings: torch.FloatTensor = None


class EcapaTdnnPreTrainedModel(PreTrainedModel):

    config_class = EcapaTdnnConfig
    base_model_prefix = "ecapa_tdnn"
    main_input_name = "input_values"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Conv2d):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
            nn.init.constant_(module.weight, 1)
            nn.init.constant_(module.bias, 0)

    @property
    def num_weights(self):
        """
        Utility property that returns the total number of trainable parameters of the module.
        """
        return self._num_weights()

    @torch.jit.ignore
    def _num_weights(self):
        num: int = 0
        for p in self.parameters():
            if p.requires_grad:
                num += p.numel()
        return num


class EcapaTdnnModel(EcapaTdnnPreTrainedModel):

    def __init__(self, config: EcapaTdnnConfig):
        super().__init__(config)
        self.config = config

        self.preprocessor = AudioToMelSpectrogramPreprocessor(**config.mel_spectrogram_config)
        self.spec_augment = SpectrogramAugmentation(**config.spectrogram_augmentation_config)
        self.encoder = EcapaTdnnEncoder(**config.encoder_config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, EcapaTdnnBaseModelOutput]:
        if attention_mask is None:
            attention_mask = torch.ones_like(input_values).to(input_values)
        lengths = attention_mask.sum(dim=1).long()
        extract_features, output_lengths = self.preprocessor(input_values, lengths)
        if self.training:
            extract_features = self.spec_augment(extract_features, output_lengths)
        encoder_outputs, output_lengths = self.encoder(extract_features, output_lengths)

        return EcapaTdnnBaseModelOutput(
            encoder_outputs=encoder_outputs,
            extract_features=extract_features,
            output_lengths=output_lengths,
        )


class EcapaTdnnForSequenceClassification(EcapaTdnnPreTrainedModel):

    def __init__(self, config: EcapaTdnnConfig):
        super().__init__(config)

        self.ecapa_tdnn = EcapaTdnnModel(config)
        self.classifier = SpeakerDecoder(**config.decoder_config)

        if config.objective == 'additive_angular_margin':
            self.loss_fct = AdditiveAngularMarginSoftmaxLoss(**config.objective_config)
        elif config.objective == 'additive_margin':
            self.loss_fct = AdditiveMarginSoftmaxLoss(**config.objective_config)
        elif config.objective == 'cross_entropy':
            self.loss_fct = nn.CrossEntropyLoss(**config.objective_config)

        self.init_weights()

    def freeze_base_model(self):
        for param in self.ecapa_tdnn.parameters():
            param.requires_grad = False

    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, EcapaTdnnSequenceClassifierOutput]:
        ecapa_tdnn_outputs = self.ecapa_tdnn(
            input_values,
            attention_mask,
        )
        logits, output_embeddings = self.classifier(
            ecapa_tdnn_outputs.encoder_outputs,
            ecapa_tdnn_outputs.output_lengths
        )
        logits = logits.view(-1, self.config.num_labels)

        loss = None
        if labels is not None:
            loss = self.loss_fct(logits, labels.view(-1))

        return EcapaTdnnSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            embeddings=output_embeddings,
        )
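A minimal inference sketch for the classifier as wired above; `config` is assumed to be an `EcapaTdnnConfig` loaded from this repo's config.json (for example via `AutoConfig.from_pretrained(..., trust_remote_code=True)`):

```python
import torch

# Assumption: `config` carries the mel/encoder/decoder/objective sub-configs.
model = EcapaTdnnForSequenceClassification(config)
model.eval()

waveforms = torch.randn(2, 16000)   # [B, T] raw 16 kHz audio
labels = torch.tensor([3, 7])       # optional target class ids

with torch.no_grad():
    out = model(input_values=waveforms, labels=labels)

out.logits.shape   # [B, config.num_labels]
out.embeddings     # speaker embeddings returned by SpeakerDecoder
out.loss           # populated because labels were provided
```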
module.py
ADDED
@@ -0,0 +1,105 @@
import torch
import torch.nn as nn


class NeuralModule(nn.Module):

    @property
    def num_weights(self):
        """
        Utility property that returns the total number of parameters of NeuralModule.
        """
        return self._num_weights()

    @torch.jit.ignore
    def _num_weights(self):
        num: int = 0
        for p in self.parameters():
            if p.requires_grad:
                num += p.numel()
        return num

    def freeze(self) -> None:
        r"""
        Freeze all params for inference.

        This method sets `requires_grad` to False for all parameters of the module.
        It also stores the original `requires_grad` state of each parameter in a dictionary,
        so that `unfreeze()` can restore the original state if `partial=True` is set in `unfreeze()`.
        """
        grad_map = {}

        for pname, param in self.named_parameters():
            # Store the original grad state
            grad_map[pname] = param.requires_grad
            # Freeze the parameter
            param.requires_grad = False

        # Store the frozen grad map
        if not hasattr(self, '_frozen_grad_map'):
            self._frozen_grad_map = grad_map
        else:
            self._frozen_grad_map.update(grad_map)

        self.eval()

    def unfreeze(self, partial: bool = False) -> None:
        """
        Unfreeze all parameters for training.

        Allows for either a total unfreeze or a partial unfreeze (if the module was explicitly frozen previously with
        `freeze()`). The `partial` argument determines whether to unfreeze all parameters or only the parameters that
        were trainable prior to the call to `freeze()`.

        Example:
            Consider a model that has an encoder and a decoder module. Assume we want the encoder to be frozen always.

            ```python
            model.encoder.freeze()  # Freezes all parameters in the encoder explicitly
            ```

            During inference, all parameters of the model should be frozen - we do this by calling the model's freeze
            method. This step records that the encoder module parameters were already frozen, so if partial unfreeze
            is called later, we should keep the encoder parameters frozen.

            ```python
            model.freeze()  # Freezes all parameters in the model; encoder remains frozen
            ```

            Now, during fine-tuning, we want to unfreeze the decoder but keep the encoder frozen. We can do this by
            calling `unfreeze(partial=True)`.

            ```python
            model.unfreeze(partial=True)  # Unfreezes only the decoder; encoder remains frozen
            ```

        Args:
            partial: If True, restore the `requires_grad` state recorded by `freeze()`: parameters that were trainable
                before `freeze()` are unfrozen, while parameters that were already frozen when `freeze()` was called
                remain frozen.
        """
        if partial and not hasattr(self, '_frozen_grad_map'):
            raise ValueError("Cannot unfreeze partially without first freezing the module with `freeze()`")

        for pname, param in self.named_parameters():
            if not partial:
                # Unfreeze all parameters
                param.requires_grad = True
            else:
                # Restore only the requires_grad states recorded by `freeze()`
                if pname in self._frozen_grad_map:
                    param.requires_grad = self._frozen_grad_map[pname]
                else:
                    # Warn if the parameter was not found in the frozen grad map
                    print(
                        f"Parameter {pname} not found in list of previously frozen parameters. "
                        f"Unfreezing this parameter."
                    )
                    param.requires_grad = True

        # Clean up the frozen grad map
        if hasattr(self, '_frozen_grad_map'):
            delattr(self, '_frozen_grad_map')

        self.train()
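A runnable, self-contained version of the freeze/unfreeze pattern the docstring describes; `ToyModel` is made up for illustration, and `requires_grad_(False)` stands in for a submodule-level `freeze()` since plain `nn.Linear` layers don't have one:

```python
import torch.nn as nn

class ToyModel(NeuralModule):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(4, 4)   # should stay frozen
        self.decoder = nn.Linear(4, 2)   # should become trainable again

model = ToyModel()
model.encoder.requires_grad_(False)  # encoder frozen before the global freeze
model.freeze()                       # records that only the decoder was trainable
model.unfreeze(partial=True)         # restores the recorded state

assert not model.encoder.weight.requires_grad  # still frozen
assert model.decoder.weight.requires_grad      # trainable again
```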
spectrogram_augment.py
ADDED
@@ -0,0 +1,223 @@
import random
from typing import Union

import numpy as np

import torch
import torch.nn as nn


class SpecAugment(nn.Module):
    """
    Zeroes out (cuts) random continuous horizontal or
    vertical segments of the spectrogram as described in
    SpecAugment (https://arxiv.org/abs/1904.08779).

    params:
    freq_masks - how many frequency segments should be cut
    time_masks - how many time segments should be cut
    freq_width - maximum number of frequencies to be cut in one segment
    time_width - maximum number of time steps to be cut in one segment.
        Can be a positive integer or a float value in the range [0, 1].
        If a positive integer, defines the maximum number of time steps
        to be cut in one segment.
        If a float value, defines the maximum percentage of timesteps that
        are cut adaptively.
    use_vectorized_code - GPU-based implementation with batched masking and GPU rng,
        setting it to False reverts to the legacy implementation.
    Fast implementation is inspired by torchaudio:
    https://github.com/pytorch/audio/blob/ea437b31ce316ea3d66fe73768c0dcb94edb79ad/src/torchaudio/functional/functional.py#L816
    """

    FREQ_AXIS = 1  # Frequency axis in the spectrogram tensor
    TIME_AXIS = 2  # Time axis in the spectrogram tensor

    def __init__(
        self,
        freq_masks: int = 0,
        time_masks: int = 0,
        freq_width: int = 10,
        time_width: Union[int, float] = 10,
        rng: random.Random = None,
        mask_value: float = 0.0,
        use_vectorized_code: bool = True,
    ):
        super().__init__()

        self._rng = random.Random() if rng is None else rng

        self.freq_masks = freq_masks
        self.time_masks = time_masks

        self.freq_width = freq_width
        self.time_width = time_width

        self.mask_value = mask_value
        self.use_vectorized_code = use_vectorized_code

        if isinstance(time_width, int):
            self.adaptive_temporal_width = False
        else:
            if time_width > 1.0 or time_width < 0.0:
                raise ValueError("If `time_width` is a float value, must be in range [0, 1]")

            self.adaptive_temporal_width = True

    @torch.no_grad()
    def forward(self, input_spec, length):
        if self.use_vectorized_code:
            return self._forward_vectorized(input_spec, length)
        else:
            return self._forward_legacy(input_spec, length)

    def _forward_legacy(self, input_spec, length):
        batch_size, num_freq_bins, _ = input_spec.shape
        # Move lengths to CPU before repeated indexing
        lengths_cpu = length.cpu().numpy()
        # Generate a numpy boolean mask. `True` elements represent where the input spec will be augmented.
        fill_mask: np.ndarray = np.full(shape=input_spec.shape, fill_value=False)
        freq_start_upper_bound = num_freq_bins - self.freq_width
        # Choose different mask ranges for each element of the batch
        for idx in range(batch_size):
            # Set freq masking
            for _ in range(self.freq_masks):
                start = self._rng.randint(0, freq_start_upper_bound)
                width = self._rng.randint(0, self.freq_width)
                fill_mask[idx, start : start + width, :] = True

            # Derive the time width, sometimes based on a percentage of the input length.
            if self.adaptive_temporal_width:
                time_max_width = max(1, int(lengths_cpu[idx] * self.time_width))
            else:
                time_max_width = self.time_width
            time_start_upper_bound = max(1, lengths_cpu[idx] - time_max_width)

            # Set time masking
            for _ in range(self.time_masks):
                start = self._rng.randint(0, time_start_upper_bound)
                width = self._rng.randint(0, time_max_width)
                fill_mask[idx, :, start : start + width] = True
        # Bring the mask to device and fill spec
        fill_mask = torch.from_numpy(fill_mask).to(input_spec.device)
        masked_spec = input_spec.masked_fill(mask=fill_mask, value=self.mask_value)
        return masked_spec

    def _forward_vectorized(self, input_spec: torch.Tensor, length: torch.Tensor) -> torch.Tensor:
        # time masks
        input_spec = self._apply_masks(
            input_spec=input_spec,
            num_masks=self.time_masks,
            length=length,
            width=self.time_width,
            axis=self.TIME_AXIS,
            mask_value=self.mask_value,
        )
        # freq masks
        input_spec = self._apply_masks(
            input_spec=input_spec,
            num_masks=self.freq_masks,
            length=length,
            width=self.freq_width,
            axis=self.FREQ_AXIS,
            mask_value=self.mask_value,
        )
        return input_spec

    def _apply_masks(
        self,
        input_spec: torch.Tensor,
        num_masks: int,
        length: torch.Tensor,
        width: Union[int, float],
        mask_value: float,
        axis: int,
    ) -> torch.Tensor:

        assert axis in (
            self.FREQ_AXIS,
            self.TIME_AXIS,
        ), f"Axis can only be equal to frequency ({self.FREQ_AXIS}) or time ({self.TIME_AXIS}). Received: {axis=}"
        assert not (
            isinstance(width, float) and axis == self.FREQ_AXIS
        ), "Float width supported only with time axis."

        batch_size = input_spec.shape[0]
        axis_length = input_spec.shape[axis]

        # If width is a float, it is transformed into a per-sample tensor of absolute widths
        if axis == self.TIME_AXIS and isinstance(width, float):
            width = torch.clamp(width * length, max=axis_length).unsqueeze(1)

        # Generate [0, 1) random numbers and then scale the tensors.
        # Use float32 dtype for begin/end mask markers before they are quantized to long.
        mask_width = torch.rand((batch_size, num_masks), device=input_spec.device, dtype=torch.float32) * width
        mask_width = mask_width.long()
        mask_start = torch.rand((batch_size, num_masks), device=input_spec.device, dtype=torch.float32)

        if axis == self.TIME_AXIS:
            # length can only be used for the time axis
            mask_start = mask_start * (length.unsqueeze(1) - mask_width)
        else:
            mask_start = mask_start * (axis_length - mask_width)

        mask_start = mask_start.long()
        mask_end = mask_start + mask_width

        # Create mask values using vectorized indexing
        indices = torch.arange(axis_length, device=input_spec.device)
        # Create a mask_tensor with all the indices.
        # The mask_tensor shape is (batch_size, num_masks, axis_length).
        mask_tensor = (indices >= mask_start.unsqueeze(-1)) & (indices < mask_end.unsqueeze(-1))

        # Reduce masks to one mask
        mask_tensor = mask_tensor.any(dim=1)

        # Create a final mask that aligns with the full tensor
        mask = torch.zeros_like(input_spec, dtype=torch.bool)
        if axis == self.TIME_AXIS:
            mask_ranges = mask_tensor[:, None, :]
        else:  # axis == self.FREQ_AXIS
            mask_ranges = mask_tensor[:, :, None]
        mask[:, :, :] = mask_ranges

        # Apply the mask value
        return input_spec.masked_fill(mask=mask, value=mask_value)


class SpecCutout(nn.Module):
    """
    Zeroes out (cuts) random rectangles in the spectrogram
    as described in (https://arxiv.org/abs/1708.04552).

    params:
    rect_masks - how many rectangular masks should be cut
    rect_freq - maximum size of cut rectangles along the frequency dimension
    rect_time - maximum size of cut rectangles along the time dimension
    """

    def __init__(self, rect_masks=0, rect_time=5, rect_freq=20, rng=None):
        super(SpecCutout, self).__init__()

        self._rng = random.Random() if rng is None else rng

        self.rect_masks = rect_masks
        self.rect_time = rect_time
        self.rect_freq = rect_freq

    @torch.no_grad()
    def forward(self, input_spec):
        sh = input_spec.shape

        for idx in range(sh[0]):
            for i in range(self.rect_masks):
                rect_x = self._rng.randint(0, sh[1] - self.rect_freq)
                rect_y = self._rng.randint(0, sh[2] - self.rect_time)

                w_x = self._rng.randint(0, self.rect_freq)
                w_y = self._rng.randint(0, self.rect_time)

                input_spec[idx, rect_x : rect_x + w_x, rect_y : rect_y + w_y] = 0.0

        return input_spec
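A minimal sketch exercising both code paths of `SpecAugment` on a dummy batch; all parameter values and shapes are illustrative:

```python
import torch

spec_augment = SpecAugment(freq_masks=2, time_masks=2, freq_width=5, time_width=0.05)

specs = torch.randn(4, 80, 300)               # [B, n_mels, T] mel spectrograms
lengths = torch.tensor([300, 250, 200, 150])  # valid frames per sample

augmented = spec_augment(specs, lengths)          # vectorized path (default)
spec_augment.use_vectorized_code = False
augmented_legacy = spec_augment(specs, lengths)   # legacy numpy-based path
```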
tdnn_attention.py
ADDED
@@ -0,0 +1,620 @@
import math
from typing import Optional

from numpy import inf

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import _calculate_correct_fan


class StatsPoolLayer(nn.Module):
    """Statistics and time average pooling (TAP) layer

    This computes mean and, optionally, standard deviation statistics across the time dimension.

    Args:
        feat_in: Input features with shape [B, D, T]
        pool_mode: Type of pool mode. Supported modes are 'xvector' (mean and standard deviation) and 'tap' (time
            average pooling, i.e., mean)
        eps: Epsilon, minimum value before taking the square root, when using 'xvector' mode.
        unbiased: Whether to use the unbiased estimator for the standard deviation when using 'xvector' mode. The
            default for torch.Tensor.std() is True.

    Returns:
        Pooled statistics with shape [B, D].

    Raises:
        ValueError if an unsupported pooling mode is specified.
    """

    def __init__(self, feat_in: int, pool_mode: str = 'xvector', eps: float = 1e-10, unbiased: bool = True):
        super().__init__()
        supported_modes = {"xvector", "tap"}
        if pool_mode not in supported_modes:
            raise ValueError(f"Pool mode must be one of {supported_modes}; got '{pool_mode}'")
        self.pool_mode = pool_mode
        self.feat_in = feat_in
        self.eps = eps
        self.unbiased = unbiased
        if self.pool_mode == 'xvector':
            # Mean + std
            self.feat_in *= 2

    def forward(self, encoder_output, length=None):
        if length is None:
            mean = encoder_output.mean(dim=-1)  # Time Axis
            if self.pool_mode == 'xvector':
                correction = 1 if self.unbiased else 0
                std = encoder_output.std(dim=-1, correction=correction).clamp(min=self.eps)
                pooled = torch.cat([mean, std], dim=-1)
            else:
                pooled = mean
        else:
            mask = make_seq_mask_like(like=encoder_output, lengths=length, valid_ones=False)
            encoder_output = encoder_output.masked_fill(mask, 0.0)
            # [B, D, T] -> [B, D]
            means = encoder_output.mean(dim=-1)
            # Re-scale: .mean() divided by the padded length T, so correct to the true length
            means = means * (encoder_output.shape[-1] / length).unsqueeze(-1)
            if self.pool_mode == "xvector":
                correction = 1 if self.unbiased else 0
                stds = (
                    encoder_output.sub(means.unsqueeze(-1))
                    .masked_fill(mask, 0.0)
                    .pow(2.0)
                    .sum(-1)  # [B, D, T] -> [B, D]
                    .div(length.view(-1, 1).sub(correction))
                    .clamp(min=self.eps)
                    .sqrt()
                )
                pooled = torch.cat((means, stds), dim=-1)
            else:
                pooled = means
        return pooled

class AttentivePoolLayer(nn.Module):
    """
    Attention pooling layer for pooling speaker embeddings
    Reference: ECAPA-TDNN Embeddings for Speaker Diarization (https://arxiv.org/pdf/2104.01466.pdf)
    inputs:
        inp_filters: input feature channel length from encoder
        attention_channels: intermediate attention channel size
        kernel_size: kernel_size for TDNN and attention conv1d layers (default: 1)
        dilation: dilation size for TDNN and attention conv1d layers (default: 1)
    """

    def __init__(
        self,
        inp_filters: int,
        attention_channels: int = 128,
        kernel_size: int = 1,
        dilation: int = 1,
        eps: float = 1e-10,
    ):
        super().__init__()

        self.feat_in = 2 * inp_filters

        self.attention_layer = nn.Sequential(
            TdnnModule(inp_filters * 3, attention_channels, kernel_size=kernel_size, dilation=dilation),
            nn.Tanh(),
            nn.Conv1d(
                in_channels=attention_channels,
                out_channels=inp_filters,
                kernel_size=kernel_size,
                dilation=dilation,
            ),
        )
        self.eps = eps

    def forward(self, x, length=None):
        max_len = x.size(2)

        if length is None:
            # No lengths given: treat every frame of every sample as valid
            length = torch.full((x.shape[0],), max_len, device=x.device)

        mask, num_values = lens_to_mask(length, max_len=max_len, device=x.device)

        # encoder statistics
        mean, std = get_statistics_with_mask(x, mask / num_values)
        mean = mean.unsqueeze(2).repeat(1, 1, max_len)
        std = std.unsqueeze(2).repeat(1, 1, max_len)
        attn = torch.cat([x, mean, std], dim=1)

        # attention statistics
        attn = self.attention_layer(attn)  # attention pass
        attn = attn.masked_fill(mask == 0, -inf)
        alpha = F.softmax(attn, dim=2)  # attention values, α
        mu, sg = get_statistics_with_mask(x, alpha)  # attention-weighted mean µ and std σ

        # gather
        return torch.cat((mu, sg), dim=1).unsqueeze(2)

class TdnnModule(nn.Module):
    """
    Time Delayed Neural Module (TDNN) - 1D
    input:
        inp_filters: input filter channels for conv layer
        out_filters: output filter channels for conv layer
        kernel_size: kernel weight size for conv layer
        dilation: dilation for conv layer
        stride: stride for conv layer
        padding: padding for conv layer (default None: chooses padding value such that input and output feature shapes match)
    output:
        tdnn layer output
    """

    def __init__(
        self,
        inp_filters: int,
        out_filters: int,
        kernel_size: int = 1,
        dilation: int = 1,
        stride: int = 1,
        groups: int = 1,
        padding: int = None,
    ):
        super().__init__()
        if padding is None:
            padding = get_same_padding(kernel_size, stride=stride, dilation=dilation)

        self.conv_layer = nn.Conv1d(
            in_channels=inp_filters,
            out_channels=out_filters,
            kernel_size=kernel_size,
            dilation=dilation,
            groups=groups,
            padding=padding,
        )

        self.activation = nn.ReLU()
        self.bn = nn.BatchNorm1d(out_filters)

    def forward(self, x, length=None):
        x = self.conv_layer(x)
        x = self.activation(x)
        return self.bn(x)


class MaskedSEModule(nn.Module):
    """
    Squeeze and Excite module implementation with conv1d layers
    input:
        inp_filters: input filter channel size
        se_filters: intermediate squeeze and excite channel output and input size
        out_filters: output filter channel size
        kernel_size: kernel_size for both conv1d layers
        dilation: dilation size for both conv1d layers

    output:
        squeeze and excite layer output
    """

    def __init__(self, inp_filters: int, se_filters: int, out_filters: int, kernel_size: int = 1, dilation: int = 1):
        super().__init__()
        self.se_layer = nn.Sequential(
            nn.Conv1d(
                inp_filters,
                se_filters,
                kernel_size=kernel_size,
                dilation=dilation,
            ),
            nn.ReLU(),
            nn.BatchNorm1d(se_filters),
            nn.Conv1d(
                se_filters,
                out_filters,
                kernel_size=kernel_size,
                dilation=dilation,
            ),
            nn.Sigmoid(),
        )

    def forward(self, input, length=None):
        if length is None:
            x = torch.mean(input, dim=2, keepdim=True)
        else:
            max_len = input.size(2)
            mask, num_values = lens_to_mask(length, max_len=max_len, device=input.device)
            x = torch.sum(input * mask, dim=2, keepdim=True) / num_values

        out = self.se_layer(x)
        return out * input
class TdnnSeModule(nn.Module):
    """
    Modified SE-TDNN group module block from the ECAPA implementation, for faster training and inference
    Reference: ECAPA-TDNN Embeddings for Speaker Diarization (https://arxiv.org/pdf/2104.01466.pdf)
    inputs:
        inp_filters: input filter channel size
        out_filters: output filter channel size
        group_scale: scale value to group wider conv channels (default: 8)
        se_channels: squeeze and excite output channel size (default: 1024/8 = 128)
        kernel_size: kernel_size for group conv1d layers (default: 1)
        dilation: dilation size for group conv1d layers (default: 1)
    """

    def __init__(
        self,
        inp_filters: int,
        out_filters: int,
        group_scale: int = 8,
        se_channels: int = 128,
        kernel_size: int = 1,
        dilation: int = 1,
        init_mode: str = 'xavier_uniform',
    ):
        super().__init__()
        self.out_filters = out_filters
        padding_val = get_same_padding(kernel_size=kernel_size, dilation=dilation, stride=1)

        group_conv = nn.Conv1d(
            out_filters,
            out_filters,
            kernel_size=kernel_size,
            dilation=dilation,
            padding=padding_val,
            groups=group_scale,
        )
        self.group_tdnn_block = nn.Sequential(
            TdnnModule(inp_filters, out_filters, kernel_size=1, dilation=1),
            group_conv,
            nn.ReLU(),
            nn.BatchNorm1d(out_filters),
            TdnnModule(out_filters, out_filters, kernel_size=1, dilation=1),
        )

        self.se_layer = MaskedSEModule(out_filters, se_channels, out_filters)

        self.apply(lambda x: init_weights(x, mode=init_mode))

    def forward(self, input, length=None):
        x = self.group_tdnn_block(input)
        x = self.se_layer(x, length)
        return x + input


class Res2NetBlock(nn.Module):
    """
    Res2Net module that splits input channels into groups and processes them separately before merging.
    This allows multi-scale feature extraction.
    """

    def __init__(self, in_channels, out_channels, scale=4, kernel_size=1, dilation=1):
        super().__init__()
        assert in_channels % scale == 0, "in_channels must be divisible by scale"

        self.scale = scale
        self.width = in_channels // scale  # Number of channels per group

        self.convs = nn.ModuleList([
            nn.Conv1d(self.width, self.width, kernel_size=kernel_size, dilation=dilation, padding=dilation, bias=False)
            for _ in range(scale - 1)
        ])
        self.bn = nn.BatchNorm1d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, x):
        """
        x: [B, C, T]
        """
        splits = torch.split(x, self.width, dim=1)
        outputs = [splits[0]]  # First part remains unchanged

        for i in range(1, self.scale):
            conv_out = self.convs[i - 1](splits[i])  # Apply convolution on each group
            outputs.append(conv_out + outputs[i - 1])  # Hierarchical aggregation

        out = torch.cat(outputs, dim=1)  # Merge groups
        return self.activation(self.bn(out))


class TdnnSeRes2NetModule(nn.Module):
    """
    SE-TDNN module with Res2Net for ECAPA-TDNN.
    """

    def __init__(
        self,
        inp_filters: int,
        out_filters: int,
        group_scale: int = 1,
        se_channels: int = 128,
        kernel_size: int = 1,
        dilation: int = 1,
        res2net_scale: int = 8,  # New Res2Net parameter
    ):
        super().__init__()

        # First TDNN layer
        self.tdnn1 = TdnnModule(inp_filters, out_filters, kernel_size=1, dilation=1, groups=group_scale)

        # Res2Net block replaces the grouped TDNN
        self.res2net = Res2NetBlock(out_filters, out_filters, scale=res2net_scale, kernel_size=kernel_size, dilation=dilation)

        # Squeeze-and-Excite module
        self.se_layer = MaskedSEModule(out_filters, se_channels, out_filters)

    def forward(self, x, length=None):
        residual = x
        x = self.tdnn1(x)
        x = self.res2net(x)  # Apply Res2Net block
        x = self.se_layer(x, length)
        return x + residual  # Residual connection
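A short shape trace of the Res2Net split-and-merge path above, with illustrative sizes. Note that this variant adds the previous group's output after its convolution, whereas the original Res2Net paper feeds it into the convolution:

```python
import torch

block = Res2NetBlock(in_channels=512, out_channels=512, scale=8, kernel_size=3, dilation=2)

x = torch.randn(4, 512, 300)   # [B, C, T]; C splits into 8 groups of 64 channels
y = block(x)

print(block.width)  # 64: channels per group
print(y.shape)      # torch.Size([4, 512, 300]); groups are re-merged on dim 1
```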
class MaskedConv1d(nn.Module):

    __constants__ = ["use_conv_mask", "real_out_channels", "heads"]

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        heads=-1,
        bias=False,
        use_mask=True,
        quantize=False,
    ):
        super(MaskedConv1d, self).__init__()

        if not (heads == -1 or groups == in_channels):
            raise ValueError("Only use heads for depthwise convolutions")

        self.real_out_channels = out_channels
        if heads != -1:
            in_channels = heads
            out_channels = heads
            groups = heads

        # preserve original padding
        self._padding = padding

        # if padding is a tuple/list, it is considered as asymmetric padding
        if type(padding) in (tuple, list):
            self.pad_layer = nn.ConstantPad1d(padding, value=0.0)
            # reset padding for conv since pad_layer will handle this
            padding = 0
        else:
            self.pad_layer = None

        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )
        self.use_mask = use_mask
        self.heads = heads

        # Calculations for "same" padding cache
        self.same_padding = (self.conv.stride[0] == 1) and (
            2 * self.conv.padding[0] == self.conv.dilation[0] * (self.conv.kernel_size[0] - 1)
        )
        if self.pad_layer is None:
            self.same_padding_asymmetric = False
        else:
            self.same_padding_asymmetric = (self.conv.stride[0] == 1) and (
                sum(self._padding) == self.conv.dilation[0] * (self.conv.kernel_size[0] - 1)
            )

        # `self.lens` caches consecutive integers from 0 to `self.max_len` that are used to compute the mask for a
        # batch. Recomputed to bigger size as needed. Stored on a device of the latest batch lens.
        if self.use_mask:
            self.max_len = torch.tensor(0)
            self.lens = torch.tensor(0)

    def get_seq_len(self, lens):
        if self.same_padding or self.same_padding_asymmetric:
            return lens

        if self.pad_layer is None:
            return (
                torch.div(
                    lens + 2 * self.conv.padding[0] - self.conv.dilation[0] * (self.conv.kernel_size[0] - 1) - 1,
                    self.conv.stride[0],
                    rounding_mode='trunc',
                )
                + 1
            )
        else:
            return (
                torch.div(
                    lens + sum(self._padding) - self.conv.dilation[0] * (self.conv.kernel_size[0] - 1) - 1,
                    self.conv.stride[0],
                    rounding_mode='trunc',
                )
                + 1
            )

    def forward(self, x, lens):
        if self.use_mask:
            # Generally will be called by ConvASREncoder, but kept as single gpu backup.
            if x.size(2) > self.max_len:
                self.update_masked_length(x.size(2), device=lens.device)
            x = self.mask_input(x, lens)

        # Update lengths
        lens = self.get_seq_len(lens)

        # asymmetric pad if necessary
        if self.pad_layer is not None:
            x = self.pad_layer(x)

        sh = x.shape
        if self.heads != -1:
            x = x.view(-1, self.heads, sh[-1])

        out = self.conv(x)

        if self.heads != -1:
            out = out.view(sh[0], self.real_out_channels, -1)

        return out, lens

    def update_masked_length(self, max_len, seq_range=None, device=None):
        if seq_range is None:
            self.lens, self.max_len = _masked_conv_init_lens(self.lens, max_len, self.max_len)
            self.lens = self.lens.to(device)
        else:
            self.lens = seq_range
            self.max_len = torch.tensor(max_len)

    def mask_input(self, x, lens):
        max_len = x.size(2)
        mask = self.lens[:max_len].unsqueeze(0).to(lens.device) < lens.unsqueeze(1)
        x = x * mask.unsqueeze(1).to(device=x.device)
        return x


@torch.jit.script
def _masked_conv_init_lens(lens: torch.Tensor, current_maxlen: int, original_maxlen: torch.Tensor):
    if current_maxlen > original_maxlen:
        new_lens = torch.arange(current_maxlen)
        new_max_lens = torch.tensor(current_maxlen)
    else:
        new_lens = lens
        new_max_lens = original_maxlen
    return new_lens, new_max_lens


def get_same_padding(kernel_size, stride, dilation) -> int:
    if stride > 1 and dilation > 1:
        raise ValueError("Only stride OR dilation may be greater than 1")
    return (dilation * (kernel_size - 1)) // 2
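As a quick sanity check of the "same" padding rule above (stride 1 assumed, per the guard):

```python
# kernel_size=3, dilation=1: pad (1*(3-1))//2 = 1 per side -> output length == input length
assert get_same_padding(kernel_size=3, stride=1, dilation=1) == 1

# kernel_size=3, dilation=2: the receptive field spans 5 frames -> pad 2 per side
assert get_same_padding(kernel_size=3, stride=1, dilation=2) == 2
```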
def lens_to_mask(lens: torch.Tensor, max_len: int, device: str = None):
    """
    outputs masking labels for a batch of lengths of audio features, with max length of any
    mask as max_len
    input:
        lens: tensor of lengths, one per batch element
        max_len: max length of any audio feature
    output:
        mask: masked labels
        num_values: sum of mask values for each feature (useful for computing statistics later)
    """
    lens_mat = torch.arange(max_len).to(device)
    mask = lens_mat[:max_len].unsqueeze(0) < lens.unsqueeze(1)
    mask = mask.unsqueeze(1)
    num_values = torch.sum(mask, dim=2, keepdim=True)
    return mask, num_values


def get_statistics_with_mask(x: torch.Tensor, m: torch.Tensor, dim: int = 2, eps: float = 1e-10):
    """
    compute mean and standard deviation of input (x) provided with its masking labels (m)
    input:
        x: feature input
        m: averaged mask labels
    output:
        mean: mean of input features
        std: standard deviation of input features
    """
    mean = torch.sum(m * x, dim=dim)
    std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
    return mean, std


@torch.jit.script_if_tracing
def make_seq_mask_like(
    like: torch.Tensor, lengths: torch.Tensor, valid_ones: bool = True, time_dim: int = -1
) -> torch.Tensor:
    mask = torch.arange(like.shape[time_dim], device=like.device).repeat(lengths.shape[0], 1).lt(lengths.unsqueeze(-1))
    # Match number of dims in `like` tensor
    for _ in range(like.dim() - mask.dim()):
        mask = mask.unsqueeze(1)
    # If time dim != -1, transpose to proper dim.
    if time_dim != -1:
        mask = mask.transpose(time_dim, -1)
    if not valid_ones:
        mask = ~mask
    return mask
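Both the mel preprocessor's normalization and `StatsPoolLayer` build their padding masks with `make_seq_mask_like`; a tiny trace of its behavior:

```python
import torch

like = torch.zeros(2, 3, 4)      # [B, D, T]
lengths = torch.tensor([4, 2])

pad_mask = make_seq_mask_like(like=like, lengths=lengths, valid_ones=False)
print(pad_mask.shape)  # torch.Size([2, 1, 4]); broadcasts over D
print(pad_mask[1, 0])  # tensor([False, False,  True,  True]) -> padded frames flagged
```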
def init_weights(m, mode: Optional[str] = 'xavier_uniform'):
    if isinstance(m, MaskedConv1d):
        init_weights(m.conv, mode)
    if isinstance(m, (nn.Conv1d, nn.Linear)):
        if mode is not None:
            if mode == 'xavier_uniform':
                nn.init.xavier_uniform_(m.weight, gain=1.0)
            elif mode == 'xavier_normal':
                nn.init.xavier_normal_(m.weight, gain=1.0)
            elif mode == 'kaiming_uniform':
                nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
            elif mode == 'kaiming_normal':
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
            elif mode == 'tds_uniform':
                tds_uniform_(m.weight)
            elif mode == 'tds_normal':
                tds_normal_(m.weight)
            else:
                raise ValueError("Unknown Initialization mode: {0}".format(mode))
    elif isinstance(m, nn.BatchNorm1d):
        if m.track_running_stats:
            m.running_mean.zero_()
            m.running_var.fill_(1)
            m.num_batches_tracked.zero_()
        if m.affine:
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)


def tds_uniform_(tensor, mode='fan_in'):
    """
    Uniform Initialization from the paper [Sequence-to-Sequence Speech Recognition with Time-Depth Separable Convolutions](https://www.isca-speech.org/archive/Interspeech_2019/pdfs/2460.pdf)
    Normalized to -

    .. math::
        \\text{bound} = \\text{2} \\times \\sqrt{\\frac{1}{\\text{fan\\_mode}}}

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
            preserves the magnitude of the variance of the weights in the
            forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
            backwards pass.
    """
    fan = _calculate_correct_fan(tensor, mode)
    gain = 2.0  # sqrt(4.0) = 2
    std = gain / math.sqrt(fan)  # sqrt(4.0 / fan_in)
    bound = std  # Calculate uniform bounds from standard deviation
    with torch.no_grad():
        return tensor.uniform_(-bound, bound)


def tds_normal_(tensor, mode='fan_in'):
    """
    Normal Initialization from the paper [Sequence-to-Sequence Speech Recognition with Time-Depth Separable Convolutions](https://www.isca-speech.org/archive/Interspeech_2019/pdfs/2460.pdf)
    Normalized to -

    .. math::
        \\text{std} = \\text{2} \\times \\sqrt{\\frac{1}{\\text{fan\\_mode}}}

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
            preserves the magnitude of the variance of the weights in the
            forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
            backwards pass.
    """
    fan = _calculate_correct_fan(tensor, mode)
    gain = 2.0
    std = gain / math.sqrt(fan)  # sqrt(4.0 / fan_in), used directly as the normal std
    with torch.no_grad():
        return tensor.normal_(0.0, std)
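To close, a small sketch contrasting the two pooling layers defined in this file on the same padded batch; shapes are illustrative:

```python
import torch

encoded = torch.randn(3, 192, 250)       # [B, D, T] encoder output
lengths = torch.tensor([250, 180, 120])  # valid frames per sample

stats_pool = StatsPoolLayer(feat_in=192, pool_mode='xvector')
attn_pool = AttentivePoolLayer(inp_filters=192, attention_channels=128)

stats = stats_pool(encoded, length=lengths)  # mean and std over valid frames, concatenated
attn = attn_pool(encoded, length=lengths)    # attention-weighted mean and std

print(stats.shape)  # torch.Size([3, 384])
print(attn.shape)   # torch.Size([3, 384, 1])
```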