Full Self-Attention

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class FullSelfAttention(nn.Module):
    """
    A vanilla multi-head self-attention layer with no masking and a projection at the end.
    This implementation doesn't use causal masking, meaning all tokens can attend to each other.
    """

    def __init__(self, n_embd, n_head, attn_pdrop, resid_pdrop):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        # key, query, value projections for all heads
        self.key = nn.Linear(n_embd, n_embd)
        self.query = nn.Linear(n_embd, n_embd)
        self.value = nn.Linear(n_embd, n_embd)
        # regularization
        self.attn_drop = nn.Dropout(attn_pdrop)
        self.resid_drop = nn.Dropout(resid_pdrop)
        # output projection
        self.proj = nn.Linear(n_embd, n_embd)

    def forward(self, x):
        B, T, C = x.size()

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # full self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = F.softmax(att, dim=-1)  # no masking here, full attention
        att = self.attn_drop(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side

        # output projection
        y = self.resid_drop(self.proj(y))
        return y

Causal Self-Attention


class CausalSelfAttention(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection at the end.
    It is possible to use torch.nn.MultiheadAttention instead, but this explicit
    implementation shows that there is nothing too scary going on.
    """

    def __init__(self, n_embd, block_size, n_head, attn_pdrop, resid_pdrop):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        # key, query, value projections for all heads
        self.key = nn.Linear(n_embd, n_embd)
        self.query = nn.Linear(n_embd, n_embd)
        self.value = nn.Linear(n_embd, n_embd)
        # regularization
        self.attn_drop = nn.Dropout(attn_pdrop)
        self.resid_drop = nn.Dropout(resid_pdrop)
        # output projection
        self.proj = nn.Linear(n_embd, n_embd)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer("mask", torch.tril(torch.ones(block_size, block_size))
                                     .view(1, 1, block_size, block_size))
    def forward(self, x, layer_past=None):
        B, T, C = x.size()

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_drop(self.proj(y))
        return y

Look-ahead MHSA

To implement causal attention with a "look-ahead" mechanism, i.e. allowing each token to attend to a limited number of future tokens while still bounding how far into the future attention can extend, modify the attention mechanism so that the causal mask admits a specified number of future positions.

Here's a PyTorch implementation of causal attention with look-ahead:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class CausalAttentionWithLookAhead(nn.Module):
    """
    Causal self-attention mechanism with a look-ahead window, allowing each token to attend to a limited number
    of future tokens, while still maintaining the causal nature (i.e., no attention to tokens further ahead).

    Usage:

        n_embd = 64  # embedding dimension
        n_head = 8   # number of attention heads
        attn_pdrop = 0.1  # dropout for attention weights
        resid_pdrop = 0.1  # dropout for output projection
        look_ahead_size = 2  # allow attention up to 2 future tokens

        model = CausalAttentionWithLookAhead(n_embd, n_head, attn_pdrop, resid_pdrop, look_ahead_size)
        x = torch.randn(16, 10, n_embd)  # Batch of 16 sequences, each of length 10, embedding size 64
        output = model(x)
        print(output.size())  # should return (16, 10, 64)
    """

    def __init__(self, n_embd, n_head, attn_pdrop, resid_pdrop, look_ahead_size):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.look_ahead_size = look_ahead_size

        # key, query, value projections for all heads
        self.key = nn.Linear(n_embd, n_embd)
        self.query = nn.Linear(n_embd, n_embd)
        self.value = nn.Linear(n_embd, n_embd)

        # regularization
        self.attn_drop = nn.Dropout(attn_pdrop)
        self.resid_drop = nn.Dropout(resid_pdrop)

        # output projection
        self.proj = nn.Linear(n_embd, n_embd)

    def forward(self, x):
        B, T, C = x.size()

        # calculate query, key, values for all heads and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        # Causal attention with look ahead: Self-attend (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))

        # Create the causal mask with a look-ahead window
        causal_mask = torch.tril(torch.ones(T, T), diagonal=self.look_ahead_size).view(1, 1, T, T).to(x.device)  # (1, 1, T, T)
        att = att.masked_fill(causal_mask == 0, float('-inf'))

        # Apply softmax and dropout
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)

        # Apply attention to the value: (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side

        # Output projection
        y = self.resid_drop(self.proj(y))
        return y
```

### Key Points:
1. **Causal Masking with Look-Ahead**:
   - A causal mask is created using `torch.tril`, which generates a lower triangular matrix. The `diagonal=self.look_ahead_size` argument allows attention to future tokens within a window of size `look_ahead_size`. 
   - For example, if `look_ahead_size=2`, each token can attend to up to 2 future tokens in addition to the current and all past tokens (see the standalone mask example after this list).

2. **Attention Calculation**:
   - As usual, the queries (`q`), keys (`k`), and values (`v`) are computed and reshaped for the multi-head attention operation.
   - The attention scores are computed by multiplying the query matrix with the transpose of the key matrix.
   - After masking, softmax is applied to obtain the attention weights, and these weights are used to compute a weighted sum of the values.

3. **Handling the Future Look-Ahead**:
   - The `look_ahead_size` controls how many future tokens can be attended to. The larger the value, the further ahead the model can look while still being restricted by causality.
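To make the effect of `diagonal=look_ahead_size` concrete, here is a standalone check of the mask for a sequence of length 5 with a look-ahead of 2 (rows are query positions, columns are key positions; 1 means attention is allowed):

```python
import torch

T, look_ahead_size = 5, 2
mask = torch.tril(torch.ones(T, T), diagonal=look_ahead_size)
print(mask)
# tensor([[1., 1., 1., 0., 0.],
#         [1., 1., 1., 1., 0.],
#         [1., 1., 1., 1., 1.],
#         [1., 1., 1., 1., 1.],
#         [1., 1., 1., 1., 1.]])
```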

ChunkBasedAttention

To modify the `ChunkBasedAttention` implementation for parallel processing during training, the key change is to avoid sequential processing of chunks and instead process all chunks in parallel. This requires reshaping the input tensors so that the attention computation can be performed on all chunks simultaneously. Here's how you can adjust the `ChunkBasedAttention` class for parallel processing:

### Modified `ChunkBasedAttention` with Parallel Processing:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class ChunkBasedAttention(nn.Module):
    """
    Chunk-based self-attention mechanism with configurable left and right attention chunk sizes.
    This version allows for parallel processing of chunks during training.

    Example Usage:

      n_embd = 64  # embedding dimension
      n_head = 8   # number of attention heads
      attn_pdrop = 0.1  # dropout for attention weights
      resid_pdrop = 0.1  # dropout for output projection
      attention_chunk_size = 4  # size of each chunk
      left_chunk_size = 1  # allow attention to 1 chunk on the left
      right_chunk_size = 1  # allow attention to 1 chunk on the right

      model = ChunkBasedAttention(n_embd, n_head, attn_pdrop, resid_pdrop, attention_chunk_size, left_chunk_size, right_chunk_size)
      x = torch.randn(16, 12, n_embd)  # Batch of 16 sequences, each of length 12, embedding size 64
      output = model(x)
      print(output.size())  # should return (16, 12, 64)

    """

    def __init__(self, n_embd, n_head, attn_pdrop, resid_pdrop, attention_chunk_size, left_chunk_size, right_chunk_size):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.attention_chunk_size = attention_chunk_size
        self.left_chunk_size = left_chunk_size
        self.right_chunk_size = right_chunk_size

        # key, query, value projections for all heads
        self.key = nn.Linear(n_embd, n_embd)
        self.query = nn.Linear(n_embd, n_embd)
        self.value = nn.Linear(n_embd, n_embd)

        # regularization
        self.attn_drop = nn.Dropout(attn_pdrop)
        self.resid_drop = nn.Dropout(resid_pdrop)

        # output projection
        self.proj = nn.Linear(n_embd, n_embd)

    def forward(self, x):
        B, T, C = x.size()  # B: Batch size, T: Sequence length, C: Embedding dimension
        hs = C // self.n_head
        chunk_size = self.attention_chunk_size

        # Pad the sequence so that its length is a multiple of the chunk size
        pad = (chunk_size - T % chunk_size) % chunk_size
        if pad:
            x = F.pad(x, (0, 0, 0, pad))
        T_pad = T + pad
        num_chunks = T_pad // chunk_size  # Total number of chunks

        # calculate query, key, values for all heads and move head forward to be the batch dim
        k = self.key(x).view(B, T_pad, self.n_head, hs).transpose(1, 2)    # (B, nh, T_pad, hs)
        q = self.query(x).view(B, T_pad, self.n_head, hs).transpose(1, 2)  # (B, nh, T_pad, hs)
        v = self.value(x).view(B, T_pad, self.n_head, hs).transpose(1, 2)  # (B, nh, T_pad, hs)

        # Reshape the queries, keys, and values into chunks for parallel processing
        q_chunks = q.view(B, self.n_head, num_chunks, chunk_size, hs)  # (B, nh, nc, cs, hs)
        k_chunks = k.view(B, self.n_head, num_chunks, chunk_size, hs)  # (B, nh, nc, cs, hs)
        v_chunks = v.view(B, self.n_head, num_chunks, chunk_size, hs)  # (B, nh, nc, cs, hs)

        # Attention scores between every query chunk and every key chunk, computed for all chunks in parallel:
        # (B, nh, nc, cs, hs) x (B, nh, nc, cs, hs) -> (B, nh, nc_q, cs_q, nc_k, cs_k)
        att = torch.einsum('bhics,bhjds->bhicjd', q_chunks, k_chunks) / math.sqrt(hs)

        # Chunk-level mask: chunk i may attend to chunks [i - left_chunk_size, i + right_chunk_size]
        idx = torch.arange(num_chunks, device=x.device)
        allowed = (idx[None, :] >= idx[:, None] - self.left_chunk_size) & \
                  (idx[None, :] <= idx[:, None] + self.right_chunk_size)  # (nc, nc)
        att = att.masked_fill(~allowed.view(1, 1, num_chunks, 1, num_chunks, 1), float('-inf'))

        # Mask out padded key positions so they never receive attention
        key_is_real = torch.arange(T_pad, device=x.device) < T  # (T_pad,)
        att = att.masked_fill(~key_is_real.view(1, 1, 1, 1, num_chunks, chunk_size), float('-inf'))

        # Softmax over all visible key positions (flatten the key chunk/position dims)
        att = att.reshape(B, self.n_head, num_chunks, chunk_size, num_chunks * chunk_size)
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        att = att.reshape(B, self.n_head, num_chunks, chunk_size, num_chunks, chunk_size)

        # Apply attention to the values: -> (B, nh, nc, cs, hs)
        y = torch.einsum('bhicjd,bhjds->bhics', att, v_chunks)
        y = y.reshape(B, self.n_head, T_pad, hs)[:, :, :T, :]  # drop the padded positions
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # Reassemble all head outputs side by side

        # Output projection
        y = self.resid_drop(self.proj(y))
        return y
```


### Explanation of Changes:
1. **Parallel Processing**:
   - Instead of processing each chunk sequentially, the input is reshaped into chunks and all chunk pairs are scored in a single `torch.einsum` call, letting the GPU parallelize the attention computation across chunks during training.

2. **Chunk Mask**:
   - A chunk-level mask is constructed so that each chunk can only attend to its allowed left and right neighboring chunks (this is a local, not strictly causal, mask).
   - The mask is applied to the attention scores, restricting attention to the permitted range of chunks; padded key positions are masked as well.

3. **Efficient Attention Calculation**:
   - The attention scores are computed using `torch.einsum`, which efficiently handles the matrix multiplication for all chunks in parallel.
   - The `softmax` operation is applied across the appropriate dimension (`-1`), and then the attention probabilities are used to weight the values.

4. **Reshaping for Output**:
   - After computing the attention-weighted values for all chunks, the output is reshaped back into the original sequence length to ensure consistency with the input format.


### Key Points:
- **Chunk-Based Masking**: Each chunk can attend only to a specified range of chunks to its left and right, enforced by the chunk mask (visualized in the snippet after this list).
- **Parallel Computation**: By reshaping the input into chunks and using `torch.einsum`, we can compute attention for all chunks in parallel, which speeds up training.
- **Flexibility**: The chunk size, left, and right attention window sizes are flexible and can be adjusted based on the model's requirements.
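To visualize the chunk-level mask described above, here is a standalone snippet for `num_chunks=4`, `left_chunk_size=1`, `right_chunk_size=1` (rows are query chunks, columns are key chunks; 1 means the chunk pair is visible). It uses the same construction as the forward pass above:

```python
import torch

num_chunks, left_chunk_size, right_chunk_size = 4, 1, 1
idx = torch.arange(num_chunks)
allowed = (idx[None, :] >= idx[:, None] - left_chunk_size) & \
          (idx[None, :] <= idx[:, None] + right_chunk_size)
print(allowed.int())
# tensor([[1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [0, 1, 1, 1],
#         [0, 0, 1, 1]], dtype=torch.int32)
```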

 

BEST-RQ: SSL with Random-projection Quantizer for Speech Recognition

BEST-RQ introduces a novel self-supervised training technique that combines a Random Projection Quantizer (RPQ) with Masked Language Modeling (MLM).

The overall BEST-RQ procedure first applies the Random Projection Quantizer (RPQ), a randomly initialized linear layer plus a single codebook that quantizes and discretizes the audio:

  • The Mel filterbanks are projected through the linear layer.
  • The index of the nearest codebook entry to the projection is selected as the target.
  • The nearest codebook entry is found by calculating the argmin of the normalized distance between the projection and each codebook entry.

Afterward, a mask is applied to a portion of the Mel filterbanks, and the model’s objective is to predict the correct targets for the masked sections. This is framed as a classification task, and cross-entropy loss is used to compute the training objective.


1. Random Projection Quantizer (RPQ)

The Random Projection Quantizer is the core of BEST-RQ, designed to discretize continuous speech features so they are suitable for BERT-like pretraining. RPQ consists of two major components: the Projection Matrix and the Codebook. Both are randomly initialized and remain fixed throughout the training process.

1) Projection Matrix

The projection matrix projects the original speech features into a lower-dimensional space. The matrix is of size ( d \times k ), where:

  • d: Dimensionality of the original speech features (typically high, such as hundreds or thousands).
  • k: Target dimensionality after projection (usually much lower than ( d )).

This dimensionality reduction is essential for handling the vast amount of speech data efficiently.

2) Codebook

The Codebook is a collection of n code vectors, each of size ( k ). These code vectors represent the discrete code space into which the speech features are projected.

  • n: The size of the codebook, which can be tuned based on the task at hand.

Given an input vector ( x ) (a ( d )-dimensional vector computed from the speech signal), RPQ maps ( x ) to a discrete label ( y ) through the following operation:

( y = \operatorname{argmin}_i \left\| \text{norm}_{l2}(c_i) - \text{norm}_{l2}(A^{\top} x) \right\| )

Where:

  • The projection matrix ( A ) is the randomly initialized ( d \times k ) matrix, so ( A^{\top} x ) is the ( k )-dimensional projection of ( x ).
  • The codebook ( C = {c_1, ..., c_n} ) contains ( n ) randomly initialized ( k )-dimensional vectors.
  • The function ( \text{norm}_{l2} ) denotes L2 normalization.

This transformation enables the speech signals to be quantized into discrete labels, providing a structured learning signal for the downstream tasks.

The projection matrix is initialized using Xavier initialization (Glorot & Bengio, 2010).
The codebook is initialized using a standard normal distribution.
Both are kept frozen during the entire pretraining process, ensuring that the quantization remains consistent.

 

Code

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.linalg import vector_norm

class RandomProjectionQuantizer(nn.Module):
    """
    Vector quantization using a projection and a randomly initialized codebook.
    The output is the indices of the closest code in the codebook for each time step of the input.

    Example
    -------
    >>> quantizer = RandomProjectionQuantizer(16, 16, 8192)
    >>> inputs = torch.rand(10, 12, 16)
    >>> output = quantizer(inputs)
    >>> output.shape
    torch.Size([10, 12])
    """

    def __init__(self, input_dim, codebook_dim, codebook_vocab):
        super().__init__()

        self.input_dim = input_dim
        self.codebook_dim = codebook_dim
        self.codebook_vocab = codebook_vocab

        # Frozen projection matrix (Xavier init), registered as a buffer so it follows
        # .to(device) / state_dict but never receives gradient updates
        self.register_buffer(
            "Prj_A", nn.init.xavier_uniform_(torch.empty(input_dim, codebook_dim))
        )

        # Frozen, L2-normalized, randomly initialized codebook
        self.register_buffer(
            "codebook", F.normalize(torch.randn(codebook_vocab, codebook_dim))
        )

    def forward(self, x):
        """
        Forward the input through the projection and return the indices of the closest codebook entries.
        """
        # Project the input and L2-normalize it: (B, T, input_dim) -> (B, T, codebook_dim)
        x = F.normalize(torch.matmul(x, self.Prj_A))

        # Distances between every codebook entry and every time step: (B, codebook_vocab, T)
        distances = vector_norm(self.codebook.unsqueeze(1) - x.unsqueeze(1), dim=-1)

        # Return the indices of the closest code for each input
        return distances.argmin(dim=1)

2. Masked Language Modeling (MLM)

BEST-RQ applies Masked Language Modeling (MLM), much like BERT does for text, but in this case for speech. During training, certain portions of the speech signal are masked and replaced with noise.

  • Masking Strategy: Each frame of speech is masked with a fixed probability, and the masked portions are replaced with noise sampled from a normal distribution (mean = 0, standard deviation = 0.1).

The model, typically based on a Transformer architecture, is then tasked with predicting the labels (codebook indices) of the masked speech based on the surrounding context. This allows the model to focus on learning robust speech representations.


Note: A unique point of BEST-RQ is that the RPQ's projection matrix and codebook are frozen and independent of the ASR encoder. This ensures that the model focuses solely on learning meaningful speech representations without needing to adapt to the intricacies of the quantization process.
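A minimal sketch of the masking strategy described above, assuming 10 ms frames (so a 400 ms span is roughly 40 frames); the function name, span handling, and interface are illustrative assumptions, not the exact BEST-RQ implementation:

```python
import torch

def apply_best_rq_style_mask(feats, mask_prob=0.01, span_len=40, noise_std=0.1):
    """Sample span starts per frame with probability mask_prob, then replace each span
    with Gaussian noise (mean 0, std 0.1). Returns the masked features and the boolean
    mask marking which frames become prediction targets."""
    B, T, D = feats.shape
    starts = torch.rand(B, T, device=feats.device) < mask_prob
    mask = torch.zeros(B, T, dtype=torch.bool, device=feats.device)
    for b, t in starts.nonzero(as_tuple=False).tolist():
        mask[b, t:t + span_len] = True  # spans are clipped at the sequence end
    noise = noise_std * torch.randn_like(feats)
    masked = torch.where(mask.unsqueeze(-1), noise, feats)
    return masked, mask

# Training objective (cross entropy on masked frames only), with `logits` from the encoder
# and `targets` from the random-projection quantizer:
#   loss = F.cross_entropy(logits[mask], targets[mask])
```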

 

Code

https://github.com/speechbrain/speechbrain/pull/2309/files#diff-a93bef3df2fb2e56565025e82dbc87ee2293c30872b211a91ea049fd6c3bb49d

Pre-training.
The pre-training uses mask length 400ms with masking probability of 0.01.
The learning rate schedule uses a transformer learning rate schedule (Vaswani et al., 2017).
Adam optimizer with 0.004 peak learning rate and 25000 warmup steps.
The batch size is 2048.
Since the encoder reduces the temporal dimension by a factor of 4, the random-projection quantizer stacks every 4 frames before projecting.
The vocab size of the codebook is 8192 and the dimension is 16.
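For concreteness, a minimal sketch of the 4-frame stacking mentioned above, assuming mel features of shape (B, T, D); how leftover frames are handled is an assumption:

```python
import torch

def stack_frames(feats, stack=4):
    """Stack every `stack` consecutive frames so the quantizer targets are produced
    at the same 4x-reduced frame rate as the encoder output."""
    B, T, D = feats.shape
    T_trim = T - T % stack  # drop leftover frames (an assumption)
    return feats[:, :T_trim, :].reshape(B, T_trim // stack, stack * D)

# e.g. (8, 100, 80) mel frames -> (8, 25, 320) stacked frames
print(stack_frames(torch.randn(8, 100, 80)).shape)
```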

The pre-training quality is not very sensitive to the codebook vocab size and the codebook dimension, and is more sensitive to the masking probability and the mask length. The role of the projection layer in the random-projection quantizer is to allow using different codebook dimensions, and one can achieve similar results without the projection and set the codebook dimension to be the same as the input dimension. Due to the variance coming from the random initialization, the impact of a hyperparameter usually requires multiple runs of experiments to verify the result.

 

Codebook utilization. One of the most critical factors for pre-training quality is the percentage of the codebook that is used during training. In particular, a higher percentage of the codebook being used in each batch at each training step correlates strongly with good pre-training quality. When the distribution of codebook utilization is skewed toward a smaller subset of codes, the pre-training task usually becomes easier and provides less effective pre-training. The L2 normalizations on the projected vector and the codebook are critical for providing more uniform codebook utilization. On the other hand, a randomly initialized codebook and projection matrix can lead to different codebook utilization with different random seeds, which impacts pre-training quality across runs with the same experiment configuration. This variance affects quality more when training with smaller pre-training and fine-tuning datasets. How to reduce this reproducibility issue caused by random initialization is an important next step for improving random-projection quantizers.
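As a rough illustration of the utilization measurement discussed above (the paper's exact definition may differ), one can compute the fraction of distinct codes touched by one batch of quantizer targets:

```python
import torch

def codebook_utilization(code_indices, codebook_vocab):
    """Fraction of the codebook used by one batch of quantizer targets of shape (B, T)."""
    return code_indices.unique().numel() / codebook_vocab

# e.g. targets drawn uniformly at random over an 8192-entry codebook
targets = torch.randint(0, 8192, (2048, 100))
print(codebook_utilization(targets, 8192))
```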

 

Initialization. The quantizer uses random initialization and does not update its parameters, so the initialization algorithm can play an important role in the results. The paper reports results with Xavier initialization for the projection matrix and the standard normal distribution for the codebook; further comparisons of different initialization algorithms are left to future work.

 

[1] https://arxiv.org/pdf/2202.01855

[2] https://arxiv.org/pdf/2405.04296

[3] Speechbrain

 

This paper presents a method for jointly pre-training speech and text in an encoder-decoder framework to improve performance in speech translation and recognition tasks. 

 

 

Key Takeaways:

  1. Architecture: The method utilizes an Attention-based Encoder-Decoder (AED) framework to integrate data from different modalities (speech and text) for representation learning.
    • Shared Encoder and Decoder: The STPT framework uses a shared encoder and decoder for both the speech and text modalities, which allows the model to integrate knowledge from both domains.
  2. Acoustic and Linguistic Representation Learning: The STPT framework is designed to learn both acoustic features from speech and linguistic features from text during the pre-training stage. This is crucial for speech translation models, which must understand the sounds of speech as well as the meaning of words.
  3. Joint Pre-Training Phase; Multi-Task Learning Framework: The framework integrates different pre-training tasks to build a robust model capable of handling multiple aspects of speech and language. The proposed Speech and Text joint Pre-Training (STPT) framework incorporates four self-supervised and supervised subtasks designed for cross-modality learning.
    • Text-to-Text (T2T): This self-supervised task helps the model learn linguistic patterns in the text. It's similar to how models like BERT learn by predicting masked words in a sentence.
    • Speech SSL learning (SSL): This is another self-supervised task focused on learning from the speech data alone, likely involving predicting some masked or hidden parts of the speech input.
    • Speech-to-Phoneme (S2P): A supervised task where the model is trained to predict phoneme units from speech data. Phonemes are the smallest units of sound in a language, so this task helps the model learn the sounds that make up speech.
    • Speech-to-Subword (S2T): Also a supervised task, where the model learns to predict subword units from the speech input. Subwords are larger than phonemes and can carry more linguistic information, like syllables or parts of words.
  4. Loss Functions: Pre-training is guided by different loss functions corresponding to the various tasks:
    • L_T2T: The loss for the Text-to-Text task.
    • L_SSL: The loss for the Speech SSL learning task, which involves masked prediction.
    • L_S2P: The loss for the Speech-to-Phoneme task, which involves phoneme-unit sequence classification.
    • L_S2T: The loss for the Speech-to-Subword task, involving sequential prediction of subword tokens.
    • Final Loss: The overall objective for the pre-training phase is a combination of these losses (a sketch follows this list), guiding the model to learn both modality-specific and cross-modal representations.
  5. Improved Performance: The STPT method effectively fuses speech and text information into one model, leading to significant improvements in performance. It achieves 1.7 to 2.3 BLEU score improvements on the MUST-C speech translation dataset and comparable word error rates (WERs) to the wav2vec 2.0 model on the LibriSpeech speech recognition task.
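As a hedged sketch of the combined pre-training objective referenced in the list above (the weighting coefficients are assumptions; the paper may use equal weights or a different scheme): ( L_{STPT} = \lambda_1 L_{T2T} + \lambda_2 L_{SSL} + \lambda_3 L_{S2P} + \lambda_4 L_{S2T} ).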

 

 

 

This paper presents a new model, SpeechUT, which aims to bridge the gap between speech and text representations in the context of pre-training for speech-to-text tasks.

 

 

Key Takeaways:

  1. Tasks: SpeechUT incorporates three unsupervised pre-training tasks: speech-to-unit (S2U), masked unit modeling (MUM), and unit-to-text (U2T). These tasks help to learn better representations for the speech and text modalities.
  2. Architecture: SpeechUT comprises a speech encoder, unit encoder, and text decoder, along with speech and unit pre-nets to process the inputs.
  3. Unified-Modal Speech-Unit-Text Pre-training Model (SpeechUT): The proposed model is designed to connect the representations of speech and text through a shared unit encoder. It allows for pre-training with unpaired speech and text data, which can be beneficial for tasks like automatic speech recognition (ASR) and speech translation (ST). SpeechUT is a new pre-training method using hidden-unit representations to connect speech encoders and text decoders.
  4. Discrete Representation (Units): SpeechUT leverages hidden-unit representations as an interface to align speech and text. This is done by decomposing the speech-to-text model into a speech-to-unit model and a unit-to-text model, which can be pre-trained separately with large amounts of unpaired data. The model uses discrete unit sequences produced by off-line generators, allowing for the pre-training of large-scale unpaired speech and text.
  5. Embedding Mixing: An embedding mixing mechanism is introduced to better align speech and unit representations.
  6. Pre-Training and Fine-Tuning Methods: The paper describes how SpeechUT is pre-trained with the mentioned tasks and fine-tuned for specific ASR and ST tasks.
    1. Pre-Training Tasks: SpeechUT includes three unsupervised pre-training tasks: speech-to-unit, masked unit modeling, and unit-to-text.
    2. Fine-Tuning: For downstream tasks like ASR and ST, SpeechUT is fine-tuned without introducing new parameters, utilizing the pre-trained modules.
  7. Performance: The paper reports that SpeechUT achieves substantial improvements over strong baselines and sets new state-of-the-art performance on the LibriSpeech ASR and MuST-C ST benchmarks.
  8. Detailed Analyses: The paper includes detailed analyses to understand the proposed SpeechUT model better, and the code and pre-trained models are made available for the community.

 

 




SPE-54: Keyword Spotting


Unified Speculation, Detection, and Verification Keyword Spotting

Geng-shen Fu, Thibaud Senechal, Aaron Challenner, Tao Zhang, Amazon Alexa Science

 


Problem

 

- Accurate and timely recognition of the trigger keyword is vital.

- There is a trade-off needed between accuracy and latency.

 

Proposed method

 

- We propose a CRNN-based unified speculation, detection, and verification (USDV) keyword detection model.

- We propose a latency-aware max-pooling loss, and show empirically that it teaches a model to maximize accuracy under the latency constraint.

- A USDV model can be trained in an MTL fashion and achieves different accuracy/latency trade-offs across these three tasks.

 

 

 

1. Unified speculation, detection, and verification model

- Speculation makes an early decision, which can be used to give a head-start to downstream processes on the device.

- Detection mimics the traditional keyword trigger task and gives a more accurate decision by observing the full keyword context.

- Verification verifies previous decision by observing even more audio after the keyword span.

 

2. Model architecture and training strategy

- CRNN architecture

- Multi-task learning with different target latencies using the newly proposed latency-aware max-pooling loss.
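A minimal sketch of a latency-aware max-pooling objective, assuming per-frame keyword posteriors and a known keyword end frame; the variable names, windowing, and two-class setup are assumptions rather than the paper's implementation:

```python
import torch
import torch.nn.functional as F

def latency_aware_max_pooling_loss(logits, is_keyword, keyword_end, max_latency):
    """
    logits: (B, T, 2) per-frame scores for (non-keyword, keyword).
    is_keyword: (B,) 1 if the utterance contains the keyword, else 0.
    keyword_end: (B,) frame index where the keyword ends.
    max_latency: number of frames after keyword_end within which a positive must fire.
    """
    B, T, _ = logits.shape
    probs = F.softmax(logits, dim=-1)[..., 1]  # (B, T) keyword posterior per frame
    losses = []
    for b in range(B):
        if is_keyword[b]:
            # positives: only frames up to the latency deadline are eligible, and the
            # single most confident frame in that window is trained (max-pooling)
            window = probs[b, : max(1, int(keyword_end[b]) + max_latency)]
            t_star = int(window.argmax())
            target = torch.tensor([1], device=logits.device)
        else:
            # negatives: suppress the most confident (worst-case) frame
            t_star = int(probs[b].argmax())
            target = torch.tensor([0], device=logits.device)
        losses.append(F.cross_entropy(logits[b, t_star].unsqueeze(0), target))
    return torch.stack(losses).mean()
```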


Temporal Early Exiting for Streaming Speech Commands Recognition

Comcast Applied AI, University of Waterloo

 


Problem

 

Voice queries take time to process:

 

Stage 1: The user is speaking (seconds). 

Stage 2: Finish ASR transcription (~50ms). 

Stage 3: Information retrieval (~500ms).

 

 

 

Proposed method

 

- Use a streaming speech commands model for the top-K voice queries.

- Apply a training objective that enables better early exiting across time, i.e., returning a prediction before the entire audio is observed.

- Use early exiting with a confidence threshold to adjust the latency-accuracy trade-off.

 

Model

- GRU Model

- Per-frame output probability distribution over K commands (classes).

 

Early-Exiting Objectives

 

Connectionist temporal classification (CTC): the standard CTC loss over the per-frame outputs.

Last-frame cross entropy (LF): cross entropy applied only to the prediction at the final frame.

All-frame cross entropy (AF): cross entropy applied to the prediction at every frame.
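A minimal sketch of a per-frame GRU classifier trained with the all-frame cross-entropy objective and used with confidence-threshold early exiting at inference time; the feature dimension, hidden size, threshold, and interface are assumptions rather than the paper's configuration:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class StreamingCommandClassifier(nn.Module):
    def __init__(self, n_mels=40, hidden=128, n_classes=35):
        super().__init__()
        self.gru = nn.GRU(n_mels, hidden, batch_first=True)
        self.head = nn.Linear(hidden, n_classes)

    def forward(self, feats):              # feats: (B, T, n_mels)
        h, _ = self.gru(feats)             # (B, T, hidden)
        return self.head(h)                # per-frame logits: (B, T, n_classes)

def all_frame_cross_entropy(logits, labels):
    # supervise every frame with the utterance-level label (AF objective)
    B, T, K = logits.shape
    return F.cross_entropy(logits.reshape(B * T, K), labels.repeat_interleave(T))

@torch.no_grad()
def early_exit(model, feats, threshold=0.9):
    # return (predicted class, exit frame) as soon as confidence crosses the threshold
    logits = model(feats.unsqueeze(0))[0]  # (T, n_classes)
    probs = logits.softmax(dim=-1)
    for t in range(probs.size(0)):
        conf, pred = probs[t].max(dim=-1)
        if conf >= threshold:
            return int(pred), t
    return int(probs[-1].argmax()), probs.size(0) - 1
```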

 

Findings

 

1. The all-frame objective (AF) performs best, perhaps because it explicitly trains the hidden features to be more discriminative, similar to deep supervision [1].

2. The observed indices correlate with the optimal indices for all models and datasets, with the AF-0.5 model consistently exiting earlier than the LF one does.


Self-supervised Learning for Speech and Audio Processing I

Technical Program Session MLSP-3

 


UNIVERSAL PARALINGUISTIC SPEECH REPRESENTATIONS USING SELF-SUPERVISED CONFORMERS

 

Verily Life Sciences, Boston, USA1 and Mountain View, California, USA

 


Many speech applications require understanding aspects beyond the words being spoken, such as recognizing emotion, detecting whether the speaker is wearing a mask, or distinguishing real from synthetic speech. In this work, we introduce a new state-of-the-art paralinguistic representation derived from large-scale, fully self-supervised training of a 600M+ parameter Conformer-based architecture. We benchmark on a diverse set of speech tasks and demonstrate that simple linear classifiers trained on top of our time-averaged representation outperform nearly all previous results, in some cases by large margins. Our analyses of context-window size demonstrate that, surprisingly, 2 second context-windows achieve 96% the performance of the Conformers that use the full long-term context on 7 out of 9 tasks. Furthermore, while the best per-task representations are extracted internally in the network, stable performance across several layers allows a single universal representation to reach near optimal performance on all tasks.

 

 

https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9747197

-

 

Proposed method

-

Key Findings

 


 

A NOISE-ROBUST SELF-SUPERVISED PRE-TRAINING MODEL BASED SPEECH REPRESENTATION LEARNING FOR AUTOMATIC SPEECH RECOGNITION

NEL-SLIP, University of Science and Technology of China (USTC), Hefei, China

 


Wav2vec2.0 is a popular self-supervised pre-training framework for learning speech representations in the context of automatic speech recognition (ASR). It was shown that wav2vec2.0 has a good robustness against the domain shift, while the noise robustness is still unclear. In this work, we therefore first analyze the noise robustness of wav2vec2.0 via experiments. We observe that wav2vec2.0 pre-trained on noisy data can obtain good representations and thus improve the ASR performance on the noisy test set, which however brings a performance degradation on the clean test set. To avoid this issue, in this work we propose an enhanced wav2vec2.0 model. Specifically, the noisy speech and the corresponding clean version are fed into the same feature encoder, where the clean speech provides training targets for the model. Experimental results reveal that the proposed method can not only improve the ASR performance on the noisy test set which surpasses the original wav2vec2.0, but also ensure a tiny performance decrease on the clean test set. In addition, the effectiveness of the proposed method is demonstrated under different types of noise conditions.

 

https://ieeexplore.ieee.org/document/9747379

 


AN ADAPTER BASED PRE-TRAINING FOR EFFICIENT AND SCALABLE SELF-SUPERVISED SPEECH REPRESENTATION LEARNING

Huawei R&D UK, University of Oxford

 


https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9747374

 


CONTRASTIVE PREDICTION STRATEGIES FOR UNSUPERVISED SEGMENTATION AND CATEGORIZATION OF PHONEMES AND WORDS

University of Wroclaw, Poland, NavAlgo, France, NVIDIA, Poland, Universite de Toulon, France


We identify a performance trade-off between the tasks of phoneme categorization and phoneme and word segmentation in several self-supervised learning algorithms based on Contrastive Predictive Coding (CPC). Our experiments suggest that context building networks, albeit necessary for high performance on categorization tasks, harm segmentation performance by causing a temporal shift on the learned representations. Aiming to tackle this trade-off, we take inspiration from the leading approaches on segmentation and propose multi-level Aligned CPC (mACPC). It builds on Aligned CPC (ACPC), a variant of CPC which exhibits the best performance on categorization tasks, and incorporates multi-level modeling and optimization for detection of spectral changes. Our methods improve in all tested categorization metrics and achieve state-of-the-art performance in word segmentation.

 

https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9746102

 


 

CHARACTERIZING THE ADVERSARIAL VULNERABILITY OF SPEECH SELF-SUPERVISED LEARNING

National Taiwan University, The Chinese University of Hong Kong


SUPERB

 

A leaderboard named Speech processing Universal PERformance Benchmark (SUPERB), which aims at benchmarking the performance of a shared self-supervised learning (SSL) speech model across various downstream speech tasks with minimal modification of architectures and a small amount of data, has fueled the research for speech representation learning. SUPERB demonstrates that speech SSL upstream models improve the performance of various downstream tasks with just minimal adaptation. As the paradigm of a self-supervised upstream model followed by downstream tasks attracts more attention in the speech community, characterizing the adversarial robustness of such a paradigm is of high priority. In this paper, the authors make the first attempt to investigate the adversarial vulnerability of this paradigm under attacks from both zero-knowledge adversaries and limited-knowledge adversaries. The experimental results illustrate that the paradigm proposed by SUPERB is seriously vulnerable to limited-knowledge adversaries, and that the attacks generated by zero-knowledge adversaries are transferable. The XAB test verifies the imperceptibility of the crafted adversarial attacks.

 

https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9747242


 


Language Modeling

Technical Program Session SPE-4

 


CAPITALIZATION NORMALIZATION FOR LANGUAGE MODELING WITH AN ACCURATE AND EFFICIENT HIERARCHICAL RNN MODEL

 

Google Research


Problem

Capitalization normalization (truecasing) is the task of restoring the correct case (uppercase or lowercase) of noisy text.

 

Proposed method

A fast, accurate and compact two-level hierarchical word-and-character-based RNN

 

Used the truecaser to normalize user-generated text in a Federated Learning framework for language modeling.

Key Findings

 

In a real user A/B experiment, authors demonstrated that the improvement translates to reduced prediction error rates in a virtual keyboard application.


 

NEURAL-FST CLASS LANGUAGE MODEL FOR END-TO-END SPEECH RECOGNITION

Facebook AI, USA


 

Proposed method

Neural-FST Class Language Model (NFCLM) for end-to-end speech recognition

 

A novel method that combines neural network language models (NNLMs) and finite state transducers (FSTs) in a mathematically consistent framework.

 

Key Findings

 

NFCLM significantly outperforms NNLM by 15.8% relative in terms of WER.

 

NFCLM achieves similar performance as traditional NNLM and FST shallow fusion while being less prone to overbiasing and 12 times more compact, making it more suitable for on-device usage.

 


ENHANCE RNNLMS WITH HIERARCHICAL MULTI-TASK LEARNING FOR ASR

 

University of Missouri, USA


Proposed method

 

 

 

 

Key Findings

 


RESCOREBERT: DISCRIMINATIVE SPEECH RECOGNITION RESCORING WITH BERT

1Amazon Alexa AI, USA 2Emory University, USA


Problem

 

Second-pass rescoring improves the outputs from a first-pass decoder by implementing a lattice rescoring or n-best re-ranking.

 

Proposed method (RescoreBERT)

 

The authors showed how to train a BERT-based rescoring model with a minimum WER (MWER) loss, incorporating the benefits of a discriminative loss into the fine-tuning of deep bidirectional pretrained models for ASR.

 

The authors also proposed a fusion strategy that incorporates the MLM objective into the discriminative training process to effectively distill knowledge from a pretrained model, and they further propose an alternative discriminative loss.
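A hedged sketch of an MWER-style rescoring objective over an n-best list (the score interpolation, normalization, and variable names are assumptions, not the paper's exact recipe):

```python
import torch
import torch.nn.functional as F

def mwer_style_loss(first_pass_scores, rescore_scores, word_errors):
    """
    first_pass_scores, rescore_scores: (B, N) costs for each of N hypotheses (lower is better).
    word_errors: (B, N) word errors of each hypothesis against the reference.
    """
    total_cost = first_pass_scores + rescore_scores
    probs = F.softmax(-total_cost, dim=-1)                     # hypothesis posteriors over the n-best list
    avg_err = (probs * word_errors).sum(dim=-1, keepdim=True)  # expected word errors per utterance
    # expected excess word errors relative to the list average (the usual MWER formulation)
    return (probs * (word_errors - avg_err)).sum(dim=-1).mean()

# usage with random placeholder scores and edit distances
loss = mwer_style_loss(torch.rand(4, 10), torch.rand(4, 10), torch.randint(0, 5, (4, 10)).float())
```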

Key Findings

 

Reduced WER by 6.6%/3.4% relative on the LibriSpeech clean/other test sets over a BERT baseline without discriminative objective

 

Found that it reduces both latency and WER (by 3 to 8% relative) over an LSTM rescoring model.


Hybrid sub-word segmentation for handling long tail in morphologically rich low resource languages

 

Cognitive Systems Lab, University Bremen, Germany


Problem

 

Dealing with Out Of Vocabulary (OOV) words or unseen words

 

For morphologically rich languages with a high type-token ratio, the OOV percentage is also quite high.

 

Sub-word segmentation has been found to be one of the major approaches in dealing with OOVs.

 

Proposed method 

 

This paper presents a hybrid sub-word segmentation algorithm to deal with OOVs.

 

A sub-word segmentation evaluation methodology is also presented.

 

All the experiments are done for conversational code-switched Malayalam-English corpus.

Speech Recognition: Robust Speech Recognition I

Technical Program Session SPE-2

 


AUDIO-VISUAL MULTI-CHANNEL SPEECH SEPARATION, DEREVERBERATION AND RECOGNITION

The Chinese University of Hong Kong; Tencent AI lab


Problem

 

Accurate recognition of cocktail-party speech, which is characterised by interference from overlapping speakers, background noise and room reverberation.

 

Proposed method

 

In this paper, an audio-visual multi-channel speech separation, dereverberation and recognition approach that incorporates visual information into all three stages of the system is proposed.

 

The advantage of the additional visual modality over using audio only is demonstrated on two neural dereverberation approaches based on DNN-WPE and spectral mapping respectively.

 

 


BEST OF BOTH WORLDS: MULTI-TASK AUDIO-VISUAL AUTOMATIC SPEECH RECOGNITION AND ACTIVE SPEAKER DETECTION

Google, Inc.


Problem

 

Under noisy conditions, automatic speech recognition (ASR) can greatly benefit from the addition of visual signals coming from a video of the speaker’s face.

 

In practice there are often multiple faces on screen. Traditionally, a separate active speaker detection (ASD) model was used to pick out, at every time step, the active speaker's face that matches the audio; more recently, an attention module is added so that, instead of designing a separate ASD stage, the audio and all face candidates are fed into the model and handled in an end-to-end way.

 

Proposed method

2.1. A/V Backbone: Shared Audio-Visual Frontend

 

Acoustic Features. Log mel filterbank.

Audio and Video Synchronization. The video is resampled to align with the audio frames.

Visual Features. A ConvNet on top of the synchronized video.

Attention Mechanism. Soft-selects the face candidate that matches the audio.

 

2.2. ASR Model - Transformer-Transducer Model

 

For ASR, the weighted visual features and input acoustic features are then concatenated along the last dimension, producing audio-visual features which are then fed to the ASR encoder.

 

2.3. ASD Model

 

For ASD, the attention scores are used directly for the model prediction. For each audio query and each timestep, the attention scores measure how well each candidate video corresponds to the audio.

 

3. MULTI-TASK LOSS FOR A/V ASR AND ASD

ASD. For active speaker detection, the normalized attention weights can be used to train the attention module directly with cross entropy loss.

ASR. RNN-T loss

 

MTL Loss. The ASD and ASR losses are combined with a weighted linear sum.
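As a hedged sketch of the combination (the exact weighting is not reproduced here): ( L_{MTL} = L_{ASR} + \lambda L_{ASD} ), where ( \lambda ) trades off the two tasks.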

 

 

Key Findings

 

This paper presents a multi-task learning (MTL) approach for a model that can simultaneously perform audio-visual ASR and active speaker detection, improving on previous work on multi-person audio-visual ASR.

 

Combining the two tasks is enough to significantly improve the performance of the model in the ASD task relative to the baseline.

 


IMPROVING NOISE ROBUSTNESS OF CONTRASTIVE SPEECH REPRESENTATION LEARNING WITH SPEECH RECONSTRUCTION

The Ohio State University, USA, Microsoft Corporation


Problem

 

Noise Robust ASR

 

Proposed method

 

In this paper, the authors employ a noise-robust representation learned by a refined self-supervised framework of wav2vec 2.0 for noisy speech recognition. They combine a reconstruction module with contrastive learning and perform multi-task continual pre-training to explicitly reconstruct the clean speech from the noisy input.
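A hedged sketch of such a multi-task objective (an assumed form, not necessarily the paper's exact loss): ( L = L_{contrastive} + \alpha L_{reconstruction} ), where the reconstruction term pushes the representation of the noisy input toward its clean counterpart.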

 

 


 
