encodec

mindnlp.transformers.models.encodec.modeling_encodec

MindSpore EnCodec model.

mindnlp.transformers.models.encodec.modeling_encodec.EncodecConv1d

Bases: Module

Conv1d with asymmetric or causal padding and normalization.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
class EncodecConv1d(nn.Module):
    """Conv1d with asymmetric or causal padding and normalization."""
    def __init__(
        self, config, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, dilation: int = 1
    ):
        """Initialize the EncodecConv1d class.

        Args:
            self: The instance of the class.
            config: The configuration object containing various settings.
            in_channels (int): The number of input channels.
            out_channels (int): The number of output channels.
            kernel_size (int): The size of the convolutional kernel.
            stride (int, optional): The stride value for the convolution operation. Defaults to 1.
            dilation (int, optional): The dilation value for the convolution operation. Defaults to 1.

        Returns:
            None

        Raises:
            ValueError: If `norm_type` is not one of the allowed values: `"weight_norm"` or `"time_group_norm"`.
            Warning: If both `stride` and `dilation` are greater than 1.

        """
        super().__init__()
        self.causal = config.use_causal_conv
        self.pad_mode = config.pad_mode
        self.norm_type = config.norm_type

        if self.norm_type not in ["weight_norm", "time_group_norm"]:
            raise ValueError(
                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`, got {self.norm_type}'
            )

        # warn user on unusual setup between dilation and stride
        if stride > 1 and dilation > 1:
            logger.warning(
                "EncodecConv1d has been initialized with stride > 1 and dilation > 1"
                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
            )

        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, dilation=dilation)
        if self.norm_type == "weight_norm":
            setattr(self, 'conv', weight_norm(self.conv))
        elif self.norm_type == "time_group_norm":
            self.norm = nn.GroupNorm(1, out_channels)

    @staticmethod
    def _get_extra_padding_for_conv1d(
        hidden_states: mindspore.Tensor, kernel_size: int, stride: int, padding_total: int = 0
    ) -> int:
        """See `pad_for_conv1d`."""
        length = hidden_states.shape[-1]
        n_frames = (length - kernel_size + padding_total) / stride + 1
        ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
        return ideal_length - length

    @staticmethod
    def _pad1d(hidden_states: mindspore.Tensor, paddings: Tuple[int, int], mode: str = "zero", value: float = 0.0):
        """Tiny wrapper around torch.ops.pad, just to allow for reflect padding on small input.
        If this is the case, we insert extra 0 padding to the right before the reflection happens.
        """
        length = hidden_states.shape[-1]
        padding_left, padding_right = paddings
        if mode != "reflect":
            return ops.pad(hidden_states, paddings, mode, value)

        max_pad = max(padding_left, padding_right)
        extra_pad = 0
        if length <= max_pad:
            extra_pad = max_pad - length + 1
            hidden_states = ops.pad(hidden_states, (0, extra_pad))
        # at this point mode is always "reflect" (other modes returned early above)
        padded = ops.pad(hidden_states, paddings, mode)

        end = padded.shape[-1] - extra_pad
        return padded[..., :end]

    def forward(self, hidden_states):
        """
        Applies padded convolution and optional normalization to the input hidden states.

        Args:
            self (object): Instance of EncodecConv1d class.
            hidden_states (Tensor):
                Input tensor of shape [batch_size, channels, sequence_length] representing hidden states.

        Returns:
            mindspore.Tensor:
                The hidden states after padding, convolution and, if configured, normalization.

        Raises:
            ValueError: If the normalization type is not supported.
            RuntimeError: If the convolution operation fails.
        """
        kernel_size = self.conv.kernel_size[0]
        stride = self.conv.stride[0]
        dilation = self.conv.dilation[0]
        kernel_size = (kernel_size - 1) * dilation + 1  # effective kernel size with dilations
        padding_total = kernel_size - stride
        extra_padding = self._get_extra_padding_for_conv1d(hidden_states, kernel_size, stride, padding_total)

        if self.causal:
            # Left padding for causal
            hidden_states = self._pad1d(hidden_states, (padding_total, extra_padding), mode=self.pad_mode)
        else:
            # Asymmetric padding required for odd strides
            padding_right = padding_total // 2
            padding_left = padding_total - padding_right
            hidden_states = self._pad1d(
                hidden_states, (padding_left, padding_right + extra_padding), mode=self.pad_mode
            )

        hidden_states = self.conv(hidden_states)
        if self.norm_type == "time_group_norm":
            hidden_states = self.norm(hidden_states)

        return hidden_states
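
Example (a minimal usage sketch; the EncodecConfig import path and its defaults are assumptions and may differ in your install):

>>> import numpy as np
>>> import mindspore
>>> from mindnlp.transformers.models.encodec.configuration_encodec import EncodecConfig
>>> from mindnlp.transformers.models.encodec.modeling_encodec import EncodecConv1d
...
>>> config = EncodecConfig()  # assumed defaults: use_causal_conv=True, pad_mode="reflect", norm_type="weight_norm"
>>> conv = EncodecConv1d(config, in_channels=1, out_channels=32, kernel_size=7)
>>> audio = mindspore.Tensor(np.random.randn(1, 1, 24000), mindspore.float32)
>>> conv(audio).shape  # causal left-padding keeps the sequence length for stride=1
(1, 32, 24000)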

mindnlp.transformers.models.encodec.modeling_encodec.EncodecConv1d.__init__(config, in_channels, out_channels, kernel_size, stride=1, dilation=1)

Initialize the EncodecConv1d class.

PARAMETER DESCRIPTION
self

The instance of the class.

config

The configuration object containing various settings.

in_channels

The number of input channels.

TYPE: int

out_channels

The number of output channels.

TYPE: int

kernel_size

The size of the convolutional kernel.

TYPE: int

stride

The stride value for the convolution operation. Defaults to 1.

TYPE: int DEFAULT: 1

dilation

The dilation value for the convolution operation. Defaults to 1.

TYPE: int DEFAULT: 1

RETURNS DESCRIPTION

None

RAISES DESCRIPTION
ValueError

If norm_type is not one of the allowed values: "weight_norm" or "time_group_norm".

Warning

If both stride and dilation are greater than 1.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def __init__(
    self, config, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, dilation: int = 1
):
    """Initialize the EncodecConv1d class.

    Args:
        self: The instance of the class.
        config: The configuration object containing various settings.
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        kernel_size (int): The size of the convolutional kernel.
        stride (int, optional): The stride value for the convolution operation. Defaults to 1.
        dilation (int, optional): The dilation value for the convolution operation. Defaults to 1.

    Returns:
        None

    Raises:
        ValueError: If `norm_type` is not one of the allowed values: `"weight_norm"` or `"time_group_norm"`.
        Warning: If both `stride` and `dilation` are greater than 1.

    """
    super().__init__()
    self.causal = config.use_causal_conv
    self.pad_mode = config.pad_mode
    self.norm_type = config.norm_type

    if self.norm_type not in ["weight_norm", "time_group_norm"]:
        raise ValueError(
            f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`, got {self.norm_type}'
        )

    # warn user on unusual setup between dilation and stride
    if stride > 1 and dilation > 1:
        logger.warning(
            "EncodecConv1d has been initialized with stride > 1 and dilation > 1"
            f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
        )

    self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, dilation=dilation)
    if self.norm_type == "weight_norm":
        setattr(self, 'conv', weight_norm(self.conv))
    elif self.norm_type == "time_group_norm":
        self.norm = nn.GroupNorm(1, out_channels)

mindnlp.transformers.models.encodec.modeling_encodec.EncodecConv1d.forward(hidden_states)

Applies padded convolution and optional normalization to the input hidden states.

PARAMETER DESCRIPTION
self

Instance of EncodecConv1d class.

TYPE: object

hidden_states

Input tensor of shape [batch_size, channels, sequence_length] representing hidden states.

TYPE: Tensor

RETURNS DESCRIPTION
Tensor

The hidden states after padding, convolution and, if configured, normalization.

RAISES DESCRIPTION
ValueError

If the normalization type is not supported.

RuntimeError

If the convolution operation fails.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def forward(self, hidden_states):
    """
    Applies padded convolution and optional normalization to the input hidden states.

    Args:
        self (object): Instance of EncodecConv1d class.
        hidden_states (Tensor):
            Input tensor of shape [batch_size, channels, sequence_length] representing hidden states.

    Returns:
        mindspore.Tensor:
            The hidden states after padding, convolution and, if configured, normalization.

    Raises:
        ValueError: If the normalization type is not supported.
        RuntimeError: If the convolution operation fails.
    """
    kernel_size = self.conv.kernel_size[0]
    stride = self.conv.stride[0]
    dilation = self.conv.dilation[0]
    kernel_size = (kernel_size - 1) * dilation + 1  # effective kernel size with dilations
    padding_total = kernel_size - stride
    extra_padding = self._get_extra_padding_for_conv1d(hidden_states, kernel_size, stride, padding_total)

    if self.causal:
        # Left padding for causal
        hidden_states = self._pad1d(hidden_states, (padding_total, extra_padding), mode=self.pad_mode)
    else:
        # Asymmetric padding required for odd strides
        padding_right = padding_total // 2
        padding_left = padding_total - padding_right
        hidden_states = self._pad1d(
            hidden_states, (padding_left, padding_right + extra_padding), mode=self.pad_mode
        )

    hidden_states = self.conv(hidden_states)
    if self.norm_type == "time_group_norm":
        hidden_states = self.norm(hidden_states)

    return hidden_states
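
To make the padding arithmetic above concrete, here is a standalone sketch of the same formulas in plain Python (the numbers are purely illustrative):

>>> import math
>>> length, kernel_size, stride, dilation = 101, 8, 4, 1
>>> effective_kernel = (kernel_size - 1) * dilation + 1  # 8
>>> padding_total = effective_kernel - stride  # 4
>>> n_frames = (length - effective_kernel + padding_total) / stride + 1  # 25.25
>>> ideal_length = (math.ceil(n_frames) - 1) * stride + (effective_kernel - padding_total)  # 104
>>> extra_padding = ideal_length - length  # 3, appended on the right so the last frame is complete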

mindnlp.transformers.models.encodec.modeling_encodec.EncodecConvTranspose1d

Bases: Module

ConvTranspose1d with asymmetric or causal padding and normalization.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
class EncodecConvTranspose1d(nn.Module):
    """ConvTranspose1d with asymmetric or causal padding and normalization."""
    def __init__(self, config, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1):
        """
        Args:
            self (object): The instance of the class.
            config (object): An object containing configuration parameters.
            in_channels (int): The number of input channels.
            out_channels (int): The number of output channels.
            kernel_size (int): The size of the convolutional kernel.
            stride (int, optional): The stride of the convolution. Defaults to 1.

        Returns:
            None.

        Raises:
            ValueError: If self.norm_type is not one of 'weight_norm' or 'time_group_norm'.
            ValueError: If trim_right_ratio is not equal to 1.0 and causal convolutions are not used.
        """
        super().__init__()
        self.causal = config.use_causal_conv
        self.trim_right_ratio = config.trim_right_ratio
        self.norm_type = config.norm_type
        if self.norm_type not in ["weight_norm", "time_group_norm"]:
            raise ValueError(
                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`, got {self.norm_type}'
            )

        self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride)
        if config.norm_type == "weight_norm":
            self.conv = weight_norm(self.conv)
        elif config.norm_type == "time_group_norm":
            self.norm = nn.GroupNorm(1, out_channels)

        if not (self.causal or self.trim_right_ratio == 1.0):
            raise ValueError("`trim_right_ratio` != 1.0 only makes sense for causal convolutions")

    def forward(self, hidden_states):
        """
        Applies the transposed convolution, optional normalization, and padding trimming to the input hidden states.

        Args:
            self: An instance of the EncodecConvTranspose1d class.
            hidden_states: A tensor representing the input hidden states to be processed by the
                transposed convolution layer.

        Returns:
            mindspore.Tensor: The hidden states after the transposed convolution, optional normalization, and trimming.

        Raises:
            ValueError: If the norm_type attribute is not recognized or supported.
            RuntimeError: If an error occurs during the transposed convolution operation.
            AttributeError: If the required attributes are not found in the instance of the EncodecConvTranspose1d class.
        """
        kernel_size = self.conv.kernel_size[0]
        stride = self.conv.stride[0]
        padding_total = kernel_size - stride

        hidden_states = self.conv(hidden_states)

        if self.norm_type == "time_group_norm":
            hidden_states = self.norm(hidden_states)

        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
        # removed at the very end, when keeping only the right length for the output,
        # as removing it here would require also passing the length at the matching layer
        # in the encoder.
        if self.causal:
            # Trim the padding on the right according to the specified ratio
            # if trim_right_ratio = 1.0, trim everything from right
            padding_right = math.ceil(padding_total * self.trim_right_ratio)
        else:
            # Asymmetric padding required for odd strides
            padding_right = padding_total // 2

        padding_left = padding_total - padding_right

        # unpad
        end = hidden_states.shape[-1] - padding_right
        hidden_states = hidden_states[..., padding_left:end]
        return hidden_states

mindnlp.transformers.models.encodec.modeling_encodec.EncodecConvTranspose1d.__init__(config, in_channels, out_channels, kernel_size, stride=1)

PARAMETER DESCRIPTION
self

The instance of the class.

TYPE: object

config

An object containing configuration parameters.

TYPE: object

in_channels

The number of input channels.

TYPE: int

out_channels

The number of output channels.

TYPE: int

kernel_size

The size of the convolutional kernel.

TYPE: int

stride

The stride of the convolution. Defaults to 1.

TYPE: int DEFAULT: 1

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If self.norm_type is not one of 'weight_norm' or 'time_group_norm'.

ValueError

If trim_right_ratio is not equal to 1.0 and causal convolutions are not used.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def __init__(self, config, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1):
    """
    Args:
        self (object): The instance of the class.
        config (object): An object containing configuration parameters.
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        kernel_size (int): The size of the convolutional kernel.
        stride (int, optional): The stride of the convolution. Defaults to 1.

    Returns:
        None.

    Raises:
        ValueError: If self.norm_type is not one of 'weight_norm' or 'time_group_norm'.
        ValueError: If trim_right_ratio is not equal to 1.0 and causal convolutions are not used.
    """
    super().__init__()
    self.causal = config.use_causal_conv
    self.trim_right_ratio = config.trim_right_ratio
    self.norm_type = config.norm_type
    if self.norm_type not in ["weight_norm", "time_group_norm"]:
        raise ValueError(
            f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`, got {self.norm_type}'
        )

    self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride)
    if config.norm_type == "weight_norm":
        self.conv = weight_norm(self.conv)
    elif config.norm_type == "time_group_norm":
        self.norm = nn.GroupNorm(1, out_channels)

    if not (self.causal or self.trim_right_ratio == 1.0):
        raise ValueError("`trim_right_ratio` != 1.0 only makes sense for causal convolutions")

mindnlp.transformers.models.encodec.modeling_encodec.EncodecConvTranspose1d.forward(hidden_states)

Applies the transposed convolution, optional normalization, and padding trimming to the input hidden states.

PARAMETER DESCRIPTION
self

An instance of the EncodecConvTranspose1d class.

hidden_states

A tensor representing the input hidden states to be processed by the transposed convolution layer.

RETURNS DESCRIPTION
Tensor

The hidden states after the transposed convolution, optional normalization, and trimming.

RAISES DESCRIPTION
ValueError

If the norm_type attribute is not recognized or supported.

RuntimeError

If an error occurs during the transposed convolution operation.

AttributeError

If the required attributes are not found in the instance of the EncodecConvTranspose1d class.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def forward(self, hidden_states):
    """
    Applies the transposed convolution, optional normalization, and padding trimming to the input hidden states.

    Args:
        self: An instance of the EncodecConvTranspose1d class.
        hidden_states: A tensor representing the input hidden states to be processed by the
            transposed convolution layer.

    Returns:
        mindspore.Tensor: The hidden states after the transposed convolution, optional normalization, and trimming.

    Raises:
        ValueError: If the norm_type attribute is not recognized or supported.
        RuntimeError: If an error occurs during the transposed convolution operation.
        AttributeError: If the required attributes are not found in the instance of the EncodecConvTranspose1d class.
    """
    kernel_size = self.conv.kernel_size[0]
    stride = self.conv.stride[0]
    padding_total = kernel_size - stride

    hidden_states = self.conv(hidden_states)

    if self.norm_type == "time_group_norm":
        hidden_states = self.norm(hidden_states)

    # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
    # removed at the very end, when keeping only the right length for the output,
    # as removing it here would require also passing the length at the matching layer
    # in the encoder.
    if self.causal:
        # Trim the padding on the right according to the specified ratio
        # if trim_right_ratio = 1.0, trim everything from right
        padding_right = math.ceil(padding_total * self.trim_right_ratio)
    else:
        # Asymmetric padding required for odd strides
        padding_right = padding_total // 2

    padding_left = padding_total - padding_right

    # unpad
    end = hidden_states.shape[-1] - padding_right
    hidden_states = hidden_states[..., padding_left:end]
    return hidden_states
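
As a quick illustration of the trimming logic above, in plain Python (illustrative numbers only):

>>> import math
>>> kernel_size, stride, trim_right_ratio = 4, 2, 1.0
>>> padding_total = kernel_size - stride  # 2
>>> # causal: with trim_right_ratio == 1.0, everything is trimmed from the right
>>> math.ceil(padding_total * trim_right_ratio)
2
>>> # non-causal: the trim is split (almost) symmetrically instead
>>> padding_total // 2, padding_total - padding_total // 2
(1, 1)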

mindnlp.transformers.models.encodec.modeling_encodec.EncodecDecoder

Bases: Module

SEANet decoder as used by EnCodec.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
class EncodecDecoder(nn.Module):
    """SEANet decoder as used by EnCodec."""
    def __init__(self, config: EncodecConfig):
        """
        __init__

        Initializes an instance of the EncodecDecoder class.

        Args:
            self: The instance of the class.
            config (EncodecConfig):
                An instance of the EncodecConfig class containing configuration parameters for the decoder.

                - Type: EncodecConfig
                - Purpose: Specifies the configuration settings for the decoder.
                - Restrictions: Must be an instance of the EncodecConfig class.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__()
        scaling = int(2 ** len(config.upsampling_ratios))
        model = [EncodecConv1d(config, config.hidden_size, scaling * config.num_filters, config.kernel_size)]

        model += [EncodecLSTM(config, scaling * config.num_filters)]

        # Upsample to raw audio scale
        for ratio in config.upsampling_ratios:
            current_scale = scaling * config.num_filters
            # Add upsampling layers
            model += [nn.ELU()]
            model += [
                EncodecConvTranspose1d(config, current_scale, current_scale // 2, kernel_size=ratio * 2, stride=ratio)
            ]
            # Add residual layers
            for j in range(config.num_residual_layers):
                model += [EncodecResnetBlock(config, current_scale // 2, (config.dilation_growth_rate**j, 1))]
            scaling //= 2

        # Add final layers
        model += [nn.ELU()]
        model += [EncodecConv1d(config, config.num_filters, config.audio_channels, config.last_kernel_size)]
        self.layers = nn.ModuleList(model)

    def forward(self, hidden_states):
        """
        Decodes the hidden states by applying each layer of the decoder in sequence.

        Args:
            self (object): Instance of the EncodecDecoder class.
            hidden_states (Tensor): The hidden states to decode, passed sequentially through each layer of the model.

        Returns:
            mindspore.Tensor: The decoded hidden states after passing through all layers.

        Raises:
            None.
        """
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        return hidden_states
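
For orientation, the layer stack built in __init__ can be inspected directly (a sketch assuming the default EncodecConfig, whose upsampling_ratios and num_residual_layers may differ from your configuration):

>>> from mindnlp.transformers.models.encodec.configuration_encodec import EncodecConfig
>>> from mindnlp.transformers.models.encodec.modeling_encodec import EncodecDecoder
>>> decoder = EncodecDecoder(EncodecConfig())
>>> [type(layer).__name__ for layer in decoder.layers][:6]
['EncodecConv1d', 'EncodecLSTM', 'ELU', 'EncodecConvTranspose1d', 'EncodecResnetBlock', 'ELU']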

mindnlp.transformers.models.encodec.modeling_encodec.EncodecDecoder.__init__(config)

init

Initializes an instance of the EncodecDecoder class.

PARAMETER DESCRIPTION
self

The instance of the class.

config

An instance of the EncodecConfig class containing configuration parameters for the decoder.

  • Type: EncodecConfig
  • Purpose: Specifies the configuration settings for the decoder.
  • Restrictions: Must be an instance of the EncodecConfig class.

TYPE: EncodecConfig

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def __init__(self, config: EncodecConfig):
    """
    __init__

    Initializes an instance of the EncodecDecoder class.

    Args:
        self: The instance of the class.
        config (EncodecConfig):
            An instance of the EncodecConfig class containing configuration parameters for the decoder.

            - Type: EncodecConfig
            - Purpose: Specifies the configuration settings for the decoder.
            - Restrictions: Must be an instance of the EncodecConfig class.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__()
    scaling = int(2 ** len(config.upsampling_ratios))
    model = [EncodecConv1d(config, config.hidden_size, scaling * config.num_filters, config.kernel_size)]

    model += [EncodecLSTM(config, scaling * config.num_filters)]

    # Upsample to raw audio scale
    for ratio in config.upsampling_ratios:
        current_scale = scaling * config.num_filters
        # Add upsampling layers
        model += [nn.ELU()]
        model += [
            EncodecConvTranspose1d(config, current_scale, current_scale // 2, kernel_size=ratio * 2, stride=ratio)
        ]
        # Add residual layers
        for j in range(config.num_residual_layers):
            model += [EncodecResnetBlock(config, current_scale // 2, (config.dilation_growth_rate**j, 1))]
        scaling //= 2

    # Add final layers
    model += [nn.ELU()]
    model += [EncodecConv1d(config, config.num_filters, config.audio_channels, config.last_kernel_size)]
    self.layers = nn.ModuleList(model)

mindnlp.transformers.models.encodec.modeling_encodec.EncodecDecoder.forward(hidden_states)

Decodes the hidden states by applying each layer of the decoder in sequence.

PARAMETER DESCRIPTION
self

Instance of the EncodecDecoder class.

TYPE: object

hidden_states

The hidden states to decode, passed sequentially through each layer of the model.

TYPE: Tensor

RETURNS DESCRIPTION
Tensor

The decoded hidden states after passing through all layers.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def forward(self, hidden_states):
    """
    Decodes the hidden states by applying each layer of the decoder in sequence.

    Args:
        self (object): Instance of the EncodecDecoder class.
        hidden_states (Tensor): The hidden states to decode, passed sequentially through each layer of the model.

    Returns:
        mindspore.Tensor: The decoded hidden states after passing through all layers.

    Raises:
        None.
    """
    for layer in self.layers:
        hidden_states = layer(hidden_states)
    return hidden_states

mindnlp.transformers.models.encodec.modeling_encodec.EncodecDecoderOutput dataclass

Bases: ModelOutput

PARAMETER DESCRIPTION
audio_values

Decoded audio values, obtained using the decoder part of Encodec.

TYPE: `mindspore.Tensor` of shape `(batch_size, segment_length)`, *optional* DEFAULT: None

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
@dataclass
class EncodecDecoderOutput(ModelOutput):
    """
    Args:
        audio_values (`mindspore.Tensor`  of shape `(batch_size, segment_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Encodec.
    """
    audio_values: mindspore.Tensor = None

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEncoder

Bases: Module

SEANet encoder as used by EnCodec.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
class EncodecEncoder(nn.Module):
    """SEANet encoder as used by EnCodec."""
    def __init__(self, config: EncodecConfig):
        """
        This method initializes an instance of the EncodecEncoder class.

        Args:
            self: The instance of the EncodecEncoder class.
            config (EncodecConfig):
                An instance of the EncodecConfig class containing configuration parameters for the encoder.

                - audio_channels (int): The number of audio channels.
                - num_filters (int): The number of filters to be used in the encoder.
                - kernel_size (int): The size of the kernel for convolutional layers.
                - upsampling_ratios (list): A list of integers representing the upsampling ratios for each layer.
                - num_residual_layers (int): The number of residual layers to be used in the encoder.
                - dilation_growth_rate (int): The growth rate for the dilation in the residual blocks.
                - hidden_size (int): The size of the hidden layer.
                - last_kernel_size (int): The size of the kernel for the final convolutional layer.

        Returns:
            None:
                The method initializes the layers of the encoder and assigns them to the 'layers' attribute of the
                EncodecEncoder instance.

        Raises:
            None.
        """
        super().__init__()
        model = [EncodecConv1d(config, config.audio_channels, config.num_filters, config.kernel_size)]
        scaling = 1

        # Downsample to raw audio scale
        for ratio in reversed(config.upsampling_ratios):
            current_scale = scaling * config.num_filters
            # Add residual layers
            for j in range(config.num_residual_layers):
                model += [EncodecResnetBlock(config, current_scale, [config.dilation_growth_rate**j, 1])]
            # Add downsampling layers
            model += [nn.ELU()]
            model += [EncodecConv1d(config, current_scale, current_scale * 2, kernel_size=ratio * 2, stride=ratio)]
            scaling *= 2

        model += [EncodecLSTM(config, scaling * config.num_filters)]
        model += [nn.ELU()]
        model += [EncodecConv1d(config, scaling * config.num_filters, config.hidden_size, config.last_kernel_size)]

        self.layers = nn.ModuleList(model)

    def forward(self, hidden_states):
        """
        Constructs the encoded hidden states by applying each layer in the EncodecEncoder.

        Args:
            self (EncodecEncoder): An instance of the EncodecEncoder class.
            hidden_states (Tensor): The input hidden states to encode, passed sequentially through each layer of the model.

        Returns:
            mindspore.Tensor: The encoded hidden states after passing through all layers.

        Raises:
            None.
        """
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        return hidden_states

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEncoder.__init__(config)

This method initializes an instance of the EncodecEncoder class.

PARAMETER DESCRIPTION
self

The instance of the EncodecEncoder class.

config

An instance of the EncodecConfig class containing configuration parameters for the encoder.

  • audio_channels (int): The number of audio channels.
  • num_filters (int): The number of filters to be used in the encoder.
  • kernel_size (int): The size of the kernel for convolutional layers.
  • upsampling_ratios (list): A list of integers representing the upsampling ratios for each layer.
  • num_residual_layers (int): The number of residual layers to be used in the encoder.
  • dilation_growth_rate (int): The growth rate for the dilation in the residual blocks.
  • hidden_size (int): The size of the hidden layer.
  • last_kernel_size (int): The size of the kernel for the final convolutional layer.

TYPE: EncodecConfig

RETURNS DESCRIPTION
None

The method initializes the layers of the encoder and assigns them to the 'layers' attribute of the EncodecEncoder instance.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def __init__(self, config: EncodecConfig):
    """
    This method initializes an instance of the EncodecEncoder class.

    Args:
        self: The instance of the EncodecEncoder class.
        config (EncodecConfig):
            An instance of the EncodecConfig class containing configuration parameters for the encoder.

            - audio_channels (int): The number of audio channels.
            - num_filters (int): The number of filters to be used in the encoder.
            - kernel_size (int): The size of the kernel for convolutional layers.
            - upsampling_ratios (list): A list of integers representing the upsampling ratios for each layer.
            - num_residual_layers (int): The number of residual layers to be used in the encoder.
            - dilation_growth_rate (int): The growth rate for the dilation in the residual blocks.
            - hidden_size (int): The size of the hidden layer.
            - last_kernel_size (int): The size of the kernel for the final convolutional layer.

    Returns:
        None:
            The method initializes the layers of the encoder and assigns them to the 'layers' attribute of the
            EncodecEncoder instance.

    Raises:
        None.
    """
    super().__init__()
    model = [EncodecConv1d(config, config.audio_channels, config.num_filters, config.kernel_size)]
    scaling = 1

    # Downsample to raw audio scale
    for ratio in reversed(config.upsampling_ratios):
        current_scale = scaling * config.num_filters
        # Add residual layers
        for j in range(config.num_residual_layers):
            model += [EncodecResnetBlock(config, current_scale, [config.dilation_growth_rate**j, 1])]
        # Add downsampling layers
        model += [nn.ELU()]
        model += [EncodecConv1d(config, current_scale, current_scale * 2, kernel_size=ratio * 2, stride=ratio)]
        scaling *= 2

    model += [EncodecLSTM(config, scaling * config.num_filters)]
    model += [nn.ELU()]
    model += [EncodecConv1d(config, scaling * config.num_filters, config.hidden_size, config.last_kernel_size)]

    self.layers = nn.ModuleList(model)

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEncoder.forward(hidden_states)

Constructs the encoded hidden states by applying each layer in the EncodecEncoder.

PARAMETER DESCRIPTION
self

An instance of the EncodecEncoder class.

TYPE: EncodecEncoder

hidden_states

The input hidden states to encode, passed sequentially through each layer of the model.

TYPE: Tensor

RETURNS DESCRIPTION
Tensor

The encoded hidden states after passing through all layers.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def forward(self, hidden_states):
    """
    Constructs the encoded hidden states by applying each layer in the EncodecEncoder.

    Args:
        self (EncodecEncoder): An instance of the EncodecEncoder class.
        hidden_states (Tensor): The input hidden states to encode, passed sequentially through each layer of the model.

    Returns:
        mindspore.Tensor: The encoded hidden states after passing through all layers.

    Raises:
        None.
    """
    for layer in self.layers:
        hidden_states = layer(hidden_states)
    return hidden_states

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEncoderOutput dataclass

Bases: ModelOutput

PARAMETER DESCRIPTION
audio_codes

Discrete code embeddings computed using model.encode.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional* DEFAULT: None

audio_scales

Scaling factor for each audio_codes input. This is used to unscale each chunk of audio when decoding.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional* DEFAULT: None

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
@dataclass
class EncodecEncoderOutput(ModelOutput):
    """
    Args:
        audio_codes (`mindspore.Tensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_scales (`mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding.
    """
    audio_codes: mindspore.Tensor = None
    audio_scales: mindspore.Tensor = None

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook

Bases: Module

Codebook with Euclidean distance.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
class EncodecEuclideanCodebook(nn.Module):
    """Codebook with Euclidean distance."""
    def __init__(self, config: EncodecConfig):
        """
        Initializes an instance of the EncodecEuclideanCodebook class.

        Args:
            self: The instance of the class.
            config (EncodecConfig): An object of the EncodecConfig class that contains the configuration parameters.

        Returns:
            None

        Raises:
            None
        """
        super().__init__()
        embed = mindspore.Parameter(ops.zeros(config.codebook_size, config.codebook_dim), requires_grad=False)

        self.codebook_size = config.codebook_size

        self.inited = mindspore.Parameter([True], requires_grad=False)
        self.cluster_size = mindspore.Parameter(ops.zeros(config.codebook_size), requires_grad=False)
        self.embed = embed
        self.embed_avg = embed.clone()

    def quantize(self, hidden_states):
        """
        Quantizes the given hidden states using the Euclidean codebook encoding method.

        Args:
            self (EncodecEuclideanCodebook): An instance of the EncodecEuclideanCodebook class.
            hidden_states (Tensor): A tensor representing the hidden states to be quantized.

        Returns:
            Tensor: The indices of the nearest codebook entries for each input vector.

        Raises:
            None.
        """
        embed = self.embed.t()
        scaled_states = hidden_states.pow(2).sum(1, keepdims=True)
        dist = -(scaled_states - 2 * hidden_states @ embed + embed.pow(2).sum(0, keepdims=True))
        embed_ind = dist.max(axis=-1, return_indices=True)[1]
        return embed_ind

    def encode(self, hidden_states):
        """
        Encodes the hidden states using the Euclidean Codebook method.

        Args:
            self: An instance of the EncodecEuclideanCodebook class.
            hidden_states (Tensor): The hidden states to be encoded, of shape (batch_size, sequence_length, hidden_size).

        Returns:
            Tensor: The encoded codebook indices, with the same shape as the input hidden_states except for the
                last dimension, which is removed.

        Raises:
            None.
        """
        shape = hidden_states.shape
        # pre-process
        hidden_states = hidden_states.reshape((-1, shape[-1]))
        # quantize
        embed_ind = self.quantize(hidden_states)
        # post-process
        embed_ind = embed_ind.view(*shape[:-1])
        return embed_ind

    def decode(self, embed_ind):
        """
        Decodes an embedding index using the Euclidean codebook method.

        Args:
            self (EncodecEuclideanCodebook): An instance of the EncodecEuclideanCodebook class.
            embed_ind (Tensor): The indices of the embeddings to decode.

        Returns:
            Tensor: The codebook vectors corresponding to the given indices.

        Raises:
            None.
        """
        quantize = embedding(embed_ind, self.embed)
        return quantize

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook.__init__(config)

Initializes an instance of the EncodecEuclideanCodebook class.

PARAMETER DESCRIPTION
self

The instance of the class.

config

An object of the EncodecConfig class that contains the configuration parameters.

TYPE: EncodecConfig

RETURNS DESCRIPTION

None

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def __init__(self, config: EncodecConfig):
    """
    Initializes an instance of the EncodecEuclideanCodebook class.

    Args:
        self: The instance of the class.
        config (EncodecConfig): An object of the EncodecConfig class that contains the configuration parameters.

    Returns:
        None

    Raises:
        None
    """
    super().__init__()
    embed = mindspore.Parameter(ops.zeros(config.codebook_size, config.codebook_dim), requires_grad=False)

    self.codebook_size = config.codebook_size

    self.inited = mindspore.Parameter([True], requires_grad=False)
    self.cluster_size = mindspore.Parameter(ops.zeros(config.codebook_size), requires_grad=False)
    self.embed = embed
    self.embed_avg = embed.clone()

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook.decode(embed_ind)

Decodes an embedding index using the Euclidean codebook method.

PARAMETER DESCRIPTION
self

An instance of the EncodecEuclideanCodebook class.

TYPE: EncodecEuclideanCodebook

embed_ind

The indices of the embeddings to decode.

TYPE: Tensor

RETURNS DESCRIPTION

Tensor: The codebook vectors corresponding to the given indices.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def decode(self, embed_ind):
    """
    Decodes an embedding index using the Euclidean codebook method.

    Args:
        self (EncodecEuclideanCodebook): An instance of the EncodecEuclideanCodebook class.
        embed_ind (Tensor): The indices of the embeddings to decode.

    Returns:
        Tensor: The codebook vectors corresponding to the given indices.

    Raises:
        None.
    """
    quantize = embedding(embed_ind, self.embed)
    return quantize

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook.encode(hidden_states)

Encodes the hidden states using the Euclidean Codebook method.

PARAMETER DESCRIPTION
self

An instance of the EncodecEuclideanCodebook class.

hidden_states

The hidden states to be encoded, of shape (batch_size, sequence_length, hidden_size).

TYPE: Tensor

RETURNS DESCRIPTION
Tensor

The encoded codebook indices, with the same shape as the input hidden_states except for the last dimension, which is removed.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def encode(self, hidden_states):
    """
    Encodes the hidden states using the Euclidean Codebook method.

    Args:
        self: An instance of the EncodecEuclideanCodebook class.
        hidden_states (Tensor): The hidden states to be encoded, of shape (batch_size, sequence_length, hidden_size).

    Returns:
        Tensor: The encoded codebook indices, with the same shape as the input hidden_states except for the
            last dimension, which is removed.

    Raises:
        None.
    """
    shape = hidden_states.shape
    # pre-process
    hidden_states = hidden_states.reshape((-1, shape[-1]))
    # quantize
    embed_ind = self.quantize(hidden_states)
    # post-process
    embed_ind = embed_ind.view(*shape[:-1])
    return embed_ind

mindnlp.transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook.quantize(hidden_states)

Quantizes the given hidden states using the Euclidean codebook encoding method.

PARAMETER DESCRIPTION
self

An instance of the EncodecEuclideanCodebook class.

TYPE: EncodecEuclideanCodebook

hidden_states

A tensor representing the hidden states to be quantized.

TYPE: Tensor

RETURNS DESCRIPTION

Tensor: The indices of the nearest codebook entries for each input vector.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def quantize(self, hidden_states):
    """
    Quantizes the given hidden states using the Euclidean codebook encoding method.

    Args:
        self (EncodecEuclideanCodebook): An instance of the EncodecEuclideanCodebook class.
        hidden_states (Tensor): A tensor representing the hidden states to be quantized.

    Returns:
        Tensor: The indices of the nearest codebook entries for each input vector.

    Raises:
        None.
    """
    embed = self.embed.t()
    scaled_states = hidden_states.pow(2).sum(1, keepdims=True)
    dist = -(scaled_states - 2 * hidden_states @ embed + embed.pow(2).sum(0, keepdims=True))
    embed_ind = dist.max(axis=-1, return_indices=True)[1]
    return embed_ind
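
The negated squared-distance trick above is a vectorized nearest-neighbour search. Conceptually it is equivalent to this NumPy sketch (illustrative only, not part of the library):

>>> import numpy as np
>>> hidden = np.random.randn(10, 128)      # 10 vectors with codebook_dim = 128
>>> codebook = np.random.randn(1024, 128)  # codebook_size = 1024 entries
>>> # -(||x||^2 - 2*x.e + ||e||^2) is largest where the Euclidean distance is smallest
>>> dist = -((hidden ** 2).sum(1, keepdims=True) - 2 * hidden @ codebook.T + (codebook ** 2).sum(1))
>>> indices = dist.argmax(axis=-1)
>>> indices.shape
(10,)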

mindnlp.transformers.models.encodec.modeling_encodec.EncodecLSTM

Bases: Module

LSTM without worrying about the hidden state, nor the layout of the data. Expects input as convolutional layout.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
class EncodecLSTM(nn.Module):
    """
    LSTM without worrying about the hidden state, nor the layout of the data. Expects input as convolutional layout.
    """
    def __init__(self, config, dimension):
        """
        Initializes an instance of the EncodecLSTM class.

        Args:
            self (EncodecLSTM): The instance of the EncodecLSTM class.
            config (object): The configuration object containing various settings.
            dimension (int): The dimension of the LSTM input and output.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__()
        self.lstm = nn.LSTM(dimension, dimension, config.num_lstm_layers)

    def forward(self, hidden_states):
        """
        Constructs the encoded hidden states using the Long Short-Term Memory (LSTM) algorithm.

        Args:
            self (EncodecLSTM): An instance of the EncodecLSTM class.
            hidden_states (mindspore.Tensor): The hidden states to be encoded, in convolutional layout
                of shape (batch_size, channels, sequence_length).

        Returns:
            mindspore.Tensor: The encoded hidden states, with the same shape as the input.

        Raises:
            None.

        Note:
            - The input is permuted to (sequence_length, batch_size, channels) before being fed to the LSTM.
            - The LSTM output is added element-wise to its input (residual connection).
            - The result is permuted back to the convolutional layout (batch_size, channels, sequence_length).
        """
        hidden_states = hidden_states.permute(2, 0, 1)
        hidden_states = self.lstm(hidden_states)[0] + hidden_states
        hidden_states = hidden_states.permute(1, 2, 0)
        return hidden_states
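
A shape-only sketch of the permutations described above (the EncodecConfig import path and its default num_lstm_layers are assumptions):

>>> import numpy as np
>>> import mindspore
>>> from mindnlp.transformers.models.encodec.configuration_encodec import EncodecConfig
>>> from mindnlp.transformers.models.encodec.modeling_encodec import EncodecLSTM
>>> lstm_block = EncodecLSTM(EncodecConfig(), dimension=512)
>>> x = mindspore.Tensor(np.random.randn(2, 512, 50), mindspore.float32)  # (batch, channels, seq_len)
>>> lstm_block(x).shape  # the residual LSTM keeps the convolutional layout
(2, 512, 50)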

mindnlp.transformers.models.encodec.modeling_encodec.EncodecLSTM.__init__(config, dimension)

Initializes an instance of the EncodecLSTM class.

PARAMETER DESCRIPTION
self

The instance of the EncodecLSTM class.

TYPE: EncodecLSTM

config

The configuration object containing various settings.

TYPE: object

dimension

The dimension of the LSTM input and output.

TYPE: int

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def __init__(self, config, dimension):
    """
    Initializes an instance of the EncodecLSTM class.

    Args:
        self (EncodecLSTM): The instance of the EncodecLSTM class.
        config (object): The configuration object containing various settings.
        dimension (int): The dimension of the LSTM input and output.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__()
    self.lstm = nn.LSTM(dimension, dimension, config.num_lstm_layers)

mindnlp.transformers.models.encodec.modeling_encodec.EncodecLSTM.forward(hidden_states)

Constructs the encoded hidden states using the Long Short-Term Memory (LSTM) algorithm.

PARAMETER DESCRIPTION
self

An instance of the EncodecLSTM class.

TYPE: EncodecLSTM

hidden_states

The hidden states to be encoded, in convolutional layout of shape (batch_size, channels, sequence_length).

TYPE: Tensor

RETURNS DESCRIPTION

mindspore.Tensor: The encoded hidden states, with the same shape as the input.

Note
  • The input is permuted to (sequence_length, batch_size, channels) before being fed to the LSTM.
  • The LSTM output is added element-wise to its input (residual connection).
  • The result is permuted back to the convolutional layout (batch_size, channels, sequence_length).
Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def forward(self, hidden_states):
    """
    Constructs the encoded hidden states using the Long Short-Term Memory (LSTM) algorithm.

    Args:
        self (EncodecLSTM): An instance of the EncodecLSTM class.
        hidden_states (mindspore.Tensor): The hidden states to be encoded, in convolutional layout
            of shape (batch_size, channels, sequence_length).

    Returns:
        mindspore.Tensor: The encoded hidden states, with the same shape as the input.

    Raises:
        None.

    Note:
        - The input is permuted to (sequence_length, batch_size, channels) before being fed to the LSTM.
        - The LSTM output is added element-wise to its input (residual connection).
        - The result is permuted back to the convolutional layout (batch_size, channels, sequence_length).
    """
    hidden_states = hidden_states.permute(2, 0, 1)
    hidden_states = self.lstm(hidden_states)[0] + hidden_states
    hidden_states = hidden_states.permute(1, 2, 0)
    return hidden_states

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel

Bases: EncodecPreTrainedModel

EncodecModel

This class represents an Encodec model for audio encoding and decoding. It is a subclass of EncodecPreTrainedModel.

ATTRIBUTE DESCRIPTION
config

The configuration instance used to initialize the model.

TYPE: EncodecConfig

encoder

The encoder module of the model.

TYPE: EncodecEncoder

decoder

The decoder module of the model.

TYPE: EncodecDecoder

quantizer

The quantizer module of the model.

TYPE: EncodecResidualVectorQuantizer

bits_per_codebook

The number of bits per codebook.

TYPE: int

post_init

A method called after the initialization of the model.

TYPE: method

METHOD DESCRIPTION
get_encoder

Returns the encoder module of the model.

get_decoder

Returns the decoder module of the model.

_encode_frame

Encodes the given input using the underlying VQVAE.

encode

Encodes the input audio waveform into discrete codes.

_linear_overlap_add

Applies linear overlap-add to the given frames.

_decode_frame

Decodes the given codes into an output audio waveform.

decode

Decodes the given frames into an output audio waveform.

forward

Constructs the model.

Example
>>> from datasets import load_dataset
>>> from transformers import AutoProcessor, EncodecModel
...
>>> dataset = load_dataset("ashraq/esc50")
>>> audio_sample = dataset["train"]["audio"][0]["array"]
...
>>> model_id = "facebook/encodec_24khz"
>>> model = EncodecModel.from_pretrained(model_id)
>>> processor = AutoProcessor.from_pretrained(model_id)
...
>>> inputs = processor(raw_audio=audio_sample, return_tensors="pt")
...
>>> outputs = model(**inputs)
>>> audio_codes = outputs.audio_codes
>>> audio_values = outputs.audio_values
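
Continuing the example, the discrete codes can be mapped back to a waveform with encode/decode explicitly (a sketch based on the signatures listed above; the exact output length depends on padding):

>>> encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
>>> decoder_outputs = model.decode(encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"])
>>> reconstructed = decoder_outputs.audio_values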
Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
class EncodecModel(EncodecPreTrainedModel):

    """
    EncodecModel

    This class represents an Encodec model for audio encoding and decoding. It is a subclass of EncodecPreTrainedModel.

    Attributes:
        config (EncodecConfig): The configuration instance used to initialize the model.
        encoder (EncodecEncoder): The encoder module of the model.
        decoder (EncodecDecoder): The decoder module of the model.
        quantizer (EncodecResidualVectorQuantizer): The quantizer module of the model.
        bits_per_codebook (int): The number of bits per codebook.
        post_init (method): A method called after the initialization of the model.

    Methods:
        get_encoder(): Returns the encoder module of the model.
        get_decoder(): Returns the decoder module of the model.
        _encode_frame(input_values, bandwidth, padding_mask): Encodes the given input using the underlying VQVAE.
        encode(input_values, padding_mask, bandwidth, return_dict): Encodes the input audio waveform into discrete codes.
        _linear_overlap_add(frames, stride): Applies linear overlap-add to the given frames.
        _decode_frame(codes, scale): Decodes the given codes into an output audio waveform.
        decode(audio_codes, audio_scales, padding_mask, return_dict): Decodes the given frames into an output audio waveform.
        forward(input_values, padding_mask, bandwidth, audio_codes, audio_scales, return_dict): Runs the forward pass (encode followed by decode).

    Example:
        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, EncodecModel
        ...
        >>> dataset = load_dataset("ashraq/esc50")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]
        ...
        >>> model_id = "facebook/encodec_24khz"
        >>> model = EncodecModel.from_pretrained(model_id)
        >>> processor = AutoProcessor.from_pretrained(model_id)
        ...
        >>> inputs = processor(raw_audio=audio_sample, return_tensors="pt")
        ...
        >>> outputs = model(**inputs)
        >>> audio_codes = outputs.audio_codes
        >>> audio_values = outputs.audio_values
        ```
    """
    def __init__(self, config: EncodecConfig):
        """
        Initializes an instance of the EncodecModel class.

        Args:
            self: The instance of the EncodecModel class.
            config (EncodecConfig): The configuration object containing settings for the EncodecModel.
                This parameter is required and must be of type EncodecConfig.
                It specifies the configuration settings for the EncodecModel.

        Returns:
            None.

        Raises:
            ValueError: If the codebook_size specified in the config is not a power of 2.
                This exception is raised when the codebook_size is invalid.
        """
        super().__init__(config)
        self.config = config

        self.encoder = EncodecEncoder(config)
        self.decoder = EncodecDecoder(config)

        self.quantizer = EncodecResidualVectorQuantizer(config)

        self.bits_per_codebook = int(math.log2(self.config.codebook_size))
        if 2**self.bits_per_codebook != self.config.codebook_size:
            raise ValueError("The codebook_size must be a power of 2.")

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        """
        This method returns the encoder associated with the EncodecModel instance.

        Args:
            self (EncodecModel): The instance of the EncodecModel class.
                It is used to access the attributes and methods of the class.

        Returns:
            encoder: This method returns the encoder associated with the EncodecModel instance.

        Raises:
            None.
        """
        return self.encoder

    def get_decoder(self):
        """
        This method returns the decoder object associated with the EncodecModel instance.

        Args:
            self (object): The instance of the EncodecModel class.
                It is used to access the attributes and methods of the class.

        Returns:
            decoder: The decoder module (`EncodecDecoder`) associated with this `EncodecModel` instance.

        Raises:
            None.
        """
        return self.decoder

    def _encode_frame(
        self, input_values: mindspore.Tensor, bandwidth: float, padding_mask: int
    ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor]]:
        """
        Encodes the given input using the underlying VQVAE. If `config.normalize` is set to `True` the input is first
        normalized. The padding mask is required to compute the correct scale.
        """
        length = input_values.shape[-1]
        duration = length / self.config.sampling_rate

        if self.config.chunk_length_s is not None and duration > 1e-5 + self.config.chunk_length_s:
            raise RuntimeError(f"Duration of frame ({duration}) is longer than chunk {self.config.chunk_length_s}")

        scale = None
        if self.config.normalize:
            # if the padding is non zero
            input_values = input_values * padding_mask
            mono = ops.sum(input_values, 1, keepdim=True) / input_values.shape[1]
            scale = ops.mean(mono.pow(2), dim=-1, keepdim=True).sqrt() + 1e-8
            input_values = input_values / scale

        embeddings = self.encoder(input_values)
        codes = self.quantizer.encode(embeddings, bandwidth)
        codes = codes.swapaxes(0, 1)
        return codes, scale

    def encode(
        self,
        input_values: mindspore.Tensor,
        padding_mask: mindspore.Tensor = None,
        bandwidth: Optional[float] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor, Optional[mindspore.Tensor]], EncodecEncoderOutput]:
        """
        Encodes the input audio waveform into discrete codes.

        Args:
            input_values (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Float values of the input audio waveform.
            padding_mask (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Padding mask used to pad the `input_values`.
            bandwidth (`float`, *optional*):
                The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
                bandwidth. The bandwidth is expressed in kilobits per second, e.g. a 6 kbps target is passed as
                `bandwidth=6.0`.

        Returns:
            A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling
            factors for each chunk when `normalize` is True. Each frame is a tuple `(codebook, scale)`, with
            `codebook` of shape `[batch_size, num_codebooks, frames]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if bandwidth is None:
            bandwidth = self.config.target_bandwidths[0]
        if bandwidth not in self.config.target_bandwidths:
            raise ValueError(
                f"This model doesn't support the bandwidth {bandwidth}. "
                f"Select one of {self.config.target_bandwidths}."
            )

        _, channels, input_length = input_values.shape

        if channels < 1 or channels > 2:
            raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")

        chunk_length = self.config.chunk_length
        if chunk_length is None:
            chunk_length = input_length
            stride = input_length
        else:
            stride = self.config.chunk_stride

        if padding_mask is None:
            padding_mask = ops.ones_like(input_values).bool()

        encoded_frames = []
        scales = []

        step = chunk_length - stride
        if (input_length % stride) - step != 0:
            raise ValueError(
                "The input length is not properly padded for batched chunked decoding. Make sure to pad the input correctly."
            )

        for offset in range(0, input_length - step, stride):
            mask = padding_mask[..., offset : offset + chunk_length].bool()
            frame = input_values[:, :, offset : offset + chunk_length]
            encoded_frame, scale = self._encode_frame(frame, bandwidth, mask)
            encoded_frames.append(encoded_frame)
            scales.append(scale)

        encoded_frames = ops.stack(encoded_frames)

        if not return_dict:
            return (encoded_frames, scales)

        return EncodecEncoderOutput(encoded_frames, scales)

    @staticmethod
    def _linear_overlap_add(frames: List[mindspore.Tensor], stride: int):
        """
        Method _linear_overlap_add in the EncodecModel class.

        This method performs the linear overlap-add method on a list of frames to reconstruct the original signal.

        Args:
            frames (List[mindspore.Tensor]): A list of mindspore tensors representing the input frames.
                Each frame should be a tensor of shape [batch_size, ... , frame_length].
            stride (int): An integer specifying the stride for overlapping frames.
                It determines the amount of overlap between consecutive frames.

        Returns:
            mindspore.Tensor: The reconstructed signal, obtained by summing the weighted frames and dividing by the accumulated weights.

        Raises:
            ValueError:
                - If the input list of frames is empty.
                - If the minimum element of the sum of weights (sum_weight) is zero, indicating an invalid operation.
        """
        # Generic overlap add, with linear fade-in/fade-out, supporting complex scenario
        # e.g., more than 2 frames per position.
        # The core idea is to use a weight function that is a triangle,
        # with a maximum value at the middle of the chunk.
        # We use this weighting when summing the frames, and divide by the sum of weights
        #   for each position at the end. Thus:
        #   - if a frame is the only one to cover a position, the weighting is a no-op.
        #   - if 2 frames cover a position:
        #          ...  ...
        #         /   \/   \
        #        /    /\    \
        #            S  T       , i.e. S offset of second frame starts, T end of first frame.
        # Then the weight function for each one is: (t - S), (T - t), with `t` a given offset.
        # After the final normalization, the weight of the second frame at position `t` is
        # (t - S) / (t - S + (T - t)) = (t - S) / (T - S), which is exactly what we want.
        #
        #   - if more than 2 frames overlap at a given point, we hope that by induction
        #      something sensible happens.
        if len(frames) == 0:
            raise ValueError("`frames` cannot be an empty list.")

        dtype = frames[0].dtype
        shape = frames[0].shape[:-1]
        total_size = stride * (len(frames) - 1) + frames[-1].shape[-1]

        frame_length = frames[0].shape[-1]
        time_vec = ops.linspace(0, 1, frame_length + 2).to(dtype)[1:-1]
        weight = 0.5 - (time_vec - 0.5).abs()

        sum_weight = ops.zeros(total_size, dtype=dtype)
        out = ops.zeros(*shape, total_size, dtype=dtype)
        offset: int = 0

        for frame in frames:
            frame_length = frame.shape[-1]
            out[..., offset : offset + frame_length] += weight[:frame_length] * frame
            sum_weight[offset : offset + frame_length] += weight[:frame_length]
            offset += stride

        if sum_weight.min() == 0:
            raise ValueError(f"`sum_weight` minimum element must be bigger than zero: {sum_weight}`")

        return out / sum_weight

    def _decode_frame(self, codes: mindspore.Tensor, scale: Optional[mindspore.Tensor] = None) -> mindspore.Tensor:
        """
        This method decodes the input codes and returns the corresponding output tensor.

        Args:
            self (EncodecModel): The instance of the EncodecModel class.
            codes (mindspore.Tensor): The input tensor containing the codes to be decoded.
                It is expected to have the shape (sequence_length, batch_size, code_size).
            scale (Optional[mindspore.Tensor]): An optional tensor representing the scale factor.
                If provided, it is expected to have the shape (batch_size, 1, 1). Defaults to None.

        Returns:
            mindspore.Tensor: The output tensor representing the decoded frames.
                It has the shape (sequence_length, batch_size, feature_size).

        Raises:
            ValueError: If the input codes or scale tensor have incompatible shapes for the decoding operation.
            TypeError: If the input codes or scale are not of type mindspore.Tensor.
        """
        codes = codes.swapaxes(0, 1)
        embeddings = self.quantizer.decode(codes)
        outputs = self.decoder(embeddings)
        if scale is not None:
            outputs = outputs * scale.view(-1, 1, 1)
        return outputs

    def decode(
        self,
        audio_codes: mindspore.Tensor,
        audio_scales: mindspore.Tensor,
        padding_mask: Optional[mindspore.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecDecoderOutput]:
        """
        Decodes the given frames into an output audio waveform.

        Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
        trimmed.

        Args:
            audio_codes (`mindspore.Tensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
                Discrete code embeddings computed using `model.encode`.
            audio_scales (`mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
                Scaling factor for each `audio_codes` input.
            padding_mask (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Padding mask used to pad the `input_values`.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        """
        return_dict = return_dict or self.config.return_dict

        chunk_length = self.config.chunk_length
        if chunk_length is None:
            if len(audio_codes) != 1:
                raise ValueError(f"Expected one frame, got {len(audio_codes)}")
            audio_values = self._decode_frame(audio_codes[0], audio_scales[0])
        else:
            decoded_frames = []

            for frame, scale in zip(audio_codes, audio_scales):
                frames = self._decode_frame(frame, scale)
                decoded_frames.append(frames)

            audio_values = self._linear_overlap_add(decoded_frames, self.config.chunk_stride or 1)

        # truncate based on padding mask
        if padding_mask is not None and padding_mask.shape[-1] < audio_values.shape[-1]:
            audio_values = audio_values[..., : padding_mask.shape[-1]]

        if not return_dict:
            return (audio_values,)
        return EncodecDecoderOutput(audio_values)

    def forward(
        self,
        input_values: mindspore.Tensor,
        padding_mask: Optional[mindspore.Tensor] = None,
        bandwidth: Optional[float] = None,
        audio_codes: Optional[mindspore.Tensor] = None,
        audio_scales: Optional[mindspore.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecOutput]:
        r"""
        Returns:
            Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecOutput]

        Example:
            ```python
            >>> from datasets import load_dataset
            >>> from transformers import AutoProcessor, EncodecModel
            ...
            >>> dataset = load_dataset("ashraq/esc50")
            >>> audio_sample = dataset["train"]["audio"][0]["array"]
            ...
            >>> model_id = "facebook/encodec_24khz"
            >>> model = EncodecModel.from_pretrained(model_id)
            >>> processor = AutoProcessor.from_pretrained(model_id)
            ...
            >>> inputs = processor(raw_audio=audio_sample, return_tensors="pt")
            ...
            >>> outputs = model(**inputs)
            >>> audio_codes = outputs.audio_codes
            >>> audio_values = outputs.audio_values
            ```
        """
        return_dict = return_dict or self.config.return_dict

        if padding_mask is None:
            padding_mask = ops.ones_like(input_values).bool()

        if audio_codes is not None and audio_scales is None:
            raise ValueError("You specified `audio_codes` but did not specify the `audio_scales`")

        if audio_scales is not None and audio_codes is None:
            raise ValueError("You specified `audio_scales` but did not specify the `audio_codes`")

        if audio_scales is None and audio_codes is None:
            audio_codes, audio_scales = self.encode(input_values, padding_mask, bandwidth, False)

        audio_values = self.decode(audio_codes, audio_scales, padding_mask, return_dict=return_dict)[0]
        if not return_dict:
            return (audio_codes, audio_values)

        return EncodecOutput(audio_codes=audio_codes, audio_values=audio_values)

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel.__init__(config)

Initializes an instance of the EncodecModel class.

PARAMETER DESCRIPTION
self

The instance of the EncodecModel class.

config

The configuration object containing settings for the EncodecModel. This parameter is required and must be of type EncodecConfig. It specifies the configuration settings for the EncodecModel.

TYPE: EncodecConfig

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If the codebook_size specified in the config is not a power of 2. This exception is raised when the codebook_size is invalid.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def __init__(self, config: EncodecConfig):
    """
    Initializes an instance of the EncodecModel class.

    Args:
        self: The instance of the EncodecModel class.
        config (EncodecConfig): The configuration object containing settings for the EncodecModel.
            This parameter is required and must be of type EncodecConfig.
            It specifies the configuration settings for the EncodecModel.

    Returns:
        None.

    Raises:
        ValueError: If the codebook_size specified in the config is not a power of 2.
            This exception is raised when the codebook_size is invalid.
    """
    super().__init__(config)
    self.config = config

    self.encoder = EncodecEncoder(config)
    self.decoder = EncodecDecoder(config)

    self.quantizer = EncodecResidualVectorQuantizer(config)

    self.bits_per_codebook = int(math.log2(self.config.codebook_size))
    if 2**self.bits_per_codebook != self.config.codebook_size:
        raise ValueError("The codebook_size must be a power of 2.")

    # Initialize weights and apply final processing
    self.post_init()
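
A hedged sketch of the power-of-two check above, using default configuration values (codebook_size defaults to 1024) and the import style of the examples on this page:

```python
>>> from transformers import EncodecConfig, EncodecModel
...
>>> model = EncodecModel(EncodecConfig())              # codebook_size = 1024 = 2**10
>>> model.bits_per_codebook
10
>>> EncodecModel(EncodecConfig(codebook_size=1000))    # not a power of 2
Traceback (most recent call last):
    ...
ValueError: The codebook_size must be a power of 2.
```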

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel.decode(audio_codes, audio_scales, padding_mask=None, return_dict=None)

Decodes the given frames into an output audio waveform.

Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be trimmed.

PARAMETER DESCRIPTION
audio_codes

Discrete code embeddings computed using model.encode.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional*

audio_scales

Scaling factor for each audio_codes input.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional*

padding_mask

Padding mask used to pad the input_values.

TYPE: `mindspore.Tensor` of shape `(batch_size, channels, sequence_length)` DEFAULT: None

return_dict

Whether or not to return a [~utils.ModelOutput] instead of a plain tuple.

TYPE: `bool`, *optional* DEFAULT: None

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def decode(
    self,
    audio_codes: mindspore.Tensor,
    audio_scales: mindspore.Tensor,
    padding_mask: Optional[mindspore.Tensor] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecDecoderOutput]:
    """
    Decodes the given frames into an output audio waveform.

    Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
    trimmed.

    Args:
        audio_codes (`mindspore.Tensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_scales (`mindspore.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input.
        padding_mask (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
            Padding mask used to pad the `input_values`.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

    """
    return_dict = return_dict or self.config.return_dict

    chunk_length = self.config.chunk_length
    if chunk_length is None:
        if len(audio_codes) != 1:
            raise ValueError(f"Expected one frame, got {len(audio_codes)}")
        audio_values = self._decode_frame(audio_codes[0], audio_scales[0])
    else:
        decoded_frames = []

        for frame, scale in zip(audio_codes, audio_scales):
            frames = self._decode_frame(frame, scale)
            decoded_frames.append(frames)

        audio_values = self._linear_overlap_add(decoded_frames, self.config.chunk_stride or 1)

    # truncate based on padding mask
    if padding_mask is not None and padding_mask.shape[-1] < audio_values.shape[-1]:
        audio_values = audio_values[..., : padding_mask.shape[-1]]

    if not return_dict:
        return (audio_values,)
    return EncodecDecoderOutput(audio_values)
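
A hedged sketch of pairing decode with the output of encode, reusing the model and inputs names from the class example above (not a verbatim snippet from the library). Passing the padding mask trims any extra trailing samples the decoder may produce:

```python
>>> audio_codes, audio_scales = model.encode(inputs["input_values"], return_dict=False)
>>> decoder_outputs = model.decode(audio_codes, audio_scales, padding_mask=inputs.get("padding_mask"))
>>> audio_values = decoder_outputs[0]   # EncodecDecoderOutput; index 0 is the decoded waveform
```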

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel.encode(input_values, padding_mask=None, bandwidth=None, return_dict=None)

Encodes the input audio waveform into discrete codes.

PARAMETER DESCRIPTION
input_values

Float values of the input audio waveform.

TYPE: `mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`

padding_mask

Padding mask used to pad the input_values.

TYPE: `mindspore.Tensor` of shape `(batch_size, channels, sequence_length)` DEFAULT: None

bandwidth

The target bandwidth. Must be one of config.target_bandwidths. If None, uses the smallest possible bandwidth. The bandwidth is expressed in kilobits per second, e.g. a 6 kbps target is passed as bandwidth == 6.0.

TYPE: `float`, *optional* DEFAULT: None

RETURNS DESCRIPTION
Union[Tuple[Tensor, Optional[Tensor]], EncodecEncoderOutput]

A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling factors for each chunk when normalize is True. Each frame is a tuple (codebook, scale), with codebook of shape [batch_size, num_codebooks, frames].

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def encode(
    self,
    input_values: mindspore.Tensor,
    padding_mask: mindspore.Tensor = None,
    bandwidth: Optional[float] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor, Optional[mindspore.Tensor]], EncodecEncoderOutput]:
    """
    Encodes the input audio waveform into discrete codes.

    Args:
        input_values (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
            Float values of the input audio waveform.
        padding_mask (`mindspore.Tensor` of shape `(batch_size, channels, sequence_length)`):
            Padding mask used to pad the `input_values`.
        bandwidth (`float`, *optional*):
            The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
            bandwidth. The bandwidth is expressed in kilobits per second, e.g. a 6 kbps target is passed as
            `bandwidth=6.0`.

    Returns:
        A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling
        factors for each chunk when `normalize` is True. Each frame is a tuple `(codebook, scale)`, with
        `codebook` of shape `[batch_size, num_codebooks, frames]`.
    """
    return_dict = return_dict if return_dict is not None else self.config.return_dict

    if bandwidth is None:
        bandwidth = self.config.target_bandwidths[0]
    if bandwidth not in self.config.target_bandwidths:
        raise ValueError(
            f"This model doesn't support the bandwidth {bandwidth}. "
            f"Select one of {self.config.target_bandwidths}."
        )

    _, channels, input_length = input_values.shape

    if channels < 1 or channels > 2:
        raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")

    chunk_length = self.config.chunk_length
    if chunk_length is None:
        chunk_length = input_length
        stride = input_length
    else:
        stride = self.config.chunk_stride

    if padding_mask is None:
        padding_mask = ops.ones_like(input_values).bool()

    encoded_frames = []
    scales = []

    step = chunk_length - stride
    if (input_length % stride) - step != 0:
        raise ValueError(
            "The input length is not properly padded for batched chunked decoding. Make sure to pad the input correctly."
        )

    for offset in range(0, input_length - step, stride):
        mask = padding_mask[..., offset : offset + chunk_length].bool()
        frame = input_values[:, :, offset : offset + chunk_length]
        encoded_frame, scale = self._encode_frame(frame, bandwidth, mask)
        encoded_frames.append(encoded_frame)
        scales.append(scale)

    encoded_frames = ops.stack(encoded_frames)

    if not return_dict:
        return (encoded_frames, scales)

    return EncodecEncoderOutput(encoded_frames, scales)
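
A hedged usage sketch, reusing the model and inputs names from the class example above; 6.0 is one of the default config.target_bandwidths, while 7.5 is not:

```python
>>> # with return_dict=False, encode returns a tuple of (stacked code frames, per-chunk scales)
>>> audio_codes, audio_scales = model.encode(inputs["input_values"], bandwidth=6.0, return_dict=False)
>>> num_chunks = audio_codes.shape[0]   # one entry per chunk; a single chunk when chunk_length_s is None
>>> try:
...     model.encode(inputs["input_values"], bandwidth=7.5)
... except ValueError:
...     print("unsupported bandwidth rejected")
unsupported bandwidth rejected
```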

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel.forward(input_values, padding_mask=None, bandwidth=None, audio_codes=None, audio_scales=None, return_dict=None)

RETURNS DESCRIPTION
Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecOutput]

The encoded audio_codes and the decoded audio_values, returned either as a plain tuple or wrapped in an EncodecOutput.

Example
>>> from datasets import load_dataset
>>> from transformers import AutoProcessor, EncodecModel
...
>>> dataset = load_dataset("ashraq/esc50")
>>> audio_sample = dataset["train"]["audio"][0]["array"]
...
>>> model_id = "facebook/encodec_24khz"
>>> model = EncodecModel.from_pretrained(model_id)
>>> processor = AutoProcessor.from_pretrained(model_id)
...
>>> inputs = processor(raw_audio=audio_sample, return_tensors="pt")
...
>>> outputs = model(**inputs)
>>> audio_codes = outputs.audio_codes
>>> audio_values = outputs.audio_values
Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def forward(
    self,
    input_values: mindspore.Tensor,
    padding_mask: Optional[mindspore.Tensor] = None,
    bandwidth: Optional[float] = None,
    audio_codes: Optional[mindspore.Tensor] = None,
    audio_scales: Optional[mindspore.Tensor] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecOutput]:
    r"""
    Returns:
        Union[Tuple[mindspore.Tensor, mindspore.Tensor], EncodecOutput]

    Example:
        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, EncodecModel
        ...
        >>> dataset = load_dataset("ashraq/esc50")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]
        ...
        >>> model_id = "facebook/encodec_24khz"
        >>> model = EncodecModel.from_pretrained(model_id)
        >>> processor = AutoProcessor.from_pretrained(model_id)
        ...
        >>> inputs = processor(raw_audio=audio_sample, return_tensors="pt")
        ...
        >>> outputs = model(**inputs)
        >>> audio_codes = outputs.audio_codes
        >>> audio_values = outputs.audio_values
        ```
    """
    return_dict = return_dict or self.config.return_dict

    if padding_mask is None:
        padding_mask = ops.ones_like(input_values).bool()

    if audio_codes is not None and audio_scales is None:
        raise ValueError("You specified `audio_codes` but did not specify the `audio_scales`")

    if audio_scales is not None and audio_codes is None:
        raise ValueError("You specified `audio_scales` but did not specify the `audio_codes`")

    if audio_scales is None and audio_codes is None:
        audio_codes, audio_scales = self.encode(input_values, padding_mask, bandwidth, False)

    audio_values = self.decode(audio_codes, audio_scales, padding_mask, return_dict=return_dict)[0]
    if not return_dict:
        return (audio_codes, audio_values)

    return EncodecOutput(audio_codes=audio_codes, audio_values=audio_values)
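
Besides the end-to-end example above, forward also accepts precomputed codes, in which case the encoding step is skipped (both audio_codes and audio_scales must be given). A hedged sketch reusing the names from the example:

```python
>>> audio_codes, audio_scales = model.encode(inputs["input_values"], return_dict=False)
>>> outputs = model(inputs["input_values"], audio_codes=audio_codes, audio_scales=audio_scales)
>>> reconstruction = outputs.audio_values   # decoded directly from the supplied codes
```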

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel.get_decoder()

This method returns the decoder object associated with the EncodecModel instance.

PARAMETER DESCRIPTION
self

The instance of the EncodecModel class. It is used to access the attributes and methods of the class.

TYPE: object

RETURNS DESCRIPTION
decoder

The decoder module (EncodecDecoder) associated with this EncodecModel instance.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def get_decoder(self):
    """
    This method returns the decoder object associated with the EncodecModel instance.

    Args:
        self (object): The instance of the EncodecModel class.
            It is used to access the attributes and methods of the class.

    Returns:
        decoder: The decoder module (`EncodecDecoder`) associated with this `EncodecModel` instance.

    Raises:
        None.
    """
    return self.decoder

mindnlp.transformers.models.encodec.modeling_encodec.EncodecModel.get_encoder()

This method returns the encoder associated with the EncodecModel instance.

PARAMETER DESCRIPTION
self

The instance of the EncodecModel class. It is used to access the attributes and methods of the class.

TYPE: EncodecModel

RETURNS DESCRIPTION
encoder

The encoder module (EncodecEncoder) associated with this EncodecModel instance.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def get_encoder(self):
    """
    This method returns the encoder associated with the EncodecModel instance.

    Args:
        self (EncodecModel): The instance of the EncodecModel class.
            It is used to access the attributes and methods of the class.

    Returns:
        encoder: This method returns the encoder associated with the EncodecModel instance.

    Raises:
        None.
    """
    return self.encoder

mindnlp.transformers.models.encodec.modeling_encodec.EncodecOutput dataclass

Bases: ModelOutput

PARAMETER DESCRIPTION
audio_codes

Discrete code embeddings computed using model.encode.

TYPE: `mindspore.Tensor` of shape `(batch_size, nb_chunks, chunk_length)`, *optional* DEFAULT: None

audio_values

Decoded audio values, obtained using the decoder part of Encodec.

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional* DEFAULT: None

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
@dataclass
class EncodecOutput(ModelOutput):
    """
    Args:
        audio_codes (`mindspore.Tensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_values (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Encodec.
    """
    audio_codes: mindspore.Tensor = None
    audio_values: mindspore.Tensor = None

mindnlp.transformers.models.encodec.modeling_encodec.EncodecPreTrainedModel

Bases: PreTrainedModel

An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
class EncodecPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    config_class = EncodecConfig
    base_model_prefix = "encodec"
    main_input_name = "input_values"

    def _init_weights(self, cell):
        """Initialize the weights"""
        if isinstance(cell, nn.Linear):
            cell.weight.set_data(initializer(
                Normal(sigma=self.config.initializer_range, mean=0.0)))
            if cell.bias is not None:
                cell.bias.set_data(initializer('zeros'))
        elif isinstance(cell, (nn.LayerNorm, nn.GroupNorm)):
            cell.beta.set_data(initializer('zeros', shape=cell.beta.data.shape))
            cell.gamma.set_data(initializer('ones', shape=cell.gamma.data.shape))
        elif isinstance(cell, nn.Conv1d):
            cell.weight.set_data(initializer('he_normal',shape=cell.weight.shape))
            if cell.bias is not None:
                k = math.sqrt(cell.groups / (cell.in_channels * cell.kernel_size[0]))
                cell.bias.set_data(initializer(Uniform(k), shape=cell.bias.shape))
                # nn.init.uniform_(cell.bias, a=-k, b=k)
        elif isinstance(cell, nn.Embedding):
            cell.embedding_table.set_data(initializer(
                Normal(sigma=self.config.initializer_range,mean=0.0)))
            #.normal_(mean=0.0, std=self.config.initializer_range)
            if cell.padding_idx is not None:
                cell.embedding_table.set_data(initializer('zeros',cell.padding_idx))
        elif isinstance(cell, nn.LSTM):
            for name, param in cell.parameters_and_names():
                if "weight" in name:
                    param.set_data(initializer('xavier_uniform',shape=param.shape))
                    # nn.init.xavier_uniform_(param)
                elif "bias" in name:
                    param.set_data(initializer('zeros',shape=param.shape))

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer

Bases: Module

Residual Vector Quantizer.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
class EncodecResidualVectorQuantizer(nn.Module):
    """Residual Vector Quantizer."""
    def __init__(self, config: EncodecConfig):
        """
        Initializes an instance of the EncodecResidualVectorQuantizer class.

        Args:
            self: The instance of the class.
            config (EncodecConfig):
                An object of the EncodecConfig class that holds configuration parameters.

                - codebook_size (int): The size of the codebook.
                - frame_rate (int): The frame rate.
                - num_quantizers (int): The number of quantizers.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__()
        self.codebook_size = config.codebook_size
        self.frame_rate = config.frame_rate
        self.num_quantizers = config.num_quantizers
        self.layers = nn.ModuleList([EncodecVectorQuantization(config) for _ in range(config.num_quantizers)])

    def get_num_quantizers_for_bandwidth(self, bandwidth: Optional[float] = None) -> int:
        """Return num_quantizers based on specified target bandwidth."""
        bw_per_q = math.log2(self.codebook_size) * self.frame_rate
        num_quantizers = self.num_quantizers
        if bandwidth is not None and bandwidth > 0.0:
            num_quantizers = int(max(1, math.floor(bandwidth * 1000 / bw_per_q)))
        return num_quantizers

    def encode(self, embeddings: mindspore.Tensor, bandwidth: Optional[float] = None) -> mindspore.Tensor:
        """
        Encode a given input tensor with the specified frame rate at the given bandwidth. The RVQ encode method sets
        the appropriate number of quantizers to use and returns indices for each quantizer.
        """
        num_quantizers = self.get_num_quantizers_for_bandwidth(bandwidth)
        residual = embeddings
        all_indices = []
        for layer in self.layers[:num_quantizers]:
            indices = layer.encode(residual)
            quantized = layer.decode(indices)
            residual = residual - quantized
            all_indices.append(indices)
        out_indices = ops.stack(all_indices)
        return out_indices

    def decode(self, codes: mindspore.Tensor) -> mindspore.Tensor:
        """Decode the given codes to the quantized representation."""
        quantized_out = mindspore.tensor(0.0)
        for i, indices in enumerate(codes):
            layer = self.layers[i]
            quantized = layer.decode(indices)
            quantized_out = quantized_out + quantized
        return quantized_out
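
The residual pattern in encode/decode above is easiest to see on a toy example. The sketch below is plain Python, not part of the model, and uses a hypothetical quantize helper that rounds to a grid: each stage quantizes only what the previous stages missed, and the reconstruction is the sum of the per-stage contributions.

```python
>>> def quantize(x, step):                      # hypothetical stand-in for one codebook lookup
...     return round(x / step) * step
...
>>> residual, stages = 3.14159, []              # the value to quantize, and per-stage outputs
>>> for step in (0.5, 0.1, 0.02):               # three "quantizers" of increasing resolution
...     q = quantize(residual, step)
...     stages.append(q)
...     residual -= q                            # the next stage only sees the remaining error
...
>>> round(sum(stages), 2), round(residual, 3)   # reconstruction, leftover error
(3.14, 0.002)
```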

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer.__init__(config)

Initializes an instance of the EncodecResidualVectorQuantizer class.

PARAMETER DESCRIPTION
self

The instance of the class.

config

An object of the EncodecConfig class that holds configuration parameters.

  • codebook_size (int): The size of the codebook.
  • frame_rate (int): The frame rate.
  • num_quantizers (int): The number of quantizers.

TYPE: EncodecConfig

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def __init__(self, config: EncodecConfig):
    """
    Initializes an instance of the EncodecResidualVectorQuantizer class.

    Args:
        self: The instance of the class.
        config (EncodecConfig):
            An object of the EncodecConfig class that holds configuration parameters.

            - codebook_size (int): The size of the codebook.
            - frame_rate (int): The frame rate.
            - num_quantizers (int): The number of quantizers.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__()
    self.codebook_size = config.codebook_size
    self.frame_rate = config.frame_rate
    self.num_quantizers = config.num_quantizers
    self.layers = nn.ModuleList([EncodecVectorQuantization(config) for _ in range(config.num_quantizers)])

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer.decode(codes)

Decode the given codes to the quantized representation.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def decode(self, codes: mindspore.Tensor) -> mindspore.Tensor:
    """Decode the given codes to the quantized representation."""
    quantized_out = mindspore.tensor(0.0)
    for i, indices in enumerate(codes):
        layer = self.layers[i]
        quantized = layer.decode(indices)
        quantized_out = quantized_out + quantized
    return quantized_out

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer.encode(embeddings, bandwidth=None)

Encode a given input tensor with the specified frame rate at the given bandwidth. The RVQ encode method sets the appropriate number of quantizers to use and returns indices for each quantizer.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def encode(self, embeddings: mindspore.Tensor, bandwidth: Optional[float] = None) -> mindspore.Tensor:
    """
    Encode a given input tensor with the specified frame rate at the given bandwidth. The RVQ encode method sets
    the appropriate number of quantizers to use and returns indices for each quantizer.
    """
    num_quantizers = self.get_num_quantizers_for_bandwidth(bandwidth)
    residual = embeddings
    all_indices = []
    for layer in self.layers[:num_quantizers]:
        indices = layer.encode(residual)
        quantized = layer.decode(indices)
        residual = residual - quantized
        all_indices.append(indices)
    out_indices = ops.stack(all_indices)
    return out_indices

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResidualVectorQuantizer.get_num_quantizers_for_bandwidth(bandwidth=None)

Return num_quantizers based on specified target bandwidth.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def get_num_quantizers_for_bandwidth(self, bandwidth: Optional[float] = None) -> int:
    """Return num_quantizers based on specified target bandwidth."""
    bw_per_q = math.log2(self.codebook_size) * self.frame_rate
    num_quantizers = self.num_quantizers
    if bandwidth is not None and bandwidth > 0.0:
        num_quantizers = int(max(1, math.floor(bandwidth * 1000 / bw_per_q)))
    return num_quantizers
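
A worked example of the formula above. The codebook_size of 1024 matches the documented default; the 75 Hz frame rate is assumed here for illustration (it corresponds to the 24 kHz model):

```python
>>> import math
>>> codebook_size, frame_rate = 1024, 75                # illustrative values
>>> bw_per_q = math.log2(codebook_size) * frame_rate    # 750 bits/s per quantizer
>>> int(max(1, math.floor(6.0 * 1000 / bw_per_q)))      # 6.0 kbps target bandwidth
8
```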

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResnetBlock

Bases: Module

Residual block from SEANet model as used by EnCodec.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
class EncodecResnetBlock(nn.Module):
    """
    Residual block from SEANet model as used by EnCodec.
    """
    def __init__(self, config: EncodecConfig, dim: int, dilations: List[int]):
        """
        Initialize the EncodecResnetBlock.

        Args:
            self (object): The instance of the class.
            config (EncodecConfig): An object containing configuration parameters for the block.
            dim (int): The dimension of the input data.
            dilations (List[int]): A list of dilation factors for each convolutional layer.

        Returns:
            None.

        Raises:
            ValueError: Raised if the number of kernel sizes does not match the number of dilations provided.
        """
        super().__init__()
        kernel_sizes = (config.residual_kernel_size, 1)
        if len(kernel_sizes) != len(dilations):
            raise ValueError("Number of kernel sizes should match number of dilations")

        hidden = dim // config.compress
        block = []
        for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
            in_chs = dim if i == 0 else hidden
            out_chs = dim if i == len(kernel_sizes) - 1 else hidden
            block += [nn.ELU()]
            block += [EncodecConv1d(config, in_chs, out_chs, kernel_size, dilation=dilation)]
        self.block = nn.ModuleList(block)

        if config.use_conv_shortcut:
            self.shortcut = EncodecConv1d(config, dim, dim, kernel_size=1)
        else:
            self.shortcut = nn.Identity()

    def forward(self, hidden_states):
        """
        Constructs the EncodecResnetBlock.

        This method applies a series of layers to the given hidden_states to forward the EncodecResnetBlock.
        The method returns the combined result of the residual connection and the output of the layers.

        Args:
            self (EncodecResnetBlock): An instance of the EncodecResnetBlock class.
            hidden_states (Tensor): The input hidden states to be passed through the block layers.
                Expected shape: (batch_size, hidden_size).

        Returns:
            Tensor: The combined result of the residual connection and the output of the block layers.
                Expected shape: (batch_size, hidden_size).

        Raises:
            None.
        """
        residual = hidden_states
        for layer in self.block:
            hidden_states = layer(hidden_states)

        return self.shortcut(residual) + hidden_states
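
A hedged construction sketch: the block expects exactly two dilation factors, because kernel_sizes is the pair (config.residual_kernel_size, 1). The import paths follow the module headings on this page and the dim value is illustrative:

```python
>>> from mindnlp.transformers.models.encodec.configuration_encodec import EncodecConfig
>>> from mindnlp.transformers.models.encodec.modeling_encodec import EncodecResnetBlock
...
>>> config = EncodecConfig()                                       # residual_kernel_size=3, compress=2
>>> block = EncodecResnetBlock(config, dim=32, dilations=[1, 1])   # ELU -> EncodecConv1d, twice
>>> EncodecResnetBlock(config, dim=32, dilations=[1])              # wrong number of dilations
Traceback (most recent call last):
    ...
ValueError: Number of kernel sizes should match number of dilations
```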

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResnetBlock.__init__(config, dim, dilations)

Initialize the EncodecResnetBlock.

PARAMETER DESCRIPTION
self

The instance of the class.

TYPE: object

config

An object containing configuration parameters for the block.

TYPE: EncodecConfig

dim

The dimension of the input data.

TYPE: int

dilations

A list of dilation factors for each convolutional layer.

TYPE: List[int]

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

Raised if the number of kernel sizes does not match the number of dilations provided.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def __init__(self, config: EncodecConfig, dim: int, dilations: List[int]):
    """
    Initialize the EncodecResnetBlock.

    Args:
        self (object): The instance of the class.
        config (EncodecConfig): An object containing configuration parameters for the block.
        dim (int): The dimension of the input data.
        dilations (List[int]): A list of dilation factors for each convolutional layer.

    Returns:
        None.

    Raises:
        ValueError: Raised if the number of kernel sizes does not match the number of dilations provided.
    """
    super().__init__()
    kernel_sizes = (config.residual_kernel_size, 1)
    if len(kernel_sizes) != len(dilations):
        raise ValueError("Number of kernel sizes should match number of dilations")

    hidden = dim // config.compress
    block = []
    for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
        in_chs = dim if i == 0 else hidden
        out_chs = dim if i == len(kernel_sizes) - 1 else hidden
        block += [nn.ELU()]
        block += [EncodecConv1d(config, in_chs, out_chs, kernel_size, dilation=dilation)]
    self.block = nn.ModuleList(block)

    if config.use_conv_shortcut:
        self.shortcut = EncodecConv1d(config, dim, dim, kernel_size=1)
    else:
        self.shortcut = nn.Identity()

mindnlp.transformers.models.encodec.modeling_encodec.EncodecResnetBlock.forward(hidden_states)

Constructs the EncodecResnetBlock.

This method applies the block's layers to the given hidden_states and returns their output combined with the shortcut (residual) connection.

PARAMETER DESCRIPTION
self

An instance of the EncodecResnetBlock class.

TYPE: EncodecResnetBlock

hidden_states

The input hidden states to be passed through the block layers. Expected shape: (batch_size, hidden_size).

TYPE: Tensor

RETURNS DESCRIPTION
Tensor

The combined result of the residual connection and the output of the block layers. Expected shape: (batch_size, hidden_size).

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def forward(self, hidden_states):
    """
    Constructs the EncodecResnetBlock.

    This method applies a series of layers to the given hidden_states to forward the EncodecResnetBlock.
    The method returns the combined result of the residual connection and the output of the layers.

    Args:
        self (EncodecResnetBlock): An instance of the EncodecResnetBlock class.
        hidden_states (Tensor): The input hidden states to be passed through the block layers.
            Expected shape: (batch_size, hidden_size).

    Returns:
        Tensor: The combined result of the residual connection and the output of the block layers.
            Expected shape: (batch_size, hidden_size).

    Raises:
        None.
    """
    residual = hidden_states
    for layer in self.block:
        hidden_states = layer(hidden_states)

    return self.shortcut(residual) + hidden_states

mindnlp.transformers.models.encodec.modeling_encodec.EncodecVectorQuantization

Bases: Module

Vector quantization implementation. Currently supports only euclidean distance.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
class EncodecVectorQuantization(nn.Module):
    """
    Vector quantization implementation. Currently supports only euclidean distance.
    """
    def __init__(self, config: EncodecConfig):
        """
        Initializes an instance of the EncodecVectorQuantization class.

        Args:
            self: The instance of the EncodecVectorQuantization class.
            config (EncodecConfig):
                An object of the EncodecConfig class that contains the configuration data for the vector quantization.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__()
        self.codebook = EncodecEuclideanCodebook(config)

    def encode(self, hidden_states):
        """
        Method to encode hidden states using vector quantization.

        Args:
            self (EncodecVectorQuantization): The instance of the EncodecVectorQuantization class.
            hidden_states (mindspore.Tensor):
                The hidden states to be encoded. Should be in the shape of (batch_size, hidden_dim, sequence_length).

        Returns:
            embed_in (mindspore.Tensor): The encoded representation of the hidden states.

        Raises:
            None.
        """
        hidden_states = hidden_states.permute(0, 2, 1)
        embed_in = self.codebook.encode(hidden_states)
        return embed_in

    def decode(self, embed_ind):
        """
        Decode the embedded indices to obtain the quantized vectors.

        Args:
            self (EncodecVectorQuantization): The instance of the EncodecVectorQuantization class.
            embed_ind (Tensor): A 3D tensor containing the embedded indices.
                Its shape should be (batch_size, num_channels, num_embeddings).

        Returns:
            quantize (Tensor): A 3D tensor representing the quantized vectors after decoding.
                The shape of the tensor is (batch_size, num_embeddings, num_channels).

        Raises:
            ValueError: If the embed_ind tensor is not of the expected shape.
            RuntimeError: If there is an issue with decoding the embedded indices.
        """
        quantize = self.codebook.decode(embed_ind)
        quantize = quantize.permute(0, 2, 1)
        return quantize

mindnlp.transformers.models.encodec.modeling_encodec.EncodecVectorQuantization.__init__(config)

Initializes an instance of the EncodecVectorQuantization class.

PARAMETER DESCRIPTION
self

The instance of the EncodecVectorQuantization class.

config

An object of the EncodecConfig class that contains the configuration data for the vector quantization.

TYPE: EncodecConfig

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def __init__(self, config: EncodecConfig):
    """
    Initializes an instance of the EncodecVectorQuantization class.

    Args:
        self: The instance of the EncodecVectorQuantization class.
        config (EncodecConfig):
            An object of the EncodecConfig class that contains the configuration data for the vector quantization.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__()
    self.codebook = EncodecEuclideanCodebook(config)

mindnlp.transformers.models.encodec.modeling_encodec.EncodecVectorQuantization.decode(embed_ind)

Decode the embedded indices to obtain the quantized vectors.

PARAMETER DESCRIPTION
self

The instance of the EncodecVectorQuantization class.

TYPE: EncodecVectorQuantization

embed_ind

A 3D tensor containing the embedded indices. Its shape should be (batch_size, num_channels, num_embeddings).

TYPE: Tensor

RETURNS DESCRIPTION
quantize

A 3D tensor representing the quantized vectors after decoding. The shape of the tensor is (batch_size, num_embeddings, num_channels).

TYPE: Tensor

RAISES DESCRIPTION
ValueError

If the embed_ind tensor is not of the expected shape.

RuntimeError

If there is an issue with decoding the embedded indices.

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def decode(self, embed_ind):
    """
    Decode the embedded indices to obtain the quantized vectors.

    Args:
        self (EncodecVectorQuantization): The instance of the EncodecVectorQuantization class.
        embed_ind (Tensor): A 3D tensor containing the embedded indices.
            Its shape should be (batch_size, num_channels, num_embeddings).

    Returns:
        quantize (Tensor): A 3D tensor representing the quantized vectors after decoding.
            The shape of the tensor is (batch_size, num_embeddings, num_channels).

    Raises:
        ValueError: If the embed_ind tensor is not of the expected shape.
        RuntimeError: If there is an issue with decoding the embedded indices.
    """
    quantize = self.codebook.decode(embed_ind)
    quantize = quantize.permute(0, 2, 1)
    return quantize

mindnlp.transformers.models.encodec.modeling_encodec.EncodecVectorQuantization.encode(hidden_states)

Method to encode hidden states using vector quantization.

PARAMETER DESCRIPTION
self

The instance of the EncodecVectorQuantization class.

TYPE: EncodecVectorQuantization

hidden_states

The hidden states to be encoded. Should be in the shape of (batch_size, hidden_dim, sequence_length).

TYPE: Tensor

RETURNS DESCRIPTION
embed_in

The encoded representation of the hidden states.

TYPE: Tensor

Source code in mindnlp/transformers/models/encodec/modeling_encodec.py
def encode(self, hidden_states):
    """
    Method to encode hidden states using vector quantization.

    Args:
        self (EncodecVectorQuantization): The instance of the EncodecVectorQuantization class.
        hidden_states (mindspore.Tensor):
            The hidden states to be encoded. Should be in the shape of (batch_size, hidden_dim, sequence_length).

    Returns:
        embed_in (mindspore.Tensor): The encoded representation of the hidden states.

    Raises:
        None.
    """
    hidden_states = hidden_states.permute(0, 2, 1)
    embed_in = self.codebook.encode(hidden_states)
    return embed_in

mindnlp.transformers.models.encodec.configuration_encodec

Encodec Model config

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of an [EncodecModel]. It is used to instantiate an Encodec model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a configuration similar to that of the facebook/encodec_24khz architecture.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

PARAMETER DESCRIPTION
target_bandwidths

The range of different bandwidths the model can encode audio with.

TYPE: `List[float]`, *optional*, defaults to `[1.5, 3.0, 6.0, 12.0, 24.0]` DEFAULT: [1.5, 3.0, 6.0, 12.0, 24.0]

sampling_rate

The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).

TYPE: `int`, *optional*, defaults to 24000 DEFAULT: 24000

audio_channels

Number of channels in the audio data. Either 1 for mono or 2 for stereo.

TYPE: `int`, *optional*, defaults to 1 DEFAULT: 1

normalize

Whether the audio should be normalized when passed.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

chunk_length_s

If defined, the audio is pre-processed into chunks of length chunk_length_s and then encoded.

TYPE: `float`, *optional* DEFAULT: None

overlap

Defines the overlap between consecutive chunks. It is used to compute the chunk_stride using the following formula: int((1.0 - self.overlap) * self.chunk_length).

TYPE: `float`, *optional* DEFAULT: None

hidden_size

Intermediate representation dimension.

TYPE: `int`, *optional*, defaults to 128 DEFAULT: 128

num_filters

Number of convolution kernels of first EncodecConv1d down sampling layer.

TYPE: `int`, *optional*, defaults to 32 DEFAULT: 32

num_residual_layers

Number of residual layers.

TYPE: `int`, *optional*, defaults to 1 DEFAULT: 1

upsampling_ratios

Kernel size and stride ratios. The encoder uses downsampling ratios instead of upsampling ratios, hence it will use the ratios in reverse order to the ones specified here, which must match the decoder order.

TYPE: `Sequence[int]` , *optional*, defaults to `[8, 5, 4, 2]` DEFAULT: [8, 5, 4, 2]

norm_type

Normalization method. Should be in ["weight_norm", "time_group_norm"]

TYPE: `str`, *optional*, defaults to `"weight_norm"` DEFAULT: 'weight_norm'

kernel_size

Kernel size for the initial convolution.

TYPE: `int`, *optional*, defaults to 7 DEFAULT: 7

last_kernel_size

Kernel size for the last convolution layer.

TYPE: `int`, *optional*, defaults to 7 DEFAULT: 7

residual_kernel_size

Kernel size for the residual layers.

TYPE: `int`, *optional*, defaults to 3 DEFAULT: 3

dilation_growth_rate

How much to increase the dilation with each layer.

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

use_causal_conv

Whether to use fully causal convolution.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

pad_mode

Padding mode for the convolutions.

TYPE: `str`, *optional*, defaults to `"reflect"` DEFAULT: 'reflect'

compress

Reduced dimensionality in residual branches (from Demucs v3).

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

num_lstm_layers

Number of LSTM layers at the end of the encoder.

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

trim_right_ratio

Ratio for trimming at the right of the transposed convolution under the use_causal_conv = True setup. If equal to 1.0, it means that all the trimming is done at the right.

TYPE: `float`, *optional*, defaults to 1.0 DEFAULT: 1.0

codebook_size

Number of discrete codes that make up the VQVAE.

TYPE: `int`, *optional*, defaults to 1024 DEFAULT: 1024

codebook_dim

Dimension of the codebook vectors. If not defined, uses hidden_size.

TYPE: `int`, *optional* DEFAULT: None

use_conv_shortcut

Whether to use a convolutional layer as the 'skip' connection in the EncodecResnetBlock block. If False, an identity function will be used, giving a generic residual connection.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

Example
>>> from mindnlp.transformers import EncodecModel, EncodecConfig
...
>>> # Initializing a "facebook/encodec_24khz" style configuration
>>> configuration = EncodecConfig()
...
>>> # Initializing a model (with random weights) from the "facebook/encodec_24khz" style configuration
>>> model = EncodecModel(configuration)
...
>>> # Accessing the model configuration
>>> configuration = model.config
Source code in mindnlp/transformers/models/encodec/configuration_encodec.py, lines 34-239
class EncodecConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`EncodecModel`]. It is used to instantiate an
    Encodec model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the
    [facebook/encodec_24khz](https://hf-mirror.com/facebook/encodec_24khz) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        target_bandwidths (`List[float]`, *optional*, defaults to `[1.5, 3.0, 6.0, 12.0, 24.0]`):
            The range of different bandwidths the model can encode audio with.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).
        audio_channels (`int`, *optional*, defaults to 1):
            Number of channels in the audio data. Either 1 for mono or 2 for stereo.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether the audio shall be normalized when passed.
        chunk_length_s (`float`, *optional*):
            If defined, the audio is pre-processed into chunks of length `chunk_length_s` and then encoded.
        overlap (`float`, *optional*):
            Defines the overlap between each chunk. It is used to compute the `chunk_stride` using the following
            formula: `int((1.0 - self.overlap) * self.chunk_length)`.
        hidden_size (`int`, *optional*, defaults to 128):
            Intermediate representation dimension.
        num_filters (`int`, *optional*, defaults to 32):
            Number of convolution kernels of first `EncodecConv1d` down sampling layer.
        num_residual_layers (`int`,  *optional*, defaults to 1):
            Number of residual layers.
        upsampling_ratios (`Sequence[int]` , *optional*, defaults to `[8, 5, 4, 2]`):
            Kernel size and stride ratios. The encoder uses downsampling ratios instead of upsampling ratios, hence it
            will use the ratios in the reverse order to the ones specified here that must match the decoder order.
        norm_type (`str`, *optional*, defaults to `"weight_norm"`):
            Normalization method. Should be in `["weight_norm", "time_group_norm"]`
        kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for the initial convolution.
        last_kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for the last convolution layer.
        residual_kernel_size (`int`, *optional*, defaults to 3):
            Kernel size for the residual layers.
        dilation_growth_rate (`int`, *optional*, defaults to 2):
            How much to increase the dilation with each layer.
        use_causal_conv (`bool`, *optional*, defaults to `True`):
            Whether to use fully causal convolution.
        pad_mode (`str`, *optional*, defaults to `"reflect"`):
            Padding mode for the convolutions.
        compress (`int`, *optional*, defaults to 2):
            Reduced dimensionality in residual branches (from Demucs v3).
        num_lstm_layers (`int`, *optional*, defaults to 2):
            Number of LSTM layers at the end of the encoder.
        trim_right_ratio (`float`, *optional*, defaults to 1.0):
            Ratio for trimming at the right of the transposed convolution under the `use_causal_conv = True` setup. If
            equal to 1.0, it means that all the trimming is done at the right.
        codebook_size (`int`, *optional*, defaults to 1024):
            Number of discrete codes that make up the VQVAE.
        codebook_dim (`int`, *optional*):
            Dimension of the codebook vectors. If not defined, uses `hidden_size`.
        use_conv_shortcut (`bool`, *optional*, defaults to `True`):
            Whether to use a convolutional layer as the 'skip' connection in the `EncodecResnetBlock` block. If False,
            an identity function will be used, giving a generic residual connection.

    Example:
        ```python
        >>> from mindnlp.transformers import EncodecModel, EncodecConfig
        ...
        >>> # Initializing a "facebook/encodec_24khz" style configuration
        >>> configuration = EncodecConfig()
        ...
        >>> # Initializing a model (with random weights) from the "facebook/encodec_24khz" style configuration
        >>> model = EncodecModel(configuration)
        ...
        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```
    """
    model_type = "encodec"
    #pylint: disable=W0102
    def __init__(
        self,
        target_bandwidths = [1.5, 3.0, 6.0, 12.0, 24.0],
        sampling_rate=24_000,
        audio_channels=1,
        normalize=False,
        chunk_length_s=None,
        overlap=None,
        hidden_size=128,
        num_filters=32,
        num_residual_layers=1,
        upsampling_ratios = [8, 5, 4, 2],
        norm_type="weight_norm",
        kernel_size=7,
        last_kernel_size=7,
        residual_kernel_size=3,
        dilation_growth_rate=2,
        use_causal_conv=True,
        pad_mode="reflect",
        compress=2,
        num_lstm_layers=2,
        trim_right_ratio=1.0,
        codebook_size=1024,
        codebook_dim=None,
        use_conv_shortcut=True,
        **kwargs,
    ):
        """
        Initializes an instance of the EncodecConfig class.

        Args:
            self: The instance of the class.
            target_bandwidths (list[float]): List of target bandwidths in kbps (kilobits per second). Default is [1.5, 3.0, 6.0, 12.0, 24.0].
            sampling_rate (int): The audio sampling rate in Hz. Default is 24000.
            audio_channels (int): The number of audio channels. Default is 1.
            normalize (bool): Flag indicating whether to normalize the audio. Default is False.
            chunk_length_s (float): The length of audio chunks in seconds. Default is None.
            overlap (float): The overlap ratio between audio chunks. Default is None.
            hidden_size (int): The size of the hidden state in the model. Default is 128.
            num_filters (int): The number of filters in the model. Default is 32.
            num_residual_layers (int): The number of residual layers in the model. Default is 1.
            upsampling_ratios (list[int]): List of upsampling ratios. Default is [8, 5, 4, 2].
            norm_type (str): The type of normalization. Must be either 'weight_norm' or 'time_group_norm'. Default is 'weight_norm'.
            kernel_size (int): The size of the convolutional kernel. Default is 7.
            last_kernel_size (int): The size of the last convolutional kernel. Default is 7.
            residual_kernel_size (int): The size of the residual convolutional kernel. Default is 3.
            dilation_growth_rate (int): The growth rate of dilation in the model. Default is 2.
            use_causal_conv (bool): Flag indicating whether to use causal convolution. Default is True.
            pad_mode (str): The padding mode for convolution. Default is 'reflect'.
            compress (int): The compression factor for audio. Default is 2.
            num_lstm_layers (int): The number of LSTM layers in the model. Default is 2.
            trim_right_ratio (float): The ratio of trimming audio from the right. Default is 1.0.
            codebook_size (int): The size of the codebook. Default is 1024.
            codebook_dim (int): The dimension of the codebook. Default is equal to hidden_size if not provided.
            use_conv_shortcut (bool): Flag indicating whether to use convolution shortcut. Default is True.

        Returns:
            None.

        Raises:
            ValueError: If norm_type is not 'weight_norm' or 'time_group_norm'.

        """
        self.target_bandwidths = target_bandwidths
        self.sampling_rate = sampling_rate
        self.audio_channels = audio_channels
        self.normalize = normalize
        self.chunk_length_s = chunk_length_s
        self.overlap = overlap
        self.hidden_size = hidden_size
        self.num_filters = num_filters
        self.num_residual_layers = num_residual_layers
        self.upsampling_ratios = upsampling_ratios
        self.norm_type = norm_type
        self.kernel_size = kernel_size
        self.last_kernel_size = last_kernel_size
        self.residual_kernel_size = residual_kernel_size
        self.dilation_growth_rate = dilation_growth_rate
        self.use_causal_conv = use_causal_conv
        self.pad_mode = pad_mode
        self.compress = compress
        self.num_lstm_layers = num_lstm_layers
        self.trim_right_ratio = trim_right_ratio
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim if codebook_dim is not None else hidden_size
        self.use_conv_shortcut = use_conv_shortcut

        if self.norm_type not in ["weight_norm", "time_group_norm"]:
            raise ValueError(
                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`), got {self.norm_type}'
            )

        super().__init__(**kwargs)

    # This is a property because you might want to change the chunk_length_s on the fly
    @property
    def chunk_length(self) -> Optional[int]:
        r"""
        chunk_length
        """
        if self.chunk_length_s is None:
            return None
        return int(self.chunk_length_s * self.sampling_rate)

    # This is a property because you might want to change the chunk_length_s on the fly
    @property
    def chunk_stride(self) -> Optional[int]:
        r"""
        chunk_stride
        """
        if self.chunk_length_s is None or self.overlap is None:
            return None
        return max(1, int((1.0 - self.overlap) * self.chunk_length))

    @property
    def frame_rate(self) -> int:
        r"""
        frame_rate
        """
        hop_length = np.prod(self.upsampling_ratios)
        return math.ceil(self.sampling_rate / hop_length)

    @property
    def num_quantizers(self) -> int:
        r"""
        num_quantizers
        """
        return int(1000 * self.target_bandwidths[-1] // (self.frame_rate * 10))
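
The `chunk_length`, `chunk_stride`, `frame_rate`, and `num_quantizers` properties documented below are derived from the raw configuration fields. As a quick sanity check, the sketch below mirrors those property definitions with the default 24 kHz values; the `chunk_length_s=1.0` and `overlap=0.01` chunking values are hypothetical, since chunking is disabled by default.

```python
import math
import numpy as np

sampling_rate = 24_000
upsampling_ratios = [8, 5, 4, 2]
target_bandwidths = [1.5, 3.0, 6.0, 12.0, 24.0]

hop_length = np.prod(upsampling_ratios)                     # 320 samples per frame
frame_rate = math.ceil(sampling_rate / hop_length)          # 75 frames per second
num_quantizers = int(1000 * target_bandwidths[-1] // (frame_rate * 10))  # 32

chunk_length_s, overlap = 1.0, 0.01                         # hypothetical chunking setup
chunk_length = int(chunk_length_s * sampling_rate)          # 24000 samples
chunk_stride = max(1, int((1.0 - overlap) * chunk_length))  # 23760 samples

print(frame_rate, num_quantizers, chunk_length, chunk_stride)
```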

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.chunk_length: Optional[int] property

chunk_length

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.chunk_stride: Optional[int] property

chunk_stride

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.frame_rate: int property

frame_rate

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.num_quantizers: int property

num_quantizers

mindnlp.transformers.models.encodec.configuration_encodec.EncodecConfig.__init__(target_bandwidths=[1.5, 3.0, 6.0, 12.0, 24.0], sampling_rate=24000, audio_channels=1, normalize=False, chunk_length_s=None, overlap=None, hidden_size=128, num_filters=32, num_residual_layers=1, upsampling_ratios=[8, 5, 4, 2], norm_type='weight_norm', kernel_size=7, last_kernel_size=7, residual_kernel_size=3, dilation_growth_rate=2, use_causal_conv=True, pad_mode='reflect', compress=2, num_lstm_layers=2, trim_right_ratio=1.0, codebook_size=1024, codebook_dim=None, use_conv_shortcut=True, **kwargs)

Initializes an instance of the EncodecConfig class.

PARAMETER DESCRIPTION
self

The instance of the class.

target_bandwidths

List of target bandwidths in kbps (kilobits per second). Default is [1.5, 3.0, 6.0, 12.0, 24.0].

TYPE: list[float] DEFAULT: [1.5, 3.0, 6.0, 12.0, 24.0]

sampling_rate

The audio sampling rate in Hz. Default is 24000.

TYPE: int DEFAULT: 24000

audio_channels

The number of audio channels. Default is 1.

TYPE: int DEFAULT: 1

normalize

Flag indicating whether to normalize the audio. Default is False.

TYPE: bool DEFAULT: False

chunk_length_s

The length of audio chunks in seconds. Default is None.

TYPE: float DEFAULT: None

overlap

The overlap ratio between audio chunks. Default is None.

TYPE: float DEFAULT: None

hidden_size

The size of the hidden state in the model. Default is 128.

TYPE: int DEFAULT: 128

num_filters

The number of filters in the model. Default is 32.

TYPE: int DEFAULT: 32

num_residual_layers

The number of residual layers in the model. Default is 1.

TYPE: int DEFAULT: 1

upsampling_ratios

List of upsampling ratios. Default is [8, 5, 4, 2].

TYPE: list[int] DEFAULT: [8, 5, 4, 2]

norm_type

The type of normalization. Must be either 'weight_norm' or 'time_group_norm'. Default is 'weight_norm'.

TYPE: str DEFAULT: 'weight_norm'

kernel_size

The size of the convolutional kernel. Default is 7.

TYPE: int DEFAULT: 7

last_kernel_size

The size of the last convolutional kernel. Default is 7.

TYPE: int DEFAULT: 7

residual_kernel_size

The size of the residual convolutional kernel. Default is 3.

TYPE: int DEFAULT: 3

dilation_growth_rate

The growth rate of dilation in the model. Default is 2.

TYPE: int DEFAULT: 2

use_causal_conv

Flag indicating whether to use causal convolution. Default is True.

TYPE: bool DEFAULT: True

pad_mode

The padding mode for convolution. Default is 'reflect'.

TYPE: str DEFAULT: 'reflect'

compress

The compression factor for audio. Default is 2.

TYPE: int DEFAULT: 2

num_lstm_layers

The number of LSTM layers in the model. Default is 2.

TYPE: int DEFAULT: 2

trim_right_ratio

The ratio of trimming audio from the right. Default is 1.0.

TYPE: float DEFAULT: 1.0

codebook_size

The size of the codebook. Default is 1024.

TYPE: int DEFAULT: 1024

codebook_dim

The dimension of the codebook. Default is equal to hidden_size if not provided.

TYPE: int DEFAULT: None

use_conv_shortcut

Flag indicating whether to use convolution shortcut. Default is True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If norm_type is not 'weight_norm' or 'time_group_norm'.

Source code in mindnlp/transformers/models/encodec/configuration_encodec.py, lines 112-204
def __init__(
    self,
    target_bandwidths = [1.5, 3.0, 6.0, 12.0, 24.0],
    sampling_rate=24_000,
    audio_channels=1,
    normalize=False,
    chunk_length_s=None,
    overlap=None,
    hidden_size=128,
    num_filters=32,
    num_residual_layers=1,
    upsampling_ratios = [8, 5, 4, 2],
    norm_type="weight_norm",
    kernel_size=7,
    last_kernel_size=7,
    residual_kernel_size=3,
    dilation_growth_rate=2,
    use_causal_conv=True,
    pad_mode="reflect",
    compress=2,
    num_lstm_layers=2,
    trim_right_ratio=1.0,
    codebook_size=1024,
    codebook_dim=None,
    use_conv_shortcut=True,
    **kwargs,
):
    """
    Initializes an instance of the EncodecConfig class.

    Args:
        self: The instance of the class.
        target_bandwidths (list[float]): List of target bandwidths in kbps (kilobits per second). Default is [1.5, 3.0, 6.0, 12.0, 24.0].
        sampling_rate (int): The audio sampling rate in Hz. Default is 24000.
        audio_channels (int): The number of audio channels. Default is 1.
        normalize (bool): Flag indicating whether to normalize the audio. Default is False.
        chunk_length_s (float): The length of audio chunks in seconds. Default is None.
        overlap (float): The overlap ratio between audio chunks. Default is None.
        hidden_size (int): The size of the hidden state in the model. Default is 128.
        num_filters (int): The number of filters in the model. Default is 32.
        num_residual_layers (int): The number of residual layers in the model. Default is 1.
        upsampling_ratios (list[int]): List of upsampling ratios. Default is [8, 5, 4, 2].
        norm_type (str): The type of normalization. Must be either 'weight_norm' or 'time_group_norm'. Default is 'weight_norm'.
        kernel_size (int): The size of the convolutional kernel. Default is 7.
        last_kernel_size (int): The size of the last convolutional kernel. Default is 7.
        residual_kernel_size (int): The size of the residual convolutional kernel. Default is 3.
        dilation_growth_rate (int): The growth rate of dilation in the model. Default is 2.
        use_causal_conv (bool): Flag indicating whether to use causal convolution. Default is True.
        pad_mode (str): The padding mode for convolution. Default is 'reflect'.
        compress (int): The compression factor for audio. Default is 2.
        num_lstm_layers (int): The number of LSTM layers in the model. Default is 2.
        trim_right_ratio (float): The ratio of trimming audio from the right. Default is 1.0.
        codebook_size (int): The size of the codebook. Default is 1024.
        codebook_dim (int): The dimension of the codebook. Default is equal to hidden_size if not provided.
        use_conv_shortcut (bool): Flag indicating whether to use convolution shortcut. Default is True.

    Returns:
        None.

    Raises:
        ValueError: If norm_type is not 'weight_norm' or 'time_group_norm'.

    """
    self.target_bandwidths = target_bandwidths
    self.sampling_rate = sampling_rate
    self.audio_channels = audio_channels
    self.normalize = normalize
    self.chunk_length_s = chunk_length_s
    self.overlap = overlap
    self.hidden_size = hidden_size
    self.num_filters = num_filters
    self.num_residual_layers = num_residual_layers
    self.upsampling_ratios = upsampling_ratios
    self.norm_type = norm_type
    self.kernel_size = kernel_size
    self.last_kernel_size = last_kernel_size
    self.residual_kernel_size = residual_kernel_size
    self.dilation_growth_rate = dilation_growth_rate
    self.use_causal_conv = use_causal_conv
    self.pad_mode = pad_mode
    self.compress = compress
    self.num_lstm_layers = num_lstm_layers
    self.trim_right_ratio = trim_right_ratio
    self.codebook_size = codebook_size
    self.codebook_dim = codebook_dim if codebook_dim is not None else hidden_size
    self.use_conv_shortcut = use_conv_shortcut

    if self.norm_type not in ["weight_norm", "time_group_norm"]:
        raise ValueError(
            f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`), got {self.norm_type}'
        )

    super().__init__(**kwargs)
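
Note that `norm_type` is validated eagerly in the constructor, so a misspelled value fails at configuration time rather than when the model is built. A minimal sketch, assuming `mindnlp` is installed and the class is importable from the module path documented above:

```python
from mindnlp.transformers.models.encodec.configuration_encodec import EncodecConfig

config = EncodecConfig(norm_type="time_group_norm")  # the other accepted value besides "weight_norm"

try:
    EncodecConfig(norm_type="layer_norm")            # not in the allowed set
except ValueError as err:
    print(err)                                       # reports the rejected norm_type
```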

mindnlp.transformers.models.encodec.feature_extraction_encodec

Feature extractor class for EnCodec.

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor

Bases: SequenceFeatureExtractor

Constructs an EnCodec feature extractor.

This feature extractor inherits from [~feature_extraction_sequence_utils.SequenceFeatureExtractor] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods.

Instantiating a feature extractor with the defaults will yield a similar configuration to that of the facebook/encodec_24khz architecture.

PARAMETER DESCRIPTION
feature_size

The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.

TYPE: `int`, *optional*, defaults to 1 DEFAULT: 1

sampling_rate

The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).

TYPE: `int`, *optional*, defaults to 24000 DEFAULT: 24000

padding_value

The value that is used to fill the padding values.

TYPE: `float`, *optional*, defaults to 0.0 DEFAULT: 0.0

chunk_length_s

If defined, the audio is pre-processed into chunks of length chunk_length_s and then encoded.

TYPE: `float`, *optional* DEFAULT: None

overlap

Defines the overlap between consecutive chunks. It is used to compute the chunk_stride using the following formula: int((1.0 - self.overlap) * self.chunk_length).

TYPE: `float`, *optional* DEFAULT: None

Source code in mindnlp/transformers/models/encodec/feature_extraction_encodec.py, lines 28-224
class EncodecFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs an EnCodec feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Instantiating a feature extractor with the defaults will yield a similar configuration to that of the
    [facebook/encodec_24khz](https://hf-mirror.com/facebook/encodec_24khz) architecture.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio waveform should be digitized, expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used to fill the padding values.
        chunk_length_s (`float`, *optional*):
            If defined, the audio is pre-processed into chunks of length `chunk_length_s` and then encoded.
        overlap (`float`, *optional*):
            Defines the overlap between each chunk. It is used to compute the `chunk_stride` using the following
            formula: `int((1.0 - self.overlap) * self.chunk_length)`.
    """
    model_input_names = ["input_values", "padding_mask"]

    def __init__(
        self,
        feature_size: int = 1,
        sampling_rate: int = 24000,
        padding_value: float = 0.0,
        chunk_length_s: float = None,
        overlap: float = None,
        **kwargs,
    ):
        """
        Initialize the EncodecFeatureExtractor class with the given parameters.

        Args:
            self: The instance of the class.
            feature_size (int): The size of the feature. Default is 1.
            sampling_rate (int): The sampling rate in Hz. Default is 24000.
            padding_value (float): The value used for padding. Default is 0.0.
            chunk_length_s (float): The length of each chunk in seconds. Default is None.
            overlap (float): The overlap ratio between consecutive chunks, a value between 0.0 and 1.0. Default is None.
            **kwargs: Additional keyword arguments.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
        self.chunk_length_s = chunk_length_s
        self.overlap = overlap

    @property
    def chunk_length(self) -> Optional[int]:
        r"""
        # This is a property because you might want to change the chunk_length_s on the fly
        """
        if self.chunk_length_s is None:
            return None
        return int(self.chunk_length_s * self.sampling_rate)

    @property
    def chunk_stride(self) -> Optional[int]:
        r"""
        # This is a property because you might want to change the chunk_length_s on the fly
        """
        if self.chunk_length_s is None or self.overlap is None:
            return None
        return max(1, int((1.0 - self.overlap) * self.chunk_length))

    def __call__(
        self,
        raw_audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
        truncation: Optional[bool] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
                `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
                (`feature_size = 2`).
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                lengths).
            truncation (`bool`, *optional*, defaults to `False`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.
        """
        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
                    f" {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        if padding and truncation:
            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
        if padding is None:
            # by default let's pad the inputs
            padding = True

        is_batched = bool(
            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
        )

        if is_batched:
            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
        elif not is_batched and not isinstance(raw_audio, np.ndarray):
            raw_audio = np.asarray(raw_audio, dtype=np.float32)
        elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
            raw_audio = raw_audio.astype(np.float32)

        # always return batch
        if not is_batched:
            raw_audio = [np.asarray(raw_audio).T]

        # verify inputs are valid
        for _, example in enumerate(raw_audio):
            if example.ndim > 2:
                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
            if self.feature_size == 1 and example.ndim != 1:
                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
            if self.feature_size == 2 and example.shape[-1] != 2:
                raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")

        padded_inputs = None
        input_values = BatchFeature({"input_values": raw_audio})
        if self.chunk_stride is not None and self.chunk_length is not None and max_length is None:
            if truncation:
                max_length = min(array.shape[0] for array in raw_audio)
                nb_step = int(np.floor(max_length / self.chunk_stride))
                max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
            elif padding:
                max_length = max(array.shape[0] for array in raw_audio)
                nb_step = int(np.ceil(max_length / self.chunk_stride))
                max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
                padding = "max_length"
            else:
                padded_inputs = input_values

        # normal padding on batch
        if padded_inputs is None:
            padded_inputs = self.pad(
                input_values,
                max_length=max_length,
                truncation=truncation,
                padding=padding,
                return_attention_mask=padding,
            )
            if padding:
                padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")

        input_values = []
        for example in padded_inputs.pop("input_values"):
            if self.feature_size == 1:
                example = example[..., None]
            input_values.append(example.T)

        padded_inputs["input_values"] = input_values
        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs
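
Before looking at the individual members, here is a minimal end-to-end usage sketch (assuming `mindnlp` is installed; the one-second 440 Hz test tone is made up for illustration):

```python
import numpy as np
from mindnlp.transformers.models.encodec.feature_extraction_encodec import EncodecFeatureExtractor

feature_extractor = EncodecFeatureExtractor(feature_size=1, sampling_rate=24_000)

# one second of a 440 Hz sine wave as mono float32 audio
t = np.arange(24_000) / 24_000
audio = np.sin(2 * np.pi * 440 * t).astype(np.float32)

inputs = feature_extractor(audio, sampling_rate=24_000, return_tensors="np")
print(inputs["input_values"].shape)  # expected (1, 1, 24000): (batch, channels, samples)
print(inputs["padding_mask"].shape)  # expected (1, 24000)
```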

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor.chunk_length: Optional[int] property

This is a property because you might want to change the chunk_length_s on the fly

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor.chunk_stride: Optional[int] property

This is a property because you might want to change the chunk_length_s on the fly

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor.__call__(raw_audio, padding=None, truncation=False, max_length=None, return_tensors=None, sampling_rate=None)

Main method to featurize and prepare for the model one or several sequence(s).

PARAMETER DESCRIPTION
raw_audio

The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape (num_samples,) for mono audio (feature_size = 1), or (2, num_samples) for stereo audio (feature_size = 2).

TYPE: `np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`

padding

Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among:

  • True or 'longest': Pad to the longest sequence in the batch (or no padding if only a single sequence is provided).
  • 'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided.
  • False or 'do_not_pad' (default): No padding (i.e., can output a batch with sequences of different lengths).

TYPE: `bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True` DEFAULT: None

truncation

Activates truncation to cut input sequences longer than max_length to max_length.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

max_length

Maximum length of the returned list and optionally padding length (see above).

TYPE: `int`, *optional* DEFAULT: None

return_tensors

If set, will return tensors instead of list of python integers. Acceptable values are:

  • 'tf': Return TensorFlow tf.constant objects.
  • 'pt': Return PyTorch torch.Tensor objects.
  • 'np': Return Numpy np.ndarray objects.

TYPE: `str` or [`~utils.TensorType`], *optional* DEFAULT: None

sampling_rate

The sampling rate at which the audio input was sampled. It is strongly recommended to pass sampling_rate at the forward call to prevent silent errors.

TYPE: `int`, *optional* DEFAULT: None

Source code in mindnlp/transformers/models/encodec/feature_extraction_encodec.py, lines 102-224
def __call__(
    self,
    raw_audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
    padding: Optional[Union[bool, str, PaddingStrategy]] = None,
    truncation: Optional[bool] = False,
    max_length: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    sampling_rate: Optional[int] = None,
) -> BatchFeature:
    """
    Main method to featurize and prepare for the model one or several sequence(s).

    Args:
        raw_audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
            The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
            values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
            `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
            (`feature_size = 2`).
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding
            index) among:

            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
            sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
            acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
            lengths).
        truncation (`bool`, *optional*, defaults to `False`):
            Activates truncation to cut input sequences longer than `max_length` to `max_length`.
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        return_tensors (`str` or [`~utils.TensorType`], *optional*):
            If set, will return tensors instead of list of python integers. Acceptable values are:

            - `'tf'`: Return TensorFlow `tf.constant` objects.
            - `'pt'`: Return PyTorch `torch.Tensor` objects.
            - `'np'`: Return Numpy `np.ndarray` objects.
        sampling_rate (`int`, *optional*):
            The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
            `sampling_rate` at the forward call to prevent silent errors.
    """
    if sampling_rate is not None:
        if sampling_rate != self.sampling_rate:
            raise ValueError(
                f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
                f" {self.sampling_rate} and not {sampling_rate}."
            )
    else:
        logger.warning(
            "It is strongly recommended to pass the `sampling_rate` argument to this function. "
            "Failing to do so can result in silent errors that might be hard to debug."
        )

    if padding and truncation:
        raise ValueError("Both padding and truncation were set. Make sure you only set one.")
    if padding is None:
        # by default let's pad the inputs
        padding = True

    is_batched = bool(
        isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
    )

    if is_batched:
        raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
    elif not is_batched and not isinstance(raw_audio, np.ndarray):
        raw_audio = np.asarray(raw_audio, dtype=np.float32)
    elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
        raw_audio = raw_audio.astype(np.float32)

    # always return batch
    if not is_batched:
        raw_audio = [np.asarray(raw_audio).T]

    # verify inputs are valid
    for _, example in enumerate(raw_audio):
        if example.ndim > 2:
            raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
        if self.feature_size == 1 and example.ndim != 1:
            raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
        if self.feature_size == 2 and example.shape[-1] != 2:
            raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")

    padded_inputs = None
    input_values = BatchFeature({"input_values": raw_audio})
    if self.chunk_stride is not None and self.chunk_length is not None and max_length is None:
        if truncation:
            max_length = min(array.shape[0] for array in raw_audio)
            nb_step = int(np.floor(max_length / self.chunk_stride))
            max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
        elif padding:
            max_length = max(array.shape[0] for array in raw_audio)
            nb_step = int(np.ceil(max_length / self.chunk_stride))
            max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
            padding = "max_length"
        else:
            padded_inputs = input_values

    # normal padding on batch
    if padded_inputs is None:
        padded_inputs = self.pad(
            input_values,
            max_length=max_length,
            truncation=truncation,
            padding=padding,
            return_attention_mask=padding,
        )
        if padding:
            padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")

    input_values = []
    for example in padded_inputs.pop("input_values"):
        if self.feature_size == 1:
            example = example[..., None]
        input_values.append(example.T)

    padded_inputs["input_values"] = input_values
    if return_tensors is not None:
        padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

    return padded_inputs
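
The chunking branch is the least obvious part of `__call__`: when `chunk_length_s` and `overlap` are set and no `max_length` is passed, the batch is padded up to `(nb_step - 1) * chunk_stride + chunk_length`, i.e. enough to cover a whole number of overlapping chunks. A sketch with hypothetical chunking values (`chunk_length_s=1.0`, `overlap=0.5`):

```python
import numpy as np
from mindnlp.transformers.models.encodec.feature_extraction_encodec import EncodecFeatureExtractor

fe = EncodecFeatureExtractor(sampling_rate=24_000, chunk_length_s=1.0, overlap=0.5)
batch = [np.zeros(30_000, dtype=np.float32), np.zeros(50_000, dtype=np.float32)]

out = fe(batch, sampling_rate=24_000, return_tensors="np")
# chunk_length = 24000 samples and chunk_stride = 12000 samples, so the longest
# input (50000 samples) is padded to (ceil(50000 / 12000) - 1) * 12000 + 24000 = 72000.
print(out["input_values"].shape)  # expected (2, 1, 72000)
print(out["padding_mask"].shape)  # expected (2, 72000)
```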

mindnlp.transformers.models.encodec.feature_extraction_encodec.EncodecFeatureExtractor.__init__(feature_size=1, sampling_rate=24000, padding_value=0.0, chunk_length_s=None, overlap=None, **kwargs)

Initialize the EncodecFeatureExtractor class with the given parameters.

PARAMETER DESCRIPTION
self

The instance of the class.

feature_size

The size of the feature. Default is 1.

TYPE: int DEFAULT: 1

sampling_rate

The sampling rate in Hz. Default is 24000.

TYPE: int DEFAULT: 24000

padding_value

The value used for padding. Default is 0.0.

TYPE: float DEFAULT: 0.0

chunk_length_s

The length of each chunk in seconds. Default is None.

TYPE: float DEFAULT: None

overlap

The overlap ratio between consecutive chunks, a value between 0.0 and 1.0. Default is None.

TYPE: float DEFAULT: None

**kwargs

Additional keyword arguments.

DEFAULT: {}

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/encodec/feature_extraction_encodec.py, lines 53-82
def __init__(
    self,
    feature_size: int = 1,
    sampling_rate: int = 24000,
    padding_value: float = 0.0,
    chunk_length_s: float = None,
    overlap: float = None,
    **kwargs,
):
    """
    Initialize the EncodecFeatureExtractor class with the given parameters.

    Args:
        self: The instance of the class.
        feature_size (int): The size of the feature. Default is 1.
        sampling_rate (int): The sampling rate in Hz. Default is 24000.
        padding_value (float): The value used for padding. Default is 0.0.
        chunk_length_s (float): The length of each chunk in seconds. Default is None.
        overlap (float): The overlap ratio between consecutive chunks, a value between 0.0 and 1.0. Default is None.
        **kwargs: Additional keyword arguments.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
    self.chunk_length_s = chunk_length_s
    self.overlap = overlap
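
Because `chunk_length_s` and `overlap` are stored as plain attributes, the derived `chunk_length` and `chunk_stride` properties react immediately when they are changed on the fly, as the property docstrings above note. A small sketch (same hypothetical chunking values as earlier):

```python
from mindnlp.transformers.models.encodec.feature_extraction_encodec import EncodecFeatureExtractor

fe = EncodecFeatureExtractor(sampling_rate=24_000)
print(fe.chunk_length, fe.chunk_stride)   # None None: chunking is disabled by default

fe.chunk_length_s, fe.overlap = 1.0, 0.5  # enable chunking on the fly
print(fe.chunk_length, fe.chunk_stride)   # 24000 12000
```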