
gpt_pangu

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu

MindSpore PanguAlpha GPT2 Model

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguAttention

Bases: Module

Represents the GPTPanguAttention class, which inherits from nn.Module. This class implements the attention mechanism used in GPT (Generative Pre-trained Transformer) models.

METHOD DESCRIPTION
__init__

Initializes the GPTPanguAttention instance with the given configuration.

_attn

Computes the attention mechanism using the query, key, and value tensors, with optional attention and head masks.

_split_heads

Splits the hidden_size dimension of the given tensor into attn_head_size and num_heads.

_merge_heads

Merges attn_head_size dimension and num_attn_heads dimension into hidden_size.

forward

Constructs the attention mechanism using the provided hidden_states and optional past layers, masks, custom query, cache usage, and attention output flag.
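At its core, _attn computes causal scaled dot-product attention: scores from query and key are scaled by the square root of the head dimension, future positions are replaced by -1e4 via a lower-triangular mask, the result is softmaxed, and the weights are applied to value. The sketch below restates that computation in plain NumPy; it is illustrative only and not the MindSpore implementation (the name causal_attention is made up here).

import numpy as np

def causal_attention(query, key, value, masked_bias=-1e4):
    # query/key/value: (batch, num_heads, seq_len, head_dim)
    attn_weights = query @ key.swapaxes(-1, -2)              # (batch, heads, q_len, k_len)
    attn_weights = attn_weights / np.sqrt(value.shape[-1])   # scale_attn_weights
    q_len, k_len = query.shape[-2], key.shape[-2]
    causal_mask = np.tril(np.ones((k_len, k_len), dtype=bool))[k_len - q_len:k_len, :k_len]
    attn_weights = np.where(causal_mask, attn_weights, masked_bias)
    attn_weights = np.exp(attn_weights - attn_weights.max(-1, keepdims=True))
    attn_weights = attn_weights / attn_weights.sum(-1, keepdims=True)  # softmax over keys
    return attn_weights @ value, attn_weights

The MindSpore version additionally applies the optional attention mask, head mask, and dropout, as shown in the source below.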

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
class GPTPanguAttention(nn.Module):

    """
    Represents the GPTPanguAttention class, which inherits from nn.Module.
    This class contains methods for attention mechanism used in GPT (Generative Pre-trained Transformer) models.

    Methods:
        __init__: Initializes the GPTPanguAttention instance with the given configuration.
        _attn: Computes the attention mechanism using the query, key, and value tensors, with optional attention and
            head masks.
        _split_heads: Splits the hidden_size dimension of the given tensor into attn_head_size and num_heads.
        _merge_heads: Merges attn_head_size dimension and num_attn_heads dimension into hidden_size.
        forward: Constructs the attention mechanism using the provided hidden_states and optional past layers, masks,
            custom query, cache usage, and attention output flag.
    """
    def __init__(self, config):
        """
        Initializes the GPTPanguAttention class.

        Args:
            self (object): The instance of the class itself.
            config (object):
                An object containing configuration parameters for the attention mechanism.

                - max_position_embeddings (int): The maximum number of positions for positional embeddings.
                - hidden_size (int): The dimension of the hidden state.
                - num_heads (int): The number of attention heads.
                - scale_attn_weights (bool): A flag indicating whether to scale the attention weights.
                - attn_pdrop (float): The dropout probability for attention weights.
                - resid_pdrop (float): The dropout probability for residual connections.

        Returns:
            None.

        Raises:
            ValueError: If the embed_dim is not divisible by num_heads, an exception is raised with a
                detailed error message.
        """
        super().__init__()

        max_positions = config.max_position_embeddings
        self.bias = ops.tril(ops.ones((max_positions, max_positions), dtype=mindspore.uint8)).view(
                1, 1, max_positions, max_positions
            )
        self.masked_bias = mindspore.tensor(-1e4)

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
            )

        self.scale_attn_weights = config.scale_attn_weights

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.c_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

        self.attn_dropout = nn.Dropout(p=config.attn_pdrop)
        self.resid_dropout = nn.Dropout(p=config.resid_pdrop)

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        '''
        Method _attn in the GPTPanguAttention class.

        This method calculates the attention weights and applies the attention mechanism to the input values.

        Args:
            self: GPTPanguAttention instance.
                The instance of the GPTPanguAttention class.
            query: tensor, shape [batch_size, num_attention_heads, sequence_length, d_model]
                The query tensor used to calculate the attention scores.
            key: tensor, shape [batch_size, num_attention_heads, sequence_length, d_model]
                The key tensor used to calculate the attention scores.
            value: tensor, shape [batch_size, num_attention_heads, sequence_length, d_model]
                The value tensor which is the input to the attention mechanism.
            attention_mask: tensor, optional
                Mask tensor for the attention scores. If provided, it should have the same shape as attn_weights.
            head_mask: tensor, optional
                Mask tensor for the attention heads. If provided, it should have the same shape as attn_weights.

        Returns:
            attn_output: tensor
                The output tensor after applying the attention mechanism.
                It has the same shape as the input value tensor.
            attn_weights: tensor
                The attention weights representing the importance of each element in the input sequence.

        Raises:
            ValueError: If the dimensions of query, key, or value are not compatible for matrix multiplication.
            TypeError: If any of the input tensors are not of type tensor.
            IndexError: If the dimensions of the input tensors are not as expected for the attention mechanism.
            RuntimeError: If any runtime error occurs during the calculation of attention weights.
        '''
        attn_weights = ops.matmul(query, key.swapaxes(-1, -2))

        if self.scale_attn_weights:
            attn_weights = attn_weights / (float(value.shape[-1]) ** 0.5)

        query_length, key_length = query.shape[-2], key.shape[-2]
        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool()
        attn_weights = ops.where(causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype))

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = ops.softmax(attn_weights, axis=-1)

        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
        attn_weights = attn_weights.astype(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = ops.matmul(attn_weights, value)

        return attn_output, attn_weights

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        new_shape = tensor.shape[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(*new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        tensor = tensor.permute(0, 2, 1, 3)
        new_shape = tensor.shape[:-2] + (num_heads * attn_head_size,)
        return tensor.view(new_shape)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        custom_query=None,
        use_cache=False,
        output_attentions=False,
    ):
        """
        Constructs the attention mechanism used in the GPTPangu model.

        Args:
            self (GPTPanguAttention): An instance of the GPTPanguAttention class.
            hidden_states (tensor): The input tensor of shape (batch_size, sequence_length, hidden_size).
            layer_past (tuple, optional): A tuple containing the past key and value tensors. Defaults to None.
            attention_mask (tensor, optional): The attention mask tensor of shape (batch_size, sequence_length).
                Defaults to None.
            head_mask (tensor, optional): The head mask tensor of shape (num_heads, sequence_length, sequence_length).
                Defaults to None.
            custom_query (tensor, optional): The custom query tensor of shape (batch_size, sequence_length, hidden_size).
                Defaults to None.
            use_cache (bool, optional): Whether to use the past key and value tensors. Defaults to False.
            output_attentions (bool, optional): Whether to output the attention weights. Defaults to False.

        Returns:
            tuple:
                A tuple containing the attention output tensor and the present key-value tuple:

                - attn_output (tensor): The output tensor of shape (batch_size, sequence_length, hidden_size).
                - present (tuple): A tuple containing the present key and value tensors of shape
                (batch_size, num_heads, sequence_length, head_dim).

        Raises:
            None.
        """
        query = self.q_proj(custom_query) if custom_query is not None else self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)

        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)

        if layer_past is not None:
            past_key, past_value = layer_past
            key = ops.cat((past_key, key), axis=-2)
            value = ops.cat((past_value, value), axis=-2)

        if use_cache is True:
            present = (key, value)
        else:
            present = None

        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.c_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguAttention.__init__(config)

Initializes the GPTPanguAttention class.

PARAMETER DESCRIPTION
self

The instance of the class itself.

TYPE: object

config

An object containing configuration parameters for the attention mechanism.

  • max_position_embeddings (int): The maximum number of positions for positional embeddings.
  • hidden_size (int): The dimension of the hidden state.
  • num_heads (int): The number of attention heads.
  • scale_attn_weights (bool): A flag indicating whether to scale the attention weights.
  • attn_pdrop (float): The dropout probability for attention weights.
  • resid_pdrop (float): The dropout probability for residual connections.

TYPE: object

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If the embed_dim is not divisible by num_heads, an exception is raised with a detailed error message.
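For orientation, the following sketch constructs a GPTPanguAttention from a stand-in configuration. types.SimpleNamespace is used purely for illustration (a real model is configured through GPTPanguConfig), and the numeric values are arbitrary.

from types import SimpleNamespace
from mindnlp.transformers.models.gpt_pangu.modeling_gptpangu import GPTPanguAttention

# Stand-in config carrying exactly the attributes __init__ reads; values are illustrative.
config = SimpleNamespace(
    max_position_embeddings=1024,
    hidden_size=768,
    num_heads=12,              # 768 / 12 = 64, so head_dim * num_heads == hidden_size
    scale_attn_weights=True,
    attn_pdrop=0.1,
    resid_pdrop=0.1,
)
attn = GPTPanguAttention(config)   # raises ValueError if hidden_size is not divisible by num_heads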

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def __init__(self, config):
    """
    Initializes the GPTPanguAttention class.

    Args:
        self (object): The instance of the class itself.
        config (object):
            An object containing configuration parameters for the attention mechanism.

            - max_position_embeddings (int): The maximum number of positions for positional embeddings.
            - hidden_size (int): The dimension of the hidden state.
            - num_heads (int): The number of attention heads.
            - scale_attn_weights (bool): A flag indicating whether to scale the attention weights.
            - attn_pdrop (float): The dropout probability for attention weights.
            - resid_pdrop (float): The dropout probability for residual connections.

    Returns:
        None.

    Raises:
        ValueError: If the embed_dim is not divisible by num_heads, an exception is raised with a
            detailed error message.
    """
    super().__init__()

    max_positions = config.max_position_embeddings
    self.bias = ops.tril(ops.ones((max_positions, max_positions), dtype=mindspore.uint8)).view(
            1, 1, max_positions, max_positions
        )
    self.masked_bias = mindspore.tensor(-1e4)

    self.embed_dim = config.hidden_size
    self.num_heads = config.num_heads
    self.head_dim = self.embed_dim // self.num_heads
    if self.head_dim * self.num_heads != self.embed_dim:
        raise ValueError(
            f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
        )

    self.scale_attn_weights = config.scale_attn_weights

    self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
    self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
    self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
    self.c_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

    self.attn_dropout = nn.Dropout(p=config.attn_pdrop)
    self.resid_dropout = nn.Dropout(p=config.resid_pdrop)

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguAttention.forward(hidden_states, layer_past=None, attention_mask=None, head_mask=None, custom_query=None, use_cache=False, output_attentions=False)

Constructs the attention mechanism used in the GPTPangu model.

PARAMETER DESCRIPTION
self

An instance of the GPTPanguAttention class.

TYPE: GPTPanguAttention

hidden_states

The input tensor of shape (batch_size, sequence_length, hidden_size).

TYPE: tensor

layer_past

A tuple containing the past key and value tensors. Defaults to None.

TYPE: tuple DEFAULT: None

attention_mask

The attention mask tensor of shape (batch_size, sequence_length). Defaults to None.

TYPE: tensor DEFAULT: None

head_mask

The head mask tensor of shape (num_heads, sequence_length, sequence_length). Defaults to None.

TYPE: tensor DEFAULT: None

custom_query

The custom query tensor of shape (batch_size, sequence_length, hidden_size). Defaults to None.

TYPE: tensor DEFAULT: None

use_cache

Whether to use the past key and value tensors. Defaults to False.

TYPE: bool DEFAULT: False

output_attentions

Whether to output the attention weights. Defaults to False.

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
tuple

A tuple containing the attention output tensor and the present key-value tuple:

  • attn_output (tensor): The output tensor of shape (batch_size, sequence_length, hidden_size).
  • present (tuple): A tuple containing the present key and value tensors of shape (batch_size, num_heads, sequence_length, head_dim).
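A minimal forward-pass sketch with a stand-in config and random inputs (illustrative only; dropout is applied with the configured probabilities, and real models use GPTPanguConfig):

from types import SimpleNamespace
import numpy as np
import mindspore
from mindnlp.transformers.models.gpt_pangu.modeling_gptpangu import GPTPanguAttention

# Stand-in config for illustration only.
config = SimpleNamespace(max_position_embeddings=1024, hidden_size=768, num_heads=12,
                         scale_attn_weights=True, attn_pdrop=0.1, resid_pdrop=0.1)
attn = GPTPanguAttention(config)

hidden_states = mindspore.Tensor(np.random.randn(2, 5, 768), mindspore.float32)
attn_output, present, attn_weights = attn(hidden_states, use_cache=True, output_attentions=True)
# attn_output: (2, 5, 768); present = (key, value), each (2, 12, 5, 64); attn_weights: (2, 12, 5, 5)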
Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def forward(
    self,
    hidden_states,
    layer_past=None,
    attention_mask=None,
    head_mask=None,
    custom_query=None,
    use_cache=False,
    output_attentions=False,
):
    """
    Constructs the attention mechanism used in the GPTPangu model.

    Args:
        self (GPTPanguAttention): An instance of the GPTPanguAttention class.
        hidden_states (tensor): The input tensor of shape (batch_size, sequence_length, hidden_size).
        layer_past (tuple, optional): A tuple containing the past key and value tensors. Defaults to None.
        attention_mask (tensor, optional): The attention mask tensor of shape (batch_size, sequence_length).
            Defaults to None.
        head_mask (tensor, optional): The head mask tensor of shape (num_heads, sequence_length, sequence_length).
            Defaults to None.
        custom_query (tensor, optional): The custom query tensor of shape (batch_size, sequence_length, hidden_size).
            Defaults to None.
        use_cache (bool, optional): Whether to use the past key and value tensors. Defaults to False.
        output_attentions (bool, optional): Whether to output the attention weights. Defaults to False.

    Returns:
        tuple:
            A tuple containing the attention output tensor and the present key-value tuple:

            - attn_output (tensor): The output tensor of shape (batch_size, sequence_length, hidden_size).
            - present (tuple): A tuple containing the present key and value tensors of shape
            (batch_size, num_heads, sequence_length, head_dim).

    Raises:
        None.
    """
    query = self.q_proj(custom_query) if custom_query is not None else self.q_proj(hidden_states)
    key = self.k_proj(hidden_states)
    value = self.v_proj(hidden_states)

    query = self._split_heads(query, self.num_heads, self.head_dim)
    key = self._split_heads(key, self.num_heads, self.head_dim)
    value = self._split_heads(value, self.num_heads, self.head_dim)

    if layer_past is not None:
        past_key, past_value = layer_past
        key = ops.cat((past_key, key), axis=-2)
        value = ops.cat((past_value, value), axis=-2)

    if use_cache is True:
        present = (key, value)
    else:
        present = None

    attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

    attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
    attn_output = self.c_proj(attn_output)
    attn_output = self.resid_dropout(attn_output)

    outputs = (attn_output, present)
    if output_attentions:
        outputs += (attn_weights,)

    return outputs  # a, present, (attentions)

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguBlock

Bases: Module

This class represents a block of the GPTPangu model, containing layers for attention and feed-forward processing.

PARAMETER DESCRIPTION
config

An object containing configuration settings for the GPTPanguBlock.

ATTRIBUTE DESCRIPTION
ln_1

Layer normalization module for the first layer.

attn

GPTPanguAttention module for attention processing.

ln_2

Layer normalization module for the second layer.

mlp

GPTPanguMLP module for feed-forward processing.

METHOD DESCRIPTION
__init__

Initializes the GPTPanguBlock with the given configuration settings.

forward

Constructs the block by processing the input hidden_states through attention and feed-forward layers.

RETURNS DESCRIPTION
outputs

A tuple containing the final hidden states after processing.

Inherits from

nn.Module
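Ignoring caching and masks, the data flow is a pre-LayerNorm residual pattern (normalize, transform, add back the input) applied twice: once for attention and once for the MLP. The function below is a plain-Python restatement of that flow, assuming ln_1, attn, ln_2 and mlp behave as described above; it is a sketch, not the implementation.

def block_dataflow(hidden_states, ln_1, attn, ln_2, mlp):
    # Pre-LayerNorm residual pattern used by GPTPanguBlock.forward.
    residual = hidden_states
    attn_output = attn(ln_1(hidden_states))[0]   # attn returns a tuple; [0] is the attention output
    hidden_states = attn_output + residual       # first residual connection
    residual = hidden_states
    hidden_states = residual + mlp(ln_2(hidden_states))  # second residual connection
    return hidden_states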

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
class GPTPanguBlock(nn.Module):

    """
    This class represents a block of the GPTPangu model, containing layers for attention and feed-forward processing.

    Parameters:
        config: An object containing configuration settings for the GPTPanguBlock.

    Attributes:
        ln_1: Layer normalization module for the first layer.
        attn: GPTPanguAttention module for attention processing.
        ln_2: Layer normalization module for the second layer.
        mlp: GPTPanguMLP module for feed-forward processing.

    Methods:
        __init__: Initializes the GPTPanguBlock with the given configuration settings.
        forward:
            Constructs the block by processing the input hidden_states through attention and feed-forward layers.

    Returns:
        outputs:
            A tuple containing the final hidden states after processing.

    Inherits from:
        nn.Module
    """
    def __init__(self, config):
        """
        Initialize a GPTPanguBlock instance with the provided configuration.

        Args:
            self (GPTPanguBlock): The instance of the GPTPanguBlock class.
            config (GPTPanguConfig):
                The configuration object containing parameters for the block.

                - hidden_size (int): The size of the hidden layers.
                - intermediate_size (int, optional): The size of the intermediate layers. Defaults to None.
                If not provided, it is set to 4 times the hidden size.
                - layer_norm_epsilon (float): The epsilon value for layer normalization.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__()
        hidden_size = config.hidden_size
        inner_dim = config.intermediate_size if config.intermediate_size is not None else 4 * hidden_size

        self.ln_1 = nn.LayerNorm([hidden_size], eps=config.layer_norm_epsilon)
        self.attn = GPTPanguAttention(config)
        self.ln_2 = nn.LayerNorm([hidden_size], eps=config.layer_norm_epsilon)
        self.mlp = GPTPanguMLP(inner_dim, config)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        custom_query=None,
        use_cache=False,
        output_attentions=False,
    ):
        """
        Constructs the GPTPanguBlock.

        Args:
            self: The instance of the class.
            hidden_states (torch.Tensor): The input hidden states of shape `(batch_size, sequence_length, hidden_size)`.
            layer_past (Tuple[torch.Tensor], optional):
                The cached past hidden states of shape `(batch_size, num_heads, sequence_length, hidden_size)`.
                Default is `None`.
            attention_mask (torch.Tensor, optional):
                The attention mask of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
                Default is `None`.
            head_mask (torch.Tensor, optional): The head mask of shape `(num_heads,)`. Default is `None`.
            custom_query (torch.Tensor, optional):
                The custom query tensor of shape `(batch_size, num_heads, sequence_length, hidden_size)`.
                Default is `None`.
            use_cache (bool, optional): Whether to use the cache for the hidden states. Default is `False`.
            output_attentions (bool, optional): Whether to output attentions probabilities. Default is `False`.

        Returns:
            Tuple[torch.Tensor]:
                A tuple containing the following:

                - hidden_states (torch.Tensor):
                The output hidden states of shape `(batch_size, sequence_length, hidden_size)`.
                - layer_past (Tuple[torch.Tensor]):
                The updated cached past hidden states of shape `(batch_size, num_heads, sequence_length, hidden_size)`.
                - attention_weights (List[torch.Tensor], optional):
                The attention weights of shape `(num_layers, num_heads, sequence_length, sequence_length)`,
                only if `output_attentions=True`.
                - other_outputs (List[torch.Tensor], optional):
                Other intermediate outputs, only if `output_attentions=True`.

        Raises:
            None.
        """
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            custom_query=custom_query,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]
        # residual connection
        hidden_states = attn_output + residual

        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        feed_forward_hidden_states = self.mlp(hidden_states)
        # residual connection
        hidden_states = residual + feed_forward_hidden_states

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions, cross_attentions)

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguBlock.__init__(config)

Initialize a GPTPanguBlock instance with the provided configuration.

PARAMETER DESCRIPTION
self

The instance of the GPTPanguBlock class.

TYPE: GPTPanguBlock

config

The configuration object containing parameters for the block.

  • hidden_size (int): The size of the hidden layers.
  • intermediate_size (int, optional): The size of the intermediate layers. Defaults to None. If not provided, it is set to 4 times the hidden size.
  • layer_norm_epsilon (float): The epsilon value for layer normalization.

TYPE: GPTPanguConfig

RETURNS DESCRIPTION

None.
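The only derived value is the MLP width: when config.intermediate_size is None, it defaults to 4 * hidden_size. A trivial restatement follows (resolve_inner_dim is a hypothetical helper, not part of the module):

def resolve_inner_dim(hidden_size, intermediate_size=None):
    # Mirrors GPTPanguBlock.__init__: the MLP width defaults to 4x the hidden size.
    return intermediate_size if intermediate_size is not None else 4 * hidden_size

assert resolve_inner_dim(768) == 3072
assert resolve_inner_dim(768, intermediate_size=512) == 512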

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def __init__(self, config):
    """
    Initialize a GPTPanguBlock instance with the provided configuration.

    Args:
        self (GPTPanguBlock): The instance of the GPTPanguBlock class.
        config (GPTPanguConfig):
            The configuration object containing parameters for the block.

            - hidden_size (int): The size of the hidden layers.
            - intermediate_size (int, optional): The size of the intermediate layers. Defaults to None.
            If not provided, it is set to 4 times the hidden size.
            - layer_norm_epsilon (float): The epsilon value for layer normalization.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__()
    hidden_size = config.hidden_size
    inner_dim = config.intermediate_size if config.intermediate_size is not None else 4 * hidden_size

    self.ln_1 = nn.LayerNorm([hidden_size], eps=config.layer_norm_epsilon)
    self.attn = GPTPanguAttention(config)
    self.ln_2 = nn.LayerNorm([hidden_size], eps=config.layer_norm_epsilon)
    self.mlp = GPTPanguMLP(inner_dim, config)

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguBlock.forward(hidden_states, layer_past=None, attention_mask=None, head_mask=None, custom_query=None, use_cache=False, output_attentions=False)

Constructs the GPTPanguBlock.

PARAMETER DESCRIPTION
self

The instance of the class.

hidden_states

The input hidden states of shape (batch_size, sequence_length, hidden_size).

TYPE: Tensor

layer_past

The cached past hidden states of shape (batch_size, num_heads, sequence_length, hidden_size). Default is None.

TYPE: Tuple[Tensor] DEFAULT: None

attention_mask

The attention mask of shape (batch_size, num_heads, sequence_length, sequence_length). Default is None.

TYPE: Tensor DEFAULT: None

head_mask

The head mask of shape (num_heads,). Default is None.

TYPE: Tensor DEFAULT: None

custom_query

The custom query tensor of shape (batch_size, num_heads, sequence_length, hidden_size). Default is None.

TYPE: Tensor DEFAULT: None

use_cache

Whether to use the cache for the hidden states. Default is False.

TYPE: bool DEFAULT: False

output_attentions

Whether to output attentions probabilities. Default is False.

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION

Tuple[torch.Tensor]: A tuple containing the following:

  • hidden_states (torch.Tensor): The output hidden states of shape (batch_size, sequence_length, hidden_size).
  • layer_past (Tuple[torch.Tensor]): The updated cached past hidden states of shape (batch_size, num_heads, sequence_length, hidden_size).
  • attention_weights (List[torch.Tensor], optional): The attention weights of shape (num_layers, num_heads, sequence_length, sequence_length), only if output_attentions=True.
  • other_outputs (List[torch.Tensor], optional): Other intermediate outputs, only if output_attentions=True.
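A hedged end-to-end sketch of calling a block on random inputs; the stand-in config below only carries the attributes the block and its sub-modules read, and a real model would use GPTPanguConfig.

from types import SimpleNamespace
import numpy as np
import mindspore
from mindnlp.transformers.models.gpt_pangu.modeling_gptpangu import GPTPanguBlock

# Stand-in config for illustration only.
config = SimpleNamespace(
    hidden_size=768, num_heads=12, intermediate_size=None, layer_norm_epsilon=1e-5,
    max_position_embeddings=1024, scale_attn_weights=True,
    activation_function="gelu", attn_pdrop=0.1, resid_pdrop=0.1,
)
block = GPTPanguBlock(config)

hidden_states = mindspore.Tensor(np.random.randn(2, 5, 768), mindspore.float32)
hidden_states_out, present = block(hidden_states, use_cache=True)
# hidden_states_out: (2, 5, 768); present caches (key, value) for incremental decoding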
Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def forward(
    self,
    hidden_states,
    layer_past=None,
    attention_mask=None,
    head_mask=None,
    custom_query=None,
    use_cache=False,
    output_attentions=False,
):
    """
    Constructs the GPTPanguBlock.

    Args:
        self: The instance of the class.
        hidden_states (torch.Tensor): The input hidden states of shape `(batch_size, sequence_length, hidden_size)`.
        layer_past (Tuple[torch.Tensor], optional):
            The cached past hidden states of shape `(batch_size, num_heads, sequence_length, hidden_size)`.
            Default is `None`.
        attention_mask (torch.Tensor, optional):
            The attention mask of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
            Default is `None`.
        head_mask (torch.Tensor, optional): The head mask of shape `(num_heads,)`. Default is `None`.
        custom_query (torch.Tensor, optional):
            The custom query tensor of shape `(batch_size, num_heads, sequence_length, hidden_size)`.
            Default is `None`.
        use_cache (bool, optional): Whether to use the cache for the hidden states. Default is `False`.
        output_attentions (bool, optional): Whether to output attentions probabilities. Default is `False`.

    Returns:
        Tuple[torch.Tensor]:
            A tuple containing the following:

            - hidden_states (torch.Tensor):
            The output hidden states of shape `(batch_size, sequence_length, hidden_size)`.
            - layer_past (Tuple[torch.Tensor]):
            The updated cached past hidden states of shape `(batch_size, num_heads, sequence_length, hidden_size)`.
            - attention_weights (List[torch.Tensor], optional):
            The attention weights of shape `(num_layers, num_heads, sequence_length, sequence_length)`,
            only if `output_attentions=True`.
            - other_outputs (List[torch.Tensor], optional):
            Other intermediate outputs, only if `output_attentions=True`.

    Raises:
        None.
    """
    residual = hidden_states
    hidden_states = self.ln_1(hidden_states)
    attn_outputs = self.attn(
        hidden_states,
        layer_past=layer_past,
        attention_mask=attention_mask,
        head_mask=head_mask,
        custom_query=custom_query,
        use_cache=use_cache,
        output_attentions=output_attentions,
    )
    attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
    outputs = attn_outputs[1:]
    # residual connection
    hidden_states = attn_output + residual

    residual = hidden_states
    hidden_states = self.ln_2(hidden_states)
    feed_forward_hidden_states = self.mlp(hidden_states)
    # residual connection
    hidden_states = residual + feed_forward_hidden_states

    if use_cache:
        outputs = (hidden_states,) + outputs
    else:
        outputs = (hidden_states,) + outputs[1:]

    return outputs  # hidden_states, present, (attentions, cross_attentions)

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguForCausalLM

Bases: GPTPanguPreTrainedModel

The GPTPanguForCausalLM class represents a Pangu model for causal language modeling. It inherits from the GPTPanguPreTrainedModel class.

This class includes methods for initializing the model, getting and setting output embeddings, preparing inputs for generation, and generating outputs based on input data. Additionally, it provides a method for re-ordering the past key values cache when using beam search or beam sampling.

The __init__ method initializes the model with a given configuration and sets up the transformer and lm_head layers. The get_output_embeddings and set_output_embeddings methods access and modify the model's output embeddings. The prepare_inputs_for_generation method prepares input data for generation, taking into account past key values, the attention mask, position ids, and token type ids. The forward method computes the model outputs from the input data, including handling labels for language modeling and computing the loss.

The _reorder_cache method is a static method used to re-order the past_key_values cache when beam search or beam sample methods are called, ensuring correct alignment with the beam index at each generation step.
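Putting the pieces together, a typical causal-LM workflow loads a checkpoint, tokenizes a prompt, and calls generate. The sketch below is hedged: the GPTPanguTokenizer export and the checkpoint id are assumptions used for illustration and are not taken from this page.

from mindnlp.transformers import GPTPanguForCausalLM, GPTPanguTokenizer  # assumed exports

model_id = "sunzeyeah/pangu-350M"  # assumed checkpoint id, shown only as a placeholder
tokenizer = GPTPanguTokenizer.from_pretrained(model_id)
model = GPTPanguForCausalLM.from_pretrained(model_id)

inputs = tokenizer("今天天气不错，", return_tensors="ms")
output_ids = model.generate(inputs["input_ids"], max_new_tokens=32, use_cache=True)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))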

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
class GPTPanguForCausalLM(GPTPanguPreTrainedModel):

    """
    The GPTPanguForCausalLM class represents a Pangu model for causal language modeling.
    It inherits from the GPTPanguPreTrainedModel class.

    This class includes methods for initializing the model, getting and setting output embeddings,
    preparing inputs for generation, and generating outputs based on input data. Additionally, it provides a method
    for re-ordering the past key values cache when using beam search or beam sampling.

    The __init__ method initializes the model with a given configuration and sets up the transformer and lm_head layers.
    The get_output_embeddings and set_output_embeddings methods deal with accessing and  modifying the output embeddings
    for the model. The prepare_inputs_for_generation method prepares input data for generation, considering past key
    values, attention mask, position ids, and token type ids. The forward method forwards outputs based on input data,
    including handling labels for language modeling and computing loss.

    The _reorder_cache method is a static method used to re-order the past_key_values cache when beam search or beam
    sample methods are called, ensuring correct alignment with the beam index at each generation step.
    """
    def __init__(self, config):
        """
        Initializes an instance of the GPTPanguForCausalLM class.

        Args:
            self: The instance of the class.
            config:
                A configuration object containing settings for the model.

                - Type: object
                - Purpose: Specifies the configuration settings for the model.
                - Restrictions: Must be a valid configuration object compatible with the model.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__(config)
        self.transformer = GPTPanguModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        """
        This method returns the output embeddings of the GPTPanguForCausalLM model.

        Args:
            self: The instance of the GPTPanguForCausalLM class.

        Returns:
            lm_head: This method returns the output embeddings of the model.

        Raises:
            None.
        """
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """
        Sets the output embeddings for the GPTPanguForCausalLM model.

        Args:
            self (GPTPanguForCausalLM): The instance of the GPTPanguForCausalLM class.
            new_embeddings (torch.nn.Module): The new embeddings to set as the output embeddings.

        Returns:
            None.

        Raises:
            None.
        """
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        """
        Prepare inputs for generation.

        Args:
            self (GPTPanguForCausalLM): The instance of the GPTPanguForCausalLM class.
            input_ids (torch.Tensor): The input tensor of token indices representing the sequence.
            past (tuple, optional): The past key values used for fast decoding.

        Returns:
            dict:
                A dictionary containing the prepared inputs for generation with the following keys:

                - 'input_ids' (torch.Tensor): The modified input tensor.
                - 'past_key_values' (tuple): The past key values.
                - 'use_cache' (bool): The flag indicating whether to use cache.
                - 'position_ids' (torch.Tensor): The modified position indices tensor.
                - 'attention_mask' (torch.Tensor): The attention mask tensor.
                - 'token_type_ids' (torch.Tensor): The modified token type indices tensor.

        Raises:
            None.
        """
        token_type_ids = kwargs.get("token_type_ids", None)
        # only last token for inputs_ids if past is defined in kwargs
        if past:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.int().cumsum(-1).long() - 1
            position_ids = position_ids.masked_fill(attention_mask == 0, 1)
            if past:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            position_ids = None
        return {
            "input_ids": input_ids,
            "past_key_values": past,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Args:
            labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
                Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
                ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
                ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            # Flatten the tokens
            loss = ops.cross_entropy(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1),
                                     ignore_index=self.config.pad_token_id)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(past: Tuple[Tuple[mindspore.Tensor]], beam_idx: mindspore.Tensor) -> Tuple[Tuple[mindspore.Tensor]]:
        """
        This function is used to re-order the :obj:`past_key_values` cache if
        :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is
        called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
            for layer_past in past
        )

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguForCausalLM.__init__(config)

Initializes an instance of the GPTPanguForCausalLM class.

PARAMETER DESCRIPTION
self

The instance of the class.

config

A configuration object containing settings for the model.

  • Type: object
  • Purpose: Specifies the configuration settings for the model.
  • Restrictions: Must be a valid configuration object compatible with the model.

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def __init__(self, config):
    """
    Initializes an instance of the GPTPanguForCausalLM class.

    Args:
        self: The instance of the class.
        config:
            A configuration object containing settings for the model.

            - Type: object
            - Purpose: Specifies the configuration settings for the model.
            - Restrictions: Must be a valid configuration object compatible with the model.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__(config)
    self.transformer = GPTPanguModel(config)
    self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguForCausalLM.forward(input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

PARAMETER DESCRIPTION
labels

Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set labels = input_ids. Indices are selected in [-100, 0, ..., config.vocab_size]. All labels set to -100 are ignored (masked); the loss is only computed for labels in [0, ..., config.vocab_size].

DEFAULT: None
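The loss computed in forward is a shifted cross-entropy: logits at position t are scored against the label at position t+1, and positions whose label equals the configured pad_token_id (used as ignore_index in this implementation) are excluded. The NumPy function below is an illustrative restatement, not the MindSpore code.

import numpy as np

def shifted_lm_loss(lm_logits, labels, ignore_index):
    # lm_logits: (batch, seq_len, vocab); labels: (batch, seq_len), e.g. labels = input_ids.
    shift_logits = lm_logits[:, :-1, :].reshape(-1, lm_logits.shape[-1])
    shift_labels = labels[:, 1:].reshape(-1)
    keep = shift_labels != ignore_index                       # drop ignored (padded) positions
    logits, targets = shift_logits[keep], shift_labels[keep]
    logits = logits - logits.max(-1, keepdims=True)           # numerical stability
    log_probs = logits - np.log(np.exp(logits).sum(-1, keepdims=True))
    return -log_probs[np.arange(len(targets)), targets].mean()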

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def forward(
    self,
    input_ids=None,
    past_key_values=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    r"""
    Args:
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
            ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    transformer_outputs = self.transformer(
        input_ids,
        past_key_values=past_key_values,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = transformer_outputs[0]

    lm_logits = self.lm_head(hidden_states)

    loss = None
    if labels is not None:
        # Shift so that tokens < n predict n
        shift_logits = lm_logits[..., :-1, :]
        shift_labels = labels[..., 1:]
        # Flatten the tokens
        loss = ops.cross_entropy(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1),
                                 ignore_index=self.config.pad_token_id)

    if not return_dict:
        output = (lm_logits,) + transformer_outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=lm_logits,
        past_key_values=transformer_outputs.past_key_values,
        hidden_states=transformer_outputs.hidden_states,
        attentions=transformer_outputs.attentions,
    )

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguForCausalLM.get_output_embeddings()

This method returns the output embeddings of the GPTPanguForCausalLM model.

PARAMETER DESCRIPTION
self

The instance of the GPTPanguForCausalLM class.

RETURNS DESCRIPTION
lm_head

The lm_head linear layer, which serves as the model's output embeddings.

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def get_output_embeddings(self):
    """
    This method returns the output embeddings of the GPTPanguForCausalLM model.

    Args:
        self: The instance of the GPTPanguForCausalLM class.

    Returns:
        lm_head: This method returns the output embeddings of the model.

    Raises:
        None.
    """
    return self.lm_head

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguForCausalLM.prepare_inputs_for_generation(input_ids, past=None, **kwargs)

Prepare inputs for generation.

PARAMETER DESCRIPTION
self

The instance of the GPTPanguForCausalLM class.

TYPE: GPTPanguForCausalLM

input_ids

The input tensor of token indices representing the sequence.

TYPE: Tensor

past

The past key values used for fast decoding.

TYPE: tuple DEFAULT: None

RETURNS DESCRIPTION
dict

A dictionary containing the prepared inputs for generation with the following keys:

  • 'input_ids' (torch.Tensor): The modified input tensor.
  • 'past_key_values' (tuple): The past key values.
  • 'use_cache' (bool): The flag indicating whether to use cache.
  • 'position_ids' (torch.Tensor): The modified position indices tensor.
  • 'attention_mask' (torch.Tensor): The attention mask tensor.
  • 'token_type_ids' (torch.Tensor): The modified token type indices tensor.
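When position_ids is not supplied but attention_mask is, positions are derived with a cumulative-sum trick so that left padding does not shift the positions of real tokens; padded positions are pinned to 1, and only the last position is kept once past is set. An illustrative NumPy restatement:

import numpy as np

# Left-padded batch of 2 sequences; 0 marks padding in the attention mask.
attention_mask = np.array([[0, 0, 1, 1, 1],
                           [1, 1, 1, 1, 1]])
position_ids = attention_mask.cumsum(-1) - 1     # running count of real tokens, minus one
position_ids[attention_mask == 0] = 1            # padded positions pinned to 1
print(position_ids)
# [[1 1 0 1 2]
#  [0 1 2 3 4]]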
Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
    """
    Prepare inputs for generation.

    Args:
        self (GPTPanguForCausalLM): The instance of the GPTPanguForCausalLM class.
        input_ids (torch.Tensor): The input tensor of token indices representing the sequence.
        past (tuple, optional): The past key values used for fast decoding.

    Returns:
        dict:
            A dictionary containing the prepared inputs for generation with the following keys:

            - 'input_ids' (torch.Tensor): The modified input tensor.
            - 'past_key_values' (tuple): The past key values.
            - 'use_cache' (bool): The flag indicating whether to use cache.
            - 'position_ids' (torch.Tensor): The modified position indices tensor.
            - 'attention_mask' (torch.Tensor): The attention mask tensor.
            - 'token_type_ids' (torch.Tensor): The modified token type indices tensor.

    Raises:
        None.
    """
    token_type_ids = kwargs.get("token_type_ids", None)
    # only last token for inputs_ids if past is defined in kwargs
    if past:
        input_ids = input_ids[:, -1].unsqueeze(-1)
        if token_type_ids is not None:
            token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

    attention_mask = kwargs.get("attention_mask", None)
    position_ids = kwargs.get("position_ids", None)

    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.int().cumsum(-1).long() - 1
        position_ids = position_ids.masked_fill(attention_mask == 0, 1)
        if past:
            position_ids = position_ids[:, -1].unsqueeze(-1)
    else:
        position_ids = None
    return {
        "input_ids": input_ids,
        "past_key_values": past,
        "use_cache": kwargs.get("use_cache"),
        "position_ids": position_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
    }

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguForCausalLM.set_output_embeddings(new_embeddings)

Sets the output embeddings for the GPTPanguForCausalLM model.

PARAMETER DESCRIPTION
self

The instance of the GPTPanguForCausalLM class.

TYPE: GPTPanguForCausalLM

new_embeddings

The new embeddings to set as the output embeddings.

TYPE: Module

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def set_output_embeddings(self, new_embeddings):
    """
    Sets the output embeddings for the GPTPanguForCausalLM model.

    Args:
        self (GPTPanguForCausalLM): The instance of the GPTPanguForCausalLM class.
        new_embeddings (torch.nn.Module): The new embeddings to set as the output embeddings.

    Returns:
        None.

    Raises:
        None.
    """
    self.lm_head = new_embeddings

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguMLP

Bases: Module

GPTPanguMLP represents a multi-layer perceptron (MLP) used in the GPT-Pangu model for processing intermediate hidden states.

This class inherits from nn.Module and contains methods for initializing the MLP layers and processing hidden states through a feedforward neural network.

ATTRIBUTE DESCRIPTION
c_fc

Fully connected layer to transform input hidden states.

TYPE: Linear

c_proj

Fully connected layer to project intermediate hidden states back to original embed dimension.

TYPE: Linear

act

Activation function applied to hidden states.

TYPE: ACT2FN[activation_function]

dropout

Dropout layer to add regularization to the model.

TYPE: Dropout

METHOD DESCRIPTION
__init__

Initializes the GPTPanguMLP with specified intermediate size and configuration parameters.

forward

Processes the input 'hidden_states' through the MLP layers and returns the processed hidden states.

Example
>>> intermediate_size = 512
>>> config = GPTPanguConfig(hidden_size=768, activation_function='gelu', resid_pdrop=0.1)
>>> mlp = GPTPanguMLP(intermediate_size, config)
>>> output = mlp(hidden_states)  # hidden_states: (batch, seq_len, hidden_size)
Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
class GPTPanguMLP(nn.Module):

    """
    GPTPanguMLP represents a multi-layer perceptron (MLP) used in the GPT-Pangu model for processing intermediate
    hidden states.

    This class inherits from nn.Module and contains methods for initializing the MLP layers and processing hidden states
    through a feedforward neural network.

    Attributes:
        c_fc (nn.Linear): Fully connected layer to transform input hidden states.
        c_proj (nn.Linear): Fully connected layer to project intermediate hidden states back to original embed dimension.
        act (ACT2FN[config.activation_function]): Activation function applied to hidden states.
        dropout (nn.Dropout): Dropout layer to add regularization to the model.

    Methods:
        __init__: Initializes the GPTPanguMLP with specified intermediate size and configuration parameters.

        forward: Processes the input 'hidden_states' through the MLP layers and returns the processed hidden states.

    Example:
        ```python
        >>> intermediate_size = 512
        >>> config = GPTPanguConfig(hidden_size=768, activation_function='gelu', resid_pdrop=0.1)
        >>> mlp = GPTPanguMLP(intermediate_size, config)
        >>> output = mlp(hidden_states)  # hidden_states: (batch, seq_len, hidden_size)
        ```

    """
    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size= 4 * hidden_size
        """
        Initializes the GPTPanguMLP class.

        Args:
            self: The object instance.
            intermediate_size (int): The size of the intermediate layer.
            config (object): The configuration object containing hidden_size, activation_function,
                and resid_pdrop attributes.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__()
        embed_dim = config.hidden_size
        self.c_fc = nn.Linear(embed_dim, intermediate_size)
        self.c_proj = nn.Linear(intermediate_size, embed_dim)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(p=config.resid_pdrop)

    def forward(self, hidden_states):
        """
        This method forwards the hidden states by applying a series of transformations.

        Args:
            self (GPTPanguMLP): The instance of the GPTPanguMLP class.
            hidden_states (tensor): The input hidden states to be processed.

        Returns:
            hidden_states (tensor): The processed hidden states after the linear projection, activation, output projection, and dropout are applied.

        Raises:
            None.
        """
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguMLP.__init__(intermediate_size, config)

Initializes the GPTPanguMLP class.

PARAMETER DESCRIPTION
self

The object instance.

intermediate_size

The size of the intermediate layer.

TYPE: int

config

The configuration object containing hidden_size, activation_function, and resid_pdrop attributes.

TYPE: object

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def __init__(self, intermediate_size, config):  # in MLP: intermediate_size= 4 * hidden_size
    """
    Initializes the GPTPanguMLP class.

    Args:
        self: The object instance.
        intermediate_size (int): The size of the intermediate layer.
        config (object): The configuration object containing hidden_size, activation_function,
            and resid_pdrop attributes.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__()
    embed_dim = config.hidden_size
    self.c_fc = nn.Linear(embed_dim, intermediate_size)
    self.c_proj = nn.Linear(intermediate_size, embed_dim)
    self.act = ACT2FN[config.activation_function]
    self.dropout = nn.Dropout(p=config.resid_pdrop)

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguMLP.forward(hidden_states)

This method forwards the hidden states by applying a series of transformations.

PARAMETER DESCRIPTION
self

The instance of the GPTPanguMLP class.

TYPE: GPTPanguMLP

hidden_states

The input hidden states to be processed.

TYPE: tensor

RETURNS DESCRIPTION
tensor

The processed hidden states after the linear projections, activation, and dropout.

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def forward(self, hidden_states):
    """
    This method forwards the hidden states by applying a series of transformations.

    Args:
        self (GPTPanguMLP): The instance of the GPTPanguMLP class.
        hidden_states (tensor): The input hidden states to be processed.

    Returns:
        tensor: The processed hidden states after the linear projections, activation, and dropout.

    Raises:
        None.
    """
    hidden_states = self.c_fc(hidden_states)
    hidden_states = self.act(hidden_states)
    hidden_states = self.c_proj(hidden_states)
    hidden_states = self.dropout(hidden_states)
    return hidden_states
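
As a concrete companion to the schematic docstring example above, here is a minimal runnable sketch. The sizes (hidden_size=768, intermediate_size=4 * 768) and the random input tensor are illustrative assumptions rather than values prescribed by the model; calling the module invokes `forward`.

```python
import numpy as np
import mindspore
from mindnlp.transformers.models.gpt_pangu.configuration_gptpangu import GPTPanguConfig
from mindnlp.transformers.models.gpt_pangu.modeling_gptpangu import GPTPanguMLP

# Illustrative configuration; only hidden_size, activation_function and resid_pdrop matter here.
config = GPTPanguConfig(hidden_size=768, activation_function="gelu", resid_pdrop=0.1)
mlp = GPTPanguMLP(4 * config.hidden_size, config)  # intermediate_size = 4 * hidden_size

# Random hidden states: batch of 2 sequences, 5 tokens each, width 768.
hidden_states = mindspore.Tensor(np.random.randn(2, 5, config.hidden_size), mindspore.float32)
output = mlp(hidden_states)  # c_fc -> activation -> c_proj -> dropout
print(output.shape)          # (2, 5, 768): projected back to the embedding dimension
```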

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguModel

Bases: GPTPanguPreTrainedModel

GPTPanguModel

This class represents a GPT-Pangu model, which is a variant of the GPT (Generative Pre-trained Transformer) model. It is designed for pre-training and fine-tuning on large-scale Chinese text data. The GPTPanguModel class inherits from the GPTPanguPreTrainedModel class.

ATTRIBUTE DESCRIPTION
embed_dim

The dimensionality of the embedding layer.

TYPE: int

wte

The word/token embedding layer.

TYPE: Embedding

wpe

The position embedding layer.

TYPE: Embedding

wqe

The query embedding layer.

TYPE: Embedding

drop

The dropout layer.

TYPE: Dropout

h

The list of GPTPanguBlock layers.

TYPE: ModuleList

ln_f

The layer normalization layer.

TYPE: LayerNorm

gradient_checkpointing

Whether to use gradient checkpointing.

TYPE: bool

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
class GPTPanguModel(GPTPanguPreTrainedModel):

    """GPTPanguModel

    This class represents a GPT-Pangu model, which is a variant of the GPT (Generative Pre-trained Transformer) model.
    It is designed for pre-training and fine-tuning on large-scale Chinese text data. The GPTPanguModel class inherits
    from the GPTPanguPreTrainedModel class.

    Attributes:
        embed_dim (int): The dimensionality of the embedding layer.
        wte (nn.Embedding): The word/token embedding layer.
        wpe (nn.Embedding): The position embedding layer.
        wqe (nn.Embedding): The query embedding layer.
        drop (nn.Dropout): The dropout layer.
        h (nn.ModuleList): The list of GPTPanguBlock layers.
        ln_f (nn.LayerNorm): The layer normalization layer.
        gradient_checkpointing (bool): Whether to use gradient checkpointing.
    """
    def __init__(self, config):
        """
        Initializes a new instance of the GPTPanguModel class.

        Args:
            self: The instance of the GPTPanguModel class.
            config:
                A configuration object that contains the settings for the model.

                - Type: object
                - Purpose: Specifies the configuration settings for the model.
                - Restrictions: Must be a valid configuration object.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__(config)

        self.embed_dim = config.hidden_size

        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
        self.wqe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

        self.drop = nn.Dropout(p=config.embd_pdrop)
        self.h = nn.ModuleList([GPTPanguBlock(config) for _ in range(config.num_layers)])
        self.ln_f = nn.LayerNorm([self.embed_dim], eps=config.layer_norm_epsilon)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        Method to retrieve input embeddings from the GPTPanguModel.

        Args:
            self: GPTPanguModel instance. The object instance of the GPTPanguModel class.

        Returns:
           The input embeddings for further processing in the model.

        Raises:
            None.
        """
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        """
        Set the input embeddings for the GPTPanguModel.

        Args:
            self (GPTPanguModel): The instance of the GPTPanguModel class.
            new_embeddings: The new input embeddings to be set for the model.
                It should be a tensor or array representing the embeddings.

        Returns:
            None: This method updates the input embeddings of the model in-place.

        Raises:
            TypeError: If the new_embeddings parameter is not of the correct type.
            ValueError: If the new_embeddings parameter is empty or invalid.
        """
        self.wte = new_embeddings

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Constructs the GPTPanguModel.

        Args:
            self (GPTPanguModel): The object instance.
            input_ids (mindspore.Tensor, optional): The input tensor of shape (batch_size, sequence_length).
                It represents the input token IDs. Defaults to None.
            past_key_values (tuple, optional): The tuple of past key values.
                Each element in the tuple is a tensor of shape (batch_size, num_heads, sequence_length,
                hidden_size//num_heads). Defaults to None.
            attention_mask (mindspore.Tensor, optional): The attention mask tensor of shape (batch_size, sequence_length).
                It indicates which tokens should be attended to and which ones should not. Defaults to None.
            token_type_ids (mindspore.Tensor, optional): The token type IDs tensor of shape (batch_size, sequence_length).
                It represents the token type embeddings. Defaults to None.
            position_ids (mindspore.Tensor, optional): The position IDs tensor of shape (batch_size, sequence_length).
                It represents the position embeddings. Defaults to None.
            head_mask (mindspore.Tensor, optional): The head mask tensor of shape (num_layers, num_heads).
                It specifies which heads should be masked for each layer. Defaults to None.
            inputs_embeds (mindspore.Tensor, optional):
                The input embeddings tensor of shape (batch_size, sequence_length, hidden_size).
                It represents the input embeddings directly instead of using input_ids. Defaults to None.
            use_cache (bool, optional): Whether to use cache for faster decoding. Defaults to None.
            output_attentions (bool, optional): Whether to output attention weights. Defaults to None.
            output_hidden_states (bool, optional): Whether to output hidden states. Defaults to None.
            return_dict (bool, optional): Whether to use a dictionary as the return type. Defaults to None.

        Returns:
            BaseModelOutputWithPast or tuple: The last hidden state, together with the optional cached
            key/value states, hidden states, and attention weights, depending on `return_dict`.

        Raises:
            ValueError: If both input_ids and inputs_embeds are provided simultaneously.
            ValueError: If neither input_ids nor inputs_embeds are provided.
            ValueError: If batch_size is not defined or is less than or equal to 0.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is not None:
            input_shape = input_ids.shape
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.shape[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1])

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.h))
        else:
            past_length = past_key_values[0][0].shape[-2]
        if position_ids is None:
            position_ids = ops.arange(past_length, input_shape[-1] + past_length, dtype=mindspore.int64)
            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

        # GPT2Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * -10000.0

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x num_heads x N x N
        # head_mask has shape n_layer x batch x num_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.shape[-1],)

        # top attention custom query
        last_layer_id = len(self.h) - 1
        query_embeds = self.wqe(position_ids)

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            # Final LayerNorm before last query layer
            if i == last_layer_id:
                hidden_states = self.ln_f(hidden_states)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=layer_past,
                attention_mask=attention_mask,
                head_mask=head_mask[i],
                # custom query
                custom_query=query_embeds if i == last_layer_id else None,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = hidden_states.view(*output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
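
The following is a minimal usage sketch of the model class. The tiny configuration below is an assumption chosen purely for illustration (real PanGu-Alpha checkpoints use the much larger defaults of GPTPanguConfig); with return_dict left at its default the call returns a BaseModelOutputWithPast.

```python
import numpy as np
import mindspore
from mindnlp.transformers.models.gpt_pangu.configuration_gptpangu import GPTPanguConfig
from mindnlp.transformers.models.gpt_pangu.modeling_gptpangu import GPTPanguModel

# Deliberately tiny configuration for a quick, CPU-friendly forward pass.
config = GPTPanguConfig(vocab_size=1000, hidden_size=64, intermediate_size=256,
                        num_layers=2, num_heads=4, max_position_embeddings=128)
model = GPTPanguModel(config)

input_ids = mindspore.Tensor(np.random.randint(0, config.vocab_size, (2, 10)), mindspore.int64)
attention_mask = mindspore.Tensor(np.ones((2, 10)), mindspore.float32)

outputs = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=True)
print(outputs.last_hidden_state.shape)  # (2, 10, 64)
print(len(outputs.past_key_values))     # 2: one cached entry per layer
```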

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguModel.__init__(config)

Initializes a new instance of the GPTPanguModel class.

PARAMETER DESCRIPTION
self

The instance of the GPTPanguModel class.

config

A configuration object that contains the settings for the model.

  • Type: object
  • Purpose: Specifies the configuration settings for the model.
  • Restrictions: Must be a valid configuration object.

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def __init__(self, config):
    """
    Initializes a new instance of the GPTPanguModel class.

    Args:
        self: The instance of the GPTPanguModel class.
        config:
            A configuration object that contains the settings for the model.

            - Type: object
            - Purpose: Specifies the configuration settings for the model.
            - Restrictions: Must be a valid configuration object.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__(config)

    self.embed_dim = config.hidden_size

    self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
    self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
    self.wqe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

    self.drop = nn.Dropout(p=config.embd_pdrop)
    self.h = nn.ModuleList([GPTPanguBlock(config) for _ in range(config.num_layers)])
    self.ln_f = nn.LayerNorm([self.embed_dim], eps=config.layer_norm_epsilon)

    self.gradient_checkpointing = False
    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguModel.forward(input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

Constructs the GPTPanguModel.

PARAMETER DESCRIPTION
self

The object instance.

TYPE: GPTPanguModel

input_ids

The input tensor of shape (batch_size, sequence_length). It represents the input token IDs. Defaults to None.

TYPE: Tensor DEFAULT: None

past_key_values

The tuple of past key values. Each element in the tuple is a tensor of shape (batch_size, num_heads, sequence_length, hidden_size//num_heads). Defaults to None.

TYPE: tuple DEFAULT: None

attention_mask

The attention mask tensor of shape (batch_size, sequence_length). It indicates which tokens should be attended to and which ones should not. Defaults to None.

TYPE: Tensor DEFAULT: None

token_type_ids

The token type IDs tensor of shape (batch_size, sequence_length). It represents the token type embeddings. Defaults to None.

TYPE: Tensor DEFAULT: None

position_ids

The position IDs tensor of shape (batch_size, sequence_length). It represents the position embeddings. Defaults to None.

TYPE: Tensor DEFAULT: None

head_mask

The head mask tensor of shape (num_layers, num_heads). It specifies which heads should be masked for each layer. Defaults to None.

TYPE: Tensor DEFAULT: None

inputs_embeds

The input embeddings tensor of shape (batch_size, sequence_length, hidden_size). It represents the input embeddings directly instead of using input_ids. Defaults to None.

TYPE: Tensor DEFAULT: None

use_cache

Whether to use cache for faster decoding. Defaults to None.

TYPE: bool DEFAULT: None

output_attentions

Whether to output attention weights. Defaults to None.

TYPE: bool DEFAULT: None

output_hidden_states

Whether to output hidden states. Defaults to None.

TYPE: bool DEFAULT: None

return_dict

Whether to use a dictionary as the return type. Defaults to None.

TYPE: bool DEFAULT: None

RETURNS DESCRIPTION
BaseModelOutputWithPast or tuple

The last hidden state, together with the optional cached key/value states, hidden states, and attention weights, depending on return_dict.

RAISES DESCRIPTION
ValueError

If both input_ids and inputs_embeds are provided simultaneously.

ValueError

If neither input_ids nor inputs_embeds are provided.

ValueError

If batch_size is not defined or is less than or equal to 0.

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def forward(
    self,
    input_ids=None,
    past_key_values=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    """
    Constructs the GPTPanguModel.

    Args:
        self (GPTPanguModel): The object instance.
        input_ids (mindspore.Tensor, optional): The input tensor of shape (batch_size, sequence_length).
            It represents the input token IDs. Defaults to None.
        past_key_values (tuple, optional): The tuple of past key values.
            Each element in the tuple is a tensor of shape (batch_size, num_heads, sequence_length,
            hidden_size//num_heads). Defaults to None.
        attention_mask (mindspore.Tensor, optional): The attention mask tensor of shape (batch_size, sequence_length).
            It indicates which tokens should be attended to and which ones should not. Defaults to None.
        token_type_ids (mindspore.Tensor, optional): The token type IDs tensor of shape (batch_size, sequence_length).
            It represents the token type embeddings. Defaults to None.
        position_ids (mindspore.Tensor, optional): The position IDs tensor of shape (batch_size, sequence_length).
            It represents the position embeddings. Defaults to None.
        head_mask (mindspore.Tensor, optional): The head mask tensor of shape (num_layers, num_heads).
            It specifies which heads should be masked for each layer. Defaults to None.
        inputs_embeds (mindspore.Tensor, optional):
            The input embeddings tensor of shape (batch_size, sequence_length, hidden_size).
            It represents the input embeddings directly instead of using input_ids. Defaults to None.
        use_cache (bool, optional): Whether to use cache for faster decoding. Defaults to None.
        output_attentions (bool, optional): Whether to output attention weights. Defaults to None.
        output_hidden_states (bool, optional): Whether to output hidden states. Defaults to None.
        return_dict (bool, optional): Whether to use a dictionary as the return type. Defaults to None.

    Returns:
        BaseModelOutputWithPast or tuple: The last hidden state, together with the optional cached
        key/value states, hidden states, and attention weights, depending on `return_dict`.

    Raises:
        ValueError: If both input_ids and inputs_embeds are provided simultaneously.
        ValueError: If neither input_ids nor inputs_embeds are provided.
        ValueError: If batch_size is not defined or is less than or equal to 0.
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    if input_ids is not None:
        input_shape = input_ids.shape
        input_ids = input_ids.view(-1, input_shape[-1])
        batch_size = input_ids.shape[0]
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.shape[:-1]
        batch_size = inputs_embeds.shape[0]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    if token_type_ids is not None:
        token_type_ids = token_type_ids.view(-1, input_shape[-1])
    if position_ids is not None:
        position_ids = position_ids.view(-1, input_shape[-1])

    if past_key_values is None:
        past_length = 0
        past_key_values = tuple([None] * len(self.h))
    else:
        past_length = past_key_values[0][0].shape[-2]
    if position_ids is None:
        position_ids = ops.arange(past_length, input_shape[-1] + past_length, dtype=mindspore.int64)
        position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

    # GPT2Attention mask.
    if attention_mask is not None:
        if batch_size <= 0:
            raise ValueError("batch_size has to be defined and > 0")
        attention_mask = attention_mask.view(batch_size, -1)
        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        attention_mask = attention_mask[:, None, None, :]

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        attention_mask = (1.0 - attention_mask) * -10000.0

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x num_heads x N x N
    # head_mask has shape n_layer x batch x num_heads x N x N
    head_mask = self.get_head_mask(head_mask, self.config.num_layers)

    if inputs_embeds is None:
        inputs_embeds = self.wte(input_ids)
    position_embeds = self.wpe(position_ids)
    hidden_states = inputs_embeds + position_embeds

    if token_type_ids is not None:
        token_type_embeds = self.wte(token_type_ids)
        hidden_states = hidden_states + token_type_embeds

    hidden_states = self.drop(hidden_states)

    output_shape = input_shape + (hidden_states.shape[-1],)

    # top attention custom query
    last_layer_id = len(self.h) - 1
    query_embeds = self.wqe(position_ids)

    presents = () if use_cache else None
    all_self_attentions = () if output_attentions else None
    all_hidden_states = () if output_hidden_states else None
    for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
        # Final LayerNorm before last query layer
        if i == last_layer_id:
            hidden_states = self.ln_f(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = block(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask[i],
            # custom query
            custom_query=query_embeds if i == last_layer_id else None,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )

        hidden_states = outputs[0]
        if use_cache is True:
            presents = presents + (outputs[1],)

        if output_attentions:
            all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

    hidden_states = hidden_states.view(*output_shape)
    # Add last hidden state
    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    if not return_dict:
        return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=presents,
        hidden_states=all_hidden_states,
        attentions=all_self_attentions,
    )
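
To illustrate the mask preparation performed inside forward above, here is a standalone NumPy sketch (not part of the model API) of the same transformation: a 2-D padding mask of ones and zeros becomes an additive bias that is (negative) zero for visible positions and -10000.0 for masked ones.

```python
import numpy as np

# 1.0 = attend, 0.0 = masked; one sequence of length 4 with a padded final position.
attention_mask = np.array([[1.0, 1.0, 1.0, 0.0]])

# Same steps as in forward(): add broadcast dims, then convert to an additive bias.
extended_mask = attention_mask[:, None, None, :]   # shape (1, 1, 1, 4)
extended_mask = (1.0 - extended_mask) * -10000.0

print(extended_mask.shape)  # (1, 1, 1, 4)
print(extended_mask)        # visible positions -> -0.0, masked position -> -10000.0
```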

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguModel.get_input_embeddings()

Method to retrieve input embeddings from the GPTPanguModel.

PARAMETER DESCRIPTION
self

GPTPanguModel instance. The object instance of the GPTPanguModel class.

RETURNS DESCRIPTION

The word/token embedding layer (wte) that serves as the model's input embeddings.

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def get_input_embeddings(self):
    """
    Method to retrieve input embeddings from the GPTPanguModel.

    Args:
        self: GPTPanguModel instance. The object instance of the GPTPanguModel class.

    Returns:
       The input embeddings for further processing in the model.

    Raises:
        None.
    """
    return self.wte

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguModel.set_input_embeddings(new_embeddings)

Set the input embeddings for the GPTPanguModel.

PARAMETER DESCRIPTION
self

The instance of the GPTPanguModel class.

TYPE: GPTPanguModel

new_embeddings

The new input embeddings to be set for the model. It should be a tensor or array representing the embeddings.

RETURNS DESCRIPTION
None

This method updates the input embeddings of the model in-place.

RAISES DESCRIPTION
TypeError

If the new_embeddings parameter is not of the correct type.

ValueError

If the new_embeddings parameter is empty or invalid.

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
def set_input_embeddings(self, new_embeddings):
    """
    Set the input embeddings for the GPTPanguModel.

    Args:
        self (GPTPanguModel): The instance of the GPTPanguModel class.
        new_embeddings: The new input embeddings to be set for the model.
            It should be a tensor or array representing the embeddings.

    Returns:
        None: This method updates the input embeddings of the model in-place.

    Raises:
        TypeError: If the new_embeddings parameter is not of the correct type.
        ValueError: If the new_embeddings parameter is empty or invalid.
    """
    self.wte = new_embeddings
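
A short sketch of the two accessors above, under the assumption that you have two GPTPanguModel instances built from the same (illustrative, tiny) configuration and want them to share one token-embedding table:

```python
from mindnlp.transformers.models.gpt_pangu.configuration_gptpangu import GPTPanguConfig
from mindnlp.transformers.models.gpt_pangu.modeling_gptpangu import GPTPanguModel

config = GPTPanguConfig(vocab_size=1000, hidden_size=64, intermediate_size=256,
                        num_layers=2, num_heads=4, max_position_embeddings=128)
model_a = GPTPanguModel(config)
model_b = GPTPanguModel(config)

# Reuse model_a's wte embedding layer as model_b's input embeddings.
shared_wte = model_a.get_input_embeddings()
model_b.set_input_embeddings(shared_wte)
assert model_b.get_input_embeddings() is shared_wte
```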

mindnlp.transformers.models.gpt_pangu.modeling_gptpangu.GPTPanguPreTrainedModel

Bases: PreTrainedModel

An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.

Source code in mindnlp/transformers/models/gpt_pangu/modeling_gptpangu.py
class GPTPanguPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    config_class = GPTPanguConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = False

    def _init_weights(self, cell):
        """Initialize the weights"""
        if isinstance(cell, (nn.Linear,)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            cell.weight.set_data(initializer(Normal(self.config.initializer_range),
                                                    cell.weight.shape, cell.weight.dtype))
            if cell.bias is not None:
                cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype))
        elif isinstance(cell, nn.Embedding):
            weight = initializer(Normal(self.config.initializer_range),
                                                 cell.weight.shape,
                                                 cell.weight.dtype)
            if cell.padding_idx is not None:
                weight[cell.padding_idx] = 0
            cell.weight.set_data(weight)
        elif isinstance(cell, nn.LayerNorm):
            cell.weight.set_data(initializer('ones', cell.weight.shape, cell.weight.dtype))
            cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype))

        # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
        #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
        #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
        #
        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
        for name, p in cell.parameters_and_names():
            if "c_proj" in name and "weight" in name:
                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                p.set_data(initializer(Normal(self.config.initializer_range / math.sqrt(2 * self.config.num_layers)),
                                       p.shape, p.dtype))
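
For a sense of scale, the special c_proj initialization above shrinks the standard deviation by a factor of 1/sqrt(2 * num_layers). With the GPTPanguConfig defaults (initializer_range=0.02, num_layers=32) that works out as follows:

```python
import math

initializer_range = 0.02  # GPTPanguConfig default
num_layers = 32           # GPTPanguConfig default

ordinary_std = initializer_range                            # Linear / Embedding weights
scaled_std = initializer_range / math.sqrt(2 * num_layers)  # every c_proj weight

print(ordinary_std)  # 0.02
print(scaled_std)    # 0.0025 (= 0.02 / 8)
```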

mindnlp.transformers.models.gpt_pangu.configuration_gptpangu

PanGu_Alpha Models config

mindnlp.transformers.models.gpt_pangu.configuration_gptpangu.GPTPanguConfig

Bases: PretrainedConfig

GPTPanguConfig

Source code in mindnlp/transformers/models/gpt_pangu/configuration_gptpangu.py
class GPTPanguConfig(PretrainedConfig):
    """GPTPanguConfig"""
    model_type = "gpt_pangu"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=40000,
        max_position_embeddings=1024,
        hidden_size=2560,
        intermediate_size=None,
        num_layers=32,
        num_heads=32,
        activation_function="gelu",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        scale_attn_weights=True,
        initializer_range=0.02,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        use_cache=True,
        bos_token_id=9,
        eos_token_id=9,
        **kwargs,
    ):
        """
        This method initializes an instance of the GPTPanguConfig class.

        Args:
            self: The instance of the class.
            vocab_size (int, optional): The size of the vocabulary. Defaults to 40000.
            max_position_embeddings (int, optional): The maximum position index. Defaults to 1024.
            hidden_size (int, optional): The hidden size of the model. Defaults to 2560.
            intermediate_size (int, optional): The size of the intermediate layer in the transformer encoder. Defaults to None.
            num_layers (int, optional): The number of layers in the transformer encoder. Defaults to 32.
            num_heads (int, optional): The number of attention heads in the transformer encoder. Defaults to 32.
            activation_function (str, optional): The activation function used in the transformer layers. Defaults to 'gelu'.
            resid_pdrop (float, optional): The dropout probability for the residual connections. Defaults to 0.1.
            embd_pdrop (float, optional): The dropout probability for the embedding layer. Defaults to 0.1.
            attn_pdrop (float, optional): The dropout probability for the attention layers. Defaults to 0.1.
            layer_norm_epsilon (float, optional): The epsilon value for layer normalization. Defaults to 1e-05.
            scale_attn_weights (bool, optional): Whether to scale the attention weights. Defaults to True.
            initializer_range (float, optional): The range of the initializer. Defaults to 0.02.
            summary_type (str, optional): The type of summary produced by the model. Defaults to 'cls_index'.
            summary_use_proj (bool, optional): Whether to use projection in the summary. Defaults to True.
            summary_activation (str, optional): The activation function used in the summary. Defaults to None.
            summary_proj_to_labels (bool, optional): Whether to project to labels in the summary. Defaults to True.
            summary_first_dropout (float, optional): The dropout probability for the first summary layer. Defaults to 0.1.
            use_cache (bool, optional): Whether to use cache in the model. Defaults to True.
            bos_token_id (int, optional): The beginning of sequence token id. Defaults to 9.
            eos_token_id (int, optional): The end of sequence token id. Defaults to 9.

        Returns:
            None.

        Raises:
            None
        """
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.scale_attn_weights = scale_attn_weights
        self.initializer_range = initializer_range
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
        self.use_cache = use_cache

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
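
A brief usage sketch of the configuration class: leaving all arguments out keeps the defaults listed in the signature above, while individual fields can be overridden for a smaller, purely illustrative model.

```python
from mindnlp.transformers.models.gpt_pangu.configuration_gptpangu import GPTPanguConfig

# All defaults: vocab_size=40000, hidden_size=2560, num_layers=32, num_heads=32, ...
default_config = GPTPanguConfig()
print(default_config.hidden_size, default_config.num_layers)  # 2560 32

# Override selected fields; everything else keeps its default value.
small_config = GPTPanguConfig(hidden_size=512, num_layers=6, num_heads=8)
print(small_config.model_type)          # gpt_pangu
print(small_config.layer_norm_epsilon)  # 1e-05
```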

mindnlp.transformers.models.gpt_pangu.configuration_gptpangu.GPTPanguConfig.__init__(vocab_size=40000, max_position_embeddings=1024, hidden_size=2560, intermediate_size=None, num_layers=32, num_heads=32, activation_function='gelu', resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, layer_norm_epsilon=1e-05, scale_attn_weights=True, initializer_range=0.02, summary_type='cls_index', summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, use_cache=True, bos_token_id=9, eos_token_id=9, **kwargs)

This method initializes an instance of the GPTPanguConfig class.

PARAMETER DESCRIPTION
self

The instance of the class.

vocab_size

The size of the vocabulary. Defaults to 40000.

TYPE: int DEFAULT: 40000

max_position_embeddings

The maximum position index. Defaults to 1024.

TYPE: int DEFAULT: 1024

hidden_size

The hidden size of the model. Defaults to 2560.

TYPE: int DEFAULT: 2560

intermediate_size

The size of the intermediate layer in the transformer encoder. Defaults to None.

TYPE: int DEFAULT: None

num_layers

The number of layers in the transformer encoder. Defaults to 32.

TYPE: int DEFAULT: 32

num_heads

The number of attention heads in the transformer encoder. Defaults to 32.

TYPE: int DEFAULT: 32

activation_function

The activation function used in the transformer layers. Defaults to 'gelu'.

TYPE: str DEFAULT: 'gelu'

resid_pdrop

The dropout probability for the residual connections. Defaults to 0.1.

TYPE: float DEFAULT: 0.1

embd_pdrop

The dropout probability for the embedding layer. Defaults to 0.1.

TYPE: float DEFAULT: 0.1

attn_pdrop

The dropout probability for the attention layers. Defaults to 0.1.

TYPE: float DEFAULT: 0.1

layer_norm_epsilon

The epsilon value for layer normalization. Defaults to 1e-05.

TYPE: float DEFAULT: 1e-05

scale_attn_weights

Whether to scale the attention weights. Defaults to True.

TYPE: bool DEFAULT: True

initializer_range

The range of the initializer. Defaults to 0.02.

TYPE: float DEFAULT: 0.02

summary_type

The type of summary produced by the model. Defaults to 'cls_index'.

TYPE: str DEFAULT: 'cls_index'

summary_use_proj

Whether to use projection in the summary. Defaults to True.

TYPE: bool DEFAULT: True

summary_activation

The activation function used in the summary. Defaults to None.

TYPE: str DEFAULT: None

summary_proj_to_labels

Whether to project to labels in the summary. Defaults to True.

TYPE: bool DEFAULT: True

summary_first_dropout

The dropout probability for the first summary layer. Defaults to 0.1.

TYPE: float DEFAULT: 0.1

use_cache

Whether to use cache in the model. Defaults to True.

TYPE: bool DEFAULT: True

bos_token_id

The beginning of sequence token id. Defaults to 9.

TYPE: int DEFAULT: 9

eos_token_id

The end of sequence token id. Defaults to 9.

TYPE: int DEFAULT: 9

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/gpt_pangu/configuration_gptpangu.py
def __init__(
    self,
    vocab_size=40000,
    max_position_embeddings=1024,
    hidden_size=2560,
    intermediate_size=None,
    num_layers=32,
    num_heads=32,
    activation_function="gelu",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-5,
    scale_attn_weights=True,
    initializer_range=0.02,
    summary_type="cls_index",
    summary_use_proj=True,
    summary_activation=None,
    summary_proj_to_labels=True,
    summary_first_dropout=0.1,
    use_cache=True,
    bos_token_id=9,
    eos_token_id=9,
    **kwargs,
):
    """
    This method initializes an instance of the GPTPanguConfig class.

    Args:
        self: The instance of the class.
        vocab_size (int, optional): The size of the vocabulary. Defaults to 40000.
        max_position_embeddings (int, optional): The maximum position index. Defaults to 1024.
        hidden_size (int, optional): The hidden size of the model. Defaults to 2560.
        intermediate_size (int, optional): The size of the intermediate layer in the transformer encoder. Defaults to None.
        num_layers (int, optional): The number of layers in the transformer encoder. Defaults to 32.
        num_heads (int, optional): The number of attention heads in the transformer encoder. Defaults to 32.
        activation_function (str, optional): The activation function used in the transformer layers. Defaults to 'gelu'.
        resid_pdrop (float, optional): The dropout probability for the residual connections. Defaults to 0.1.
        embd_pdrop (float, optional): The dropout probability for the embedding layer. Defaults to 0.1.
        attn_pdrop (float, optional): The dropout probability for the attention layers. Defaults to 0.1.
        layer_norm_epsilon (float, optional): The epsilon value for layer normalization. Defaults to 1e-05.
        scale_attn_weights (bool, optional): Whether to scale the attention weights. Defaults to True.
        initializer_range (float, optional): The range of the initializer. Defaults to 0.02.
        summary_type (str, optional): The type of summary produced by the model. Defaults to 'cls_index'.
        summary_use_proj (bool, optional): Whether to use projection in the summary. Defaults to True.
        summary_activation (str, optional): The activation function used in the summary. Defaults to None.
        summary_proj_to_labels (bool, optional): Whether to project to labels in the summary. Defaults to True.
        summary_first_dropout (float, optional): The dropout probability for the first summary layer. Defaults to 0.1.
        use_cache (bool, optional): Whether to use cache in the model. Defaults to True.
        bos_token_id (int, optional): The beginning of sequence token id. Defaults to 9.
        eos_token_id (int, optional): The end of sequence token id. Defaults to 9.

    Returns:
        None.

    Raises:
        None
    """
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.activation_function = activation_function
    self.resid_pdrop = resid_pdrop
    self.embd_pdrop = embd_pdrop
    self.attn_pdrop = attn_pdrop
    self.layer_norm_epsilon = layer_norm_epsilon
    self.scale_attn_weights = scale_attn_weights
    self.initializer_range = initializer_range
    self.summary_type = summary_type
    self.summary_use_proj = summary_use_proj
    self.summary_activation = summary_activation
    self.summary_first_dropout = summary_first_dropout
    self.summary_proj_to_labels = summary_proj_to_labels
    self.use_cache = use_cache

    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id

    super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu

PanGu_Alpha Tokenizer.

mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu.GPTPanguTokenizer

Bases: PreTrainedTokenizer

This class represents a tokenizer for the GPTPangu model, which is used for tokenizing Chinese text. It inherits from the PreTrainedTokenizer class.

ATTRIBUTE DESCRIPTION
sp

An instance of the SentencePieceProcessor class used for tokenization.

TYPE: SentencePieceProcessor

translator

A translation dictionary to replace spaces and newlines with special tokens.

TYPE: dict

Properties

vocab_size (int): Returns the size of the vocabulary used by the tokenizer.

METHOD DESCRIPTION
__init__

Initializes the GPTPanguTokenizer object.

get_vocab

Returns the vocabulary as a dictionary.

build_inputs_with_special_tokens

Builds model inputs by adding special tokens to a sequence or a pair of sequences for sequence classification tasks.

tokenize

Tokenizes a string.

convert_tokens_to_ids

Converts a list of tokens to their corresponding IDs.

convert_ids_to_tokens

Converts a list of IDs to their corresponding tokens.

decode

Decodes a list of IDs into text.

Source code in mindnlp/transformers/models/gpt_pangu/tokenization_gptpangu.py
class GPTPanguTokenizer(PreTrainedTokenizer):

    """
    This class represents a tokenizer for the GPTPangu model, which is used for tokenizing Chinese text.
    It inherits from the PreTrainedTokenizer class.

    Attributes:
        sp (sentencepiece.SentencePieceProcessor): An instance of the SentencePieceProcessor class used for tokenization.
        translator (dict): A translation dictionary to replace spaces and newlines with special tokens.

    Properties:
        vocab_size (int): Returns the size of the vocabulary used by the tokenizer.

    Methods:
        __init__:
            Initializes the GPTPanguTokenizer object.

        get_vocab:
            Returns the vocabulary as a dictionary.

        build_inputs_with_special_tokens:
            Builds model inputs by adding special tokens to a sequence or a pair of sequences
            for sequence classification tasks.

        tokenize:
            Tokenizes a string.

        convert_tokens_to_ids:
            Converts a list of tokens to their corresponding IDs.

        convert_ids_to_tokens:
            Converts a list of IDs to their corresponding tokens.

        decode:
            Decodes a list of IDs into text.
    """
    # Ref: https://git.openi.org.cn/PCL-Platform.Intelligence/PanGu-Alpha/src/branch/master/tokenization_jieba.py
    vocab_files_names = {
        "model_file": "vocab.model"
    }

    def __init__(
            self,
            model_file,
            **kwargs
    ):
        """
        Initializes a new instance of the GPTPanguTokenizer class.

        Args:
            self: An instance of the GPTPanguTokenizer class.
            model_file (str): The path to the model file used by the tokenizer.
                The model file should be in the format expected by the sentencepiece.SentencePieceProcessor.
                The tokenizer will load the model file during initialization.

        Returns:
            None.

        Raises:
            None.

        """
        self.sp = sentencepiece.SentencePieceProcessor()
        self.sp.Load(model_file=model_file)
        self.translator = str.maketrans(" \n", "\u2582\u2583")

        super().__init__(**kwargs)
        # special token ids
        # self.eos_token_id = self.sp.piece_to_id("<eot>")

    @property
    def vocab_size(self):
        """ Returns vocab size """
        return self.sp.vocab_size()

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if self.bos_token_id is not None:
            if token_ids_1 is None:
                return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
            bos = [self.bos_token_id]
            sep = [self.sep_token_id]
            eos = [self.eos_token_id]
            return bos + token_ids_0 + sep + token_ids_1 + eos

        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        sep = [self.sep_token_id]
        eos = [self.eos_token_id]
        return token_ids_0 + sep + token_ids_1 + eos

    def tokenize(self, text, **kwargs):
        """ Tokenize a string. """
        seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
        return seg_list

    def convert_tokens_to_ids(self, tokens):
        """
        Converts a list of tokens into their corresponding token IDs using the GPTPanguTokenizer.

        Args:
            self (GPTPanguTokenizer): An instance of the GPTPanguTokenizer class.
            tokens (str or list): The tokens to be converted into token IDs.
                If a string is provided, it will be treated as a single token.

        Returns:
            list or None: A list of token IDs corresponding to the input tokens.
                Returns None if the input tokens are None.

        Raises:
            None

        Note:
            - If the input tokens are None, the method returns None.
            - If the input tokens are a string, the method calls the _convert_token_to_id_with_added_voc() method to
            convert it into a token ID.
            - If the input tokens contain special tokens, the method identifies their indices and splits the tokens
            into segments. Each segment is then encoded using the sp.encode() method and appended to the list of token
            IDs.
            - The method concatenates all the encoded segments and returns the final list of token IDs.

        Example:
            ```python
            >>> tokenizer = GPTPanguTokenizer()
            >>> tokens = ['Hello', 'world', '!']
            >>> ids = tokenizer.convert_tokens_to_ids(tokens)
            >>> # ids = [123, 456, 789]
            ```
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        special_tokens_index = [i for i, token in enumerate(tokens) if token in self.all_special_tokens]

        ids = []
        i = 0
        for j in special_tokens_index:
            new_seg = " ".join(tokens[i:j])
            ids.extend(self.sp.encode(new_seg))
            ids.append(self._convert_token_to_id(tokens[j]))
            i = j + 1

        new_seg = " ".join(tokens[i:])
        ids.extend(self.sp.encode(new_seg))

        return ids

        # new_seg = " ".join(tokens)
        # return self.sp.encode(new_seg)
        # # return tokens

    def _convert_token_to_id(self, token):
        """
        Converts a token to its corresponding ID using the GPTPanguTokenizer.

        Args:
            self (GPTPanguTokenizer): An instance of the GPTPanguTokenizer class.
            token (str): The token to be converted to its corresponding ID.

        Returns:
            int: The vocabulary ID corresponding to the given token, as produced by SentencePiece.

        Raises:
            TypeError: If the token provided is not a string.
            ValueError: If the token does not exist in the tokenizer's vocabulary.
        """
        return self.sp.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """
        Converts an index to its corresponding token using the GPTPanguTokenizer.

        Args:
            self (GPTPanguTokenizer): An instance of the GPTPanguTokenizer class.
            index (int): The index value to be converted to a token. It should be a non-negative integer.

        Returns:
            str: The token (SentencePiece piece) corresponding to the given index.

        Raises:
            None
        """
        return self.sp.id_to_piece(index)

    def convert_ids_to_tokens(self, ids):
        """
        Converts a list of token IDs to their corresponding tokens using the GPTPanguTokenizer.

        Args:
            self (GPTPanguTokenizer): An instance of the GPTPanguTokenizer class.
            ids (List[int]): A list of token IDs to be converted to tokens. Each ID represents a unique token.

        Returns:
            str: The decoded text corresponding to the given token IDs (this implementation decodes the IDs
            directly to text via `decode` rather than returning a list of tokens).

        Raises:
            None

        Note:
            The GPTPanguTokenizer must be initialized with a pretrained model before using this method.

        Example:
            ```python
            >>> tokenizer = GPTPanguTokenizer()
            >>> token_ids = [0, 1, 2]
            >>> tokenizer.convert_ids_to_tokens(token_ids)
            ['<s>', 'Hello', '</s>']
            ```
        """
        return self.decode(ids)

    def decode(self, ids, **kwargs):
        """
        Decode the given token IDs into text using the GPTPanguTokenizer.

        Args:
            self (GPTPanguTokenizer): An instance of the GPTPanguTokenizer class.
            ids (Union[mindspore.Tensor, np.ndarray, List[int]]): The token IDs to decode into text.
                If passed as a mindspore.Tensor or np.ndarray, it will be converted to a list of integers.
                This parameter is required.

        Returns:
            str: The decoded text corresponding to the provided token IDs.
                Plain spaces in the SentencePiece output are removed, '▂' is replaced with a space, and '▃' is
                replaced with a newline character.

        Raises:
            None: This method does not raise any exceptions.
        """
        if isinstance(ids, (mindspore.Tensor, np.ndarray)):
            ids = ids.tolist()

        if kwargs.get('skip_special_tokens', None) is True:
            ids = [token_id for token_id in ids if token_id not in self.all_special_ids]
        text = self.sp.decode(ids)
        if isinstance(text, list):
            text = text[0]
        text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n')#.replace('⁇', self.unk_token)
        return text
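
For orientation before the per-method reference below, here is a minimal end-to-end sketch of the class: `tokenize` segments raw text with jieba and maps whitespace to placeholder characters, `convert_tokens_to_ids` runs the pieces through the SentencePiece model, and `decode` restores the whitespace. The model path is a placeholder, not a file shipped with mindnlp:

```python
from mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu import GPTPanguTokenizer

# 'vocab.model' is a hypothetical SentencePiece model file, used for illustration only.
tokenizer = GPTPanguTokenizer(model_file="vocab.model")

text = "今天 天气很好"
tokens = tokenizer.tokenize(text)               # jieba segments; ' ' -> '▂', '\n' -> '▃'
ids = tokenizer.convert_tokens_to_ids(tokens)   # SentencePiece IDs, special tokens kept as-is
print(tokenizer.decode(ids))                    # '▂' -> ' ', '▃' -> '\n'; round-trips up to SP normalization
```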

mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu.GPTPanguTokenizer.vocab_size property

Returns vocab size

mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu.GPTPanguTokenizer.__init__(model_file, **kwargs)

Initializes a new instance of the GPTPanguTokenizer class.

PARAMETER DESCRIPTION
self

An instance of the GPTPanguTokenizer class.

model_file

The path to the model file used by the tokenizer. The model file should be in the format expected by the sentencepiece.SentencePieceProcessor. The tokenizer will load the model file during initialization.

TYPE: str

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/gpt_pangu/tokenization_gptpangu.py
def __init__(
        self,
        model_file,
        **kwargs
):
    """
    Initializes a new instance of the GPTPanguTokenizer class.

    Args:
        self: An instance of the GPTPanguTokenizer class.
        model_file (str): The path to the model file used by the tokenizer.
            The model file should be in the format expected by the sentencepiece.SentencePieceProcessor.
            The tokenizer will load the model file during initialization.

    Returns:
        None.

    Raises:
        None.

    """
    self.sp = sentencepiece.SentencePieceProcessor()
    self.sp.Load(model_file=model_file)
    self.translator = str.maketrans(" \n", "\u2582\u2583")

    super().__init__(**kwargs)
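
A hedged construction sketch (the model path below is a placeholder): the constructor only loads the SentencePiece model and builds the whitespace translation table that `tokenize` applies later.

```python
import sentencepiece
from mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu import GPTPanguTokenizer

# 'path/to/vocab.model' is hypothetical; point it at the SentencePiece model of your checkpoint.
tokenizer = GPTPanguTokenizer(model_file="path/to/vocab.model")

assert isinstance(tokenizer.sp, sentencepiece.SentencePieceProcessor)
print(tokenizer.translator)  # {32: 9602, 10: 9603}, i.e. ' ' -> '▂' and '\n' -> '▃'
```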

mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu.GPTPanguTokenizer.build_inputs_with_special_tokens(token_ids_0, token_ids_1=None)

Build model inputs from a single sequence or a pair of sequences by concatenating and adding special tokens. A GPTPangu sequence has the following format (the BOS token is omitted when no bos_token_id is configured):

  • single sequence: [BOS] X [EOS]
  • pair of sequences: [BOS] A [SEP] B [EOS]
PARAMETER DESCRIPTION
token_ids_0

List of IDs to which the special tokens will be added.

TYPE: `List[int]`

token_ids_1

Optional second list of IDs for sequence pairs.

TYPE: `List[int]`, *optional* DEFAULT: None

RETURNS DESCRIPTION

List[int]: List of input IDs with the appropriate special tokens.

Source code in mindnlp/transformers/models/gpt_pangu/tokenization_gptpangu.py
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """
    Build model inputs from a single sequence or a pair of sequences by concatenating and adding special
    tokens. A GPTPangu sequence has the following format (the BOS token is omitted when no `bos_token_id`
    is configured):

    - single sequence: `[BOS] X [EOS]`
    - pair of sequences: `[BOS] A [SEP] B [EOS]`

    Args:
        token_ids_0 (`List[int]`):
            List of IDs to which the special tokens will be added.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
    """
    if self.bos_token_id is not None:
        if token_ids_1 is None:
            return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        bos = [self.bos_token_id]
        sep = [self.sep_token_id]
        eos = [self.eos_token_id]
        return bos + token_ids_0 + sep + token_ids_1 + eos

    if token_ids_1 is None:
        return token_ids_0 + [self.eos_token_id]
    sep = [self.sep_token_id]
    eos = [self.eos_token_id]
    return token_ids_0 + sep + token_ids_1 + eos
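
To make the branching above concrete, the sketch below mirrors the method body with made-up special-token IDs (the real values come from the tokenizer's configured special tokens), so it runs without a model file:

```python
# Hypothetical special-token IDs, for illustration only.
BOS, SEP, EOS = 1, 2, 3

def build_inputs(token_ids_0, token_ids_1=None, bos_token_id=BOS):
    # Mirrors GPTPanguTokenizer.build_inputs_with_special_tokens.
    if bos_token_id is not None:
        if token_ids_1 is None:
            return [bos_token_id] + token_ids_0 + [EOS]
        return [bos_token_id] + token_ids_0 + [SEP] + token_ids_1 + [EOS]
    if token_ids_1 is None:
        return token_ids_0 + [EOS]
    return token_ids_0 + [SEP] + token_ids_1 + [EOS]

print(build_inputs([11, 12]))                     # [1, 11, 12, 3]
print(build_inputs([11, 12], [21, 22]))           # [1, 11, 12, 2, 21, 22, 3]
print(build_inputs([11, 12], bos_token_id=None))  # [11, 12, 3] -- no BOS configured
```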

mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu.GPTPanguTokenizer.convert_ids_to_tokens(ids)

Converts a sequence of token IDs back to text using the GPTPanguTokenizer.

PARAMETER DESCRIPTION
self

An instance of the GPTPanguTokenizer class.

TYPE: GPTPanguTokenizer

ids

A list of token IDs to be converted to tokens. Each ID represents a unique token.

TYPE: List[int]

RETURNS DESCRIPTION

str: The decoded text for the given IDs; the method delegates to decode and returns a single string rather than a list of piece strings.

Note

The GPTPanguTokenizer must be initialized with a pretrained model before using this method.

Example
>>> tokenizer = GPTPanguTokenizer(model_file='vocab.model')
>>> token_ids = [0, 1, 2]
>>> tokenizer.convert_ids_to_tokens(token_ids)  # returns the decoded string for these IDs
Source code in mindnlp/transformers/models/gpt_pangu/tokenization_gptpangu.py
def convert_ids_to_tokens(self, ids):
    """
    Converts a sequence of token IDs back to text using the GPTPanguTokenizer.

    Args:
        self (GPTPanguTokenizer): An instance of the GPTPanguTokenizer class.
        ids (List[int]): A list of token IDs to be converted to tokens. Each ID represents a unique token.

    Returns:
        str: The decoded text for the given IDs. The method delegates to `decode`, so it returns a single
            string rather than a list of piece strings.

    Raises:
        None

    Note:
        The GPTPanguTokenizer must be initialized with a pretrained model before using this method.

    Example:
        ```python
        >>> tokenizer = GPTPanguTokenizer(model_file='vocab.model')
        >>> token_ids = [0, 1, 2]
        >>> tokenizer.convert_ids_to_tokens(token_ids)  # returns the decoded string for these IDs
        ```
    """
    return self.decode(ids)
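
A hedged usage note: because the implementation simply delegates to `decode`, the return value is a single decoded string, not a list of piece strings (the model path and IDs below are placeholders):

```python
from mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu import GPTPanguTokenizer

tokenizer = GPTPanguTokenizer(model_file="vocab.model")  # hypothetical model file
ids = [11, 12, 13]                                       # illustrative IDs
text = tokenizer.convert_ids_to_tokens(ids)
print(type(text))                                        # <class 'str'> -- the decoded text
```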

mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu.GPTPanguTokenizer.convert_tokens_to_ids(tokens)

Converts a list of tokens into their corresponding token IDs using the GPTPanguTokenizer.

PARAMETER DESCRIPTION
self

An instance of the GPTPanguTokenizer class.

TYPE: GPTPanguTokenizer

tokens

The tokens to be converted into token IDs. If a string is provided, it will be treated as a single token.

TYPE: str or list

RETURNS DESCRIPTION

list or None: A list of token IDs corresponding to the input tokens. Returns None if the input tokens are None.

Note
  • If the input tokens are None, the method returns None.
  • If the input tokens are a string, the method calls the _convert_token_to_id_with_added_voc() method to convert it into a token ID.
  • If the input tokens contain special tokens, the method identifies their indices and splits the tokens into segments. Each segment is then encoded using the sp.encode() method and appended to the list of token IDs.
  • The method concatenates all the encoded segments and returns the final list of token IDs.
Example
>>> tokenizer = GPTPanguTokenizer(model_file='vocab.model')
>>> tokens = ['Hello', 'world', '!']
>>> ids = tokenizer.convert_tokens_to_ids(tokens)
>>> # ids = [123, 456, 789]
Source code in mindnlp/transformers/models/gpt_pangu/tokenization_gptpangu.py
def convert_tokens_to_ids(self, tokens):
    """
    Converts a list of tokens into their corresponding token IDs using the GPTPanguTokenizer.

    Args:
        self (GPTPanguTokenizer): An instance of the GPTPanguTokenizer class.
        tokens (str or list): The tokens to be converted into token IDs.
            If a string is provided, it will be treated as a single token.

    Returns:
        list or None: A list of token IDs corresponding to the input tokens.
            Returns None if the input tokens are None.

    Raises:
        None

    Note:
        - If the input tokens are None, the method returns None.
        - If the input tokens are a string, the method calls the _convert_token_to_id_with_added_voc() method to
        convert it into a token ID.
        - If the input tokens contain special tokens, the method identifies their indices and splits the tokens
        into segments. Each segment is then encoded using the sp.encode() method and appended to the list of token
        IDs.
        - The method concatenates all the encoded segments and returns the final list of token IDs.

    Example:
        ```python
        >>> tokenizer = GPTPanguTokenizer(model_file='vocab.model')
        >>> tokens = ['Hello', 'world', '!']
        >>> ids = tokenizer.convert_tokens_to_ids(tokens)
        >>> # ids = [123, 456, 789]
        ```
    """
    if tokens is None:
        return None

    if isinstance(tokens, str):
        return self._convert_token_to_id_with_added_voc(tokens)

    special_tokens_index = [i for i, token in enumerate(tokens) if token in self.all_special_tokens]

    ids = []
    i = 0
    for j in special_tokens_index:
        new_seg = " ".join(tokens[i:j])
        ids.extend(self.sp.encode(new_seg))
        ids.append(self._convert_token_to_id(tokens[j]))
        i = j + 1

    new_seg = " ".join(tokens[i:])
    ids.extend(self.sp.encode(new_seg))

    return ids
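
The special-token handling is easiest to see in isolation. The sketch below re-implements the segment splitting with a stub encoder standing in for `self.sp.encode`, so it runs without a SentencePiece model; the token names and IDs are purely illustrative:

```python
SPECIAL_TOKENS = {"<sep>": 2, "<eot>": 3}   # hypothetical special tokens and their IDs

def stub_encode(segment):
    # Stand-in for SentencePieceProcessor.encode: one fake ID per non-empty segment.
    return [100 + len(segment)] if segment else []

def convert(tokens):
    special_idx = [i for i, tok in enumerate(tokens) if tok in SPECIAL_TOKENS]
    ids, start = [], 0
    for j in special_idx:
        ids.extend(stub_encode(" ".join(tokens[start:j])))  # encode the run before the special token
        ids.append(SPECIAL_TOKENS[tokens[j]])               # keep the special token's own ID
        start = j + 1
    ids.extend(stub_encode(" ".join(tokens[start:])))       # trailing run after the last special token
    return ids

print(convert(["你好", "世界", "<sep>", "再见"]))  # [105, 2, 102]
```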

mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu.GPTPanguTokenizer.decode(ids, **kwargs)

Decode the given token IDs into text using the GPTPanguTokenizer.

PARAMETER DESCRIPTION
self

An instance of the GPTPanguTokenizer class.

TYPE: GPTPanguTokenizer

ids

The token IDs to decode into text. If passed as a mindspore.Tensor or np.ndarray, it will be converted to a list of integers. This parameter is required.

TYPE: Union[Tensor, ndarray, List[int]]

RETURNS DESCRIPTION
str

The decoded text corresponding to the provided token IDs. Spaces inserted by SentencePiece are removed, '▂' is replaced with a space, and '▃' is replaced with a newline character.

RAISES DESCRIPTION
None

This method does not raise any exceptions.

Source code in mindnlp/transformers/models/gpt_pangu/tokenization_gptpangu.py
def decode(self, ids, **kwargs):
    """
    Decode the given token IDs into text using the GPTPanguTokenizer.

    Args:
        self (GPTPanguTokenizer): An instance of the GPTPanguTokenizer class.
        ids (Union[mindspore.Tensor, np.ndarray, List[int]]): The token IDs to decode into text.
            If passed as a mindspore.Tensor or np.ndarray, it will be converted to a list of integers.
            This parameter is required.

    Returns:
        str: The decoded text corresponding to the provided token IDs.
            Spaces inserted by SentencePiece are removed, '▂' is replaced with a space, and '▃' is replaced
            with a newline character.

    Raises:
        None: This method does not raise any exceptions.
    """
    if isinstance(ids, (mindspore.Tensor, np.ndarray)):
        ids = ids.tolist()

    if kwargs.get('skip_special_tokens', None) is True:
        ids = [token_id for token_id in ids if token_id not in self.all_special_ids]
    text = self.sp.decode(ids)
    if isinstance(text, list):
        text = text[0]
    text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n')#.replace('⁇', self.unk_token)
    return text
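
The post-processing is plain string work and can be shown without a model: the joining spaces produced by SentencePiece are stripped, then the placeholders written by `tokenize` become real whitespace again. The raw string below is illustrative:

```python
raw = "今天 ▂ 天气 ▃ 很好"          # illustrative SentencePiece output
text = raw.replace(" ", "").replace("\u2582", " ").replace("\u2583", "\n")
print(repr(text))                    # '今天 天气\n很好'
```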

mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu.GPTPanguTokenizer.get_vocab()

Returns vocab as a dict

Source code in mindnlp/transformers/models/gpt_pangu/tokenization_gptpangu.py
def get_vocab(self):
    """ Returns vocab as a dict """
    vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
    vocab.update(self.added_tokens_encoder)
    return vocab
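
A hedged usage sketch (assumes a tokenizer built from a SentencePiece model as above): the dict maps piece strings to IDs and also includes any tokens added on top of the base vocabulary.

```python
vocab = tokenizer.get_vocab()                               # {piece: id}; rebuilt on every call, so cache it
id_to_piece = {idx: piece for piece, idx in vocab.items()}  # convenient reverse lookup
print(len(vocab) >= tokenizer.vocab_size)                   # True: added tokens only grow the dict
```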

mindnlp.transformers.models.gpt_pangu.tokenization_gptpangu.GPTPanguTokenizer.tokenize(text, **kwargs)

Tokenize a string.

Source code in mindnlp/transformers/models/gpt_pangu/tokenization_gptpangu.py
def tokenize(self, text, **kwargs):
    """ Tokenize a string. """
    seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
    return seg_list
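
The translation step applied to every jieba segment is just `str.translate`; a standalone sketch of the mapping (no model or jieba needed):

```python
# Same table the tokenizer builds in __init__: ' ' -> '▂', '\n' -> '▃'.
translator = str.maketrans(" \n", "\u2582\u2583")

print("hello world\n".translate(translator))  # 'hello▂world▃'
```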