moss

mindnlp.transformers.models.moss.moss

Moss model

mindnlp.transformers.models.moss.moss.MossAttention

Bases: Module

Moss attention layer
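
The core of `_attn` is standard scaled dot-product attention with a lower-triangular causal mask and a large negative fill value before the softmax. A minimal NumPy sketch of that computation (the shapes and masking strategy mirror the method; the random inputs are illustrative only):

import numpy as np

batch, heads, seq, head_dim = 1, 2, 4, 8
rng = np.random.default_rng(0)
query = rng.normal(size=(batch, heads, seq, head_dim)).astype(np.float32)
key = rng.normal(size=(batch, heads, seq, head_dim)).astype(np.float32)
value = rng.normal(size=(batch, heads, seq, head_dim)).astype(np.float32)

# Lower-triangular causal mask, sliced the same way _attn slices self.causal_mask.
causal_mask = np.tril(np.ones((seq, seq), dtype=bool))[None, None, :, :]

# QK^T / sqrt(head_dim), kept in fp32 as in the method.
attn_weights = query @ key.transpose(0, 1, 3, 2) / np.sqrt(head_dim)

# Masked positions receive the most negative representable value before softmax.
mask_value = np.finfo(np.float32).min
attn_weights = np.where(causal_mask, attn_weights, mask_value)

# Softmax over the key dimension, then weight the values.
attn_weights = np.exp(attn_weights - attn_weights.max(-1, keepdims=True))
attn_weights /= attn_weights.sum(-1, keepdims=True)
attn_output = attn_weights @ value          # (batch, heads, seq, head_dim)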

Source code in mindnlp/transformers/models/moss/moss.py
class MossAttention(nn.Module):
    """
    Moss attention layer
    """
    def __init__(self, config):
        """
        Initializes a MossAttention object.

        Args:
            self (MossAttention): The current instance of MossAttention.
            config (object):
                A configuration object containing the following attributes:

                - max_position_embeddings (int): The maximum number of positions for positional embeddings.
                - attn_pdrop (float): The dropout probability for attention weights.
                - resid_pdrop (float): The dropout probability for residual connections.
                - hidden_size (int): The dimension of the hidden state.
                - num_attention_heads (int): The number of attention heads for multi-head attention.
                - rotary_dim (int): The dimension for rotary position embeddings.

        Returns:
            None.

        Raises:
            ValueError: If the `embed_dim` is not divisible by `num_attention_heads`.
        """
        super().__init__()

        max_positions = config.max_position_embeddings
        # self.register_buffer(
        #     "causal_mask",
        #     ops.tril(ops.ones(max_positions, max_positions)).view(
        #         1, 1, max_positions, max_positions
        #     ),
        # )
        self.causal_mask = ops.tril(ops.ones((max_positions, max_positions), dtype=mindspore.bool_)).view(
            1, 1, max_positions, max_positions)

        self.attn_dropout = nn.Dropout(p=config.attn_pdrop)
        self.resid_dropout = nn.Dropout(p=config.resid_pdrop)

        self.embed_dim = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_attention_heads
        if self.head_dim * self.num_attention_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
                f" `num_attention_heads`: {self.num_attention_heads})."
            )
        self.scale_attn = ops.sqrt(Tensor(self.head_dim, dtype=mindspore.float32)).to(
            mindspore.float32
        )
        self.qkv_proj = nn.Linear(
            self.embed_dim, self.embed_dim * 3, bias=False)

        self.out_proj = nn.Linear(
            self.embed_dim, self.embed_dim, bias=False)
        self.rotary_dim = config.rotary_dim
        pos_embd_dim = self.rotary_dim or self.embed_dim
        self.embed_positions = create_sinusoidal_positions(
            max_positions, pos_embd_dim)

    def _split_heads(self, input_tensor, n_head, dim_head, mp_num):
        """
        Splits the input tensor into multiple heads for multi-head attention in the MossAttention class.

        Args:
            self (MossAttention): An instance of the MossAttention class.
            input_tensor (tensor): The input tensor to be split into heads.
            n_head (int): The total number of heads in the attention mechanism.
            dim_head (int): The dimensionality of each head.
            mp_num (int): The number of parallel processes.

        Returns:
            Tensor: The reshaped tensor with the attention heads split out along a new dimension.

        Raises:
            None.
        """
        reshaped = input_tensor.reshape(input_tensor.shape[:-1] + (n_head // mp_num, dim_head))
        reshaped = reshaped.reshape(input_tensor.shape[:-2] + (-1,) + reshaped.shape[-1:])
        return reshaped

    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
        """
        if len(tensor.shape) == 5:
            # tensor = ops.permute(tensor,(0, 1, 3, 2, 4))
            tensor = ops.permute(tensor, (0, 1, 3, 2, 4))
        elif len(tensor.shape) == 4:
            # tensor = ops.permute(tensor, (0, 2, 1, 3))
            tensor = ops.permute(tensor, (0, 2, 1, 3))
        else:
            raise ValueError(
                f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
        new_shape = tensor.shape[:-2] + (num_attention_heads * attn_head_size,)
        return tensor.view(new_shape)

    def _attn(
            self,
            query,
            key,
            value,
            attention_mask=None,
            head_mask=None,
    ):
        """
        Method _attn in the MossAttention class.

        Args:
            self: The instance of the MossAttention class.
            query (Tensor): The query tensor for attention calculation.
                Shape should be (batch_size, num_heads, query_length, head_size).
            key (Tensor): The key tensor for attention calculation.
                Shape should be (batch_size, num_heads, key_length, head_size).
            value (Tensor): The value tensor to be weighted by attention scores.
                Shape should be (batch_size, num_heads, key_length, head_size).
            attention_mask (Tensor, optional): Mask tensor to be added to the attention weights.
                If provided, its shape should match the shape of the attention weights.
            head_mask (Tensor, optional): Mask tensor to be applied to the attention weights.
                If provided, its shape should match the shape of the attention weights.

        Returns:
            Tuple[Tensor, Tensor]: A tuple containing the attention output tensor and the attention weights tensor.
                The attention output tensor has the same shape as the value tensor.
                The attention weights tensor represents the importance assigned to each element in the value tensor
                based on the query and key similarities.

        Raises:
            ValueError: If query, key, or value tensors have incompatible shapes.
            TypeError: If the input tensors are not of the expected data type.
            IndexError: If there is an issue with the shape manipulation of the tensors.
            RuntimeError: If there is a runtime error during the computation process.
        """
        # compute causal mask from causal mask buffer
        query_length, key_length = query.shape[-2], key.shape[-2]
        causal_mask = self.causal_mask[:, :, key_length -
                                             query_length: key_length, :key_length]

        # Keep the attention weights computation in fp32 to avoid overflow issues
        query = query.to(mindspore.float32)
        key = key.to(mindspore.float32)

        attn_weights = ops.matmul(query, ops.swapaxes(key, -1, -2))

        attn_weights = attn_weights / self.scale_attn
        mask_value = np.finfo(
            mindspore.dtype_to_nptype(attn_weights.dtype)).min
        # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
        # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
        mask_value = Tensor(mask_value, dtype=attn_weights.dtype)
        attn_weights = ops.where(causal_mask, attn_weights, mask_value)

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.Softmax(axis=-1)(attn_weights)
        attn_weights = attn_weights.to(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = ops.matmul(attn_weights, value)

        return attn_output, attn_weights

    def forward(
            self,
            hidden_states: Optional[Tensor],
            layer_past: Optional[Tuple[Tensor]] = None,
            attention_mask: Optional[Tensor] = None,
            position_ids: Optional[Tensor] = None,
            head_mask: Optional[Tensor] = None,
            use_cache: Optional[bool] = False,
            output_attentions: Optional[bool] = False,
    ) -> Union[
        Tuple[Tensor, Tuple[Tensor]],
        Optional[Tuple[Tensor, Tuple[Tensor],
        Tuple[Tensor, ...]]],
    ]:
        """
        Constructs the attention mechanism for the MossAttention class.

        Args:
            self: The object instance.
            hidden_states (Optional[Tensor]): The input hidden states.
            layer_past (Optional[Tuple[Tensor]]): The past layer states.
            attention_mask (Optional[Tensor]): Mask for attention computation.
            position_ids (Optional[Tensor]): Indices of the positions of each input token.
            head_mask (Optional[Tensor]): Mask for attention heads.
            use_cache (Optional[bool]): Flag to indicate cache usage.
            output_attentions (Optional[bool]): Flag to output attention weights.

        Returns:
            Union[Tuple[Tensor, Tuple[Tensor]], Optional[Tuple[Tensor, Tuple[Tensor], Tuple[Tensor, ...]]]]:
                A tuple containing the attention output tensor and the present key/value state (the present state is
                None when use_cache is False). If output_attentions is True, the attention weights tensor is appended.

        Raises:
            None.
        """
        qkv = self.qkv_proj(hidden_states)
        # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic
        mp_num = 4
        qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))

        local_dim = self.head_dim * self.num_attention_heads // mp_num
        query, value, key = ops.split(qkv_split, local_dim, axis=-1)

        query = self._split_heads(
            query, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        key = self._split_heads(
            key, self.num_attention_heads, self.head_dim, mp_num=mp_num)

        value = self._split_heads(
            value, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        value = ops.permute(value, (0, 2, 1, 3))
        # embed_positions = self.embed_positions
        # if embed_positions.device != position_ids.device:
        #     embed_positions = embed_positions.to(position_ids.device)
        #     self.embed_positions = embed_positions

        embed_positions = self.embed_positions
        sincos = embed_positions[position_ids]
        sin, cos = ops.split(sincos, sincos.shape[-1] // 2, -1)

        if self.rotary_dim is not None:
            k_rot = key[:, :, :, : self.rotary_dim]
            k_pass = key[:, :, :, self.rotary_dim:]

            q_rot = query[:, :, :, : self.rotary_dim]
            q_pass = query[:, :, :, self.rotary_dim:]

            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)

            key = ops.cat([k_rot, k_pass], axis=-1)
            query = ops.cat([q_rot, q_pass], axis=-1)
        else:
            key = apply_rotary_pos_emb(key, sin, cos)
            query = apply_rotary_pos_emb(query, sin, cos)

        key = ops.permute(key, (0, 2, 1, 3))
        query = ops.permute(query, (0, 2, 1, 3))
        if layer_past is not None:
            past_key = layer_past[0]
            past_value = layer_past[1]
            key = ops.cat((past_key, key), axis=-2)
            value = ops.cat((past_value, value), axis=-2)

        if use_cache is True:
            present = (key, value)
        else:
            present = None

        # compute self-attention: V x Softmax(QK^T)
        attn_output, attn_weights = self._attn(
            query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(
            attn_output, self.num_attention_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)

mindnlp.transformers.models.moss.moss.MossAttention.__init__(config)

Initializes a MossAttention object.

PARAMETER DESCRIPTION
self

The current instance of MossAttention.

TYPE: MossAttention

config

A configuration object containing the following attributes:

  • max_position_embeddings (int): The maximum number of positions for positional embeddings.
  • attn_pdrop (float): The dropout probability for attention weights.
  • resid_pdrop (float): The dropout probability for residual connections.
  • hidden_size (int): The dimension of the hidden state.
  • num_attention_heads (int): The number of attention heads for multi-head attention.
  • rotary_dim (int): The dimension for rotary position embeddings.

TYPE: object

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If the embed_dim is not divisible by num_attention_heads.
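
For reference, `create_sinusoidal_positions(max_positions, pos_embd_dim)` builds a table whose last dimension is the concatenation of sines and cosines; `forward` later indexes it with `position_ids` and splits it back into `sin` and `cos` halves. A hedged NumPy sketch of such a table, assuming the GPT-J-style formulation this layer reuses:

import numpy as np

def sinusoidal_positions(num_pos: int, dim: int) -> np.ndarray:
    """Return a (num_pos, dim) table laid out as [sin | cos], GPT-J style (assumed)."""
    inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim))      # (dim // 2,)
    angles = np.einsum("i,j->ij", np.arange(num_pos), inv_freq)   # (num_pos, dim // 2)
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)

table = sinusoidal_positions(num_pos=16, dim=8)
sin, cos = np.split(table, 2, axis=-1)    # mirrors ops.split(sincos, sincos.shape[-1] // 2, -1)
print(table.shape, sin.shape, cos.shape)  # (16, 8) (16, 4) (16, 4)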

Source code in mindnlp/transformers/models/moss/moss.py
def __init__(self, config):
    """
    Initializes a MossAttention object.

    Args:
        self (MossAttention): The current instance of MossAttention.
        config (object):
            A configuration object containing the following attributes:

            - max_position_embeddings (int): The maximum number of positions for positional embeddings.
            - attn_pdrop (float): The dropout probability for attention weights.
            - resid_pdrop (float): The dropout probability for residual connections.
            - hidden_size (int): The dimension of the hidden state.
            - num_attention_heads (int): The number of attention heads for multi-head attention.
            - rotary_dim (int): The dimension for rotary position embeddings.

    Returns:
        None.

    Raises:
        ValueError: If the `embed_dim` is not divisible by `num_attention_heads`.
    """
    super().__init__()

    max_positions = config.max_position_embeddings
    # self.register_buffer(
    #     "causal_mask",
    #     ops.tril(ops.ones(max_positions, max_positions)).view(
    #         1, 1, max_positions, max_positions
    #     ),
    # )
    self.causal_mask = ops.tril(ops.ones((max_positions, max_positions), dtype=mindspore.bool_)).view(
        1, 1, max_positions, max_positions)

    self.attn_dropout = nn.Dropout(p=config.attn_pdrop)
    self.resid_dropout = nn.Dropout(p=config.resid_pdrop)

    self.embed_dim = config.hidden_size
    self.num_attention_heads = config.num_attention_heads
    self.head_dim = self.embed_dim // self.num_attention_heads
    if self.head_dim * self.num_attention_heads != self.embed_dim:
        raise ValueError(
            f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
            f" `num_attention_heads`: {self.num_attention_heads})."
        )
    self.scale_attn = ops.sqrt(Tensor(self.head_dim, dtype=mindspore.float32)).to(
        mindspore.float32
    )
    self.qkv_proj = nn.Linear(
        self.embed_dim, self.embed_dim * 3, bias=False)

    self.out_proj = nn.Linear(
        self.embed_dim, self.embed_dim, bias=False)
    self.rotary_dim = config.rotary_dim
    pos_embd_dim = self.rotary_dim or self.embed_dim
    self.embed_positions = create_sinusoidal_positions(
        max_positions, pos_embd_dim)

mindnlp.transformers.models.moss.moss.MossAttention.forward(hidden_states, layer_past=None, attention_mask=None, position_ids=None, head_mask=None, use_cache=False, output_attentions=False)

Constructs the attention mechanism for the MossAttention class.

PARAMETER DESCRIPTION
self

The object instance.

hidden_states

The input hidden states.

TYPE: Optional[Tensor]

layer_past

The past layer states.

TYPE: Optional[Tuple[Tensor]] DEFAULT: None

attention_mask

Mask for attention computation.

TYPE: Optional[Tensor] DEFAULT: None

position_ids

Indices of the positions of each input token.

TYPE: Optional[Tensor] DEFAULT: None

head_mask

Mask for attention heads.

TYPE: Optional[Tensor] DEFAULT: None

use_cache

Flag to indicate cache usage.

TYPE: Optional[bool] DEFAULT: False

output_attentions

Flag to output attention weights.

TYPE: Optional[bool] DEFAULT: False

RETURNS DESCRIPTION
Union[Tuple[Tensor, Tuple[Tensor]], Optional[Tuple[Tensor, Tuple[Tensor], Tuple[Tensor, ...]]]]

A tuple containing the attention output tensor and the present key/value state (the present state is None when use_cache is False). If output_attentions is True, the attention weights tensor is appended.
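
The fused `qkv_proj` output is first reshaped into `mp_num = 4` groups (a layout inherited from the original TPU checkpoint, per the TODO in the source), split into query/value/key along the last axis, and then regrouped into per-head slices by `_split_heads`. A NumPy walkthrough of the shape bookkeeping (dimensions are illustrative, chosen so that `num_heads` is divisible by `mp_num`):

import numpy as np

batch, seq = 2, 5
num_heads, head_dim, mp_num = 16, 64, 4
embed_dim = num_heads * head_dim                      # 1024

qkv = np.zeros((batch, seq, 3 * embed_dim))           # output of qkv_proj
qkv_split = qkv.reshape(batch, seq, mp_num, -1)       # (2, 5, 4, 768)

local_dim = head_dim * num_heads // mp_num            # 256
query, value, key = np.split(qkv_split, 3, axis=-1)   # each (2, 5, 4, 256)

# _split_heads: break each group into its heads, then fold the groups back together.
q = query.reshape(batch, seq, mp_num, num_heads // mp_num, head_dim)  # (2, 5, 4, 4, 64)
q = q.reshape(batch, seq, num_heads, head_dim)                        # (2, 5, 16, 64)

# Query/key are later transposed to (batch, num_heads, seq, head_dim) for attention.
q = q.transpose(0, 2, 1, 3)
print(q.shape)  # (2, 16, 5, 64)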

Source code in mindnlp/transformers/models/moss/moss.py
def forward(
        self,
        hidden_states: Optional[Tensor],
        layer_past: Optional[Tuple[Tensor]] = None,
        attention_mask: Optional[Tensor] = None,
        position_ids: Optional[Tensor] = None,
        head_mask: Optional[Tensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
) -> Union[
    Tuple[Tensor, Tuple[Tensor]],
    Optional[Tuple[Tensor, Tuple[Tensor],
    Tuple[Tensor, ...]]],
]:
    """
    Constructs the attention mechanism for the MossAttention class.

    Args:
        self: The object instance.
        hidden_states (Optional[Tensor]): The input hidden states.
        layer_past (Optional[Tuple[Tensor]]): The past layer states.
        attention_mask (Optional[Tensor]): Mask for attention computation.
        position_ids (Optional[Tensor]): Indices of the positions of each input token.
        head_mask (Optional[Tensor]): Mask for attention heads.
        use_cache (Optional[bool]): Flag to indicate cache usage.
        output_attentions (Optional[bool]): Flag to output attention weights.

    Returns:
        Union[Tuple[Tensor, Tuple[Tensor]], Optional[Tuple[Tensor, Tuple[Tensor], Tuple[Tensor, ...]]]]:
            A tuple containing the attention output tensor and the present key/value state (the present state is
            None when use_cache is False). If output_attentions is True, the attention weights tensor is appended.

    Raises:
        None.
    """
    qkv = self.qkv_proj(hidden_states)
    # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic
    mp_num = 4
    qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))

    local_dim = self.head_dim * self.num_attention_heads // mp_num
    query, value, key = ops.split(qkv_split, local_dim, axis=-1)

    query = self._split_heads(
        query, self.num_attention_heads, self.head_dim, mp_num=mp_num)
    key = self._split_heads(
        key, self.num_attention_heads, self.head_dim, mp_num=mp_num)

    value = self._split_heads(
        value, self.num_attention_heads, self.head_dim, mp_num=mp_num)
    value = ops.permute(value, (0, 2, 1, 3))
    # embed_positions = self.embed_positions
    # if embed_positions.device != position_ids.device:
    #     embed_positions = embed_positions.to(position_ids.device)
    #     self.embed_positions = embed_positions

    embed_positions = self.embed_positions
    sincos = embed_positions[position_ids]
    sin, cos = ops.split(sincos, sincos.shape[-1] // 2, -1)

    if self.rotary_dim is not None:
        k_rot = key[:, :, :, : self.rotary_dim]
        k_pass = key[:, :, :, self.rotary_dim:]

        q_rot = query[:, :, :, : self.rotary_dim]
        q_pass = query[:, :, :, self.rotary_dim:]

        k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
        q_rot = apply_rotary_pos_emb(q_rot, sin, cos)

        key = ops.cat([k_rot, k_pass], axis=-1)
        query = ops.cat([q_rot, q_pass], axis=-1)
    else:
        key = apply_rotary_pos_emb(key, sin, cos)
        query = apply_rotary_pos_emb(query, sin, cos)

    key = ops.permute(key, (0, 2, 1, 3))
    query = ops.permute(query, (0, 2, 1, 3))
    if layer_past is not None:
        past_key = layer_past[0]
        past_value = layer_past[1]
        key = ops.cat((past_key, key), axis=-2)
        value = ops.cat((past_value, value), axis=-2)

    if use_cache is True:
        present = (key, value)
    else:
        present = None

    # compute self-attention: V x Softmax(QK^T)
    attn_output, attn_weights = self._attn(
        query, key, value, attention_mask, head_mask)

    attn_output = self._merge_heads(
        attn_output, self.num_attention_heads, self.head_dim)
    attn_output = self.out_proj(attn_output)
    attn_output = self.resid_dropout(attn_output)

    outputs = (attn_output, present)
    if output_attentions:
        outputs += (attn_weights,)

    return outputs  # a, present, (attentions)

mindnlp.transformers.models.moss.moss.MossBlock

Bases: Module

Copied from transformers.models.gptj.modeling_gptj.GPTJBlock with GPTJ->Moss
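
Like GPT-J, the block uses a "parallel" residual: attention and the MLP both consume the same LayerNorm output, and their results are added to the untouched input in a single residual step. A schematic sketch of that data flow (the `ln`, `attn`, and `mlp` callables are stand-ins, not the real submodules):

import numpy as np

def moss_block_dataflow(hidden_states, ln, attn, mlp):
    """Parallel-residual layout used by MossBlock (schematic only)."""
    residual = hidden_states
    normed = ln(hidden_states)             # self.ln_1(hidden_states)
    attn_out = attn(normed)                # first element of self.attn(...)
    mlp_out = mlp(normed)                  # self.mlp(hidden_states) on the normed input
    return attn_out + mlp_out + residual   # single residual add, GPT-J style

# Toy check with identity/zero stand-ins.
x = np.ones((2, 3, 4))
out = moss_block_dataflow(x, ln=lambda t: t, attn=lambda t: 0 * t, mlp=lambda t: 0 * t)
print(np.allclose(out, x))  # True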

Source code in mindnlp/transformers/models/moss/moss.py
class MossBlock(nn.Module):
    """
    Copied from transformers.models.gptj.modeling_gptj.GPTJBlock with GPTJ->Moss
    """
    def __init__(self, config):
        """
        Initializes a MossBlock instance with the provided configuration.

        Args:
            self (MossBlock): The instance of MossBlock.
            config (object): An object containing configuration parameters for the MossBlock.
                This object should have the following attributes:

                - n_inner (int or None): The inner dimension size. If None, defaults to 4 times the embedding size.
                - n_embd (int): The embedding size.
                - layer_norm_epsilon (float): The epsilon value for LayerNorm.

        Returns:
            None: This method initializes the MossBlock instance with the specified configuration parameters.

        Raises:
            None.
        """
        super().__init__()
        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
        self.ln_1 = nn.LayerNorm(
            [config.n_embd], eps=config.layer_norm_epsilon)
        self.attn = MossAttention(config)
        self.mlp = MossMLP(inner_dim, config)

    def forward(
            self,
            hidden_states: Optional[Tensor],
            layer_past: Optional[Tuple[Tensor]] = None,
            attention_mask: Optional[Tensor] = None,
            position_ids: Optional[Tensor] = None,
            head_mask: Optional[Tensor] = None,
            use_cache: Optional[bool] = False,
            output_attentions: Optional[bool] = False,
    ) -> Union[Tuple[Tensor], Optional[Tuple[Tensor, Tuple[Tensor, ...]]]]:
        """
        Constructs a MossBlock by applying self-attention and feed-forward layers to the given hidden states.

        Args:
            self (MossBlock): The current MossBlock instance.
            hidden_states (Optional[Tensor]): Input tensor of shape (batch_size, sequence_length, hidden_size).
            layer_past (Optional[Tuple[Tensor]]): Tuple of past hidden states for the self-attention layer. Defaults to None.
            attention_mask (Optional[Tensor]): Mask tensor to prevent attention to certain positions. Defaults to None.
            position_ids (Optional[Tensor]): Tensor containing the position indices of each input token. Defaults to None.
            head_mask (Optional[Tensor]): Mask tensor to specify which attention heads to mask. Defaults to None.
            use_cache (Optional[bool]): Whether to use caching for the self-attention layer. Defaults to False.
            output_attentions (Optional[bool]): Whether to output the attention weights. Defaults to False.

        Returns:
            Union[Tuple[Tensor], Optional[Tuple[Tensor, Tuple[Tensor, ...]]]]:
                A tuple whose first element is the updated hidden states. If `use_cache` is True, the present
                key/value cache for the self-attention layer follows; if `output_attentions` is True, the attention
                weights are appended.

        Raises:
            None

        """
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states=hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]

        feed_forward_hidden_states = self.mlp(hidden_states)
        hidden_states = attn_output + feed_forward_hidden_states + residual

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions)

mindnlp.transformers.models.moss.moss.MossBlock.__init__(config)

Initializes a MossBlock instance with the provided configuration.

PARAMETER DESCRIPTION
self

The instance of MossBlock.

TYPE: MossBlock

config

An object containing configuration parameters for the MossBlock. This object should have the following attributes:

  • n_inner (int or None): The inner dimension size. If None, defaults to 4 times the embedding size.
  • n_embd (int): The embedding size.
  • layer_norm_epsilon (float): The epsilon value for LayerNorm.

TYPE: object

RETURNS DESCRIPTION
None

This method initializes the MossBlock instance with the specified configuration parameters.
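
A small hedged example of the inner-dimension rule: when `config.n_inner` is None, the MLP width defaults to `4 * config.n_embd` (the config object below is a stand-in, not the real MossConfig):

from types import SimpleNamespace

# Stand-in config; the real block reads these attributes from its MossConfig.
config = SimpleNamespace(n_embd=1024, n_inner=None, layer_norm_epsilon=1e-5)

inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
print(inner_dim)  # 4096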

Source code in mindnlp/transformers/models/moss/moss.py
def __init__(self, config):
    """
    Initializes a MossBlock instance with the provided configuration.

    Args:
        self (MossBlock): The instance of MossBlock.
        config (object): An object containing configuration parameters for the MossBlock.
            This object should have the following attributes:

            - n_inner (int or None): The inner dimension size. If None, defaults to 4 times the embedding size.
            - n_embd (int): The embedding size.
            - layer_norm_epsilon (float): The epsilon value for LayerNorm.

    Returns:
        None: This method initializes the MossBlock instance with the specified configuration parameters.

    Raises:
        None.
    """
    super().__init__()
    inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
    self.ln_1 = nn.LayerNorm(
        [config.n_embd], eps=config.layer_norm_epsilon)
    self.attn = MossAttention(config)
    self.mlp = MossMLP(inner_dim, config)

mindnlp.transformers.models.moss.moss.MossBlock.forward(hidden_states, layer_past=None, attention_mask=None, position_ids=None, head_mask=None, use_cache=False, output_attentions=False)

Constructs a MossBlock by applying self-attention and feed-forward layers to the given hidden states.

PARAMETER DESCRIPTION
self

The current MossBlock instance.

TYPE: MossBlock

hidden_states

Input tensor of shape (batch_size, sequence_length, hidden_size).

TYPE: Optional[Tensor]

layer_past

Tuple of past hidden states for the self-attention layer. Defaults to None.

TYPE: Optional[Tuple[Tensor]] DEFAULT: None

attention_mask

Mask tensor to prevent attention to certain positions. Defaults to None.

TYPE: Optional[Tensor] DEFAULT: None

position_ids

Tensor containing the position indices of each input token. Defaults to None.

TYPE: Optional[Tensor] DEFAULT: None

head_mask

Mask tensor to specify which attention heads to mask. Defaults to None.

TYPE: Optional[Tensor] DEFAULT: None

use_cache

Whether to use caching for the self-attention layer. Defaults to False.

TYPE: Optional[bool] DEFAULT: False

output_attentions

Whether to output the attention weights. Defaults to False.

TYPE: Optional[bool] DEFAULT: False

RETURNS DESCRIPTION
Union[Tuple[Tensor], Optional[Tuple[Tensor, Tuple[Tensor, ...]]]]

A tuple whose first element is the updated hidden states. If use_cache is True, the present key/value cache for the self-attention layer follows; if output_attentions is True, the attention weights are appended.

Source code in mindnlp/transformers/models/moss/moss.py
def forward(
        self,
        hidden_states: Optional[Tensor],
        layer_past: Optional[Tuple[Tensor]] = None,
        attention_mask: Optional[Tensor] = None,
        position_ids: Optional[Tensor] = None,
        head_mask: Optional[Tensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
) -> Union[Tuple[Tensor], Optional[Tuple[Tensor, Tuple[Tensor, ...]]]]:
    """
    Constructs a MossBlock by applying self-attention and feed-forward layers to the given hidden states.

    Args:
        self (MossBlock): The current MossBlock instance.
        hidden_states (Optional[Tensor]): Input tensor of shape (batch_size, sequence_length, hidden_size).
        layer_past (Optional[Tuple[Tensor]]): Tuple of past hidden states for the self-attention layer. Defaults to None.
        attention_mask (Optional[Tensor]): Mask tensor to prevent attention to certain positions. Defaults to None.
        position_ids (Optional[Tensor]): Tensor containing the position indices of each input token. Defaults to None.
        head_mask (Optional[Tensor]): Mask tensor to specify which attention heads to mask. Defaults to None.
        use_cache (Optional[bool]): Whether to use caching for the self-attention layer. Defaults to False.
        output_attentions (Optional[bool]): Whether to output the attention weights. Defaults to False.

    Returns:
        Union[Tuple[Tensor], Optional[Tuple[Tensor, Tuple[Tensor, ...]]]]:
            A tuple whose first element is the updated hidden states. If `use_cache` is True, the present
            key/value cache for the self-attention layer follows; if `output_attentions` is True, the attention
            weights are appended.

    Raises:
        None

    """
    residual = hidden_states
    hidden_states = self.ln_1(hidden_states)
    attn_outputs = self.attn(
        hidden_states=hidden_states,
        layer_past=layer_past,
        attention_mask=attention_mask,
        position_ids=position_ids,
        head_mask=head_mask,
        use_cache=use_cache,
        output_attentions=output_attentions,
    )
    attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
    outputs = attn_outputs[1:]

    feed_forward_hidden_states = self.mlp(hidden_states)
    hidden_states = attn_output + feed_forward_hidden_states + residual

    if use_cache:
        outputs = (hidden_states,) + outputs
    else:
        outputs = (hidden_states,) + outputs[1:]

    return outputs  # hidden_states, present, (attentions)

mindnlp.transformers.models.moss.moss.MossForCausalLM

Bases: MossPreTrainedModel

The Moss Model transformer with a language modeling head on top.
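
After the transformer backbone, `lm_head` projects hidden states to vocabulary logits, and the result is cast to float32 before sampling or loss computation (see the comment in `forward`). A NumPy sketch of that projection and a greedy next-token pick (weights and sizes are illustrative only):

import numpy as np

batch, seq, n_embd, vocab_size = 1, 6, 32, 100
rng = np.random.default_rng(0)

hidden_states = rng.normal(size=(batch, seq, n_embd)).astype(np.float16)
lm_head_weight = rng.normal(size=(n_embd, vocab_size)).astype(np.float16)

# Project to vocabulary logits, then cast to fp32 as MossForCausalLM.forward does.
logits = (hidden_states @ lm_head_weight).astype(np.float32)  # (1, 6, 100)

# A greedy choice for the next token comes from the logits at the last position.
next_token = logits[:, -1, :].argmax(axis=-1)
print(logits.shape, next_token)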

Source code in mindnlp/transformers/models/moss/moss.py
class MossForCausalLM(MossPreTrainedModel):
    """
    The Moss Model transformer with a language modeling head on top.
    """
    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.causal_mask"]

    def __init__(self, config):
        """
        Initializes an instance of the MossForCausalLM class.

        Args:
            self (MossForCausalLM): The current instance of the MossForCausalLM class.
            config (object):
                An object containing configuration parameters.

                - wbits (int): Number of bits for weight quantization. Default is 32.
                - groupsize (int): Size of the weight quantization group. Default is 128.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__(config)
        if not hasattr(config, 'wbits'):
            config.wbits = 32
            config.groupsize = 128

        # if config.wbits not in [4, 8, 32]:
        #     logger.warning(f'Specify `wbits` with 4, 8 or 32 to load the model. ')
        if config.wbits in [4, 8]:
            def noop():
                pass

            mindspore.common.initializer.HeUniform = noop
            mindspore.ops.uniform = noop
            mindspore.common.initializer.Normal = noop

            # torch.set_default_dtype(mindspore.half)

            self._init_weights = False
            # torch.set_default_dtype(mindspore.half)
        self.transformer = MossModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
        if config.wbits in [4, 8]:
            # torch.set_default_dtype(mindspore.float32)
            self._init_weights = True
            self.quantize(config.wbits, config.groupsize)
        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        """
        get output embeddings
        """
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """
        set output embeddings
        """
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        """
        Prepare inputs for the generation task.
        """
        token_type_ids = kwargs.get("token_type_ids", None)
        # only last token for inputs_ids if past is defined in kwargs
        if past_key_values:
            input_ids = ops.unsqueeze(input_ids[:, -1], dim=-1)
            if token_type_ids is not None:
                token_type_ids = ops.unsqueeze(token_type_ids[:, -1], dim=-1)
        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = ops.unsqueeze(position_ids[:, -1], dim=-1)

        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }

    def forward(
            self,
            input_ids: Optional[Tensor] = None,
            past_key_values: Optional[Tuple[Tuple[Tensor]]] = None,
            attention_mask: Optional[Tensor] = None,
            token_type_ids: Optional[Tensor] = None,
            position_ids: Optional[Tensor] = None,
            head_mask: Optional[Tensor] = None,
            inputs_embeds: Optional[Tensor] = None,
            labels: Optional[Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Tuple]:
        r"""
        Args:
            labels (`Tensor(dtype=mindspore.int64)` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
                `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`; all labels set to `-100`
                are ignored (masked), and the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]

        # make sure sampling in fp16 works correctly and
        # compute loss in fp32 to match with mesh-tf version
        # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
        logits = self.lm_head(hidden_states).to(mindspore.float32)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))

            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
            past_key_values: Tuple[Tuple[Tensor]], beam_idx: Tensor
    ) -> Tuple[Tuple[Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device))
                  for past_state in layer_past)
            for layer_past in past_key_values
        )

    def quantize(self, wbits, groupsize):
        """
        Function to quantize a model using GPTQ.
        """

mindnlp.transformers.models.moss.moss.MossForCausalLM.__init__(config)

Initializes an instance of the MossForCausalLM class.

PARAMETER DESCRIPTION
self

The current instance of the MossForCausalLM class.

TYPE: MossForCausalLM

config

An object containing configuration parameters.

  • wbits (int): Number of bits for weight quantization. Default is 32.
  • groupsize (int): Size of the weight quantization group. Default is 128.

TYPE: object

RETURNS DESCRIPTION

None.
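
A minimal sketch of the quantization defaults applied in `__init__`: configs without a `wbits` attribute fall back to full precision (`wbits=32`, `groupsize=128`), and only `wbits` of 4 or 8 triggers the GPTQ path (the config object below is a stand-in):

from types import SimpleNamespace

config = SimpleNamespace(n_embd=1024, vocab_size=32000)  # no wbits/groupsize set

if not hasattr(config, "wbits"):
    config.wbits = 32
    config.groupsize = 128

quantized = config.wbits in (4, 8)
print(config.wbits, config.groupsize, quantized)  # 32 128 False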

Source code in mindnlp/transformers/models/moss/moss.py
def __init__(self, config):
    """
    Initializes an instance of the MossForCausalLM class.

    Args:
        self (MossForCausalLM): The current instance of the MossForCausalLM class.
        config (object):
            An object containing configuration parameters.

            - wbits (int): Number of bits for weight quantization. Default is 32.
            - groupsize (int): Size of the weight quantization group. Default is 128.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__(config)
    if not hasattr(config, 'wbits'):
        config.wbits = 32
        config.groupsize = 128

    # if config.wbits not in [4, 8, 32]:
    #     logger.warning(f'Specify `wbits` with 4, 8 or 32 to load the model. ')
    if config.wbits in [4, 8]:
        def noop():
            pass

        mindspore.common.initializer.HeUniform = noop
        mindspore.ops.uniform = noop
        mindspore.common.initializer.Normal = noop

        # torch.set_default_dtype(mindspore.half)

        self._init_weights = False
        # torch.set_default_dtype(mindspore.half)
    self.transformer = MossModel(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
    if config.wbits in [4, 8]:
        # torch.set_default_dtype(mindspore.float32)
        self._init_weights = True
        self.quantize(config.wbits, config.groupsize)
    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.moss.moss.MossForCausalLM.forward(input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

PARAMETER DESCRIPTION
labels

Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set labels = input_ids. Indices are selected in [-100, 0, ..., config.vocab_size]; all labels set to -100 are ignored (masked), and the loss is only computed for labels in [0, ..., config.vocab_size].

TYPE: `Tensor(dtype=mindspore.int64)` of shape `(batch_size, sequence_length)`, *optional* DEFAULT: None
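
A NumPy worked example of the label handling described above: logits and labels are shifted by one position so that tokens < n predict token n, and positions labeled -100 are excluded from the loss (values are illustrative):

import numpy as np

vocab_size = 5
logits = np.random.default_rng(0).normal(size=(1, 4, vocab_size)).astype(np.float32)
labels = np.array([[2, 3, -100, 1]])

# Shift so that tokens < n predict n.
shift_logits = logits[:, :-1, :].reshape(-1, vocab_size)   # predictions for positions 1..3
shift_labels = labels[:, 1:].reshape(-1)                   # targets 3, -100, 1

# Cross-entropy with ignore_index = -100.
log_probs = shift_logits - np.log(np.exp(shift_logits).sum(-1, keepdims=True))
keep = shift_labels != -100
loss = -log_probs[np.arange(len(shift_labels))[keep], shift_labels[keep]].mean()
print(loss)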

Source code in mindnlp/transformers/models/moss/moss.py
def forward(
        self,
        input_ids: Optional[Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[Tensor]]] = None,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        position_ids: Optional[Tensor] = None,
        head_mask: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        labels: Optional[Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
) -> Union[Tuple, Tuple]:
    r"""
    Args:
        labels (`Tensor(dtype=mindspore.int64)` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`; all labels set to `-100`
            are ignored (masked), and the loss is only computed for labels in `[0, ..., config.vocab_size]`.
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.transformer(
        input_ids,
        past_key_values=past_key_values,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = outputs[0]

    # make sure sampling in fp16 works correctly and
    # compute loss in fp32 to match with mesh-tf version
    # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
    logits = self.lm_head(hidden_states).to(mindspore.float32)

    loss = None
    if labels is not None:
        # Shift so that tokens < n predict n
        shift_logits = logits[..., :-1, :]
        shift_labels = labels[..., 1:]
        # Flatten the tokens
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))

        loss = loss.to(hidden_states.dtype)

    if not return_dict:
        output = (logits,) + outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

mindnlp.transformers.models.moss.moss.MossForCausalLM.get_output_embeddings()

get output embeddings

Source code in mindnlp/transformers/models/moss/moss.py
def get_output_embeddings(self):
    """
    get output embeddings
    """
    return self.lm_head

mindnlp.transformers.models.moss.moss.MossForCausalLM.prepare_inputs_for_generation(input_ids, past_key_values=None, **kwargs)

Prepare inputs for the generation task.
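
A NumPy sketch of how `prepare_inputs_for_generation` derives `position_ids` from a left-padded `attention_mask`: a cumulative sum gives running positions, padded slots are pinned to 1, and only the last position is kept once a past cache exists (the mask below is illustrative):

import numpy as np

attention_mask = np.array([[0, 0, 1, 1, 1],
                           [1, 1, 1, 1, 1]])   # two sequences, the first left-padded

# create position_ids on the fly for batch generation
position_ids = attention_mask.astype(np.int64).cumsum(-1) - 1
position_ids[attention_mask == 0] = 1
print(position_ids)
# [[1 1 0 1 2]
#  [0 1 2 3 4]]

# With a past cache, only the last position is fed to the model.
print(position_ids[:, -1:])  # [[2], [4]]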

Source code in mindnlp/transformers/models/moss/moss.py
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
    """
    Prepare inputs for the generation task.
    """
    token_type_ids = kwargs.get("token_type_ids", None)
    # only last token for inputs_ids if past is defined in kwargs
    if past_key_values:
        input_ids = ops.unsqueeze(input_ids[:, -1], dim=-1)
        if token_type_ids is not None:
            token_type_ids = ops.unsqueeze(token_type_ids[:, -1], dim=-1)
    attention_mask = kwargs.get("attention_mask", None)
    position_ids = kwargs.get("position_ids", None)

    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past_key_values:
            position_ids = ops.unsqueeze(position_ids[:, -1], dim=-1)

    return {
        "input_ids": input_ids,
        "past_key_values": past_key_values,
        "use_cache": kwargs.get("use_cache"),
        "position_ids": position_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
    }

mindnlp.transformers.models.moss.moss.MossForCausalLM.quantize(wbits, groupsize)

Function to quantize a model using GPTQ.

Source code in mindnlp/transformers/models/moss/moss.py
def quantize(self, wbits, groupsize):
    """
    Function to quantize a model using GPTQ.
    """

mindnlp.transformers.models.moss.moss.MossForCausalLM.set_output_embeddings(new_embeddings)

set output embeddings

Source code in mindnlp/transformers/models/moss/moss.py
def set_output_embeddings(self, new_embeddings):
    """
    set output embeddings
    """
    self.lm_head = new_embeddings

mindnlp.transformers.models.moss.moss.MossMLP

Bases: Module

Copied from transformers.models.gptj.modeling_gptj.GPTJMLP with GPTJ->Moss
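
The MLP is a plain two-layer feed-forward block: `fc_in` expands to `intermediate_size`, an activation is applied, and `fc_out` projects back to the embedding width (dropout follows during training). A NumPy sketch with a tanh-approximation GELU; the actual activation depends on `config.activation_function`:

import numpy as np

def gelu_tanh(x):
    # tanh approximation of GELU; the real choice comes from ACT2FN[config.activation_function]
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))

n_embd, intermediate = 8, 32
rng = np.random.default_rng(0)
w_in = rng.normal(size=(n_embd, intermediate)) * 0.02   # fc_in weight (illustrative)
w_out = rng.normal(size=(intermediate, n_embd)) * 0.02  # fc_out weight (illustrative)

hidden = rng.normal(size=(2, 4, n_embd))
hidden = gelu_tanh(hidden @ w_in) @ w_out               # (2, 4, 8), dropout omitted
print(hidden.shape)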

Source code in mindnlp/transformers/models/moss/moss.py
class MossMLP(nn.Module):
    """
    Copied from transformers.models.gptj.modeling_gptj.GPTJMLP with GPTJ->Moss
    """
    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size= 4 * embed_dim
        """
        Initializes an instance of the MossMLP class.

        Args:
            self: The current instance of the class.
            intermediate_size (int): The size of the intermediate layer.
            config: The configuration object for the model.

        Returns:
            None

        Raises:
            None
        """
        super().__init__()
        embed_dim = config.n_embd

        self.fc_in = nn.Linear(embed_dim, intermediate_size)
        self.fc_out = nn.Linear(intermediate_size, embed_dim)

        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(p=config.resid_pdrop)

    def forward(self, hidden_states: Optional[Tensor]) -> Tensor:
        """
        Constructs the forward pass of the MossMLP neural network.

        Args:
            self (MossMLP): The instance of the MossMLP class.
            hidden_states (Optional[Tensor]): The input hidden states tensor. Default is None.
                A tensor representing the hidden states to be processed by the network.

        Returns:
            Tensor: The processed hidden states tensor after passing through the network layers.
                The final output tensor of the forward pass.

        Raises:
            None.
        """
        hidden_states = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.fc_out(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states

mindnlp.transformers.models.moss.moss.MossMLP.__init__(intermediate_size, config)

Initializes an instance of the MossMLP class.

PARAMETER DESCRIPTION
self

The current instance of the class.

intermediate_size

The size of the intermediate layer.

TYPE: int

config

The configuration object for the model.

RETURNS DESCRIPTION

None

Source code in mindnlp/transformers/models/moss/moss.py
def __init__(self, intermediate_size, config):  # in MLP: intermediate_size= 4 * embed_dim
    """
    Initializes an instance of the MossMLP class.

    Args:
        self: The current instance of the class.
        intermediate_size (int): The size of the intermediate layer.
        config: The configuration object for the model.

    Returns:
        None

    Raises:
        None
    """
    super().__init__()
    embed_dim = config.n_embd

    self.fc_in = nn.Linear(embed_dim, intermediate_size)
    self.fc_out = nn.Linear(intermediate_size, embed_dim)

    self.act = ACT2FN[config.activation_function]
    self.dropout = nn.Dropout(p=config.resid_pdrop)

mindnlp.transformers.models.moss.moss.MossMLP.forward(hidden_states)

Constructs the forward pass of the MossMLP neural network.

PARAMETER DESCRIPTION
self

The instance of the MossMLP class.

TYPE: MossMLP

hidden_states

The input hidden states tensor. Default is None. A tensor representing the hidden states to be processed by the network.

TYPE: Optional[Tensor]

RETURNS DESCRIPTION
Tensor

The processed hidden states tensor after passing through the network layers. The final output tensor of the forward pass.

TYPE: Tensor

Source code in mindnlp/transformers/models/moss/moss.py
def forward(self, hidden_states: Optional[Tensor]) -> Tensor:
    """
    Constructs the forward pass of the MossMLP neural network.

    Args:
        self (MossMLP): The instance of the MossMLP class.
        hidden_states (Optional[Tensor]): The input hidden states tensor. Default is None.
            A tensor representing the hidden states to be processed by the network.

    Returns:
        Tensor: The processed hidden states tensor after passing through the network layers.
            The final output tensor of the forward pass.

    Raises:
        None.
    """
    hidden_states = self.fc_in(hidden_states)
    hidden_states = self.act(hidden_states)
    hidden_states = self.fc_out(hidden_states)
    hidden_states = self.dropout(hidden_states)
    return hidden_states

mindnlp.transformers.models.moss.moss.MossModel

Bases: MossPreTrainedModel

Moss model layer
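
At a high level, `forward` turns `input_ids` into embeddings via `wte`, derives default `position_ids` that continue from the cached sequence length, and then runs the hidden states through `n_layer` MossBlocks followed by `ln_f`. A NumPy sketch of the embedding lookup and default position handling (the block stack itself is elided):

import numpy as np

vocab_size, n_embd = 50, 16
wte = np.random.default_rng(0).normal(size=(vocab_size, n_embd))  # embedding table stand-in

input_ids = np.array([[5, 7, 9]])            # (batch, new_seq)
past_length = 4                              # length already held in past_key_values

hidden_states = wte[input_ids]               # (1, 3, 16), i.e. self.wte(input_ids)

# Default position_ids continue from the cached length, as in MossModel.forward.
position_ids = np.arange(past_length, past_length + input_ids.shape[-1])[None, :]
print(hidden_states.shape, position_ids)     # (1, 3, 16) [[4 5 6]]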

Source code in mindnlp/transformers/models/moss/moss.py
class MossModel(MossPreTrainedModel):
    """
    Moss model layer
    """
    def __init__(self, config):
        """
        Initializes an instance of the MossModel class.

        Args:
            self: The current instance of the class.
            config: An object containing configuration parameters for the model.
                It should have the following attributes:

                - n_embd (int): The embedding dimension.
                - vocab_size (int): The size of the vocabulary.
                - embd_pdrop (float): The dropout probability for the embedding layer.
                - n_layer (int): The number of MossBlocks to be used in the model.
                - layer_norm_epsilon (float): A small value added to the variance to avoid division by zero in LayerNorm.
                - rotary_dim (int): The dimension of the rotary positional encoding.
                It should be less than or equal to n_ctx // num_attention_heads.
                - n_ctx (int): The length of the input sequence.
                - num_attention_heads (int): The number of attention heads in each MossBlock.

        Returns:
            None

        Raises:
            None
        """
        super().__init__(config)

        self.embed_dim = config.n_embd
        self.vocab_size = config.vocab_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.drop = nn.Dropout(p=config.embd_pdrop)
        self.h = nn.ModuleList([MossBlock(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm([self.embed_dim], eps=config.layer_norm_epsilon)
        self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        get input embeddings
        """
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        """
        set input embeddings
        """
        self.wte = new_embeddings

    def forward(
            self,
            input_ids: Optional[Tensor] = None,
            past_key_values: Optional[Tuple[Tuple[Tensor]]] = None,
            attention_mask: Optional[Tensor] = None,
            token_type_ids: Optional[Tensor] = None,
            position_ids: Optional[Tensor] = None,
            head_mask: Optional[Tensor] = None,
            inputs_embeds: Optional[Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """
        Construct the Moss model forward pass.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is not None:
            input_shape = input_ids.shape
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.shape[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError(
                "You have to specify either input_ids or inputs_embeds")

        # device = input_ids.device if input_ids is not None else inputs_embeds.device

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])

        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1]).long()

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.h))
        else:
            past_length = past_key_values[0][0].size(-2)

        if position_ids is None:
            position_ids = ops.arange(
                past_length, input_shape[-1] + past_length, dtype=mindspore.int64)
            position_ids = Tensor.unsqueeze(
                position_ids, dim=0).view(-1, input_shape[-1])

        # Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and the dtype's smallest value for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = attention_mask.to(
                dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * \
                             float(np.finfo(mindspore.dtype_to_nptype(self.dtype)).min)

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x num_attention_heads x N x N
        # head_mask has shape n_layer x batch x num_attention_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        hidden_states = inputs_embeds

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.shape[-1],)

        # if self.gradient_checkpointing and self.training:
        #     if use_cache:
        #         logger.warning_once(
        #             "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
        #             "`use_cache=False`..."
        #         )
        #         use_cache = False

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # if self.gradient_checkpointing and self.training:

            #     def create_custom_forward(cell):
            #         def custom_forward(*inputs):
            #             # None for past_key_value
            #             return cell(*inputs, use_cache, output_attentions)

            #         return custom_forward
            #     # outputs = torch.utils.checkpoint.checkpoint(
            #     #     create_custom_forward(block),
            #     #     hidden_states,
            #     #     None,
            #     #     attention_mask,
            #     #     position_ids,
            #     #     head_mask[i],
            #     # )
            # else:
            outputs = block(
                hidden_states=hidden_states,
                layer_past=layer_past,
                attention_mask=attention_mask,
                position_ids=position_ids,
                head_mask=head_mask[i],
                use_cache=use_cache,
                output_attentions=output_attentions,
            )

            hidden_states = outputs[0]
            if use_cache:
                presents = presents + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + \
                                      (outputs[2 if use_cache else 1],)

        hidden_states = self.ln_f(hidden_states)

        hidden_states = hidden_states.view(output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

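As a usage sketch (not part of the original docstring), the model can be instantiated from a deliberately tiny MossConfig and run on a batch of token ids. The import paths follow the module paths shown on this page; the configuration values are illustrative assumptions, not recommended settings.

import numpy as np
from mindspore import Tensor

from mindnlp.transformers.models.moss.moss import MossModel
from mindnlp.transformers.models.moss.moss_configuration import MossConfig

# A deliberately tiny configuration so the example runs quickly (values are illustrative).
config = MossConfig(
    vocab_size=128,
    n_positions=64,
    n_ctx=64,
    n_embd=32,
    n_layer=2,
    n_head=4,
    rotary_dim=8,
)
model = MossModel(config)

input_ids = Tensor(np.ones((1, 10), dtype=np.int64))    # batch of 1, sequence length 10
outputs = model(input_ids, return_dict=True)
print(outputs.last_hidden_state.shape)                  # expected (1, 10, 32), i.e. (batch, seq_len, n_embd)
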
mindnlp.transformers.models.moss.moss.MossModel.__init__(config)

Initializes an instance of the MossModel class.

PARAMETER DESCRIPTION

self: The current instance of the class.

config: An object containing configuration parameters for the model. It should have the following attributes:

  • n_embd (int): The embedding dimension.
  • vocab_size (int): The size of the vocabulary.
  • embd_pdrop (float): The dropout probability for the embedding layer.
  • n_layer (int): The number of MossBlocks to be used in the model.
  • layer_norm_epsilon (float): A small value added to the variance to avoid division by zero in LayerNorm.
  • rotary_dim (int): The dimension of the rotary positional encoding. It should be less than or equal to n_ctx // num_attention_heads.
  • n_ctx (int): The length of the input sequence.
  • num_attention_heads (int): The number of attention heads in each MossBlock.

RETURNS DESCRIPTION

None

Source code in mindnlp/transformers/models/moss/moss.py (lines 613-651)
def __init__(self, config):
    """
    Initializes an instance of the MossModel class.

    Args:
        self: The current instance of the class.
        config: An object containing configuration parameters for the model.
            It should have the following attributes:

            - n_embd (int): The embedding dimension.
            - vocab_size (int): The size of the vocabulary.
            - embd_pdrop (float): The dropout probability for the embedding layer.
            - n_layer (int): The number of MossBlocks to be used in the model.
            - layer_norm_epsilon (float): A small value added to the variance to avoid division by zero in LayerNorm.
            - rotary_dim (int): The dimension of the rotary positional encoding.
            It should be less than or equal to n_ctx // num_attention_heads.
            - n_ctx (int): The length of the input sequence.
            - num_attention_heads (int): The number of attention heads in each MossBlock.

    Returns:
        None

    Raises:
        None
    """
    super().__init__(config)

    self.embed_dim = config.n_embd
    self.vocab_size = config.vocab_size
    self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
    self.drop = nn.Dropout(p=config.embd_pdrop)
    self.h = nn.ModuleList([MossBlock(config) for _ in range(config.n_layer)])
    self.ln_f = nn.LayerNorm([self.embed_dim], eps=config.layer_norm_epsilon)
    self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)

    self.gradient_checkpointing = False

    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.moss.moss.MossModel.forward(input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

Construct the Moss model forward pass.

Source code in mindnlp/transformers/models/moss/moss.py (lines 665-829)
def forward(
        self,
        input_ids: Optional[Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[Tensor]]] = None,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        position_ids: Optional[Tensor] = None,
        head_mask: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    """
    Construct the Moss model forward pass.
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time")
    if input_ids is not None:
        input_shape = input_ids.shape
        input_ids = input_ids.view(-1, input_shape[-1])
        batch_size = input_ids.shape[0]
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.shape[:-1]
        batch_size = inputs_embeds.shape[0]
    else:
        raise ValueError(
            "You have to specify either input_ids or inputs_embeds")

    # device = input_ids.device if input_ids is not None else inputs_embeds.device

    if token_type_ids is not None:
        token_type_ids = token_type_ids.view(-1, input_shape[-1])

    if position_ids is not None:
        position_ids = position_ids.view(-1, input_shape[-1]).long()

    if past_key_values is None:
        past_length = 0
        past_key_values = tuple([None] * len(self.h))
    else:
        past_length = past_key_values[0][0].size(-2)

    if position_ids is None:
        position_ids = ops.arange(
            past_length, input_shape[-1] + past_length, dtype=mindspore.int64)
        position_ids = Tensor.unsqueeze(
            position_ids, dim=0).view(-1, input_shape[-1])

    # Attention mask.
    if attention_mask is not None:
        if batch_size <= 0:
            raise ValueError("batch_size has to be defined and > 0")
        attention_mask = attention_mask.view(batch_size, -1)
        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        attention_mask = attention_mask[:, None, None, :]

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and the dtype's smallest value for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_mask = attention_mask.to(
            dtype=self.dtype)  # fp16 compatibility
        attention_mask = (1.0 - attention_mask) * \
                         float(np.finfo(mindspore.dtype_to_nptype(self.dtype)).min)

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x num_attention_heads x N x N
    # head_mask has shape n_layer x batch x num_attention_heads x N x N
    head_mask = self.get_head_mask(head_mask, self.config.n_layer)

    if inputs_embeds is None:
        inputs_embeds = self.wte(input_ids)

    hidden_states = inputs_embeds

    if token_type_ids is not None:
        token_type_embeds = self.wte(token_type_ids)
        hidden_states = hidden_states + token_type_embeds

    hidden_states = self.drop(hidden_states)

    output_shape = input_shape + (hidden_states.shape[-1],)

    # if self.gradient_checkpointing and self.training:
    #     if use_cache:
    #         logger.warning_once(
    #             "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
    #             "`use_cache=False`..."
    #         )
    #         use_cache = False

    presents = () if use_cache else None
    all_self_attentions = () if output_attentions else None
    all_hidden_states = () if output_hidden_states else None
    for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # if self.gradient_checkpointing and self.training:

        #     def create_custom_forward(cell):
        #         def custom_forward(*inputs):
        #             # None for past_key_value
        #             return cell(*inputs, use_cache, output_attentions)

        #         return custom_forward
        #     # outputs = torch.utils.checkpoint.checkpoint(
        #     #     create_custom_forward(block),
        #     #     hidden_states,
        #     #     None,
        #     #     attention_mask,
        #     #     position_ids,
        #     #     head_mask[i],
        #     # )
        # else:
        outputs = block(
            hidden_states=hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask[i],
            use_cache=use_cache,
            output_attentions=output_attentions,
        )

        hidden_states = outputs[0]
        if use_cache:
            presents = presents + (outputs[1],)

        if output_attentions:
            all_self_attentions = all_self_attentions + \
                                  (outputs[2 if use_cache else 1],)

    hidden_states = self.ln_f(hidden_states)

    hidden_states = hidden_states.view(output_shape)
    # Add last hidden state
    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    if not return_dict:
        return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=presents,
        hidden_states=all_hidden_states,
        attentions=all_self_attentions,
    )

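When use_cache=True, the forward pass returns one (key, value) pair per layer in past_key_values; passing them back on the next call lets the model attend over the cache and compute only the newly appended positions, with position_ids derived from the cached length. The sketch below is a hedged illustration of that loop, reusing the tiny model built in the construction example earlier on this page (token values are placeholders):

import numpy as np
from mindspore import Tensor

# `model` is the tiny MossModel from the earlier construction sketch.
# Process a short prompt once, then decode one token at a time using the cache.
prompt_ids = Tensor(np.ones((1, 5), dtype=np.int64))
out = model(prompt_ids, use_cache=True, return_dict=True)
past = out.past_key_values                              # tuple with one (key, value) pair per layer

next_token = Tensor(np.ones((1, 1), dtype=np.int64))    # the next token id (placeholder value)
step = model(next_token, past_key_values=past, use_cache=True, return_dict=True)
print(step.last_hidden_state.shape)                     # expected (1, 1, 32): only the new position is computed
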
mindnlp.transformers.models.moss.moss.MossModel.get_input_embeddings()

get input embeddings

Source code in mindnlp/transformers/models/moss/moss.py (lines 653-657)
def get_input_embeddings(self):
    """
    get input embeddings
    """
    return self.wte

mindnlp.transformers.models.moss.moss.MossModel.set_input_embeddings(new_embeddings)

set input embeddings

Source code in mindnlp/transformers/models/moss/moss.py (lines 659-663)
def set_input_embeddings(self, new_embeddings):
    """
    set input embeddings
    """
    self.wte = new_embeddings

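These accessors simply expose and rebind the shared wte embedding created in __init__, for example when swapping in a resized vocabulary. A trivial, illustrative sketch reusing the model from the earlier construction example (any embedding with the same embedding dimension could be passed instead):

wte = model.get_input_embeddings()    # the nn.Embedding built in MossModel.__init__
model.set_input_embeddings(wte)       # rebinds model.wte; a compatible replacement embedding works the same way
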
mindnlp.transformers.models.moss.moss.MossPreTrainedModel

Bases: PreTrainedModel

An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.

Source code in mindnlp/transformers/models/moss/moss.py (lines 488-545)
class MossPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    config_class = MossConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MossBlock"]

    # def __init__(self, *inputs, **kwargs):
    #     super().__init__(*inputs, **kwargs)

    def _init_weights(self, cell):
        """Initialize the weight."""
        if isinstance(cell, (nn.Linear,)):
            # Slightly different from Mesh Transformer JAX which uses truncated_normal for initialization
            # cf https://github.com/MindSpore/MindSpore/pull/5617
            # cell.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # cell = ops.normal(cell.weight.data.shape,mean=0.0,stddev=self.config.initializer_range)
            cell.weight.set_data(initializer(Normal(self.config.initializer_range),
                                             cell.weight.shape, cell.weight.dtype))
            if cell.bias is not None:
                cell.bias.set_data(initializer(
                    Zero(), cell.bias.shape, cell.bias.dtype))
        elif isinstance(cell, nn.Embedding):
            cell.weight.set_data(initializer(Normal(self.config.initializer_range),
                                             cell.weight.shape, cell.weight.dtype))
            if cell.padding_idx is not None:
                cell.weight.data[cell.padding_idx].zero_()
        elif isinstance(cell, nn.LayerNorm):
            cell.bias.set_data(initializer(
                Zero(), cell.bias.shape, cell.bias.dtype))
            cell.weight.set_data(initializer(
                One(), cell.weight.shape, cell.weight.dtype))

    def _set_gradient_checkpointing(self, cell, value=False):
        """
        Sets the gradient checkpointing flag for a given cell in the MossPreTrainedModel.

        Args:
            self (MossPreTrainedModel): The instance of the MossPreTrainedModel class.
            cell (object): The cell for which the gradient checkpointing flag needs to be set.
                Must be an instance of the MossModel class.
            value (bool): The value to be set for the gradient checkpointing flag.
                If True, gradient checkpointing is enabled for the specified cell.
                If False, gradient checkpointing is disabled for the specified cell.

        Returns:
            None.

        Raises:
            None.
        """
        if isinstance(cell, MossModel):
            cell.gradient_checkpointing = value

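Since _init_weights draws Linear and Embedding weights from Normal(0, config.initializer_range) and zeroes the biases, the effect can be spot-checked on a freshly constructed model after post_init(). The check below is a rough, illustrative sketch reusing the tiny model and config from the MossModel example above; it assumes initialization ran on construction and is not a substitute for a real test:

import numpy as np

# `model` and `config` come from the earlier construction sketch.
wte_weights = model.wte.weight.asnumpy()           # the embedding table initialised via post_init()
print(wte_weights.shape)                           # (config.vocab_size, config.n_embd)
print(abs(float(wte_weights.std()) - config.initializer_range) < 0.005)   # sample std should sit near 0.02
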
mindnlp.transformers.models.moss.moss.apply_rotary_pos_emb(tensor, sin, cos)

Copied from transformers.models.gptj.modeling_gptj.apply_rotary_pos_emb

Source code in mindnlp/transformers/models/moss/moss.py (lines 65-71)
def apply_rotary_pos_emb(tensor: Tensor, sin: Tensor, cos: Tensor) -> Tensor:
    """
    Copied from transformers.models.gptj.modeling_gptj.apply_rotary_pos_emb
    """
    sin = ops.repeat_elements(sin[:, :, None, :], 2, 3)
    cos = ops.repeat_elements(cos[:, :, None, :], 2, 3)
    return (tensor * cos) + (rotate_every_two(tensor) * sin)

mindnlp.transformers.models.moss.moss.create_sinusoidal_positions(num_pos, dim)

Copied from transformers.models.gptj.modeling_gptj.create_sinusoidal_positions

Source code in mindnlp/transformers/models/moss/moss.py (lines 44-52)
def create_sinusoidal_positions(num_pos: int, dim: int) -> Tensor:
    """
    Copied from transformers.models.gptj.modeling_gptj.create_sinusoidal_positions
    """
    inv_freq = 1.0 / (10000 ** (ops.arange(0, dim, 2) * 1.0 / dim))
    sinusoid_inp = ops.einsum(
        "i , j -> i j", ops.arange(num_pos, dtype=mindspore.float32), inv_freq).float()
    res = ops.cat((ops.sin(sinusoid_inp), ops.cos(sinusoid_inp)), axis=1)
    return res

mindnlp.transformers.models.moss.moss.rotate_every_two(input_tensor)

Copied from transformers.models.gptj.modeling_gptj.rotate_every_two

Source code in mindnlp/transformers/models/moss/moss.py (lines 55-62)
def rotate_every_two(input_tensor: Tensor) -> Tensor:
    """
    Copied from transformers.models.gptj.modeling_gptj.rotate_every_two
    """
    tensor1 = input_tensor[:, :, :, ::2]
    tensor2 = input_tensor[:, :, :, 1::2]
    out_tensor = ops.stack((-tensor2, tensor1), axis=-1)
    return ops.flatten(out_tensor, start_dim=-2)

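Taken together, the three helpers above implement GPT-J-style rotary position embeddings: create_sinusoidal_positions builds a (num_pos, rotary_dim) table whose first half holds sines and second half cosines, rotate_every_two negates and swaps adjacent channel pairs, and apply_rotary_pos_emb combines the two. The sketch below only checks shapes; the sizes are arbitrary and the import path follows this page:

import numpy as np
from mindspore import Tensor, ops as ms_ops

from mindnlp.transformers.models.moss.moss import (
    apply_rotary_pos_emb, create_sinusoidal_positions, rotate_every_two)

num_pos, rotary_dim = 16, 8
table = create_sinusoidal_positions(num_pos, rotary_dim)        # (16, 8): [sin | cos] halves
sin = ms_ops.expand_dims(table[:, : rotary_dim // 2], 0)        # (1, 16, 4), broadcastable over heads
cos = ms_ops.expand_dims(table[:, rotary_dim // 2:], 0)

# A query slice shaped (batch, seq_len, num_heads, rotary_dim), as MossAttention uses it.
query_rot = Tensor(np.random.randn(1, num_pos, 2, rotary_dim).astype(np.float32))
rotated = apply_rotary_pos_emb(query_rot, sin, cos)
print(rotated.shape)                                            # (1, 16, 2, 8): same shape as the input
print(rotate_every_two(query_rot).shape)                        # (1, 16, 2, 8): pairs (x1, x2) -> (-x2, x1)
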
mindnlp.transformers.models.moss.moss_configuration

Moss model configuration

mindnlp.transformers.models.moss.moss_configuration.MossConfig

Bases: PretrainedConfig

Configuration for moss

Source code in mindnlp/transformers/models/moss/moss_configuration.py (lines 25-119)
class MossConfig(PretrainedConfig):
    """
    Configuration for moss
    """
    model_type = "moss"
    attribute_map = {
        "max_position_embeddings": "n_positions",
        "hidden_size": "n_embd",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
            self,
            vocab_size=107008,
            n_positions=2048,
            n_ctx=2048,
            n_embd=4096,
            n_layer=28,
            n_head=16,
            rotary_dim=64,
            n_inner=None,
            activation_function="gelu_new",
            resid_pdrop=0.0,
            embd_pdrop=0.0,
            attn_pdrop=0.0,
            layer_norm_epsilon=1e-5,
            initializer_range=0.02,
            use_cache=True,
            bos_token_id=106028,
            eos_token_id=106068,
            tie_word_embeddings=False,
            wbits=32,
            groupsize=128,
            # max_position_embeddings = 1024,
            **kwargs,
    ):
        """
        Initialize a MossConfig object.

        Args:
            vocab_size (int): The size of the vocabulary.
            n_positions (int): The number of positions.
            n_ctx (int): The context size.
            n_embd (int): The embedding size.
            n_layer (int): The number of layers.
            n_head (int): The number of attention heads.
            rotary_dim (int): The dimension for rotary embeddings.
            n_inner (int): The inner dimension size (if applicable).
            activation_function (str): The activation function used.
            resid_pdrop (float): The dropout probability for residual connections.
            embd_pdrop (float): The dropout probability for embeddings.
            attn_pdrop (float): The dropout probability for attention layers.
            layer_norm_epsilon (float): The epsilon value for layer normalization.
            initializer_range (float): The range for parameter initialization.
            use_cache (bool): Flag indicating whether to use cache.
            bos_token_id (int): The ID for the beginning of sequence token.
            eos_token_id (int): The ID for the end of sequence token.
            tie_word_embeddings (bool): Flag indicating whether word embeddings should be tied.
            wbits (int): The number of bits for weight quantization.
            groupsize (int): The group size for quantization.

        Returns:
            None

        Raises:
            ValueError: If an invalid parameter value is provided.
            TypeError: If the input types are incorrect.
            RuntimeError: If an unexpected error occurs during initialization.
        """
        self.vocab_size = vocab_size
        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.rotary_dim = rotary_dim
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.wbits = wbits
        self.groupsize = groupsize
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.max_position_embeddings = n_positions
        self.hidden_size = n_embd
        self.num_attention_heads = n_head
        self.num_hidden_layers = n_layer
        super().__init__(
            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
        )

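Because of attribute_map (and the explicit mirroring in __init__), the GPT-J-style names (n_embd, n_head, n_layer, n_positions) and the generic transformers names (hidden_size, num_attention_heads, num_hidden_layers, max_position_embeddings) refer to the same values. A small illustrative sketch with arbitrary sizes:

from mindnlp.transformers.models.moss.moss_configuration import MossConfig

config = MossConfig(n_embd=256, n_head=8, n_layer=4)
print(config.hidden_size, config.num_attention_heads, config.num_hidden_layers)   # 256 8 4
print(config.max_position_embeddings)   # 2048: mirrors the default n_positions
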
mindnlp.transformers.models.moss.moss_configuration.MossConfig.__init__(vocab_size=107008, n_positions=2048, n_ctx=2048, n_embd=4096, n_layer=28, n_head=16, rotary_dim=64, n_inner=None, activation_function='gelu_new', resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0, layer_norm_epsilon=1e-05, initializer_range=0.02, use_cache=True, bos_token_id=106028, eos_token_id=106068, tie_word_embeddings=False, wbits=32, groupsize=128, **kwargs)

Initialize a MossConfig object.

PARAMETER DESCRIPTION

  • vocab_size (int, default 107008): The size of the vocabulary.
  • n_positions (int, default 2048): The number of positions.
  • n_ctx (int, default 2048): The context size.
  • n_embd (int, default 4096): The embedding size.
  • n_layer (int, default 28): The number of layers.
  • n_head (int, default 16): The number of attention heads.
  • rotary_dim (int, default 64): The dimension for rotary embeddings.
  • n_inner (int, default None): The inner dimension size (if applicable).
  • activation_function (str, default 'gelu_new'): The activation function used.
  • resid_pdrop (float, default 0.0): The dropout probability for residual connections.
  • embd_pdrop (float, default 0.0): The dropout probability for embeddings.
  • attn_pdrop (float, default 0.0): The dropout probability for attention layers.
  • layer_norm_epsilon (float, default 1e-05): The epsilon value for layer normalization.
  • initializer_range (float, default 0.02): The range for parameter initialization.
  • use_cache (bool, default True): Flag indicating whether to use cache.
  • bos_token_id (int, default 106028): The ID for the beginning of sequence token.
  • eos_token_id (int, default 106068): The ID for the end of sequence token.
  • tie_word_embeddings (bool, default False): Flag indicating whether word embeddings should be tied.
  • wbits (int, default 32): The number of bits for weight quantization.
  • groupsize (int, default 128): The group size for quantization.

RETURNS DESCRIPTION

None

RAISES DESCRIPTION

  • ValueError: If an invalid parameter value is provided.
  • TypeError: If the input types are incorrect.
  • RuntimeError: If an unexpected error occurs during initialization.

Source code in mindnlp/transformers/models/moss/moss_configuration.py (lines 36-119)
def __init__(
        self,
        vocab_size=107008,
        n_positions=2048,
        n_ctx=2048,
        n_embd=4096,
        n_layer=28,
        n_head=16,
        rotary_dim=64,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=0.0,
        embd_pdrop=0.0,
        attn_pdrop=0.0,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=106028,
        eos_token_id=106068,
        tie_word_embeddings=False,
        wbits=32,
        groupsize=128,
        # max_position_embeddings = 1024,
        **kwargs,
):
    """
    Initialize a MossConfig object.

    Args:
        vocab_size (int): The size of the vocabulary.
        n_positions (int): The number of positions.
        n_ctx (int): The context size.
        n_embd (int): The embedding size.
        n_layer (int): The number of layers.
        n_head (int): The number of attention heads.
        rotary_dim (int): The dimension for rotary embeddings.
        n_inner (int): The inner dimension size (if applicable).
        activation_function (str): The activation function used.
        resid_pdrop (float): The dropout probability for residual connections.
        embd_pdrop (float): The dropout probability for embeddings.
        attn_pdrop (float): The dropout probability for attention layers.
        layer_norm_epsilon (float): The epsilon value for layer normalization.
        initializer_range (float): The range for parameter initialization.
        use_cache (bool): Flag indicating whether to use cache.
        bos_token_id (int): The ID for the beginning of sequence token.
        eos_token_id (int): The ID for the end of sequence token.
        tie_word_embeddings (bool): Flag indicating whether word embeddings should be tied.
        wbits (int): The number of bits for weight quantization.
        groupsize (int): The group size for quantization.

    Returns:
        None

    Raises:
        ValueError: If an invalid parameter value is provided.
        TypeError: If the input types are incorrect.
        RuntimeError: If an unexpected error occurs during initialization.
    """
    self.vocab_size = vocab_size
    self.n_ctx = n_ctx
    self.n_positions = n_positions
    self.n_embd = n_embd
    self.n_layer = n_layer
    self.n_head = n_head
    self.n_inner = n_inner
    self.rotary_dim = rotary_dim
    self.activation_function = activation_function
    self.resid_pdrop = resid_pdrop
    self.embd_pdrop = embd_pdrop
    self.attn_pdrop = attn_pdrop
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range
    self.use_cache = use_cache
    self.wbits = wbits
    self.groupsize = groupsize
    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id
    self.max_position_embeddings = n_positions
    self.hidden_size = n_embd
    self.num_attention_heads = n_head
    self.num_hidden_layers = n_layer
    super().__init__(
        bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
    )

mindnlp.transformers.models.moss.moss_tokenization

MindSpore Moss model tokenization.