
internlm

mindnlp.transformers.models.internlm.modeling_internlm

InternLM Model.

mindnlp.transformers.models.internlm.modeling_internlm.InternLMAttention

Bases: Module

Multi-headed attention from 'Attention Is All You Need' paper

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
class InternLMAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""
    def __init__(self, config: InternLMConfig):
        """
        Initializes an instance of the InternLMAttention class.

        Args:
            self: The instance of the class.
            config (InternLMConfig): An instance of the InternLMConfig class containing the configuration parameters.

        Returns:
            None

        Raises:
            ValueError: If `hidden_size` is not divisible by `num_heads`.

        This method initializes the InternLMAttention class by setting the instance variables and initializing the projection layers.

        The `config` parameter is an instance of the InternLMConfig class, which contains the following attributes:

        - `hidden_size` (int): The size of the hidden state.
        - `num_attention_heads` (int): The number of attention heads.
        - `max_position_embeddings` (int): The maximum number of position embeddings.
        - `bias` (bool): Whether to include bias in the projection layers.

        The method sets the following instance variables:

        - `config` (InternLMConfig): The configuration instance.
        - `hidden_size` (int): The size of the hidden state.
        - `num_heads` (int): The number of attention heads.
        - `head_dim` (int): The dimension of each attention head.
        - `max_position_embeddings` (int): The maximum number of position embeddings.

        The method also initializes the following projection layers:

        - `q_proj` (Dense): The projection layer for the query.
        - `k_proj` (Dense): The projection layer for the key.
        - `v_proj` (Dense): The projection layer for the value.
        - `o_proj` (Dense): The projection layer for the output.

        If the product of `head_dim` and `num_heads` is not equal to `hidden_size`, a ValueError is raised.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.max_position_embeddings = config.max_position_embeddings

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
        self._init_rope()

    def _init_rope(self):
        """
        This method initializes the rotary embedding for the InternLMAttention class.

        Args:
            self: The instance of the InternLMAttention class.

        Returns:
            InternLMRotaryEmbedding: The initialized rotary embedding, which is also assigned to the instance attribute 'rotary_emb'.

        Raises:
            ValueError: If the rotary embedding's type specified in the configuration is not one of ('origin', 'dynamic').
        """
        if self.config.rotary["type"] == "origin":
            self.rotary_emb = InternLMRotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.config.rotary["base"],
            )
        elif self.config.rotary["type"] == "dynamic":
            self.rotary_emb = InternLMDynamicNTKScalingRotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.config.rotary["base"],
                scaling_factor=self.config.rotary.get("scaling_factor", 1.0),
            )
        else:
            raise ValueError("Currently we only support rotary embedding's type being one of ('origin', 'dynamic').")
        return self.rotary_emb

    def _shape(self, tensor: mindspore.Tensor, seq_len: int, bsz: int):
        """
        Reshapes the input tensor according to the specified dimensions for the InternLMAttention class.

        Args:
            self (InternLMAttention): An instance of the InternLMAttention class.
            tensor (mindspore.Tensor): The input tensor to be reshaped.
            seq_len (int): The length of the sequence.
            bsz (int): The batch size.

        Returns:
            mindspore.Tensor: The reshaped tensor of shape (bsz, num_heads, seq_len, head_dim).

        Raises:
            None.
        """
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).swapaxes(1, 2)

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_value: Optional[Tuple[mindspore.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
        """
        Constructs the attention mechanism for the InternLMAttention class.

        Args:
            self: The object instance.
            hidden_states (mindspore.Tensor): The input hidden states.
                Its shape is (batch_size, sequence_length, hidden_size).
            attention_mask (Optional[mindspore.Tensor]): The attention mask tensor of shape
                (batch_size, 1, query_length, key_length). Default is None.
            position_ids (Optional[mindspore.Tensor]): The position ids tensor of shape
                (batch_size, sequence_length). Default is None.
            past_key_value (Optional[Tuple[mindspore.Tensor]]): The past key-value tuple. Default is None.
            output_attentions (bool): Whether to output attention weights. Default is False.
            use_cache (bool): Whether to use cache. Default is False.

        Returns:
            Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
                A tuple containing the attention output, attention weights, and the updated past key-value tuple.

                - attn_output (mindspore.Tensor): The output tensor of shape (batch_size, sequence_length, hidden_size).
                - attn_weights (Optional[mindspore.Tensor]): The attention weights tensor of shape
                (batch_size, num_heads, sequence_length, sequence_length). If `output_attentions` is False, it is set to None.
                - past_key_value (Optional[Tuple[mindspore.Tensor]]): The updated past key-value tuple.
                If `use_cache` is False, it is set to None.

        Raises:
            ValueError: If the shape of attention weights is not (batch_size, num_heads, sequence_length, sequence_length).
            ValueError: If the shape of attention mask is not (batch_size, 1, sequence_length, sequence_length).

        """
        bsz, q_len, _ = hidden_states.shape
        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2)
        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2)
        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2)

        if past_key_value is not None:
            # reuse k, v, self_attention
            key_states = ops.cat([past_key_value[0], key_states], axis=2)
            value_states = ops.cat([past_key_value[1], value_states], axis=2)

        past_key_value = (key_states, value_states) if use_cache else None
        kv_seq_len = key_states.shape[-2]
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
        attn_weights = ops.matmul(query_states, key_states.swapaxes(2, 3)) / math.sqrt(self.head_dim)
        if attn_weights.shape != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.shape}"
            )
        if attention_mask is not None:
            if attention_mask.shape != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}"
                )
            attn_weights = attn_weights + attention_mask
            attn_weights = ops.maximum(
                attn_weights, Tensor(np.finfo(mindspore.dtype_to_nptype(attn_weights.dtype)).min)
            )

        attn_weights = ops.softmax(attn_weights, axis=-1).astype(query_states.dtype)
        attn_output = ops.matmul(attn_weights, value_states)

        if attn_output.shape != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.shape}"
            )

        attn_output = attn_output.swapaxes(1, 2)
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

mindnlp.transformers.models.internlm.modeling_internlm.InternLMAttention.__init__(config)

Initializes an instance of the InternLMAttention class.

PARAMETER DESCRIPTION
self

The instance of the class.

config

An instance of the InternLMConfig class containing the configuration parameters.

TYPE: InternLMConfig

RETURNS DESCRIPTION

None

RAISES DESCRIPTION
ValueError

If hidden_size is not divisible by num_heads.

This method initializes the InternLMAttention class by setting the instance variables and initializing the projection layers.

The config parameter is an instance of the InternLMConfig class, which contains the following attributes:

  • hidden_size (int): The size of the hidden state.
  • num_attention_heads (int): The number of attention heads.
  • max_position_embeddings (int): The maximum number of position embeddings.
  • bias (bool): Whether to include bias in the projection layers.

The method sets the following instance variables:

  • config (InternLMConfig): The configuration instance.
  • hidden_size (int): The size of the hidden state.
  • num_heads (int): The number of attention heads.
  • head_dim (int): The dimension of each attention head.
  • max_position_embeddings (int): The maximum number of position embeddings.

The method also initializes the following projection layers:

  • q_proj (Dense): The projection layer for the query.
  • k_proj (Dense): The projection layer for the key.
  • v_proj (Dense): The projection layer for the value.
  • o_proj (Dense): The projection layer for the output.

If the product of head_dim and num_heads is not equal to hidden_size, a ValueError is raised.
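As a quick, hedged illustration (not taken from the library), the divisibility constraint enforced here can be checked with plain arithmetic; the sizes below are hypothetical.

>>> # Hypothetical sizes; __init__ raises ValueError when this check fails.
>>> hidden_size, num_heads = 4096, 32
>>> head_dim = hidden_size // num_heads
>>> head_dim
128
>>> head_dim * num_heads == hidden_size
True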

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
def __init__(self, config: InternLMConfig):
    """
    Initializes an instance of the InternLMAttention class.

    Args:
        self: The instance of the class.
        config (InternLMConfig): An instance of the InternLMConfig class containing the configuration parameters.

    Returns:
        None

    Raises:
        ValueError: If `hidden_size` is not divisible by `num_heads`.

    This method initializes the InternLMAttention class by setting the instance variables and initializing the projection layers.

    The `config` parameter is an instance of the InternLMConfig class, which contains the following attributes:

    - `hidden_size` (int): The size of the hidden state.
    - `num_attention_heads` (int): The number of attention heads.
    - `max_position_embeddings` (int): The maximum number of position embeddings.
    - `bias` (bool): Whether to include bias in the projection layers.

    The method sets the following instance variables:

    - `config` (InternLMConfig): The configuration instance.
    - `hidden_size` (int): The size of the hidden state.
    - `num_heads` (int): The number of attention heads.
    - `head_dim` (int): The dimension of each attention head.
    - `max_position_embeddings` (int): The maximum number of position embeddings.

    The method also initializes the following projection layers:

    - `q_proj` (Dense): The projection layer for the query.
    - `k_proj` (Dense): The projection layer for the key.
    - `v_proj` (Dense): The projection layer for the value.
    - `o_proj` (Dense): The projection layer for the output.

    If the product of `head_dim` and `num_heads` is not equal to `hidden_size`, a ValueError is raised.
    """
    super().__init__()
    self.config = config
    self.hidden_size = config.hidden_size
    self.num_heads = config.num_attention_heads
    self.head_dim = self.hidden_size // self.num_heads
    self.max_position_embeddings = config.max_position_embeddings

    if (self.head_dim * self.num_heads) != self.hidden_size:
        raise ValueError(
            f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
            f" and `num_heads`: {self.num_heads})."
        )

    self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
    self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
    self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
    self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
    self._init_rope()

mindnlp.transformers.models.internlm.modeling_internlm.InternLMAttention.forward(hidden_states, attention_mask=None, position_ids=None, past_key_value=None, output_attentions=False, use_cache=False, **kwargs)

Constructs the attention mechanism for the InternLMAttention class.

PARAMETER DESCRIPTION
self

The object instance.

hidden_states

The input hidden states. Its shape is (batch_size, sequence_length, hidden_size).

TYPE: Tensor

attention_mask

The attention mask tensor of shape (batch_size, 1, query_length, key_length). Default is None.

TYPE: Optional[Tensor] DEFAULT: None

position_ids

The position ids tensor of shape (batch_size, sequence_length). Default is None.

TYPE: Optional[Tensor] DEFAULT: None

past_key_value

The past key-value tuple. Default is None.

TYPE: Optional[Tuple[Tensor]] DEFAULT: None

output_attentions

Whether to output attention weights. Default is False.

TYPE: bool DEFAULT: False

use_cache

Whether to use cache. Default is False.

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]

Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]: A tuple containing the attention output, attention weights, and the updated past key-value tuple.

  • attn_output (mindspore.Tensor): The output tensor of shape (batch_size, sequence_length, hidden_size).
  • attn_weights (Optional[mindspore.Tensor]): The attention weights tensor of shape (batch_size, num_heads, sequence_length, sequence_length). If output_attentions is False, it is set to None.
  • past_key_value (Optional[Tuple[mindspore.Tensor]]): The updated past key-value tuple. If use_cache is False, it is set to None.
RAISES DESCRIPTION
ValueError

If the shape of attention weights is not (batch_size, num_heads, sequence_length, sequence_length).

ValueError

If the shape of attention mask is not (batch_size, 1, sequence_length, sequence_length).
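The sketch below shows one way to exercise this method end to end with small, hypothetical dimensions. The import paths, the InternLMConfig keyword arguments, and the rotary dict are assumptions inferred from the attributes documented above, not a verbatim recipe from the library.

>>> import mindspore
>>> from mindspore import ops
>>> from mindnlp.transformers.models.internlm.modeling_internlm import InternLMAttention
>>> from mindnlp.transformers.models.internlm.configuration_internlm import InternLMConfig
...
>>> # Hypothetical small configuration; field names follow the __init__ documentation.
>>> config = InternLMConfig(hidden_size=64, num_attention_heads=4, max_position_embeddings=128, bias=True)
>>> config.rotary = {"type": "origin", "base": 10000}  # consumed by _init_rope
>>> attn = InternLMAttention(config)
...
>>> bsz, q_len = 2, 5
>>> hidden_states = ops.ones((bsz, q_len, config.hidden_size), mindspore.float32)
>>> position_ids = ops.arange(q_len).expand_dims(0).tile((bsz, 1))
>>> attn_output, attn_weights, past_key_value = attn(hidden_states, position_ids=position_ids, output_attentions=True, use_cache=True)
>>> # attn_output: (2, 5, 64), attn_weights: (2, 4, 5, 5),
>>> # past_key_value: (key_states, value_states), each (2, 4, 5, 16)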

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
def forward(
    self,
    hidden_states: mindspore.Tensor,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    past_key_value: Optional[Tuple[mindspore.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    **kwargs,
) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
    """
    Constructs the attention mechanism for the InternLMAttention class.

    Args:
        self: The object instance.
        hidden_states (mindspore.Tensor): The input hidden states.
            Its shape is (batch_size, sequence_length, hidden_size).
        attention_mask (Optional[mindspore.Tensor]): The attention mask tensor of shape
            (batch_size, 1, query_length, key_length). Default is None.
        position_ids (Optional[mindspore.Tensor]): The position ids tensor of shape
            (batch_size, sequence_length). Default is None.
        past_key_value (Optional[Tuple[mindspore.Tensor]]): The past key-value tuple. Default is None.
        output_attentions (bool): Whether to output attention weights. Default is False.
        use_cache (bool): Whether to use cache. Default is False.

    Returns:
        Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
            A tuple containing the attention output, attention weights, and the updated past key-value tuple.

            - attn_output (mindspore.Tensor): The output tensor of shape (batch_size, sequence_length, hidden_size).
            - attn_weights (Optional[mindspore.Tensor]): The attention weights tensor of shape
            (batch_size, num_heads, sequence_length, sequence_length). If `output_attentions` is False, it is set to None.
            - past_key_value (Optional[Tuple[mindspore.Tensor]]): The updated past key-value tuple.
            If `use_cache` is False, it is set to None.

    Raises:
        ValueError: If the shape of attention weights is not (batch_size, num_heads, sequence_length, sequence_length).
        ValueError: If the shape of attention mask is not (batch_size, 1, sequence_length, sequence_length).

    """
    bsz, q_len, _ = hidden_states.shape
    query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2)
    key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2)
    value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2)

    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = ops.cat([past_key_value[0], key_states], axis=2)
        value_states = ops.cat([past_key_value[1], value_states], axis=2)

    past_key_value = (key_states, value_states) if use_cache else None
    kv_seq_len = key_states.shape[-2]
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
    attn_weights = ops.matmul(query_states, key_states.swapaxes(2, 3)) / math.sqrt(self.head_dim)
    if attn_weights.shape != (bsz, self.num_heads, q_len, kv_seq_len):
        raise ValueError(
            f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
            f" {attn_weights.shape}"
        )
    if attention_mask is not None:
        if attention_mask.shape != (bsz, 1, q_len, kv_seq_len):
            raise ValueError(
                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}"
            )
        attn_weights = attn_weights + attention_mask
        attn_weights = ops.maximum(
            attn_weights, Tensor(np.finfo(mindspore.dtype_to_nptype(attn_weights.dtype)).min)
        )

    attn_weights = ops.softmax(attn_weights, axis=-1).astype(query_states.dtype)
    attn_output = ops.matmul(attn_weights, value_states)

    if attn_output.shape != (bsz, self.num_heads, q_len, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
            f" {attn_output.shape}"
        )

    attn_output = attn_output.swapaxes(1, 2)
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

    attn_output = self.o_proj(attn_output)

    if not output_attentions:
        attn_weights = None

    return attn_output, attn_weights, past_key_value

mindnlp.transformers.models.internlm.modeling_internlm.InternLMDecoderLayer

Bases: Module

DecoderLayer

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
class InternLMDecoderLayer(nn.Module):
    """
    DecoderLayer
    """
    def __init__(self, config: InternLMConfig):
        """Initialize an instance of the InternLMDecoderLayer class.

        Args:
            self (InternLMDecoderLayer): The instance of the class.
            config (InternLMConfig):
                The configuration object containing various settings for the decoder layer.

                - hidden_size (int): The size of the hidden states.
                - intermediate_size (int): The size of the intermediate layer in the MLP.
                - hidden_act (str): The activation function to be used in the MLP.
                - rms_norm_eps (float): The epsilon value used in the RMS normalization.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = INTERNLM_ATTENTION_CLASSES['eager'](config=config)

        self.mlp = InternLMMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        self.input_layernorm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
            self,
            hidden_states: Tensor,
            attention_mask: Optional[Tensor] = None,
            position_ids: Optional[Tensor] = None,
            past_key_value: Optional[Tuple[Tensor]] = None,
            output_attentions: Optional[bool] = False,
            use_cache: Optional[bool] = False,
    ) -> Tuple[Tensor, Optional[Tuple[Tensor, Tensor]]]:
        """
        Args:
            hidden_states (`Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`Tensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(Tensor)`, *optional*): cached past key and value projection states
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs

mindnlp.transformers.models.internlm.modeling_internlm.InternLMDecoderLayer.__init__(config)

Initialize an instance of the InternLMDecoderLayer class.

PARAMETER DESCRIPTION
self

The instance of the class.

TYPE: InternLMDecoderLayer

config

The configuration object containing various settings for the decoder layer.

  • hidden_size (int): The size of the hidden states.
  • intermediate_size (int): The size of the intermediate layer in the MLP.
  • hidden_act (str): The activation function to be used in the MLP.
  • rms_norm_eps (float): The epsilon value used in the RMS normalization.

TYPE: InternLMConfig

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
def __init__(self, config: InternLMConfig):
    """Initialize an instance of the InternLMDecoderLayer class.

    Args:
        self (InternLMDecoderLayer): The instance of the class.
        config (InternLMConfig):
            The configuration object containing various settings for the decoder layer.

            - hidden_size (int): The size of the hidden states.
            - intermediate_size (int): The size of the intermediate layer in the MLP.
            - hidden_act (str): The activation function to be used in the MLP.
            - rms_norm_eps (float): The epsilon value used in the RMS normalization.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__()
    self.hidden_size = config.hidden_size

    self.self_attn = INTERNLM_ATTENTION_CLASSES['eager'](config=config)

    self.mlp = InternLMMLP(
        hidden_size=self.hidden_size,
        intermediate_size=config.intermediate_size,
        hidden_act=config.hidden_act,
    )
    self.input_layernorm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    self.post_attention_layernorm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

mindnlp.transformers.models.internlm.modeling_internlm.InternLMDecoderLayer.forward(hidden_states, attention_mask=None, position_ids=None, past_key_value=None, output_attentions=False, use_cache=False)

PARAMETER DESCRIPTION
hidden_states

input to the layer of shape (batch, seq_len, embed_dim)

TYPE: `Tensor`

attention_mask

attention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.

TYPE: `Tensor`, *optional* DEFAULT: None

output_attentions

Whether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.

TYPE: `bool`, *optional* DEFAULT: False

use_cache

If set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).

TYPE: `bool`, *optional* DEFAULT: False

past_key_value

cached past key and value projection states

TYPE: `Tuple(Tensor)`, *optional* DEFAULT: None
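As a shape-level illustration, the hedged sketch below builds a small layer and shows how the returned tuple grows with output_attentions and use_cache. The configuration values, import paths, and the additive causal mask are assumptions for demonstration only.

>>> import numpy as np
>>> import mindspore
>>> from mindspore import Tensor, ops
>>> from mindnlp.transformers.models.internlm.modeling_internlm import InternLMDecoderLayer
>>> from mindnlp.transformers.models.internlm.configuration_internlm import InternLMConfig
...
>>> # Hypothetical small configuration; field names follow the __init__ documentation above.
>>> config = InternLMConfig(hidden_size=64, num_attention_heads=4, intermediate_size=128, hidden_act="silu", rms_norm_eps=1e-06, max_position_embeddings=128, bias=True)
>>> config.rotary = {"type": "origin", "base": 10000}
>>> layer = InternLMDecoderLayer(config)
...
>>> bsz, seq_len = 1, 4
>>> hidden_states = ops.ones((bsz, seq_len, config.hidden_size), mindspore.float32)
>>> position_ids = ops.arange(seq_len).expand_dims(0)
>>> # Additive causal mask: 0 where attention is allowed, a very large negative value on future positions.
>>> causal = np.triu(np.full((seq_len, seq_len), -1e9, dtype=np.float32), k=1)
>>> attention_mask = Tensor(causal).expand_dims(0).expand_dims(0)  # (batch, 1, tgt_len, src_len)
...
>>> outputs = layer(hidden_states, attention_mask=attention_mask, position_ids=position_ids, output_attentions=True, use_cache=True)
>>> hidden_states_out, self_attn_weights, present_key_value = outputs  # the tuple grows with the two flags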

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
def forward(
        self,
        hidden_states: Tensor,
        attention_mask: Optional[Tensor] = None,
        position_ids: Optional[Tensor] = None,
        past_key_value: Optional[Tuple[Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
) -> Tuple[Tensor, Optional[Tuple[Tensor, Tensor]]]:
    """
    Args:
        hidden_states (`Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
        attention_mask (`Tensor`, *optional*): attention mask of size
            `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
            (see `past_key_values`).
        past_key_value (`Tuple(Tensor)`, *optional*): cached past key and value projection states
    """
    residual = hidden_states

    hidden_states = self.input_layernorm(hidden_states)

    # Self Attention
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
        hidden_states=hidden_states,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_value=past_key_value,
        output_attentions=output_attentions,
        use_cache=use_cache,
    )
    hidden_states = residual + hidden_states

    # Fully Connected
    residual = hidden_states
    hidden_states = self.post_attention_layernorm(hidden_states)
    hidden_states = self.mlp(hidden_states)
    hidden_states = residual + hidden_states

    outputs = (hidden_states,)

    if output_attentions:
        outputs += (self_attn_weights,)

    if use_cache:
        outputs += (present_key_value,)

    return outputs

mindnlp.transformers.models.internlm.modeling_internlm.InternLMDynamicNTKScalingRotaryEmbedding

Bases: InternLMRotaryEmbedding

The InternLMDynamicNTKScalingRotaryEmbedding class is a Python class that represents a dynamic version of the Neural Tangent Kernel (NTK) Scaling Rotary Embedding used in the context of an InternLM model.

This class inherits from the InternLMRotaryEmbedding class and provides additional functionality for dynamically adjusting the NTK scaling factor based on the sequence length. It calculates and caches the cosine and sine values necessary for the rotary embeddings.

ATTRIBUTE DESCRIPTION
scaling_factor

The scaling factor used for adjusting the NTK scaling based on sequence length.

TYPE: float

METHOD DESCRIPTION
__init__

Initializes the InternLMDynamicNTKScalingRotaryEmbedding object with the specified dimensions, maximum position embeddings, base, and scaling factor. Calls the superclass initializer.

_set_cos_sin_cache

Sets the cosine and sine cache based on the provided sequence length and data type. Calculates the NTK scaling factor, inverse frequencies, and caches the cosine and sine values.

Note

This class assumes the existence of the InternLMRotaryEmbedding superclass.

Example
>>> # Create an instance of InternLMDynamicNTKScalingRotaryEmbedding
>>> embedding = InternLMDynamicNTKScalingRotaryEmbedding(dim=512, max_position_embeddings=1024, base=20000, scaling_factor=0.8)
...
>>> # Access the scaling factor attribute
>>> scaling_factor = embedding.scaling_factor
...
>>> # Call the _set_cos_sin_cache method
>>> embedding._set_cos_sin_cache(seq_len=512, dtype=mindspore.float32)
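
For intuition about what the dynamic variant does, the plain-Python sketch below reproduces the base rescaling applied in _set_cos_sin_cache once the sequence length exceeds max_position_embeddings; the values are made up, and the source further down remains the authoritative code.

>>> dim, base, max_pos, scaling_factor = 128, 10000.0, 2048, 1.0
>>> seq_len = 8192  # longer than max_position_embeddings, so the base is rescaled
>>> if seq_len > max_pos:
...     base = base * ((scaling_factor * seq_len / max_pos) - (scaling_factor - 1)) ** (dim / (dim - 2))
>>> inv_freq = [1.0 / (base ** (i / dim)) for i in range(0, dim, 2)]  # as in the rotary embedding
>>> len(inv_freq)
64
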
Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
class InternLMDynamicNTKScalingRotaryEmbedding(InternLMRotaryEmbedding):

    """
    The `InternLMDynamicNTKScalingRotaryEmbedding` class is a Python class that represents a dynamic version of the
    Neural Tangent Kernel (NTK) Scaling Rotary Embedding used in the context of an InternLM model.

    This class inherits from the `InternLMRotaryEmbedding` class and provides additional functionality for dynamically
    adjusting the NTK scaling factor based on the sequence length. It calculates and caches the cosine and sine values
    necessary for the rotary embeddings.

    Attributes:
        scaling_factor (float): The scaling factor used for adjusting the NTK scaling based on sequence length.

    Methods:
        __init__:
            Initializes the `InternLMDynamicNTKScalingRotaryEmbedding` object with the specified dimensions,
            maximum position embeddings, base, and scaling factor. Calls the superclass initializer.

        _set_cos_sin_cache:
            Sets the cosine and sine cache based on the provided sequence length and data type. Calculates the NTK
            scaling factor, inverse frequencies, and caches the cosine and sine values.

    Note:
        This class assumes the existence of the `InternLMRotaryEmbedding` superclass.

    Example:
        ```python
        >>> # Create an instance of InternLMDynamicNTKScalingRotaryEmbedding
        >>> embedding = InternLMDynamicNTKScalingRotaryEmbedding(dim=512, max_position_embeddings=1024, base=20000, scaling_factor=0.8)
        ...
        >>> # Access the scaling factor attribute
        >>> scaling_factor = embedding.scaling_factor
        ...
        >>> # Call the _set_cos_sin_cache method
        >>> embedding._set_cos_sin_cache(seq_len=512, dtype=mindspore.float32)
        ```
    """
    def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0):
        """
        Initializes an instance of the InternLMDynamicNTKScalingRotaryEmbedding class.

        Args:
            self (InternLMDynamicNTKScalingRotaryEmbedding): The instance of the class itself.
            dim (int): The dimension of the embedding.
            max_position_embeddings (int, optional): The maximum number of position embeddings. Defaults to 2048.
            base (int, optional): The base value used in positional encoding calculation. Defaults to 10000.
            scaling_factor (float, optional): The scaling factor applied to the embeddings. Defaults to 1.0.

        Returns:
            None.

        Raises:
            None.
        """
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base)

    def _set_cos_sin_cache(self, seq_len, dtype):
        """
        Method '_set_cos_sin_cache' in the class 'InternLMDynamicNTKScalingRotaryEmbedding'.

        This method initializes the cosine and sine cache based on the given sequence length and data type.

        Args:
            self: The instance of the class.
            seq_len (int): The length of the input sequence. Must be greater than 0.
            dtype: The data type for the calculations.
                Should be a valid data type compatible with the operations performed.

        Returns:
            None.

        Raises:
            ValueError: If the input sequence length 'seq_len' is not a positive integer.
            TypeError: If the provided data type 'dtype' is not valid or compatible with the operations.
        """
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (base ** (ops.arange(0, self.dim, 2).float() / self.dim))
            self.inv_freq = inv_freq

        t = ops.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype)

        freqs = ops.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = ops.cat((freqs, freqs), axis=-1)
        self.cos_cached = emb.cos().to(dtype)
        self.sin_cached = emb.sin().to(dtype)

mindnlp.transformers.models.internlm.modeling_internlm.InternLMDynamicNTKScalingRotaryEmbedding.__init__(dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0)

Initializes an instance of the InternLMDynamicNTKScalingRotaryEmbedding class.

PARAMETER DESCRIPTION
self

The instance of the class itself.

TYPE: InternLMDynamicNTKScalingRotaryEmbedding

dim

The dimension of the embedding.

TYPE: int

max_position_embeddings

The maximum number of position embeddings. Defaults to 2048.

TYPE: int DEFAULT: 2048

base

The base value used in positional encoding calculation. Defaults to 10000.

TYPE: int DEFAULT: 10000

scaling_factor

The scaling factor applied to the embeddings. Defaults to 1.0.

TYPE: float DEFAULT: 1.0

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0):
    """
    Initializes an instance of the InternLMDynamicNTKScalingRotaryEmbedding class.

    Args:
        self (InternLMDynamicNTKScalingRotaryEmbedding): The instance of the class itself.
        dim (int): The dimension of the embedding.
        max_position_embeddings (int, optional): The maximum number of position embeddings. Defaults to 2048.
        base (int, optional): The base value used in positional encoding calculation. Defaults to 10000.
        scaling_factor (float, optional): The scaling factor applied to the embeddings. Defaults to 1.0.

    Returns:
        None.

    Raises:
        None.
    """
    self.scaling_factor = scaling_factor
    super().__init__(dim, max_position_embeddings, base)

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForCausalLM

Bases: InternLMPreTrainedModel

A class representing an InternLM model for causal language modeling.

This class extends the InternLMPreTrainedModel class and provides additional functionality specific to causal language modeling tasks. It includes methods for initializing the model, setting and getting input and output embeddings, setting the decoder, forwarding the model, and preparing inputs for generation.

ATTRIBUTE DESCRIPTION
model

The underlying InternLM model.

TYPE: InternLMModel

lm_head

The linear layer for mapping hidden states to the vocabulary space.

TYPE: Linear

METHOD DESCRIPTION
__init__

Initializes the InternLMForCausalLM instance.

get_input_embeddings

Returns the input embeddings of the model.

set_input_embeddings

Sets the input embeddings of the model.

get_output_embeddings

Returns the output embeddings of the model.

set_output_embeddings

Sets the output embeddings of the model.

set_decoder

Sets the decoder for the model.

get_decoder

Returns the decoder of the model.

forward

Constructs the model and computes the masked language modeling loss.

prepare_inputs_for_generation

Prepares inputs for generation by modifying the input_ids, attention_mask, and position_ids.

Example
>>> from transformers import AutoTokenizer, InternLMForCausalLM
...
>>> model = InternLMForCausalLM(config)
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
...
>>> # Access model attributes
>>> input_embeddings = model.get_input_embeddings()
>>> output_embeddings = model.get_output_embeddings()
...
>>> # Modify model attributes
>>> model.set_input_embeddings(new_input_embeddings)
>>> model.set_output_embeddings(new_output_embeddings)
...
>>> # Set decoder
>>> model.set_decoder(decoder_model)
...
>>> # Generate text
>>> model.forward(input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
>>> model_inputs = model.prepare_inputs_for_generation(input_ids, past_key_values, attention_mask, inputs_embeds, **kwargs)
Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
class InternLMForCausalLM(InternLMPreTrainedModel):

    """
    A class representing an InternLM model for causal language modeling.

    This class extends the InternLMPreTrainedModel class and provides additional functionality specific to
    causal language modeling tasks. It includes methods for initializing the model, setting and getting
    input and output embeddings, setting the decoder, forwarding the model, and preparing inputs for generation.

    Attributes:
        model (InternLMModel): The underlying InternLM model.
        lm_head (nn.Linear): The linear layer for mapping hidden states to the vocabulary space.

    Methods:
        __init__: Initializes the InternLMForCausalLM instance.
        get_input_embeddings: Returns the input embeddings of the model.
        set_input_embeddings: Sets the input embeddings of the model.
        get_output_embeddings: Returns the output embeddings of the model.
        set_output_embeddings: Sets the output embeddings of the model.
        set_decoder: Sets the decoder for the model.
        get_decoder: Returns the decoder of the model.
        forward: Constructs the model and computes the masked language modeling loss.
        prepare_inputs_for_generation: Prepares inputs for generation by modifying the input_ids, attention_mask,
            and position_ids.

    Example:
        ```python
        >>> from transformers import AutoTokenizer, InternLMForCausalLM
        ...
        >>> model = InternLMForCausalLM(config)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
        ...
        >>> # Access model attributes
        >>> input_embeddings = model.get_input_embeddings()
        >>> output_embeddings = model.get_output_embeddings()
        ...
        >>> # Modify model attributes
        >>> model.set_input_embeddings(new_input_embeddings)
        >>> model.set_output_embeddings(new_output_embeddings)
        ...
        >>> # Set decoder
        >>> model.set_decoder(decoder_model)
        ...
        >>> # Generate text
        >>> model.forward(input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
        >>> model_inputs = model.prepare_inputs_for_generation(input_ids, past_key_values, attention_mask, inputs_embeds, **kwargs)
        ```
    """
    _auto_class = "AutoModelForCausalLM"
    def __init__(self, config, size=None):
        """
        Initializes a new instance of the InternLMForCausalLM class.

        Args:
            self: The instance of the class.
            config:
                The configuration for the language model.

                - Type: object
                - Purpose: Specifies the configuration parameters for the language model.
            size:
                The size of the language model input. (Optional)

                - Type: int
                - Purpose: Specifies the size of the language model input. If not provided, defaults to None.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__(config)
        self.model = InternLMModel(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        Retrieve the input embeddings from the InternLMForCausalLM model.

        Args:
            self: An instance of the InternLMForCausalLM class.

        Returns:
            nn.Embedding: The model's token embedding layer ('self.model.embed_tokens').

        Raises:
            None.

        This method is used to obtain the input embeddings from the model.
        The input embeddings are representations of the input tokens that the model uses to process the text.
        The embeddings capture the semantic meaning and contextual information of the tokens, which is crucial for
        the model's performance.

        Note:
            The 'embed_tokens' attribute of the 'self.model' object contains the input embeddings.
            This attribute should be accessed to retrieve the embeddings.

        Example:
            ```python
            >>> model = InternLMForCausalLM()
            >>> embeddings = model.get_input_embeddings()
            ```
        """
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        '''
        Sets the input embeddings for the InternLMForCausalLM model.

        Args:
            self (InternLMForCausalLM): An instance of the InternLMForCausalLM class.
            value (object): The input embeddings to be set for the model.

        Returns:
            None.

        Raises:
            None.
        '''
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """
        Returns the output embeddings of the InternLMForCausalLM model.

        Args:
            self: The instance of the InternLMForCausalLM class.

        Returns:
            The output embeddings of the model, represented by the 'lm_head' attribute.

        Raises:
            None.

        Note:
            The output embeddings are typically used to map the model's hidden state to a specific output vocabulary.
            These embeddings can be used for downstream tasks such as text generation or classification.
        """
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """
        Sets the output embeddings for the InternLMForCausalLM model.

        Args:
            self (InternLMForCausalLM): The instance of the InternLMForCausalLM class.
            new_embeddings (Tensor): The new embeddings to be set for the output layer.

        Returns:
            None.

        Raises:
            TypeError: If the new_embeddings parameter is not of type Tensor.
        """
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """
        Sets the decoder for the InternLMForCausalLM class.

        Args:
            self (InternLMForCausalLM): The current instance of the InternLMForCausalLM class.
            decoder: The decoder object to be set for the InternLMForCausalLM instance.

        Returns:
            None.

        Raises:
            None.
        """
        self.model = decoder

    def get_decoder(self):
        """
        Method to retrieve the decoder from the InternLMForCausalLM class.

        Args:
            self (object): The instance of the InternLMForCausalLM class.
                This parameter is required to access the model within the class.

        Returns:
            InternLMModel: The decoder model associated with the InternLMForCausalLM instance.

        Raises:
            None.
        """
        return self.model

    def forward(
            self,
            input_ids: Tensor = None,
            attention_mask: Optional[Tensor] = None,
            position_ids: Optional[Tensor] = None,
            past_key_values: Optional[List[Tensor]] = None,
            inputs_embeds: Optional[Tensor] = None,
            labels: Optional[Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
            Union[Tuple, CausalLMOutputWithPast]

        Example:
            ```python
            >>> from transformers import AutoTokenizer, InternLMForCausalLM
            >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
            >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
            >>> prompt = "Hey, are you consciours? Can you talk to me?"
            >>> inputs = tokenizer(prompt, return_tensors="pt")
            >>> # Generate
            >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
            >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
            "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
            ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            # Flatten the tokens
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            loss = ops.cross_entropy(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
            self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """
        Prepare inputs for generation.

        This method prepares the inputs for the generation process in the InternLMForCausalLM class.

        Args:
            self (InternLMForCausalLM): The instance of the InternLMForCausalLM class.
            input_ids (mindspore.Tensor): The input tensor containing the tokenized input sequence.
            past_key_values (tuple, optional): The past key values for generation. Default is None.
            attention_mask (Optional[mindspore.Tensor]): The attention mask tensor. Default is None.
            inputs_embeds (Optional[mindspore.Tensor]): The tensor of embedded inputs. Default is None.

        Returns:
            model_inputs (dict): A dictionary containing the prepared model inputs for generation.
                It can have the following keys:

                - 'inputs_embeds' (mindspore.Tensor): The tensor of embedded inputs, if provided.
                - 'input_ids' (mindspore.Tensor): The tensor of the tokenized input sequence.
                - 'position_ids' (mindspore.Tensor): The tensor of position IDs.
                - 'past_key_values' (tuple): The past key values for generation.
                - 'use_cache' (bool): A flag indicating whether to use the cache.
                - 'attention_mask' (mindspore.Tensor): The attention mask tensor.

        Raises:
            None.
        """
        if past_key_values:
            input_ids = input_ids[:, -1:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids = position_ids.masked_fill(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """
        Reorders the cache of past key values based on the provided beam index.

        Args:
            past_key_values (tuple): A tuple containing the past key values for each layer.
                Each past key value is expected to be a tensor.
            beam_idx (tensor): A tensor representing the beam index.

        Returns:
            tuple: The past key values reordered along the batch axis according to `beam_idx`.

        Raises:
            None
        """
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
        return reordered_past
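
The beam-search path of `generate` calls `_reorder_cache` after every step so that each beam's key/value history follows the beam it was selected from. A minimal sketch of a call, assuming a toy two-layer cache with illustrative shapes:

```python
import numpy as np
import mindspore
from mindspore import ops
from mindnlp.transformers.models.internlm.modeling_internlm import InternLMForCausalLM

# Toy cache: 2 layers, each a (key, value) pair of shape (batch=3, heads=2, seq=4, head_dim=8).
past_key_values = tuple(
    (ops.zeros((3, 2, 4, 8)), ops.zeros((3, 2, 4, 8)))
    for _ in range(2)
)
# For each of the 3 beams, the batch row whose history it should inherit.
beam_idx = mindspore.Tensor(np.array([2, 0, 1]), mindspore.int64)
reordered = InternLMForCausalLM._reorder_cache(past_key_values, beam_idx)
```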

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForCausalLM.__init__(config, size=None)

Initializes a new instance of the InternLMForCausalLM class.

PARAMETER DESCRIPTION
self

The instance of the class.

config

The configuration for the language model.

  • Type: object
  • Purpose: Specifies the configuration parameters for the language model.

size

The size of the language model input. (Optional)

  • Type: int
  • Purpose: Specifies the size of the language model input. If not provided, defaults to None. Note that the initializer does not currently use this value.

DEFAULT: None

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1024-1053
def __init__(self, config, size=None):
    """
    Initializes a new instance of the InternLMForCausalLM class.

    Args:
        self: The instance of the class.
        config:
            The configuration for the language model.

            - Type: object
            - Purpose: Specifies the configuration parameters for the language model.
        size:
            The size of the language model input. (Optional)

            - Type: int
            - Purpose: Specifies the size of the language model input. If not provided, defaults to None. The initializer does not currently use this value.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__(config)
    self.model = InternLMModel(config)

    self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForCausalLM.forward(input_ids=None, attention_mask=None, position_ids=None, past_key_values=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

PARAMETER DESCRIPTION
labels

Labels for computing the masked language modeling loss. Indices should either be in [0, ..., config.vocab_size] or -100 (see input_ids docstring). Tokens with indices set to -100 are ignored (masked), the loss is only computed for the tokens with labels in [0, ..., config.vocab_size].

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional* DEFAULT: None

RETURNS DESCRIPTION
Union[Tuple, CausalLMOutputWithPast]

Union[Tuple, CausalLMOutputWithPast]

Example
>>> from transformers import AutoTokenizer, InternLMForCausalLM
>>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
>>> prompt = "Hey, are you consciours? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1169-1248
def forward(
        self,
        input_ids: Tensor = None,
        attention_mask: Optional[Tensor] = None,
        position_ids: Optional[Tensor] = None,
        past_key_values: Optional[List[Tensor]] = None,
        inputs_embeds: Optional[Tensor] = None,
        labels: Optional[Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Args:
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

    Returns:
        Union[Tuple, CausalLMOutputWithPast]

    Example:
        ```python
        >>> from transformers import AutoTokenizer, InternLMForCausalLM
        >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
        >>> prompt = "Hey, are you consciours? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
        ```
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    hidden_states = outputs[0]
    logits = self.lm_head(hidden_states)

    loss = None
    if labels is not None:
        # Shift so that tokens < n predict n
        shift_logits = logits[..., :-1, :]
        shift_labels = labels[..., 1:]
        # Flatten the tokens
        shift_logits = shift_logits.view(-1, self.config.vocab_size)
        shift_labels = shift_labels.view(-1)
        # Compute the token-level cross-entropy over the shifted sequence
        loss = ops.cross_entropy(shift_logits, shift_labels)

    if not return_dict:
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
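
The shift in the loss computation above aligns each position's logits with the next token. A small NumPy illustration of the indexing (the token ids are made up):

```python
import numpy as np

labels = np.array([[11, 12, 13, 14]])
shift_labels = labels[..., 1:]   # [[12, 13, 14]]
# logits[..., :-1, :] covers positions 0..2, so position 0 is scored against token 12,
# position 1 against token 13, and position 2 against token 14.
print(shift_labels)
```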

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForCausalLM.get_decoder()

Method to retrieve the decoder from the InternLMForCausalLM class.

PARAMETER DESCRIPTION
self

The instance of the InternLMForCausalLM class. This parameter is required to access the model within the class.

TYPE: object

RETURNS DESCRIPTION
InternLMModel

The decoder model associated with the InternLMForCausalLM instance.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1152-1167
def get_decoder(self):
    """
    Method to retrieve the decoder from the InternLMForCausalLM class.

    Args:
        self (object): The instance of the InternLMForCausalLM class.
            This parameter is required to access the model within the class.

    Returns:
        InternLMModel:
            The decoder model associated with the InternLMForCausalLM instance.

    Raises:
        None.
    """
    return self.model

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForCausalLM.get_input_embeddings()

Retrieve the input embeddings from the InternLMForCausalLM model.

PARAMETER DESCRIPTION
self

An instance of the InternLMForCausalLM class.

RETURNS DESCRIPTION

The model's `embed_tokens` embedding layer.

This method is used to obtain the input embeddings from the model. The input embeddings are representations of the input tokens that the model uses to process the text. The embeddings capture the semantic meaning and contextual information of the tokens, which is crucial for the model's performance.

Note

The 'embed_tokens' attribute of the 'self.model' object contains the input embeddings. This attribute should be accessed to retrieve the embeddings.

Example
>>> model = InternLMForCausalLM()
>>> embeddings = model.get_input_embeddings()
Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1055-1083
def get_input_embeddings(self):
    """
    Retrieve the input embeddings from the InternLMForCausalLM model.

    Args:
        self: An instance of the InternLMForCausalLM class.

    Returns:
        nn.Embedding: The model's `embed_tokens` input embedding layer.

    Raises:
        None.

    This method is used to obtain the input embeddings from the model.
    The input embeddings are representations of the input tokens that the model uses to process the text.
    The embeddings capture the semantic meaning and contextual information of the tokens, which is crucial for
    the model's performance.

    Note:
        The 'embed_tokens' attribute of the 'self.model' object contains the input embeddings.
        This attribute should be accessed to retrieve the embeddings.

    Example:
        ```python
        >>> model = InternLMForCausalLM()
        >>> embeddings = model.get_input_embeddings()
        ```
    """
    return self.model.embed_tokens

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForCausalLM.get_output_embeddings()

Returns the output embeddings of the InternLMForCausalLM model.

PARAMETER DESCRIPTION
self

The instance of the InternLMForCausalLM class.

RETURNS DESCRIPTION

The output embeddings of the model, represented by the 'lm_head' attribute.

Note

The output embeddings are typically used to map the model's hidden state to a specific output vocabulary. These embeddings can be used for downstream tasks such as text generation or classification.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1101-1118
def get_output_embeddings(self):
    """
    Returns the output embeddings of the InternLMForCausalLM model.

    Args:
        self: The instance of the InternLMForCausalLM class.

    Returns:
        The output embeddings of the model, represented by the 'lm_head' attribute.

    Raises:
        None.

    Note:
        The output embeddings are typically used to map the model's hidden state to a specific output vocabulary.
        These embeddings can be used for downstream tasks such as text generation or classification.
    """
    return self.lm_head

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForCausalLM.prepare_inputs_for_generation(input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs)

Prepare inputs for generation.

This method prepares the inputs for the generation process in the InternLMForCausalLM class.

PARAMETER DESCRIPTION
self

The instance of the InternLMForCausalLM class.

TYPE: InternLMForCausalLM

input_ids

The input tensor containing the tokenized input sequence.

TYPE: Tensor

past_key_values

The tensor of past key values for generation. Default is None.

TYPE: Optional[Tensor] DEFAULT: None

attention_mask

The attention mask tensor. Default is None.

TYPE: Optional[Tensor] DEFAULT: None

inputs_embeds

The tensor of embedded inputs. Default is None.

TYPE: Optional[Tensor] DEFAULT: None

RETURNS DESCRIPTION
model_inputs

A dictionary containing the prepared model inputs for generation. It can have the following keys:

  • 'inputs_embeds' (mindspore.Tensor): The tensor of embedded inputs, if provided.
  • 'input_ids' (mindspore.Tensor): The tensor of tokenized input sequence.
  • 'position_ids' (mindspore.Tensor): The tensor of position IDs.
  • 'past_key_values' (mindspore.Tensor): The tensor of past key values for generation.
  • 'use_cache' (bool): A flag indicating whether to use cache or not.
  • 'attention_mask' (mindspore.Tensor): The attention mask tensor.

TYPE: dict

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1250-1304
def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
    """
    Prepare inputs for generation.

    This method prepares the inputs for the generation process in the InternLMForCausalLM class.

    Args:
        self (InternLMForCausalLM): The instance of the InternLMForCausalLM class.
        input_ids (mindspore.Tensor): The input tensor containing the tokenized input sequence.
        past_key_values (Optional[mindspore.Tensor]): The tensor of past key values for generation. Default is None.
        attention_mask (Optional[mindspore.Tensor]): The attention mask tensor. Default is None.
        inputs_embeds (Optional[mindspore.Tensor]): The tensor of embedded inputs. Default is None.

    Returns:
        model_inputs (dict): A dictionary containing the prepared model inputs for generation.
            It can have the following keys:

            - 'inputs_embeds' (mindspore.Tensor): The tensor of embedded inputs, if provided.
            - 'input_ids' (mindspore.Tensor): The tensor of tokenized input sequence.
            - 'position_ids' (mindspore.Tensor): The tensor of position IDs.
            - 'past_key_values' (mindspore.Tensor): The tensor of past key values for generation.
            - 'use_cache' (bool): A flag indicating whether to use cache or not.
            - 'attention_mask' (mindspore.Tensor): The attention mask tensor.

    Raises:
        None.
    """
    if past_key_values:
        input_ids = input_ids[:, -1:]

    position_ids = kwargs.get("position_ids", None)
    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids = position_ids.masked_fill(attention_mask == 0, 1)
        if past_key_values:
            position_ids = position_ids[:, -1].unsqueeze(-1)

    # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
    if inputs_embeds is not None and past_key_values is None:
        model_inputs = {"inputs_embeds": inputs_embeds}
    else:
        model_inputs = {"input_ids": input_ids}

    model_inputs.update(
        {
            "position_ids": position_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "attention_mask": attention_mask,
        }
    )
    return model_inputs
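
The on-the-fly `position_ids` above mirror the `cumsum`/`masked_fill` calls in the method body; the arithmetic can be illustrated with plain NumPy (values are illustrative, left padding assumed):

```python
import numpy as np

attention_mask = np.array([[0, 0, 1, 1, 1],    # left-padded sequence
                           [1, 1, 1, 1, 1]])   # full-length sequence
position_ids = attention_mask.cumsum(-1) - 1   # running count of real tokens, minus one
position_ids[attention_mask == 0] = 1          # padded slots get a harmless dummy value
print(position_ids)
# [[1 1 0 1 2]
#  [0 1 2 3 4]]
```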

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForCausalLM.set_decoder(decoder)

Sets the decoder for the InternLMForCausalLM class.

PARAMETER DESCRIPTION
self

The current instance of the InternLMForCausalLM class.

TYPE: InternLMForCausalLM

decoder

The decoder object to be set for the InternLMForCausalLM instance.

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1136-1150
def set_decoder(self, decoder):
    """
    Sets the decoder for the InternLMForCausalLM class.

    Args:
        self (InternLMForCausalLM): The current instance of the InternLMForCausalLM class.
        decoder: The decoder object to be set for the InternLMForCausalLM instance.

    Returns:
        None.

    Raises:
        None.
    """
    self.model = decoder

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForCausalLM.set_input_embeddings(value)

Sets the input embeddings for the InternLMForCausalLM model.

PARAMETER DESCRIPTION
self

An instance of the InternLMForCausalLM class.

TYPE: InternLMForCausalLM

value

The input embeddings to be set for the model.

TYPE: object

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1085-1099
def set_input_embeddings(self, value):
    '''
    Sets the input embeddings for the InternLMForCausalLM model.

    Args:
        self (InternLMForCausalLM): An instance of the InternLMForCausalLM class.
        value (object): The input embeddings to be set for the model.

    Returns:
        None.

    Raises:
        None.
    '''
    self.model.embed_tokens = value

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForCausalLM.set_output_embeddings(new_embeddings)

Sets the output embeddings for the InternLMForCausalLM model.

PARAMETER DESCRIPTION
self

The instance of the InternLMForCausalLM class.

TYPE: InternLMForCausalLM

new_embeddings

The new embeddings to be set for the output layer.

TYPE: Tensor

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
TypeError

If the new_embeddings parameter is not of type Tensor.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1120-1134
def set_output_embeddings(self, new_embeddings):
    """
    Sets the output embeddings for the InternLMForCausalLM model.

    Args:
        self (InternLMForCausalLM): The instance of the InternLMForCausalLM class.
        new_embeddings (Tensor): The new embeddings to be set for the output layer.

    Returns:
        None.

    Raises:
        TypeError: If the new_embeddings parameter is not of type Tensor.
    """
    self.lm_head = new_embeddings

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForSequenceClassification

Bases: InternLMPreTrainedModel

This class represents an InternLM model for sequence classification tasks. It is a subclass of the InternLMPreTrainedModel class.

The InternLMForSequenceClassification class is initialized with a configuration object, which includes the number of labels for the classification task. The model architecture consists of an InternLMModel and a score layer.

The class provides methods for getting and setting the input embeddings of the model. The get_input_embeddings method returns the embedded tokens of the model, while the set_input_embeddings method allows for setting new input embeddings.

The forward method is responsible for processing input data and generating classification outputs. It takes several optional parameters, including input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, and return_dict. The method returns either a tuple or a SequenceClassifierOutputWithPast object, depending on the value of the return_dict parameter.

If labels are provided, the method computes the sequence classification loss based on the configured problem type. The problem type can be 'regression', 'single_label_classification', or 'multi_label_classification', depending on the number of labels and the data type of the labels. The loss is computed using various loss functions, such as mean squared error (MSE) loss, cross-entropy loss, or binary cross-entropy with logits loss.

If the return_dict parameter is False, the method returns a tuple containing the pooled logits and other transformer outputs. If the loss is not None, it is included in the tuple. If the return_dict parameter is True, the method returns a SequenceClassifierOutputWithPast object, which includes the loss, pooled logits, past key values, hidden states, and attentions.

Note

The class assumes that the batch size is 1 or that a padding token ID is defined. If the batch size is greater than 1 and no padding token ID is defined, a ValueError is raised.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1328-1514
class InternLMForSequenceClassification(InternLMPreTrainedModel):

    """
    This class represents an InternLM model for sequence classification tasks.
    It is a subclass of the InternLMPreTrainedModel class.

    The InternLMForSequenceClassification class is initialized with a configuration object, which includes the number
    of labels for the classification task. The model architecture consists of an InternLMModel and a score layer.

    The class provides methods for getting and setting the input embeddings of the model.
    The get_input_embeddings method returns the embedded tokens of the model, while the set_input_embeddings method
    allows for setting new input embeddings.

    The forward method is responsible for processing input data and generating classification outputs.
    It takes several optional parameters, including input_ids, attention_mask, position_ids, past_key_values,
    inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, and return_dict.
    The method returns either a tuple or a SequenceClassifierOutputWithPast object, depending on the value of the
    return_dict parameter.

    If labels are provided, the method computes the sequence classification loss based on the configured problem type.
    The problem type can be 'regression', 'single_label_classification', or 'multi_label_classification',
    depending on the number of labels and the data type of the labels.
    The loss is computed using various loss functions, such as mean squared error (MSE) loss, cross-entropy loss, or
    binary cross-entropy with logits loss.

    If the return_dict parameter is False, the method returns a tuple containing the pooled logits and
    other transformer outputs. If the loss is not None, it is included in the tuple. If the return_dict parameter is
    True, the method returns a SequenceClassifierOutputWithPast object, which includes the loss, pooled logits,
    past key values, hidden states, and attentions.

    Note:
        The class assumes that the batch size is 1 or that a padding token ID is defined.
        If the batch size is greater than 1 and no padding token ID is defined, a ValueError is raised.

    """
    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]

    def __init__(self, config):
        """
        Initializes a new instance of the `InternLMForSequenceClassification` class.

        Args:
            self: The object itself.
            config: An instance of the `InternLMConfig` class containing the configuration parameters for the model.
                It includes the following attributes:

                - num_labels (int): The number of labels for classification.
                This value determines the size of the output layer.
                - hidden_size (int): The size of the hidden layers in the model.
                This value is used in the `nn.Linear` layer.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = InternLMModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        Retrieve the input embeddings from the InternLMForSequenceClassification model.

        Args:
            self: An instance of the InternLMForSequenceClassification class.

        Returns:
            nn.Embedding: The model's `embed_tokens` input embedding layer.

        Raises:
            None.

        This method retrieves the input embeddings from the model's embed_tokens attribute.
        The input embeddings are used as the input to the model for sequence classification tasks.
        The method does not modify the input embeddings or perform any additional processing.
        The retrieved input embeddings can be used for further analysis or visualization, if needed.
        """
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """
        This method is a part of the 'InternLMForSequenceClassification' class and is used to set the
        input embeddings for the model.

        Args:
            self (object): The instance of the class.
            value (object): The input embeddings value to be set for the model.
                It can be of any valid type that represents the input embeddings.

        Returns:
            None: This method does not return any value explicitly, but it sets the input embeddings for the model.

        Raises:
            None.
        """
        self.model.embed_tokens = value

    def forward(
        self,
        input_ids: mindspore.Tensor = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[List[mindspore.Tensor]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        Args:
            labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
                Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
                config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
                `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = ops.equal(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                sequence_lengths = sequence_lengths % input_ids.shape[-1]
            else:
                sequence_lengths = -1

        pooled_logits = logits[ops.arange(0, batch_size), sequence_lengths]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (mindspore.int64, mindspore.int32):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                if self.num_labels == 1:
                    loss = ops.mse_loss(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = ops.mse_loss(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss = ops.cross_entropy(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss = ops.binary_cross_entropy_with_logits(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
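
The pooling step in `forward` selects, for every sequence, the logits of its last non-padding token; when no padding token occurs, the `% input_ids.shape[-1]` wrap-around falls back to the final position. A NumPy sketch of the index arithmetic, assuming a hypothetical `pad_token_id = 0`:

```python
import numpy as np

pad_token_id = 0                               # assumed padding id for this sketch
input_ids = np.array([[5, 7, 9, 0, 0],         # 3 real tokens, then padding
                      [4, 6, 8, 3, 2]])        # no padding at all
first_pad = (input_ids == pad_token_id).astype(np.int32).argmax(-1) - 1
sequence_lengths = first_pad % input_ids.shape[-1]
print(sequence_lengths)                        # [2 4] -> index of the last real token per row
# pooled_logits = logits[np.arange(len(input_ids)), sequence_lengths]
```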

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForSequenceClassification.__init__(config)

Initializes a new instance of the InternLMForSequenceClassification class.

PARAMETER DESCRIPTION
self

The object itself.

config

An instance of the InternLMConfig class containing the configuration parameters for the model. It includes the following attributes:

  • num_labels (int): The number of labels for classification. This value determines the size of the output layer.
  • hidden_size (int): The size of the hidden layers in the model. This value is used in the nn.Linear layer.

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1365-1391
def __init__(self, config):
    """
    Initializes a new instance of the `InternLMForSequenceClassification` class.

    Args:
        self: The object itself.
        config: An instance of the `InternLMConfig` class containing the configuration parameters for the model.
            It includes the following attributes:

            - num_labels (int): The number of labels for classification.
            This value determines the size of the output layer.
            - hidden_size (int): The size of the hidden layers in the model.
            This value is used in the `nn.Linear` layer.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__(config)
    self.num_labels = config.num_labels
    self.model = InternLMModel(config)
    self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForSequenceClassification.forward(input_ids=None, attention_mask=None, position_ids=None, past_key_values=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

PARAMETER DESCRIPTION
labels

Labels for computing the sequence classification/regression loss. Indices should be in [0, ..., config.num_labels - 1]. If config.num_labels == 1 a regression loss is computed (Mean-Square loss), If config.num_labels > 1 a classification loss is computed (Cross-Entropy).

TYPE: `mindspore.Tensor` of shape `(batch_size,)`, *optional* DEFAULT: None

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1431-1514
def forward(
    self,
    input_ids: mindspore.Tensor = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    past_key_values: Optional[List[mindspore.Tensor]] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutputWithPast]:
    r"""
    Args:
        labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    transformer_outputs = self.model(
        input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = transformer_outputs[0]
    logits = self.score(hidden_states)

    if input_ids is not None:
        batch_size = input_ids.shape[0]
    else:
        batch_size = inputs_embeds.shape[0]

    if self.config.pad_token_id is None and batch_size != 1:
        raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
    if self.config.pad_token_id is None:
        sequence_lengths = -1
    else:
        if input_ids is not None:
            sequence_lengths = ops.equal(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
            sequence_lengths = sequence_lengths % input_ids.shape[-1]
        else:
            sequence_lengths = -1

    pooled_logits = logits[ops.arange(0, batch_size), sequence_lengths]

    loss = None
    if labels is not None:
        if self.config.problem_type is None:
            if self.num_labels == 1:
                self.config.problem_type = "regression"
            elif self.num_labels > 1 and labels.dtype in (mindspore.int64, mindspore.int32):
                self.config.problem_type = "single_label_classification"
            else:
                self.config.problem_type = "multi_label_classification"

        if self.config.problem_type == "regression":
            if self.num_labels == 1:
                loss = ops.mse_loss(pooled_logits.squeeze(), labels.squeeze())
            else:
                loss = ops.mse_loss(pooled_logits, labels)
        elif self.config.problem_type == "single_label_classification":
            loss = ops.cross_entropy(pooled_logits.view(-1, self.num_labels), labels.view(-1))
        elif self.config.problem_type == "multi_label_classification":
            loss = ops.binary_cross_entropy_with_logits(pooled_logits, labels)
    if not return_dict:
        output = (pooled_logits,) + transformer_outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return SequenceClassifierOutputWithPast(
        loss=loss,
        logits=pooled_logits,
        past_key_values=transformer_outputs.past_key_values,
        hidden_states=transformer_outputs.hidden_states,
        attentions=transformer_outputs.attentions,
    )
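
When `config.problem_type` is unset, the branch above infers it from `num_labels` and the label dtype before choosing the loss. A condensed sketch of that decision (the label values and counts are illustrative):

```python
import mindspore

num_labels = 3
labels = mindspore.Tensor([1], mindspore.int32)

if num_labels == 1:
    problem_type = "regression"                     # MSE on a single score
elif labels.dtype in (mindspore.int64, mindspore.int32):
    problem_type = "single_label_classification"    # cross-entropy over num_labels classes
else:
    problem_type = "multi_label_classification"     # BCE-with-logits, one sigmoid per label
print(problem_type)                                 # single_label_classification
```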

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForSequenceClassification.get_input_embeddings()

Retrieve the input embeddings from the InternLMForSequenceClassification model.

PARAMETER DESCRIPTION
self

An instance of the InternLMForSequenceClassification class.

RETURNS DESCRIPTION

The model's `embed_tokens` embedding layer.

This method retrieves the input embeddings from the model's embed_tokens attribute. The input embeddings are used as the input to the model for sequence classification tasks. The method does not modify the input embeddings or perform any additional processing. The retrieved input embeddings can be used for further analysis or visualization, if needed.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1393-1411
def get_input_embeddings(self):
    """
    Retrieve the input embeddings from the InternLMForSequenceClassification model.

    Args:
        self: An instance of the InternLMForSequenceClassification class.

    Returns:
        nn.Embedding: The model's `embed_tokens` input embedding layer.

    Raises:
        None.

    This method retrieves the input embeddings from the model's embed_tokens attribute.
    The input embeddings are used as the input to the model for sequence classification tasks.
    The method does not modify the input embeddings or perform any additional processing.
    The retrieved input embeddings can be used for further analysis or visualization, if needed.
    """
    return self.model.embed_tokens

mindnlp.transformers.models.internlm.modeling_internlm.InternLMForSequenceClassification.set_input_embeddings(value)

This method is a part of the 'InternLMForSequenceClassification' class and is used to set the input embeddings for the model.

PARAMETER DESCRIPTION
self

The instance of the class.

TYPE: object

value

The input embeddings value to be set for the model. It can be of any valid type that represents the input embeddings.

TYPE: object

RETURNS DESCRIPTION
None

This method does not return any value explicitly, but it sets the input embeddings for the model.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 1413-1429
def set_input_embeddings(self, value):
    """
    This method is a part of the 'InternLMForSequenceClassification' class and is used to set the
    input embeddings for the model.

    Args:
        self (object): The instance of the class.
        value (object): The input embeddings value to be set for the model.
            It can be of any valid type that represents the input embeddings.

    Returns:
        None: This method does not return any value explicitly, but it sets the input embeddings for the model.

    Raises:
        None.
    """
    self.model.embed_tokens = value

mindnlp.transformers.models.internlm.modeling_internlm.InternLMMLP

Bases: Module

MLP

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 303-350
class InternLMMLP(nn.Module):
    """
    MLP
    """
    def __init__(
            self,
            hidden_size: int,
            intermediate_size: int,
            hidden_act: str,
    ):
        """
        Initializes the InternLMMLP class.

        Args:
            self: The instance of the class.
            hidden_size (int): The size of the hidden layer in the neural network.
            intermediate_size (int): The size of the intermediate layer in the neural network.
            hidden_act (str): The activation function for the hidden layer.
                It should be one of the supported activation functions.

        Returns:
            None.

        Raises:
            TypeError: If the input parameters are not of the expected types.
            ValueError: If the hidden_act parameter does not correspond to a supported activation function.
        """
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.act_fn = ACT2FN[hidden_act]

    def forward(self, x):
        """
        Constructs the output of the InternLMMLP model.

        Args:
            self (InternLMMLP): An instance of the InternLMMLP class.
            x (Tensor): The input hidden states; the last dimension must equal `hidden_size`.

        Returns:
            Tensor: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, with the same hidden size as the input.

        Raises:
            None.
        """
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

mindnlp.transformers.models.internlm.modeling_internlm.InternLMMLP.__init__(hidden_size, intermediate_size, hidden_act)

Initializes the InternLMMLP class.

PARAMETER DESCRIPTION
self

The instance of the class.

hidden_size

The size of the hidden layer in the neural network.

TYPE: int

intermediate_size

The size of the intermediate layer in the neural network.

TYPE: int

hidden_act

The activation function for the hidden layer. It should be one of the supported activation functions.

TYPE: str

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
TypeError

If the input parameters are not of the expected types.

ValueError

If the hidden_act parameter does not correspond to a supported activation function.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 307-334
def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
):
    """
    Initializes the InternLMMLP class.

    Args:
        self: The instance of the class.
        hidden_size (int): The size of the hidden layer in the neural network.
        intermediate_size (int): The size of the intermediate layer in the neural network.
        hidden_act (str): The activation function for the hidden layer.
            It should be one of the supported activation functions.

    Returns:
        None.

    Raises:
        TypeError: If the input parameters are not of the expected types.
        ValueError: If the hidden_act parameter does not correspond to a supported activation function.
    """
    super().__init__()
    self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
    self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
    self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
    self.act_fn = ACT2FN[hidden_act]

mindnlp.transformers.models.internlm.modeling_internlm.InternLMMLP.forward(x)

Constructs the output of the InternLMMLP model.

PARAMETER DESCRIPTION
self

An instance of the InternLMMLP class.

TYPE: InternLMMLP

x

The input hidden states; the last dimension must equal `hidden_size`.

RETURNS DESCRIPTION

The output tensor `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, with the same hidden size as the input.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 336-350
def forward(self, x):
    """
    Constructs the output of the InternLMMLP model.

    Args:
        self (InternLMMLP): An instance of the InternLMMLP class.
        x (Tensor): The input hidden states; the last dimension must equal `hidden_size`.

    Returns:
        Tensor: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, with the same hidden size as the input.

    Raises:
        None.
    """
    return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
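
A minimal usage sketch of the gated MLP, assuming small illustrative sizes and the `silu` activation (any key of `ACT2FN` should work):

```python
import mindspore
from mindspore import ops
from mindnlp.transformers.models.internlm.modeling_internlm import InternLMMLP

mlp = InternLMMLP(hidden_size=8, intermediate_size=16, hidden_act="silu")
x = ops.ones((2, 4, 8), mindspore.float32)   # (batch, seq, hidden)
y = mlp(x)                                   # down_proj(act_fn(gate_proj(x)) * up_proj(x))
print(y.shape)                               # (2, 4, 8)
```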

mindnlp.transformers.models.internlm.modeling_internlm.InternLMModel

Bases: InternLMPreTrainedModel

Transformer decoder consisting of config.num_hidden_layers layers. Each layer is a [DecoderLayer]

PARAMETER DESCRIPTION
config

InternLMConfig

TYPE: InternLMConfig

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 733-973
class InternLMModel(InternLMPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DecoderLayer`]

    Args:
        config: InternLMConfig
    """
    _auto_class = "AutoModel"

    def __init__(self, config: InternLMConfig):
        """
        Args:
            self (object): The instance of the InternLMModel class.
            config (InternLMConfig):
                An instance of the InternLMConfig class containing the configuration for the language model.
                It specifies the model's parameters such as vocabulary size, hidden size, number of hidden layers, etc.
                The config parameter is required and must be an instance of the InternLMConfig class.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.config = config

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)

        self.layers = nn.ModuleList([InternLMDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # Initialize weights and apply final processing
        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self):
        """
        This method retrieves the input embeddings from the InternLMModel.

        Args:
            self (InternLMModel): The instance of the InternLMModel class.

        Returns:
            embed_tokens (nn.Embedding): The input embedding layer of the model.

        Raises:
            None.
        """
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """
        Sets the input embeddings for the InternLMModel.

        Args:
            self (object): The instance of the InternLMModel class.
            value (object): The input embeddings value to be set for the model.
                It should be an object of appropriate type.

        Returns:
            None.

        Raises:
            None
        """
        self.embed_tokens = value

    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
        """
        This method prepares the decoder attention mask for the InternLMModel.

        Args:
            self (object): The instance of the InternLMModel.
            attention_mask (mindspore.Tensor): The attention mask to be prepared.
                It should have the same shape as input_shape.
            input_shape (tuple): The shape of the input tensor.
            inputs_embeds (mindspore.Tensor): The input embeddings tensor.
            past_key_values_length (int): The length of past key values.

        Returns:
            combined_attention_mask (mindspore.Tensor): The combined attention mask prepared for the decoder.
                Returns None if the input_shape[-1] is less than or equal to 1.

        Raises:
            ValueError: If input_shape[-1] is less than or equal to 0.
            TypeError: If any of the input parameters are of incorrect type.
        """
        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape,
                inputs_embeds.dtype,
                past_key_values_length=past_key_values_length,
            )

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
            combined_attention_mask = (
                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    def forward(
            self,
            input_ids: Tensor = None,
            attention_mask: Optional[Tensor] = None,
            position_ids: Optional[Tensor] = None,
            past_key_values: Optional[List[Tensor]] = None,
            inputs_embeds: Optional[Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        '''
        Constructs the internal language model for the model.

        Args:
            self (object): The instance of the class.
            input_ids (Tensor, optional): The input tensor of token indices. Defaults to None.
            attention_mask (Tensor, optional): The mask tensor to avoid attention on padding tokens. Defaults to None.
            position_ids (Tensor, optional): The tensor of token positions. Defaults to None.
            past_key_values (List[Tensor], optional): List of tensors containing past key values. Defaults to None.
            inputs_embeds (Tensor, optional): The input embeddings tensor. Defaults to None.
            use_cache (bool, optional): Flag to use caching. Defaults to None.
            output_attentions (bool, optional): Flag to output attentions. Defaults to None.
            output_hidden_states (bool, optional): Flag to output hidden states. Defaults to None.
            return_dict (bool, optional): Flag to return a dictionary. Defaults to None.

        Returns:
            Union[Tuple, BaseModelOutputWithPast]: The output as a tuple or an instance of BaseModelOutputWithPast.

        Raises:
            ValueError: If both input_ids and inputs_embeds are specified, or if neither of them is specified.
            Warning: If `use_cache=True` is incompatible with gradient checkpointing.
        '''
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        #if self.config.attn_implementation == "flash_attention_2":
            #_import_flash_attn()
        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        if input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        seq_length_with_past = seq_length
        past_key_values_length = 0

        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            position_ids = ops.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=mindspore.int64
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        # embed positions
        if attention_mask is None:
            attention_mask = ops.ones(
                (batch_size, seq_length_with_past), dtype=mindspore.bool_
            )
        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
        )

        hidden_states = inputs_embeds
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False
        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            # TODO: how checkpoint
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
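
The `_prepare_decoder_attention_mask` step combines a causal mask with the expanded padding mask in additive form before it reaches the attention scores. A NumPy illustration of the idea for a short sequence with one padded position (the `-1e9` fill value stands in for the dtype minimum used by the real implementation):

```python
import numpy as np

seq_len = 4
neg_inf = -1e9                                                   # stand-in for the dtype minimum
causal = np.triu(np.full((seq_len, seq_len), neg_inf), k=1)      # block attention to future tokens
attention_mask = np.array([0, 1, 1, 1])                          # position 0 is padding
padding = np.where(attention_mask == 0, neg_inf, 0.0)[None, :]   # broadcast over query positions
combined = causal + padding                                      # added to scores before softmax
print(combined.shape)                                            # (4, 4); batched shape is (bsz, 1, tgt, src)
```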

mindnlp.transformers.models.internlm.modeling_internlm.InternLMModel.__init__(config)

PARAMETER DESCRIPTION
self

The instance of the InternLMModel class.

TYPE: object

config

An instance of the InternLMConfig class containing the configuration for the language model. It specifies the model's parameters such as vocabulary size, hidden size, number of hidden layers, etc. The config parameter is required and must be an instance of the InternLMConfig class.

TYPE: InternLMConfig

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py, lines 742-769
def __init__(self, config: InternLMConfig):
    """
    Args:
        self (object): The instance of the InternLMModel class.
        config (InternLMConfig):
            An instance of the InternLMConfig class containing the configuration for the language model.
            It specifies the model's parameters such as vocabulary size, hidden size, number of hidden layers, etc.
            The config parameter is required and must be an instance of the InternLMConfig class.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__(config)
    self.padding_idx = config.pad_token_id
    self.vocab_size = config.vocab_size
    self.config = config

    self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)

    self.layers = nn.ModuleList([InternLMDecoderLayer(config) for _ in range(config.num_hidden_layers)])
    self.norm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    # Initialize weights and apply final processing
    self.gradient_checkpointing = False
    self.post_init()

mindnlp.transformers.models.internlm.modeling_internlm.InternLMModel.forward(input_ids=None, attention_mask=None, position_ids=None, past_key_values=None, inputs_embeds=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

Constructs the internal language model for the model.

PARAMETER DESCRIPTION
self

The instance of the class.

TYPE: object

input_ids

The input tensor of token indices. Defaults to None.

TYPE: Tensor DEFAULT: None

attention_mask

The mask tensor to avoid attention on padding tokens. Defaults to None.

TYPE: Tensor DEFAULT: None

position_ids

The tensor of token positions. Defaults to None.

TYPE: Tensor DEFAULT: None

past_key_values

List of tensors containing past key values. Defaults to None.

TYPE: List[Tensor] DEFAULT: None

inputs_embeds

The input embeddings tensor. Defaults to None.

TYPE: Tensor DEFAULT: None

use_cache

Flag to use caching. Defaults to None.

TYPE: bool DEFAULT: None

output_attentions

Flag to output attentions. Defaults to None.

TYPE: bool DEFAULT: None

output_hidden_states

Flag to output hidden states. Defaults to None.

TYPE: bool DEFAULT: None

return_dict

Flag to return a dictionary. Defaults to None.

TYPE: bool DEFAULT: None

RETURNS DESCRIPTION
Union[Tuple, BaseModelOutputWithPast]

Union[Tuple, BaseModelOutputWithPast]: The output as a tuple or an instance of BaseModelOutputWithPast.

RAISES DESCRIPTION
ValueError

If both input_ids and inputs_embeds are specified, or if neither of them is specified.

Warning

If use_cache=True is incompatible with gradient checkpointing.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
def forward(
        self,
        input_ids: Tensor = None,
        attention_mask: Optional[Tensor] = None,
        position_ids: Optional[Tensor] = None,
        past_key_values: Optional[List[Tensor]] = None,
        inputs_embeds: Optional[Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    '''
    Constructs the internal language model for the model.

    Args:
        self (object): The instance of the class.
        input_ids (Tensor, optional): The input tensor of token indices. Defaults to None.
        attention_mask (Tensor, optional): The mask tensor to avoid attention on padding tokens. Defaults to None.
        position_ids (Tensor, optional): The tensor of token positions. Defaults to None.
        past_key_values (List[Tensor], optional): List of tensors containing past key values. Defaults to None.
        inputs_embeds (Tensor, optional): The input embeddings tensor. Defaults to None.
        use_cache (bool, optional): Flag to use caching. Defaults to None.
        output_attentions (bool, optional): Flag to output attentions. Defaults to None.
        output_hidden_states (bool, optional): Flag to output hidden states. Defaults to None.
        return_dict (bool, optional): Flag to return a dictionary. Defaults to None.

    Returns:
        Union[Tuple, BaseModelOutputWithPast]: The output as a tuple or an instance of BaseModelOutputWithPast.

    Raises:
        ValueError: If both input_ids and inputs_embeds are specified, or if neither of them is specified.
        Warning: If `use_cache=True` is incompatible with gradient checkpointing.
    '''
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    #if self.config.attn_implementation == "flash_attention_2":
        #_import_flash_attn()
    # retrieve input_ids and inputs_embeds
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
    if input_ids is not None:
        batch_size, seq_length = input_ids.shape
    elif inputs_embeds is not None:
        batch_size, seq_length, _ = inputs_embeds.shape
    else:
        raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

    seq_length_with_past = seq_length
    past_key_values_length = 0

    if past_key_values is not None:
        past_key_values_length = past_key_values[0][0].shape[2]
        seq_length_with_past = seq_length_with_past + past_key_values_length

    if position_ids is None:
        position_ids = ops.arange(
            past_key_values_length, seq_length + past_key_values_length, dtype=mindspore.int64
        )
        position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
    else:
        position_ids = position_ids.view(-1, seq_length).long()

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)
    # embed positions
    if attention_mask is None:
        attention_mask = ops.ones(
            (batch_size, seq_length_with_past), dtype=mindspore.bool_
        )
    attention_mask = self._prepare_decoder_attention_mask(
        attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
    )

    hidden_states = inputs_embeds
    if self.gradient_checkpointing and self.training:
        if use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False
    # decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = () if use_cache else None

    for idx, decoder_layer in enumerate(self.layers):
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        past_key_value = past_key_values[idx] if past_key_values is not None else None

        # TODO: how checkpoint
        layer_outputs = decoder_layer(
            hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )

        hidden_states = layer_outputs[0]

        if use_cache:
            next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

        if output_attentions:
            all_self_attns += (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # add hidden states from the last decoder layer
    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    next_cache = next_decoder_cache if use_cache else None
    if not return_dict:
        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=next_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
    )

mindnlp.transformers.models.internlm.modeling_internlm.InternLMModel.get_input_embeddings()

This method retrieves the input embeddings from the InternLMModel.

PARAMETER DESCRIPTION
self

The instance of the InternLMModel class.

TYPE: InternLMModel

RETURNS DESCRIPTION
embed_tokens

This method returns the input embeddings from the InternLMModel.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
771
772
773
774
775
776
777
778
779
780
781
782
783
784
def get_input_embeddings(self):
    """
    This method retrieves the input embeddings from the InternLMModel.

    Args:
        self (InternLMModel): The instance of the InternLMModel class.

    Returns:
        embed_tokens: This method returns the input embeddings from the InternLMModel.

    Raises:
        None.
    """
    return self.embed_tokens

mindnlp.transformers.models.internlm.modeling_internlm.InternLMModel.set_input_embeddings(value)

Sets the input embeddings for the InternLMModel.

PARAMETER DESCRIPTION
self

The instance of the InternLMModel class.

TYPE: object

value

The input embeddings value to be set for the model. It should be an object of appropriate type.

TYPE: object

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
def set_input_embeddings(self, value):
    """
    Sets the input embeddings for the InternLMModel.

    Args:
        self (object): The instance of the InternLMModel class.
        value (object): The input embeddings value to be set for the model.
            It should be an object of appropriate type.

    Returns:
        None.

    Raises:
        None
    """
    self.embed_tokens = value

mindnlp.transformers.models.internlm.modeling_internlm.InternLMPreTrainedModel

Bases: PreTrainedModel

The 'InternLMPreTrainedModel' class represents a pre-trained language model for internal use. It inherits from the 'PreTrainedModel' class and includes methods for initializing weights and setting gradient checkpointing.

ATTRIBUTE DESCRIPTION
config

The configuration for the pre-trained model.

METHOD DESCRIPTION
_init_weights

Initializes the weights for the specified cell using the specified initializer range.

_set_gradient_checkpointing

Sets the gradient checkpointing for the specified module to the specified value.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
class InternLMPreTrainedModel(PreTrainedModel):

    """
    The 'InternLMPreTrainedModel' class represents a pre-trained language model for internal use.
    It inherits from the 'PreTrainedModel' class and includes methods for initializing weights and setting gradient
    checkpointing.

    Attributes:
        config: The configuration for the pre-trained model.

    Methods:
        _init_weights: Initializes the weights for the specified cell using the specified initializer range.
        _set_gradient_checkpointing: Sets the gradient checkpointing for the specified module to the specified value.

    """
    config_class = InternLMConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["InternLMDecoderLayer"]
    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
    #_skip_keys_device_placement = "past_key_values"

    def _init_weights(self, cell):
        """
        Initializes the weights of a given cell.

        Args:
            self (InternLMPreTrainedModel): The instance of the InternLMPreTrainedModel class.
            cell (nn.Module): The cell for which the weights need to be initialized.

        Returns:
            None: This method modifies the weights of the given cell in-place.

        Raises:
            None.

        This method initializes the weights of the given cell based on the configuration specified in `self.config`.
        It supports two types of cells: `nn.Linear` and `nn.Embedding`.

        For `nn.Linear` cells, the weights are initialized using a normal distribution with mean 0 and standard deviation
        `self.config.initializer_range`.
        The weights are set using the `set_data` method of the `weight` attribute of the cell.
        If the cell has a bias attribute (`cell.bias`), it is initialized with zeros using the `set_data` method as well.

        For `nn.Embedding` cells, the weights are initialized using a normal distribution with mean 0 and
        standard deviation `self.config.initializer_range`. The weights are randomly sampled using the `np.random.normal`
        function and set using the `set_data` method of the `weight` attribute of the cell.
        If the cell has a `padding_idx` attribute (`cell.padding_idx`), the weight at that index is set to 0.

        Note:
            This method modifies the weights of the cell in-place and does not return any value.
        """
        std = self.config.initializer_range
        if isinstance(cell, nn.Linear):
            cell.weight.set_data(initializer(Normal(
                sigma=std, mean=0.0), cell.weight.shape, cell.weight.dtype))
            if cell.bias is not None:
                cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype))
        elif isinstance(cell, nn.Embedding):
            weight = np.random.normal(0.0, std, cell.weight.shape)
            if cell.padding_idx is not None:
                weight[cell.padding_idx] = 0

            cell.weight.set_data(Tensor(weight, cell.weight.dtype))

    def _set_gradient_checkpointing(self, module, value=False):
        """
        Sets the gradient checkpointing attribute of a given module.

        Args:
            self (InternLMPreTrainedModel): The instance of the InternLMPreTrainedModel class.
            module: The module for which the gradient checkpointing attribute needs to be set.
                Should be an instance of the InternLMModel class.
            value (bool): The value to set for the gradient checkpointing attribute. Default is False.

        Returns:
            None.

        Raises:
            None.

        This method sets the gradient_checkpointing attribute of the specified module to the given value.
        The gradient checkpointing attribute determines whether to enable gradient checkpointing during training.
        Gradient checkpointing is a technique used to reduce memory consumption during backward pass by trading off computation time.
        If the module is an instance of the InternLMModel class, the gradient checkpointing attribute is set to the specified value.
        """
        if isinstance(module, InternLMModel):
            module.gradient_checkpointing = value

mindnlp.transformers.models.internlm.modeling_internlm.InternLMRMSNorm

Bases: Module

RMSNorm

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
class InternLMRMSNorm(nn.Module):
    """
    RMSNorm
    """
    def __init__(self, hidden_size, eps=1e-6):
        """
        RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = Parameter(ops.ones(hidden_size), 'weight')
        self.variance_epsilon = epsilon

    def forward(self, hidden_states):
        """
        Constructs the RMS normalization of hidden states.

        Args:
            self (InternLMRMSNorm): An instance of the InternLMRMSNorm class.
            hidden_states (Tensor): Tensor holding the hidden states. Should be of type mindspore.Tensor.

        Returns:
            None: This method modifies the input hidden states in-place.

        Raises:
            ValueError: If the input hidden_states are not of type mindspore.Tensor.
            TypeError: If the weight dtype is not mindspore.float16 or mindspore.bfloat16.
        """
        variance = hidden_states.to(mindspore.float32).pow(2).mean(-1, keep_dims=True)
        hidden_states = hidden_states * ops.rsqrt(variance + self.variance_epsilon)

        # convert into half-precision if necessary
        if self.weight.dtype in [mindspore.float16, mindspore.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states

mindnlp.transformers.models.internlm.modeling_internlm.InternLMRMSNorm.__init__(hidden_size, eps=1e-06)

RMSNorm is equivalent to T5LayerNorm

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
86
87
88
89
90
91
92
def __init__(self, hidden_size, eps=1e-6):
    """
    RMSNorm is equivalent to T5LayerNorm
    """
    super().__init__()
    self.weight = Parameter(ops.ones(hidden_size), 'weight')
    self.variance_epsilon = epsilon

mindnlp.transformers.models.internlm.modeling_internlm.InternLMRMSNorm.forward(hidden_states)

Constructs the RMS normalization of hidden states.

PARAMETER DESCRIPTION
self

An instance of the InternLMRMSNorm class.

TYPE: InternLMRMSNorm

hidden_states

Tensor holding the hidden states. Should be of type mindspore.Tensor.

TYPE: Tensor

RETURNS DESCRIPTION
None

This method modifies the input hidden states in-place.

RAISES DESCRIPTION
ValueError

If the input hidden_states are not of type mindspore.Tensor.

TypeError

If the weight dtype is not mindspore.float16 or mindspore.bfloat16.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def forward(self, hidden_states):
    """
    Constructs the RMS normalization of hidden states.

    Args:
        self (InternLMRMSNorm): An instance of the InternLMRMSNorm class.
        hidden_states (Tensor): Tensor holding the hidden states. Should be of type mindspore.Tensor.

    Returns:
        None: This method modifies the input hidden states in-place.

    Raises:
        ValueError: If the input hidden_states are not of type mindspore.Tensor.
        TypeError: If the weight dtype is not mindspore.float16 or mindspore.bfloat16.
    """
    variance = hidden_states.to(mindspore.float32).pow(2).mean(-1, keep_dims=True)
    hidden_states = hidden_states * ops.rsqrt(variance + self.variance_epsilon)

    # convert into half-precision if necessary
    if self.weight.dtype in [mindspore.float16, mindspore.bfloat16]:
        hidden_states = hidden_states.to(self.weight.dtype)

    return self.weight * hidden_states

mindnlp.transformers.models.internlm.modeling_internlm.InternLMRotaryEmbedding

Bases: Module

RotaryEmbedding

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
class InternLMRotaryEmbedding(nn.Module):
    """
    RotaryEmbedding
    """
    def __init__(self, dim, max_position_embeddings=2048, base=10000):
        """
        __init__ method in the InternLMRotaryEmbedding class.

        Args:
            self: The instance of the class.
            dim (int): The dimension of the input embeddings.
            max_position_embeddings (int, optional): The maximum position embeddings. Defaults to 2048.
            base (int, optional): The base value for calculations. Defaults to 10000.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__()
        self.inv_freq = 1.0 / (base ** (ops.arange(0, dim, 2).float() / dim))

        self.max_seq_len_cached = max_position_embeddings
        t = ops.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype)
        freqs = ops.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper,
        # but it uses a different permutation in order to obtain the same calculation
        emb = ops.cat((freqs, freqs), axis=-1)
        self.cos_cached = emb.cos()[None, None, :, :]
        self.sin_cached = emb.sin()[None, None, :, :]

    def forward(self, x, seq_len=None):
        '''
        This method forwards the rotary embeddings for the input sequence.

        Args:
            self (InternLMRotaryEmbedding): The instance of the InternLMRotaryEmbedding class.
            x:
                The input tensor for which the rotary embeddings are to be forwarded.

                - Type: tensor
                - Purpose: This parameter represents the input tensor for which the rotary embeddings are to be forwarded.
            seq_len (int, optional):
                The length of the input sequence.

                - Type: int
                - Purpose: This parameter represents the length of the input sequence for which the rotary embeddings
                    are to be forwarded. If not provided, it defaults to None.

        Returns:
            None.

        Raises:
            ValueError: If seq_len is greater than the maximum sequence length cached.
            TypeError: If the input parameters are not of the expected types.
        '''
        # x: [bs, num_attention_heads, seq_len, head_size]
        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
        if seq_len > self.max_seq_len_cached:
            self.max_seq_len_cached = seq_len
            t = ops.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype)
            freqs = ops.einsum("i,j->ij", t, self.inv_freq)
            # Different from paper, but it uses a different permutation in order to obtain the same calculation
            emb = ops.cat((freqs, freqs), axis=-1)
            self.cos_cached = emb.cos()[None, None, :, :]
            self.sin_cached = emb.sin()[None, None, :, :]
        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )

mindnlp.transformers.models.internlm.modeling_internlm.InternLMRotaryEmbedding.__init__(dim, max_position_embeddings=2048, base=10000)

init method in the InternLMRotaryEmbedding class.

PARAMETER DESCRIPTION
self

The instance of the class.

dim

The dimension of the input embeddings.

TYPE: int

max_position_embeddings

The maximum position embeddings. Defaults to 2048.

TYPE: int DEFAULT: 2048

base

The base value for calculations. Defaults to 10000.

TYPE: int DEFAULT: 10000

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def __init__(self, dim, max_position_embeddings=2048, base=10000):
    """
    __init__ method in the InternLMRotaryEmbedding class.

    Args:
        self: The instance of the class.
        dim (int): The dimension of the input embeddings.
        max_position_embeddings (int, optional): The maximum position embeddings. Defaults to 2048.
        base (int, optional): The base value for calculations. Defaults to 10000.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__()
    self.inv_freq = 1.0 / (base ** (ops.arange(0, dim, 2).float() / dim))

    self.max_seq_len_cached = max_position_embeddings
    t = ops.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype)
    freqs = ops.einsum("i,j->ij", t, self.inv_freq)
    # Different from paper,
    # but it uses a different permutation in order to obtain the same calculation
    emb = ops.cat((freqs, freqs), axis=-1)
    self.cos_cached = emb.cos()[None, None, :, :]
    self.sin_cached = emb.sin()[None, None, :, :]

mindnlp.transformers.models.internlm.modeling_internlm.InternLMRotaryEmbedding.forward(x, seq_len=None)

This method forwards the rotary embeddings for the input sequence.

PARAMETER DESCRIPTION
self

The instance of the InternLMRotaryEmbedding class.

TYPE: InternLMRotaryEmbedding

x

The input tensor for which the rotary embeddings are to be forwarded.

  • Type: tensor
  • Purpose: This parameter represents the input tensor for which the rotary embeddings are to be forwarded.

seq_len

The length of the input sequence.

  • Type: int
  • Purpose: This parameter represents the length of the input sequence for which the rotary embeddings are to be forwarded. If not provided, it defaults to None.

TYPE: int DEFAULT: None

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If seq_len is greater than the maximum sequence length cached.

TypeError

If the input parameters are not of the expected types.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
def forward(self, x, seq_len=None):
    '''
    This method forwards the rotary embeddings for the input sequence.

    Args:
        self (InternLMRotaryEmbedding): The instance of the InternLMRotaryEmbedding class.
        x:
            The input tensor for which the rotary embeddings are to be forwarded.

            - Type: tensor
            - Purpose: This parameter represents the input tensor for which the rotary embeddings are to be forwarded.
        seq_len (int, optional):
            The length of the input sequence.

            - Type: int
            - Purpose: This parameter represents the length of the input sequence for which the rotary embeddings
                are to be forwarded. If not provided, it defaults to None.

    Returns:
        None.

    Raises:
        ValueError: If seq_len is greater than the maximum sequence length cached.
        TypeError: If the input parameters are not of the expected types.
    '''
    # x: [bs, num_attention_heads, seq_len, head_size]
    # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
    if seq_len > self.max_seq_len_cached:
        self.max_seq_len_cached = seq_len
        t = ops.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype)
        freqs = ops.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = ops.cat((freqs, freqs), axis=-1)
        self.cos_cached = emb.cos()[None, None, :, :]
        self.sin_cached = emb.sin()[None, None, :, :]
    return (
        self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
    )

mindnlp.transformers.models.internlm.modeling_internlm.apply_rotary_pos_emb(q, k, cos, sin, position_ids)

Apply rotary positional embeddings to input queries (q) and keys (k).

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
290
291
292
293
294
295
296
297
298
299
300
301
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """
    Apply rotary positional embeddings to input queries (q) and keys (k).
    """
    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

mindnlp.transformers.models.internlm.modeling_internlm.rotate_half(x)

Rotates half the hidden dims of the input.

Source code in mindnlp/transformers/models/internlm/modeling_internlm.py
284
285
286
287
288
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2:]
    return ops.cat((-x2, x1), axis=-1)

mindnlp.transformers.models.internlm.configuration_internlm

InternLM model configuration

mindnlp.transformers.models.internlm.configuration_internlm.InternLMConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [InternLMModel]. It is used to instantiate an InternLM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the InternLM-7B. Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

PARAMETER DESCRIPTION
vocab_size

Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [InternLMModel]

TYPE: `int`, *optional*, defaults to 32000 DEFAULT: 103168

hidden_size

Dimension of the hidden representations.

TYPE: `int`, *optional*, defaults to 4096 DEFAULT: 4096

intermediate_size

Dimension of the MLP representations.

TYPE: `int`, *optional*, defaults to 11008 DEFAULT: 11008

num_hidden_layers

Number of hidden layers in the Transformer encoder.

TYPE: `int`, *optional*, defaults to 32 DEFAULT: 32

num_attention_heads

Number of attention heads for each attention layer in the Transformer encoder.

TYPE: `int`, *optional*, defaults to 32 DEFAULT: 32

hidden_act

The non-linear activation function (function or string) in the decoder.

TYPE: `str` or `function`, *optional*, defaults to `"silu"` DEFAULT: 'silu'

max_position_embeddings

The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048).

TYPE: `int`, *optional*, defaults to 2048 DEFAULT: 2048

initializer_range

The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

TYPE: `float`, *optional*, defaults to 0.02 DEFAULT: 0.02

rms_norm_eps

The epsilon used by the rms normalization layers.

TYPE: `float`, *optional*, defaults to 1e-12 DEFAULT: 1e-06

use_cache

Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

tie_word_embeddings(`bool`,

Whether to tie weight embeddings

TYPE: *optional*, defaults to `False`

Source code in mindnlp/transformers/models/internlm/configuration_internlm.py
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
class InternLMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
    an InternLM model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the InternLM-7B.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`InternLMModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
    """
    model_type = "internlm"
    _auto_class = "AutoConfig"

    def __init__(  # pylint: disable=W0102
        self,
        vocab_size=103168,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        bias=True,
        rotary={"base": 10000, "type": "dynamic"},  # pylint: disable=W0102
        attn_implementation="eager",
        **kwargs,
    ):
        """
        This method initializes an instance of the InternLMConfig class with the provided configuration parameters.

        Args:
            vocab_size (int): The size of the vocabulary used in the language model.
            hidden_size (int): The size of the hidden layers in the model.
            intermediate_size (int): The size of the intermediate layers in the model.
            num_hidden_layers (int): The number of hidden layers in the model.
            num_attention_heads (int): The number of attention heads in the model.
            hidden_act (str): The activation function used in the hidden layers.
            max_position_embeddings (int): The maximum position index that can be used in the model.
            initializer_range (float): The range for weight initialization.
            rms_norm_eps (float): The epsilon value for RMS norm.
            use_cache (bool): Whether to use cache during model computation.
            pad_token_id (int): The token ID used for padding sequences.
            bos_token_id (int): The token ID used for the beginning of a sequence.
            eos_token_id (int): The token ID used for the end of a sequence.
            tie_word_embeddings (bool): Whether to tie the word embeddings.
            bias (bool): Whether to include bias in the model.
            rotary (dict): Dictionary with keys 'base' (int) and 'type' (str) defining rotary settings.
            attn_implementation (str): The implementation method for attention. If None, defaults to 'eager'.

        Returns:
            None.

        Raises:
            None
        """
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.bias = bias
        self.rotary = rotary
        self.attn_implementation = attn_implementation
        if self.attn_implementation is None:
            self.attn_implementation = "eager"
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

mindnlp.transformers.models.internlm.configuration_internlm.InternLMConfig.__init__(vocab_size=103168, hidden_size=4096, intermediate_size=11008, num_hidden_layers=32, num_attention_heads=32, hidden_act='silu', max_position_embeddings=2048, initializer_range=0.02, rms_norm_eps=1e-06, use_cache=True, pad_token_id=0, bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, bias=True, rotary={'base': 10000, 'type': 'dynamic'}, attn_implementation='eager', **kwargs)

This method initializes an instance of the InternLMConfig class with the provided configuration parameters.

PARAMETER DESCRIPTION
vocab_size

The size of the vocabulary used in the language model.

TYPE: int DEFAULT: 103168

hidden_size

The size of the hidden layers in the model.

TYPE: int DEFAULT: 4096

intermediate_size

The size of the intermediate layers in the model.

TYPE: int DEFAULT: 11008

num_hidden_layers

The number of hidden layers in the model.

TYPE: int DEFAULT: 32

num_attention_heads

The number of attention heads in the model.

TYPE: int DEFAULT: 32

hidden_act

The activation function used in the hidden layers.

TYPE: str DEFAULT: 'silu'

max_position_embeddings

The maximum position index that can be used in the model.

TYPE: int DEFAULT: 2048

initializer_range

The range for weight initialization.

TYPE: float DEFAULT: 0.02

rms_norm_eps

The epsilon value for RMS norm.

TYPE: float DEFAULT: 1e-06

use_cache

Whether to use cache during model computation.

TYPE: bool DEFAULT: True

pad_token_id

The token ID used for padding sequences.

TYPE: int DEFAULT: 0

bos_token_id

The token ID used for the beginning of a sequence.

TYPE: int DEFAULT: 1

eos_token_id

The token ID used for the end of a sequence.

TYPE: int DEFAULT: 2

tie_word_embeddings

Whether to tie the word embeddings.

TYPE: bool DEFAULT: False

bias

Whether to include bias in the model.

TYPE: bool DEFAULT: True

rotary

Dictionary with keys 'base' (int) and 'type' (str) defining rotary settings.

TYPE: dict DEFAULT: {'base': 10000, 'type': 'dynamic'}

attn_implementation

The implementation method for attention. If None, defaults to 'eager'.

TYPE: str DEFAULT: 'eager'

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/internlm/configuration_internlm.py
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def __init__(  # pylint: disable=W0102
    self,
    vocab_size=103168,
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    rms_norm_eps=1e-6,
    use_cache=True,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    tie_word_embeddings=False,
    bias=True,
    rotary={"base": 10000, "type": "dynamic"},  # pylint: disable=W0102
    attn_implementation="eager",
    **kwargs,
):
    """
    This method initializes an instance of the InternLMConfig class with the provided configuration parameters.

    Args:
        vocab_size (int): The size of the vocabulary used in the language model.
        hidden_size (int): The size of the hidden layers in the model.
        intermediate_size (int): The size of the intermediate layers in the model.
        num_hidden_layers (int): The number of hidden layers in the model.
        num_attention_heads (int): The number of attention heads in the model.
        hidden_act (str): The activation function used in the hidden layers.
        max_position_embeddings (int): The maximum position index that can be used in the model.
        initializer_range (float): The range for weight initialization.
        rms_norm_eps (float): The epsilon value for RMS norm.
        use_cache (bool): Whether to use cache during model computation.
        pad_token_id (int): The token ID used for padding sequences.
        bos_token_id (int): The token ID used for the beginning of a sequence.
        eos_token_id (int): The token ID used for the end of a sequence.
        tie_word_embeddings (bool): Whether to tie the word embeddings.
        bias (bool): Whether to include bias in the model.
        rotary (dict): Dictionary with keys 'base' (int) and 'type' (str) defining rotary settings.
        attn_implementation (str): The implementation method for attention. If None, defaults to 'eager'.

    Returns:
        None.

    Raises:
        None
    """
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.use_cache = use_cache
    self.bias = bias
    self.rotary = rotary
    self.attn_implementation = attn_implementation
    if self.attn_implementation is None:
        self.attn_implementation = "eager"
    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )

mindnlp.transformers.models.internlm.tokenization_internlm

tokenization internlm

mindnlp.transformers.models.internlm.tokenization_internlm.InternLMTokenizer

Bases: PreTrainedTokenizer

Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.

PARAMETER DESCRIPTION
vocab_file

Path to the vocabulary file.

TYPE: `str`

Source code in mindnlp/transformers/models/internlm/tokenization_internlm.py
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
class InternLMTokenizer(PreTrainedTokenizer):
    """
    Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        """
        Initializes an instance of the InternLMTokenizer class.

        Args:
            self: An instance of the InternLMTokenizer class.
            vocab_file (str): The path to the vocabulary file.
            unk_token (str, optional): The unknown token. Defaults to '<unk>'.
            bos_token (str, optional): The beginning of sentence token. Defaults to '<s>'.
            eos_token (str, optional): The end of sentence token. Defaults to '</s>'.
            pad_token (str, optional): The padding token. Defaults to None.
            sp_model_kwargs (Optional[Dict[str, Any]], optional):
                Additional keyword arguments for the SentencePieceProcessor. Defaults to None.
            add_bos_token (bool, optional): Whether to add the bos_token to the vocabulary. Defaults to True.
            add_eos_token (bool, optional): Whether to add the eos_token to the vocabulary. Defaults to False.
            clean_up_tokenization_spaces (bool, optional): Whether to clean up tokenization spaces. Defaults to False.
            **kwargs: Additional keyword arguments.

        Returns:
            None.

        Raises:
            None.
        """
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token

    def __getstate__(self):
        """
        Method '__getstate__' in the class 'InternLMTokenizer' is used to retrieve the state of the object
        for pickling or serialization purposes.

        Args:
            self: An instance of the 'InternLMTokenizer' class.

        Returns:
            None: This method does not explicitly return any value.
                However, it modifies the state of the object by setting the 'sp_model' attribute to None
                and returns the modified state as a dictionary.

        Raises:
            None.
        """
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        """
        Sets the state of the InternLMTokenizer object.

        Args:
            self (InternLMTokenizer): The instance of the InternLMTokenizer class.
            d (dict): The dictionary containing the state information to be set.
                The dictionary should have the '__dict__' attribute which stores the internal state of the object.

        Returns:
            None.

        Raises:
            None.
        """
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Method to build input tokens with special tokens for an internal language model tokenizer.

        Args:
            self (InternLMTokenizer): The instance of the InternLMTokenizer class.
            token_ids_0 (list): List of token IDs for the first input sequence.
            token_ids_1 (list, optional): List of token IDs for the second input sequence. Defaults to None.

        Returns:
            list: A list of token IDs with special tokens added at the beginning and end of each input sequence.

        Raises:
            None
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output

mindnlp.transformers.models.internlm.tokenization_internlm.InternLMTokenizer.vocab_size property

Returns vocab size

mindnlp.transformers.models.internlm.tokenization_internlm.InternLMTokenizer.__getstate__()

Method 'getstate' in the class 'InternLMTokenizer' is used to retrieve the state of the object for pickling or serialization purposes.

PARAMETER DESCRIPTION
self

An instance of the 'InternLMTokenizer' class.

RETURNS DESCRIPTION
None

This method does not explicitly return any value. However, it modifies the state of the object by setting the 'sp_model' attribute to None and returns the modified state as a dictionary.

Source code in mindnlp/transformers/models/internlm/tokenization_internlm.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def __getstate__(self):
    """
    Method '__getstate__' in the class 'InternLMTokenizer' is used to retrieve the state of the object
    for pickling or serialization purposes.

    Args:
        self: An instance of the 'InternLMTokenizer' class.

    Returns:
        None: This method does not explicitly return any value.
            However, it modifies the state of the object by setting the 'sp_model' attribute to None
            and returns the modified state as a dictionary.

    Raises:
        None.
    """
    state = self.__dict__.copy()
    state["sp_model"] = None
    return state

mindnlp.transformers.models.internlm.tokenization_internlm.InternLMTokenizer.__init__(vocab_file, unk_token='<unk>', bos_token='<s>', eos_token='</s>', pad_token=None, sp_model_kwargs=None, add_bos_token=True, add_eos_token=False, clean_up_tokenization_spaces=False, **kwargs)

Initializes an instance of the InternLMTokenizer class.

PARAMETER DESCRIPTION
self: An instance of the InternLMTokenizer class.
vocab_file (str): The path to the vocabulary file.
unk_token (str, DEFAULT: '<unk>'): The unknown token.
bos_token (str, DEFAULT: '<s>'): The beginning-of-sequence token.
eos_token (str, DEFAULT: '</s>'): The end-of-sequence token.
pad_token (str, DEFAULT: None): The padding token.
sp_model_kwargs (Optional[Dict[str, Any]], DEFAULT: None): Additional keyword arguments for the SentencePieceProcessor.
add_bos_token (bool, DEFAULT: True): Whether to prepend the bos_token when building model inputs.
add_eos_token (bool, DEFAULT: False): Whether to append the eos_token when building model inputs.
clean_up_tokenization_spaces (bool, DEFAULT: False): Whether to clean up tokenization spaces.
**kwargs (DEFAULT: {}): Additional keyword arguments.

RETURNS DESCRIPTION
None.

Source code in mindnlp/transformers/models/internlm/tokenization_internlm.py
def __init__(
    self,
    vocab_file,
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>",
    pad_token=None,
    sp_model_kwargs: Optional[Dict[str, Any]] = None,
    add_bos_token=True,
    add_eos_token=False,
    clean_up_tokenization_spaces=False,
    **kwargs,
):
    """
    Initializes an instance of the InternLMTokenizer class.

    Args:
        self: An instance of the InternLMTokenizer class.
        vocab_file (str): The path to the vocabulary file.
        unk_token (str, optional): The unknown token. Defaults to '<unk>'.
        bos_token (str, optional): The beginning of sentence token. Defaults to '<s>'.
        eos_token (str, optional): The end of sentence token. Defaults to '</s>'.
        pad_token (str, optional): The padding token. Defaults to None.
        sp_model_kwargs (Optional[Dict[str, Any]], optional):
            Additional keyword arguments for the SentencePieceProcessor. Defaults to None.
        add_bos_token (bool, optional): Whether to prepend the bos_token when building model inputs. Defaults to True.
        add_eos_token (bool, optional): Whether to append the eos_token when building model inputs. Defaults to False.
        clean_up_tokenization_spaces (bool, optional): Whether to clean up tokenization spaces. Defaults to False.
        **kwargs: Additional keyword arguments.

    Returns:
        None.

    Raises:
        None.
    """
    self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(vocab_file)

    bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
    eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
    unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
    pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
    super().__init__(
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        pad_token=pad_token,
        add_bos_token=add_bos_token,
        add_eos_token=add_eos_token,
        sp_model_kwargs=self.sp_model_kwargs,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        **kwargs,
    )
    self.vocab_file = vocab_file
    self.add_bos_token = add_bos_token
    self.add_eos_token = add_eos_token
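
A minimal usage sketch for the constructor, assuming a local SentencePiece model file; the `./tokenizer.model` path below is a placeholder, not something shipped with this documentation:

```python
from mindnlp.transformers.models.internlm.tokenization_internlm import InternLMTokenizer

# Placeholder path: point this at a real InternLM SentencePiece model file.
tokenizer = InternLMTokenizer(
    vocab_file="./tokenizer.model",
    add_bos_token=True,   # prepend <s> when building model inputs
    add_eos_token=False,  # do not append </s>
)
print(tokenizer.vocab_size)                            # size of the SentencePiece vocabulary
print(tokenizer.bos_token_id, tokenizer.eos_token_id)  # IDs of the configured special tokens
```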

mindnlp.transformers.models.internlm.tokenization_internlm.InternLMTokenizer.__setstate__(d)

Sets the state of the InternLMTokenizer object.

PARAMETER DESCRIPTION
self (InternLMTokenizer): The instance of the InternLMTokenizer class.
d (dict): The dictionary containing the state to restore, i.e. the contents of the object's __dict__ as produced by __getstate__.

RETURNS DESCRIPTION
None.

Source code in mindnlp/transformers/models/internlm/tokenization_internlm.py
def __setstate__(self, d):
    """
    Sets the state of the InternLMTokenizer object.

    Args:
        self (InternLMTokenizer): The instance of the InternLMTokenizer class.
        d (dict): The dictionary containing the state to restore, i.e. the contents of the
            object's __dict__ as produced by __getstate__.

    Returns:
        None.

    Raises:
        None.
    """
    self.__dict__ = d
    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(self.vocab_file)
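
Because `__getstate__` drops the un-picklable SentencePiece processor and `__setstate__` reloads it from `self.vocab_file`, a tokenizer instance survives a pickle round trip. A hedged sketch, reusing the `tokenizer` instance from the constructor example above:

```python
import pickle

# __getstate__ sets state["sp_model"] = None, so only the path and configuration are serialized.
payload = pickle.dumps(tokenizer)

# __setstate__ restores __dict__ and re-loads the SentencePiece model from self.vocab_file.
restored = pickle.loads(payload)
assert restored.vocab_size == tokenizer.vocab_size
```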

mindnlp.transformers.models.internlm.tokenization_internlm.InternLMTokenizer.build_inputs_with_special_tokens(token_ids_0, token_ids_1=None)

Method to build model inputs with special tokens for the InternLM tokenizer.

PARAMETER DESCRIPTION
self (InternLMTokenizer): The instance of the InternLMTokenizer class.
token_ids_0 (list): List of token IDs for the first input sequence.
token_ids_1 (list, DEFAULT: None): List of token IDs for the second input sequence.

RETURNS DESCRIPTION
list: A list of token IDs with special tokens added at the beginning and end of each input sequence.

Source code in mindnlp/transformers/models/internlm/tokenization_internlm.py
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """
    Method to build model inputs with special tokens for the InternLM tokenizer.

    Args:
        self (InternLMTokenizer): The instance of the InternLMTokenizer class.
        token_ids_0 (list): List of token IDs for the first input sequence.
        token_ids_1 (list, optional): List of token IDs for the second input sequence. Defaults to None.

    Returns:
        list: A list of token IDs with special tokens added at the beginning and end of each input sequence.

    Raises:
        None
    """
    bos_token_id = [self.bos_token_id] if self.add_bos_token else []
    eos_token_id = [self.eos_token_id] if self.add_eos_token else []

    output = bos_token_id + token_ids_0 + eos_token_id

    if token_ids_1 is not None:
        output = output + bos_token_id + token_ids_1 + eos_token_id

    return output
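
With the default configuration (`add_bos_token=True`, `add_eos_token=False`) the method prepends a single BOS id to each sequence. A stand-alone sketch that mirrors the logic above with made-up token IDs, so it runs without a vocabulary file:

```python
# Mirror of build_inputs_with_special_tokens with add_bos_token=True, add_eos_token=False.
BOS_ID, EOS_ID = 1, 2  # illustrative IDs, not real vocabulary entries

def build_inputs(ids_0, ids_1=None, add_bos=True, add_eos=False):
    bos = [BOS_ID] if add_bos else []
    eos = [EOS_ID] if add_eos else []
    out = bos + ids_0 + eos
    if ids_1 is not None:
        out += bos + ids_1 + eos
    return out

print(build_inputs([11, 12, 13]))            # [1, 11, 12, 13]
print(build_inputs([11, 12], [21, 22, 23]))  # [1, 11, 12, 1, 21, 22, 23]
```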

mindnlp.transformers.models.internlm.tokenization_internlm.InternLMTokenizer.convert_tokens_to_string(tokens)

Converts a sequence of tokens (strings) into a single string.

Source code in mindnlp/transformers/models/internlm/tokenization_internlm.py
def convert_tokens_to_string(self, tokens):
    """Converts a sequence of tokens (string) in a single string."""
    current_sub_tokens = []
    out_string = ""
    prev_is_special = False
    for i, token in enumerate(tokens):
        # make sure that special tokens are not decoded using sentencepiece model
        if token in self.all_special_tokens:
            if not prev_is_special and i != 0:
                out_string += " "
            out_string += self.sp_model.decode(current_sub_tokens) + token
            prev_is_special = True
            current_sub_tokens = []
        else:
            current_sub_tokens.append(token)
            prev_is_special = False
    out_string += self.sp_model.decode(current_sub_tokens)
    return out_string
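
A usage sketch, again assuming a loaded `tokenizer`; the SentencePiece pieces below are illustrative, since the actual pieces depend on the vocabulary file:

```python
# Special tokens are emitted verbatim; ordinary pieces are decoded by the SentencePiece model.
pieces = ["<s>", "▁Hello", "▁world"]   # illustrative pieces
text = tokenizer.convert_tokens_to_string(pieces)
print(text)  # the "<s>" marker followed by the decoded text of the remaining pieces
```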

mindnlp.transformers.models.internlm.tokenization_internlm.InternLMTokenizer.create_token_type_ids_from_sequences(token_ids_0, token_ids_1=None)

Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A sequence pair mask has the following format:

0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence    | second sequence |

If token_ids_1 is None, only the first portion of the mask (0s) is returned.

PARAMETER DESCRIPTION
token_ids_0 (List[int]): List of IDs.
token_ids_1 (List[int], *optional*, DEFAULT: None): Optional second list of IDs for sequence pairs.

RETURNS DESCRIPTION
List[int]: List of token type IDs according to the given sequence(s).

Source code in mindnlp/transformers/models/internlm/tokenization_internlm.py
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A
    sequence pair mask has the following format:

    ```
    0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
    | first sequence    | second sequence |
    ```

    If token_ids_1 is None, only the first portion of the mask (0s) is returned.

    Args:
        token_ids_0 (`List[int]`):
            List of ids.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
    """
    bos_token_id = [self.bos_token_id] if self.add_bos_token else []
    eos_token_id = [self.eos_token_id] if self.add_eos_token else []

    output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

    if token_ids_1 is not None:
        output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

    return output
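
The resulting pattern is easy to verify with a stand-alone mirror of the method (default configuration, illustrative sequence lengths), which runs without a vocabulary file:

```python
# Mirror of create_token_type_ids_from_sequences; only the lengths of the
# special-token lists matter here, not the actual IDs.
def token_type_ids(ids_0, ids_1=None, add_bos=True, add_eos=False):
    bos = [1] if add_bos else []
    eos = [2] if add_eos else []
    out = [0] * len(bos + ids_0 + eos)
    if ids_1 is not None:
        out += [1] * len(bos + ids_1 + eos)
    return out

print(token_type_ids([11, 12, 13]))            # [0, 0, 0, 0]
print(token_type_ids([11, 12], [21, 22, 23]))  # [0, 0, 0, 1, 1, 1, 1]
```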

mindnlp.transformers.models.internlm.tokenization_internlm.InternLMTokenizer.get_special_tokens_mask(token_ids_0, token_ids_1=None, already_has_special_tokens=False)

Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer prepare_for_model method.

PARAMETER DESCRIPTION
token_ids_0 (List[int]): List of IDs.
token_ids_1 (List[int], *optional*, DEFAULT: None): Optional second list of IDs for sequence pairs.
already_has_special_tokens (bool, *optional*, DEFAULT: False): Whether or not the token list is already formatted with special tokens for the model.

RETURNS DESCRIPTION
List[int]: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

Source code in mindnlp/transformers/models/internlm/tokenization_internlm.py
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
    special tokens using the tokenizer `prepare_for_model` method.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the token list is already formatted with special tokens for the model.

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    bos_token_id = [1] if self.add_bos_token else []
    eos_token_id = [1] if self.add_eos_token else []

    if token_ids_1 is None:
        return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
    return (
        bos_token_id
        + ([0] * len(token_ids_0))
        + eos_token_id
        + bos_token_id
        + ([0] * len(token_ids_1))
        + eos_token_id
    )
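
A stand-alone mirror of the mask logic (default configuration), showing the 1s that mark special-token positions; the IDs are illustrative and no vocabulary file is needed:

```python
# Mirror of get_special_tokens_mask for the already_has_special_tokens=False path.
def special_tokens_mask(ids_0, ids_1=None, add_bos=True, add_eos=False):
    bos = [1] if add_bos else []
    eos = [1] if add_eos else []
    if ids_1 is None:
        return bos + [0] * len(ids_0) + eos
    return bos + [0] * len(ids_0) + eos + bos + [0] * len(ids_1) + eos

print(special_tokens_mask([11, 12, 13]))            # [1, 0, 0, 0]
print(special_tokens_mask([11, 12], [21, 22, 23]))  # [1, 0, 0, 1, 0, 0, 0]
```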

mindnlp.transformers.models.internlm.tokenization_internlm.InternLMTokenizer.get_vocab()

Returns vocab as a dict

Source code in mindnlp/transformers/models/internlm/tokenization_internlm.py
def get_vocab(self):
    """Returns vocab as a dict"""
    vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
    vocab.update(self.added_tokens_encoder)
    return vocab
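
A brief sketch assuming the `tokenizer` instance from above; the returned mapping can be inverted for id-to-token lookups:

```python
vocab = tokenizer.get_vocab()                   # token -> id, including added tokens
id_to_token = {i: t for t, i in vocab.items()}  # id -> token
print(len(vocab) >= tokenizer.vocab_size)       # True: added tokens may extend the base vocabulary
```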

mindnlp.transformers.models.internlm.tokenization_internlm.InternLMTokenizer.save_vocabulary(save_directory, filename_prefix=None)

Save the vocabulary and special tokens file to a directory.

PARAMETER DESCRIPTION
save_directory (str): The directory in which to save the vocabulary.
filename_prefix (str, *optional*, DEFAULT: None): An optional prefix prepended to the saved vocabulary file name.

RETURNS DESCRIPTION
Tuple[str]: Paths to the files saved.

Source code in mindnlp/transformers/models/internlm/tokenization_internlm.py
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Save the vocabulary and special tokens file to a directory.

    Args:
        save_directory (`str`):
            The directory in which to save the vocabulary.
        filename_prefix (`str`, *optional*):
            An optional prefix prepended to the saved vocabulary file name.

    Returns:
        `Tuple(str)`: Paths to the files saved.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        return
    out_vocab_file = os.path.join(
        save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
    )

    if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
        copyfile(self.vocab_file, out_vocab_file)
    elif not os.path.isfile(self.vocab_file):
        with open(out_vocab_file, "wb") as fi:
            content_spiece_model = self.sp_model.serialized_model_proto()
            fi.write(content_spiece_model)

    return (out_vocab_file,)
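
A save-and-reload sketch, assuming the `tokenizer` instance from above and a writable output directory (the directory name is a placeholder):

```python
import os

os.makedirs("./saved_tokenizer", exist_ok=True)

# Copies (or re-serializes) the SentencePiece model file into the directory.
(vocab_path,) = tokenizer.save_vocabulary("./saved_tokenizer")

# The saved file can be used to build a new tokenizer.
reloaded = InternLMTokenizer(vocab_file=vocab_path)
print(reloaded.vocab_size == tokenizer.vocab_size)  # True
```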