
minicpm

mindnlp.transformers.models.minicpm.modeling_minicpm

MindSpore MiniCPM model.

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMAttention

Bases: Module

Multi-headed attention from 'Attention Is All You Need' paper

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
class MiniCPMAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""
    def __init__(self, config: MiniCPMConfig, layer_idx: Optional[int] = None):
        """
        Initializes an instance of the MiniCPMAttention class.

        Args:
            self: The instance of the class.
            config (MiniCPMConfig):
                The configuration object for MiniCPMAttention.

                - `config` contains various attributes that define the behavior of the attention mechanism.
                - It is an instance of the MiniCPMConfig class.
            layer_idx (Optional[int], default=None):
                The index of the layer.

                - This parameter is optional and can be omitted.
                - If provided, it helps in caching during the forward call.
                - Not providing `layer_idx` is not recommended, as it may lead to errors if caching is used.
                - Please make sure to provide a valid `layer_idx` when creating an instance of this class.

        Returns:
            None.

        Raises:
            ValueError:
                If `hidden_size` is not divisible by `num_heads`.

                - This exception is raised when the condition `hidden_size % num_heads != 0` is not satisfied.
                - `hidden_size` must be divisible by `num_heads` for the attention mechanism to work correctly.

            Warning:
                If `layer_idx` is not provided, a warning is issued.

                - The warning message suggests that not providing `layer_idx` is not recommended.
                - It also highlights that errors may occur during the forward call if caching is used.
                - The user is advised to provide a valid `layer_idx` when creating an instance of this class.

        Note:
            The method initializes the MiniCPMAttention instance by assigning values to various attributes.
            It performs several checks to ensure the correctness of the provided configuration.
            The method also initializes the projection layers and sets up the required variables
            for the attention mechanism.
            Additionally, it initializes the rope mechanism by calling the `_init_rope` method.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
        self._init_rope()

    def _init_rope(self):
        """
        This method initializes the Rotary Positional Encoding (RoPE) for the MiniCPMAttention class.

        Args:
            self: MiniCPMAttention
                The instance of the MiniCPMAttention class.

        Returns:
            None.

        Raises:
            ValueError:
                If the scaling_type provided in the configuration for RoPE is not 'linear' or 'dynamic'.
        """
        if self.config.rope_scaling is None:
            self.rotary_emb = MiniCPMRotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = MiniCPMLinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = MiniCPMDynamicNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: mindspore.Tensor, seq_len: int, bsz: int):
        """
        This method is responsible for shaping the input tensor to prepare it for MiniCPMAttention computation.

        Args:
            tensor (mindspore.Tensor): The input tensor to be reshaped.
                It should be of shape (bsz, seq_len, num_heads * head_dim).
            seq_len (int): The length of the input sequence.
            bsz (int): The batch size.

        Returns:
            mindspore.Tensor: A view of the input tensor reshaped to
                (bsz, num_heads, seq_len, head_dim) for the attention computation.

        Raises:
            ValueError: If the shape of the input tensor is not compatible with the reshaping operation.
            TypeError: If the input tensor is not of type mindspore.Tensor.
        """
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).swapaxes(1, 2)

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
        '''
        This method forwards the MiniCPMAttention layer.

        Args:
            self: The object instance.
            hidden_states (mindspore.Tensor): The input hidden states with shape
                (batch_size, sequence_length, hidden_size).
            attention_mask (Optional[mindspore.Tensor]): Optional tensor with shape
                (batch_size, 1, sequence_length, sequence_length) representing the attention mask.
            position_ids (Optional[mindspore.Tensor]): Optional tensor with shape (batch_size, sequence_length)
                representing the position indices of input tokens.
            past_key_value (Optional[Cache]): Optional cache for past key-value pairs.
            output_attentions (bool): Flag indicating whether to return the attention weights.
            use_cache (bool): Flag indicating whether to use cache for key-value pairs.

        Returns:
            Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
                A tuple containing the output tensor of shape (batch_size, sequence_length, hidden_size),
                optional attention weights tensor, and optional updated past key-value pairs.

        Raises:
            ValueError: If the attention weights or attention mask have invalid shapes.
            ValueError: If the output tensor 'attn_output' has an unexpected shape.
            ValueError: If the cache structure has changed since version v4.36 and the layer index is not
                initialized for auto-regressive decoding.
        '''
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

        bsz, q_len, _ = hidden_states.shape

        if self.config.pretraining_tp > 1:
            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
            query_slices = self.q_proj.weight.split(
                (self.num_heads * self.head_dim) // self.config.pretraining_tp, axis=0
            )
            key_slices = self.k_proj.weight.split(key_value_slicing, axis=0)
            value_slices = self.v_proj.weight.split(key_value_slicing, axis=0)

            query_states = [ops.dense(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
            query_states = ops.cat(query_states, axis=-1)

            key_states = [ops.dense(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
            key_states = ops.cat(key_states, axis=-1)

            value_states = [ops.dense(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
            value_states = ops.cat(value_states, axis=-1)

        else:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).swapaxes(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).swapaxes(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states.to(mindspore.float32), seq_len=kv_seq_len)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = ops.matmul(query_states, key_states.swapaxes(2, 3)) / math.sqrt(self.head_dim)
        if attn_weights.shape != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.shape}"
            )

        if attention_mask is not None:
            if attention_mask.shape != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = ops.softmax(attn_weights, axis=-1, dtype=mindspore.float32).to(query_states.dtype)
        attn_weights = ops.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = ops.matmul(attn_weights, value_states)

        if attn_output.shape != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.shape}"
            )

        attn_output = attn_output.swapaxes(1, 2)

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        if self.config.pretraining_tp > 1:
            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, axis=2)
            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, axis=1)
            attn_output = sum(ops.dense(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp))
        else:
            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
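
The eager path above is standard scaled dot-product attention with grouped-query key/value heads. The NumPy sketch below (illustrative only, not mindnlp API; every size is made up) walks through the same shape bookkeeping: projection, splitting into heads, repeating each key/value head for its group of query heads as repeat_kv does, and the softmax-weighted sum. RoPE and masking are omitted for brevity.

import numpy as np

bsz, q_len, hidden_size = 2, 5, 64
num_heads, num_key_value_heads = 8, 2                     # grouped-query attention
head_dim = hidden_size // num_heads                       # 8
num_key_value_groups = num_heads // num_key_value_heads   # 4

rng = np.random.default_rng(0)
hidden_states = rng.standard_normal((bsz, q_len, hidden_size))

# q/k/v projections (random weights standing in for q_proj/k_proj/v_proj)
q = hidden_states @ rng.standard_normal((hidden_size, num_heads * head_dim))
k = hidden_states @ rng.standard_normal((hidden_size, num_key_value_heads * head_dim))
v = hidden_states @ rng.standard_normal((hidden_size, num_key_value_heads * head_dim))

# (bsz, q_len, n*h) -> (bsz, n, q_len, h), mirroring view(...).swapaxes(1, 2)
q = q.reshape(bsz, q_len, num_heads, head_dim).swapaxes(1, 2)
k = k.reshape(bsz, q_len, num_key_value_heads, head_dim).swapaxes(1, 2)
v = v.reshape(bsz, q_len, num_key_value_heads, head_dim).swapaxes(1, 2)

# repeat_kv: each key/value head serves num_key_value_groups query heads
k = np.repeat(k, num_key_value_groups, axis=1)
v = np.repeat(v, num_key_value_groups, axis=1)

# scaled dot-product attention
scores = q @ k.swapaxes(2, 3) / np.sqrt(head_dim)         # (bsz, num_heads, q_len, q_len)
weights = np.exp(scores - scores.max(-1, keepdims=True))
weights /= weights.sum(-1, keepdims=True)                 # softmax over the key axis
out = (weights @ v).swapaxes(1, 2).reshape(bsz, q_len, hidden_size)
print(out.shape)                                          # (2, 5, 64)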

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMAttention.__init__(config, layer_idx=None)

Initializes an instance of the MiniCPMAttention class.

PARAMETER DESCRIPTION

self: The instance of the class.

config (MiniCPMConfig):
    The configuration object for MiniCPMAttention.

      • config contains various attributes that define the behavior of the attention mechanism.
      • It is an instance of the MiniCPMConfig class.

layer_idx (Optional[int], default: None):
    The index of the layer.

      • This parameter is optional and can be omitted.
      • If provided, it helps in caching during the forward call.
      • Not providing layer_idx is not recommended, as it may lead to errors if caching is used.
      • Please make sure to provide a valid layer_idx when creating an instance of this class.

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION

ValueError:
    If hidden_size is not divisible by num_heads.

      • This exception is raised when the condition hidden_size % num_heads != 0 is not satisfied.
      • hidden_size must be divisible by num_heads for the attention mechanism to work correctly.

Warning:
    If layer_idx is not provided, a warning is issued.

      • The warning message suggests that not providing layer_idx is not recommended.
      • It also highlights that errors may occur during the forward call if caching is used.
      • The user is advised to provide a valid layer_idx when creating an instance of this class.

Note:
    The method initializes the MiniCPMAttention instance by assigning values to various attributes.
    It performs several checks to ensure the correctness of the provided configuration.
    The method also initializes the projection layers and sets up the required variables for the attention mechanism.
    Additionally, it initializes the rope mechanism by calling the _init_rope method.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def __init__(self, config: MiniCPMConfig, layer_idx: Optional[int] = None):
    """
    Initializes an instance of the MiniCPMAttention class.

    Args:
        self: The instance of the class.
        config (MiniCPMConfig):
            The configuration object for MiniCPMAttention.

            - `config` contains various attributes that define the behavior of the attention mechanism.
            - It is an instance of the MiniCPMConfig class.
        layer_idx (Optional[int], default=None):
            The index of the layer.

            - This parameter is optional and can be omitted.
            - If provided, it helps in caching during the forward call.
            - Not providing `layer_idx` is not recommended, as it may lead to errors if caching is used.
            - Please make sure to provide a valid `layer_idx` when creating an instance of this class.

    Returns:
        None.

    Raises:
        ValueError:
            If `hidden_size` is not divisible by `num_heads`.

            - This exception is raised when the condition `hidden_size % num_heads != 0` is not satisfied.
            - `hidden_size` must be divisible by `num_heads` for the attention mechanism to work correctly.

        Warning:
            If `layer_idx` is not provided, a warning is issued.

            - The warning message suggests that not providing `layer_idx` is not recommended.
            - It also highlights that errors may occur during the forward call if caching is used.
            - The user is advised to provide a valid `layer_idx` when creating an instance of this class.

    Note:
        The method initializes the MiniCPMAttention instance by assigning values to various attributes.
        It performs several checks to ensure the correctness of the provided configuration.
        The method also initializes the projection layers and sets up the required variables
        for the attention mechanism.
        Additionally, it initializes the rope mechanism by calling the `_init_rope` method.
    """
    super().__init__()
    self.config = config
    self.layer_idx = layer_idx
    if layer_idx is None:
        logger.warning_once(
            f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
            "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
            "when creating this class."
        )

    self.attention_dropout = config.attention_dropout
    self.hidden_size = config.hidden_size
    self.num_heads = config.num_attention_heads
    self.head_dim = self.hidden_size // self.num_heads
    self.num_key_value_heads = config.num_key_value_heads
    self.num_key_value_groups = self.num_heads // self.num_key_value_heads
    self.max_position_embeddings = config.max_position_embeddings
    self.rope_theta = config.rope_theta
    self.is_causal = True

    if (self.head_dim * self.num_heads) != self.hidden_size:
        raise ValueError(
            f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
            f" and `num_heads`: {self.num_heads})."
        )

    self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
    self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
    self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
    self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
    self._init_rope()
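
A minimal construction sketch follows. The attribute names mirror what __init__ reads from the config above, but the import paths and the assumption that MiniCPMConfig accepts these keyword arguments (with usable defaults for everything else) are not confirmed by this page and should be treated as hypothetical.

from mindnlp.transformers.models.minicpm import MiniCPMConfig          # assumed import path
from mindnlp.transformers.models.minicpm.modeling_minicpm import MiniCPMAttention

config = MiniCPMConfig(
    hidden_size=64,
    num_attention_heads=8,
    num_key_value_heads=2,        # grouped-query attention
    max_position_embeddings=2048,
)
# Always pass layer_idx: the k/v cache is indexed per layer, and the constructor
# warns when it is omitted.
attn = MiniCPMAttention(config, layer_idx=0)
print(attn.head_dim, attn.num_key_value_groups)   # 8 4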

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMAttention.forward(hidden_states, attention_mask=None, position_ids=None, past_key_value=None, output_attentions=False, use_cache=False, **kwargs)

This method forwards the MiniCPMAttention layer.

PARAMETER DESCRIPTION

self: The object instance.

hidden_states (mindspore.Tensor):
    The input hidden states with shape (batch_size, sequence_length, hidden_size).

attention_mask (Optional[mindspore.Tensor], default: None):
    Optional tensor with shape (batch_size, 1, sequence_length, sequence_length) representing the attention mask.

position_ids (Optional[mindspore.Tensor], default: None):
    Optional tensor with shape (batch_size, sequence_length) representing the position indices of input tokens.

past_key_value (Optional[Cache], default: None):
    Optional cache for past key-value pairs.

output_attentions (bool, default: False):
    Flag indicating whether to return the attention weights.

use_cache (bool, default: False):
    Flag indicating whether to use cache for key-value pairs.

RETURNS DESCRIPTION

Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
    A tuple containing the output tensor of shape (batch_size, sequence_length, hidden_size),
    optional attention weights tensor, and optional updated past key-value pairs.

RAISES DESCRIPTION

ValueError: If the attention weights or attention mask have invalid shapes.

ValueError: If the output tensor 'attn_output' has an unexpected shape.

ValueError: If the cache structure has changed since version v4.36 and the layer index is not initialized
    for auto-regressive decoding.
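
For reference, a small NumPy sketch of the additive mask this signature expects: shape (batch_size, 1, q_len, kv_seq_len), zeros at allowed positions and a large negative value at masked ones, so adding it to the attention scores drives the masked weights to roughly zero after softmax. This is illustrative only, not the mask-building code used by MiniCPMModel.

import numpy as np

q_len, kv_seq_len = 4, 4
neg_inf = np.finfo(np.float32).min
causal = np.triu(np.full((q_len, kv_seq_len), neg_inf, dtype=np.float32), k=1)
attention_mask = causal[None, None, :, :]   # (1, 1, q_len, kv_seq_len)
print(attention_mask[0, 0])
# Row i keeps columns 0..i at 0.0 and fills the future positions with a large
# negative value; softmax then assigns them (near-)zero probability.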

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def forward(
    self,
    hidden_states: mindspore.Tensor,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    past_key_value: Optional[Cache] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    **kwargs,
) -> Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
    '''
    This method forwards the MiniCPMAttention layer.

    Args:
        self: The object instance.
        hidden_states (mindspore.Tensor): The input hidden states with shape
            (batch_size, sequence_length, hidden_size).
        attention_mask (Optional[mindspore.Tensor]): Optional tensor with shape
            (batch_size, 1, sequence_length, sequence_length) representing the attention mask.
        position_ids (Optional[mindspore.Tensor]): Optional tensor with shape (batch_size, sequence_length)
            representing the position indices of input tokens.
        past_key_value (Optional[Cache]): Optional cache for past key-value pairs.
        output_attentions (bool): Flag indicating whether to return the attention weights.
        use_cache (bool): Flag indicating whether to use cache for key-value pairs.

    Returns:
        Tuple[mindspore.Tensor, Optional[mindspore.Tensor], Optional[Tuple[mindspore.Tensor]]]:
            A tuple containing the output tensor of shape (batch_size, sequence_length, hidden_size),
            optional attention weights tensor, and optional updated past key-value pairs.

    Raises:
        ValueError: If the attention weights or attention mask have invalid shapes.
        ValueError: If the output tensor 'attn_output' has an unexpected shape.
        ValueError: If the cache structure has changed since version v4.36 and the layer index is not
            initialized for auto-regressive decoding.
    '''
    if "padding_mask" in kwargs:
        warnings.warn(
            "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
        )

    bsz, q_len, _ = hidden_states.shape

    if self.config.pretraining_tp > 1:
        key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
        query_slices = self.q_proj.weight.split(
            (self.num_heads * self.head_dim) // self.config.pretraining_tp, axis=0
        )
        key_slices = self.k_proj.weight.split(key_value_slicing, axis=0)
        value_slices = self.v_proj.weight.split(key_value_slicing, axis=0)

        query_states = [ops.dense(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
        query_states = ops.cat(query_states, axis=-1)

        key_states = [ops.dense(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
        key_states = ops.cat(key_states, axis=-1)

        value_states = [ops.dense(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
        value_states = ops.cat(value_states, axis=-1)

    else:
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).swapaxes(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).swapaxes(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).swapaxes(1, 2)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        if self.layer_idx is None:
            raise ValueError(
                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                "with a layer index."
            )
        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
    cos, sin = self.rotary_emb(value_states.to(mindspore.float32), seq_len=kv_seq_len)

    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

    if past_key_value is not None:
        cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    attn_weights = ops.matmul(query_states, key_states.swapaxes(2, 3)) / math.sqrt(self.head_dim)
    if attn_weights.shape != (bsz, self.num_heads, q_len, kv_seq_len):
        raise ValueError(
            f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
            f" {attn_weights.shape}"
        )

    if attention_mask is not None:
        if attention_mask.shape != (bsz, 1, q_len, kv_seq_len):
            raise ValueError(
                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}"
            )
        attn_weights = attn_weights + attention_mask

    # upcast attention to fp32
    attn_weights = ops.softmax(attn_weights, axis=-1, dtype=mindspore.float32).to(query_states.dtype)
    attn_weights = ops.dropout(attn_weights, p=self.attention_dropout, training=self.training)
    attn_output = ops.matmul(attn_weights, value_states)

    if attn_output.shape != (bsz, self.num_heads, q_len, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
            f" {attn_output.shape}"
        )

    attn_output = attn_output.swapaxes(1, 2)

    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

    if self.config.pretraining_tp > 1:
        attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, axis=2)
        o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, axis=1)
        attn_output = sum(ops.dense(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp))
    else:
        attn_output = self.o_proj(attn_output)

    if not output_attentions:
        attn_weights = None

    return attn_output, attn_weights, past_key_value
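
The shape checks above tie kv_seq_len to the cache length: on each decoding step the new keys/values are appended, so the attention weights cover all tokens seen so far while the query axis only covers the new tokens. A pure-Python sketch of that arithmetic (token counts chosen arbitrarily):

bsz, num_heads = 1, 8

past_len = 0
for step, q_len in enumerate([5, 1, 1]):    # a 5-token prompt, then one token per step
    kv_seq_len = q_len + past_len           # new tokens + past_key_value.get_usable_length(...)
    attn_weights_shape = (bsz, num_heads, q_len, kv_seq_len)
    print(step, attn_weights_shape)         # (1, 8, 5, 5) -> (1, 8, 1, 6) -> (1, 8, 1, 7)
    past_len = kv_seq_len                   # the cache now holds every key/value seen so far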

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMDecoderLayer

Bases: Module

MiniCPMDecoderLayer represents a single layer of the MiniCPM (Minimalist Conditional Pretrained Model) decoder. This class is responsible for processing input hidden states through a self-attention mechanism and an MLP (Multi-Layer Perceptron) for decoding tasks.

ATTRIBUTE DESCRIPTION

hidden_size (int): Size of the hidden states.

self_attn (MINICPM_ATTENTION_CLASSES): Instance of the attention mechanism used in the layer.

mlp (MiniCPMMLP): Instance of the MLP network.

input_layernorm (MiniCPMRMSNorm): Layer normalization applied to the input hidden states.

post_attention_layernorm (MiniCPMRMSNorm): Layer normalization applied after the self-attention mechanism.

scale_depth (int): Scaling factor applied to the hidden states.

num_hidden_layers (int): Number of hidden layers in the model.

METHOD DESCRIPTION

forward: Processes the input hidden states through the layer.

    Args:

      • hidden_states (mindspore.Tensor): Input to the layer of shape (batch, seq_len, embed_dim).
      • attention_mask (mindspore.Tensor, optional): Attention mask used for masking certain positions in the input.
      • position_ids (mindspore.Tensor, optional): Tensor representing the position ids of each token.
      • past_key_value (Tuple[mindspore.Tensor], optional): Cached past key and value projection states.
      • output_attentions (bool, optional): Whether to return attention tensors of all attention layers.
      • use_cache (bool, optional): If True, past key-value states are returned for speeding up decoding.
      • kwargs: Additional keyword arguments.

    Returns:

      • Tuple containing the processed hidden states and optionally attentions and present key values.

Note:
    If 'padding_mask' is passed as a keyword argument in kwargs, a deprecation warning will be issued.
    It is recommended to use 'attention_mask' instead.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
class MiniCPMDecoderLayer(nn.Module):

    """
    MiniCPMDecoderLayer represents a single layer of the MiniCPM (Minimalist Conditional Pretrained Model) decoder.
    This class is responsible for processing input hidden states through a self-attention mechanism and an MLP
    (Multi-Layer Perceptron) for decoding tasks.

    Attributes:
        hidden_size (int): Size of the hidden states.
        self_attn (MINICPM_ATTENTION_CLASSES): Instance of the attention mechanism used in the layer.
        mlp (MiniCPMMLP): Instance of the MLP network.
        input_layernorm (MiniCPMRMSNorm): Layer normalization applied to the input hidden states.
        post_attention_layernorm (MiniCPMRMSNorm): Layer normalization applied after the self-attention mechanism.
        scale_depth (int): Scaling factor applied to the hidden states.
        num_hidden_layers (int): Number of hidden layers in the model.

    Methods:
        forward:
            Processes the input hidden states through the layer.

            Args:

            - hidden_states (mindspore.Tensor): Input to the layer of shape (batch, seq_len, embed_dim).
            - attention_mask (mindspore.Tensor, optional): Attention mask used for masking certain positions in the input.
            - position_ids (mindspore.Tensor, optional): Tensor representing the position ids of each token.
            - past_key_value (Tuple[mindspore.Tensor], optional): Cached past key and value projection states.
            - output_attentions (bool, optional): Whether to return attention tensors of all attention layers.
            - use_cache (bool, optional): If True, past key-value states are returned for speeding up decoding.
            - kwargs: Additional keyword arguments.

            Returns:

            - Tuple containing the processed hidden states and optionally attentions and present key values.

    Note:
        If 'padding_mask' is passed as a keyword argument in kwargs, a deprecation warning will be issued.
        It is recommended to use 'attention_mask' instead.
    """
    def __init__(self, config: MiniCPMConfig, layer_idx: int):
        """
        Initializes a new instance of MiniCPMDecoderLayer.

        Args:
            self: The object instance.
            config (MiniCPMConfig): An instance of MiniCPMConfig containing the configuration settings
                for the decoder layer.
            layer_idx (int): The index of the layer within the decoder.

        Returns:
            None.

        Raises:
            TypeError: If the config parameter is not of type MiniCPMConfig.
            ValueError: If the layer_idx parameter is not a non-negative integer.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = MINICPM_ATTENTION_CLASSES['eager'](config=config, layer_idx=layer_idx)

        self.mlp = MiniCPMMLP(config)
        self.input_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.scale_depth = config.scale_depth
        self.num_hidden_layers = config.num_hidden_layers

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_value: Optional[Tuple[mindspore.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        **kwargs,
    ) -> Tuple[mindspore.Tensor, Optional[Tuple[mindspore.Tensor, mindspore.Tensor]]]:
        """
        Args:
            hidden_states (`mindspore.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`mindspore.Tensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(mindspore.Tensor)`, *optional*): cached past key and value projection states
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            **kwargs,
        )

        hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)

        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs
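
Both residual connections scale the sub-layer output by scale_depth / sqrt(num_hidden_layers), a depth-dependent damping of the update added to the residual stream. A tiny sketch of that factor (the config values here are placeholders, not necessarily MiniCPM defaults):

import math

scale_depth, num_hidden_layers = 1.4, 40            # placeholder config values
residual_scale = scale_depth / math.sqrt(num_hidden_layers)
print(round(residual_scale, 4))                     # 0.2214

# hidden_states = residual + attn_output * residual_scale
# hidden_states = residual + mlp_output  * residual_scale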

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMDecoderLayer.__init__(config, layer_idx)

Initializes a new instance of MiniCPMDecoderLayer.

PARAMETER DESCRIPTION

self: The object instance.

config (MiniCPMConfig):
    An instance of MiniCPMConfig containing the configuration settings for the decoder layer.

layer_idx (int):
    The index of the layer within the decoder.

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION

TypeError: If the config parameter is not of type MiniCPMConfig.

ValueError: If the layer_idx parameter is not a non-negative integer.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def __init__(self, config: MiniCPMConfig, layer_idx: int):
    """
    Initializes a new instance of MiniCPMDecoderLayer.

    Args:
        self: The object instance.
        config (MiniCPMConfig): An instance of MiniCPMConfig containing the configuration settings
            for the decoder layer.
        layer_idx (int): The index of the layer within the decoder.

    Returns:
        None.

    Raises:
        TypeError: If the config parameter is not of type MiniCPMConfig.
        ValueError: If the layer_idx parameter is not a non-negative integer.
    """
    super().__init__()
    self.hidden_size = config.hidden_size
    self.self_attn = MINICPM_ATTENTION_CLASSES['eager'](config=config, layer_idx=layer_idx)

    self.mlp = MiniCPMMLP(config)
    self.input_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    self.post_attention_layernorm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    self.scale_depth = config.scale_depth
    self.num_hidden_layers = config.num_hidden_layers

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMDecoderLayer.forward(hidden_states, attention_mask=None, position_ids=None, past_key_value=None, output_attentions=False, use_cache=False, **kwargs)

PARAMETER DESCRIPTION

hidden_states (mindspore.Tensor):
    Input to the layer of shape (batch, seq_len, embed_dim).

attention_mask (mindspore.Tensor, optional, default: None):
    Attention mask of size (batch_size, sequence_length) if flash attention is used or
    (batch_size, 1, query_sequence_length, key_sequence_length) if default attention is used.

output_attentions (bool, optional, default: False):
    Whether or not to return the attentions tensors of all attention layers. See attentions under
    returned tensors for more detail.

use_cache (bool, optional, default: False):
    If set to True, past_key_values key value states are returned and can be used to speed up decoding
    (see past_key_values).

past_key_value (Tuple(mindspore.Tensor), optional, default: None):
    Cached past key and value projection states.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def forward(
    self,
    hidden_states: mindspore.Tensor,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    past_key_value: Optional[Tuple[mindspore.Tensor]] = None,
    output_attentions: Optional[bool] = False,
    use_cache: Optional[bool] = False,
    **kwargs,
) -> Tuple[mindspore.Tensor, Optional[Tuple[mindspore.Tensor, mindspore.Tensor]]]:
    """
    Args:
        hidden_states (`mindspore.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
        attention_mask (`mindspore.Tensor`, *optional*):
            attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
            query_sequence_length, key_sequence_length)` if default attention is used.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
            (see `past_key_values`).
        past_key_value (`Tuple(mindspore.Tensor)`, *optional*): cached past key and value projection states
    """
    if "padding_mask" in kwargs:
        warnings.warn(
            "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
        )

    residual = hidden_states
    hidden_states = self.input_layernorm(hidden_states)
    # Self Attention
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
        hidden_states=hidden_states,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_value=past_key_value,
        output_attentions=output_attentions,
        use_cache=use_cache,
        **kwargs,
    )

    hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))

    # Fully Connected
    residual = hidden_states
    hidden_states = self.post_attention_layernorm(hidden_states)

    hidden_states = self.mlp(hidden_states)
    hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))

    outputs = (hidden_states,)

    if output_attentions:
        outputs += (self_attn_weights,)

    if use_cache:
        outputs += (present_key_value,)

    return outputs

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMDynamicNTKScalingRotaryEmbedding

Bases: MiniCPMRotaryEmbedding

MiniCPMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
    """MiniCPMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
    def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0):
        """
        Initializes a new instance of the MiniCPMDynamicNTKScalingRotaryEmbedding class.

        Args:
            self: The instance of the class.
            dim (int): The dimensionality of the embedding.
            max_position_embeddings (int, optional): The maximum number of position embeddings. Defaults to 2048.
            base (int, optional): The base value. Defaults to 10000.
            scaling_factor (float, optional): The scaling factor. Defaults to 1.0.

        Returns:
            None.

        Raises:
            None.
        """
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base)

    def _set_cos_sin_cache(self, seq_len, dtype):
        """
        This method '_set_cos_sin_cache' is defined in the class 'MiniCPMDynamicNTKScalingRotaryEmbedding'.
        It initializes the cosine and sine caches based on the given sequence length and data type.

        Args:
            self (object): The instance of the MiniCPMDynamicNTKScalingRotaryEmbedding class.
            seq_len (int): The length of the sequence for which cosine and sine caches need to be computed.
            dtype (dtype): The data type to be used for computation. Typically, this should be a floating-point data type.

        Returns:
            None: This method does not return any value explicitly. It updates the 'cos_cached' and 'sin_cached'
                attributes of the class instance.

        Raises:
            ValueError: If the 'seq_len' provided is less than or equal to 0.
            RuntimeError: If an error occurs during the computation of cosine and sine caches.
        """
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (base ** (ops.arange(0, self.dim, 2).float() / self.dim))
            self.inv_freq = inv_freq

        t = ops.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype)

        freqs = ops.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = ops.cat((freqs, freqs), axis=-1)

        self.cos_cached = emb.cos().to(dtype)
        self.sin_cached = emb.sin().to(dtype)
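
The rescaling in _set_cos_sin_cache only applies when the requested sequence length exceeds max_position_embeddings; it raises the RoPE base so the rotation frequencies stretch to cover the longer context. A NumPy sketch of the same computation (all values illustrative):

import numpy as np

dim, base, max_pos, scaling_factor = 64, 10000.0, 2048, 1.0
seq_len = 8192                                      # longer than max_pos, so the base is rescaled

if seq_len > max_pos:
    base = base * ((scaling_factor * seq_len / max_pos) - (scaling_factor - 1)) ** (dim / (dim - 2))

inv_freq = 1.0 / (base ** (np.arange(0, dim, 2, dtype=np.float64) / dim))
t = np.arange(seq_len, dtype=np.float64)
freqs = np.outer(t, inv_freq)
emb = np.concatenate((freqs, freqs), axis=-1)       # duplicated halves, as in the code above
cos_cached, sin_cached = np.cos(emb), np.sin(emb)
print(cos_cached.shape)                             # (8192, 64)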

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMDynamicNTKScalingRotaryEmbedding.__init__(dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0)

Initializes a new instance of the MiniCPMDynamicNTKScalingRotaryEmbedding class.

PARAMETER DESCRIPTION

self: The instance of the class.

dim (int): The dimensionality of the embedding.

max_position_embeddings (int, default: 2048): The maximum number of position embeddings.

base (int, default: 10000): The base value.

scaling_factor (float, default: 1.0): The scaling factor.

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0):
    """
    Initializes a new instance of the MiniCPMDynamicNTKScalingRotaryEmbedding class.

    Args:
        self: The instance of the class.
        dim (int): The dimensionality of the embedding.
        max_position_embeddings (int, optional): The maximum number of position embeddings. Defaults to 2048.
        base (int, optional): The base value. Defaults to 10000.
        scaling_factor (float, optional): The scaling factor. Defaults to 1.0.

    Returns:
        None.

    Raises:
        None.
    """
    self.scaling_factor = scaling_factor
    super().__init__(dim, max_position_embeddings, base)

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForCausalLM

Bases: MiniCPMPreTrainedModel

This class represents the MiniCPM model for causal language modeling. It is specifically designed for generating text based on given input prompts. The model is initialized with a configuration and consists of a MiniCPM model, an embedding layer, and a linear layer for predicting the next token in the sequence.

ATTRIBUTE DESCRIPTION

model (MiniCPMModel): The underlying MiniCPM model.

vocab_size (int): The size of the vocabulary.

lm_head (Linear): The linear layer for predicting the next token.

METHOD DESCRIPTION

__init__: Initializes the MiniCPMForCausalLM model.

get_input_embeddings: Returns the input embeddings of the model.

set_input_embeddings: Sets the input embeddings of the model.

get_output_embeddings: Returns the output embeddings of the model.

set_output_embeddings: Sets the output embeddings of the model.

set_decoder: Sets the decoder of the model.

get_decoder: Returns the decoder of the model.

forward: Constructs the MiniCPM model and computes the language modeling loss.

prepare_inputs_for_generation: Prepares the inputs for text generation.

_reorder_cache: Reorders the cache for beam search.

chat: Generates a response to a given query using the MiniCPM model.

Example
>>> from transformers import AutoTokenizer, MiniCPMForCausalLM
...
>>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
...
>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")
...
>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
    r"""
    This class represents the MiniCPM model for causal language modeling. It is specifically designed for generating
    text based on given input prompts. The model is initialized with a configuration and consists of a MiniCPM model,
    an embedding layer, and a linear layer for predicting the next token in the sequence.

    Attributes:
        model (MiniCPMModel): The underlying MiniCPM model.
        vocab_size (int): The size of the vocabulary.
        lm_head (nn.Linear): The linear layer for predicting the next token.

    Methods:
        __init__: Initializes the MiniCPMForCausalLM model.
        get_input_embeddings: Returns the input embeddings of the model.
        set_input_embeddings: Sets the input embeddings of the model.
        get_output_embeddings: Returns the output embeddings of the model.
        set_output_embeddings: Sets the output embeddings of the model.
        set_decoder: Sets the decoder of the model.
        get_decoder: Returns the decoder of the model.
        forward: Constructs the MiniCPM model and computes the language modeling loss.
        prepare_inputs_for_generation: Prepares the inputs for text generation.
        _reorder_cache: Reorders the cache for beam search.
        chat: Generates a response to a given query using the MiniCPM model.

    Example:
        ```python
        >>> from transformers import AutoTokenizer, MiniCPMForCausalLM
        ...
        >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
        ...
        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
        ...
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```
    """
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        """
        Initializes an instance of the MiniCPMForCausalLM class.

        Args:
            self (MiniCPMForCausalLM): The object instance.
            config: The configuration object containing the model's settings.

        Returns:
            None

        Raises:
            None
        """
        super().__init__(config)
        self.model = MiniCPMModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        This method returns the input embeddings from the MiniCPMForCausalLM model.

        Args:
            self: The instance of the MiniCPMForCausalLM class.

        Returns:
            The input embeddings from the model.

        Raises:
            None.
        """
        return self.model.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        """
        Method to set new input embeddings for the MiniCPMForCausalLM model.

        Args:
            self (MiniCPMForCausalLM): The instance of MiniCPMForCausalLM class.
            new_embeddings (object): The new embeddings to be set for the model.
                Should be compatible with the model's embed_tokens attribute.

        Returns:
            None.

        Raises:
            None.
        """
        self.model.embed_tokens = new_embeddings

    def get_output_embeddings(self):
        """
        Returns the output embeddings of the MiniCPMForCausalLM model.

        Args:
            self (MiniCPMForCausalLM): The instance of the MiniCPMForCausalLM class.

        Returns:
            The 'lm_head' linear layer of the model, which maps hidden states to vocabulary logits.

        Raises:
            None.

        This method retrieves the output embeddings of the MiniCPMForCausalLM model.
        The output embeddings are computed by the 'lm_head' layer of the model.

        Note:
            The 'lm_head' layer is a linear transformation layer that maps the final hidden states of the model to
            the vocabulary size. It is responsible for generating the output probabilities for each token
            in the sequence.

        Example:
            ```python
            >>> model = MiniCPMForCausalLM()
            >>> embeddings = model.get_output_embeddings()
            ```
        """
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """
        Method to set new embeddings for the output layer of the MiniCPMForCausalLM model.

        Args:
            self (MiniCPMForCausalLM): The instance of the MiniCPMForCausalLM class.
                This parameter is used to reference the current instance of the MiniCPMForCausalLM model.
            new_embeddings (any): The new embeddings to be set as the output embeddings.
                This parameter represents the new embeddings that will replace the current output embeddings.
                It can be of any data type.

        Returns:
            None: This method does not return any value. It sets the 'lm_head' attribute of the MiniCPMForCausalLM
                instance to the new_embeddings.

        Raises:
            None.
        """
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """
        This method sets the decoder for the MiniCPMForCausalLM model.

        Args:
            self (object): The instance of the MiniCPMForCausalLM class.
            decoder (object): The decoder object to be set for the model. It should be an instance of a decoder class.

        Returns:
            None.

        Raises:
            None.
        """
        self.model = decoder

    def get_decoder(self):
        """
        Retrieves the decoder model used for the MiniCPMForCausalLM class.

        Args:
            self: An instance of the MiniCPMForCausalLM class.

        Returns:
            The decoder model object.

        Raises:
            None.

        This method returns the decoder model object associated with the MiniCPMForCausalLM instance.
        The decoder model is an essential component of the MiniCPMForCausalLM class and is used for generating
        predictions based on the input data. The decoder model object is returned as the result of this method.
        """
        return self.model

    def forward(
        self,
        input_ids: mindspore.Tensor = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[List[mindspore.Tensor]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:
            `Union[Tuple, CausalLMOutputWithPast]`

        Example:
            ```python
            >>> from transformers import AutoTokenizer, MiniCPMForCausalLM
            ...
            >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
            >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
            ...
            >>> prompt = "Hey, are you conscious? Can you talk to me?"
            >>> inputs = tokenizer(prompt, return_tensors="pt")
            ...
            >>> # Generate
            >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
            >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
            "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
            ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, axis=0)
            logits = [ops.dense(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
            logits = ops.cat(logits, axis=-1)
        else:
            logits = self.lm_head(hidden_states / (self.config.hidden_size / self.config.dim_model_base))
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            # Flatten the tokens
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            loss = ops.cross_entropy(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """
        Prepare inputs for generation.

        Args:
            self (MiniCPMForCausalLM): The instance of the MiniCPMForCausalLM class.
            input_ids (torch.Tensor): The input tensor of token indices. Shape: [batch_size, sequence_length].
            past_key_values (Cache or Tuple[torch.Tensor, torch.Tensor] or None): The past key values used for
                efficient generation. If Cache object or Tuple is provided, it contains the cached key and value
                tensors. If None, no past key values are used.
            attention_mask (torch.Tensor or None): The attention mask tensor to mask padded tokens.
                Shape: [batch_size, sequence_length].
            inputs_embeds (torch.Tensor or None): The tensor of embeddings for input tokens.
                Shape: [batch_size, sequence_length, embedding_dim].

        Returns:
            dict: A dictionary containing the model inputs including either 'input_ids' or 'inputs_embeds',
                'position_ids', 'past_key_values', 'use_cache', and 'attention_mask'.

        Raises:
            TypeError: If the input_ids, past_key_values, attention_mask, or inputs_embeds have invalid types.
            ValueError: If the input_ids and attention_mask shapes are incompatible or
                if cache_length + input_ids.shape[1] > max_cache_length.
        """
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                max_cache_length = past_key_values.get_max_length()
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids = position_ids.masked_fill(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """
        Reorders the past key values based on the provided beam index.

        Args:
            past_key_values (tuple): A tuple containing past key values from the model layers.
            beam_idx (Tensor): A tensor representing the beam index used for reordering.

        Returns:
            tuple: A tuple of reordered past key values based on the provided beam index.

        Raises:
            None
        """
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),
            )
        return reordered_past

    def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
             max_length: int = 4096, num_beams=1, do_sample=True, top_p=0.8, temperature=0.3, logits_processor=None,
             **kwargs):
        """
        Chat method for MiniCPMForCausalLM class.

        This method facilitates a conversation by generating responses based on the given query and history.
        It utilizes a tokenizer to convert text into tokens and a language model to generate responses.

        Args:
            self (MiniCPMForCausalLM): An instance of the MiniCPMForCausalLM class.
            tokenizer: The tokenizer object used to tokenize the input text.
            query (str): The user's query as a string.
            history (List[Dict], optional): A list of dictionaries representing the conversation history.
                Each dictionary contains the role (e.g., 'user' or 'assistant') and the content of the message.
                Defaults to None.
            role (str, optional): The role of the current message. Defaults to 'user'.
            max_length (int, optional): The maximum length of the generated response. Defaults to 4096.
            num_beams (int, optional): The number of beams to be used during generation. Defaults to 1.
            do_sample (bool, optional): Whether to use sampling during generation. Defaults to True.
            top_p (float, optional): The cumulative probability for top-p sampling. Defaults to 0.8.
            temperature (float, optional): The temperature value for generation. Defaults to 0.3.
            logits_processor: An optional logits_processor object to be used during generation. Defaults to None.
            **kwargs: Additional keyword arguments for generation.

        Returns:
            tuple: A tuple containing the generated response (str) and the updated conversation history (List[Dict]).

        Raises:
            None.
        """
        if history is None:
            history = []
        if logits_processor:
            gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
                        "temperature": temperature, "logits_processor": logits_processor, **kwargs}
        else:
            gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
                        "temperature": temperature, "logits_processor": logits_processor, **kwargs}

        history.append({"role": role, "content": query})
        history_str = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=False)
        inputs = tokenizer(history_str, return_tensors='ms')
        outputs = self.generate(**inputs, **gen_kwargs)
        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
        response = tokenizer.decode(outputs)
        pattern = re.compile(r".*?(?=<AI>|<用户>)", re.DOTALL)
        matches = pattern.findall(response)
        if len(matches) > 0:
            response = matches[0]
        history.append({"role": "assistant", "content": response})
        return response, history

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForCausalLM.__init__(config)

Initializes an instance of the MiniCPMForCausalLM class.

PARAMETER DESCRIPTION

self (MiniCPMForCausalLM): The object instance.
config: The configuration object containing the model's settings.

RETURNS DESCRIPTION

None

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def __init__(self, config):
    """
    Initializes an instance of the MiniCPMForCausalLM class.

    Args:
        self (MiniCPMForCausalLM): The object instance.
        config: The configuration object containing the model's settings.

    Returns:
        None

    Raises:
        None
    """
    super().__init__(config)
    self.model = MiniCPMModel(config)
    self.vocab_size = config.vocab_size
    self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForCausalLM.chat(tokenizer, query, history=None, role='user', max_length=4096, num_beams=1, do_sample=True, top_p=0.8, temperature=0.3, logits_processor=None, **kwargs)

Chat method for MiniCPMForCausalLM class.

This method facilitates a conversation by generating responses based on the given query and history. It utilizes a tokenizer to convert text into tokens and a language model to generate responses.

PARAMETER DESCRIPTION

self (MiniCPMForCausalLM): An instance of the MiniCPMForCausalLM class.
tokenizer: The tokenizer object used to tokenize the input text.
query (str): The user's query as a string.
history (List[Dict], default: None): A list of dictionaries representing the conversation history. Each dictionary contains the role (e.g., 'user' or 'assistant') and the content of the message.
role (str, default: 'user'): The role of the current message.
max_length (int, default: 4096): The maximum length of the generated response.
num_beams (int, default: 1): The number of beams to be used during generation.
do_sample (bool, default: True): Whether to use sampling during generation.
top_p (float, default: 0.8): The cumulative probability for top-p sampling.
temperature (float, default: 0.3): The temperature value for generation.
logits_processor (default: None): An optional logits_processor object to be used during generation.
**kwargs (default: {}): Additional keyword arguments for generation.

RETURNS DESCRIPTION

tuple: A tuple containing the generated response (str) and the updated conversation history (List[Dict]).

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
         max_length: int = 4096, num_beams=1, do_sample=True, top_p=0.8, temperature=0.3, logits_processor=None,
         **kwargs):
    """
    Chat method for MiniCPMForCausalLM class.

    This method facilitates a conversation by generating responses based on the given query and history.
    It utilizes a tokenizer to convert text into tokens and a language model to generate responses.

    Args:
        self (MiniCPMForCausalLM): An instance of the MiniCPMForCausalLM class.
        tokenizer: The tokenizer object used to tokenize the input text.
        query (str): The user's query as a string.
        history (List[Dict], optional): A list of dictionaries representing the conversation history.
            Each dictionary contains the role (e.g., 'user' or 'assistant') and the content of the message.
            Defaults to None.
        role (str, optional): The role of the current message. Defaults to 'user'.
        max_length (int, optional): The maximum length of the generated response. Defaults to 4096.
        num_beams (int, optional): The number of beams to be used during generation. Defaults to 1.
        do_sample (bool, optional): Whether to use sampling during generation. Defaults to True.
        top_p (float, optional): The cumulative probability for top-p sampling. Defaults to 0.8.
        temperature (float, optional): The temperature value for generation. Defaults to 0.3.
        logits_processor: An optional logits_processor object to be used during generation. Defaults to None.
        **kwargs: Additional keyword arguments for generation.

    Returns:
        tuple: A tuple containing the generated response (str) and the updated conversation history (List[Dict]).

    Raises:
        None.
    """
    if history is None:
        history = []
    if logits_processor:
        gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
                    "temperature": temperature, "logits_processor": logits_processor, **kwargs}
    else:
        gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
                    "temperature": temperature, "logits_processor": logits_processor, **kwargs}

    history.append({"role": role, "content": query})
    history_str = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=False)
    inputs = tokenizer(history_str, return_tensors='ms')
    outputs = self.generate(**inputs, **gen_kwargs)
    outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
    response = tokenizer.decode(outputs)
    pattern = re.compile(r".*?(?=<AI>|<用户>)", re.DOTALL)
    matches = pattern.findall(response)
    if len(matches) > 0:
        response = matches[0]
    history.append({"role": "assistant", "content": response})
    return response, history
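
As a usage illustration (not part of the library source), the sketch below assumes a published MiniCPM chat checkpoint whose tokenizer ships a chat template; the repository id is a placeholder and the mindnlp import path is assumed.

```python
# Hedged usage sketch: the checkpoint id is a placeholder, swap in a real one.
from mindnlp.transformers import AutoTokenizer, MiniCPMForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openbmb/MiniCPM-2B-dpo-bf16")
model = MiniCPMForCausalLM.from_pretrained("openbmb/MiniCPM-2B-dpo-bf16")

# First turn: history starts as None and is created inside chat().
response, history = model.chat(tokenizer, "Which is the highest mountain in Shandong?")

# Follow-up turn: feed the returned history back in to keep the context.
response, history = model.chat(tokenizer, "Is it higher than Mount Huang?", history=history)
print(response)
```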

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForCausalLM.forward(input_ids=None, attention_mask=None, position_ids=None, past_key_values=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

PARAMETER DESCRIPTION

labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*, default: None): Labels for computing the masked language modeling loss. Indices should either be in [0, ..., config.vocab_size] or -100 (see input_ids docstring). Tokens with indices set to -100 are ignored (masked); the loss is only computed for the tokens with labels in [0, ..., config.vocab_size].

RETURNS DESCRIPTION

Union[Tuple, CausalLMOutputWithPast]

Example
>>> from transformers import AutoTokenizer, MiniCPMForCausalLM
...
>>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
...
>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")
...
>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def forward(
    self,
    input_ids: mindspore.Tensor = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    past_key_values: Optional[List[mindspore.Tensor]] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Args:
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

    Returns:
        `Union[Tuple, CausalLMOutputWithPast]`

    Example:
        ```python
        >>> from transformers import AutoTokenizer, MiniCPMForCausalLM
        ...
        >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
        ...
        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
        ...
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    hidden_states = outputs[0]
    if self.config.pretraining_tp > 1:
        lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, axis=0)
        logits = [ops.dense(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
        logits = ops.cat(logits, axis=-1)
    else:
        logits = self.lm_head(hidden_states / (self.config.hidden_size / self.config.dim_model_base))
    logits = logits.float()

    loss = None
    if labels is not None:
        # Shift so that tokens < n predict n
        shift_logits = logits[..., :-1, :]
        shift_labels = labels[..., 1:]
        # Flatten the tokens
        shift_logits = shift_logits.view(-1, self.config.vocab_size)
        shift_labels = shift_labels.view(-1)
        # Enable model parallelism
        loss = ops.cross_entropy(shift_logits, shift_labels)

    if not return_dict:
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
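
When labels are supplied, forward computes a shifted next-token cross-entropy on top of the logits; as the source above shows, the hidden states are first divided by config.hidden_size / config.dim_model_base before the lm_head projection. The toy sketch below assumes a model and tokenizer have already been loaded as in the earlier example and simply reuses the input ids as labels.

```python
# Hedged sketch: `model` and `tokenizer` are assumed to be loaded already.
inputs = tokenizer("MiniCPM is a small language model.", return_tensors="ms")

# Passing labels triggers the loss computed above:
# logits[..., :-1, :] are scored against labels[..., 1:].
outputs = model(input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                labels=inputs["input_ids"])

print(outputs.loss)          # scalar language-modeling loss
print(outputs.logits.shape)  # (batch_size, sequence_length, vocab_size)
```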

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForCausalLM.get_decoder()

Retrieves the decoder model used for the MiniCPMForCausalLM class.

PARAMETER DESCRIPTION

self: An instance of the MiniCPMForCausalLM class.

RETURNS DESCRIPTION

The decoder model object associated with the MiniCPMForCausalLM instance. The decoder is the underlying MiniCPMModel used to generate predictions from the input data.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def get_decoder(self):
    """
    Retrieves the decoder model used for the MiniCPMForCausalLM class.

    Args:
        self: An instance of the MiniCPMForCausalLM class.

    Returns:
        The decoder model object.

    Raises:
        None.

    This method returns the decoder model object associated with the MiniCPMForCausalLM instance.
    The decoder model is an essential component of the MiniCPMForCausalLM class and is used for generating
    predictions based on the input data. The decoder model object is returned as the result of this method.
    """
    return self.model

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForCausalLM.get_input_embeddings()

This method returns the input embeddings from the MiniCPMForCausalLM model.

PARAMETER DESCRIPTION

self: The instance of the MiniCPMForCausalLM class.

RETURNS DESCRIPTION

The input embeddings from the model.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def get_input_embeddings(self):
    """
    This method returns the input embeddings from the MiniCPMForCausalLM model.

    Args:
        self: The instance of the MiniCPMForCausalLM class.

    Returns:
        The input embeddings from the model.

    Raises:
        None.
    """
    return self.model.embed_tokens

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForCausalLM.get_output_embeddings()

Returns the output embeddings of the MiniCPMForCausalLM model.

PARAMETER DESCRIPTION

self (MiniCPMForCausalLM): The instance of the MiniCPMForCausalLM class.

RETURNS DESCRIPTION

The 'lm_head' layer of the model, which is used as the output embeddings.

Note

The 'lm_head' layer is a linear transformation that maps the final hidden states of the model to the vocabulary size, producing the output logits for each token in the sequence.

Example
>>> model = MiniCPMForCausalLM()
>>> embeddings = model.get_output_embeddings()
Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def get_output_embeddings(self):
    """
    Returns the output embeddings of the MiniCPMForCausalLM model.

    Args:
        self (MiniCPMForCausalLM): The instance of the MiniCPMForCausalLM class.

    Returns:
        The 'lm_head' layer of the model, which is used as the output embeddings.

    Raises:
        None.

    This method retrieves the output embeddings of the MiniCPMForCausalLM model.
    The output embeddings are computed by the 'lm_head' layer of the model.

    Note:
        The 'lm_head' layer is a linear transformation layer that maps the final hidden states of the model to
        the vocabulary size. It is responsible for generating the output probabilities for each token
        in the sequence.

    Example:
        ```python
        >>> model = MiniCPMForCausalLM()
        >>> embeddings = model.get_output_embeddings()
        ```
    """
    return self.lm_head
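
A quick way to see what the getter and setter operate on is to inspect the returned layer; the short sketch below is illustrative and assumes a model built as in the earlier examples.

```python
# Hedged sketch: `model` is assumed to exist (see the earlier examples).
lm_head = model.get_output_embeddings()

# lm_head is the bias-free Linear(hidden_size -> vocab_size) created in __init__,
# so its weight has shape (vocab_size, hidden_size).
print(lm_head.weight.shape)

# set_output_embeddings(new_head) simply rebinds model.lm_head, so any module with a
# compatible weight can be swapped in (e.g. after extending the vocabulary).
model.set_output_embeddings(lm_head)  # no-op rebind, shown only for the call shape
```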

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForCausalLM.prepare_inputs_for_generation(input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs)

Prepare inputs for generation.

PARAMETER DESCRIPTION

self (MiniCPMForCausalLM): The instance of the MiniCPMForCausalLM class.
input_ids (Tensor): The input tensor of token indices. Shape: [batch_size, sequence_length].
past_key_values (Cache or Tuple[Tensor, Tensor] or None, default: None): The past key values used for efficient generation. If a Cache object or tuple is provided, it contains the cached key and value tensors. If None, no past key values are used.
attention_mask (Tensor or None, default: None): The attention mask tensor used to mask padded tokens. Shape: [batch_size, sequence_length].
inputs_embeds (Tensor or None, default: None): The tensor of embeddings for input tokens. Shape: [batch_size, sequence_length, embedding_dim].

RETURNS DESCRIPTION

dict: A dictionary containing the model inputs, including either 'input_ids' or 'inputs_embeds', plus 'position_ids', 'past_key_values', 'use_cache', and 'attention_mask'.

RAISES DESCRIPTION

TypeError: If input_ids, past_key_values, attention_mask, or inputs_embeds have invalid types.
ValueError: If the input_ids and attention_mask shapes are incompatible or if cache_length + input_ids.shape[1] > max_cache_length.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def prepare_inputs_for_generation(
    self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
    """
    Prepare inputs for generation.

    Args:
        self (MiniCPMForCausalLM): The instance of the MiniCPMForCausalLM class.
        input_ids (torch.Tensor): The input tensor of token indices. Shape: [batch_size, sequence_length].
        past_key_values (Cache or Tuple[torch.Tensor, torch.Tensor] or None): The past key values used for
            efficient generation. If Cache object or Tuple is provided, it contains the cached key and value
            tensors. If None, no past key values are used.
        attention_mask (torch.Tensor or None): The attention mask tensor to mask padded tokens.
            Shape: [batch_size, sequence_length].
        inputs_embeds (torch.Tensor or None): The tensor of embeddings for input tokens.
            Shape: [batch_size, sequence_length, embedding_dim].

    Returns:
        dict: A dictionary containing the model inputs including either 'input_ids' or 'inputs_embeds',
            'position_ids', 'past_key_values', 'use_cache', and 'attention_mask'.

    Raises:
        TypeError: If the input_ids, past_key_values, attention_mask, or inputs_embeds have invalid types.
        ValueError: If the input_ids and attention_mask shapes are incompatible or
            if cache_length + input_ids.shape[1] > max_cache_length.
    """
    if past_key_values is not None:
        if isinstance(past_key_values, Cache):
            cache_length = past_key_values.get_seq_length()
            past_length = past_key_values.seen_tokens
            max_cache_length = past_key_values.get_max_length()
        else:
            cache_length = past_length = past_key_values[0][0].shape[2]
            max_cache_length = None

        # Keep only the unprocessed tokens:
        # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
        # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
        # input)
        if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
            input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
        # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
        # input_ids based on the past_length.
        elif past_length < input_ids.shape[1]:
            input_ids = input_ids[:, past_length:]
        # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

        # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
        if (
            max_cache_length is not None
            and attention_mask is not None
            and cache_length + input_ids.shape[1] > max_cache_length
        ):
            attention_mask = attention_mask[:, -max_cache_length:]

    position_ids = kwargs.get("position_ids", None)
    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids = position_ids.masked_fill(attention_mask == 0, 1)
        if past_key_values:
            position_ids = position_ids[:, -input_ids.shape[1] :]

    # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
    if inputs_embeds is not None and past_key_values is None:
        model_inputs = {"inputs_embeds": inputs_embeds}
    else:
        model_inputs = {"input_ids": input_ids}

    model_inputs.update(
        {
            "position_ids": position_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "attention_mask": attention_mask,
        }
    )
    return model_inputs
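
The position_ids logic above (cumulative sum of the attention mask, minus one, with padded positions overwritten) is easy to check on a tiny mask. The sketch below exercises only that fragment and assumes the Tensor methods used in the source (long, cumsum, masked_fill) are available in your MindSpore build.

```python
# Hedged sketch of the position_ids construction used above.
import mindspore
from mindspore import Tensor

# Two sequences, the first left-padded: 0 marks padding, 1 marks real tokens.
attention_mask = Tensor([[0, 0, 1, 1, 1],
                         [1, 1, 1, 1, 1]], mindspore.int32)

position_ids = attention_mask.long().cumsum(-1) - 1
position_ids = position_ids.masked_fill(attention_mask == 0, 1)
print(position_ids)
# [[1 1 0 1 2]
#  [0 1 2 3 4]]
# Real tokens count up from 0; padded slots are parked at 1 and masked out anyway.
```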

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForCausalLM.set_decoder(decoder)

This method sets the decoder for the MiniCPMForCausalLM model.

PARAMETER DESCRIPTION

self (object): The instance of the MiniCPMForCausalLM class.
decoder (object): The decoder object to be set for the model. It should be an instance of a decoder class.

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def set_decoder(self, decoder):
    """
    This method sets the decoder for the MiniCPMForCausalLM model.

    Args:
        self (object): The instance of the MiniCPMForCausalLM class.
        decoder (object): The decoder object to be set for the model. It should be an instance of a decoder class.

    Returns:
        None.

    Raises:
        None.
    """
    self.model = decoder

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForCausalLM.set_input_embeddings(new_embeddings)

Method to set new input embeddings for the MiniCPMForCausalLM model.

PARAMETER DESCRIPTION

self (MiniCPMForCausalLM): The instance of the MiniCPMForCausalLM class.
new_embeddings (object): The new embeddings to be set for the model. Should be compatible with the model's embed_tokens attribute.

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def set_input_embeddings(self, new_embeddings):
    """
    Method to set new input embeddings for the MiniCPMForCausalLM model.

    Args:
        self (MiniCPMForCausalLM): The instance of MiniCPMForCausalLM class.
        new_embeddings (object): The new embeddings to be set for the model.
            Should be compatible with the model's embed_tokens attribute.

    Returns:
        None.

    Raises:
        None.
    """
    self.model.embed_tokens = new_embeddings

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForCausalLM.set_output_embeddings(new_embeddings)

Method to set new embeddings for the output layer of the MiniCPMForCausalLM model.

PARAMETER DESCRIPTION

self (MiniCPMForCausalLM): The instance of the MiniCPMForCausalLM class.
new_embeddings (any): The new embeddings to replace the current output embeddings.

RETURNS DESCRIPTION

None. This method sets the 'lm_head' attribute of the MiniCPMForCausalLM instance to new_embeddings.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def set_output_embeddings(self, new_embeddings):
    """
    Method to set new embeddings for the output layer of the MiniCPMForCausalLM model.

    Args:
        self (MiniCPMForCausalLM): The instance of the MiniCPMForCausalLM class.
            This parameter is used to reference the current instance of the MiniCPMForCausalLM model.
        new_embeddings (any): The new embeddings to be set as the output embeddings.
            This parameter represents the new embeddings that will replace the current output embeddings.
            It can be of any data type.

    Returns:
        None: This method does not return any value. It sets the 'lm_head' attribute of the MiniCPMForCausalLM
            instance to the new_embeddings.

    Raises:
        None.
    """
    self.lm_head = new_embeddings

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForSequenceClassification

Bases: MiniCPMPreTrainedModel

MiniCPMForSequenceClassification is a fine-tuning model for sequence classification tasks based on the MiniCPM architecture. It inherits from MiniCPMPreTrainedModel and provides methods for initializing the model, getting and setting input embeddings, and running the sequence classification forward pass.

ATTRIBUTE DESCRIPTION

num_labels (int): The number of labels for sequence classification.
model (MiniCPMModel): The MiniCPM model used for sequence classification.
score (Linear): The layer for scoring sequence classification logits.

METHOD DESCRIPTION

__init__: Initializes the MiniCPMForSequenceClassification instance with the provided configuration.
get_input_embeddings: Returns the input embeddings from the MiniCPM model.
set_input_embeddings: Sets new input embeddings for the MiniCPM model.
forward: Runs the sequence classification forward pass for the provided inputs.

PARAMETER DESCRIPTION

input_ids (Tensor): The input token IDs for the sequence.
attention_mask (Tensor): The attention mask for the input sequence.
position_ids (Tensor): The position IDs for the input tokens.
past_key_values (List[Tensor]): The past key values for autoregressive decoding.
inputs_embeds (Tensor): The input embeddings for the sequence.
labels (Tensor): The labels for computing the sequence classification/regression loss.
use_cache (bool): Whether to use cache for autoregressive decoding.
output_attentions (bool): Whether to output attentions in the model.
output_hidden_states (bool): Whether to output hidden states in the model.
return_dict (bool): Whether to return the model outputs as a dictionary.

RETURNS DESCRIPTION

Union[Tuple, SequenceClassifierOutputWithPast]: The model outputs, including the loss, logits, past key values, hidden states, and attentions.

RAISES DESCRIPTION

ValueError: If the batch size is greater than 1 and no padding token is defined.

Note

This class inherits from MiniCPMPreTrainedModel and extends its functionality to support sequence classification tasks.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):

    """
    MiniCPMForSequenceClassification is a Python class that represents a fine-tuning model for sequence classification
    tasks based on the MiniCPM architecture. It inherits from the MiniCPMPreTrainedModel class and provides methods for
    initializing the model, getting and setting input embeddings, and forwarding the sequence classification model.

    Attributes:
        num_labels (int): The number of labels for sequence classification.
        model (MiniCPMModel): The MiniCPM model used for sequence classification.
        score (nn.Linear): The layer for scoring sequence classification logits.

    Methods:
        __init__: Initializes the MiniCPMForSequenceClassification instance with the provided configuration.
        get_input_embeddings: Returns the input embeddings from the MiniCPM model.
        set_input_embeddings: Sets new input embeddings for the MiniCPM model.
        forward: Constructs the sequence classification model based on the provided input arguments.

    Args:
        input_ids (mindspore.Tensor, optional): The input token IDs for the sequence.
        attention_mask (mindspore.Tensor, optional): The attention mask for the input sequence.
        position_ids (mindspore.Tensor, optional): The position IDs for the input tokens.
        past_key_values (List[mindspore.Tensor], optional): The past key values for autoregressive decoding.
        inputs_embeds (mindspore.Tensor, optional): The input embeddings for the sequence.
        labels (mindspore.Tensor, optional): The labels for computing the sequence classification/regression loss.
        use_cache (bool, optional): Whether to use cache for autoregressive decoding.
        output_attentions (bool, optional): Whether to output attentions in the model.
        output_hidden_states (bool, optional): Whether to output hidden states in the model.
        return_dict (bool, optional): Whether to return the model outputs as a dictionary.

    Returns:
        Union[Tuple, SequenceClassifierOutputWithPast]: The forwarded model outputs, including the loss, logits,
            past key values, hidden states, and attentions.

    Raises:
        ValueError: If the batch size is greater than 1 and no padding token is defined.

    Note:
        This class inherits from MiniCPMPreTrainedModel and extends its functionality to support sequence
        classification tasks.
    """
    def __init__(self, config):
        """
        Initializes a new instance of the MiniCPMForSequenceClassification class.

        Args:
            self (MiniCPMForSequenceClassification): The current instance of the class.
            config: An instance of the configuration class specifying the model's hyperparameters and settings.

        Returns:
            None

        Raises:
            None.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = MiniCPMModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        Method to retrieve the input embeddings from the model.

        Args:
            self (MiniCPMForSequenceClassification): The instance of the MiniCPMForSequenceClassification class.
                This parameter is used to access the model's embed_tokens attribute.

        Returns:
            The model's embed_tokens layer, which holds the input embeddings.

        Raises:
            None
        """
        return self.model.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        """
        Method to set new input embeddings for the MiniCPMForSequenceClassification model.

        Args:
            self (MiniCPMForSequenceClassification): Instance of the MiniCPMForSequenceClassification class.
            new_embeddings (object): New embeddings to be set for the model.
                Should be compatible with the model's input embedding format.

        Returns:
            None.

        Raises:
            None.
        """
        self.model.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids: mindspore.Tensor = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[List[mindspore.Tensor]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        Args:
            labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
                Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
                config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
                `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = ops.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
            else:
                sequence_lengths = -1

        pooled_logits = logits[ops.arange(batch_size), sequence_lengths]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (mindspore.int64, mindspore.int32):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                if self.num_labels == 1:
                    loss = ops.mse_loss(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = ops.mse_loss(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss = ops.cross_entropy(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss = ops.binary_cross_entropy_with_logits(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForSequenceClassification.__init__(config)

Initializes a new instance of the MiniCPMForSequenceClassification class.

PARAMETER DESCRIPTION

self (MiniCPMForSequenceClassification): The current instance of the class.
config: An instance of the configuration class specifying the model's hyperparameters and settings.

RETURNS DESCRIPTION

None

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def __init__(self, config):
    """
    Initializes a new instance of the MiniCPMForSequenceClassification class.

    Args:
        self (MiniCPMForSequenceClassification): The current instance of the class.
        config: An instance of the configuration class specifying the model's hyperparameters and settings.

    Returns:
        None

    Raises:
        None.
    """
    super().__init__(config)
    self.num_labels = config.num_labels
    self.model = MiniCPMModel(config)
    self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForSequenceClassification.forward(input_ids=None, attention_mask=None, position_ids=None, past_key_values=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

PARAMETER DESCRIPTION

labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*, default: None): Labels for computing the sequence classification/regression loss. Indices should be in [0, ..., config.num_labels - 1]. If config.num_labels == 1, a regression loss is computed (mean-squared error); if config.num_labels > 1, a classification loss is computed (cross-entropy).

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def forward(
    self,
    input_ids: mindspore.Tensor = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    past_key_values: Optional[List[mindspore.Tensor]] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutputWithPast]:
    r"""
    Args:
        labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    transformer_outputs = self.model(
        input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = transformer_outputs[0]
    logits = self.score(hidden_states)

    if input_ids is not None:
        batch_size = input_ids.shape[0]
    else:
        batch_size = inputs_embeds.shape[0]

    if self.config.pad_token_id is None and batch_size != 1:
        raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
    if self.config.pad_token_id is None:
        sequence_lengths = -1
    else:
        if input_ids is not None:
            sequence_lengths = ops.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
        else:
            sequence_lengths = -1

    pooled_logits = logits[ops.arange(batch_size), sequence_lengths]

    loss = None
    if labels is not None:
        if self.config.problem_type is None:
            if self.num_labels == 1:
                self.config.problem_type = "regression"
            elif self.num_labels > 1 and labels.dtype in (mindspore.int64, mindspore.int32):
                self.config.problem_type = "single_label_classification"
            else:
                self.config.problem_type = "multi_label_classification"

        if self.config.problem_type == "regression":
            if self.num_labels == 1:
                loss = ops.mse_loss(pooled_logits.squeeze(), labels.squeeze())
            else:
                loss = ops.mse_loss(pooled_logits, labels)
        elif self.config.problem_type == "single_label_classification":
            loss = ops.cross_entropy(pooled_logits.view(-1, self.num_labels), labels.view(-1))
        elif self.config.problem_type == "multi_label_classification":
            loss = ops.binary_cross_entropy_with_logits(pooled_logits, labels)
    if not return_dict:
        output = (pooled_logits,) + transformer_outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return SequenceClassifierOutputWithPast(
        loss=loss,
        logits=pooled_logits,
        past_key_values=transformer_outputs.past_key_values,
        hidden_states=transformer_outputs.hidden_states,
        attentions=transformer_outputs.attentions,
    )
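
The pooling step above keeps one logit row per example: the position of the last non-padding token, located as (index of the first pad_token_id) - 1. The sketch below exercises just that indexing on right-padded toy tensors; it assumes the ops functions used in the source behave as shown.

```python
# Hedged sketch of the last-token pooling used by MiniCPMForSequenceClassification.
import mindspore
from mindspore import Tensor, ops

pad_token_id = 0
input_ids = Tensor([[5, 6, 7, 0, 0],    # 3 real tokens, then padding
                    [8, 9, 0, 0, 0]],   # 2 real tokens, then padding
                   mindspore.int32)
batch_size, seq_len, num_labels = 2, 5, 3

# Index of the last real token = (first pad position) - 1, exactly as in forward().
# If a row contains no padding, argmax over all zeros returns 0, so the index
# becomes -1, i.e. the final position.
sequence_lengths = ops.eq(input_ids, pad_token_id).int().argmax(-1) - 1
print(sequence_lengths)                  # [2 1]

# One logit vector per example is then gathered from the per-token logits.
logits = ops.ones((batch_size, seq_len, num_labels), mindspore.float32)
pooled_logits = logits[ops.arange(batch_size), sequence_lengths]
print(pooled_logits.shape)               # (2, 3)
```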

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForSequenceClassification.get_input_embeddings()

Method to retrieve the input embeddings from the model.

PARAMETER DESCRIPTION
self

The instance of the MiniCPMForSequenceClassification class. This parameter is used to access the model's embed_tokens attribute.

TYPE: MiniCPMForSequenceClassification

RETURNS DESCRIPTION
nn.Embedding

The model's embed_tokens module, i.e. its input embeddings.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 1608-1622)
def get_input_embeddings(self):
    """
    Method to retrieve the input embeddings from the model.

    Args:
        self (MiniCPMForSequenceClassification): The instance of the MiniCPMForSequenceClassification class.
            This parameter is used to access the model's embed_tokens attribute.

    Returns:
        None: This method returns None as it simply retrieves the input embeddings from the model.

    Raises:
        None
    """
    return self.model.embed_tokens

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMForSequenceClassification.set_input_embeddings(new_embeddings)

Method to set new input embeddings for the MiniCPMForSequenceClassification model.

PARAMETER DESCRIPTION
self

Instance of the MiniCPMForSequenceClassification class.

TYPE: MiniCPMForSequenceClassification

new_embeddings

New embeddings to be set for the model. Should be compatible with the model's input embedding format.

TYPE: object

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 1624-1639)
def set_input_embeddings(self, new_embeddings):
    """
    Method to set new input embeddings for the MiniCPMForSequenceClassification model.

    Args:
        self (MiniCPMForSequenceClassification): Instance of the MiniCPMForSequenceClassification class.
        new_embeddings (object): New embeddings to be set for the model.
            Should be compatible with the model's input embedding format.

    Returns:
        None.

    Raises:
        None.
    """
    self.model.embed_tokens = new_embeddings

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMLinearScalingRotaryEmbedding

Bases: MiniCPMRotaryEmbedding

MiniCPMRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 212-257)
class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
    """MiniCPMRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
    def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0):
        """
        Initializes an instance of MiniCPMLinearScalingRotaryEmbedding.

        Args:
            self: The instance of the class.
            dim (int): The dimension of the embedding.
            max_position_embeddings (int): The maximum number of position embeddings.
            base (int): The base value used in calculations.
            scaling_factor (float): The scaling factor applied to the embeddings.

        Returns:
            None.

        Raises:
            None.
        """
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base)

    def _set_cos_sin_cache(self, seq_len, dtype):
        """
        Sets the cosine and sine cache for the MiniCPMLinearScalingRotaryEmbedding class.

        Args:
            self (MiniCPMLinearScalingRotaryEmbedding): An instance of the MiniCPMLinearScalingRotaryEmbedding class.
            seq_len (int): The length of the sequence for which to set the cache.
            dtype: The desired data type for the cache.

        Returns:
            None.

        Raises:
            None.
        """
        self.max_seq_len_cached = seq_len
        t = ops.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype)
        t = t / self.scaling_factor

        freqs = ops.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = ops.cat((freqs, freqs), axis=-1)
        self.cos_cached = emb.cos().to(dtype)
        self.sin_cached = emb.sin().to(dtype)

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMLinearScalingRotaryEmbedding.__init__(dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0)

Initializes an instance of MiniCPMLinearScalingRotaryEmbedding.

PARAMETER DESCRIPTION
self

The instance of the class.

dim

The dimension of the embedding.

TYPE: int

max_position_embeddings

The maximum number of position embeddings.

TYPE: int DEFAULT: 2048

base

The base value used in calculations.

TYPE: int DEFAULT: 10000

scaling_factor

The scaling factor applied to the embeddings.

TYPE: float DEFAULT: 1.0

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 214-232)
def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0):
    """
    Initializes an instance of MiniCPMLinearScalingRotaryEmbedding.

    Args:
        self: The instance of the class.
        dim (int): The dimension of the embedding.
        max_position_embeddings (int): The maximum number of position embeddings.
        base (int): The base value used in calculations.
        scaling_factor (float): The scaling factor applied to the embeddings.

    Returns:
        None.

    Raises:
        None.
    """
    self.scaling_factor = scaling_factor
    super().__init__(dim, max_position_embeddings, base)

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMMLP

Bases: Module

MiniCPMMLP is a neural network module that implements the gated MLP (Multi-Layer Perceptron) block used in MiniCPM. It inherits from nn.Module and provides methods for initializing the model's parameters and computing the forward pass.

ATTRIBUTE DESCRIPTION
config

A configuration object containing parameters such as hidden_size, intermediate_size, hidden activation function, and pretraining_tp.

hidden_size

The size of the hidden layers in the MLP.

intermediate_size

The size of the intermediate layers in the MLP.

gate_proj

A dense layer for projecting input to intermediate size with no bias.

up_proj

A dense layer for projecting input to intermediate size with no bias.

down_proj

A dense layer for projecting intermediate size to hidden size with no bias.

act_fn

The activation function applied to the hidden layers based on the specified configuration.

METHOD DESCRIPTION
__init__

Initializes the MiniCPMMLP instance with the provided configuration.

forward

Computes the forward pass of the MiniCPMMLP model on the input tensor x. If pretraining_tp > 1, the projections are split into that many segments and the partial results are summed; otherwise the forward pass is computed in a single step.

RETURNS DESCRIPTION
down_proj

The output tensor resulting from the forward pass computation of the MiniCPMMLP model.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 360-443)
class MiniCPMMLP(nn.Module):

    """
    MiniCPMMLP is a neural network model that implements a specific variant of a Multi-Layer Perceptron (MLP)
    architecture for deep learning tasks.
    This class inherits from nn.Module and includes methods for initializing the model's parameters and forwarding
    the forward pass computation.

    Attributes:
        config: A configuration object containing parameters such as hidden_size, intermediate_size,
            hidden activation function, and pretraining_tp.
        hidden_size: The size of the hidden layers in the MLP.
        intermediate_size: The size of the intermediate layers in the MLP.
        gate_proj: A dense layer for projecting input to intermediate size with no bias.
        up_proj: A dense layer for projecting input to intermediate size with no bias.
        down_proj: A dense layer for projecting intermediate size to hidden size with no bias.
        act_fn: The activation function applied to the hidden layers based on the specified configuration.

    Methods:
        __init__: Initializes the MiniCPMMLP instance with the provided configuration.
        forward: Constructs the forward pass computation of the MiniCPMMLP model based on the input tensor x.
            If pretraining_tp > 1, it performs a segmented computation using the specified number of segments.
            Otherwise, it computes the forward pass in a single step.

    Returns:
        down_proj: The output tensor resulting from the forward pass computation of the MiniCPMMLP model.
    """
    def __init__(self, config):
        """
        Initializes a MiniCPMMLP object with the provided configuration.

        Args:
            self (MiniCPMMLP): The MiniCPMMLP object instance.
            config: Configuration object containing parameters for the MiniCPMMLP model.

        Returns:
            None.

        Raises:
            None.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """
        Constructs the intermediate states of the MiniCPMMLP model based on the input tensor x.

        Args:
            self (MiniCPMMLP): An instance of the MiniCPMMLP class.
            x (tensor): The input tensor for forwarding the intermediate states.

        Returns:
            None. The method forwards the intermediate states of the model.

        Raises:
            None.
        """
        if self.config.pretraining_tp > 1:
            slice = self.intermediate_size // self.config.pretraining_tp
            gate_proj_slices = self.gate_proj.weight.split(slice, axis=0)
            up_proj_slices = self.up_proj.weight.split(slice, axis=0)
            down_proj_slices = self.down_proj.weight.split(slice, axis=1)

            gate_proj = ops.cat(
                [ops.dense(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], axis=-1
            )
            up_proj = ops.cat([ops.dense(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], axis=-1)

            intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, axis=2)
            down_proj = [
                ops.dense(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
            ]
            down_proj = sum(down_proj)
        else:
            down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

        return down_proj
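
The pretraining_tp == 1 branch is the standard gated (SwiGLU-style) MLP: the gate and up projections are combined element-wise and projected back down. A NumPy sketch of that computation, assuming a SiLU activation for config.hidden_act (other activations are possible):

import numpy as np

def silu(x):
    return x / (1.0 + np.exp(-x))

hidden_size, intermediate_size = 4, 10
x = np.random.randn(2, 3, hidden_size)             # (batch, seq_len, hidden)

w_gate = np.random.randn(intermediate_size, hidden_size)
w_up = np.random.randn(intermediate_size, hidden_size)
w_down = np.random.randn(hidden_size, intermediate_size)

# down_proj(act_fn(gate_proj(x)) * up_proj(x)), as in the
# pretraining_tp == 1 branch of MiniCPMMLP.forward above.
gate = x @ w_gate.T
up = x @ w_up.T
out = (silu(gate) * up) @ w_down.T
print(out.shape)                                    # (2, 3, 4)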

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMMLP.__init__(config)

Initializes a MiniCPMMLP object with the provided configuration.

PARAMETER DESCRIPTION
self

The MiniCPMMLP object instance.

TYPE: MiniCPMMLP

config

Configuration object containing parameters for the MiniCPMMLP model.

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 387-408)
def __init__(self, config):
    """
    Initializes a MiniCPMMLP object with the provided configuration.

    Args:
        self (MiniCPMMLP): The MiniCPMMLP object instance.
        config: Configuration object containing parameters for the MiniCPMMLP model.

    Returns:
        None.

    Raises:
        None.
    """
    super().__init__()
    self.config = config
    self.hidden_size = config.hidden_size
    self.intermediate_size = config.intermediate_size
    self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
    self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
    self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
    self.act_fn = ACT2FN[config.hidden_act]

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMMLP.forward(x)

Computes the forward pass of the MiniCPMMLP model on the input tensor x.

PARAMETER DESCRIPTION
self

An instance of the MiniCPMMLP class.

TYPE: MiniCPMMLP

x

The input tensor for forwarding the intermediate states.

TYPE: tensor

RETURNS DESCRIPTION

down_proj, the output tensor produced by the gated MLP forward pass.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 410-443)
def forward(self, x):
    """
    Constructs the intermediate states of the MiniCPMMLP model based on the input tensor x.

    Args:
        self (MiniCPMMLP): An instance of the MiniCPMMLP class.
        x (tensor): The input tensor for forwarding the intermediate states.

    Returns:
        None. The method forwards the intermediate states of the model.

    Raises:
        None.
    """
    if self.config.pretraining_tp > 1:
        slice = self.intermediate_size // self.config.pretraining_tp
        gate_proj_slices = self.gate_proj.weight.split(slice, axis=0)
        up_proj_slices = self.up_proj.weight.split(slice, axis=0)
        down_proj_slices = self.down_proj.weight.split(slice, axis=1)

        gate_proj = ops.cat(
            [ops.dense(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], axis=-1
        )
        up_proj = ops.cat([ops.dense(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], axis=-1)

        intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, axis=2)
        down_proj = [
            ops.dense(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
        ]
        down_proj = sum(down_proj)
    else:
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

    return down_proj

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMModel

Bases: MiniCPMPreTrainedModel

Transformer decoder consisting of config.num_hidden_layers layers. Each layer is a [MiniCPMDecoderLayer]

PARAMETER DESCRIPTION
config

MiniCPMConfig

TYPE: MiniCPMConfig

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 911-1117)
class MiniCPMModel(MiniCPMPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]

    Args:
        config: MiniCPMConfig
    """
    def __init__(self, config: MiniCPMConfig):
        """
        Initializes a MiniCPMModel instance with the provided configuration.

        Args:
            self (MiniCPMModel): The instance of MiniCPMModel.
            config (MiniCPMConfig):
                The configuration object containing various settings for the model.

                - config.pad_token_id (int): The token ID used for padding sequences.
                - config.vocab_size (int): The size of the vocabulary.
                - config.hidden_size (int): The dimension of the hidden layers.
                - config.num_hidden_layers (int): The number of hidden layers in the model.
                - config.rms_norm_eps (float): The epsilon value for RMS normalization.

        Returns:
            None.

        Raises:
            ValueError: If the configuration object is missing required attributes.
            TypeError: If the configuration attributes are of incorrect types.
            RuntimeError: If there is an issue during the initialization process.
        """
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [MiniCPMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

        self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        Get the input embeddings for the MiniCPMModel.

        Args:
            self (MiniCPMModel): An instance of the MiniCPMModel class.

        Returns:
            None.

        Raises:
            None.
        """
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        """
        Set the input embeddings for the MiniCPMModel.

        Args:
            self (MiniCPMModel): The instance of the MiniCPMModel class.
            new_embeddings (object): The new embeddings to be set for self.embed_tokens.

        Returns:
            None.

        Raises:
            None.

        This method allows the user to set the input embeddings for the MiniCPMModel by replacing the current embeddings
        with the provided new_embeddings. The new_embeddings can be of any type or format, as long as it is compatible
        with the self.embed_tokens attribute. After calling this method, the MiniCPMModel instance will use the
        new embeddings for further processing.

        Note:
            The new_embeddings should be compatible with the existing self.embed_tokens attribute to ensure proper
            functioning of the MiniCPMModel.
        """
        self.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids: mindspore.Tensor = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[List[mindspore.Tensor]] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """
        Constructs the MiniCPMModel.

        Args:
            self (object): The instance of the MiniCPMModel class.
            input_ids (mindspore.Tensor): The input tensor containing the token IDs. Default is None.
            attention_mask (Optional[mindspore.Tensor]): The attention mask tensor. Default is None.
            position_ids (Optional[mindspore.Tensor]): The tensor containing the position IDs. Default is None.
            past_key_values (Optional[List[mindspore.Tensor]]): List of tensors representing past key values. Default is None.
            inputs_embeds (Optional[mindspore.Tensor]): The tensor containing the embeddings of input tokens. Default is None.
            use_cache (Optional[bool]): Flag indicating whether to use cache. Default is None.
            output_attentions (Optional[bool]): Flag indicating whether to output attentions. Default is None.
            output_hidden_states (Optional[bool]): Flag indicating whether to output hidden states. Default is None.
            return_dict (Optional[bool]): Flag indicating whether to return a dictionary. Default is None.

        Returns:
            Union[Tuple, BaseModelOutputWithPast]:
                A tuple containing the hidden states, next_cache, all_hidden_states, and all_self_attns if not None;
                or a BaseModelOutputWithPast instance containing the last hidden state, past key values, hidden states,
                and attentions.

        Raises:
            ValueError: If both input_ids and inputs_embeds are specified simultaneously, or if neither input_ids nor
                inputs_embeds are specified.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_key_values_length = 0
        if use_cache:
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            position_ids = ops.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=mindspore.int64
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb

        # 4d mask is passed through the layers
        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
        )

        # embed positions
        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
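
When a cache is in use, position_ids for the new tokens continue from the number of tokens already cached, and the causal mask lets each new token attend to all cached positions plus the new positions up to itself. A NumPy sketch of that bookkeeping (illustrative only; the real _prepare_4d_causal_attention_mask helper builds a 4-D additive mask rather than a boolean one):

import numpy as np

past_key_values_length = 4     # tokens already held in the KV cache
seq_length = 3                 # new tokens in this forward call

position_ids = np.arange(past_key_values_length,
                         seq_length + past_key_values_length)[None, :]
print(position_ids)            # [[4 5 6]]

# Boolean stand-in for the causal mask over (new tokens x all tokens).
total_len = past_key_values_length + seq_length
causal = np.tril(np.ones((seq_length, total_len), dtype=bool),
                 k=past_key_values_length)
print(causal.astype(int))
# [[1 1 1 1 1 0 0]
#  [1 1 1 1 1 1 0]
#  [1 1 1 1 1 1 1]]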

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMModel.__init__(config)

Initializes a MiniCPMModel instance with the provided configuration.

PARAMETER DESCRIPTION
self

The instance of MiniCPMModel.

TYPE: MiniCPMModel

config

The configuration object containing various settings for the model.

  • config.pad_token_id (int): The token ID used for padding sequences.
  • config.vocab_size (int): The size of the vocabulary.
  • config.hidden_size (int): The dimension of the hidden layers.
  • config.num_hidden_layers (int): The number of hidden layers in the model.
  • config.rms_norm_eps (float): The epsilon value for RMS normalization.

TYPE: MiniCPMConfig

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If the configuration object is missing required attributes.

TypeError

If the configuration attributes are of incorrect types.

RuntimeError

If there is an issue during the initialization process.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 918-954)
def __init__(self, config: MiniCPMConfig):
    """
    Initializes a MiniCPMModel instance with the provided configuration.

    Args:
        self (MiniCPMModel): The instance of MiniCPMModel.
        config (MiniCPMConfig):
            The configuration object containing various settings for the model.

            - config.pad_token_id (int): The token ID used for padding sequences.
            - config.vocab_size (int): The size of the vocabulary.
            - config.hidden_size (int): The dimension of the hidden layers.
            - config.num_hidden_layers (int): The number of hidden layers in the model.
            - config.rms_norm_eps (float): The epsilon value for RMS normalization.

    Returns:
        None.

    Raises:
        ValueError: If the configuration object is missing required attributes.
        TypeError: If the configuration attributes are of incorrect types.
        RuntimeError: If there is an issue during the initialization process.
    """
    super().__init__(config)
    self.padding_idx = config.pad_token_id
    self.vocab_size = config.vocab_size

    self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
    self.layers = nn.ModuleList(
        [MiniCPMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
    )

    self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    self.gradient_checkpointing = False
    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMModel.forward(input_ids=None, attention_mask=None, position_ids=None, past_key_values=None, inputs_embeds=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

Constructs the MiniCPMModel.

PARAMETER DESCRIPTION
self

The instance of the MiniCPMModel class.

TYPE: object

input_ids

The input tensor containing the token IDs. Default is None.

TYPE: Tensor DEFAULT: None

attention_mask

The attention mask tensor. Default is None.

TYPE: Optional[Tensor] DEFAULT: None

position_ids

The tensor containing the position IDs. Default is None.

TYPE: Optional[Tensor] DEFAULT: None

past_key_values

List of tensors representing past key values. Default is None.

TYPE: Optional[List[Tensor]] DEFAULT: None

inputs_embeds

The tensor containing the embeddings of input tokens. Default is None.

TYPE: Optional[Tensor] DEFAULT: None

use_cache

Flag indicating whether to use cache. Default is None.

TYPE: Optional[bool] DEFAULT: None

output_attentions

Flag indicating whether to output attentions. Default is None.

TYPE: Optional[bool] DEFAULT: None

output_hidden_states

Flag indicating whether to output hidden states. Default is None.

TYPE: Optional[bool] DEFAULT: None

return_dict

Flag indicating whether to return a dictionary. Default is None.

TYPE: Optional[bool] DEFAULT: None

RETURNS DESCRIPTION
Union[Tuple, BaseModelOutputWithPast]

Union[Tuple, BaseModelOutputWithPast]: A tuple containing the hidden states, next_cache, all_hidden_states, and all_self_attns if not None; or a BaseModelOutputWithPast instance containing the last hidden state, past key values, hidden states, and attentions.

RAISES DESCRIPTION
ValueError

If both input_ids and inputs_embeds are specified simultaneously, or if neither input_ids nor inputs_embeds are specified.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 996-1117)
def forward(
    self,
    input_ids: mindspore.Tensor = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    past_key_values: Optional[List[mindspore.Tensor]] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    """
    Constructs the MiniCPMModel.

    Args:
        self (object): The instance of the MiniCPMModel class.
        input_ids (mindspore.Tensor): The input tensor containing the token IDs. Default is None.
        attention_mask (Optional[mindspore.Tensor]): The attention mask tensor. Default is None.
        position_ids (Optional[mindspore.Tensor]): The tensor containing the position IDs. Default is None.
        past_key_values (Optional[List[mindspore.Tensor]]): List of tensors representing past key values. Default is None.
        inputs_embeds (Optional[mindspore.Tensor]): The tensor containing the embeddings of input tokens. Default is None.
        use_cache (Optional[bool]): Flag indicating whether to use cache. Default is None.
        output_attentions (Optional[bool]): Flag indicating whether to output attentions. Default is None.
        output_hidden_states (Optional[bool]): Flag indicating whether to output hidden states. Default is None.
        return_dict (Optional[bool]): Flag indicating whether to return a dictionary. Default is None.

    Returns:
        Union[Tuple, BaseModelOutputWithPast]:
            A tuple containing the hidden states, next_cache, all_hidden_states, and all_self_attns if not None;
            or a BaseModelOutputWithPast instance containing the last hidden state, past key values, hidden states,
            and attentions.

    Raises:
        ValueError: If both input_ids and inputs_embeds are specified simultaneously, or if neither input_ids nor
            inputs_embeds are specified.
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # retrieve input_ids and inputs_embeds
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    if input_ids is not None:
        batch_size, seq_length = input_ids.shape[:2]
    elif inputs_embeds is not None:
        batch_size, seq_length = inputs_embeds.shape[:2]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    past_key_values_length = 0
    if use_cache:
        use_legacy_cache = not isinstance(past_key_values, Cache)
        if use_legacy_cache:
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
        past_key_values_length = past_key_values.get_usable_length(seq_length)

    if position_ids is None:
        position_ids = ops.arange(
            past_key_values_length, seq_length + past_key_values_length, dtype=mindspore.int64
        )
        position_ids = position_ids.unsqueeze(0)

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb

    # 4d mask is passed through the layers
    attention_mask = _prepare_4d_causal_attention_mask(
        attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
    )

    # embed positions
    hidden_states = inputs_embeds

    # decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = None

    for decoder_layer in self.layers:
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        layer_outputs = decoder_layer(
            hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )

        hidden_states = layer_outputs[0]

        if use_cache:
            next_decoder_cache = layer_outputs[2 if output_attentions else 1]

        if output_attentions:
            all_self_attns += (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # add hidden states from the last decoder layer
    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    next_cache = None
    if use_cache:
        next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
    if not return_dict:
        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=next_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
    )

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMModel.get_input_embeddings()

Get the input embeddings for the MiniCPMModel.

PARAMETER DESCRIPTION
self

An instance of the MiniCPMModel class.

TYPE: MiniCPMModel

RETURNS DESCRIPTION

nn.Embedding: The model's embed_tokens module, i.e. its input embeddings.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 956-969)
def get_input_embeddings(self):
    """
    Get the input embeddings for the MiniCPMModel.

    Args:
        self (MiniCPMModel): An instance of the MiniCPMModel class.

    Returns:
        None.

    Raises:
        None.
    """
    return self.embed_tokens

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMModel.set_input_embeddings(new_embeddings)

Set the input embeddings for the MiniCPMModel.

PARAMETER DESCRIPTION
self

The instance of the MiniCPMModel class.

TYPE: MiniCPMModel

new_embeddings

The new embeddings to be set for self.embed_tokens.

TYPE: object

RETURNS DESCRIPTION

None.

This method allows the user to set the input embeddings for the MiniCPMModel by replacing the current embeddings with the provided new_embeddings. The new_embeddings can be of any type or format, as long as it is compatible with the self.embed_tokens attribute. After calling this method, the MiniCPMModel instance will use the new embeddings for further processing.

Note

The new_embeddings should be compatible with the existing self.embed_tokens attribute to ensure proper functioning of the MiniCPMModel.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 971-994)
def set_input_embeddings(self, new_embeddings):
    """
    Set the input embeddings for the MiniCPMModel.

    Args:
        self (MiniCPMModel): The instance of the MiniCPMModel class.
        new_embeddings (object): The new embeddings to be set for self.embed_tokens.

    Returns:
        None.

    Raises:
        None.

    This method allows the user to set the input embeddings for the MiniCPMModel by replacing the current embeddings
    with the provided new_embeddings. The new_embeddings can be of any type or format, as long as it is compatible
    with the self.embed_tokens attribute. After calling this method, the MiniCPMModel instance will use the
    new embeddings for further processing.

    Note:
        The new_embeddings should be compatible with the existing self.embed_tokens attribute to ensure proper
        functioning of the MiniCPMModel.
    """
    self.embed_tokens = new_embeddings

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMPreTrainedModel

Bases: PreTrainedModel

Represents the base class for pre-trained MiniCPM models used across various NLP tasks. This class inherits from PreTrainedModel and provides functionality to initialize weights for different types of cells.

The _init_weights method initializes the weights of the given cell based on the specified configuration. It sets the weights using either a normal distribution with the specified standard deviation or zeros for bias, depending on the type of the cell. For Dense cells, it initializes both weights and biases, while for Embedding cells, it initializes weights with random values and sets a specific padding index to zero if provided.

PARAMETER DESCRIPTION
cell

The cell for which weights need to be initialized.

RETURNS DESCRIPTION

None

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 859-908)
class MiniCPMPreTrainedModel(PreTrainedModel):

    """
    Represents a pre-trained mini version of CPM (Code-PM) model for various NLP tasks.
    This class inherits from PreTrainedModel and provides functionality to initialize weights for different types
    of cells.

    The _init_weights method initializes the weights of the given cell based on the specified configuration.
    It sets the weights using either a normal distribution with the specified standard deviation or zeros for bias,
    depending on the type of the cell. For Dense cells, it initializes both weights and biases, while for Embedding cells,
    it initializes weights with random values and sets a specific padding index to zero if provided.

    Parameters:
        cell: The cell for which weights need to be initialized.

    Returns:
        None
    """
    config_class = MiniCPMConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MiniCPMDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_cache_class = True

    def _init_weights(self, cell):
        """
        Initializes the weights of the given cell.

        Args:
            self (MiniCPMPreTrainedModel): The instance of the MiniCPMPreTrainedModel class.
            cell: The cell whose weights need to be initialized.

        Returns:
            None. This method initializes the weights of the cell in-place.

        Raises:
            None.
        """
        std = self.config.initializer_range
        if isinstance(cell, nn.Linear):
            cell.weight.set_data(initializer(Normal(std), cell.weight.shape, cell.weight.dtype))
            if cell.bias:
                cell.bias.set_data(initializer('zeros', cell.bias.shape, cell.bias.dtype))
        elif isinstance(cell, nn.Embedding):
            weight = np.random.normal(0.0, std, cell.weight.shape)
            if cell.padding_idx:
                weight[cell.padding_idx] = 0

            cell.weight.set_data(Tensor(weight, cell.weight.dtype))
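
The initialization above amounts to sampling Linear and Embedding weights from a normal distribution with standard deviation config.initializer_range, zeroing Linear biases, and zeroing the embedding row at the padding index. A NumPy restatement of that rule (shapes and values are illustrative):

import numpy as np

initializer_range = 0.02
vocab_size, hidden_size, padding_idx = 10, 4, 1

# Linear: Normal(0, initializer_range) weights, zero bias.
linear_weight = np.random.normal(0.0, initializer_range, (hidden_size, hidden_size))
linear_bias = np.zeros(hidden_size)

# Embedding: same distribution, with the padding row forced to zero.
embedding_weight = np.random.normal(0.0, initializer_range, (vocab_size, hidden_size))
embedding_weight[padding_idx] = 0.0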

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMRMSNorm

Bases: Module

MiniCPMRMSNorm is a custom layer normalization module designed to mimic the functionality of T5LayerNorm. It performs RMS-based layer normalization on the input hidden states using the provided weight and epsilon value.

PARAMETER DESCRIPTION
hidden_size

The size of the hidden states being normalized.

TYPE: int

eps

A small value added to the variance to prevent division by zero. Default is 1e-06.

TYPE: float DEFAULT: 1e-06

Inherits From

nn.Module

ATTRIBUTE DESCRIPTION
weight

The weight parameter used for normalization.

TYPE: Parameter

variance_epsilon

The epsilon value added to the variance.

TYPE: float

METHOD DESCRIPTION
__init__

Initializes the MiniCPMRMSNorm instance with the given hidden size and epsilon.

forward

Applies RMS-based layer normalization on the input hidden states using the weight and epsilon.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 68-112)
class MiniCPMRMSNorm(nn.Module):

    """
    MiniCPMRMSNorm is a custom layer normalization module designed to mimic the functionality of T5LayerNorm. 
    It performs RMS-based layer normalization on the input hidden states using the provided weight and epsilon value.

    Parameters:
        hidden_size (int): The size of the hidden states being normalized.
        eps (float, optional): A small value added to the variance to prevent division by zero. Default is 1e-06.

    Inherits From:
        nn.Module

    Attributes:
        weight (Parameter): The weight parameter used for normalization.
        variance_epsilon (float): The epsilon value added to the variance.

    Methods:
        __init__: Initializes the MiniCPMRMSNorm instance with the given hidden size and epsilon.
        forward: Applies RMS-based layer normalization on the input hidden states using the weight and epsilon.
    """
    def __init__(self, hidden_size, eps=1e-6):
        """
        MiniCPMRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = Parameter(ops.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """
        Constructs a MiniCPMRMSNorm object.

        Args:
            self (MiniCPMRMSNorm): The instance of the MiniCPMRMSNorm class.
            hidden_states (tensor): The input hidden states to be normalized.

        Returns:
            None.

        Raises:
            TypeError: If the input hidden_states is not a valid tensor.
            ValueError: If the weight or variance_epsilon attributes are not set in the MiniCPMRMSNorm object.
        """
        return rms_layernorm(hidden_states, self.weight, self.variance_epsilon)
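
As a point of reference for what rms_layernorm computes, RMS normalization scales each feature vector by the inverse root-mean-square of its components (no mean subtraction) and then applies the learned per-channel weight. A NumPy sketch of the standard computation (the library's implementation may differ in dtype handling):

import numpy as np

def rms_layernorm_ref(hidden, weight, eps):
    # Root-mean-square over the last (feature) dimension, computed in float32.
    variance = np.mean(hidden.astype(np.float32) ** 2, axis=-1, keepdims=True)
    return weight * (hidden / np.sqrt(variance + eps))

hidden = np.random.randn(2, 5, 8).astype(np.float32)
weight = np.ones(8, dtype=np.float32)
out = rms_layernorm_ref(hidden, weight, eps=1e-6)
print(out.shape)   # (2, 5, 8)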

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMRMSNorm.__init__(hidden_size, eps=1e-06)

MiniCPMRMSNorm is equivalent to T5LayerNorm

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 89-95)
def __init__(self, hidden_size, eps=1e-6):
    """
    MiniCPMRMSNorm is equivalent to T5LayerNorm
    """
    super().__init__()
    self.weight = Parameter(ops.ones(hidden_size))
    self.variance_epsilon = eps

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMRMSNorm.forward(hidden_states)

Applies RMS layer normalization to the input hidden states.

PARAMETER DESCRIPTION
self

The instance of the MiniCPMRMSNorm class.

TYPE: MiniCPMRMSNorm

hidden_states

The input hidden states to be normalized.

TYPE: tensor

RETURNS DESCRIPTION

mindspore.Tensor: The RMS-normalized hidden states.

RAISES DESCRIPTION
TypeError

If the input hidden_states is not a valid tensor.

ValueError

If the weight or variance_epsilon attributes are not set in the MiniCPMRMSNorm object.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 97-112)
def forward(self, hidden_states):
    """
    Constructs a MiniCPMRMSNorm object.

    Args:
        self (MiniCPMRMSNorm): The instance of the MiniCPMRMSNorm class.
        hidden_states (tensor): The input hidden states to be normalized.

    Returns:
        None.

    Raises:
        TypeError: If the input hidden_states is not a valid tensor.
        ValueError: If the weight or variance_epsilon attributes are not set in the MiniCPMRMSNorm object.
    """
    return rms_layernorm(hidden_states, self.weight, self.variance_epsilon)

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMRotaryEmbedding

Bases: Module

MiniCPMRotaryEmbedding is a class that implements a rotary positional embedding layer for neural networks. It inherits from nn.Module and provides methods for initializing the embedding layer, setting the cosine and sine cache, and computing the embeddings for input data. The class allows for dynamic caching of positional embeddings up to a specified maximum sequence length. The rotary embeddings are computed from the provided dimension, maximum position embeddings, and base value. The constructor initializes the necessary attributes, the _set_cos_sin_cache method precomputes and caches the cosine and sine values, and the forward method returns the cached embeddings for the requested sequence length.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 118-209)
class MiniCPMRotaryEmbedding(nn.Module):

    """
    MiniCPMRotaryEmbedding is a class that represents a rotary positional embedding layer for neural networks.
    It inherits from nn.Module and provides methods for initializing the embedding layer, setting cosine and sine cache,
    and forwarding the embeddings based on input data.
    The class allows for dynamic caching of positional embeddings up to a specified maximum sequence length.
    The rotary embeddings are computed based on the provided dimensions, maximum position embeddings, and base values.
    The forwardor initializes the necessary attributes, while the _set_cos_sin_cache method precomputes and caches
    cosine and sine values for positional embeddings.
    The forward method generates the positional embeddings based on the input data and the specified sequence length.
    """
    def __init__(self, dim, max_position_embeddings=2048, base=10000):
        """
        Initializes a new instance of the MiniCPMRotaryEmbedding class.

        Args:
            self (MiniCPMRotaryEmbedding): The instance of the class.
            dim (int): The dimension of the embedding.
            max_position_embeddings (int, optional): The maximum number of position embeddings. Defaults to 2048.
            base (int, optional): The base value used for calculating the inverse frequency. Defaults to 10000.

        Returns:
            None

        Raises:
            None
        """
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (ops.arange(0, self.dim, 2).float() / self.dim))
        self.inv_freq = inv_freq

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            # seq_len=max_position_embeddings, dtype=torch.get_default_dtype()
            seq_len=max_position_embeddings, dtype=mindspore.float32
        )

    def _set_cos_sin_cache(self, seq_len, dtype):
        """
        Method to calculate and cache the cosine and sine values for rotary embeddings.

        Args:
            self: Instance of MiniCPMRotaryEmbedding class.
            seq_len (int): The length of the sequence for which to calculate the cosine and sine values.
            dtype: Data type to which the cosine and sine values should be converted.

        Returns:
            None: This method does not return any value. It caches the cosine and sine values internally.

        Raises:
            None.
        """
        self.max_seq_len_cached = seq_len
        t = ops.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype)
        freqs = ops.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = ops.cat((freqs, freqs), axis=-1)

        self.cos_cached = emb.cos().to(dtype)
        self.sin_cached = emb.sin().to(dtype)

    def forward(self, x, seq_len=None):
        """
        Construct a rotary embedding for a MiniCPM model.

        Args:
            self (MiniCPMRotaryEmbedding): The instance of the MiniCPMRotaryEmbedding class.
            x (Tensor): The input tensor for which the rotary embedding needs to be forwarded.
            seq_len (int, optional): The length of the sequence. If not provided, the default value is None.
                Defaults to None.

        Returns:
            Tuple[Tensor, Tensor]: A tuple containing two tensors, cosine and sine values of the rotary embedding,
                both of the same dtype as input tensor x.

        Raises:
            ValueError: If seq_len is greater than the maximum sequence length cached in the instance.
            TypeError: If the input dtype is not supported for the cosine and sine caches.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
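
The class precomputes cos/sin tables up to max_position_embeddings and recomputes (and re-caches) them on demand when forward receives a longer sequence. A NumPy sketch of that caching behaviour, with small illustrative sizes:

import numpy as np

dim, base, max_position_embeddings = 8, 10000, 16
inv_freq = 1.0 / (base ** (np.arange(0, dim, 2) / dim))

def build_cache(seq_len):
    t = np.arange(seq_len)
    freqs = np.outer(t, inv_freq)
    emb = np.concatenate((freqs, freqs), axis=-1)
    return np.cos(emb), np.sin(emb)

max_seq_len_cached = max_position_embeddings
cos_cached, sin_cached = build_cache(max_seq_len_cached)

seq_len = 24                                  # longer than the cached 16
if seq_len > max_seq_len_cached:              # same check as forward() above
    max_seq_len_cached = seq_len
    cos_cached, sin_cached = build_cache(seq_len)

cos, sin = cos_cached[:seq_len], sin_cached[:seq_len]
print(cos.shape, sin.shape)                   # (24, 8) (24, 8)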

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMRotaryEmbedding.__init__(dim, max_position_embeddings=2048, base=10000)

Initializes a new instance of the MiniCPMRotaryEmbedding class.

PARAMETER DESCRIPTION
self

The instance of the class.

TYPE: MiniCPMRotaryEmbedding

dim

The dimension of the embedding.

TYPE: int

max_position_embeddings

The maximum number of position embeddings. Defaults to 2048.

TYPE: int DEFAULT: 2048

base

The base value used for calculating the inverse frequency. Defaults to 10000.

TYPE: int DEFAULT: 10000

RETURNS DESCRIPTION

None

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 130-158)
def __init__(self, dim, max_position_embeddings=2048, base=10000):
    """
    Initializes a new instance of the MiniCPMRotaryEmbedding class.

    Args:
        self (MiniCPMRotaryEmbedding): The instance of the class.
        dim (int): The dimension of the embedding.
        max_position_embeddings (int, optional): The maximum number of position embeddings. Defaults to 2048.
        base (int, optional): The base value used for calculating the inverse frequency. Defaults to 10000.

    Returns:
        None

    Raises:
        None
    """
    super().__init__()

    self.dim = dim
    self.max_position_embeddings = max_position_embeddings
    self.base = base
    inv_freq = 1.0 / (self.base ** (ops.arange(0, self.dim, 2).float() / self.dim))
    self.inv_freq = inv_freq

    # Build here to make `torch.jit.trace` work.
    self._set_cos_sin_cache(
        # seq_len=max_position_embeddings, dtype=torch.get_default_dtype()
        seq_len=max_position_embeddings, dtype=mindspore.float32
    )

mindnlp.transformers.models.minicpm.modeling_minicpm.MiniCPMRotaryEmbedding.forward(x, seq_len=None)

Returns the cosine and sine rotary embedding tables for the given input and sequence length.

PARAMETER DESCRIPTION
self

The instance of the MiniCPMRotaryEmbedding class.

TYPE: MiniCPMRotaryEmbedding

x

The input tensor for which the rotary embedding needs to be forwarded.

TYPE: Tensor

seq_len

The length of the sequence. If not provided, the default value is None. Defaults to None.

TYPE: int DEFAULT: None

RETURNS DESCRIPTION

Tuple[Tensor, Tensor]: A tuple containing two tensors, cosine and sine values of the rotary embedding, both of the same dtype as input tensor x.

RAISES DESCRIPTION
ValueError

If seq_len is greater than the maximum sequence length cached in the instance.

TypeError

If the input dtype is not supported for the cosine and sine caches.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 184-209)
def forward(self, x, seq_len=None):
    """
    Construct a rotary embedding for a MiniCPM model.

    Args:
        self (MiniCPMRotaryEmbedding): The instance of the MiniCPMRotaryEmbedding class.
        x (Tensor): The input tensor for which the rotary embedding needs to be forwarded.
        seq_len (int, optional): The length of the sequence. If not provided, the default value is None.
            Defaults to None.

    Returns:
        Tuple[Tensor, Tensor]: A tuple containing two tensors, cosine and sine values of the rotary embedding,
            both of the same dtype as input tensor x.

    Raises:
        ValueError: If seq_len is greater than the maximum sequence length cached in the instance.
        TypeError: If the input dtype is not supported for the cosine and sine caches.
    """
    # x: [bs, num_attention_heads, seq_len, head_size]
    if seq_len > self.max_seq_len_cached:
        self._set_cos_sin_cache(seq_len=seq_len, dtype=x.dtype)

    return (
        self.cos_cached[:seq_len].to(dtype=x.dtype),
        self.sin_cached[:seq_len].to(dtype=x.dtype),
    )

mindnlp.transformers.models.minicpm.modeling_minicpm.apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1)

Applies Rotary Position Embedding to the query and key tensors.

PARAMETER DESCRIPTION
q

The query tensor.

TYPE: `mindspore.Tensor`

k

The key tensor.

TYPE: `mindspore.Tensor`

cos

The cosine part of the rotary embedding.

TYPE: `mindspore.Tensor`

sin

The sine part of the rotary embedding.

TYPE: `mindspore.Tensor`

position_ids

The position indices of the tokens corresponding to the query and key tensors. For example, this can be used to pass offsetted position ids when working with a KV-cache.

TYPE: `mindspore.Tensor`

unsqueeze_dim

The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.

TYPE: `int`, *optional*, defaults to 1 DEFAULT: 1

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py (lines 326-358)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`mindspore.Tensor`): The query tensor.
        k (`mindspore.Tensor`): The key tensor.
        cos (`mindspore.Tensor`): The cosine part of the rotary embedding.
        sin (`mindspore.Tensor`): The sine part of the rotary embedding.
        position_ids (`mindspore.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(mindspore.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    # cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    # sin = sin[position_ids].unsqueeze(unsqueeze_dim)
    # q_embed = (q * cos) + (rotate_half(q) * sin)
    # k_embed = (k * cos) + (rotate_half(k) * sin)
    orig_dtype = k.dtype
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)  # [bs, 1, seq_len, dim]
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)  # [bs, 1, seq_len, dim]
    q_fp32 = q.to(dtype=mindspore.float32)
    k_fp32 = k.to(dtype=mindspore.float32)
    q_embed = (q_fp32 * cos) + (rotate_half(q_fp32) * sin)
    k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
    return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype)
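
To make the broadcasting comment concrete, here is a hedged, self-contained example: the cos/sin caches are built inline with the same generic RoPE recipe as above (an illustration, not this module's cache), and unsqueeze_dim=1 lets the gathered [bs, seq_len, head_dim] caches broadcast against [bs, heads, seq_len, head_dim] queries and keys. The fp16 inputs come back as fp16 even though the rotation itself runs in float32.

import numpy as np
import mindspore
from mindnlp.transformers.models.minicpm.modeling_minicpm import apply_rotary_pos_emb

bs, heads, seq_len, head_dim = 2, 4, 8, 16
q = mindspore.Tensor(np.random.randn(bs, heads, seq_len, head_dim), mindspore.float16)
k = mindspore.Tensor(np.random.randn(bs, heads, seq_len, head_dim), mindspore.float16)

# Illustrative cos/sin caches of shape [max_len, head_dim], indexed by position_ids.
inv_freq = 1.0 / (10000.0 ** (np.arange(0, head_dim, 2, dtype=np.float32) / head_dim))
emb = np.concatenate([np.outer(np.arange(seq_len, dtype=np.float32), inv_freq)] * 2, axis=-1)
cos = mindspore.Tensor(np.cos(emb), mindspore.float32)
sin = mindspore.Tensor(np.sin(emb), mindspore.float32)

# [bs, seq_len] position indices; cos[position_ids] -> [bs, seq_len, head_dim],
# then unsqueeze(1) makes it broadcastable to [bs, heads, seq_len, head_dim].
position_ids = mindspore.Tensor(np.tile(np.arange(seq_len), (bs, 1)), mindspore.int64)

q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1)
print(q_rot.shape, q_rot.dtype)   # (2, 4, 8, 16) Float16 -- the original dtype is restored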

mindnlp.transformers.models.minicpm.modeling_minicpm.repeat_kv(hidden_states, n_rep)

This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def repeat_kv(hidden_states: mindspore.Tensor, n_rep: int) -> mindspore.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
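
A quick shape check of the grouped-query expansion, with arbitrary sizes: two key/value heads shared by eight attention heads give n_rep = 4.

import numpy as np
import mindspore
from mindnlp.transformers.models.minicpm.modeling_minicpm import repeat_kv

# (batch, num_key_value_heads, seq_len, head_dim); sizes are arbitrary.
kv = mindspore.Tensor(np.random.randn(2, 2, 5, 16), mindspore.float32)
print(repeat_kv(kv, 4).shape)   # (2, 8, 5, 16): each KV head repeated 4 times along the head axis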

mindnlp.transformers.models.minicpm.modeling_minicpm.rms_layernorm(hidden, weight, eps)

PARAMETER DESCRIPTION
hidden

The input tensor to be normalized.

TYPE: Tensor

weight

The weight tensor applied to the normalized input.

TYPE: Tensor

eps

A small value added to the variance to avoid division by zero.

TYPE: float

RETURNS DESCRIPTION
mindspore.Tensor

The RMS-normalized 'hidden' scaled by 'weight', with the same shape and dtype as the input.

RAISES DESCRIPTION
ValueError

If the 'hidden' tensor or 'weight' tensor is not of type mindspore.Tensor.

TypeError

If the 'eps' parameter is not of type float.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def rms_layernorm(hidden: mindspore.Tensor, weight: mindspore.Tensor, eps: float):
    """
    Args:
        hidden (mindspore.Tensor): The input tensor to be normalized.
        weight (mindspore.Tensor): The weight tensor applied to the normalized input.
        eps (float): A small value added to the variance to avoid division by zero.

    Returns:
        mindspore.Tensor: The RMS-normalized 'hidden' scaled by 'weight', with the same shape and dtype as the input.

    Raises:
        ValueError: If the 'hidden' tensor or 'weight' tensor is not of type mindspore.Tensor.
        TypeError: If the 'eps' parameter is not of type float.
    """
    old_dtype = hidden.dtype
    variance = hidden.to(mindspore.float32).pow(2).mean(axis=-1, keep_dims=True)
    hidden = (hidden * ops.rsqrt(variance + eps)).to(old_dtype)
    return hidden * weight
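
A minimal usage sketch with arbitrary shapes: the variance is computed in float32 internally, and the output keeps the input's shape and dtype.

import numpy as np
import mindspore
from mindnlp.transformers.models.minicpm.modeling_minicpm import rms_layernorm

hidden = mindspore.Tensor(np.random.randn(2, 5, 8), mindspore.float16)   # (batch, seq, hidden)
weight = mindspore.Tensor(np.ones(8), mindspore.float16)                 # learnable scale, here all ones

out = rms_layernorm(hidden, weight, eps=1e-6)
print(out.shape, out.dtype)   # (2, 5, 8) Float16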

mindnlp.transformers.models.minicpm.modeling_minicpm.rotate_half(x)

Rotates half the hidden dims of the input.

Source code in mindnlp/transformers/models/minicpm/modeling_minicpm.py
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    # x1 = x[..., : x.shape[-1] // 2]
    # x2 = x[..., x.shape[-1] // 2 :]
    x1, x2 = x.tensor_split(2, -1)
    return ops.cat((-x2, x1), axis=-1)
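
On a single 4-dimensional vector the rotation is easy to read off: the second half is negated and moved in front of the first half.

import numpy as np
import mindspore
from mindnlp.transformers.models.minicpm.modeling_minicpm import rotate_half

x = mindspore.Tensor(np.array([[1.0, 2.0, 3.0, 4.0]]), mindspore.float32)
print(rotate_half(x))   # [[-3. -4.  1.  2.]]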

mindnlp.transformers.models.minicpm.configuration_minicpm

MiniCPM model configuration

mindnlp.transformers.models.minicpm.configuration_minicpm.MiniCPMConfig

Bases: PretrainedConfig

This is the configuration class to store the configuration of a [MiniCPMModel]. It is used to instantiate an MiniCPM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the MiniCPM-7B.

Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the documentation from [PretrainedConfig] for more information.

PARAMETER DESCRIPTION
vocab_size

Vocabulary size of the MiniCPM model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling [MiniCPMModel]

TYPE: `int`, *optional*, defaults to 32000 DEFAULT: 32000

hidden_size

Dimension of the hidden representations.

TYPE: `int`, *optional*, defaults to 4096 DEFAULT: 4096

intermediate_size

Dimension of the MLP representations.

TYPE: `int`, *optional*, defaults to 11008 DEFAULT: 11008

num_hidden_layers

Number of hidden layers in the Transformer decoder.

TYPE: `int`, *optional*, defaults to 32 DEFAULT: 32

num_attention_heads

Number of attention heads for each attention layer in the Transformer decoder.

TYPE: `int`, *optional*, defaults to 32 DEFAULT: 32

num_key_value_heads

This is the number of key_value heads that should be used to implement Grouped Query Attention. If num_key_value_heads=num_attention_heads, the model will use Multi Head Attention (MHA); if num_key_value_heads=1, the model will use Multi Query Attention (MQA); otherwise GQA is used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by mean-pooling all the original heads within that group. For more details, check out this paper. If it is not specified, it will default to num_attention_heads.

TYPE: `int`, *optional* DEFAULT: None

hidden_act

The non-linear activation function (function or string) in the decoder.

TYPE: `str` or `function`, *optional*, defaults to `"silu"` DEFAULT: 'silu'

max_position_embeddings

The maximum sequence length that this model might ever be used with.

TYPE: `int`, *optional*, defaults to 2048 DEFAULT: 2048

initializer_range

The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

TYPE: `float`, *optional*, defaults to 0.02 DEFAULT: 0.02

rms_norm_eps

The epsilon used by the rms normalization layers.

TYPE: `float`, *optional*, defaults to 1e-06 DEFAULT: 1e-06

use_cache

Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

pad_token_id

Padding token id.

TYPE: `int`, *optional* DEFAULT: None

bos_token_id

Beginning of stream token id.

TYPE: `int`, *optional*, defaults to 1 DEFAULT: 1

eos_token_id

End of stream token id.

TYPE: `int`, *optional*, defaults to 2 DEFAULT: 2

pretraining_tp

Experimental feature. Tensor parallelism rank used during pretraining. Please refer to this document to understand more about it. This value is necessary to ensure exact reproducibility of the pretraining results. Please refer to this issue.

TYPE: `int`, *optional*, defaults to 1 DEFAULT: 1

tie_word_embeddings

Whether to tie the input and output word embeddings.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

rope_theta

The base period of the RoPE embeddings.

TYPE: `float`, *optional*, defaults to 10000.0 DEFAULT: 10000.0

rope_scaling

Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is {"type": strategy name, "factor": scaling factor}. When using this flag, don't update max_position_embeddings to the expected new maximum. See the following thread for more information on how these scaling strategies behave: https://www.reddit.com/r/LocalMiniCPM/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an experimental feature, subject to breaking API changes in future versions.

TYPE: `Dict`, *optional* DEFAULT: None

attention_bias

Whether to use a bias in the query, key, value and output projection layers during self-attention.

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

attention_dropout

The dropout ratio for the attention probabilities.

TYPE: `float`, *optional*, defaults to 0.0 DEFAULT: 0.0

Example
>>> from transformers import MiniCPMModel, MiniCPMConfig
...
>>> # Initializing a MiniCPM minicpm-7b style configuration
>>> configuration = MiniCPMConfig()
...
>>> # Initializing a model from the minicpm-7b style configuration
>>> model = MiniCPMModel(configuration)
...
>>> # Accessing the model configuration
>>> configuration = model.config
Source code in mindnlp/transformers/models/minicpm/configuration_minicpm.py
class MiniCPMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MiniCPMModel`]. It is used to instantiate an MiniCPM
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the MiniCPM-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the MiniCPM model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MiniCPMModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by mean-pooling all the original heads within that group. For more details, check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://hf-mirror.com/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie the input and output word embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
            these scaling strategies behave:
            https://www.reddit.com/r/LocalMiniCPM/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
            experimental feature, subject to breaking API changes in future versions.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    Example:
        ```python
        >>> from transformers import MiniCPMModel, MiniCPMConfig
        ...
        >>> # Initializing a MiniCPM minicpm-7b style configuration
        >>> configuration = MiniCPMConfig()
        ...
        >>> # Initializing a model from the minicpm-7b style configuration
        >>> model = MiniCPMModel(configuration)
        ...
        >>> # Accessing the model configuration
        >>> configuration = model.config
        ```
    """
    model_type = "minicpm"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        scale_emb=1,
        dim_model_base=1,
        scale_depth=1,
        **kwargs,
    ):
        """
        Initializes an instance of the MiniCPMConfig class.

        Args:
            self: The instance of the MiniCPMConfig class.
            vocab_size (int, optional): The size of the vocabulary. Defaults to 32000.
            hidden_size (int, optional): The size of the hidden layers. Defaults to 4096.
            intermediate_size (int, optional): The size of the intermediate layers. Defaults to 11008.
            num_hidden_layers (int, optional): The number of hidden layers. Defaults to 32.
            num_attention_heads (int, optional): The number of attention heads. Defaults to 32.
            num_key_value_heads (int, optional): The number of key-value heads. Defaults to None.
                If not provided, it will default to the value of num_attention_heads.
            hidden_act (str, optional): The activation function for the hidden layers. Defaults to 'silu'.
            max_position_embeddings (int, optional): The maximum number of position embeddings. Defaults to 2048.
            initializer_range (float, optional): The range for initializer values. Defaults to 0.02.
            rms_norm_eps (float, optional): The epsilon value for RMS normalization. Defaults to 1e-06.
            use_cache (bool, optional): Flag to indicate whether to use cache or not. Defaults to True.
            pad_token_id (int, optional): The ID of the padding token. Defaults to None.
            bos_token_id (int, optional): The ID of the beginning-of-sentence token. Defaults to 1.
            eos_token_id (int, optional): The ID of the end-of-sentence token. Defaults to 2.
            pretraining_tp (int, optional): The pretraining TP value. Defaults to 1.
            tie_word_embeddings (bool, optional): Flag to indicate whether to tie word embeddings or not. Defaults to True.
            rope_theta (float, optional): The base period of the RoPE embeddings. Defaults to 10000.0.
            rope_scaling (dict or None, optional): Dictionary with the RoPE scaling configuration,
                e.g. {"type": "linear", "factor": 2.0}. Defaults to None.
            attention_bias (bool, optional): Flag to indicate whether to use attention bias or not. Defaults to False.
            attention_dropout (float, optional): The dropout rate for attention layers. Defaults to 0.0.
            scale_emb (int, optional): The scaling factor for embeddings. Defaults to 1.
            dim_model_base (int, optional): The base dimension for the model. Defaults to 1.
            scale_depth (int, optional): The scaling factor for the depth of the model. Defaults to 1.

        Returns:
            None.

        Raises:
            None.
        """
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.scale_emb = scale_emb
        self.dim_model_base = dim_model_base
        self.scale_depth = scale_depth

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
                f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
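
A short sketch of what _rope_scaling_validation accepts and rejects; the dictionaries below are illustrative settings, not recommendations.

from mindnlp.transformers.models.minicpm.configuration_minicpm import MiniCPMConfig

# Accepted: exactly two fields, a supported type ('linear' or 'dynamic') and a float factor > 1.
config = MiniCPMConfig(rope_scaling={"type": "dynamic", "factor": 2.0})

# Each of these would raise ValueError during validation:
# MiniCPMConfig(rope_scaling={"type": "ntk", "factor": 2.0})    # unsupported scaling type
# MiniCPMConfig(rope_scaling={"type": "linear", "factor": 1})   # factor must be a float strictly greater than 1
# MiniCPMConfig(rope_scaling={"factor": 2.0})                   # both 'type' and 'factor' are required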

mindnlp.transformers.models.minicpm.configuration_minicpm.MiniCPMConfig.__init__(vocab_size=32000, hidden_size=4096, intermediate_size=11008, num_hidden_layers=32, num_attention_heads=32, num_key_value_heads=None, hidden_act='silu', max_position_embeddings=2048, initializer_range=0.02, rms_norm_eps=1e-06, use_cache=True, pad_token_id=None, bos_token_id=1, eos_token_id=2, pretraining_tp=1, tie_word_embeddings=True, rope_theta=10000.0, rope_scaling=None, attention_bias=False, attention_dropout=0.0, scale_emb=1, dim_model_base=1, scale_depth=1, **kwargs)

Initializes an instance of the MiniCPMConfig class.

PARAMETER DESCRIPTION
self

The instance of the MiniCPMConfig class.

vocab_size

The size of the vocabulary. Defaults to 32000.

TYPE: int DEFAULT: 32000

hidden_size

The size of the hidden layers. Defaults to 4096.

TYPE: int DEFAULT: 4096

intermediate_size

The size of the intermediate layers. Defaults to 11008.

TYPE: int DEFAULT: 11008

num_hidden_layers

The number of hidden layers. Defaults to 32.

TYPE: int DEFAULT: 32

num_attention_heads

The number of attention heads. Defaults to 32.

TYPE: int DEFAULT: 32

num_key_value_heads

The number of key-value heads. Defaults to None. If not provided, it will default to the value of num_attention_heads.

TYPE: int DEFAULT: None

hidden_act

The activation function for the hidden layers. Defaults to 'silu'.

TYPE: str DEFAULT: 'silu'

max_position_embeddings

The maximum number of position embeddings. Defaults to 2048.

TYPE: int DEFAULT: 2048

initializer_range

The range for initializer values. Defaults to 0.02.

TYPE: float DEFAULT: 0.02

rms_norm_eps

The epsilon value for RMS normalization. Defaults to 1e-06.

TYPE: float DEFAULT: 1e-06

use_cache

Flag to indicate whether to use cache or not. Defaults to True.

TYPE: bool DEFAULT: True

pad_token_id

The ID of the padding token. Defaults to None.

TYPE: int DEFAULT: None

bos_token_id

The ID of the beginning-of-sentence token. Defaults to 1.

TYPE: int DEFAULT: 1

eos_token_id

The ID of the end-of-sentence token. Defaults to 2.

TYPE: int DEFAULT: 2

pretraining_tp

The pretraining TP value. Defaults to 1.

TYPE: int DEFAULT: 1

tie_word_embeddings

Flag to indicate whether to tie word embeddings or not. Defaults to True.

TYPE: bool DEFAULT: True

rope_theta

The base period of the RoPE embeddings. Defaults to 10000.0.

TYPE: float DEFAULT: 10000.0

rope_scaling

Dictionary with the RoPE scaling configuration, e.g. {"type": "linear", "factor": 2.0}. Defaults to None.

TYPE: dict or None DEFAULT: None

attention_bias

Flag to indicate whether to use attention bias or not. Defaults to False.

TYPE: bool DEFAULT: False

attention_dropout

The dropout rate for attention layers. Defaults to 0.0.

TYPE: float DEFAULT: 0.0

scale_emb

The scaling factor for embeddings. Defaults to 1.

TYPE: int DEFAULT: 1

dim_model_base

The base dimension for the model. Defaults to 1.

TYPE: int DEFAULT: 1

scale_depth

The scaling factor for the depth of the model. Defaults to 1.

TYPE: int DEFAULT: 1

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/minicpm/configuration_minicpm.py
def __init__(
    self,
    vocab_size=32000,
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=None,
    hidden_act="silu",
    max_position_embeddings=2048,
    initializer_range=0.02,
    rms_norm_eps=1e-6,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=1,
    eos_token_id=2,
    pretraining_tp=1,
    tie_word_embeddings=True,
    rope_theta=10000.0,
    rope_scaling=None,
    attention_bias=False,
    attention_dropout=0.0,
    scale_emb=1,
    dim_model_base=1,
    scale_depth=1,
    **kwargs,
):
    """
    Initializes an instance of the MiniCPMConfig class.

    Args:
        self: The instance of the MiniCPMConfig class.
        vocab_size (int, optional): The size of the vocabulary. Defaults to 32000.
        hidden_size (int, optional): The size of the hidden layers. Defaults to 4096.
        intermediate_size (int, optional): The size of the intermediate layers. Defaults to 11008.
        num_hidden_layers (int, optional): The number of hidden layers. Defaults to 32.
        num_attention_heads (int, optional): The number of attention heads. Defaults to 32.
        num_key_value_heads (int, optional): The number of key-value heads. Defaults to None.
            If not provided, it will default to the value of num_attention_heads.
        hidden_act (str, optional): The activation function for the hidden layers. Defaults to 'silu'.
        max_position_embeddings (int, optional): The maximum number of position embeddings. Defaults to 2048.
        initializer_range (float, optional): The range for initializer values. Defaults to 0.02.
        rms_norm_eps (float, optional): The epsilon value for RMS normalization. Defaults to 1e-06.
        use_cache (bool, optional): Flag to indicate whether to use cache or not. Defaults to True.
        pad_token_id (int, optional): The ID of the padding token. Defaults to None.
        bos_token_id (int, optional): The ID of the beginning-of-sentence token. Defaults to 1.
        eos_token_id (int, optional): The ID of the end-of-sentence token. Defaults to 2.
        pretraining_tp (int, optional): The pretraining TP value. Defaults to 1.
        tie_word_embeddings (bool, optional): Flag to indicate whether to tie word embeddings or not. Defaults to True.
        rope_theta (float, optional): The base period of the RoPE embeddings. Defaults to 10000.0.
        rope_scaling (dict or None, optional): Dictionary with the RoPE scaling configuration,
            e.g. {"type": "linear", "factor": 2.0}. Defaults to None.
        attention_bias (bool, optional): Flag to indicate whether to use attention bias or not. Defaults to False.
        attention_dropout (float, optional): The dropout rate for attention layers. Defaults to 0.0.
        scale_emb (int, optional): The scaling factor for embeddings. Defaults to 1.
        dim_model_base (int, optional): The base dimension for the model. Defaults to 1.
        scale_depth (int, optional): The scaling factor for the depth of the model. Defaults to 1.

    Returns:
        None.

    Raises:
        None.
    """
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads

    # for backward compatibility
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    self.num_key_value_heads = num_key_value_heads
    self.hidden_act = hidden_act
    self.initializer_range = initializer_range
    self.rms_norm_eps = rms_norm_eps
    self.pretraining_tp = pretraining_tp
    self.use_cache = use_cache
    self.rope_theta = rope_theta
    self.rope_scaling = rope_scaling
    self._rope_scaling_validation()
    self.attention_bias = attention_bias
    self.attention_dropout = attention_dropout
    self.scale_emb = scale_emb
    self.dim_model_base = dim_model_base
    self.scale_depth = scale_depth

    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        tie_word_embeddings=tie_word_embeddings,
        **kwargs,
    )
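
Finally, a hedged example of a non-default configuration that exercises grouped-query attention and the MiniCPM-specific scaling knobs; the values are illustrative and do not correspond to a released checkpoint.

from mindnlp.transformers.models.minicpm.configuration_minicpm import MiniCPMConfig

config = MiniCPMConfig(
    hidden_size=2304,
    intermediate_size=5760,
    num_hidden_layers=8,
    num_attention_heads=32,
    num_key_value_heads=8,    # 8 KV heads shared by 32 query heads (GQA)
    scale_emb=12,
    dim_model_base=256,
    scale_depth=1.4,
)
print(config.num_key_value_heads, config.tie_word_embeddings)   # 8 True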