Add YaRN and Dynamic-YaRN RoPE Scaling Methods #30910

Open · wants to merge 1 commit into base: main

Conversation

mig-mfreitas

What does this PR do?

YaRN (Yet another RoPE extension method) combines the NTK-By-Parts Interpolation and Attention Scaling methods, improving upon existing RoPE interpolation methods for longer context window sizes.

Fine-tuned models maintain their original performance across benchmarks while enabling efficient extrapolation and transfer learning for quicker convergence, especially in compute-limited environments.

We implement YaRN and Dynamic-YaRN for the following list of models:

  • LLaMA
  • Falcon
  • GPT-NeoX
  • Olmo
  • Persimmon
  • Phi
  • StableLM
  • OpenLLaMA

New unit tests are added to assert YaRN's correct behavior on both short and long sequence inputs.

For more details, please refer to https://arxiv.org/abs/2309.00071.
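
For a quick intuition of the method, here is a minimal, self-contained sketch of the two ingredients mentioned above (NTK-by-parts interpolation of the inverse frequencies plus attention scaling). The function name and default values are illustrative only and are not the code added by this PR:

```python
import math

import torch


def yarn_inv_freq_and_scale(dim=128, base=10000.0, scaling_factor=16.0,
                            original_max_position_embeddings=2048,
                            beta_fast=32.0, beta_slow=1.0):
    pos_freqs = base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
    extrapolated = 1.0 / pos_freqs                     # original RoPE frequencies
    interpolated = 1.0 / (scaling_factor * pos_freqs)  # position-interpolated frequencies

    # Dimension index at which a given number of rotations fits into the original
    # context window: dimensions rotating faster than beta_fast keep the original
    # frequencies (extrapolation), dimensions rotating slower than beta_slow are
    # interpolated, and a linear ramp blends the region in between.
    def correction_dim(num_rotations):
        return (dim * math.log(original_max_position_embeddings / (num_rotations * 2 * math.pi))) / (
            2 * math.log(base)
        )

    low = math.floor(correction_dim(beta_fast))
    high = math.ceil(correction_dim(beta_slow))
    ramp = torch.clamp(
        (torch.arange(dim // 2, dtype=torch.float32) - low) / max(high - low, 1e-3), 0.0, 1.0
    )
    inv_freq = interpolated * ramp + extrapolated * (1.0 - ramp)

    # Attention scaling ("mscale") applied to the cos/sin caches, as in the YaRN paper.
    attention_scale = 0.1 * math.log(scaling_factor) + 1.0
    return inv_freq, attention_scale
```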

Before submitting

  • This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
  • Did you read the contributor guideline, Pull Request section?
  • Was this discussed/approved via a GitHub issue or the forum? Please add a link to it if that's the case.
  • Did you make sure to update the documentation with your changes? Here are the documentation guidelines, and here are tips on formatting docstrings.
  • Did you write any new necessary tests?

Who can review?

@gante

Co-authored-by: Miguel Almeida <miguel.pessanha.almeida@tecnico.ulisboa.pt>
@amyeroberts
Collaborator

cc @ArthurZucker too

@ArthurZucker
Collaborator

Hey! Thanks a lot for taking the time to implement this! 🤗
cc @gante: with this and Phi-3's new method, I'm not sure it makes sense to add all of them in the modeling code, WDYT?

This is not specific to Llama, so we might need some modularity for this!

@miguelm-almeida
Contributor

🤗

@gante gante left a comment (Member)

Hello Miguel and Miguel! 👋 (@miguelm-almeida @mig-mfreitas)

I have a couple of requests regarding user experience and recent changes in our repo. My number one suggestion would be to delete the diff in all models except Llama, and leave the model copies for another PR. It's much faster for everyone (you and me) to iterate on a single model, and then copy the design when we're happy 🤗 In this particular case (RoPE models), we also have different implementations that we need to iron out on our end before adding YaRN there.

⚠️ Please treat all my comments as if they were made on Llama, and not on Falcon. Some of the suggested changes only work on architectures that are up to date, like Llama (and unlike Falcon).

Finally: one of the goals of this PR should be to be able to load the original YaRN models using transformers. Currently, there are some models on the Hub that have custom code (e.g. https://huggingface.co/NousResearch/Yarn-Llama-2-7b-64k). At the moment, these models require passing trust_remote_code=True to from_pretrained (which loads the custom code in the repo). With this PR, we remove the need for that flag and use the code in transformers instead :)
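
For concreteness, a minimal sketch of the loading flow being described here, using the NousResearch checkpoint linked above:

```python
from transformers import AutoModelForCausalLM

# Today: the YaRN checkpoints on the Hub ship their own modeling code, so loading
# them requires opting into remote code execution.
model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Yarn-Llama-2-7b-64k", trust_remote_code=True
)

# Goal of this PR: the same checkpoint loads with the in-repo YaRN implementation,
# with no trust_remote_code flag needed.
model = AutoModelForCausalLM.from_pretrained("NousResearch/Yarn-Llama-2-7b-64k")
```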

Ping me if you have further questions (and feel free to ping me by email if I'm taking too long to reply) 🤗

@@ -66,13 +66,31 @@ class OpenLlamaConfig(PretrainedConfig):
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports four scaling

models in the deprecated folder should not be updated :) (let's remove the changes to open_llama)

`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
experimental feature, subject to breaking API changes in future versions.
yarn_rope_scaling (`Dict`, *optional*, defaults to `{'original_max_position_embeddings': 2048, 'extrapolation_factor': 1.0, 'attention_factor': 1.0, 'beta_fast': 32.0, 'beta_slow': 1.0, 'finetuned': False}`):

Two comments:

  1. The contents of yarn_rope_scaling should be part of rope_scaling.

A single config dict for everything related to RoPE scaling is preferable, so we can easily upgrade it to a standalone config class in a future PR :)

It would also allow loading existing models by the original authors, e.g. https://huggingface.co/NousResearch/Yarn-Llama-2-7b-64k (have a look at their custom config code and the model's config file -- both assume all RoPE scaling params are in rope_scaling).


  2. We have found through experience that the best default in config files is no default :) That way, we (Hugging Face):
    a) don't have to push changes to repositories on the Hub in case we find bugs;
    b) can easily distinguish defaults from user-defined values that happen to be equal to the default.

If point 1 is addressed, then no change is needed to the existing default (None). Defaults in the classes and the validation code are very helpful, though!
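
For illustration, one possible shape of the merged dict; the key names follow the yarn_rope_scaling default shown in the diff above, and the factor value is made up:

```python
# Hypothetical merged `rope_scaling` entry in config.json, folding the YaRN-specific
# fields into the existing structure.
rope_scaling = {
    "type": "yarn",
    "factor": 32.0,  # illustrative scaling factor
    "original_max_position_embeddings": 2048,
    "attention_factor": 1.0,
    "beta_fast": 32.0,
    "beta_slow": 1.0,
}
```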

@@ -201,3 +229,55 @@ def _rope_scaling_validation(self):
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")

# Copied from transformers.models.llama.configuration_llama.LlamaConfig._yarn_rope_scaling_validation
def _yarn_rope_scaling_validation(self):

Likewise, the contents of this function should be moved into _rope_scaling_validation, and the flags should only be checked if the RoPE scaling method is a YaRN one and the flags exist in the dictionary.
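
A rough sketch of what the merged validation could look like, assuming the combined rope_scaling dict from the comment above (the accepted type names and key set are illustrative, not final):

```python
def _rope_scaling_validation(self):
    # Validate the shared fields first, then the YaRN-specific ones, and only
    # when a YaRN strategy is selected and the key is actually present.
    if self.rope_scaling is None:
        return

    rope_type = self.rope_scaling.get("type")
    factor = self.rope_scaling.get("factor")
    if factor is None or not isinstance(factor, float) or factor <= 1.0:
        raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {factor}")

    if rope_type in ("yarn", "dynamic-yarn"):
        for key in ("attention_factor", "beta_fast", "beta_slow"):
            value = self.rope_scaling.get(key)
            if value is not None and not isinstance(value, (int, float)):
                raise ValueError(f"`rope_scaling`'s {key} field must be a number, got {value}")
```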

@@ -162,12 +188,14 @@ def __init__(
self.max_position_embeddings = max_position_embeddings
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.yarn_rope_scaling = yarn_rope_scaling

Suggested change
self.yarn_rope_scaling = yarn_rope_scaling

(as per the comment above)

Comment on lines +94 to +95
extrapolation_factor (`float`, defaults to 1):
Factor to adjust the n-dimensional rotational scaling for extrapolation.

I see this parameter (extrapolation_factor) is in the original implementation. However, if we dig further, we can see that it is not used in practice (unless I'm missing something -- feel free to correct me!):

  1. The default value of 1.0 does not change the computation;
  2. There are no references to it in the YaRN paper;
  3. I couldn't find any YaRN model on the Hub that has set this parameter in config.json, meaning the default of 1.0 is always used;
  4. All references in the original repo use the default value;
  5. In an older PR, the author writes "extrapolation_factor and ntk_factor are used for validation purposes, and should not be changed unless it is necessary."

As such, I believe we can:

  1. delete this variable from the config
  2. delete all related code :)

return 1.0
return 0.1 * math.log(scaling_factor) + 1.0

def forward(self, x, seq_len=None):
@gante gante May 27, 2024 (Member)

See the pattern we have in LlamaRotaryEmbedding.forward -- the pattern was changed a few minor versions ago from the one you have here, where sin and cos are cached, to a different one. The new pattern is faster and is compatible with torch.compile.

From a quick glance: I think you may be able to call super().forward and simply apply * self.mscale on the results
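
If that works out, the override could be as small as the sketch below; the class name, the mscale attribute, and the parent forward signature (returning the cos/sin pair) are assumptions on my part, not the final API:

```python
from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding


class LlamaYarnRotaryEmbedding(LlamaRotaryEmbedding):
    # Sketch of the suggested pattern: reuse the parent rotary embedding and
    # rescale its outputs by the YaRN attention factor ("mscale").
    def __init__(self, *args, mscale=1.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.mscale = mscale

    def forward(self, x, position_ids):
        cos, sin = super().forward(x, position_ids)
        return cos * self.mscale, sin * self.mscale
```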

Parameter to set the boundary for extrapolation (only) in the linear ramp function.
beta_slow (`float`, *optional*, defaults to 1):
Parameter to set the boundary for interpolation (only) in the linear ramp function.
finetuned (`bool`, *optional*, defaults to `False`):

This can also be removed (see the comments on the new dynamic class)

self._sin_cached[:seq_len, ...].to(dtype=x.dtype),
)

def yarn(self, device):

Suggested change
def yarn(self, device):
def compute_yarn_scaling(self, device):

(Or a similar name. Let's use descriptive function names :) )

device,
)

if finetuned:

Suggested change
if finetuned:
if self.max_position_embeddings != self.original_max_position_embeddings:

This should be true for fine-tuned models, saving us a flag :)

Comment on lines +526 to +529
with self.assertRaises(AssertionError):
torch.testing.assert_close(yarn_cos_long, original_cos_long)
with self.assertRaises(AssertionError):
torch.testing.assert_close(yarn_sin_long, original_sin_long)

let's also check that yarn_sin/cos_short != original_sin/cos_short (i.e. that applying YaRN should change all values)
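
Something along these lines, mirroring the existing assertions; the *_short variable names are assumed to follow the same pattern as the *_long ones in the test above:

```python
# Additional assertions being requested: YaRN should also change the short-sequence values.
with self.assertRaises(AssertionError):
    torch.testing.assert_close(yarn_cos_short, original_cos_short)
with self.assertRaises(AssertionError):
    torch.testing.assert_close(yarn_sin_short, original_sin_short)
```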

@miguelm-almeida
Contributor

Thank you very much for the in-depth review and suggestions! We'll iterate on it and get back to you shortly 🤗
