diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..daf9bcb4c23d1fc7d5601d3c3bdf8e681dc46c00
--- /dev/null
+++ b/README.md
@@ -0,0 +1,24 @@
+---
+license: apache-2.0
+language:
+- en
+- es
+- fr
+- de
+- it
+- pt
+- ru
+- ar
+- hi
+- ko
+- zh
+library_name: mlx
+base_model: arcee-ai/Trinity-Large-Preview
+pipeline_tag: text-generation
+tags:
+- mlx
+---
+
+The model [finding1/Trinity-Large-Preview-MLX-6.5bpw](https://huggingface.co/finding1/Trinity-Large-Preview-MLX-6.5bpw) was
+converted to MLX format from [arcee-ai/Trinity-Large-Preview](https://huggingface.co/arcee-ai/Trinity-Large-Preview)
+using mlx-lm version **0.30.5** with the following command: `mlx_lm.convert --hf-path arcee-ai/Trinity-Large-Preview --mlx-path Trinity-Large-Preview-MLX-6.5bpw --quantize --q-bits 6`.
diff --git a/chat_template.jinja b/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..05f0cd0bf1db4af2bf06100e0c636120d1b7b480
--- /dev/null
+++ b/chat_template.jinja
@@ -0,0 +1,65 @@
+{#-
+    Chat template using <|im_start|>role ... <|im_end|> turns.
+
+    Inputs read by the template: bos_token, messages, tools (optional),
+    add_generation_prompt (optional).
+
+    * With tools: one system turn is emitted first. It contains the leading
+      system message (if any), then every tool serialised with tojson between
+      <tools> and </tools>, then the instruction to reply with a JSON object
+      wrapped in <tool_call></tool_call>.
+    * Without tools: a leading system message is emitted as its own turn.
+    * Message content that is not a string is rendered as an empty string.
+    * Assistant turns are wrapped in generation / endgeneration block tags
+      (NOTE(review): presumably for assistant-token masking; the renderer must
+      support that tag). Each tool call is written as
+      <tool_call>{"name": ..., "arguments": ...}</tool_call>.
+    * Consecutive tool messages are merged into a single user turn, each one
+      wrapped in <tool_response></tool_response>.
+
+    This comment strips whitespace on both sides, so it renders to nothing.
+-#}
+{{ bos_token }}{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role + '\n' }}
+        {% generation %}
+        {{- content}}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>' }}
+        {% endgeneration%}
+        {{- '\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1420e404c2c81a1b3242bf269a9972d8f011bf0
--- /dev/null
+++ b/config.json
@@ -0,0 +1,551 @@
+{
+ "architectures": [
+ "AfmoeForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "auto_map": {
+ "AutoConfig": "configuration_afmoe.AfmoeConfig",
+ "AutoModel": "modeling_afmoe.AfmoeModel",
+ "AutoModelForCausalLM": "modeling_afmoe.AfmoeForCausalLM"
+ },
+ "dtype": "bfloat16",
+ "eos_token_id": 3,
+ "global_attn_every_n_layers": 4,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 3072,
+ "initializer_range": 0.02,
+ "intermediate_size": 12288,
+ "layer_types": [
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention"
+ ],
+ "load_balance_coeff": 5e-05,
+ "max_position_embeddings": 262144,
+ "model_type": "afmoe",
+ "moe_intermediate_size": 3072,
+ "mup_enabled": true,
+ "n_group": 1,
+ "num_attention_heads": 48,
+ "num_dense_layers": 6,
+ "num_expert_groups": 1,
+ "num_experts": 256,
+ "num_experts_per_tok": 4,
+ "num_hidden_layers": 60,
+ "num_key_value_heads": 8,
+ "num_limited_groups": 1,
+ "num_shared_experts": 1,
+ "quantization": {
+ "group_size": 64,
+ "bits": 6,
+ "mode": "affine",
+ "model.layers.6.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.7.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.8.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.9.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.10.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.11.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.12.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.13.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.14.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.15.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.16.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.17.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.18.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.19.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.20.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.21.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.22.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.23.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.24.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.25.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.26.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.27.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.28.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.29.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.30.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.31.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.32.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.33.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.34.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.35.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.36.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.37.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.38.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.39.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.40.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.41.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.42.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.43.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.44.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.45.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.46.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.47.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.48.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.49.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.50.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.51.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.52.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.53.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.54.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.55.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.56.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.57.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.58.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.59.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ }
+ },
+ "quantization_config": {
+ "group_size": 64,
+ "bits": 6,
+ "mode": "affine",
+ "model.layers.6.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.7.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.8.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.9.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.10.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.11.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.12.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.13.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.14.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.15.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.16.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.17.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.18.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.19.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.20.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.21.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.22.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.23.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.24.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.25.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.26.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.27.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.28.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.29.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.30.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.31.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.32.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.33.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.34.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.35.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.36.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.37.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.38.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.39.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.40.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.41.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.42.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.43.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.44.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.45.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.46.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.47.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.48.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.49.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.50.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.51.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.52.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.53.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.54.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.55.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.56.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.57.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.58.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.59.mlp.router.gate": {
+ "group_size": 64,
+ "bits": 8
+ }
+ },
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000,
+ "route_norm": true,
+ "route_scale": 2.448,
+ "score_func": "sigmoid",
+ "sliding_window": 4096,
+ "tie_word_embeddings": false,
+ "topk_group": 1,
+ "transformers_version": "4.57.1",
+ "use_cache": true,
+ "use_grouped_mm": true,
+ "vocab_size": 200192
+}
\ No newline at end of file
diff --git a/configuration_afmoe.py b/configuration_afmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..9efecdd517e8e6168f46ebecb3d282bdea34c5dc
--- /dev/null
+++ b/configuration_afmoe.py
@@ -0,0 +1,133 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.configuration_utils import layer_type_validation
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+class AfmoeConfig(PretrainedConfig):
+ """
+ n_group (`int`, *optional*, defaults to 1):
+ Number of groups for routed experts.
+ topk_group (`int`, *optional*, defaults to 1):
+ Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
+ """
+ model_type = "afmoe"
+ base_model_pp_plan = {
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+ "norm": (["hidden_states"], ["hidden_states"]),
+ }
+
+ def __init__(
+ self,
+ num_hidden_layers: int = 32,
+ vocab_size: int = 200192,
+ hidden_size: int = 2048,
+ intermediate_size: int = 6144,
+ moe_intermediate_size=1408,
+ num_dense_layers=1,
+ num_attention_heads=16,
+ num_key_value_heads=None,
+ head_dim=128,
+ hidden_act="silu",
+ max_position_embeddings=16384,
+ initializer_range=0.02,
+ rms_norm_eps=1e-5,
+ use_cache=True,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ num_experts=64,
+ num_experts_per_tok=6,
+ num_shared_experts=2,
+ num_expert_groups=1,
+ num_limited_groups=1,
+ score_func="sigmoid",
+ route_norm=True,
+ route_scale=1.0,
+ global_attn_every_n_layers=4,
+ sliding_window=1024,
+ mup_enabled=False,
+ layer_types=None,
+ attention_dropout: float = 0.0,
+ n_group: int = 1,
+ topk_group: int = 1,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_dense_layers = num_dense_layers
+ self.num_attention_heads = num_attention_heads
+ self.head_dim = head_dim
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+
+
+ # MoE specific
+ self.moe_intermediate_size = moe_intermediate_size
+ self.num_experts_per_tok = num_experts_per_tok
+ self.n_group = n_group
+ self.topk_group = topk_group
+ self.num_experts = num_experts
+ self.num_shared_experts = num_shared_experts
+ self.num_expert_groups = num_expert_groups
+ self.num_limited_groups = num_limited_groups
+ self.score_func = score_func
+ self.route_norm = route_norm
+ self.route_scale = route_scale
+
+
+ # Attention specific
+ self.attention_dropout = attention_dropout
+ self.global_attn_every_n_layers = global_attn_every_n_layers
+ self.sliding_window = sliding_window
+ self.layer_types = layer_types
+ if self.layer_types is None:
+ self.layer_types = [
+ "sliding_attention" if bool((i + 1) % global_attn_every_n_layers) else "full_attention" for i in range(self.num_hidden_layers)
+ ]
+ layer_type_validation(self.layer_types)
+
+ # muP specific
+ self.mup_enabled = mup_enabled
+
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+
+
+ # Validate rope configs
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+ rope_config_validation(self)
+
+ super().__init__(
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+
+__all__ = ["AfmoeConfig"]
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..71ac125ab114655c051bfbb2ed3d3fddd27887ba
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,9 @@
+{
+ "_from_model_config": true,
+ "bos_token_id": 0,
+ "eos_token_id": 3,
+ "pad_token_id": 12,
+ "transformers_version": "4.57.3",
+ "temperature": 0.8,
+ "top_p": 0.8
+}
\ No newline at end of file
diff --git a/model-00001-of-00081.safetensors b/model-00001-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..27f7163a0cee93565b877d24d89dd36ae616e1ac
--- /dev/null
+++ b/model-00001-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:103dab9e86215b42ae3eda748b1fd96d0a98dc3cffa345ba350125b57feb1e79
+size 5336460769
diff --git a/model-00002-of-00081.safetensors b/model-00002-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2f1cb8436b499a8b840869f019dd04436c428716
--- /dev/null
+++ b/model-00002-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89909275dea0e2225bb743a3e486e499de25915c52bed7092fbc60246e8335aa
+size 4000855628
diff --git a/model-00003-of-00081.safetensors b/model-00003-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6a24962534f1059f2bf7f3b2945db055df97b582
--- /dev/null
+++ b/model-00003-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cae076e6d476865a14d6e3155cb6943dde88792b98f81425591f0924d6729bc
+size 4000855568
diff --git a/model-00004-of-00081.safetensors b/model-00004-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..73a9fe05e9162f1d0f4705c4c73f7931bc8ae696
--- /dev/null
+++ b/model-00004-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1c3c89fe95e0074afc806002c890751af49c49c996be9fd66c8170910324088
+size 3925869308
diff --git a/model-00005-of-00081.safetensors b/model-00005-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c7b30e3d4f72e58d5d7c2134fd9c9d279a804eb9
--- /dev/null
+++ b/model-00005-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b6092cc0e95ec25d4556b543cdeb06c771573e75c78aa17929d3828187d31db
+size 4000855654
diff --git a/model-00006-of-00081.safetensors b/model-00006-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..34519707cd64532ce14d6959c441c991b6b205be
--- /dev/null
+++ b/model-00006-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:308a7915dff2123cc4f4170ea2b8fce1262ba27e613b72cabb8297f44370b01f
+size 4000855601
diff --git a/model-00007-of-00081.safetensors b/model-00007-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..641c0f1747f173efba619c20bba5aa8aa291b7a9
--- /dev/null
+++ b/model-00007-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7267ea012b3958ace58feeca443e265bb14c6eb2944a0233aea928c343bed5d5
+size 3925869314
diff --git a/model-00008-of-00081.safetensors b/model-00008-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..99adda54971324281a8b46cb7a085c3ef748259b
--- /dev/null
+++ b/model-00008-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1b1dee7170942f410c82d22b8525e216012cfc4975e280a070640f6151270b2
+size 4000855750
diff --git a/model-00009-of-00081.safetensors b/model-00009-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b597e769130eb319e36d7050d3825d247f137d0e
--- /dev/null
+++ b/model-00009-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96595e88cad42464f5163409c9bf8b7817b6dc67c75121acaeeb32b092fa5909
+size 4000855632
diff --git a/model-00010-of-00081.safetensors b/model-00010-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..52dc4f60692803948adccf0e6ef90c9a1879053b
--- /dev/null
+++ b/model-00010-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df4d3b4a09b7b916aa7bd23bd5f70c8cd76b1180a3a1ab52b19fa4fa37320c41
+size 3925869316
diff --git a/model-00011-of-00081.safetensors b/model-00011-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdedbe83c47e1a251f74fd5c8762d2a2083d97
--- /dev/null
+++ b/model-00011-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2883c0b74971bd9a6f4ee49ee4ddcab5201946d62d5458df3faba0e7159a7349
+size 4000855750
diff --git a/model-00012-of-00081.safetensors b/model-00012-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a7bb31f3323e55896851a08deedde7671900906f
--- /dev/null
+++ b/model-00012-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d866d0f25db1876b73664eb3a835adc62aa1ce64a31b8ffcfe9bdd2c6142267b
+size 4000855632
diff --git a/model-00013-of-00081.safetensors b/model-00013-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3748d4fac321eae009cdf189e7d5a6457969ebfc
--- /dev/null
+++ b/model-00013-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1ae62a1ea422a554088730deb3a85da5dc458324c80d0cc43ab16bc5d296115
+size 3925869316
diff --git a/model-00014-of-00081.safetensors b/model-00014-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8d23078321b736e6a9f7b68d5d54d7c594fd0aa9
--- /dev/null
+++ b/model-00014-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:922ea296aca3a36bb635d3ac082d93823a647fe548398a4261d7b3ca973c3937
+size 4000855754
diff --git a/model-00015-of-00081.safetensors b/model-00015-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..da970c2794c3852adabc4e554476246cd7d9fb06
--- /dev/null
+++ b/model-00015-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a374ea9121f04d3934be84cca4b3fb31e5a264e79e21d3b9a33654d704d3b5d3
+size 4000855638
diff --git a/model-00016-of-00081.safetensors b/model-00016-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c8599fefb30004d1bba403b2a4a0f15572757692
--- /dev/null
+++ b/model-00016-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d168b770f457e5a25b08089479b761afda75527940abb79b7f47b32b6c9fd0e3
+size 3925869320
diff --git a/model-00017-of-00081.safetensors b/model-00017-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e0cf02bc8966f76c67b389878fb328c4c0763c27
--- /dev/null
+++ b/model-00017-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb051b16d87ac8130925c4611341a569005cb6d78b617c27a8246646e3a0d8aa
+size 4000855704
diff --git a/model-00018-of-00081.safetensors b/model-00018-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6114d8722bcff166694749194535df34dc1fbfb4
--- /dev/null
+++ b/model-00018-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a243b9b7883e21922e0b877fbf67b04ab69a5025fe6af6582ecbacd728b9a6ec
+size 4000855626
diff --git a/model-00019-of-00081.safetensors b/model-00019-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f1a6aa8ab4845ef12fc85bd277ebc515cfff7468
--- /dev/null
+++ b/model-00019-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2eed8ecea29e94e1c6e8b305244e270401c833cdba33b855c30077acf23d7f4f
+size 3925869316
diff --git a/model-00020-of-00081.safetensors b/model-00020-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a5b623f3790275b2ced10653e75161cd99bea18c
--- /dev/null
+++ b/model-00020-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc28249a3ee089a56c7090cc63b6a9be67d3f7a9fed856f9fb06685573edca09
+size 4000855674
diff --git a/model-00021-of-00081.safetensors b/model-00021-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cca5b624cc805c5b341abdaef08c3678118b4409
--- /dev/null
+++ b/model-00021-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2dd47e3e641bf03ac56824258421b0ebde9af725ac5624456969fa9cec682c45
+size 4000855654
diff --git a/model-00022-of-00081.safetensors b/model-00022-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2b68dbe77822ea0e15aa7b55b593d35a179da0d9
--- /dev/null
+++ b/model-00022-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c2012d9e80079f0edd68bda18339cc98dea7300e4eda967a7b20551e2ebec0d
+size 3925869310
diff --git a/model-00023-of-00081.safetensors b/model-00023-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..12119a67daf688b1fc8d64f8823f84412598ee32
--- /dev/null
+++ b/model-00023-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32bb67260e2d7f666f72d4b5008469d9c289533467a79da336910d9c55705dbf
+size 4000855720
diff --git a/model-00024-of-00081.safetensors b/model-00024-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..af790646c2de4cebfd4a9c94b17921560f5fc5fc
--- /dev/null
+++ b/model-00024-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec38ba3338a0c1ed06db02e07d68311439345c96693ba5ceefd1e9d840ec337d
+size 4000855666
diff --git a/model-00025-of-00081.safetensors b/model-00025-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..355995f57e542556f267cbee850e85218e7cecb2
--- /dev/null
+++ b/model-00025-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0698c5def766ff1eebea0fa6480793341ac9144552a12d1551f881f1e3acc50c
+size 3925869314
diff --git a/model-00026-of-00081.safetensors b/model-00026-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2b1d2d1a7a28b1e34b3415c4ac4efc8874459586
--- /dev/null
+++ b/model-00026-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff44427449a1c3ceed916f1d69fc6950f170f41999c479a81e0a837f2e0779bf
+size 4000855750
diff --git a/model-00027-of-00081.safetensors b/model-00027-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..eafbf4df5de3623078585e31436b9005fdf95171
--- /dev/null
+++ b/model-00027-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8717a50f4abf1cfc5bc1797b69114c1da4a63d7e993f708060564eefe4c57d9
+size 4000855630
diff --git a/model-00028-of-00081.safetensors b/model-00028-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e204956029c4cce22f18c6757487f9accc2d4d5b
--- /dev/null
+++ b/model-00028-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baf5d7a51f294a8bbd44d34cd43cb05b39ac0bdbc7d08dceb790f27fd562c43a
+size 3925869314
diff --git a/model-00029-of-00081.safetensors b/model-00029-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f2d62705aee881dfab838a6e406e77678a39b55f
--- /dev/null
+++ b/model-00029-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c42d21d3bca679e9c468ff861a6b08488cfc13b8b9e324cf3902d92b5648455
+size 4000855750
diff --git a/model-00030-of-00081.safetensors b/model-00030-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1ecc329f3927eaa548c85912a22b0d93af73f55a
--- /dev/null
+++ b/model-00030-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39e1992bb8eea49fef2d20e62aff87977422d55757f8958b9dc88464a91c7d56
+size 4000855658
diff --git a/model-00031-of-00081.safetensors b/model-00031-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ab5f1e6b85685d94b97c490c6cabeedc94ab0309
--- /dev/null
+++ b/model-00031-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ddc27ed212f49304534a307f51940b52363bff6d8d283270e0294a561322f87a
+size 3925869316
diff --git a/model-00032-of-00081.safetensors b/model-00032-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3a42362165d3bc8b1165bea73d11ae6130c5f257
--- /dev/null
+++ b/model-00032-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44b6340648bb012ef56533a6044bcc31924b55e1cb9f30b6798c39971477c06b
+size 4000855754
diff --git a/model-00033-of-00081.safetensors b/model-00033-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..361cb5e50edc65d2a8b02ba437094eef83b71cf3
--- /dev/null
+++ b/model-00033-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5702cc3ddb915abb6297dda1eb0282a6517c857d8666a5fbc5bdddf40869bb96
+size 4000855630
diff --git a/model-00034-of-00081.safetensors b/model-00034-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..01ca8e78175aeabdffd915dfaec880363f1eade3
--- /dev/null
+++ b/model-00034-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b09726c4ffc13bcf0aba33ab47b17ab1a6aeecb9a6f8389e896a6c8b7fb9eedd
+size 3925869310
diff --git a/model-00035-of-00081.safetensors b/model-00035-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d80eeeb81780272dc01c98a6a28c841a4d96b0f9
--- /dev/null
+++ b/model-00035-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f85e97f3c813701c007c037a0cbd5407f44e7db18645910574f1c0518ec7817d
+size 4000855748
diff --git a/model-00036-of-00081.safetensors b/model-00036-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c4b26f6a2f29007e9f8c4957ab41dbe56e5768dd
--- /dev/null
+++ b/model-00036-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3dd075ae060d11a8b6334411428c1a0e29381fc61f9ccdf5704ce58318247ab1
+size 4000855686
diff --git a/model-00037-of-00081.safetensors b/model-00037-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..046f685068cbc7a582acc70401115c61ea18ecaa
--- /dev/null
+++ b/model-00037-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:773ff86bc69f7a40e338d1315848ffb1684894247e8318dc6f353c580f65d616
+size 3925869320
diff --git a/model-00038-of-00081.safetensors b/model-00038-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..00416a19cb52a495096e23eaaab23e6ca78a557a
--- /dev/null
+++ b/model-00038-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e981bb7a788311b9c062a18ea899a78ed0157bae0a30b425cc4d9b7820dbd45
+size 4000855748
diff --git a/model-00039-of-00081.safetensors b/model-00039-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7aa67d7dfa21bbb8e622a2a5d35e516013419f9b
--- /dev/null
+++ b/model-00039-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10ece3e61af73c514b33fe38565968a771e61df6700f70f9a5acfde8bed6cfb4
+size 4000855672
diff --git a/model-00040-of-00081.safetensors b/model-00040-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fa84d827d65eedc15e074bcaa1154cb373fbe2c4
--- /dev/null
+++ b/model-00040-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b932a14f1b52d0591ccb185b7e200993f3f8f78fc6c04be3eed8705122be39f2
+size 3925869316
diff --git a/model-00041-of-00081.safetensors b/model-00041-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6799d236466e6d1a7d4c153981280b83a4dffc4c
--- /dev/null
+++ b/model-00041-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1391db3ce71d84dd5d2dc4b90431c629d954102a6d714314039978c76e98928
+size 4000855722
diff --git a/model-00042-of-00081.safetensors b/model-00042-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7d0fce2857c19afd9e191d7c17e8dbcbbc7326ac
--- /dev/null
+++ b/model-00042-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:061400a0d0088ff0b7f64f4752380f88013564459d33cc5c9ed6fac93da211d8
+size 4000855634
diff --git a/model-00043-of-00081.safetensors b/model-00043-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4021617eefcf2906bbe31aff2ee6c03cabb9f195
--- /dev/null
+++ b/model-00043-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c068d5bc32dd9707c2b83588e09fc1b542a25d55ce6f6f475b3fcb85522bb27
+size 3925869314
diff --git a/model-00044-of-00081.safetensors b/model-00044-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4df2d76a8dc23cca98976db2fd55830a490671a9
--- /dev/null
+++ b/model-00044-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b21b97fd85c25a51c4cb457acb0bb7bbd8308b80e9685bfc3ead2ca26aab343
+size 4000855740
diff --git a/model-00045-of-00081.safetensors b/model-00045-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5a3d7ebb633a566dbfe8c71c7f6ec3102a7aae44
--- /dev/null
+++ b/model-00045-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2b2e516bf037d1cd2ce2585478e98c118aeeb4084f35d311346ad52d69a207d
+size 4000855628
diff --git a/model-00046-of-00081.safetensors b/model-00046-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..aa4ed76a328caf17e29ead5b2c287bb9017c7d76
--- /dev/null
+++ b/model-00046-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f506a67f8be062c09deb8f58aa6b8ea9c79a321d141a4813c32d20e0700748e
+size 3925869312
diff --git a/model-00047-of-00081.safetensors b/model-00047-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f63a7fac9007f95b5764bb39e9a4db9c0145934b
--- /dev/null
+++ b/model-00047-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22b3ca9957c40a71c63d2f44bebae7ad2454b11c1c53de1263123e9d3e09a65b
+size 4000855732
diff --git a/model-00048-of-00081.safetensors b/model-00048-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a059889ecb0b954412573eccb6d2871350bc71c1
--- /dev/null
+++ b/model-00048-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55261f6a9a38c356278ceca9538b8303c3a11fa1b4bf10c03d58be8a2589c575
+size 4000855658
diff --git a/model-00049-of-00081.safetensors b/model-00049-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3661b177f9f9cbe49b42a7575bb8ea720fbbcd1c
--- /dev/null
+++ b/model-00049-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:520c1e184d46614e5ef97f6f27c876772ff42ded6840f114c74fd40f9299df5e
+size 3925869316
diff --git a/model-00050-of-00081.safetensors b/model-00050-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e7df2ec70daf46517323961457aa66bc7dd346ff
--- /dev/null
+++ b/model-00050-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19232ab3e7b1148b720002bfcef83cb6bc4b6b73893a445e9d76137d48707147
+size 4000855754
diff --git a/model-00051-of-00081.safetensors b/model-00051-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..44ba6aca72696a15762272477786754de2d3b56d
--- /dev/null
+++ b/model-00051-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e2c0c7bd076755a70d1e417eee6289db394469ae74e9a85d17bbefc65722ccc
+size 4000855644
diff --git a/model-00052-of-00081.safetensors b/model-00052-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b3427279bc7e93dd97f569767c7e0fa991a7b155
--- /dev/null
+++ b/model-00052-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa216acef63723c56b6b4750d8d32bdbaacdd1bbcc65d89b617b689ab8b61019
+size 3925869310
diff --git a/model-00053-of-00081.safetensors b/model-00053-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d091efcd183936b383c9df9fd247d37798e3f0fa
--- /dev/null
+++ b/model-00053-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63375d6c145162e0021a243ace2775668e19f96033f348a07f1385ebd9fef83c
+size 4000855710
diff --git a/model-00054-of-00081.safetensors b/model-00054-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f9792f7551c5e780c82b9adb617a52bbcc2728ec
--- /dev/null
+++ b/model-00054-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:610998ce00f62d726a04e2112018631656e91598e9374c0648cb8eff6e4fa73e
+size 4000855646
diff --git a/model-00055-of-00081.safetensors b/model-00055-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cfba9aaa385da30b42433b3b6005bf71100132be
--- /dev/null
+++ b/model-00055-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d34757ad9a8c3a7df8588b19965e232e14eedf9c2731f36c364139012bc3489
+size 3925869314
diff --git a/model-00056-of-00081.safetensors b/model-00056-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..19ed68357a2f63690f07c99b2632ff27d17d4878
--- /dev/null
+++ b/model-00056-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b89cf6cbd9a89302f6148c395906c4ee2575d0d2af94bdb667905158b51a9167
+size 4000855748
diff --git a/model-00057-of-00081.safetensors b/model-00057-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b654b3c6a6c0a9bdf08118deec9bf0c73aa15a66
--- /dev/null
+++ b/model-00057-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86480ebffa843f8a0315c35e234ecbba80f811f3f2447bde6bff6fe6c117b15d
+size 4000855676
diff --git a/model-00058-of-00081.safetensors b/model-00058-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2b0ea448e1f5ac6ab0c8d898fd88adc2912ba47e
--- /dev/null
+++ b/model-00058-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c847d30ae24b4007c7a8a52caa1fbd0c2361edf5834b6c3feae5e7fabf5355f
+size 3925869310
diff --git a/model-00059-of-00081.safetensors b/model-00059-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2d6ba98cdd2f38e32bc05af5ed98009e825fa959
--- /dev/null
+++ b/model-00059-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bc336983d9681fd5475c91c79d5c162e3d7bfbe6fac67c9f43e6dc780c7100c
+size 4000855748
diff --git a/model-00060-of-00081.safetensors b/model-00060-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e5e32852f8083c1860bce2322f7db4b1f323e047
--- /dev/null
+++ b/model-00060-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8de8371ddb350acd37ff8e050df1913e5a1306e5b924c2c36d4545d7af049abe
+size 4000855618
diff --git a/model-00061-of-00081.safetensors b/model-00061-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a3d3d3eb77febd72465ff85a16ca15227beaf2bc
--- /dev/null
+++ b/model-00061-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c35015d817a6289ecbf51c4b121167b4c9929b7dff938c2ad4c8fe4bc1fa1f85
+size 3925869316
diff --git a/model-00062-of-00081.safetensors b/model-00062-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..83424535e1605ccc514b4fe0c1cd34eb76621b8a
--- /dev/null
+++ b/model-00062-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3261ed0687b1816f01c42eb86d3035a5fabbf7b44522b2d8365a861e5c29adf2
+size 4000855682
diff --git a/model-00063-of-00081.safetensors b/model-00063-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4c68046b405cce1bb5ec70c8a7a9d5634ef80c65
--- /dev/null
+++ b/model-00063-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:305d93b4b1250a89e8d634db175720fadefdb7a3685ec87d3ce113f220f363d3
+size 4000855690
diff --git a/model-00064-of-00081.safetensors b/model-00064-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f7b35e6e3ba2826da726d28cedbbd987d2eab4b1
--- /dev/null
+++ b/model-00064-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2729f59d04ce1f7e15dab43cc3779c4f46eb22925d19ab388d7cc0c0dc352c7
+size 3925869316
diff --git a/model-00065-of-00081.safetensors b/model-00065-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9dfc2e80216278943dad30107fcf5b481fc7f387
--- /dev/null
+++ b/model-00065-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20894251493bf37419cded1faaebe8232d5492311923ce8d6f5794ecde9ca391
+size 4000855730
diff --git a/model-00066-of-00081.safetensors b/model-00066-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..97278112036b2a0ff996bcec979886edf3f48495
--- /dev/null
+++ b/model-00066-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b94067ba005871eca929756f139ed85f09440156c66537c9202bddb0adfc2e73
+size 4000855606
diff --git a/model-00067-of-00081.safetensors b/model-00067-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7602f6bf070cb034376bd64a6aefffa3c687f3a8
--- /dev/null
+++ b/model-00067-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c8e521c1bdee876256352f33114b0b3bdc59f65d16be5e6352e33b6c0f5ac2a
+size 3925869312
diff --git a/model-00068-of-00081.safetensors b/model-00068-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..392e8b3723209f9212d65949c66b9281a1c041dc
--- /dev/null
+++ b/model-00068-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f04dd070e90d54a67081e1162460c581569c3a2a6ed2f4ae1e98af47c65df1f
+size 4000855750
diff --git a/model-00069-of-00081.safetensors b/model-00069-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1b0f84faa5e8a07d9429512171744481f03b62c6
--- /dev/null
+++ b/model-00069-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e041e688ad881f65e344f9a8570c8dae8cfc129614b61bb756d0e4d47f87c7d5
+size 4000855706
diff --git a/model-00070-of-00081.safetensors b/model-00070-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..862a218fc12e295bc3662fe159124d3a4715a8e2
--- /dev/null
+++ b/model-00070-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dad13f3a8de2afea357ce0c83e107a644e9b793ff5972f6aa2cdfa9fd3c46975
+size 3925869312
diff --git a/model-00071-of-00081.safetensors b/model-00071-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..44f196bda84a1ff766000796fbd2fc5ea988cf1e
--- /dev/null
+++ b/model-00071-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f8bc57bd369c887c2a3e4f0e83746486b4c777665ea5eab5e63f8f0e674aca8
+size 4000855750
diff --git a/model-00072-of-00081.safetensors b/model-00072-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..745176aba8478e6fb12a18a30679a3ee99ce2fd5
--- /dev/null
+++ b/model-00072-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4dffda4061e69dd7518f9a51e59a005c1dbae7c8e658b4e3629369d86b91218c
+size 4000855632
diff --git a/model-00073-of-00081.safetensors b/model-00073-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3c5df70809ac3cae8c32f305d0d67b50919f201f
--- /dev/null
+++ b/model-00073-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92d80d8c235d28662b538508bd747ccd4d5dc5835211e821e3898f1e1365e8c4
+size 3925869316
diff --git a/model-00074-of-00081.safetensors b/model-00074-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..30721ed18c2d02d2487409363b9f2ac6c51874de
--- /dev/null
+++ b/model-00074-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:308fccb59fabec5344e506aef732033e9c64e04e15a93f08a8b4d2de9a3bbe2a
+size 4000855682
diff --git a/model-00075-of-00081.safetensors b/model-00075-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3e13921593794ffefad7ab68c08373b55d4acea4
--- /dev/null
+++ b/model-00075-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9243ee43eea3e06ea7750c56730956ee31686e566b14c4563c855aecd124ed8e
+size 4000855628
diff --git a/model-00076-of-00081.safetensors b/model-00076-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6b519416651ae58f1a201a4db5fd7d723e7e5ec2
--- /dev/null
+++ b/model-00076-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90a2e7d697a7a04acb04279623929d0723a8ea1cdaa5a9765aa1619bc635e693
+size 3925869316
diff --git a/model-00077-of-00081.safetensors b/model-00077-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..bb4bbfbaf772b9daf2f1d1fb97a529f13f17caa7
--- /dev/null
+++ b/model-00077-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1a6b55cb91f25c67688c3b6cff2c8ddc807b17f90dfde1c22c382b279a3e573
+size 4000855754
diff --git a/model-00078-of-00081.safetensors b/model-00078-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..24ba375e1ac71a715f178263b1e1568a7c899719
--- /dev/null
+++ b/model-00078-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c32ec00f2e49bb9c61d4d66b690bed6c0bbe82401a7e35c89145f32e9eddc136
+size 4000855624
diff --git a/model-00079-of-00081.safetensors b/model-00079-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7c400c680b7d2ef9c228c8d45497f631a16f9252
--- /dev/null
+++ b/model-00079-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99d949f6ae9f750cadaf8daf67100b8e2a08e0cdd4af43efe7231856a7cd763c
+size 3925869316
diff --git a/model-00080-of-00081.safetensors b/model-00080-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f46c37511813a377bb2a3f5b890f47326f8c27cd
--- /dev/null
+++ b/model-00080-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3abc5f9edc7a2966eb7e0229b4bfca12ae7d4c399965bbb69eebc8c9b6fe858e
+size 4000855706
diff --git a/model-00081-of-00081.safetensors b/model-00081-of-00081.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c0ce55c22cd73521495faded34867cad7715f104
--- /dev/null
+++ b/model-00081-of-00081.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec0ec76e5ad8794acafcd77d8fde314dcacd50017aa90a80159e1b1ed293f1df
+size 4448584346
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..7497af3bab36cac86e7f79de80f634fe81b6f433
--- /dev/null
+++ b/model.safetensors.index.json
@@ -0,0 +1,2517 @@
+{
+ "metadata": {
+ "total_size": 323902700544,
+ "total_parameters": 398635272192
+ },
+ "weight_map": {
+ "lm_head.biases": "model-00081-of-00081.safetensors",
+ "lm_head.scales": "model-00081-of-00081.safetensors",
+ "lm_head.weight": "model-00081-of-00081.safetensors",
+ "model.embed_tokens.biases": "model-00001-of-00081.safetensors",
+ "model.embed_tokens.scales": "model-00001-of-00081.safetensors",
+ "model.embed_tokens.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.mlp.down_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.0.mlp.down_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.mlp.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.0.mlp.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.mlp.up_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.0.mlp.up_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.post_mlp_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.k_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.k_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.o_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.o_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.q_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.q_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.v_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.v_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.mlp.down_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.1.mlp.down_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.mlp.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.1.mlp.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.mlp.up_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.1.mlp.up_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.post_mlp_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.k_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.k_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.o_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.o_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.q_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.q_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.v_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.v_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00008-of-00081.safetensors",
+ "model.layers.10.mlp.expert_bias": "model-00006-of-00081.safetensors",
+ "model.layers.10.mlp.experts.down_proj.biases": "model-00008-of-00081.safetensors",
+ "model.layers.10.mlp.experts.down_proj.scales": "model-00008-of-00081.safetensors",
+ "model.layers.10.mlp.experts.down_proj.weight": "model-00008-of-00081.safetensors",
+ "model.layers.10.mlp.experts.gate_proj.biases": "model-00007-of-00081.safetensors",
+ "model.layers.10.mlp.experts.gate_proj.scales": "model-00007-of-00081.safetensors",
+ "model.layers.10.mlp.experts.gate_proj.weight": "model-00007-of-00081.safetensors",
+ "model.layers.10.mlp.experts.up_proj.biases": "model-00007-of-00081.safetensors",
+ "model.layers.10.mlp.experts.up_proj.scales": "model-00007-of-00081.safetensors",
+ "model.layers.10.mlp.experts.up_proj.weight": "model-00007-of-00081.safetensors",
+ "model.layers.10.mlp.router.gate.biases": "model-00006-of-00081.safetensors",
+ "model.layers.10.mlp.router.gate.scales": "model-00006-of-00081.safetensors",
+ "model.layers.10.mlp.router.gate.weight": "model-00006-of-00081.safetensors",
+ "model.layers.10.mlp.shared_experts.down_proj.biases": "model-00008-of-00081.safetensors",
+ "model.layers.10.mlp.shared_experts.down_proj.scales": "model-00008-of-00081.safetensors",
+ "model.layers.10.mlp.shared_experts.down_proj.weight": "model-00008-of-00081.safetensors",
+ "model.layers.10.mlp.shared_experts.gate_proj.biases": "model-00008-of-00081.safetensors",
+ "model.layers.10.mlp.shared_experts.gate_proj.scales": "model-00008-of-00081.safetensors",
+ "model.layers.10.mlp.shared_experts.gate_proj.weight": "model-00008-of-00081.safetensors",
+ "model.layers.10.mlp.shared_experts.up_proj.biases": "model-00008-of-00081.safetensors",
+ "model.layers.10.mlp.shared_experts.up_proj.scales": "model-00008-of-00081.safetensors",
+ "model.layers.10.mlp.shared_experts.up_proj.weight": "model-00008-of-00081.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00008-of-00081.safetensors",
+ "model.layers.10.post_mlp_layernorm.weight": "model-00008-of-00081.safetensors",
+ "model.layers.10.pre_mlp_layernorm.weight": "model-00008-of-00081.safetensors",
+ "model.layers.10.self_attn.gate_proj.biases": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.gate_proj.scales": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.gate_proj.weight": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.k_norm.weight": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.k_proj.biases": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.k_proj.scales": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.o_proj.biases": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.o_proj.scales": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.q_norm.weight": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.q_proj.biases": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.q_proj.scales": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.v_proj.biases": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.v_proj.scales": "model-00006-of-00081.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00006-of-00081.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.expert_bias": "model-00008-of-00081.safetensors",
+ "model.layers.11.mlp.experts.down_proj.biases": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.experts.down_proj.scales": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.experts.down_proj.weight": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.experts.gate_proj.biases": "model-00008-of-00081.safetensors",
+ "model.layers.11.mlp.experts.gate_proj.scales": "model-00008-of-00081.safetensors",
+ "model.layers.11.mlp.experts.gate_proj.weight": "model-00008-of-00081.safetensors",
+ "model.layers.11.mlp.experts.up_proj.biases": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.experts.up_proj.scales": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.experts.up_proj.weight": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.router.gate.biases": "model-00008-of-00081.safetensors",
+ "model.layers.11.mlp.router.gate.scales": "model-00008-of-00081.safetensors",
+ "model.layers.11.mlp.router.gate.weight": "model-00008-of-00081.safetensors",
+ "model.layers.11.mlp.shared_experts.down_proj.biases": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.shared_experts.down_proj.scales": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.shared_experts.down_proj.weight": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.shared_experts.gate_proj.biases": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.shared_experts.gate_proj.scales": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.shared_experts.gate_proj.weight": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.shared_experts.up_proj.biases": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.shared_experts.up_proj.scales": "model-00009-of-00081.safetensors",
+ "model.layers.11.mlp.shared_experts.up_proj.weight": "model-00009-of-00081.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00009-of-00081.safetensors",
+ "model.layers.11.post_mlp_layernorm.weight": "model-00009-of-00081.safetensors",
+ "model.layers.11.pre_mlp_layernorm.weight": "model-00009-of-00081.safetensors",
+ "model.layers.11.self_attn.gate_proj.biases": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.gate_proj.scales": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.gate_proj.weight": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.k_norm.weight": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.k_proj.biases": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.k_proj.scales": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.o_proj.biases": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.o_proj.scales": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.q_norm.weight": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.q_proj.biases": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.q_proj.scales": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.v_proj.biases": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.v_proj.scales": "model-00008-of-00081.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00008-of-00081.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00011-of-00081.safetensors",
+ "model.layers.12.mlp.expert_bias": "model-00009-of-00081.safetensors",
+ "model.layers.12.mlp.experts.down_proj.biases": "model-00011-of-00081.safetensors",
+ "model.layers.12.mlp.experts.down_proj.scales": "model-00011-of-00081.safetensors",
+ "model.layers.12.mlp.experts.down_proj.weight": "model-00011-of-00081.safetensors",
+ "model.layers.12.mlp.experts.gate_proj.biases": "model-00010-of-00081.safetensors",
+ "model.layers.12.mlp.experts.gate_proj.scales": "model-00010-of-00081.safetensors",
+ "model.layers.12.mlp.experts.gate_proj.weight": "model-00010-of-00081.safetensors",
+ "model.layers.12.mlp.experts.up_proj.biases": "model-00010-of-00081.safetensors",
+ "model.layers.12.mlp.experts.up_proj.scales": "model-00010-of-00081.safetensors",
+ "model.layers.12.mlp.experts.up_proj.weight": "model-00010-of-00081.safetensors",
+ "model.layers.12.mlp.router.gate.biases": "model-00009-of-00081.safetensors",
+ "model.layers.12.mlp.router.gate.scales": "model-00009-of-00081.safetensors",
+ "model.layers.12.mlp.router.gate.weight": "model-00009-of-00081.safetensors",
+ "model.layers.12.mlp.shared_experts.down_proj.biases": "model-00011-of-00081.safetensors",
+ "model.layers.12.mlp.shared_experts.down_proj.scales": "model-00011-of-00081.safetensors",
+ "model.layers.12.mlp.shared_experts.down_proj.weight": "model-00011-of-00081.safetensors",
+ "model.layers.12.mlp.shared_experts.gate_proj.biases": "model-00011-of-00081.safetensors",
+ "model.layers.12.mlp.shared_experts.gate_proj.scales": "model-00011-of-00081.safetensors",
+ "model.layers.12.mlp.shared_experts.gate_proj.weight": "model-00011-of-00081.safetensors",
+ "model.layers.12.mlp.shared_experts.up_proj.biases": "model-00011-of-00081.safetensors",
+ "model.layers.12.mlp.shared_experts.up_proj.scales": "model-00011-of-00081.safetensors",
+ "model.layers.12.mlp.shared_experts.up_proj.weight": "model-00011-of-00081.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00011-of-00081.safetensors",
+ "model.layers.12.post_mlp_layernorm.weight": "model-00011-of-00081.safetensors",
+ "model.layers.12.pre_mlp_layernorm.weight": "model-00011-of-00081.safetensors",
+ "model.layers.12.self_attn.gate_proj.biases": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.gate_proj.scales": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.gate_proj.weight": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.k_norm.weight": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.k_proj.biases": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.k_proj.scales": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.o_proj.biases": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.o_proj.scales": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.q_norm.weight": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.q_proj.biases": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.q_proj.scales": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.v_proj.biases": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.v_proj.scales": "model-00009-of-00081.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00009-of-00081.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.expert_bias": "model-00011-of-00081.safetensors",
+ "model.layers.13.mlp.experts.down_proj.biases": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.experts.down_proj.scales": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.experts.down_proj.weight": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.experts.gate_proj.biases": "model-00011-of-00081.safetensors",
+ "model.layers.13.mlp.experts.gate_proj.scales": "model-00011-of-00081.safetensors",
+ "model.layers.13.mlp.experts.gate_proj.weight": "model-00011-of-00081.safetensors",
+ "model.layers.13.mlp.experts.up_proj.biases": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.experts.up_proj.scales": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.experts.up_proj.weight": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.router.gate.biases": "model-00011-of-00081.safetensors",
+ "model.layers.13.mlp.router.gate.scales": "model-00011-of-00081.safetensors",
+ "model.layers.13.mlp.router.gate.weight": "model-00011-of-00081.safetensors",
+ "model.layers.13.mlp.shared_experts.down_proj.biases": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.shared_experts.down_proj.scales": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.shared_experts.down_proj.weight": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.shared_experts.gate_proj.biases": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.shared_experts.gate_proj.scales": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.shared_experts.gate_proj.weight": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.shared_experts.up_proj.biases": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.shared_experts.up_proj.scales": "model-00012-of-00081.safetensors",
+ "model.layers.13.mlp.shared_experts.up_proj.weight": "model-00012-of-00081.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00012-of-00081.safetensors",
+ "model.layers.13.post_mlp_layernorm.weight": "model-00012-of-00081.safetensors",
+ "model.layers.13.pre_mlp_layernorm.weight": "model-00012-of-00081.safetensors",
+ "model.layers.13.self_attn.gate_proj.biases": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.gate_proj.scales": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.gate_proj.weight": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.k_norm.weight": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.k_proj.biases": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.k_proj.scales": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.o_proj.biases": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.o_proj.scales": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.q_norm.weight": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.q_proj.biases": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.q_proj.scales": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.v_proj.biases": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.v_proj.scales": "model-00011-of-00081.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00011-of-00081.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00014-of-00081.safetensors",
+ "model.layers.14.mlp.expert_bias": "model-00012-of-00081.safetensors",
+ "model.layers.14.mlp.experts.down_proj.biases": "model-00014-of-00081.safetensors",
+ "model.layers.14.mlp.experts.down_proj.scales": "model-00014-of-00081.safetensors",
+ "model.layers.14.mlp.experts.down_proj.weight": "model-00014-of-00081.safetensors",
+ "model.layers.14.mlp.experts.gate_proj.biases": "model-00013-of-00081.safetensors",
+ "model.layers.14.mlp.experts.gate_proj.scales": "model-00013-of-00081.safetensors",
+ "model.layers.14.mlp.experts.gate_proj.weight": "model-00013-of-00081.safetensors",
+ "model.layers.14.mlp.experts.up_proj.biases": "model-00013-of-00081.safetensors",
+ "model.layers.14.mlp.experts.up_proj.scales": "model-00013-of-00081.safetensors",
+ "model.layers.14.mlp.experts.up_proj.weight": "model-00013-of-00081.safetensors",
+ "model.layers.14.mlp.router.gate.biases": "model-00012-of-00081.safetensors",
+ "model.layers.14.mlp.router.gate.scales": "model-00012-of-00081.safetensors",
+ "model.layers.14.mlp.router.gate.weight": "model-00012-of-00081.safetensors",
+ "model.layers.14.mlp.shared_experts.down_proj.biases": "model-00014-of-00081.safetensors",
+ "model.layers.14.mlp.shared_experts.down_proj.scales": "model-00014-of-00081.safetensors",
+ "model.layers.14.mlp.shared_experts.down_proj.weight": "model-00014-of-00081.safetensors",
+ "model.layers.14.mlp.shared_experts.gate_proj.biases": "model-00014-of-00081.safetensors",
+ "model.layers.14.mlp.shared_experts.gate_proj.scales": "model-00014-of-00081.safetensors",
+ "model.layers.14.mlp.shared_experts.gate_proj.weight": "model-00014-of-00081.safetensors",
+ "model.layers.14.mlp.shared_experts.up_proj.biases": "model-00014-of-00081.safetensors",
+ "model.layers.14.mlp.shared_experts.up_proj.scales": "model-00014-of-00081.safetensors",
+ "model.layers.14.mlp.shared_experts.up_proj.weight": "model-00014-of-00081.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00014-of-00081.safetensors",
+ "model.layers.14.post_mlp_layernorm.weight": "model-00014-of-00081.safetensors",
+ "model.layers.14.pre_mlp_layernorm.weight": "model-00014-of-00081.safetensors",
+ "model.layers.14.self_attn.gate_proj.biases": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.gate_proj.scales": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.gate_proj.weight": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.k_norm.weight": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.k_proj.biases": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.k_proj.scales": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.o_proj.biases": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.o_proj.scales": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.q_norm.weight": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.q_proj.biases": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.q_proj.scales": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.v_proj.biases": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.v_proj.scales": "model-00012-of-00081.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00012-of-00081.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.expert_bias": "model-00014-of-00081.safetensors",
+ "model.layers.15.mlp.experts.down_proj.biases": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.experts.down_proj.scales": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.experts.down_proj.weight": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.experts.gate_proj.biases": "model-00014-of-00081.safetensors",
+ "model.layers.15.mlp.experts.gate_proj.scales": "model-00014-of-00081.safetensors",
+ "model.layers.15.mlp.experts.gate_proj.weight": "model-00014-of-00081.safetensors",
+ "model.layers.15.mlp.experts.up_proj.biases": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.experts.up_proj.scales": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.experts.up_proj.weight": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.router.gate.biases": "model-00014-of-00081.safetensors",
+ "model.layers.15.mlp.router.gate.scales": "model-00014-of-00081.safetensors",
+ "model.layers.15.mlp.router.gate.weight": "model-00014-of-00081.safetensors",
+ "model.layers.15.mlp.shared_experts.down_proj.biases": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.shared_experts.down_proj.scales": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.shared_experts.down_proj.weight": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.shared_experts.gate_proj.biases": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.shared_experts.gate_proj.scales": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.shared_experts.gate_proj.weight": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.shared_experts.up_proj.biases": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.shared_experts.up_proj.scales": "model-00015-of-00081.safetensors",
+ "model.layers.15.mlp.shared_experts.up_proj.weight": "model-00015-of-00081.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00015-of-00081.safetensors",
+ "model.layers.15.post_mlp_layernorm.weight": "model-00015-of-00081.safetensors",
+ "model.layers.15.pre_mlp_layernorm.weight": "model-00015-of-00081.safetensors",
+ "model.layers.15.self_attn.gate_proj.biases": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.gate_proj.scales": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.gate_proj.weight": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.k_norm.weight": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.k_proj.biases": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.k_proj.scales": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.o_proj.biases": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.o_proj.scales": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.q_norm.weight": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.q_proj.biases": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.q_proj.scales": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.v_proj.biases": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.v_proj.scales": "model-00014-of-00081.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00014-of-00081.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00017-of-00081.safetensors",
+ "model.layers.16.mlp.expert_bias": "model-00015-of-00081.safetensors",
+ "model.layers.16.mlp.experts.down_proj.biases": "model-00017-of-00081.safetensors",
+ "model.layers.16.mlp.experts.down_proj.scales": "model-00017-of-00081.safetensors",
+ "model.layers.16.mlp.experts.down_proj.weight": "model-00017-of-00081.safetensors",
+ "model.layers.16.mlp.experts.gate_proj.biases": "model-00016-of-00081.safetensors",
+ "model.layers.16.mlp.experts.gate_proj.scales": "model-00016-of-00081.safetensors",
+ "model.layers.16.mlp.experts.gate_proj.weight": "model-00016-of-00081.safetensors",
+ "model.layers.16.mlp.experts.up_proj.biases": "model-00016-of-00081.safetensors",
+ "model.layers.16.mlp.experts.up_proj.scales": "model-00016-of-00081.safetensors",
+ "model.layers.16.mlp.experts.up_proj.weight": "model-00016-of-00081.safetensors",
+ "model.layers.16.mlp.router.gate.biases": "model-00015-of-00081.safetensors",
+ "model.layers.16.mlp.router.gate.scales": "model-00015-of-00081.safetensors",
+ "model.layers.16.mlp.router.gate.weight": "model-00015-of-00081.safetensors",
+ "model.layers.16.mlp.shared_experts.down_proj.biases": "model-00017-of-00081.safetensors",
+ "model.layers.16.mlp.shared_experts.down_proj.scales": "model-00017-of-00081.safetensors",
+ "model.layers.16.mlp.shared_experts.down_proj.weight": "model-00017-of-00081.safetensors",
+ "model.layers.16.mlp.shared_experts.gate_proj.biases": "model-00017-of-00081.safetensors",
+ "model.layers.16.mlp.shared_experts.gate_proj.scales": "model-00017-of-00081.safetensors",
+ "model.layers.16.mlp.shared_experts.gate_proj.weight": "model-00017-of-00081.safetensors",
+ "model.layers.16.mlp.shared_experts.up_proj.biases": "model-00017-of-00081.safetensors",
+ "model.layers.16.mlp.shared_experts.up_proj.scales": "model-00017-of-00081.safetensors",
+ "model.layers.16.mlp.shared_experts.up_proj.weight": "model-00017-of-00081.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00017-of-00081.safetensors",
+ "model.layers.16.post_mlp_layernorm.weight": "model-00017-of-00081.safetensors",
+ "model.layers.16.pre_mlp_layernorm.weight": "model-00017-of-00081.safetensors",
+ "model.layers.16.self_attn.gate_proj.biases": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.gate_proj.scales": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.gate_proj.weight": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.k_norm.weight": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.k_proj.biases": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.k_proj.scales": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.o_proj.biases": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.o_proj.scales": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.q_norm.weight": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.q_proj.biases": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.q_proj.scales": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.v_proj.biases": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.v_proj.scales": "model-00015-of-00081.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00015-of-00081.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.expert_bias": "model-00017-of-00081.safetensors",
+ "model.layers.17.mlp.experts.down_proj.biases": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.experts.down_proj.scales": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.experts.down_proj.weight": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.experts.gate_proj.biases": "model-00017-of-00081.safetensors",
+ "model.layers.17.mlp.experts.gate_proj.scales": "model-00017-of-00081.safetensors",
+ "model.layers.17.mlp.experts.gate_proj.weight": "model-00017-of-00081.safetensors",
+ "model.layers.17.mlp.experts.up_proj.biases": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.experts.up_proj.scales": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.experts.up_proj.weight": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.router.gate.biases": "model-00017-of-00081.safetensors",
+ "model.layers.17.mlp.router.gate.scales": "model-00017-of-00081.safetensors",
+ "model.layers.17.mlp.router.gate.weight": "model-00017-of-00081.safetensors",
+ "model.layers.17.mlp.shared_experts.down_proj.biases": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.shared_experts.down_proj.scales": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.shared_experts.down_proj.weight": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.shared_experts.gate_proj.biases": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.shared_experts.gate_proj.scales": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.shared_experts.gate_proj.weight": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.shared_experts.up_proj.biases": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.shared_experts.up_proj.scales": "model-00018-of-00081.safetensors",
+ "model.layers.17.mlp.shared_experts.up_proj.weight": "model-00018-of-00081.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00018-of-00081.safetensors",
+ "model.layers.17.post_mlp_layernorm.weight": "model-00018-of-00081.safetensors",
+ "model.layers.17.pre_mlp_layernorm.weight": "model-00018-of-00081.safetensors",
+ "model.layers.17.self_attn.gate_proj.biases": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.gate_proj.scales": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.gate_proj.weight": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.k_norm.weight": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.k_proj.biases": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.k_proj.scales": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.o_proj.biases": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.o_proj.scales": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.q_norm.weight": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.q_proj.biases": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.q_proj.scales": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.v_proj.biases": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.v_proj.scales": "model-00017-of-00081.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00017-of-00081.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00020-of-00081.safetensors",
+ "model.layers.18.mlp.expert_bias": "model-00018-of-00081.safetensors",
+ "model.layers.18.mlp.experts.down_proj.biases": "model-00020-of-00081.safetensors",
+ "model.layers.18.mlp.experts.down_proj.scales": "model-00020-of-00081.safetensors",
+ "model.layers.18.mlp.experts.down_proj.weight": "model-00020-of-00081.safetensors",
+ "model.layers.18.mlp.experts.gate_proj.biases": "model-00019-of-00081.safetensors",
+ "model.layers.18.mlp.experts.gate_proj.scales": "model-00019-of-00081.safetensors",
+ "model.layers.18.mlp.experts.gate_proj.weight": "model-00019-of-00081.safetensors",
+ "model.layers.18.mlp.experts.up_proj.biases": "model-00019-of-00081.safetensors",
+ "model.layers.18.mlp.experts.up_proj.scales": "model-00019-of-00081.safetensors",
+ "model.layers.18.mlp.experts.up_proj.weight": "model-00019-of-00081.safetensors",
+ "model.layers.18.mlp.router.gate.biases": "model-00018-of-00081.safetensors",
+ "model.layers.18.mlp.router.gate.scales": "model-00018-of-00081.safetensors",
+ "model.layers.18.mlp.router.gate.weight": "model-00018-of-00081.safetensors",
+ "model.layers.18.mlp.shared_experts.down_proj.biases": "model-00020-of-00081.safetensors",
+ "model.layers.18.mlp.shared_experts.down_proj.scales": "model-00020-of-00081.safetensors",
+ "model.layers.18.mlp.shared_experts.down_proj.weight": "model-00020-of-00081.safetensors",
+ "model.layers.18.mlp.shared_experts.gate_proj.biases": "model-00020-of-00081.safetensors",
+ "model.layers.18.mlp.shared_experts.gate_proj.scales": "model-00020-of-00081.safetensors",
+ "model.layers.18.mlp.shared_experts.gate_proj.weight": "model-00020-of-00081.safetensors",
+ "model.layers.18.mlp.shared_experts.up_proj.biases": "model-00020-of-00081.safetensors",
+ "model.layers.18.mlp.shared_experts.up_proj.scales": "model-00020-of-00081.safetensors",
+ "model.layers.18.mlp.shared_experts.up_proj.weight": "model-00020-of-00081.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00020-of-00081.safetensors",
+ "model.layers.18.post_mlp_layernorm.weight": "model-00020-of-00081.safetensors",
+ "model.layers.18.pre_mlp_layernorm.weight": "model-00020-of-00081.safetensors",
+ "model.layers.18.self_attn.gate_proj.biases": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.gate_proj.scales": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.gate_proj.weight": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.k_norm.weight": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.k_proj.biases": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.k_proj.scales": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.o_proj.biases": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.o_proj.scales": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.q_norm.weight": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.q_proj.biases": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.q_proj.scales": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.v_proj.biases": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.v_proj.scales": "model-00018-of-00081.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00018-of-00081.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.expert_bias": "model-00020-of-00081.safetensors",
+ "model.layers.19.mlp.experts.down_proj.biases": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.experts.down_proj.scales": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.experts.down_proj.weight": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.experts.gate_proj.biases": "model-00020-of-00081.safetensors",
+ "model.layers.19.mlp.experts.gate_proj.scales": "model-00020-of-00081.safetensors",
+ "model.layers.19.mlp.experts.gate_proj.weight": "model-00020-of-00081.safetensors",
+ "model.layers.19.mlp.experts.up_proj.biases": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.experts.up_proj.scales": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.experts.up_proj.weight": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.router.gate.biases": "model-00020-of-00081.safetensors",
+ "model.layers.19.mlp.router.gate.scales": "model-00020-of-00081.safetensors",
+ "model.layers.19.mlp.router.gate.weight": "model-00020-of-00081.safetensors",
+ "model.layers.19.mlp.shared_experts.down_proj.biases": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.shared_experts.down_proj.scales": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.shared_experts.down_proj.weight": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.shared_experts.gate_proj.biases": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.shared_experts.gate_proj.scales": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.shared_experts.gate_proj.weight": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.shared_experts.up_proj.biases": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.shared_experts.up_proj.scales": "model-00021-of-00081.safetensors",
+ "model.layers.19.mlp.shared_experts.up_proj.weight": "model-00021-of-00081.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00021-of-00081.safetensors",
+ "model.layers.19.post_mlp_layernorm.weight": "model-00021-of-00081.safetensors",
+ "model.layers.19.pre_mlp_layernorm.weight": "model-00021-of-00081.safetensors",
+ "model.layers.19.self_attn.gate_proj.biases": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.gate_proj.scales": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.gate_proj.weight": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.k_norm.weight": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.k_proj.biases": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.k_proj.scales": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.o_proj.biases": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.o_proj.scales": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.q_norm.weight": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.q_proj.biases": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.q_proj.scales": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.v_proj.biases": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.v_proj.scales": "model-00020-of-00081.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00020-of-00081.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.mlp.down_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.2.mlp.down_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.mlp.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.2.mlp.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.mlp.up_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.2.mlp.up_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.post_mlp_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.k_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.k_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.o_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.o_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.q_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.q_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.v_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.v_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00023-of-00081.safetensors",
+ "model.layers.20.mlp.expert_bias": "model-00021-of-00081.safetensors",
+ "model.layers.20.mlp.experts.down_proj.biases": "model-00023-of-00081.safetensors",
+ "model.layers.20.mlp.experts.down_proj.scales": "model-00023-of-00081.safetensors",
+ "model.layers.20.mlp.experts.down_proj.weight": "model-00023-of-00081.safetensors",
+ "model.layers.20.mlp.experts.gate_proj.biases": "model-00022-of-00081.safetensors",
+ "model.layers.20.mlp.experts.gate_proj.scales": "model-00022-of-00081.safetensors",
+ "model.layers.20.mlp.experts.gate_proj.weight": "model-00022-of-00081.safetensors",
+ "model.layers.20.mlp.experts.up_proj.biases": "model-00022-of-00081.safetensors",
+ "model.layers.20.mlp.experts.up_proj.scales": "model-00022-of-00081.safetensors",
+ "model.layers.20.mlp.experts.up_proj.weight": "model-00022-of-00081.safetensors",
+ "model.layers.20.mlp.router.gate.biases": "model-00021-of-00081.safetensors",
+ "model.layers.20.mlp.router.gate.scales": "model-00021-of-00081.safetensors",
+ "model.layers.20.mlp.router.gate.weight": "model-00021-of-00081.safetensors",
+ "model.layers.20.mlp.shared_experts.down_proj.biases": "model-00023-of-00081.safetensors",
+ "model.layers.20.mlp.shared_experts.down_proj.scales": "model-00023-of-00081.safetensors",
+ "model.layers.20.mlp.shared_experts.down_proj.weight": "model-00023-of-00081.safetensors",
+ "model.layers.20.mlp.shared_experts.gate_proj.biases": "model-00023-of-00081.safetensors",
+ "model.layers.20.mlp.shared_experts.gate_proj.scales": "model-00023-of-00081.safetensors",
+ "model.layers.20.mlp.shared_experts.gate_proj.weight": "model-00023-of-00081.safetensors",
+ "model.layers.20.mlp.shared_experts.up_proj.biases": "model-00023-of-00081.safetensors",
+ "model.layers.20.mlp.shared_experts.up_proj.scales": "model-00023-of-00081.safetensors",
+ "model.layers.20.mlp.shared_experts.up_proj.weight": "model-00023-of-00081.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00023-of-00081.safetensors",
+ "model.layers.20.post_mlp_layernorm.weight": "model-00023-of-00081.safetensors",
+ "model.layers.20.pre_mlp_layernorm.weight": "model-00023-of-00081.safetensors",
+ "model.layers.20.self_attn.gate_proj.biases": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.gate_proj.scales": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.gate_proj.weight": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.k_norm.weight": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.k_proj.biases": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.k_proj.scales": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.o_proj.biases": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.o_proj.scales": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.q_norm.weight": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.q_proj.biases": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.q_proj.scales": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.v_proj.biases": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.v_proj.scales": "model-00021-of-00081.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00021-of-00081.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.expert_bias": "model-00023-of-00081.safetensors",
+ "model.layers.21.mlp.experts.down_proj.biases": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.experts.down_proj.scales": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.experts.down_proj.weight": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.experts.gate_proj.biases": "model-00023-of-00081.safetensors",
+ "model.layers.21.mlp.experts.gate_proj.scales": "model-00023-of-00081.safetensors",
+ "model.layers.21.mlp.experts.gate_proj.weight": "model-00023-of-00081.safetensors",
+ "model.layers.21.mlp.experts.up_proj.biases": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.experts.up_proj.scales": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.experts.up_proj.weight": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.router.gate.biases": "model-00023-of-00081.safetensors",
+ "model.layers.21.mlp.router.gate.scales": "model-00023-of-00081.safetensors",
+ "model.layers.21.mlp.router.gate.weight": "model-00023-of-00081.safetensors",
+ "model.layers.21.mlp.shared_experts.down_proj.biases": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.shared_experts.down_proj.scales": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.shared_experts.down_proj.weight": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.shared_experts.gate_proj.biases": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.shared_experts.gate_proj.scales": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.shared_experts.gate_proj.weight": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.shared_experts.up_proj.biases": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.shared_experts.up_proj.scales": "model-00024-of-00081.safetensors",
+ "model.layers.21.mlp.shared_experts.up_proj.weight": "model-00024-of-00081.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00024-of-00081.safetensors",
+ "model.layers.21.post_mlp_layernorm.weight": "model-00024-of-00081.safetensors",
+ "model.layers.21.pre_mlp_layernorm.weight": "model-00024-of-00081.safetensors",
+ "model.layers.21.self_attn.gate_proj.biases": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.gate_proj.scales": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.gate_proj.weight": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.k_norm.weight": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.k_proj.biases": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.k_proj.scales": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.o_proj.biases": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.o_proj.scales": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.q_norm.weight": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.q_proj.biases": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.q_proj.scales": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.v_proj.biases": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.v_proj.scales": "model-00023-of-00081.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00023-of-00081.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00026-of-00081.safetensors",
+ "model.layers.22.mlp.expert_bias": "model-00024-of-00081.safetensors",
+ "model.layers.22.mlp.experts.down_proj.biases": "model-00026-of-00081.safetensors",
+ "model.layers.22.mlp.experts.down_proj.scales": "model-00026-of-00081.safetensors",
+ "model.layers.22.mlp.experts.down_proj.weight": "model-00026-of-00081.safetensors",
+ "model.layers.22.mlp.experts.gate_proj.biases": "model-00025-of-00081.safetensors",
+ "model.layers.22.mlp.experts.gate_proj.scales": "model-00025-of-00081.safetensors",
+ "model.layers.22.mlp.experts.gate_proj.weight": "model-00025-of-00081.safetensors",
+ "model.layers.22.mlp.experts.up_proj.biases": "model-00025-of-00081.safetensors",
+ "model.layers.22.mlp.experts.up_proj.scales": "model-00025-of-00081.safetensors",
+ "model.layers.22.mlp.experts.up_proj.weight": "model-00025-of-00081.safetensors",
+ "model.layers.22.mlp.router.gate.biases": "model-00024-of-00081.safetensors",
+ "model.layers.22.mlp.router.gate.scales": "model-00024-of-00081.safetensors",
+ "model.layers.22.mlp.router.gate.weight": "model-00024-of-00081.safetensors",
+ "model.layers.22.mlp.shared_experts.down_proj.biases": "model-00026-of-00081.safetensors",
+ "model.layers.22.mlp.shared_experts.down_proj.scales": "model-00026-of-00081.safetensors",
+ "model.layers.22.mlp.shared_experts.down_proj.weight": "model-00026-of-00081.safetensors",
+ "model.layers.22.mlp.shared_experts.gate_proj.biases": "model-00026-of-00081.safetensors",
+ "model.layers.22.mlp.shared_experts.gate_proj.scales": "model-00026-of-00081.safetensors",
+ "model.layers.22.mlp.shared_experts.gate_proj.weight": "model-00026-of-00081.safetensors",
+ "model.layers.22.mlp.shared_experts.up_proj.biases": "model-00026-of-00081.safetensors",
+ "model.layers.22.mlp.shared_experts.up_proj.scales": "model-00026-of-00081.safetensors",
+ "model.layers.22.mlp.shared_experts.up_proj.weight": "model-00026-of-00081.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00026-of-00081.safetensors",
+ "model.layers.22.post_mlp_layernorm.weight": "model-00026-of-00081.safetensors",
+ "model.layers.22.pre_mlp_layernorm.weight": "model-00026-of-00081.safetensors",
+ "model.layers.22.self_attn.gate_proj.biases": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.gate_proj.scales": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.gate_proj.weight": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.k_norm.weight": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.k_proj.biases": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.k_proj.scales": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.o_proj.biases": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.o_proj.scales": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.q_norm.weight": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.q_proj.biases": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.q_proj.scales": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.v_proj.biases": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.v_proj.scales": "model-00024-of-00081.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00024-of-00081.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.expert_bias": "model-00026-of-00081.safetensors",
+ "model.layers.23.mlp.experts.down_proj.biases": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.experts.down_proj.scales": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.experts.down_proj.weight": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.experts.gate_proj.biases": "model-00026-of-00081.safetensors",
+ "model.layers.23.mlp.experts.gate_proj.scales": "model-00026-of-00081.safetensors",
+ "model.layers.23.mlp.experts.gate_proj.weight": "model-00026-of-00081.safetensors",
+ "model.layers.23.mlp.experts.up_proj.biases": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.experts.up_proj.scales": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.experts.up_proj.weight": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.router.gate.biases": "model-00026-of-00081.safetensors",
+ "model.layers.23.mlp.router.gate.scales": "model-00026-of-00081.safetensors",
+ "model.layers.23.mlp.router.gate.weight": "model-00026-of-00081.safetensors",
+ "model.layers.23.mlp.shared_experts.down_proj.biases": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.shared_experts.down_proj.scales": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.shared_experts.down_proj.weight": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.shared_experts.gate_proj.biases": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.shared_experts.gate_proj.scales": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.shared_experts.gate_proj.weight": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.shared_experts.up_proj.biases": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.shared_experts.up_proj.scales": "model-00027-of-00081.safetensors",
+ "model.layers.23.mlp.shared_experts.up_proj.weight": "model-00027-of-00081.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00027-of-00081.safetensors",
+ "model.layers.23.post_mlp_layernorm.weight": "model-00027-of-00081.safetensors",
+ "model.layers.23.pre_mlp_layernorm.weight": "model-00027-of-00081.safetensors",
+ "model.layers.23.self_attn.gate_proj.biases": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.gate_proj.scales": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.gate_proj.weight": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.k_norm.weight": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.k_proj.biases": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.k_proj.scales": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.o_proj.biases": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.o_proj.scales": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.q_norm.weight": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.q_proj.biases": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.q_proj.scales": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.v_proj.biases": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.v_proj.scales": "model-00026-of-00081.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00026-of-00081.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00029-of-00081.safetensors",
+ "model.layers.24.mlp.expert_bias": "model-00027-of-00081.safetensors",
+ "model.layers.24.mlp.experts.down_proj.biases": "model-00029-of-00081.safetensors",
+ "model.layers.24.mlp.experts.down_proj.scales": "model-00029-of-00081.safetensors",
+ "model.layers.24.mlp.experts.down_proj.weight": "model-00029-of-00081.safetensors",
+ "model.layers.24.mlp.experts.gate_proj.biases": "model-00028-of-00081.safetensors",
+ "model.layers.24.mlp.experts.gate_proj.scales": "model-00028-of-00081.safetensors",
+ "model.layers.24.mlp.experts.gate_proj.weight": "model-00028-of-00081.safetensors",
+ "model.layers.24.mlp.experts.up_proj.biases": "model-00028-of-00081.safetensors",
+ "model.layers.24.mlp.experts.up_proj.scales": "model-00028-of-00081.safetensors",
+ "model.layers.24.mlp.experts.up_proj.weight": "model-00028-of-00081.safetensors",
+ "model.layers.24.mlp.router.gate.biases": "model-00027-of-00081.safetensors",
+ "model.layers.24.mlp.router.gate.scales": "model-00027-of-00081.safetensors",
+ "model.layers.24.mlp.router.gate.weight": "model-00027-of-00081.safetensors",
+ "model.layers.24.mlp.shared_experts.down_proj.biases": "model-00029-of-00081.safetensors",
+ "model.layers.24.mlp.shared_experts.down_proj.scales": "model-00029-of-00081.safetensors",
+ "model.layers.24.mlp.shared_experts.down_proj.weight": "model-00029-of-00081.safetensors",
+ "model.layers.24.mlp.shared_experts.gate_proj.biases": "model-00029-of-00081.safetensors",
+ "model.layers.24.mlp.shared_experts.gate_proj.scales": "model-00029-of-00081.safetensors",
+ "model.layers.24.mlp.shared_experts.gate_proj.weight": "model-00029-of-00081.safetensors",
+ "model.layers.24.mlp.shared_experts.up_proj.biases": "model-00029-of-00081.safetensors",
+ "model.layers.24.mlp.shared_experts.up_proj.scales": "model-00029-of-00081.safetensors",
+ "model.layers.24.mlp.shared_experts.up_proj.weight": "model-00029-of-00081.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00029-of-00081.safetensors",
+ "model.layers.24.post_mlp_layernorm.weight": "model-00029-of-00081.safetensors",
+ "model.layers.24.pre_mlp_layernorm.weight": "model-00029-of-00081.safetensors",
+ "model.layers.24.self_attn.gate_proj.biases": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.gate_proj.scales": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.gate_proj.weight": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.k_norm.weight": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.k_proj.biases": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.k_proj.scales": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.o_proj.biases": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.o_proj.scales": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.q_norm.weight": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.q_proj.biases": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.q_proj.scales": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.v_proj.biases": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.v_proj.scales": "model-00027-of-00081.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00027-of-00081.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.expert_bias": "model-00029-of-00081.safetensors",
+ "model.layers.25.mlp.experts.down_proj.biases": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.experts.down_proj.scales": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.experts.down_proj.weight": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.experts.gate_proj.biases": "model-00029-of-00081.safetensors",
+ "model.layers.25.mlp.experts.gate_proj.scales": "model-00029-of-00081.safetensors",
+ "model.layers.25.mlp.experts.gate_proj.weight": "model-00029-of-00081.safetensors",
+ "model.layers.25.mlp.experts.up_proj.biases": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.experts.up_proj.scales": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.experts.up_proj.weight": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.router.gate.biases": "model-00029-of-00081.safetensors",
+ "model.layers.25.mlp.router.gate.scales": "model-00029-of-00081.safetensors",
+ "model.layers.25.mlp.router.gate.weight": "model-00029-of-00081.safetensors",
+ "model.layers.25.mlp.shared_experts.down_proj.biases": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.shared_experts.down_proj.scales": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.shared_experts.down_proj.weight": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.shared_experts.gate_proj.biases": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.shared_experts.gate_proj.scales": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.shared_experts.gate_proj.weight": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.shared_experts.up_proj.biases": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.shared_experts.up_proj.scales": "model-00030-of-00081.safetensors",
+ "model.layers.25.mlp.shared_experts.up_proj.weight": "model-00030-of-00081.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00030-of-00081.safetensors",
+ "model.layers.25.post_mlp_layernorm.weight": "model-00030-of-00081.safetensors",
+ "model.layers.25.pre_mlp_layernorm.weight": "model-00030-of-00081.safetensors",
+ "model.layers.25.self_attn.gate_proj.biases": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.gate_proj.scales": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.gate_proj.weight": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.k_norm.weight": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.k_proj.biases": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.k_proj.scales": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.o_proj.biases": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.o_proj.scales": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.q_norm.weight": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.q_proj.biases": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.q_proj.scales": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.v_proj.biases": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.v_proj.scales": "model-00029-of-00081.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00029-of-00081.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00032-of-00081.safetensors",
+ "model.layers.26.mlp.expert_bias": "model-00030-of-00081.safetensors",
+ "model.layers.26.mlp.experts.down_proj.biases": "model-00032-of-00081.safetensors",
+ "model.layers.26.mlp.experts.down_proj.scales": "model-00032-of-00081.safetensors",
+ "model.layers.26.mlp.experts.down_proj.weight": "model-00032-of-00081.safetensors",
+ "model.layers.26.mlp.experts.gate_proj.biases": "model-00031-of-00081.safetensors",
+ "model.layers.26.mlp.experts.gate_proj.scales": "model-00031-of-00081.safetensors",
+ "model.layers.26.mlp.experts.gate_proj.weight": "model-00031-of-00081.safetensors",
+ "model.layers.26.mlp.experts.up_proj.biases": "model-00031-of-00081.safetensors",
+ "model.layers.26.mlp.experts.up_proj.scales": "model-00031-of-00081.safetensors",
+ "model.layers.26.mlp.experts.up_proj.weight": "model-00031-of-00081.safetensors",
+ "model.layers.26.mlp.router.gate.biases": "model-00030-of-00081.safetensors",
+ "model.layers.26.mlp.router.gate.scales": "model-00030-of-00081.safetensors",
+ "model.layers.26.mlp.router.gate.weight": "model-00030-of-00081.safetensors",
+ "model.layers.26.mlp.shared_experts.down_proj.biases": "model-00032-of-00081.safetensors",
+ "model.layers.26.mlp.shared_experts.down_proj.scales": "model-00032-of-00081.safetensors",
+ "model.layers.26.mlp.shared_experts.down_proj.weight": "model-00032-of-00081.safetensors",
+ "model.layers.26.mlp.shared_experts.gate_proj.biases": "model-00032-of-00081.safetensors",
+ "model.layers.26.mlp.shared_experts.gate_proj.scales": "model-00032-of-00081.safetensors",
+ "model.layers.26.mlp.shared_experts.gate_proj.weight": "model-00032-of-00081.safetensors",
+ "model.layers.26.mlp.shared_experts.up_proj.biases": "model-00032-of-00081.safetensors",
+ "model.layers.26.mlp.shared_experts.up_proj.scales": "model-00032-of-00081.safetensors",
+ "model.layers.26.mlp.shared_experts.up_proj.weight": "model-00032-of-00081.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00032-of-00081.safetensors",
+ "model.layers.26.post_mlp_layernorm.weight": "model-00032-of-00081.safetensors",
+ "model.layers.26.pre_mlp_layernorm.weight": "model-00032-of-00081.safetensors",
+ "model.layers.26.self_attn.gate_proj.biases": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.gate_proj.scales": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.gate_proj.weight": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.k_norm.weight": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.k_proj.biases": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.k_proj.scales": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.o_proj.biases": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.o_proj.scales": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.q_norm.weight": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.q_proj.biases": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.q_proj.scales": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.v_proj.biases": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.v_proj.scales": "model-00030-of-00081.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00030-of-00081.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.expert_bias": "model-00032-of-00081.safetensors",
+ "model.layers.27.mlp.experts.down_proj.biases": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.experts.down_proj.scales": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.experts.down_proj.weight": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.experts.gate_proj.biases": "model-00032-of-00081.safetensors",
+ "model.layers.27.mlp.experts.gate_proj.scales": "model-00032-of-00081.safetensors",
+ "model.layers.27.mlp.experts.gate_proj.weight": "model-00032-of-00081.safetensors",
+ "model.layers.27.mlp.experts.up_proj.biases": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.experts.up_proj.scales": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.experts.up_proj.weight": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.router.gate.biases": "model-00032-of-00081.safetensors",
+ "model.layers.27.mlp.router.gate.scales": "model-00032-of-00081.safetensors",
+ "model.layers.27.mlp.router.gate.weight": "model-00032-of-00081.safetensors",
+ "model.layers.27.mlp.shared_experts.down_proj.biases": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.shared_experts.down_proj.scales": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.shared_experts.down_proj.weight": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.shared_experts.gate_proj.biases": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.shared_experts.gate_proj.scales": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.shared_experts.gate_proj.weight": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.shared_experts.up_proj.biases": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.shared_experts.up_proj.scales": "model-00033-of-00081.safetensors",
+ "model.layers.27.mlp.shared_experts.up_proj.weight": "model-00033-of-00081.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00033-of-00081.safetensors",
+ "model.layers.27.post_mlp_layernorm.weight": "model-00033-of-00081.safetensors",
+ "model.layers.27.pre_mlp_layernorm.weight": "model-00033-of-00081.safetensors",
+ "model.layers.27.self_attn.gate_proj.biases": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.gate_proj.scales": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.gate_proj.weight": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.k_norm.weight": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.k_proj.biases": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.k_proj.scales": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.o_proj.biases": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.o_proj.scales": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.q_norm.weight": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.q_proj.biases": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.q_proj.scales": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.v_proj.biases": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.v_proj.scales": "model-00032-of-00081.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00032-of-00081.safetensors",
+ "model.layers.28.input_layernorm.weight": "model-00035-of-00081.safetensors",
+ "model.layers.28.mlp.expert_bias": "model-00033-of-00081.safetensors",
+ "model.layers.28.mlp.experts.down_proj.biases": "model-00035-of-00081.safetensors",
+ "model.layers.28.mlp.experts.down_proj.scales": "model-00035-of-00081.safetensors",
+ "model.layers.28.mlp.experts.down_proj.weight": "model-00035-of-00081.safetensors",
+ "model.layers.28.mlp.experts.gate_proj.biases": "model-00034-of-00081.safetensors",
+ "model.layers.28.mlp.experts.gate_proj.scales": "model-00034-of-00081.safetensors",
+ "model.layers.28.mlp.experts.gate_proj.weight": "model-00034-of-00081.safetensors",
+ "model.layers.28.mlp.experts.up_proj.biases": "model-00034-of-00081.safetensors",
+ "model.layers.28.mlp.experts.up_proj.scales": "model-00034-of-00081.safetensors",
+ "model.layers.28.mlp.experts.up_proj.weight": "model-00034-of-00081.safetensors",
+ "model.layers.28.mlp.router.gate.biases": "model-00033-of-00081.safetensors",
+ "model.layers.28.mlp.router.gate.scales": "model-00033-of-00081.safetensors",
+ "model.layers.28.mlp.router.gate.weight": "model-00033-of-00081.safetensors",
+ "model.layers.28.mlp.shared_experts.down_proj.biases": "model-00035-of-00081.safetensors",
+ "model.layers.28.mlp.shared_experts.down_proj.scales": "model-00035-of-00081.safetensors",
+ "model.layers.28.mlp.shared_experts.down_proj.weight": "model-00035-of-00081.safetensors",
+ "model.layers.28.mlp.shared_experts.gate_proj.biases": "model-00035-of-00081.safetensors",
+ "model.layers.28.mlp.shared_experts.gate_proj.scales": "model-00035-of-00081.safetensors",
+ "model.layers.28.mlp.shared_experts.gate_proj.weight": "model-00035-of-00081.safetensors",
+ "model.layers.28.mlp.shared_experts.up_proj.biases": "model-00035-of-00081.safetensors",
+ "model.layers.28.mlp.shared_experts.up_proj.scales": "model-00035-of-00081.safetensors",
+ "model.layers.28.mlp.shared_experts.up_proj.weight": "model-00035-of-00081.safetensors",
+ "model.layers.28.post_attention_layernorm.weight": "model-00035-of-00081.safetensors",
+ "model.layers.28.post_mlp_layernorm.weight": "model-00035-of-00081.safetensors",
+ "model.layers.28.pre_mlp_layernorm.weight": "model-00035-of-00081.safetensors",
+ "model.layers.28.self_attn.gate_proj.biases": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.gate_proj.scales": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.gate_proj.weight": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.k_norm.weight": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.k_proj.biases": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.k_proj.scales": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.k_proj.weight": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.o_proj.biases": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.o_proj.scales": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.o_proj.weight": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.q_norm.weight": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.q_proj.biases": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.q_proj.scales": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.q_proj.weight": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.v_proj.biases": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.v_proj.scales": "model-00033-of-00081.safetensors",
+ "model.layers.28.self_attn.v_proj.weight": "model-00033-of-00081.safetensors",
+ "model.layers.29.input_layernorm.weight": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.expert_bias": "model-00035-of-00081.safetensors",
+ "model.layers.29.mlp.experts.down_proj.biases": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.experts.down_proj.scales": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.experts.down_proj.weight": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.experts.gate_proj.biases": "model-00035-of-00081.safetensors",
+ "model.layers.29.mlp.experts.gate_proj.scales": "model-00035-of-00081.safetensors",
+ "model.layers.29.mlp.experts.gate_proj.weight": "model-00035-of-00081.safetensors",
+ "model.layers.29.mlp.experts.up_proj.biases": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.experts.up_proj.scales": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.experts.up_proj.weight": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.router.gate.biases": "model-00035-of-00081.safetensors",
+ "model.layers.29.mlp.router.gate.scales": "model-00035-of-00081.safetensors",
+ "model.layers.29.mlp.router.gate.weight": "model-00035-of-00081.safetensors",
+ "model.layers.29.mlp.shared_experts.down_proj.biases": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.shared_experts.down_proj.scales": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.shared_experts.down_proj.weight": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.shared_experts.gate_proj.biases": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.shared_experts.gate_proj.scales": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.shared_experts.gate_proj.weight": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.shared_experts.up_proj.biases": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.shared_experts.up_proj.scales": "model-00036-of-00081.safetensors",
+ "model.layers.29.mlp.shared_experts.up_proj.weight": "model-00036-of-00081.safetensors",
+ "model.layers.29.post_attention_layernorm.weight": "model-00036-of-00081.safetensors",
+ "model.layers.29.post_mlp_layernorm.weight": "model-00036-of-00081.safetensors",
+ "model.layers.29.pre_mlp_layernorm.weight": "model-00036-of-00081.safetensors",
+ "model.layers.29.self_attn.gate_proj.biases": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.gate_proj.scales": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.gate_proj.weight": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.k_norm.weight": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.k_proj.biases": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.k_proj.scales": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.k_proj.weight": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.o_proj.biases": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.o_proj.scales": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.o_proj.weight": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.q_norm.weight": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.q_proj.biases": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.q_proj.scales": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.q_proj.weight": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.v_proj.biases": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.v_proj.scales": "model-00035-of-00081.safetensors",
+ "model.layers.29.self_attn.v_proj.weight": "model-00035-of-00081.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.mlp.down_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.3.mlp.down_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.mlp.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.3.mlp.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.mlp.up_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.3.mlp.up_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.post_mlp_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.k_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.k_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.o_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.o_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.q_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.q_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.v_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.v_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.30.input_layernorm.weight": "model-00038-of-00081.safetensors",
+ "model.layers.30.mlp.expert_bias": "model-00036-of-00081.safetensors",
+ "model.layers.30.mlp.experts.down_proj.biases": "model-00038-of-00081.safetensors",
+ "model.layers.30.mlp.experts.down_proj.scales": "model-00038-of-00081.safetensors",
+ "model.layers.30.mlp.experts.down_proj.weight": "model-00038-of-00081.safetensors",
+ "model.layers.30.mlp.experts.gate_proj.biases": "model-00037-of-00081.safetensors",
+ "model.layers.30.mlp.experts.gate_proj.scales": "model-00037-of-00081.safetensors",
+ "model.layers.30.mlp.experts.gate_proj.weight": "model-00037-of-00081.safetensors",
+ "model.layers.30.mlp.experts.up_proj.biases": "model-00037-of-00081.safetensors",
+ "model.layers.30.mlp.experts.up_proj.scales": "model-00037-of-00081.safetensors",
+ "model.layers.30.mlp.experts.up_proj.weight": "model-00037-of-00081.safetensors",
+ "model.layers.30.mlp.router.gate.biases": "model-00036-of-00081.safetensors",
+ "model.layers.30.mlp.router.gate.scales": "model-00036-of-00081.safetensors",
+ "model.layers.30.mlp.router.gate.weight": "model-00036-of-00081.safetensors",
+ "model.layers.30.mlp.shared_experts.down_proj.biases": "model-00038-of-00081.safetensors",
+ "model.layers.30.mlp.shared_experts.down_proj.scales": "model-00038-of-00081.safetensors",
+ "model.layers.30.mlp.shared_experts.down_proj.weight": "model-00038-of-00081.safetensors",
+ "model.layers.30.mlp.shared_experts.gate_proj.biases": "model-00038-of-00081.safetensors",
+ "model.layers.30.mlp.shared_experts.gate_proj.scales": "model-00038-of-00081.safetensors",
+ "model.layers.30.mlp.shared_experts.gate_proj.weight": "model-00038-of-00081.safetensors",
+ "model.layers.30.mlp.shared_experts.up_proj.biases": "model-00038-of-00081.safetensors",
+ "model.layers.30.mlp.shared_experts.up_proj.scales": "model-00038-of-00081.safetensors",
+ "model.layers.30.mlp.shared_experts.up_proj.weight": "model-00038-of-00081.safetensors",
+ "model.layers.30.post_attention_layernorm.weight": "model-00038-of-00081.safetensors",
+ "model.layers.30.post_mlp_layernorm.weight": "model-00038-of-00081.safetensors",
+ "model.layers.30.pre_mlp_layernorm.weight": "model-00038-of-00081.safetensors",
+ "model.layers.30.self_attn.gate_proj.biases": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.gate_proj.scales": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.gate_proj.weight": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.k_norm.weight": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.k_proj.biases": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.k_proj.scales": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.k_proj.weight": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.o_proj.biases": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.o_proj.scales": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.o_proj.weight": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.q_norm.weight": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.q_proj.biases": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.q_proj.scales": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.q_proj.weight": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.v_proj.biases": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.v_proj.scales": "model-00036-of-00081.safetensors",
+ "model.layers.30.self_attn.v_proj.weight": "model-00036-of-00081.safetensors",
+ "model.layers.31.input_layernorm.weight": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.expert_bias": "model-00038-of-00081.safetensors",
+ "model.layers.31.mlp.experts.down_proj.biases": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.experts.down_proj.scales": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.experts.down_proj.weight": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.experts.gate_proj.biases": "model-00038-of-00081.safetensors",
+ "model.layers.31.mlp.experts.gate_proj.scales": "model-00038-of-00081.safetensors",
+ "model.layers.31.mlp.experts.gate_proj.weight": "model-00038-of-00081.safetensors",
+ "model.layers.31.mlp.experts.up_proj.biases": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.experts.up_proj.scales": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.experts.up_proj.weight": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.router.gate.biases": "model-00038-of-00081.safetensors",
+ "model.layers.31.mlp.router.gate.scales": "model-00038-of-00081.safetensors",
+ "model.layers.31.mlp.router.gate.weight": "model-00038-of-00081.safetensors",
+ "model.layers.31.mlp.shared_experts.down_proj.biases": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.shared_experts.down_proj.scales": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.shared_experts.down_proj.weight": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.shared_experts.gate_proj.biases": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.shared_experts.gate_proj.scales": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.shared_experts.gate_proj.weight": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.shared_experts.up_proj.biases": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.shared_experts.up_proj.scales": "model-00039-of-00081.safetensors",
+ "model.layers.31.mlp.shared_experts.up_proj.weight": "model-00039-of-00081.safetensors",
+ "model.layers.31.post_attention_layernorm.weight": "model-00039-of-00081.safetensors",
+ "model.layers.31.post_mlp_layernorm.weight": "model-00039-of-00081.safetensors",
+ "model.layers.31.pre_mlp_layernorm.weight": "model-00039-of-00081.safetensors",
+ "model.layers.31.self_attn.gate_proj.biases": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.gate_proj.scales": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.gate_proj.weight": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.k_norm.weight": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.k_proj.biases": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.k_proj.scales": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.k_proj.weight": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.o_proj.biases": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.o_proj.scales": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.o_proj.weight": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.q_norm.weight": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.q_proj.biases": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.q_proj.scales": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.q_proj.weight": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.v_proj.biases": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.v_proj.scales": "model-00038-of-00081.safetensors",
+ "model.layers.31.self_attn.v_proj.weight": "model-00038-of-00081.safetensors",
+ "model.layers.32.input_layernorm.weight": "model-00041-of-00081.safetensors",
+ "model.layers.32.mlp.expert_bias": "model-00039-of-00081.safetensors",
+ "model.layers.32.mlp.experts.down_proj.biases": "model-00041-of-00081.safetensors",
+ "model.layers.32.mlp.experts.down_proj.scales": "model-00041-of-00081.safetensors",
+ "model.layers.32.mlp.experts.down_proj.weight": "model-00041-of-00081.safetensors",
+ "model.layers.32.mlp.experts.gate_proj.biases": "model-00040-of-00081.safetensors",
+ "model.layers.32.mlp.experts.gate_proj.scales": "model-00040-of-00081.safetensors",
+ "model.layers.32.mlp.experts.gate_proj.weight": "model-00040-of-00081.safetensors",
+ "model.layers.32.mlp.experts.up_proj.biases": "model-00040-of-00081.safetensors",
+ "model.layers.32.mlp.experts.up_proj.scales": "model-00040-of-00081.safetensors",
+ "model.layers.32.mlp.experts.up_proj.weight": "model-00040-of-00081.safetensors",
+ "model.layers.32.mlp.router.gate.biases": "model-00039-of-00081.safetensors",
+ "model.layers.32.mlp.router.gate.scales": "model-00039-of-00081.safetensors",
+ "model.layers.32.mlp.router.gate.weight": "model-00039-of-00081.safetensors",
+ "model.layers.32.mlp.shared_experts.down_proj.biases": "model-00041-of-00081.safetensors",
+ "model.layers.32.mlp.shared_experts.down_proj.scales": "model-00041-of-00081.safetensors",
+ "model.layers.32.mlp.shared_experts.down_proj.weight": "model-00041-of-00081.safetensors",
+ "model.layers.32.mlp.shared_experts.gate_proj.biases": "model-00041-of-00081.safetensors",
+ "model.layers.32.mlp.shared_experts.gate_proj.scales": "model-00041-of-00081.safetensors",
+ "model.layers.32.mlp.shared_experts.gate_proj.weight": "model-00041-of-00081.safetensors",
+ "model.layers.32.mlp.shared_experts.up_proj.biases": "model-00041-of-00081.safetensors",
+ "model.layers.32.mlp.shared_experts.up_proj.scales": "model-00041-of-00081.safetensors",
+ "model.layers.32.mlp.shared_experts.up_proj.weight": "model-00041-of-00081.safetensors",
+ "model.layers.32.post_attention_layernorm.weight": "model-00041-of-00081.safetensors",
+ "model.layers.32.post_mlp_layernorm.weight": "model-00041-of-00081.safetensors",
+ "model.layers.32.pre_mlp_layernorm.weight": "model-00041-of-00081.safetensors",
+ "model.layers.32.self_attn.gate_proj.biases": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.gate_proj.scales": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.gate_proj.weight": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.k_norm.weight": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.k_proj.biases": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.k_proj.scales": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.k_proj.weight": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.o_proj.biases": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.o_proj.scales": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.o_proj.weight": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.q_norm.weight": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.q_proj.biases": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.q_proj.scales": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.q_proj.weight": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.v_proj.biases": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.v_proj.scales": "model-00039-of-00081.safetensors",
+ "model.layers.32.self_attn.v_proj.weight": "model-00039-of-00081.safetensors",
+ "model.layers.33.input_layernorm.weight": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.expert_bias": "model-00041-of-00081.safetensors",
+ "model.layers.33.mlp.experts.down_proj.biases": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.experts.down_proj.scales": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.experts.down_proj.weight": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.experts.gate_proj.biases": "model-00041-of-00081.safetensors",
+ "model.layers.33.mlp.experts.gate_proj.scales": "model-00041-of-00081.safetensors",
+ "model.layers.33.mlp.experts.gate_proj.weight": "model-00041-of-00081.safetensors",
+ "model.layers.33.mlp.experts.up_proj.biases": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.experts.up_proj.scales": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.experts.up_proj.weight": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.router.gate.biases": "model-00041-of-00081.safetensors",
+ "model.layers.33.mlp.router.gate.scales": "model-00041-of-00081.safetensors",
+ "model.layers.33.mlp.router.gate.weight": "model-00041-of-00081.safetensors",
+ "model.layers.33.mlp.shared_experts.down_proj.biases": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.shared_experts.down_proj.scales": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.shared_experts.down_proj.weight": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.shared_experts.gate_proj.biases": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.shared_experts.gate_proj.scales": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.shared_experts.gate_proj.weight": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.shared_experts.up_proj.biases": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.shared_experts.up_proj.scales": "model-00042-of-00081.safetensors",
+ "model.layers.33.mlp.shared_experts.up_proj.weight": "model-00042-of-00081.safetensors",
+ "model.layers.33.post_attention_layernorm.weight": "model-00042-of-00081.safetensors",
+ "model.layers.33.post_mlp_layernorm.weight": "model-00042-of-00081.safetensors",
+ "model.layers.33.pre_mlp_layernorm.weight": "model-00042-of-00081.safetensors",
+ "model.layers.33.self_attn.gate_proj.biases": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.gate_proj.scales": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.gate_proj.weight": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.k_norm.weight": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.k_proj.biases": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.k_proj.scales": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.k_proj.weight": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.o_proj.biases": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.o_proj.scales": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.o_proj.weight": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.q_norm.weight": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.q_proj.biases": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.q_proj.scales": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.q_proj.weight": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.v_proj.biases": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.v_proj.scales": "model-00041-of-00081.safetensors",
+ "model.layers.33.self_attn.v_proj.weight": "model-00041-of-00081.safetensors",
+ "model.layers.34.input_layernorm.weight": "model-00044-of-00081.safetensors",
+ "model.layers.34.mlp.expert_bias": "model-00042-of-00081.safetensors",
+ "model.layers.34.mlp.experts.down_proj.biases": "model-00044-of-00081.safetensors",
+ "model.layers.34.mlp.experts.down_proj.scales": "model-00044-of-00081.safetensors",
+ "model.layers.34.mlp.experts.down_proj.weight": "model-00044-of-00081.safetensors",
+ "model.layers.34.mlp.experts.gate_proj.biases": "model-00043-of-00081.safetensors",
+ "model.layers.34.mlp.experts.gate_proj.scales": "model-00043-of-00081.safetensors",
+ "model.layers.34.mlp.experts.gate_proj.weight": "model-00043-of-00081.safetensors",
+ "model.layers.34.mlp.experts.up_proj.biases": "model-00043-of-00081.safetensors",
+ "model.layers.34.mlp.experts.up_proj.scales": "model-00043-of-00081.safetensors",
+ "model.layers.34.mlp.experts.up_proj.weight": "model-00043-of-00081.safetensors",
+ "model.layers.34.mlp.router.gate.biases": "model-00042-of-00081.safetensors",
+ "model.layers.34.mlp.router.gate.scales": "model-00042-of-00081.safetensors",
+ "model.layers.34.mlp.router.gate.weight": "model-00042-of-00081.safetensors",
+ "model.layers.34.mlp.shared_experts.down_proj.biases": "model-00044-of-00081.safetensors",
+ "model.layers.34.mlp.shared_experts.down_proj.scales": "model-00044-of-00081.safetensors",
+ "model.layers.34.mlp.shared_experts.down_proj.weight": "model-00044-of-00081.safetensors",
+ "model.layers.34.mlp.shared_experts.gate_proj.biases": "model-00044-of-00081.safetensors",
+ "model.layers.34.mlp.shared_experts.gate_proj.scales": "model-00044-of-00081.safetensors",
+ "model.layers.34.mlp.shared_experts.gate_proj.weight": "model-00044-of-00081.safetensors",
+ "model.layers.34.mlp.shared_experts.up_proj.biases": "model-00044-of-00081.safetensors",
+ "model.layers.34.mlp.shared_experts.up_proj.scales": "model-00044-of-00081.safetensors",
+ "model.layers.34.mlp.shared_experts.up_proj.weight": "model-00044-of-00081.safetensors",
+ "model.layers.34.post_attention_layernorm.weight": "model-00044-of-00081.safetensors",
+ "model.layers.34.post_mlp_layernorm.weight": "model-00044-of-00081.safetensors",
+ "model.layers.34.pre_mlp_layernorm.weight": "model-00044-of-00081.safetensors",
+ "model.layers.34.self_attn.gate_proj.biases": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.gate_proj.scales": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.gate_proj.weight": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.k_norm.weight": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.k_proj.biases": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.k_proj.scales": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.k_proj.weight": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.o_proj.biases": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.o_proj.scales": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.o_proj.weight": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.q_norm.weight": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.q_proj.biases": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.q_proj.scales": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.q_proj.weight": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.v_proj.biases": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.v_proj.scales": "model-00042-of-00081.safetensors",
+ "model.layers.34.self_attn.v_proj.weight": "model-00042-of-00081.safetensors",
+ "model.layers.35.input_layernorm.weight": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.expert_bias": "model-00044-of-00081.safetensors",
+ "model.layers.35.mlp.experts.down_proj.biases": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.experts.down_proj.scales": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.experts.down_proj.weight": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.experts.gate_proj.biases": "model-00044-of-00081.safetensors",
+ "model.layers.35.mlp.experts.gate_proj.scales": "model-00044-of-00081.safetensors",
+ "model.layers.35.mlp.experts.gate_proj.weight": "model-00044-of-00081.safetensors",
+ "model.layers.35.mlp.experts.up_proj.biases": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.experts.up_proj.scales": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.experts.up_proj.weight": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.router.gate.biases": "model-00044-of-00081.safetensors",
+ "model.layers.35.mlp.router.gate.scales": "model-00044-of-00081.safetensors",
+ "model.layers.35.mlp.router.gate.weight": "model-00044-of-00081.safetensors",
+ "model.layers.35.mlp.shared_experts.down_proj.biases": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.shared_experts.down_proj.scales": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.shared_experts.down_proj.weight": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.shared_experts.gate_proj.biases": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.shared_experts.gate_proj.scales": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.shared_experts.gate_proj.weight": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.shared_experts.up_proj.biases": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.shared_experts.up_proj.scales": "model-00045-of-00081.safetensors",
+ "model.layers.35.mlp.shared_experts.up_proj.weight": "model-00045-of-00081.safetensors",
+ "model.layers.35.post_attention_layernorm.weight": "model-00045-of-00081.safetensors",
+ "model.layers.35.post_mlp_layernorm.weight": "model-00045-of-00081.safetensors",
+ "model.layers.35.pre_mlp_layernorm.weight": "model-00045-of-00081.safetensors",
+ "model.layers.35.self_attn.gate_proj.biases": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.gate_proj.scales": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.gate_proj.weight": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.k_norm.weight": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.k_proj.biases": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.k_proj.scales": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.k_proj.weight": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.o_proj.biases": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.o_proj.scales": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.o_proj.weight": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.q_norm.weight": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.q_proj.biases": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.q_proj.scales": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.q_proj.weight": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.v_proj.biases": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.v_proj.scales": "model-00044-of-00081.safetensors",
+ "model.layers.35.self_attn.v_proj.weight": "model-00044-of-00081.safetensors",
+ "model.layers.36.input_layernorm.weight": "model-00047-of-00081.safetensors",
+ "model.layers.36.mlp.expert_bias": "model-00045-of-00081.safetensors",
+ "model.layers.36.mlp.experts.down_proj.biases": "model-00047-of-00081.safetensors",
+ "model.layers.36.mlp.experts.down_proj.scales": "model-00047-of-00081.safetensors",
+ "model.layers.36.mlp.experts.down_proj.weight": "model-00047-of-00081.safetensors",
+ "model.layers.36.mlp.experts.gate_proj.biases": "model-00046-of-00081.safetensors",
+ "model.layers.36.mlp.experts.gate_proj.scales": "model-00046-of-00081.safetensors",
+ "model.layers.36.mlp.experts.gate_proj.weight": "model-00046-of-00081.safetensors",
+ "model.layers.36.mlp.experts.up_proj.biases": "model-00046-of-00081.safetensors",
+ "model.layers.36.mlp.experts.up_proj.scales": "model-00046-of-00081.safetensors",
+ "model.layers.36.mlp.experts.up_proj.weight": "model-00046-of-00081.safetensors",
+ "model.layers.36.mlp.router.gate.biases": "model-00045-of-00081.safetensors",
+ "model.layers.36.mlp.router.gate.scales": "model-00045-of-00081.safetensors",
+ "model.layers.36.mlp.router.gate.weight": "model-00045-of-00081.safetensors",
+ "model.layers.36.mlp.shared_experts.down_proj.biases": "model-00047-of-00081.safetensors",
+ "model.layers.36.mlp.shared_experts.down_proj.scales": "model-00047-of-00081.safetensors",
+ "model.layers.36.mlp.shared_experts.down_proj.weight": "model-00047-of-00081.safetensors",
+ "model.layers.36.mlp.shared_experts.gate_proj.biases": "model-00047-of-00081.safetensors",
+ "model.layers.36.mlp.shared_experts.gate_proj.scales": "model-00047-of-00081.safetensors",
+ "model.layers.36.mlp.shared_experts.gate_proj.weight": "model-00047-of-00081.safetensors",
+ "model.layers.36.mlp.shared_experts.up_proj.biases": "model-00047-of-00081.safetensors",
+ "model.layers.36.mlp.shared_experts.up_proj.scales": "model-00047-of-00081.safetensors",
+ "model.layers.36.mlp.shared_experts.up_proj.weight": "model-00047-of-00081.safetensors",
+ "model.layers.36.post_attention_layernorm.weight": "model-00047-of-00081.safetensors",
+ "model.layers.36.post_mlp_layernorm.weight": "model-00047-of-00081.safetensors",
+ "model.layers.36.pre_mlp_layernorm.weight": "model-00047-of-00081.safetensors",
+ "model.layers.36.self_attn.gate_proj.biases": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.gate_proj.scales": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.gate_proj.weight": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.k_norm.weight": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.k_proj.biases": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.k_proj.scales": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.k_proj.weight": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.o_proj.biases": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.o_proj.scales": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.o_proj.weight": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.q_norm.weight": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.q_proj.biases": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.q_proj.scales": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.q_proj.weight": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.v_proj.biases": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.v_proj.scales": "model-00045-of-00081.safetensors",
+ "model.layers.36.self_attn.v_proj.weight": "model-00045-of-00081.safetensors",
+ "model.layers.37.input_layernorm.weight": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.expert_bias": "model-00047-of-00081.safetensors",
+ "model.layers.37.mlp.experts.down_proj.biases": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.experts.down_proj.scales": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.experts.down_proj.weight": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.experts.gate_proj.biases": "model-00047-of-00081.safetensors",
+ "model.layers.37.mlp.experts.gate_proj.scales": "model-00047-of-00081.safetensors",
+ "model.layers.37.mlp.experts.gate_proj.weight": "model-00047-of-00081.safetensors",
+ "model.layers.37.mlp.experts.up_proj.biases": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.experts.up_proj.scales": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.experts.up_proj.weight": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.router.gate.biases": "model-00047-of-00081.safetensors",
+ "model.layers.37.mlp.router.gate.scales": "model-00047-of-00081.safetensors",
+ "model.layers.37.mlp.router.gate.weight": "model-00047-of-00081.safetensors",
+ "model.layers.37.mlp.shared_experts.down_proj.biases": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.shared_experts.down_proj.scales": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.shared_experts.down_proj.weight": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.shared_experts.gate_proj.biases": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.shared_experts.gate_proj.scales": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.shared_experts.gate_proj.weight": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.shared_experts.up_proj.biases": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.shared_experts.up_proj.scales": "model-00048-of-00081.safetensors",
+ "model.layers.37.mlp.shared_experts.up_proj.weight": "model-00048-of-00081.safetensors",
+ "model.layers.37.post_attention_layernorm.weight": "model-00048-of-00081.safetensors",
+ "model.layers.37.post_mlp_layernorm.weight": "model-00048-of-00081.safetensors",
+ "model.layers.37.pre_mlp_layernorm.weight": "model-00048-of-00081.safetensors",
+ "model.layers.37.self_attn.gate_proj.biases": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.gate_proj.scales": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.gate_proj.weight": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.k_norm.weight": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.k_proj.biases": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.k_proj.scales": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.k_proj.weight": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.o_proj.biases": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.o_proj.scales": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.o_proj.weight": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.q_norm.weight": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.q_proj.biases": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.q_proj.scales": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.q_proj.weight": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.v_proj.biases": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.v_proj.scales": "model-00047-of-00081.safetensors",
+ "model.layers.37.self_attn.v_proj.weight": "model-00047-of-00081.safetensors",
+ "model.layers.38.input_layernorm.weight": "model-00050-of-00081.safetensors",
+ "model.layers.38.mlp.expert_bias": "model-00048-of-00081.safetensors",
+ "model.layers.38.mlp.experts.down_proj.biases": "model-00050-of-00081.safetensors",
+ "model.layers.38.mlp.experts.down_proj.scales": "model-00050-of-00081.safetensors",
+ "model.layers.38.mlp.experts.down_proj.weight": "model-00050-of-00081.safetensors",
+ "model.layers.38.mlp.experts.gate_proj.biases": "model-00049-of-00081.safetensors",
+ "model.layers.38.mlp.experts.gate_proj.scales": "model-00049-of-00081.safetensors",
+ "model.layers.38.mlp.experts.gate_proj.weight": "model-00049-of-00081.safetensors",
+ "model.layers.38.mlp.experts.up_proj.biases": "model-00049-of-00081.safetensors",
+ "model.layers.38.mlp.experts.up_proj.scales": "model-00049-of-00081.safetensors",
+ "model.layers.38.mlp.experts.up_proj.weight": "model-00049-of-00081.safetensors",
+ "model.layers.38.mlp.router.gate.biases": "model-00048-of-00081.safetensors",
+ "model.layers.38.mlp.router.gate.scales": "model-00048-of-00081.safetensors",
+ "model.layers.38.mlp.router.gate.weight": "model-00048-of-00081.safetensors",
+ "model.layers.38.mlp.shared_experts.down_proj.biases": "model-00050-of-00081.safetensors",
+ "model.layers.38.mlp.shared_experts.down_proj.scales": "model-00050-of-00081.safetensors",
+ "model.layers.38.mlp.shared_experts.down_proj.weight": "model-00050-of-00081.safetensors",
+ "model.layers.38.mlp.shared_experts.gate_proj.biases": "model-00050-of-00081.safetensors",
+ "model.layers.38.mlp.shared_experts.gate_proj.scales": "model-00050-of-00081.safetensors",
+ "model.layers.38.mlp.shared_experts.gate_proj.weight": "model-00050-of-00081.safetensors",
+ "model.layers.38.mlp.shared_experts.up_proj.biases": "model-00050-of-00081.safetensors",
+ "model.layers.38.mlp.shared_experts.up_proj.scales": "model-00050-of-00081.safetensors",
+ "model.layers.38.mlp.shared_experts.up_proj.weight": "model-00050-of-00081.safetensors",
+ "model.layers.38.post_attention_layernorm.weight": "model-00050-of-00081.safetensors",
+ "model.layers.38.post_mlp_layernorm.weight": "model-00050-of-00081.safetensors",
+ "model.layers.38.pre_mlp_layernorm.weight": "model-00050-of-00081.safetensors",
+ "model.layers.38.self_attn.gate_proj.biases": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.gate_proj.scales": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.gate_proj.weight": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.k_norm.weight": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.k_proj.biases": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.k_proj.scales": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.k_proj.weight": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.o_proj.biases": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.o_proj.scales": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.o_proj.weight": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.q_norm.weight": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.q_proj.biases": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.q_proj.scales": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.q_proj.weight": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.v_proj.biases": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.v_proj.scales": "model-00048-of-00081.safetensors",
+ "model.layers.38.self_attn.v_proj.weight": "model-00048-of-00081.safetensors",
+ "model.layers.39.input_layernorm.weight": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.expert_bias": "model-00050-of-00081.safetensors",
+ "model.layers.39.mlp.experts.down_proj.biases": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.experts.down_proj.scales": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.experts.down_proj.weight": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.experts.gate_proj.biases": "model-00050-of-00081.safetensors",
+ "model.layers.39.mlp.experts.gate_proj.scales": "model-00050-of-00081.safetensors",
+ "model.layers.39.mlp.experts.gate_proj.weight": "model-00050-of-00081.safetensors",
+ "model.layers.39.mlp.experts.up_proj.biases": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.experts.up_proj.scales": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.experts.up_proj.weight": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.router.gate.biases": "model-00050-of-00081.safetensors",
+ "model.layers.39.mlp.router.gate.scales": "model-00050-of-00081.safetensors",
+ "model.layers.39.mlp.router.gate.weight": "model-00050-of-00081.safetensors",
+ "model.layers.39.mlp.shared_experts.down_proj.biases": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.shared_experts.down_proj.scales": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.shared_experts.down_proj.weight": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.shared_experts.gate_proj.biases": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.shared_experts.gate_proj.scales": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.shared_experts.gate_proj.weight": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.shared_experts.up_proj.biases": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.shared_experts.up_proj.scales": "model-00051-of-00081.safetensors",
+ "model.layers.39.mlp.shared_experts.up_proj.weight": "model-00051-of-00081.safetensors",
+ "model.layers.39.post_attention_layernorm.weight": "model-00051-of-00081.safetensors",
+ "model.layers.39.post_mlp_layernorm.weight": "model-00051-of-00081.safetensors",
+ "model.layers.39.pre_mlp_layernorm.weight": "model-00051-of-00081.safetensors",
+ "model.layers.39.self_attn.gate_proj.biases": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.gate_proj.scales": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.gate_proj.weight": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.k_norm.weight": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.k_proj.biases": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.k_proj.scales": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.k_proj.weight": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.o_proj.biases": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.o_proj.scales": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.o_proj.weight": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.q_norm.weight": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.q_proj.biases": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.q_proj.scales": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.q_proj.weight": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.v_proj.biases": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.v_proj.scales": "model-00050-of-00081.safetensors",
+ "model.layers.39.self_attn.v_proj.weight": "model-00050-of-00081.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.mlp.down_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.4.mlp.down_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.mlp.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.4.mlp.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.mlp.up_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.4.mlp.up_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.post_mlp_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.pre_mlp_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.k_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.k_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.o_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.o_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.q_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.q_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.v_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.v_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.40.input_layernorm.weight": "model-00053-of-00081.safetensors",
+ "model.layers.40.mlp.expert_bias": "model-00051-of-00081.safetensors",
+ "model.layers.40.mlp.experts.down_proj.biases": "model-00053-of-00081.safetensors",
+ "model.layers.40.mlp.experts.down_proj.scales": "model-00053-of-00081.safetensors",
+ "model.layers.40.mlp.experts.down_proj.weight": "model-00053-of-00081.safetensors",
+ "model.layers.40.mlp.experts.gate_proj.biases": "model-00052-of-00081.safetensors",
+ "model.layers.40.mlp.experts.gate_proj.scales": "model-00052-of-00081.safetensors",
+ "model.layers.40.mlp.experts.gate_proj.weight": "model-00052-of-00081.safetensors",
+ "model.layers.40.mlp.experts.up_proj.biases": "model-00052-of-00081.safetensors",
+ "model.layers.40.mlp.experts.up_proj.scales": "model-00052-of-00081.safetensors",
+ "model.layers.40.mlp.experts.up_proj.weight": "model-00052-of-00081.safetensors",
+ "model.layers.40.mlp.router.gate.biases": "model-00051-of-00081.safetensors",
+ "model.layers.40.mlp.router.gate.scales": "model-00051-of-00081.safetensors",
+ "model.layers.40.mlp.router.gate.weight": "model-00051-of-00081.safetensors",
+ "model.layers.40.mlp.shared_experts.down_proj.biases": "model-00053-of-00081.safetensors",
+ "model.layers.40.mlp.shared_experts.down_proj.scales": "model-00053-of-00081.safetensors",
+ "model.layers.40.mlp.shared_experts.down_proj.weight": "model-00053-of-00081.safetensors",
+ "model.layers.40.mlp.shared_experts.gate_proj.biases": "model-00053-of-00081.safetensors",
+ "model.layers.40.mlp.shared_experts.gate_proj.scales": "model-00053-of-00081.safetensors",
+ "model.layers.40.mlp.shared_experts.gate_proj.weight": "model-00053-of-00081.safetensors",
+ "model.layers.40.mlp.shared_experts.up_proj.biases": "model-00053-of-00081.safetensors",
+ "model.layers.40.mlp.shared_experts.up_proj.scales": "model-00053-of-00081.safetensors",
+ "model.layers.40.mlp.shared_experts.up_proj.weight": "model-00053-of-00081.safetensors",
+ "model.layers.40.post_attention_layernorm.weight": "model-00053-of-00081.safetensors",
+ "model.layers.40.post_mlp_layernorm.weight": "model-00053-of-00081.safetensors",
+ "model.layers.40.pre_mlp_layernorm.weight": "model-00053-of-00081.safetensors",
+ "model.layers.40.self_attn.gate_proj.biases": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.gate_proj.scales": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.gate_proj.weight": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.k_norm.weight": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.k_proj.biases": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.k_proj.scales": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.k_proj.weight": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.o_proj.biases": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.o_proj.scales": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.o_proj.weight": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.q_norm.weight": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.q_proj.biases": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.q_proj.scales": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.q_proj.weight": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.v_proj.biases": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.v_proj.scales": "model-00051-of-00081.safetensors",
+ "model.layers.40.self_attn.v_proj.weight": "model-00051-of-00081.safetensors",
+ "model.layers.41.input_layernorm.weight": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.expert_bias": "model-00053-of-00081.safetensors",
+ "model.layers.41.mlp.experts.down_proj.biases": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.experts.down_proj.scales": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.experts.down_proj.weight": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.experts.gate_proj.biases": "model-00053-of-00081.safetensors",
+ "model.layers.41.mlp.experts.gate_proj.scales": "model-00053-of-00081.safetensors",
+ "model.layers.41.mlp.experts.gate_proj.weight": "model-00053-of-00081.safetensors",
+ "model.layers.41.mlp.experts.up_proj.biases": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.experts.up_proj.scales": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.experts.up_proj.weight": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.router.gate.biases": "model-00053-of-00081.safetensors",
+ "model.layers.41.mlp.router.gate.scales": "model-00053-of-00081.safetensors",
+ "model.layers.41.mlp.router.gate.weight": "model-00053-of-00081.safetensors",
+ "model.layers.41.mlp.shared_experts.down_proj.biases": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.shared_experts.down_proj.scales": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.shared_experts.down_proj.weight": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.shared_experts.gate_proj.biases": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.shared_experts.gate_proj.scales": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.shared_experts.gate_proj.weight": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.shared_experts.up_proj.biases": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.shared_experts.up_proj.scales": "model-00054-of-00081.safetensors",
+ "model.layers.41.mlp.shared_experts.up_proj.weight": "model-00054-of-00081.safetensors",
+ "model.layers.41.post_attention_layernorm.weight": "model-00054-of-00081.safetensors",
+ "model.layers.41.post_mlp_layernorm.weight": "model-00054-of-00081.safetensors",
+ "model.layers.41.pre_mlp_layernorm.weight": "model-00054-of-00081.safetensors",
+ "model.layers.41.self_attn.gate_proj.biases": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.gate_proj.scales": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.gate_proj.weight": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.k_norm.weight": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.k_proj.biases": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.k_proj.scales": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.k_proj.weight": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.o_proj.biases": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.o_proj.scales": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.o_proj.weight": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.q_norm.weight": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.q_proj.biases": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.q_proj.scales": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.q_proj.weight": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.v_proj.biases": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.v_proj.scales": "model-00053-of-00081.safetensors",
+ "model.layers.41.self_attn.v_proj.weight": "model-00053-of-00081.safetensors",
+ "model.layers.42.input_layernorm.weight": "model-00056-of-00081.safetensors",
+ "model.layers.42.mlp.expert_bias": "model-00054-of-00081.safetensors",
+ "model.layers.42.mlp.experts.down_proj.biases": "model-00056-of-00081.safetensors",
+ "model.layers.42.mlp.experts.down_proj.scales": "model-00056-of-00081.safetensors",
+ "model.layers.42.mlp.experts.down_proj.weight": "model-00056-of-00081.safetensors",
+ "model.layers.42.mlp.experts.gate_proj.biases": "model-00055-of-00081.safetensors",
+ "model.layers.42.mlp.experts.gate_proj.scales": "model-00055-of-00081.safetensors",
+ "model.layers.42.mlp.experts.gate_proj.weight": "model-00055-of-00081.safetensors",
+ "model.layers.42.mlp.experts.up_proj.biases": "model-00055-of-00081.safetensors",
+ "model.layers.42.mlp.experts.up_proj.scales": "model-00055-of-00081.safetensors",
+ "model.layers.42.mlp.experts.up_proj.weight": "model-00055-of-00081.safetensors",
+ "model.layers.42.mlp.router.gate.biases": "model-00054-of-00081.safetensors",
+ "model.layers.42.mlp.router.gate.scales": "model-00054-of-00081.safetensors",
+ "model.layers.42.mlp.router.gate.weight": "model-00054-of-00081.safetensors",
+ "model.layers.42.mlp.shared_experts.down_proj.biases": "model-00056-of-00081.safetensors",
+ "model.layers.42.mlp.shared_experts.down_proj.scales": "model-00056-of-00081.safetensors",
+ "model.layers.42.mlp.shared_experts.down_proj.weight": "model-00056-of-00081.safetensors",
+ "model.layers.42.mlp.shared_experts.gate_proj.biases": "model-00056-of-00081.safetensors",
+ "model.layers.42.mlp.shared_experts.gate_proj.scales": "model-00056-of-00081.safetensors",
+ "model.layers.42.mlp.shared_experts.gate_proj.weight": "model-00056-of-00081.safetensors",
+ "model.layers.42.mlp.shared_experts.up_proj.biases": "model-00056-of-00081.safetensors",
+ "model.layers.42.mlp.shared_experts.up_proj.scales": "model-00056-of-00081.safetensors",
+ "model.layers.42.mlp.shared_experts.up_proj.weight": "model-00056-of-00081.safetensors",
+ "model.layers.42.post_attention_layernorm.weight": "model-00056-of-00081.safetensors",
+ "model.layers.42.post_mlp_layernorm.weight": "model-00056-of-00081.safetensors",
+ "model.layers.42.pre_mlp_layernorm.weight": "model-00056-of-00081.safetensors",
+ "model.layers.42.self_attn.gate_proj.biases": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.gate_proj.scales": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.gate_proj.weight": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.k_norm.weight": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.k_proj.biases": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.k_proj.scales": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.k_proj.weight": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.o_proj.biases": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.o_proj.scales": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.o_proj.weight": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.q_norm.weight": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.q_proj.biases": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.q_proj.scales": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.q_proj.weight": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.v_proj.biases": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.v_proj.scales": "model-00054-of-00081.safetensors",
+ "model.layers.42.self_attn.v_proj.weight": "model-00054-of-00081.safetensors",
+ "model.layers.43.input_layernorm.weight": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.expert_bias": "model-00056-of-00081.safetensors",
+ "model.layers.43.mlp.experts.down_proj.biases": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.experts.down_proj.scales": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.experts.down_proj.weight": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.experts.gate_proj.biases": "model-00056-of-00081.safetensors",
+ "model.layers.43.mlp.experts.gate_proj.scales": "model-00056-of-00081.safetensors",
+ "model.layers.43.mlp.experts.gate_proj.weight": "model-00056-of-00081.safetensors",
+ "model.layers.43.mlp.experts.up_proj.biases": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.experts.up_proj.scales": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.experts.up_proj.weight": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.router.gate.biases": "model-00056-of-00081.safetensors",
+ "model.layers.43.mlp.router.gate.scales": "model-00056-of-00081.safetensors",
+ "model.layers.43.mlp.router.gate.weight": "model-00056-of-00081.safetensors",
+ "model.layers.43.mlp.shared_experts.down_proj.biases": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.shared_experts.down_proj.scales": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.shared_experts.down_proj.weight": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.shared_experts.gate_proj.biases": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.shared_experts.gate_proj.scales": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.shared_experts.gate_proj.weight": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.shared_experts.up_proj.biases": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.shared_experts.up_proj.scales": "model-00057-of-00081.safetensors",
+ "model.layers.43.mlp.shared_experts.up_proj.weight": "model-00057-of-00081.safetensors",
+ "model.layers.43.post_attention_layernorm.weight": "model-00057-of-00081.safetensors",
+ "model.layers.43.post_mlp_layernorm.weight": "model-00057-of-00081.safetensors",
+ "model.layers.43.pre_mlp_layernorm.weight": "model-00057-of-00081.safetensors",
+ "model.layers.43.self_attn.gate_proj.biases": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.gate_proj.scales": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.gate_proj.weight": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.k_norm.weight": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.k_proj.biases": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.k_proj.scales": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.k_proj.weight": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.o_proj.biases": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.o_proj.scales": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.o_proj.weight": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.q_norm.weight": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.q_proj.biases": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.q_proj.scales": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.q_proj.weight": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.v_proj.biases": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.v_proj.scales": "model-00056-of-00081.safetensors",
+ "model.layers.43.self_attn.v_proj.weight": "model-00056-of-00081.safetensors",
+ "model.layers.44.input_layernorm.weight": "model-00059-of-00081.safetensors",
+ "model.layers.44.mlp.expert_bias": "model-00057-of-00081.safetensors",
+ "model.layers.44.mlp.experts.down_proj.biases": "model-00059-of-00081.safetensors",
+ "model.layers.44.mlp.experts.down_proj.scales": "model-00059-of-00081.safetensors",
+ "model.layers.44.mlp.experts.down_proj.weight": "model-00059-of-00081.safetensors",
+ "model.layers.44.mlp.experts.gate_proj.biases": "model-00058-of-00081.safetensors",
+ "model.layers.44.mlp.experts.gate_proj.scales": "model-00058-of-00081.safetensors",
+ "model.layers.44.mlp.experts.gate_proj.weight": "model-00058-of-00081.safetensors",
+ "model.layers.44.mlp.experts.up_proj.biases": "model-00058-of-00081.safetensors",
+ "model.layers.44.mlp.experts.up_proj.scales": "model-00058-of-00081.safetensors",
+ "model.layers.44.mlp.experts.up_proj.weight": "model-00058-of-00081.safetensors",
+ "model.layers.44.mlp.router.gate.biases": "model-00057-of-00081.safetensors",
+ "model.layers.44.mlp.router.gate.scales": "model-00057-of-00081.safetensors",
+ "model.layers.44.mlp.router.gate.weight": "model-00057-of-00081.safetensors",
+ "model.layers.44.mlp.shared_experts.down_proj.biases": "model-00059-of-00081.safetensors",
+ "model.layers.44.mlp.shared_experts.down_proj.scales": "model-00059-of-00081.safetensors",
+ "model.layers.44.mlp.shared_experts.down_proj.weight": "model-00059-of-00081.safetensors",
+ "model.layers.44.mlp.shared_experts.gate_proj.biases": "model-00059-of-00081.safetensors",
+ "model.layers.44.mlp.shared_experts.gate_proj.scales": "model-00059-of-00081.safetensors",
+ "model.layers.44.mlp.shared_experts.gate_proj.weight": "model-00059-of-00081.safetensors",
+ "model.layers.44.mlp.shared_experts.up_proj.biases": "model-00059-of-00081.safetensors",
+ "model.layers.44.mlp.shared_experts.up_proj.scales": "model-00059-of-00081.safetensors",
+ "model.layers.44.mlp.shared_experts.up_proj.weight": "model-00059-of-00081.safetensors",
+ "model.layers.44.post_attention_layernorm.weight": "model-00059-of-00081.safetensors",
+ "model.layers.44.post_mlp_layernorm.weight": "model-00059-of-00081.safetensors",
+ "model.layers.44.pre_mlp_layernorm.weight": "model-00059-of-00081.safetensors",
+ "model.layers.44.self_attn.gate_proj.biases": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.gate_proj.scales": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.gate_proj.weight": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.k_norm.weight": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.k_proj.biases": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.k_proj.scales": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.k_proj.weight": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.o_proj.biases": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.o_proj.scales": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.o_proj.weight": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.q_norm.weight": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.q_proj.biases": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.q_proj.scales": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.q_proj.weight": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.v_proj.biases": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.v_proj.scales": "model-00057-of-00081.safetensors",
+ "model.layers.44.self_attn.v_proj.weight": "model-00057-of-00081.safetensors",
+ "model.layers.45.input_layernorm.weight": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.expert_bias": "model-00059-of-00081.safetensors",
+ "model.layers.45.mlp.experts.down_proj.biases": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.experts.down_proj.scales": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.experts.down_proj.weight": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.experts.gate_proj.biases": "model-00059-of-00081.safetensors",
+ "model.layers.45.mlp.experts.gate_proj.scales": "model-00059-of-00081.safetensors",
+ "model.layers.45.mlp.experts.gate_proj.weight": "model-00059-of-00081.safetensors",
+ "model.layers.45.mlp.experts.up_proj.biases": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.experts.up_proj.scales": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.experts.up_proj.weight": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.router.gate.biases": "model-00059-of-00081.safetensors",
+ "model.layers.45.mlp.router.gate.scales": "model-00059-of-00081.safetensors",
+ "model.layers.45.mlp.router.gate.weight": "model-00059-of-00081.safetensors",
+ "model.layers.45.mlp.shared_experts.down_proj.biases": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.shared_experts.down_proj.scales": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.shared_experts.down_proj.weight": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.shared_experts.gate_proj.biases": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.shared_experts.gate_proj.scales": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.shared_experts.gate_proj.weight": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.shared_experts.up_proj.biases": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.shared_experts.up_proj.scales": "model-00060-of-00081.safetensors",
+ "model.layers.45.mlp.shared_experts.up_proj.weight": "model-00060-of-00081.safetensors",
+ "model.layers.45.post_attention_layernorm.weight": "model-00060-of-00081.safetensors",
+ "model.layers.45.post_mlp_layernorm.weight": "model-00060-of-00081.safetensors",
+ "model.layers.45.pre_mlp_layernorm.weight": "model-00060-of-00081.safetensors",
+ "model.layers.45.self_attn.gate_proj.biases": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.gate_proj.scales": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.gate_proj.weight": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.k_norm.weight": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.k_proj.biases": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.k_proj.scales": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.k_proj.weight": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.o_proj.biases": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.o_proj.scales": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.o_proj.weight": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.q_norm.weight": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.q_proj.biases": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.q_proj.scales": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.q_proj.weight": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.v_proj.biases": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.v_proj.scales": "model-00059-of-00081.safetensors",
+ "model.layers.45.self_attn.v_proj.weight": "model-00059-of-00081.safetensors",
+ "model.layers.46.input_layernorm.weight": "model-00062-of-00081.safetensors",
+ "model.layers.46.mlp.expert_bias": "model-00060-of-00081.safetensors",
+ "model.layers.46.mlp.experts.down_proj.biases": "model-00062-of-00081.safetensors",
+ "model.layers.46.mlp.experts.down_proj.scales": "model-00062-of-00081.safetensors",
+ "model.layers.46.mlp.experts.down_proj.weight": "model-00062-of-00081.safetensors",
+ "model.layers.46.mlp.experts.gate_proj.biases": "model-00061-of-00081.safetensors",
+ "model.layers.46.mlp.experts.gate_proj.scales": "model-00061-of-00081.safetensors",
+ "model.layers.46.mlp.experts.gate_proj.weight": "model-00061-of-00081.safetensors",
+ "model.layers.46.mlp.experts.up_proj.biases": "model-00061-of-00081.safetensors",
+ "model.layers.46.mlp.experts.up_proj.scales": "model-00061-of-00081.safetensors",
+ "model.layers.46.mlp.experts.up_proj.weight": "model-00061-of-00081.safetensors",
+ "model.layers.46.mlp.router.gate.biases": "model-00060-of-00081.safetensors",
+ "model.layers.46.mlp.router.gate.scales": "model-00060-of-00081.safetensors",
+ "model.layers.46.mlp.router.gate.weight": "model-00060-of-00081.safetensors",
+ "model.layers.46.mlp.shared_experts.down_proj.biases": "model-00062-of-00081.safetensors",
+ "model.layers.46.mlp.shared_experts.down_proj.scales": "model-00062-of-00081.safetensors",
+ "model.layers.46.mlp.shared_experts.down_proj.weight": "model-00062-of-00081.safetensors",
+ "model.layers.46.mlp.shared_experts.gate_proj.biases": "model-00062-of-00081.safetensors",
+ "model.layers.46.mlp.shared_experts.gate_proj.scales": "model-00062-of-00081.safetensors",
+ "model.layers.46.mlp.shared_experts.gate_proj.weight": "model-00062-of-00081.safetensors",
+ "model.layers.46.mlp.shared_experts.up_proj.biases": "model-00062-of-00081.safetensors",
+ "model.layers.46.mlp.shared_experts.up_proj.scales": "model-00062-of-00081.safetensors",
+ "model.layers.46.mlp.shared_experts.up_proj.weight": "model-00062-of-00081.safetensors",
+ "model.layers.46.post_attention_layernorm.weight": "model-00062-of-00081.safetensors",
+ "model.layers.46.post_mlp_layernorm.weight": "model-00062-of-00081.safetensors",
+ "model.layers.46.pre_mlp_layernorm.weight": "model-00062-of-00081.safetensors",
+ "model.layers.46.self_attn.gate_proj.biases": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.gate_proj.scales": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.gate_proj.weight": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.k_norm.weight": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.k_proj.biases": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.k_proj.scales": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.k_proj.weight": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.o_proj.biases": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.o_proj.scales": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.o_proj.weight": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.q_norm.weight": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.q_proj.biases": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.q_proj.scales": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.q_proj.weight": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.v_proj.biases": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.v_proj.scales": "model-00060-of-00081.safetensors",
+ "model.layers.46.self_attn.v_proj.weight": "model-00060-of-00081.safetensors",
+ "model.layers.47.input_layernorm.weight": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.expert_bias": "model-00062-of-00081.safetensors",
+ "model.layers.47.mlp.experts.down_proj.biases": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.experts.down_proj.scales": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.experts.down_proj.weight": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.experts.gate_proj.biases": "model-00062-of-00081.safetensors",
+ "model.layers.47.mlp.experts.gate_proj.scales": "model-00062-of-00081.safetensors",
+ "model.layers.47.mlp.experts.gate_proj.weight": "model-00062-of-00081.safetensors",
+ "model.layers.47.mlp.experts.up_proj.biases": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.experts.up_proj.scales": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.experts.up_proj.weight": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.router.gate.biases": "model-00062-of-00081.safetensors",
+ "model.layers.47.mlp.router.gate.scales": "model-00062-of-00081.safetensors",
+ "model.layers.47.mlp.router.gate.weight": "model-00062-of-00081.safetensors",
+ "model.layers.47.mlp.shared_experts.down_proj.biases": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.shared_experts.down_proj.scales": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.shared_experts.down_proj.weight": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.shared_experts.gate_proj.biases": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.shared_experts.gate_proj.scales": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.shared_experts.gate_proj.weight": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.shared_experts.up_proj.biases": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.shared_experts.up_proj.scales": "model-00063-of-00081.safetensors",
+ "model.layers.47.mlp.shared_experts.up_proj.weight": "model-00063-of-00081.safetensors",
+ "model.layers.47.post_attention_layernorm.weight": "model-00063-of-00081.safetensors",
+ "model.layers.47.post_mlp_layernorm.weight": "model-00063-of-00081.safetensors",
+ "model.layers.47.pre_mlp_layernorm.weight": "model-00063-of-00081.safetensors",
+ "model.layers.47.self_attn.gate_proj.biases": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.gate_proj.scales": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.gate_proj.weight": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.k_norm.weight": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.k_proj.biases": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.k_proj.scales": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.k_proj.weight": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.o_proj.biases": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.o_proj.scales": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.o_proj.weight": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.q_norm.weight": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.q_proj.biases": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.q_proj.scales": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.q_proj.weight": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.v_proj.biases": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.v_proj.scales": "model-00062-of-00081.safetensors",
+ "model.layers.47.self_attn.v_proj.weight": "model-00062-of-00081.safetensors",
+ "model.layers.48.input_layernorm.weight": "model-00065-of-00081.safetensors",
+ "model.layers.48.mlp.expert_bias": "model-00063-of-00081.safetensors",
+ "model.layers.48.mlp.experts.down_proj.biases": "model-00065-of-00081.safetensors",
+ "model.layers.48.mlp.experts.down_proj.scales": "model-00065-of-00081.safetensors",
+ "model.layers.48.mlp.experts.down_proj.weight": "model-00065-of-00081.safetensors",
+ "model.layers.48.mlp.experts.gate_proj.biases": "model-00064-of-00081.safetensors",
+ "model.layers.48.mlp.experts.gate_proj.scales": "model-00064-of-00081.safetensors",
+ "model.layers.48.mlp.experts.gate_proj.weight": "model-00064-of-00081.safetensors",
+ "model.layers.48.mlp.experts.up_proj.biases": "model-00064-of-00081.safetensors",
+ "model.layers.48.mlp.experts.up_proj.scales": "model-00064-of-00081.safetensors",
+ "model.layers.48.mlp.experts.up_proj.weight": "model-00064-of-00081.safetensors",
+ "model.layers.48.mlp.router.gate.biases": "model-00063-of-00081.safetensors",
+ "model.layers.48.mlp.router.gate.scales": "model-00063-of-00081.safetensors",
+ "model.layers.48.mlp.router.gate.weight": "model-00063-of-00081.safetensors",
+ "model.layers.48.mlp.shared_experts.down_proj.biases": "model-00065-of-00081.safetensors",
+ "model.layers.48.mlp.shared_experts.down_proj.scales": "model-00065-of-00081.safetensors",
+ "model.layers.48.mlp.shared_experts.down_proj.weight": "model-00065-of-00081.safetensors",
+ "model.layers.48.mlp.shared_experts.gate_proj.biases": "model-00065-of-00081.safetensors",
+ "model.layers.48.mlp.shared_experts.gate_proj.scales": "model-00065-of-00081.safetensors",
+ "model.layers.48.mlp.shared_experts.gate_proj.weight": "model-00065-of-00081.safetensors",
+ "model.layers.48.mlp.shared_experts.up_proj.biases": "model-00065-of-00081.safetensors",
+ "model.layers.48.mlp.shared_experts.up_proj.scales": "model-00065-of-00081.safetensors",
+ "model.layers.48.mlp.shared_experts.up_proj.weight": "model-00065-of-00081.safetensors",
+ "model.layers.48.post_attention_layernorm.weight": "model-00065-of-00081.safetensors",
+ "model.layers.48.post_mlp_layernorm.weight": "model-00065-of-00081.safetensors",
+ "model.layers.48.pre_mlp_layernorm.weight": "model-00065-of-00081.safetensors",
+ "model.layers.48.self_attn.gate_proj.biases": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.gate_proj.scales": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.gate_proj.weight": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.k_norm.weight": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.k_proj.biases": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.k_proj.scales": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.k_proj.weight": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.o_proj.biases": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.o_proj.scales": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.o_proj.weight": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.q_norm.weight": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.q_proj.biases": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.q_proj.scales": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.q_proj.weight": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.v_proj.biases": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.v_proj.scales": "model-00063-of-00081.safetensors",
+ "model.layers.48.self_attn.v_proj.weight": "model-00063-of-00081.safetensors",
+ "model.layers.49.input_layernorm.weight": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.expert_bias": "model-00065-of-00081.safetensors",
+ "model.layers.49.mlp.experts.down_proj.biases": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.experts.down_proj.scales": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.experts.down_proj.weight": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.experts.gate_proj.biases": "model-00065-of-00081.safetensors",
+ "model.layers.49.mlp.experts.gate_proj.scales": "model-00065-of-00081.safetensors",
+ "model.layers.49.mlp.experts.gate_proj.weight": "model-00065-of-00081.safetensors",
+ "model.layers.49.mlp.experts.up_proj.biases": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.experts.up_proj.scales": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.experts.up_proj.weight": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.router.gate.biases": "model-00065-of-00081.safetensors",
+ "model.layers.49.mlp.router.gate.scales": "model-00065-of-00081.safetensors",
+ "model.layers.49.mlp.router.gate.weight": "model-00065-of-00081.safetensors",
+ "model.layers.49.mlp.shared_experts.down_proj.biases": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.shared_experts.down_proj.scales": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.shared_experts.down_proj.weight": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.shared_experts.gate_proj.biases": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.shared_experts.gate_proj.scales": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.shared_experts.gate_proj.weight": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.shared_experts.up_proj.biases": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.shared_experts.up_proj.scales": "model-00066-of-00081.safetensors",
+ "model.layers.49.mlp.shared_experts.up_proj.weight": "model-00066-of-00081.safetensors",
+ "model.layers.49.post_attention_layernorm.weight": "model-00066-of-00081.safetensors",
+ "model.layers.49.post_mlp_layernorm.weight": "model-00066-of-00081.safetensors",
+ "model.layers.49.pre_mlp_layernorm.weight": "model-00066-of-00081.safetensors",
+ "model.layers.49.self_attn.gate_proj.biases": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.gate_proj.scales": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.gate_proj.weight": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.k_norm.weight": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.k_proj.biases": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.k_proj.scales": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.k_proj.weight": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.o_proj.biases": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.o_proj.scales": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.o_proj.weight": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.q_norm.weight": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.q_proj.biases": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.q_proj.scales": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.q_proj.weight": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.v_proj.biases": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.v_proj.scales": "model-00065-of-00081.safetensors",
+ "model.layers.49.self_attn.v_proj.weight": "model-00065-of-00081.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.mlp.down_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.5.mlp.down_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.mlp.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.5.mlp.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.mlp.up_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.5.mlp.up_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.post_mlp_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.pre_mlp_layernorm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.k_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.k_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.o_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.o_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.q_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.q_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.v_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.v_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.50.input_layernorm.weight": "model-00068-of-00081.safetensors",
+ "model.layers.50.mlp.expert_bias": "model-00066-of-00081.safetensors",
+ "model.layers.50.mlp.experts.down_proj.biases": "model-00068-of-00081.safetensors",
+ "model.layers.50.mlp.experts.down_proj.scales": "model-00068-of-00081.safetensors",
+ "model.layers.50.mlp.experts.down_proj.weight": "model-00068-of-00081.safetensors",
+ "model.layers.50.mlp.experts.gate_proj.biases": "model-00067-of-00081.safetensors",
+ "model.layers.50.mlp.experts.gate_proj.scales": "model-00067-of-00081.safetensors",
+ "model.layers.50.mlp.experts.gate_proj.weight": "model-00067-of-00081.safetensors",
+ "model.layers.50.mlp.experts.up_proj.biases": "model-00067-of-00081.safetensors",
+ "model.layers.50.mlp.experts.up_proj.scales": "model-00067-of-00081.safetensors",
+ "model.layers.50.mlp.experts.up_proj.weight": "model-00067-of-00081.safetensors",
+ "model.layers.50.mlp.router.gate.biases": "model-00066-of-00081.safetensors",
+ "model.layers.50.mlp.router.gate.scales": "model-00066-of-00081.safetensors",
+ "model.layers.50.mlp.router.gate.weight": "model-00066-of-00081.safetensors",
+ "model.layers.50.mlp.shared_experts.down_proj.biases": "model-00068-of-00081.safetensors",
+ "model.layers.50.mlp.shared_experts.down_proj.scales": "model-00068-of-00081.safetensors",
+ "model.layers.50.mlp.shared_experts.down_proj.weight": "model-00068-of-00081.safetensors",
+ "model.layers.50.mlp.shared_experts.gate_proj.biases": "model-00068-of-00081.safetensors",
+ "model.layers.50.mlp.shared_experts.gate_proj.scales": "model-00068-of-00081.safetensors",
+ "model.layers.50.mlp.shared_experts.gate_proj.weight": "model-00068-of-00081.safetensors",
+ "model.layers.50.mlp.shared_experts.up_proj.biases": "model-00068-of-00081.safetensors",
+ "model.layers.50.mlp.shared_experts.up_proj.scales": "model-00068-of-00081.safetensors",
+ "model.layers.50.mlp.shared_experts.up_proj.weight": "model-00068-of-00081.safetensors",
+ "model.layers.50.post_attention_layernorm.weight": "model-00068-of-00081.safetensors",
+ "model.layers.50.post_mlp_layernorm.weight": "model-00068-of-00081.safetensors",
+ "model.layers.50.pre_mlp_layernorm.weight": "model-00068-of-00081.safetensors",
+ "model.layers.50.self_attn.gate_proj.biases": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.gate_proj.scales": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.gate_proj.weight": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.k_norm.weight": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.k_proj.biases": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.k_proj.scales": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.k_proj.weight": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.o_proj.biases": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.o_proj.scales": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.o_proj.weight": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.q_norm.weight": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.q_proj.biases": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.q_proj.scales": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.q_proj.weight": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.v_proj.biases": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.v_proj.scales": "model-00066-of-00081.safetensors",
+ "model.layers.50.self_attn.v_proj.weight": "model-00066-of-00081.safetensors",
+ "model.layers.51.input_layernorm.weight": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.expert_bias": "model-00068-of-00081.safetensors",
+ "model.layers.51.mlp.experts.down_proj.biases": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.experts.down_proj.scales": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.experts.down_proj.weight": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.experts.gate_proj.biases": "model-00068-of-00081.safetensors",
+ "model.layers.51.mlp.experts.gate_proj.scales": "model-00068-of-00081.safetensors",
+ "model.layers.51.mlp.experts.gate_proj.weight": "model-00068-of-00081.safetensors",
+ "model.layers.51.mlp.experts.up_proj.biases": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.experts.up_proj.scales": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.experts.up_proj.weight": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.router.gate.biases": "model-00068-of-00081.safetensors",
+ "model.layers.51.mlp.router.gate.scales": "model-00068-of-00081.safetensors",
+ "model.layers.51.mlp.router.gate.weight": "model-00068-of-00081.safetensors",
+ "model.layers.51.mlp.shared_experts.down_proj.biases": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.shared_experts.down_proj.scales": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.shared_experts.down_proj.weight": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.shared_experts.gate_proj.biases": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.shared_experts.gate_proj.scales": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.shared_experts.gate_proj.weight": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.shared_experts.up_proj.biases": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.shared_experts.up_proj.scales": "model-00069-of-00081.safetensors",
+ "model.layers.51.mlp.shared_experts.up_proj.weight": "model-00069-of-00081.safetensors",
+ "model.layers.51.post_attention_layernorm.weight": "model-00069-of-00081.safetensors",
+ "model.layers.51.post_mlp_layernorm.weight": "model-00069-of-00081.safetensors",
+ "model.layers.51.pre_mlp_layernorm.weight": "model-00069-of-00081.safetensors",
+ "model.layers.51.self_attn.gate_proj.biases": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.gate_proj.scales": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.gate_proj.weight": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.k_norm.weight": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.k_proj.biases": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.k_proj.scales": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.k_proj.weight": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.o_proj.biases": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.o_proj.scales": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.o_proj.weight": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.q_norm.weight": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.q_proj.biases": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.q_proj.scales": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.q_proj.weight": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.v_proj.biases": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.v_proj.scales": "model-00068-of-00081.safetensors",
+ "model.layers.51.self_attn.v_proj.weight": "model-00068-of-00081.safetensors",
+ "model.layers.52.input_layernorm.weight": "model-00071-of-00081.safetensors",
+ "model.layers.52.mlp.expert_bias": "model-00069-of-00081.safetensors",
+ "model.layers.52.mlp.experts.down_proj.biases": "model-00071-of-00081.safetensors",
+ "model.layers.52.mlp.experts.down_proj.scales": "model-00071-of-00081.safetensors",
+ "model.layers.52.mlp.experts.down_proj.weight": "model-00071-of-00081.safetensors",
+ "model.layers.52.mlp.experts.gate_proj.biases": "model-00070-of-00081.safetensors",
+ "model.layers.52.mlp.experts.gate_proj.scales": "model-00070-of-00081.safetensors",
+ "model.layers.52.mlp.experts.gate_proj.weight": "model-00070-of-00081.safetensors",
+ "model.layers.52.mlp.experts.up_proj.biases": "model-00070-of-00081.safetensors",
+ "model.layers.52.mlp.experts.up_proj.scales": "model-00070-of-00081.safetensors",
+ "model.layers.52.mlp.experts.up_proj.weight": "model-00070-of-00081.safetensors",
+ "model.layers.52.mlp.router.gate.biases": "model-00069-of-00081.safetensors",
+ "model.layers.52.mlp.router.gate.scales": "model-00069-of-00081.safetensors",
+ "model.layers.52.mlp.router.gate.weight": "model-00069-of-00081.safetensors",
+ "model.layers.52.mlp.shared_experts.down_proj.biases": "model-00071-of-00081.safetensors",
+ "model.layers.52.mlp.shared_experts.down_proj.scales": "model-00071-of-00081.safetensors",
+ "model.layers.52.mlp.shared_experts.down_proj.weight": "model-00071-of-00081.safetensors",
+ "model.layers.52.mlp.shared_experts.gate_proj.biases": "model-00071-of-00081.safetensors",
+ "model.layers.52.mlp.shared_experts.gate_proj.scales": "model-00071-of-00081.safetensors",
+ "model.layers.52.mlp.shared_experts.gate_proj.weight": "model-00071-of-00081.safetensors",
+ "model.layers.52.mlp.shared_experts.up_proj.biases": "model-00071-of-00081.safetensors",
+ "model.layers.52.mlp.shared_experts.up_proj.scales": "model-00071-of-00081.safetensors",
+ "model.layers.52.mlp.shared_experts.up_proj.weight": "model-00071-of-00081.safetensors",
+ "model.layers.52.post_attention_layernorm.weight": "model-00071-of-00081.safetensors",
+ "model.layers.52.post_mlp_layernorm.weight": "model-00071-of-00081.safetensors",
+ "model.layers.52.pre_mlp_layernorm.weight": "model-00071-of-00081.safetensors",
+ "model.layers.52.self_attn.gate_proj.biases": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.gate_proj.scales": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.gate_proj.weight": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.k_norm.weight": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.k_proj.biases": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.k_proj.scales": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.k_proj.weight": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.o_proj.biases": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.o_proj.scales": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.o_proj.weight": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.q_norm.weight": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.q_proj.biases": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.q_proj.scales": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.q_proj.weight": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.v_proj.biases": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.v_proj.scales": "model-00069-of-00081.safetensors",
+ "model.layers.52.self_attn.v_proj.weight": "model-00069-of-00081.safetensors",
+ "model.layers.53.input_layernorm.weight": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.expert_bias": "model-00071-of-00081.safetensors",
+ "model.layers.53.mlp.experts.down_proj.biases": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.experts.down_proj.scales": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.experts.down_proj.weight": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.experts.gate_proj.biases": "model-00071-of-00081.safetensors",
+ "model.layers.53.mlp.experts.gate_proj.scales": "model-00071-of-00081.safetensors",
+ "model.layers.53.mlp.experts.gate_proj.weight": "model-00071-of-00081.safetensors",
+ "model.layers.53.mlp.experts.up_proj.biases": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.experts.up_proj.scales": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.experts.up_proj.weight": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.router.gate.biases": "model-00071-of-00081.safetensors",
+ "model.layers.53.mlp.router.gate.scales": "model-00071-of-00081.safetensors",
+ "model.layers.53.mlp.router.gate.weight": "model-00071-of-00081.safetensors",
+ "model.layers.53.mlp.shared_experts.down_proj.biases": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.shared_experts.down_proj.scales": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.shared_experts.down_proj.weight": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.shared_experts.gate_proj.biases": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.shared_experts.gate_proj.scales": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.shared_experts.gate_proj.weight": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.shared_experts.up_proj.biases": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.shared_experts.up_proj.scales": "model-00072-of-00081.safetensors",
+ "model.layers.53.mlp.shared_experts.up_proj.weight": "model-00072-of-00081.safetensors",
+ "model.layers.53.post_attention_layernorm.weight": "model-00072-of-00081.safetensors",
+ "model.layers.53.post_mlp_layernorm.weight": "model-00072-of-00081.safetensors",
+ "model.layers.53.pre_mlp_layernorm.weight": "model-00072-of-00081.safetensors",
+ "model.layers.53.self_attn.gate_proj.biases": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.gate_proj.scales": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.gate_proj.weight": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.k_norm.weight": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.k_proj.biases": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.k_proj.scales": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.k_proj.weight": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.o_proj.biases": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.o_proj.scales": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.o_proj.weight": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.q_norm.weight": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.q_proj.biases": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.q_proj.scales": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.q_proj.weight": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.v_proj.biases": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.v_proj.scales": "model-00071-of-00081.safetensors",
+ "model.layers.53.self_attn.v_proj.weight": "model-00071-of-00081.safetensors",
+ "model.layers.54.input_layernorm.weight": "model-00074-of-00081.safetensors",
+ "model.layers.54.mlp.expert_bias": "model-00072-of-00081.safetensors",
+ "model.layers.54.mlp.experts.down_proj.biases": "model-00074-of-00081.safetensors",
+ "model.layers.54.mlp.experts.down_proj.scales": "model-00074-of-00081.safetensors",
+ "model.layers.54.mlp.experts.down_proj.weight": "model-00074-of-00081.safetensors",
+ "model.layers.54.mlp.experts.gate_proj.biases": "model-00073-of-00081.safetensors",
+ "model.layers.54.mlp.experts.gate_proj.scales": "model-00073-of-00081.safetensors",
+ "model.layers.54.mlp.experts.gate_proj.weight": "model-00073-of-00081.safetensors",
+ "model.layers.54.mlp.experts.up_proj.biases": "model-00073-of-00081.safetensors",
+ "model.layers.54.mlp.experts.up_proj.scales": "model-00073-of-00081.safetensors",
+ "model.layers.54.mlp.experts.up_proj.weight": "model-00073-of-00081.safetensors",
+ "model.layers.54.mlp.router.gate.biases": "model-00072-of-00081.safetensors",
+ "model.layers.54.mlp.router.gate.scales": "model-00072-of-00081.safetensors",
+ "model.layers.54.mlp.router.gate.weight": "model-00072-of-00081.safetensors",
+ "model.layers.54.mlp.shared_experts.down_proj.biases": "model-00074-of-00081.safetensors",
+ "model.layers.54.mlp.shared_experts.down_proj.scales": "model-00074-of-00081.safetensors",
+ "model.layers.54.mlp.shared_experts.down_proj.weight": "model-00074-of-00081.safetensors",
+ "model.layers.54.mlp.shared_experts.gate_proj.biases": "model-00074-of-00081.safetensors",
+ "model.layers.54.mlp.shared_experts.gate_proj.scales": "model-00074-of-00081.safetensors",
+ "model.layers.54.mlp.shared_experts.gate_proj.weight": "model-00074-of-00081.safetensors",
+ "model.layers.54.mlp.shared_experts.up_proj.biases": "model-00074-of-00081.safetensors",
+ "model.layers.54.mlp.shared_experts.up_proj.scales": "model-00074-of-00081.safetensors",
+ "model.layers.54.mlp.shared_experts.up_proj.weight": "model-00074-of-00081.safetensors",
+ "model.layers.54.post_attention_layernorm.weight": "model-00074-of-00081.safetensors",
+ "model.layers.54.post_mlp_layernorm.weight": "model-00074-of-00081.safetensors",
+ "model.layers.54.pre_mlp_layernorm.weight": "model-00074-of-00081.safetensors",
+ "model.layers.54.self_attn.gate_proj.biases": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.gate_proj.scales": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.gate_proj.weight": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.k_norm.weight": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.k_proj.biases": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.k_proj.scales": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.k_proj.weight": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.o_proj.biases": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.o_proj.scales": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.o_proj.weight": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.q_norm.weight": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.q_proj.biases": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.q_proj.scales": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.q_proj.weight": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.v_proj.biases": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.v_proj.scales": "model-00072-of-00081.safetensors",
+ "model.layers.54.self_attn.v_proj.weight": "model-00072-of-00081.safetensors",
+ "model.layers.55.input_layernorm.weight": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.expert_bias": "model-00074-of-00081.safetensors",
+ "model.layers.55.mlp.experts.down_proj.biases": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.experts.down_proj.scales": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.experts.down_proj.weight": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.experts.gate_proj.biases": "model-00074-of-00081.safetensors",
+ "model.layers.55.mlp.experts.gate_proj.scales": "model-00074-of-00081.safetensors",
+ "model.layers.55.mlp.experts.gate_proj.weight": "model-00074-of-00081.safetensors",
+ "model.layers.55.mlp.experts.up_proj.biases": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.experts.up_proj.scales": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.experts.up_proj.weight": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.router.gate.biases": "model-00074-of-00081.safetensors",
+ "model.layers.55.mlp.router.gate.scales": "model-00074-of-00081.safetensors",
+ "model.layers.55.mlp.router.gate.weight": "model-00074-of-00081.safetensors",
+ "model.layers.55.mlp.shared_experts.down_proj.biases": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.shared_experts.down_proj.scales": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.shared_experts.down_proj.weight": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.shared_experts.gate_proj.biases": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.shared_experts.gate_proj.scales": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.shared_experts.gate_proj.weight": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.shared_experts.up_proj.biases": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.shared_experts.up_proj.scales": "model-00075-of-00081.safetensors",
+ "model.layers.55.mlp.shared_experts.up_proj.weight": "model-00075-of-00081.safetensors",
+ "model.layers.55.post_attention_layernorm.weight": "model-00075-of-00081.safetensors",
+ "model.layers.55.post_mlp_layernorm.weight": "model-00075-of-00081.safetensors",
+ "model.layers.55.pre_mlp_layernorm.weight": "model-00075-of-00081.safetensors",
+ "model.layers.55.self_attn.gate_proj.biases": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.gate_proj.scales": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.gate_proj.weight": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.k_norm.weight": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.k_proj.biases": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.k_proj.scales": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.k_proj.weight": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.o_proj.biases": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.o_proj.scales": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.o_proj.weight": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.q_norm.weight": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.q_proj.biases": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.q_proj.scales": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.q_proj.weight": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.v_proj.biases": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.v_proj.scales": "model-00074-of-00081.safetensors",
+ "model.layers.55.self_attn.v_proj.weight": "model-00074-of-00081.safetensors",
+ "model.layers.56.input_layernorm.weight": "model-00077-of-00081.safetensors",
+ "model.layers.56.mlp.expert_bias": "model-00075-of-00081.safetensors",
+ "model.layers.56.mlp.experts.down_proj.biases": "model-00077-of-00081.safetensors",
+ "model.layers.56.mlp.experts.down_proj.scales": "model-00077-of-00081.safetensors",
+ "model.layers.56.mlp.experts.down_proj.weight": "model-00077-of-00081.safetensors",
+ "model.layers.56.mlp.experts.gate_proj.biases": "model-00076-of-00081.safetensors",
+ "model.layers.56.mlp.experts.gate_proj.scales": "model-00076-of-00081.safetensors",
+ "model.layers.56.mlp.experts.gate_proj.weight": "model-00076-of-00081.safetensors",
+ "model.layers.56.mlp.experts.up_proj.biases": "model-00076-of-00081.safetensors",
+ "model.layers.56.mlp.experts.up_proj.scales": "model-00076-of-00081.safetensors",
+ "model.layers.56.mlp.experts.up_proj.weight": "model-00076-of-00081.safetensors",
+ "model.layers.56.mlp.router.gate.biases": "model-00075-of-00081.safetensors",
+ "model.layers.56.mlp.router.gate.scales": "model-00075-of-00081.safetensors",
+ "model.layers.56.mlp.router.gate.weight": "model-00075-of-00081.safetensors",
+ "model.layers.56.mlp.shared_experts.down_proj.biases": "model-00077-of-00081.safetensors",
+ "model.layers.56.mlp.shared_experts.down_proj.scales": "model-00077-of-00081.safetensors",
+ "model.layers.56.mlp.shared_experts.down_proj.weight": "model-00077-of-00081.safetensors",
+ "model.layers.56.mlp.shared_experts.gate_proj.biases": "model-00077-of-00081.safetensors",
+ "model.layers.56.mlp.shared_experts.gate_proj.scales": "model-00077-of-00081.safetensors",
+ "model.layers.56.mlp.shared_experts.gate_proj.weight": "model-00077-of-00081.safetensors",
+ "model.layers.56.mlp.shared_experts.up_proj.biases": "model-00077-of-00081.safetensors",
+ "model.layers.56.mlp.shared_experts.up_proj.scales": "model-00077-of-00081.safetensors",
+ "model.layers.56.mlp.shared_experts.up_proj.weight": "model-00077-of-00081.safetensors",
+ "model.layers.56.post_attention_layernorm.weight": "model-00077-of-00081.safetensors",
+ "model.layers.56.post_mlp_layernorm.weight": "model-00077-of-00081.safetensors",
+ "model.layers.56.pre_mlp_layernorm.weight": "model-00077-of-00081.safetensors",
+ "model.layers.56.self_attn.gate_proj.biases": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.gate_proj.scales": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.gate_proj.weight": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.k_norm.weight": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.k_proj.biases": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.k_proj.scales": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.k_proj.weight": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.o_proj.biases": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.o_proj.scales": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.o_proj.weight": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.q_norm.weight": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.q_proj.biases": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.q_proj.scales": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.q_proj.weight": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.v_proj.biases": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.v_proj.scales": "model-00075-of-00081.safetensors",
+ "model.layers.56.self_attn.v_proj.weight": "model-00075-of-00081.safetensors",
+ "model.layers.57.input_layernorm.weight": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.expert_bias": "model-00077-of-00081.safetensors",
+ "model.layers.57.mlp.experts.down_proj.biases": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.experts.down_proj.scales": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.experts.down_proj.weight": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.experts.gate_proj.biases": "model-00077-of-00081.safetensors",
+ "model.layers.57.mlp.experts.gate_proj.scales": "model-00077-of-00081.safetensors",
+ "model.layers.57.mlp.experts.gate_proj.weight": "model-00077-of-00081.safetensors",
+ "model.layers.57.mlp.experts.up_proj.biases": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.experts.up_proj.scales": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.experts.up_proj.weight": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.router.gate.biases": "model-00077-of-00081.safetensors",
+ "model.layers.57.mlp.router.gate.scales": "model-00077-of-00081.safetensors",
+ "model.layers.57.mlp.router.gate.weight": "model-00077-of-00081.safetensors",
+ "model.layers.57.mlp.shared_experts.down_proj.biases": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.shared_experts.down_proj.scales": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.shared_experts.down_proj.weight": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.shared_experts.gate_proj.biases": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.shared_experts.gate_proj.scales": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.shared_experts.gate_proj.weight": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.shared_experts.up_proj.biases": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.shared_experts.up_proj.scales": "model-00078-of-00081.safetensors",
+ "model.layers.57.mlp.shared_experts.up_proj.weight": "model-00078-of-00081.safetensors",
+ "model.layers.57.post_attention_layernorm.weight": "model-00078-of-00081.safetensors",
+ "model.layers.57.post_mlp_layernorm.weight": "model-00078-of-00081.safetensors",
+ "model.layers.57.pre_mlp_layernorm.weight": "model-00078-of-00081.safetensors",
+ "model.layers.57.self_attn.gate_proj.biases": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.gate_proj.scales": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.gate_proj.weight": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.k_norm.weight": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.k_proj.biases": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.k_proj.scales": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.k_proj.weight": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.o_proj.biases": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.o_proj.scales": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.o_proj.weight": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.q_norm.weight": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.q_proj.biases": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.q_proj.scales": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.q_proj.weight": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.v_proj.biases": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.v_proj.scales": "model-00077-of-00081.safetensors",
+ "model.layers.57.self_attn.v_proj.weight": "model-00077-of-00081.safetensors",
+ "model.layers.58.input_layernorm.weight": "model-00080-of-00081.safetensors",
+ "model.layers.58.mlp.expert_bias": "model-00078-of-00081.safetensors",
+ "model.layers.58.mlp.experts.down_proj.biases": "model-00080-of-00081.safetensors",
+ "model.layers.58.mlp.experts.down_proj.scales": "model-00080-of-00081.safetensors",
+ "model.layers.58.mlp.experts.down_proj.weight": "model-00080-of-00081.safetensors",
+ "model.layers.58.mlp.experts.gate_proj.biases": "model-00079-of-00081.safetensors",
+ "model.layers.58.mlp.experts.gate_proj.scales": "model-00079-of-00081.safetensors",
+ "model.layers.58.mlp.experts.gate_proj.weight": "model-00079-of-00081.safetensors",
+ "model.layers.58.mlp.experts.up_proj.biases": "model-00079-of-00081.safetensors",
+ "model.layers.58.mlp.experts.up_proj.scales": "model-00079-of-00081.safetensors",
+ "model.layers.58.mlp.experts.up_proj.weight": "model-00079-of-00081.safetensors",
+ "model.layers.58.mlp.router.gate.biases": "model-00078-of-00081.safetensors",
+ "model.layers.58.mlp.router.gate.scales": "model-00078-of-00081.safetensors",
+ "model.layers.58.mlp.router.gate.weight": "model-00078-of-00081.safetensors",
+ "model.layers.58.mlp.shared_experts.down_proj.biases": "model-00080-of-00081.safetensors",
+ "model.layers.58.mlp.shared_experts.down_proj.scales": "model-00080-of-00081.safetensors",
+ "model.layers.58.mlp.shared_experts.down_proj.weight": "model-00080-of-00081.safetensors",
+ "model.layers.58.mlp.shared_experts.gate_proj.biases": "model-00080-of-00081.safetensors",
+ "model.layers.58.mlp.shared_experts.gate_proj.scales": "model-00080-of-00081.safetensors",
+ "model.layers.58.mlp.shared_experts.gate_proj.weight": "model-00080-of-00081.safetensors",
+ "model.layers.58.mlp.shared_experts.up_proj.biases": "model-00080-of-00081.safetensors",
+ "model.layers.58.mlp.shared_experts.up_proj.scales": "model-00080-of-00081.safetensors",
+ "model.layers.58.mlp.shared_experts.up_proj.weight": "model-00080-of-00081.safetensors",
+ "model.layers.58.post_attention_layernorm.weight": "model-00080-of-00081.safetensors",
+ "model.layers.58.post_mlp_layernorm.weight": "model-00080-of-00081.safetensors",
+ "model.layers.58.pre_mlp_layernorm.weight": "model-00080-of-00081.safetensors",
+ "model.layers.58.self_attn.gate_proj.biases": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.gate_proj.scales": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.gate_proj.weight": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.k_norm.weight": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.k_proj.biases": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.k_proj.scales": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.k_proj.weight": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.o_proj.biases": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.o_proj.scales": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.o_proj.weight": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.q_norm.weight": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.q_proj.biases": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.q_proj.scales": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.q_proj.weight": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.v_proj.biases": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.v_proj.scales": "model-00078-of-00081.safetensors",
+ "model.layers.58.self_attn.v_proj.weight": "model-00078-of-00081.safetensors",
+ "model.layers.59.input_layernorm.weight": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.expert_bias": "model-00080-of-00081.safetensors",
+ "model.layers.59.mlp.experts.down_proj.biases": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.experts.down_proj.scales": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.experts.down_proj.weight": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.experts.gate_proj.biases": "model-00080-of-00081.safetensors",
+ "model.layers.59.mlp.experts.gate_proj.scales": "model-00080-of-00081.safetensors",
+ "model.layers.59.mlp.experts.gate_proj.weight": "model-00080-of-00081.safetensors",
+ "model.layers.59.mlp.experts.up_proj.biases": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.experts.up_proj.scales": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.experts.up_proj.weight": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.router.gate.biases": "model-00080-of-00081.safetensors",
+ "model.layers.59.mlp.router.gate.scales": "model-00080-of-00081.safetensors",
+ "model.layers.59.mlp.router.gate.weight": "model-00080-of-00081.safetensors",
+ "model.layers.59.mlp.shared_experts.down_proj.biases": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.shared_experts.down_proj.scales": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.shared_experts.down_proj.weight": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.shared_experts.gate_proj.biases": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.shared_experts.gate_proj.scales": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.shared_experts.gate_proj.weight": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.shared_experts.up_proj.biases": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.shared_experts.up_proj.scales": "model-00081-of-00081.safetensors",
+ "model.layers.59.mlp.shared_experts.up_proj.weight": "model-00081-of-00081.safetensors",
+ "model.layers.59.post_attention_layernorm.weight": "model-00081-of-00081.safetensors",
+ "model.layers.59.post_mlp_layernorm.weight": "model-00081-of-00081.safetensors",
+ "model.layers.59.pre_mlp_layernorm.weight": "model-00081-of-00081.safetensors",
+ "model.layers.59.self_attn.gate_proj.biases": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.gate_proj.scales": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.gate_proj.weight": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.k_norm.weight": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.k_proj.biases": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.k_proj.scales": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.k_proj.weight": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.o_proj.biases": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.o_proj.scales": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.o_proj.weight": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.q_norm.weight": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.q_proj.biases": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.q_proj.scales": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.q_proj.weight": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.v_proj.biases": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.v_proj.scales": "model-00080-of-00081.safetensors",
+ "model.layers.59.self_attn.v_proj.weight": "model-00080-of-00081.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00081.safetensors",
+ "model.layers.6.mlp.expert_bias": "model-00001-of-00081.safetensors",
+ "model.layers.6.mlp.experts.down_proj.biases": "model-00002-of-00081.safetensors",
+ "model.layers.6.mlp.experts.down_proj.scales": "model-00002-of-00081.safetensors",
+ "model.layers.6.mlp.experts.down_proj.weight": "model-00002-of-00081.safetensors",
+ "model.layers.6.mlp.experts.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.6.mlp.experts.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.6.mlp.experts.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.6.mlp.experts.up_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.6.mlp.experts.up_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.6.mlp.experts.up_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.6.mlp.router.gate.biases": "model-00001-of-00081.safetensors",
+ "model.layers.6.mlp.router.gate.scales": "model-00001-of-00081.safetensors",
+ "model.layers.6.mlp.router.gate.weight": "model-00001-of-00081.safetensors",
+ "model.layers.6.mlp.shared_experts.down_proj.biases": "model-00002-of-00081.safetensors",
+ "model.layers.6.mlp.shared_experts.down_proj.scales": "model-00002-of-00081.safetensors",
+ "model.layers.6.mlp.shared_experts.down_proj.weight": "model-00002-of-00081.safetensors",
+ "model.layers.6.mlp.shared_experts.gate_proj.biases": "model-00002-of-00081.safetensors",
+ "model.layers.6.mlp.shared_experts.gate_proj.scales": "model-00002-of-00081.safetensors",
+ "model.layers.6.mlp.shared_experts.gate_proj.weight": "model-00002-of-00081.safetensors",
+ "model.layers.6.mlp.shared_experts.up_proj.biases": "model-00002-of-00081.safetensors",
+ "model.layers.6.mlp.shared_experts.up_proj.scales": "model-00002-of-00081.safetensors",
+ "model.layers.6.mlp.shared_experts.up_proj.weight": "model-00002-of-00081.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00081.safetensors",
+ "model.layers.6.post_mlp_layernorm.weight": "model-00002-of-00081.safetensors",
+ "model.layers.6.pre_mlp_layernorm.weight": "model-00002-of-00081.safetensors",
+ "model.layers.6.self_attn.gate_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.gate_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.gate_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.k_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.k_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.o_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.o_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.q_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.q_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.v_proj.biases": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.v_proj.scales": "model-00001-of-00081.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00081.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.expert_bias": "model-00002-of-00081.safetensors",
+ "model.layers.7.mlp.experts.down_proj.biases": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.experts.down_proj.scales": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.experts.down_proj.weight": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.experts.gate_proj.biases": "model-00002-of-00081.safetensors",
+ "model.layers.7.mlp.experts.gate_proj.scales": "model-00002-of-00081.safetensors",
+ "model.layers.7.mlp.experts.gate_proj.weight": "model-00002-of-00081.safetensors",
+ "model.layers.7.mlp.experts.up_proj.biases": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.experts.up_proj.scales": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.experts.up_proj.weight": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.router.gate.biases": "model-00002-of-00081.safetensors",
+ "model.layers.7.mlp.router.gate.scales": "model-00002-of-00081.safetensors",
+ "model.layers.7.mlp.router.gate.weight": "model-00002-of-00081.safetensors",
+ "model.layers.7.mlp.shared_experts.down_proj.biases": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.shared_experts.down_proj.scales": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.shared_experts.down_proj.weight": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.shared_experts.gate_proj.biases": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.shared_experts.gate_proj.scales": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.shared_experts.gate_proj.weight": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.shared_experts.up_proj.biases": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.shared_experts.up_proj.scales": "model-00003-of-00081.safetensors",
+ "model.layers.7.mlp.shared_experts.up_proj.weight": "model-00003-of-00081.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00081.safetensors",
+ "model.layers.7.post_mlp_layernorm.weight": "model-00003-of-00081.safetensors",
+ "model.layers.7.pre_mlp_layernorm.weight": "model-00003-of-00081.safetensors",
+ "model.layers.7.self_attn.gate_proj.biases": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.gate_proj.scales": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.gate_proj.weight": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.k_norm.weight": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.k_proj.biases": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.k_proj.scales": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.o_proj.biases": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.o_proj.scales": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.q_norm.weight": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.q_proj.biases": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.q_proj.scales": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.v_proj.biases": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.v_proj.scales": "model-00002-of-00081.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00081.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00005-of-00081.safetensors",
+ "model.layers.8.mlp.expert_bias": "model-00003-of-00081.safetensors",
+ "model.layers.8.mlp.experts.down_proj.biases": "model-00005-of-00081.safetensors",
+ "model.layers.8.mlp.experts.down_proj.scales": "model-00005-of-00081.safetensors",
+ "model.layers.8.mlp.experts.down_proj.weight": "model-00005-of-00081.safetensors",
+ "model.layers.8.mlp.experts.gate_proj.biases": "model-00004-of-00081.safetensors",
+ "model.layers.8.mlp.experts.gate_proj.scales": "model-00004-of-00081.safetensors",
+ "model.layers.8.mlp.experts.gate_proj.weight": "model-00004-of-00081.safetensors",
+ "model.layers.8.mlp.experts.up_proj.biases": "model-00004-of-00081.safetensors",
+ "model.layers.8.mlp.experts.up_proj.scales": "model-00004-of-00081.safetensors",
+ "model.layers.8.mlp.experts.up_proj.weight": "model-00004-of-00081.safetensors",
+ "model.layers.8.mlp.router.gate.biases": "model-00003-of-00081.safetensors",
+ "model.layers.8.mlp.router.gate.scales": "model-00003-of-00081.safetensors",
+ "model.layers.8.mlp.router.gate.weight": "model-00003-of-00081.safetensors",
+ "model.layers.8.mlp.shared_experts.down_proj.biases": "model-00005-of-00081.safetensors",
+ "model.layers.8.mlp.shared_experts.down_proj.scales": "model-00005-of-00081.safetensors",
+ "model.layers.8.mlp.shared_experts.down_proj.weight": "model-00005-of-00081.safetensors",
+ "model.layers.8.mlp.shared_experts.gate_proj.biases": "model-00005-of-00081.safetensors",
+ "model.layers.8.mlp.shared_experts.gate_proj.scales": "model-00005-of-00081.safetensors",
+ "model.layers.8.mlp.shared_experts.gate_proj.weight": "model-00005-of-00081.safetensors",
+ "model.layers.8.mlp.shared_experts.up_proj.biases": "model-00005-of-00081.safetensors",
+ "model.layers.8.mlp.shared_experts.up_proj.scales": "model-00005-of-00081.safetensors",
+ "model.layers.8.mlp.shared_experts.up_proj.weight": "model-00005-of-00081.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00005-of-00081.safetensors",
+ "model.layers.8.post_mlp_layernorm.weight": "model-00005-of-00081.safetensors",
+ "model.layers.8.pre_mlp_layernorm.weight": "model-00005-of-00081.safetensors",
+ "model.layers.8.self_attn.gate_proj.biases": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.gate_proj.scales": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.gate_proj.weight": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.k_norm.weight": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.k_proj.biases": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.k_proj.scales": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.o_proj.biases": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.o_proj.scales": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.q_norm.weight": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.q_proj.biases": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.q_proj.scales": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.v_proj.biases": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.v_proj.scales": "model-00003-of-00081.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00081.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.expert_bias": "model-00005-of-00081.safetensors",
+ "model.layers.9.mlp.experts.down_proj.biases": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.experts.down_proj.scales": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.experts.down_proj.weight": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.experts.gate_proj.biases": "model-00005-of-00081.safetensors",
+ "model.layers.9.mlp.experts.gate_proj.scales": "model-00005-of-00081.safetensors",
+ "model.layers.9.mlp.experts.gate_proj.weight": "model-00005-of-00081.safetensors",
+ "model.layers.9.mlp.experts.up_proj.biases": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.experts.up_proj.scales": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.experts.up_proj.weight": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.router.gate.biases": "model-00005-of-00081.safetensors",
+ "model.layers.9.mlp.router.gate.scales": "model-00005-of-00081.safetensors",
+ "model.layers.9.mlp.router.gate.weight": "model-00005-of-00081.safetensors",
+ "model.layers.9.mlp.shared_experts.down_proj.biases": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.shared_experts.down_proj.scales": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.shared_experts.down_proj.weight": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.shared_experts.gate_proj.biases": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.shared_experts.gate_proj.scales": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.shared_experts.gate_proj.weight": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.shared_experts.up_proj.biases": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.shared_experts.up_proj.scales": "model-00006-of-00081.safetensors",
+ "model.layers.9.mlp.shared_experts.up_proj.weight": "model-00006-of-00081.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00006-of-00081.safetensors",
+ "model.layers.9.post_mlp_layernorm.weight": "model-00006-of-00081.safetensors",
+ "model.layers.9.pre_mlp_layernorm.weight": "model-00006-of-00081.safetensors",
+ "model.layers.9.self_attn.gate_proj.biases": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.gate_proj.scales": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.gate_proj.weight": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.k_norm.weight": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.k_proj.biases": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.k_proj.scales": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.o_proj.biases": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.o_proj.scales": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.q_norm.weight": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.q_proj.biases": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.q_proj.scales": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.v_proj.biases": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.v_proj.scales": "model-00005-of-00081.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00005-of-00081.safetensors",
+ "model.norm.weight": "model-00081-of-00081.safetensors"
+ }
+}
\ No newline at end of file
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..681605e5cc3898f9937deea97b32dbf9d6bd7479
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d864fe0d9c300d44c54006f5960548946d507d8ec05a082a3bff3e49de58208
+size 14614721
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ce7388b1a654f6bff046cae75317c8d989bfd8f3
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,14 @@
+{
+ "add_prefix_space": null,
+ "backend": "tokenizers",
+ "bos_token": "<|begin_of_text|>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "is_local": true,
+ "model_max_length": 65536,
+ "model_specific_special_tokens": {},
+ "pad_token": "<|pad|>",
+ "tokenizer_class": "TokenizersBackend",
+ "tool_parser_type": "json_tools",
+ "use_default_system_prompt": false
+}