Add Llama 3.3 model (#3202)
yifanmai authored Dec 6, 2024
1 parent 6c358c6 commit 416601c
Showing 3 changed files with 26 additions and 0 deletions.
9 changes: 9 additions & 0 deletions src/helm/config/model_deployments.yaml
@@ -2152,6 +2152,15 @@ model_deployments:
      args:
        together_model: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo

  - name: together/llama-3.3-70b-instruct-turbo
    model_name: meta/llama-3.3-70b-instruct-turbo
    tokenizer_name: meta/llama-3.3-70b-instruct
    max_sequence_length: 128000
    client_spec:
      class_name: "helm.clients.together_client.TogetherChatClient"
      args:
        together_model: meta-llama/Llama-3.3-70B-Instruct-Turbo

  - name: together/llama-guard-7b
    model_name: meta/llama-guard-7b
    tokenizer_name: meta-llama/Llama-2-7b-hf
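
The new deployment entry points HELM at Together's hosted turbo variant: requests for meta/llama-3.3-70b-instruct-turbo are handled by TogetherChatClient, which forwards them to the together_model ID meta-llama/Llama-3.3-70B-Instruct-Turbo. As a rough illustration only, here is a minimal sketch that calls that model ID through Together's OpenAI-compatible endpoint; this is not HELM's actual client code, and the TOGETHER_API_KEY variable and prompt are placeholders.

# Minimal sketch: call the deployment's together_model directly through
# Together's OpenAI-compatible endpoint. This is NOT HELM's TogetherChatClient;
# it only illustrates where requests for this deployment end up.
import os
from openai import OpenAI  # assumes the `openai` client package is installed

client = OpenAI(
    api_key=os.environ["TOGETHER_API_KEY"],  # placeholder: your Together API key
    base_url="https://api.together.xyz/v1",
)
response = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo",  # the together_model above
    messages=[{"role": "user", "content": "Say hello in three languages."}],
    max_tokens=64,
)
print(response.choices[0].message.content)
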
9 changes: 9 additions & 0 deletions src/helm/config/model_metadata.yaml
@@ -1656,6 +1656,15 @@ models:
    release_date: 2024-09-25
    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

  - name: meta/llama-3.3-70b-instruct-turbo
    display_name: Llama 3.3 Instruct Turbo (70B)
    description: Llama 3.3 (70B) is part of the Llama 3 family of dense Transformer models that natively support multilinguality, coding, reasoning, and tool usage. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/)) Turbo is Together's implementation, providing a near-negligible difference in quality from the reference implementation with faster performance and lower cost, currently using FP8 quantization. ([blog](https://www.together.ai/blog/llama-31-quality))
    creator_organization_name: Meta
    access: open
    num_parameters: 70000000000
    release_date: 2024-12-06
    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

  - name: meta/llama-3-8b-chat
    display_name: Llama 3 Instruct (8B)
    description: Llama 3 is a family of language models that have been trained on more than 15 trillion tokens, and use Grouped-Query Attention (GQA) for improved inference scalability. It used SFT, rejection sampling, PPO and DPO for post-training. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/)
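
The name field here (meta/llama-3.3-70b-instruct-turbo) is the identifier used elsewhere in HELM, for example as the model= value in run entries and as the model_name of the Together deployment above. The following is a minimal sketch of reading the entry back out of this file with plain PyYAML; it is not HELM's own metadata loader, and the relative file path assumes a checkout of the repository.

# Minimal sketch: load model_metadata.yaml with PyYAML and look up the new
# entry by its fully qualified name. Not HELM's own loader.
import yaml

with open("src/helm/config/model_metadata.yaml") as f:
    metadata = yaml.safe_load(f)

entry = next(
    m for m in metadata["models"]
    if m["name"] == "meta/llama-3.3-70b-instruct-turbo"
)
print(entry["display_name"], entry["release_date"], entry["tags"])
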
8 changes: 8 additions & 0 deletions src/helm/config/tokenizer_configs.yaml
@@ -349,6 +349,14 @@ tokenizer_configs:
    prefix_token: "<|begin_of_text|>"
    end_of_text_token: "<|eot_id|>"

  - name: meta/llama-3.3-70b-instruct
    tokenizer_spec:
      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
      args:
        pretrained_model_name_or_path: meta-llama/Llama-3.3-70B-Instruct
    prefix_token: "<|begin_of_text|>"
    end_of_text_token: "<|eot_id|>"

  # 01-ai
  - name: 01-ai/Yi-6B
    tokenizer_spec:
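
The tokenizer config delegates to HuggingFaceTokenizer, pointing at the upstream meta-llama/Llama-3.3-70B-Instruct repository. Below is a minimal sketch of loading that underlying tokenizer directly with transformers; it is not HELM's wrapper, and it assumes you have accepted the gated-model license on Hugging Face and are authenticated.

# Minimal sketch: load the Hugging Face tokenizer that the config's
# HuggingFaceTokenizer wraps and inspect the configured special tokens.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.3-70B-Instruct")
print(tokenizer.bos_token)  # expected: <|begin_of_text|> (the prefix_token above)

ids = tokenizer.encode("Hello, HELM!")
print(len(ids), tokenizer.convert_ids_to_tokens(ids))
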
