Use new ResourceEnvs from Haliax #444

Open · wants to merge 236 commits into base: main

Commits (236)
fff4dfb
wip
dlwh Oct 25, 2023
740ad68
wip
dlwh Nov 7, 2023
4ad74a6
almost got new logger working
dlwh Nov 7, 2023
ad708e3
move the metrics stuff to its own file
dlwh Nov 8, 2023
6930fa9
refactor and move stuff around
dlwh Nov 8, 2023
abf7ec3
use generic infrastructure for summary
dlwh Nov 8, 2023
547cea8
wip towards a clean tracker package
dlwh Nov 8, 2023
2f481ed
wip
dlwh Nov 9, 2023
0b080fb
remove more wandb deps
dlwh Nov 9, 2023
a324ae5
tiny cleanup
dlwh Nov 9, 2023
cfdcbb9
add some tests
dlwh Nov 9, 2023
2ddc558
migrate alpaca-lora to new logger
dlwh Nov 9, 2023
9b0df08
sort of get tb to work
dlwh Nov 10, 2023
4fd2526
wip
dlwh Nov 14, 2023
a608a65
wip
dlwh Nov 16, 2023
176e5fa
Merge remote-tracking branch 'origin/main' into generic_logger
dlwh Nov 17, 2023
8d34f6f
update configs, expose a method to find trackers
dlwh Nov 17, 2023
42d7f2c
use `trainer` more to set logging
dlwh Nov 17, 2023
b887761
test the tracker get name stuff
dlwh Nov 17, 2023
3ebd161
minor
dlwh Nov 17, 2023
0d2efbc
making speccing the loss function simpler
dlwh Nov 18, 2023
f085287
stop requiring a loss function for every model definition
dlwh Nov 18, 2023
f21cf4b
wip
dlwh Nov 19, 2023
01c8b87
jkacjkac
dlwh Nov 19, 2023
e374697
tweak
dlwh Nov 22, 2023
921acf8
register default hooks by default...
dlwh Nov 22, 2023
c8a5d6c
wip
dlwh Nov 24, 2023
639d334
make it so we can evaluate if we have a cache but no sources
dlwh Nov 24, 2023
a3fdbaf
Merge branch 'cache_only' into extensible_trainer
dlwh Nov 24, 2023
ec35e9b
about got the checkpoint refactor done
dlwh Nov 25, 2023
ed13502
about got the checkpoint refactor done
dlwh Nov 25, 2023
634407e
minor dead code removal
dlwh Nov 25, 2023
4208e03
fix tests
dlwh Nov 26, 2023
9584884
cleanup
dlwh Nov 26, 2023
5a18678
cleanup
dlwh Nov 26, 2023
c355106
minor
dlwh Nov 26, 2023
7a2ffc3
Merge branch 'extensible_trainer' into doremi
dlwh Nov 26, 2023
d2e0de1
wip
dlwh Nov 26, 2023
be99631
register default hooks by default...
dlwh Nov 22, 2023
5d033eb
wip
dlwh Nov 24, 2023
c4a9160
make it so we can evaluate if we have a cache but no sources
dlwh Nov 24, 2023
b888065
about got the checkpoint refactor done
dlwh Nov 25, 2023
c47ae97
about got the checkpoint refactor done
dlwh Nov 25, 2023
f0613c7
minor dead code removal
dlwh Nov 25, 2023
85c5678
fix tests
dlwh Nov 26, 2023
8f84822
cleanup
dlwh Nov 26, 2023
e54bad0
cleanup
dlwh Nov 26, 2023
85dd89b
minor
dlwh Nov 26, 2023
c61824e
generalize and extract the checkpoint loading logic so it can be used…
dlwh Nov 27, 2023
7391475
Revert "Temporarily Revert "Generic Tracker interface, support for TB…
dlwh Nov 28, 2023
2387f26
wip
dlwh Nov 28, 2023
6446bc0
just about workable logger stuff
dlwh Nov 28, 2023
1b821d1
fix logging of config with a new levanter.initialize
dlwh Nov 28, 2023
afb6459
missed a sopt
dlwh Nov 28, 2023
9d916bd
on second thought, don't use tb in small_fast
dlwh Nov 29, 2023
3d67552
Merge remote-tracking branch 'origin/dev' into extensible_trainer
dlwh Nov 30, 2023
4d8cd68
main->dev (#375)
dlwh Dec 1, 2023
272e1e1
Merge remote-tracking branch 'origin/dev' into extensible_trainer
dlwh Dec 1, 2023
48ccdd3
Merge remote-tracking branch 'origin/main' into dev
dlwh Dec 1, 2023
3b27a08
supporting new trainer in gsm8k example
dlwh Dec 1, 2023
dcbed88
Merge branch 'dev' into extensible_trainer
dlwh Dec 2, 2023
bbac4ef
Merge remote-tracking branch 'origin/main' into extensible_trainer
dlwh Dec 2, 2023
f2842e9
Add Sophia-H, some WIP support for Sophia-G (#372)
dlwh Dec 7, 2023
6d6ae21
Merge remote-tracking branch 'origin/main' into dev
dlwh Dec 10, 2023
83bea6e
fix missing test changes
dlwh Dec 10, 2023
92a615f
should use a tempdir
dlwh Dec 11, 2023
cbee427
update gsm8k lora for sophia refactors
dlwh Dec 11, 2023
e048581
Allow val change wandb dev (#384)
dlwh Dec 13, 2023
2bdf08b
oops
dlwh Dec 13, 2023
8f4aff3
do loss in fp32
dlwh Dec 14, 2023
91eb588
Merge remote-tracking branch 'origin/dev' into extensible_trainer
dlwh Dec 17, 2023
2002832
more dead code removal
dlwh Dec 17, 2023
efa70a1
refix merge issues
dlwh Dec 17, 2023
4cca0d1
refix merge issues
dlwh Dec 17, 2023
15e223d
Merge remote-tracking branch 'origin/main' into dev
dlwh Dec 19, 2023
904497b
Merge branch 'dev' into extensible_trainer
dlwh Dec 19, 2023
2a90f57
allow train_batch_size to be -1 if per_device_parallelism isn't -1
dlwh Dec 19, 2023
f05739a
wip
dlwh Dec 21, 2023
321bb30
Merge remote-tracking branch 'origin/main' into extensible_trainer
dlwh Dec 21, 2023
38db3d5
fix performance regression in trainer.py
dlwh Dec 21, 2023
9b2813b
wth
dlwh Dec 21, 2023
e014c45
mdkladmlkad
dlwh Dec 21, 2023
95a391f
jfakmfa
dlwh Dec 21, 2023
9f40f10
try this other approach to steps in TrainerState
dlwh Dec 21, 2023
6df53f4
fix checkpoint tests
dlwh Dec 21, 2023
94aa8fa
fix gsm8k
dlwh Dec 21, 2023
5af6cb2
update for new Haliax reduction functions
dlwh Dec 24, 2023
e2b086f
Merge branch 'extensible_trainer' into doremi
dlwh Dec 24, 2023
84d3b33
wip
dlwh Dec 24, 2023
85b42b0
refactor grad_accum to have a separate microbatched
dlwh Dec 25, 2023
c47c188
remove accumulate_gradients_sharded and just use microbatched directly
dlwh Dec 27, 2023
70b766f
add dtype for grad accum
dlwh Dec 27, 2023
57725ea
small refactor
dlwh Dec 27, 2023
85f777b
small refactor
dlwh Dec 27, 2023
f8d98fc
fix key handling in grad accum
dlwh Dec 27, 2023
5a8c77a
make sophia work with non-trainables again
dlwh Dec 28, 2023
ff59e51
factor out some methods in train_step
dlwh Dec 28, 2023
c1718dd
Merge branch 'extensible_trainer' into doremi
dlwh Dec 28, 2023
d7a060d
make the initialize_from logic just use load_checkpoint_or_initialize
dlwh Dec 29, 2023
8c44e64
on second thought load_from_checkpoint_or_initialize is the wrong abs…
dlwh Dec 30, 2023
72f1e47
wip
dlwh Dec 30, 2023
add3df4
on second thought load_from_checkpoint_or_initialize is the wrong abs…
dlwh Dec 30, 2023
3ba7bf1
Merge branch 'extensible_trainer' into doremi
dlwh Dec 30, 2023
b6535b5
wip factoring out the initial state stuff, again
dlwh Dec 30, 2023
0d6f357
almost ready to try out doremi
dlwh Dec 30, 2023
7395e3c
almost ready to try out doremi
dlwh Jan 2, 2024
08996e6
cleanup typing.overloads
dlwh Jan 3, 2024
710900c
use auto_sharded internally, undeprecate it b/c it has a point
dlwh Jan 3, 2024
5f9d96d
fix docs
dlwh Jan 4, 2024
04a74a1
use new dot syntax in doremi
dlwh Jan 4, 2024
3249ca1
Merge remote-tracking branch 'origin/main' into doremi
dlwh Jan 8, 2024
6a20c95
fix mixture init with prngkey
dlwh Jan 9, 2024
fd6d343
add a simple InMemoryDataset that takes a list
dlwh Jan 9, 2024
f5b8d00
make keyiterator support just an int seed
dlwh Jan 9, 2024
288e7fb
dumb bug in grad accum
dlwh Jan 9, 2024
c4da125
fix some dumb bugs in new trainer
dlwh Jan 9, 2024
9257597
test for doremi and associated fixes
dlwh Jan 9, 2024
317b10d
depend on haliax dev for levanter dev
dlwh Jan 9, 2024
e4d1385
fix gsm8k_lora
dlwh Jan 9, 2024
ddcdac7
add a small_pile configuration
dlwh Jan 9, 2024
792f769
make it len 2048
dlwh Jan 9, 2024
e16b3af
add doremi main
dlwh Jan 10, 2024
a272ca9
we install haliax from source with the pyprojec.toml
dlwh Jan 10, 2024
e8d4b9d
fix doremi test when doing multidevice
dlwh Jan 10, 2024
5c489c1
add a pile_mixture.yaml
dlwh Jan 10, 2024
1672148
add a config for the small pile mixture
dlwh Jan 10, 2024
f485c5f
reduce default rows per chunk and see if that helps with these big su…
dlwh Jan 10, 2024
b2d8a58
add some more logging to see if we can figure out why it's running ou…
dlwh Jan 10, 2024
f76e466
add some more logging to see if we can figure out why it's running ou…
dlwh Jan 11, 2024
fc78716
dumb
dlwh Jan 11, 2024
4927f67
don't run the slow tests in CI
dlwh Jan 11, 2024
1ceb00a
wip
dlwh Jan 12, 2024
bc7108c
move the script, make it read off fsspec
dlwh Jan 13, 2024
69ca4a4
update for reverted Haliax change
dlwh Jan 13, 2024
ff5cb6d
update for reverted Haliax change
dlwh Jan 13, 2024
d6bf2c0
update paths for pile mixture
dlwh Jan 15, 2024
cc6044c
fix new import
dlwh Jan 15, 2024
415158a
sigh
dlwh Jan 15, 2024
d2a90ae
isjfo
dlwh Jan 15, 2024
058a9e0
mdklmdlm
dlwh Jan 15, 2024
9f16fbe
make logging list names of caches
dlwh Jan 15, 2024
b80ef6a
lower resource requirements to see if this gets us processing faster
dlwh Jan 15, 2024
6983ff0
let's make the chunkcachebuilders free
dlwh Jan 15, 2024
5f42ad8
minimize use of optax internals
dlwh Jan 15, 2024
e6e8d27
fix a crash i don't understand
dlwh Jan 16, 2024
ab29e92
let's reduce requirements some more to see if we can keep everything …
dlwh Jan 16, 2024
83f0616
let's reduce requirements some more to see if we can keep everything …
dlwh Jan 16, 2024
def45cc
silly
dlwh Jan 16, 2024
de821ca
ok so we're ok maybe
dlwh Jan 16, 2024
cbddab8
don't fetch local
dlwh Jan 16, 2024
5d0f987
wtf
dlwh Jan 16, 2024
13cc556
what
dlwh Jan 16, 2024
5afac01
ok, think we figured it out
dlwh Jan 16, 2024
41ac362
less logging
dlwh Jan 16, 2024
c621a08
toward turning the reader process into an actor too
dlwh Jan 17, 2024
4d92af9
did we do it?
dlwh Jan 17, 2024
257dfa7
wandb: only force a step if commit is true
dlwh Jan 17, 2024
23865a1
don't crash if n == 0
dlwh Jan 17, 2024
70c00f1
wandb: maybe this gives the behavior i want?
dlwh Jan 17, 2024
4c54365
mklafmlkafml
dlwh Jan 17, 2024
1edeeef
Merge branch 'main' into dev
dlwh Jan 17, 2024
6148381
minimize use of optax internals
dlwh Jan 15, 2024
8274cad
what
dlwh Jan 18, 2024
b980c9f
actually this is probably better
dlwh Jan 18, 2024
36f25a0
actually this is probably better
dlwh Jan 18, 2024
4da7112
dumb
dlwh Jan 18, 2024
6147520
mkladmlkad
dlwh Jan 18, 2024
e166a78
fix key order for doremi
dlwh Jan 18, 2024
e6b581b
remove excess log
dlwh Jan 18, 2024
8c64be5
remove a redundant log message
dlwh Jan 18, 2024
e89e709
fixed more bugs
dlwh Jan 18, 2024
33600fd
almost there
dlwh Jan 18, 2024
efbdd31
don't log a value for domains with no data on a step
dlwh Jan 18, 2024
a810242
bring over the trainer-abstraction doc
dlwh Jan 30, 2024
e49fb38
remove the wrapped loss_fn thing from trainer
dlwh Jan 30, 2024
13dc392
factor out a take_opt_step. need to decide where to put it
dlwh Jan 30, 2024
514da05
explicitly expose microbatch_size, use it in microbatched
dlwh Jan 30, 2024
f797a85
comment about custom_jvp on microbatched
dlwh Jan 31, 2024
4301930
unneeded cast
dlwh Jan 31, 2024
d3416b1
rename to mixed-precision.md
dlwh Jan 31, 2024
9552909
cleanup ctors for BatchLoaders some
dlwh Jan 31, 2024
888d35e
misc cleanup
dlwh Jan 31, 2024
49a409b
wip
dlwh Jan 31, 2024
78d9342
stable point: migrating to resourceenvs
dlwh Jan 31, 2024
8e4e183
require the jamp branch
dlwh Jan 31, 2024
9a0ea6d
knknajkdnjakd
dlwh Jan 31, 2024
7c19f47
try this?
dlwh Jan 31, 2024
d98a885
cleanup and explain the issue
dlwh Jan 31, 2024
015dfb3
see if we get the just-in-time conversion to bf16 that we want
dlwh Jan 31, 2024
cddaf20
wtf
dlwh Feb 1, 2024
27949f8
bypass microbatching if we don't need it?
dlwh Feb 1, 2024
c3a9ce1
switch to using hnn.Embedding in gpt2, which means we get the mixed p…
dlwh Feb 1, 2024
0e91352
switch to using compute_envs where posisble use .shard instead
dlwh Feb 1, 2024
b57e1c7
please pre-commit
dlwh Feb 1, 2024
7fd46cb
ok maybe we can do it?
dlwh Feb 1, 2024
2ca4d97
sigh
dlwh Feb 1, 2024
b1e99e5
Merge branch 'dev' into use_jamp
dlwh Feb 1, 2024
a237a57
fix test_weight_decay_mask.py
dlwh Feb 1, 2024
5282694
use param_env everywhere
dlwh Feb 1, 2024
a013c4c
makldmlkad
dlwh Feb 1, 2024
3049f89
Merge remote-tracking branch 'origin/main' into use_jamp
dlwh Feb 2, 2024
e4fcd67
Merge remote-tracking branch 'origin/main' into dev
dlwh Feb 2, 2024
4a8d07a
Merge branch 'dev' into use_jamp
dlwh Feb 2, 2024
1983a1f
Merge remote-tracking branch 'origin/main' into dev
dlwh Feb 2, 2024
5312b87
Merge remote-tracking branch 'origin/main' into dev
dlwh Feb 2, 2024
90ed9cd
Merge remote-tracking branch 'origin/main' into use_jamp
dlwh Feb 2, 2024
74096fa
Merge branch 'dev' into use_jamp
dlwh Feb 2, 2024
58ca1d7
wip debugging devices
dlwh Feb 2, 2024
6ee6d8f
let's try this?
dlwh Feb 2, 2024
b485673
so confused
dlwh Feb 2, 2024
de3162b
sigh
dlwh Feb 2, 2024
343367f
ok i think i got it
dlwh Feb 2, 2024
1198bb2
Merge branch 'dev' into simple_first_cleanup
dlwh Feb 3, 2024
a2d5934
Merge branch 'simple_first_cleanup' into use_jamp
dlwh Feb 3, 2024
71b755e
wtf
dlwh Feb 5, 2024
07b5797
this async seems like a bad idea
dlwh Feb 5, 2024
b6e0c1d
log perf numbers?
dlwh Feb 5, 2024
a3f9c7f
more logging
dlwh Feb 5, 2024
ce2db7b
moar
dlwh Feb 5, 2024
ea57bde
oops
dlwh Feb 5, 2024
d352b37
reduce logging some, try to figure out this stupid books problem
dlwh Feb 5, 2024
0effef0
ka dkla dkl
dlwh Feb 5, 2024
1e85d16
admaldl
dlwh Feb 5, 2024
a25a8ce
fix the unnecessarily long time outs
dlwh Feb 6, 2024
7c163a8
break really long docs into shorter docs b/c tokenizers is quadratic
dlwh Feb 6, 2024
4125d3f
kmklamdklad
dlwh Feb 6, 2024
99a87e8
maybe don't do the workaround so often?
dlwh Feb 6, 2024
5245e10
is this the leak?!?
dlwh Feb 6, 2024
8a6f59b
update for latest datasets
dlwh Feb 6, 2024
002989b
add a test to ensure we use the workaround for llama tokenizer
dlwh Feb 6, 2024
3dfebe2
tweak timeouts in test
dlwh Feb 6, 2024
5a5a1f1
less spammy logging
dlwh Feb 6, 2024
4e6df52
cleanup, see if we can avoid crashing when one cache finishes
dlwh Feb 6, 2024
c2dccf2
tweaks to tokenization/shard_cache throughput (#456)
dlwh Feb 6, 2024
f7a3d0a
Merge remote-tracking branch 'origin/main' into use_jamp_harfleur
dlwh Feb 6, 2024
c95ba8b
Merge branch 'use_jamp_harfleur' into use_jamp
dlwh Feb 6, 2024
2 changes: 1 addition & 1 deletion .flake8
@@ -1,7 +1,7 @@
[flake8]
exclude = .git
max-line-length = 120
ignore = E203, E501, W503, W605, F821, E266
ignore = E203, E501, W503, W605, F821, E266, E731
per-file-ignores =
*/__init__.py: F401
examples/*.py: E402
2 changes: 0 additions & 2 deletions .github/workflows/run_entry_tests.yaml
@@ -21,8 +21,6 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
# install haliax from source b/c it's changing in parallel with this repo
pip install git+https://github.com/stanford-crfm/haliax.git
pip install . "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}"
- name: Run entry tests with pytest
run: |
4 changes: 1 addition & 3 deletions .github/workflows/run_tests.yaml
@@ -21,9 +21,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
# install haliax from source b/c it's changing in parallel with this repo
pip install git+https://github.com/stanford-crfm/haliax.git
pip install . "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}"
- name: Test with pytest
run: |
XLA_FLAGS=--xla_force_host_platform_device_count=8 PYTHONPATH=tests:src:. pytest tests -m "not entry"
XLA_FLAGS=--xla_force_host_platform_device_count=8 PYTHONPATH=tests:src:. pytest tests -m "not entry and not slow"
10 changes: 6 additions & 4 deletions README.md
@@ -36,12 +36,13 @@ Haliax's documentation is available at [haliax.readthedocs.io](https://haliax.re
* **Distributed Training**: We support distributed training on TPUs (and soon, GPUs), including FSDP and tensor parallelism.
* **Compatibility**: Levanter supports importing and exporting models to/from the Hugging Face ecosystem, including tokenizers, datasets, and models via [SafeTensors](https://github.com/huggingface/safetensors).
* **Performance**: Levanter's performance rivals commercially-backed frameworks like MosaicML's Composer or Google's MaxText.
* **Reproducibility**: Levanter is bitwise deterministic, meaning that the same configuration will always produce the same results, even in the face of preemption and resumption.
* **Cached On-Demand Data Preprocessing**: We preprocess corpora online, but we cache the results of preprocessing so
that resumes are much faster and so that subsequent runs are even faster. As soon as the first part of the cache is complete, Levanter will start training.
* **Logging**: Logging is done with [WandB](https://wandb.ai/), complete with a fancy online visualization of the validation set during training.
* **Optimization**: Levanter supports the new [Sophia](https://arxiv.org/abs/2305.14342) optimizer, which can be 2x as fast as Adam. We also support ses [Optax](https://github.com/deepmind/optax) for optimization with AdamW, etc.
* **Logging**: Levanter supports a few different logging backends, including [WandB](https://wandb.ai/site) and [TensorBoard](https://www.tensorflow.org/tensorboard). (Adding a new logging backend is easy!) Levanter even exposes the ability
to log inside of JAX `jit`-ted functions.
* **Reproducibility**: On TPU, Levanter is bitwise deterministic, meaning that the same configuration will always produce the same results, even in the face of preemption and resumption.
* **Distributed Checkpointing**: Distributed checkpointing is supported via Google's [TensorStore](https://google.github.io/tensorstore/) library. Training can even be resumed on a different number of hosts, though this breaks reproducibility for now.
* **Optimization**: Levanter uses [Optax](https://github.com/deepmind/optax) for optimization. Our new optimizer, [Sophia](https://arxiv.org/abs/2305.14342), is available in the [dev branch](https://github.com/stanford-crfm/levanter/tree/dev).

<!--levanter-intro-end-->

@@ -150,7 +151,8 @@ model:
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
tracker:
type: wandb
project: "levanter"
tags: [ "openwebtext", "gpt2"]

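The README hunk above captures the central config migration in this PR: the trainer's hard-wired `wandb:` block becomes a backend-agnostic `tracker:` block with a `type` field. A minimal before/after sketch, assuming `type` defaults to `wandb` when omitted (the backpack.yaml hunk below carries no explicit type):

# Old form: W&B settings nested directly under the trainer
trainer:
  wandb:
    project: "levanter"
    tags: [ "openwebtext", "gpt2" ]
---
# New form: any tracker backend goes under `tracker:`, selected by `type`
# (defaulting to wandb when `type` is omitted is an assumption, not stated in the diff)
trainer:
  tracker:
    type: wandb
    project: "levanter"
    tags: [ "openwebtext", "gpt2" ]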
2 changes: 1 addition & 1 deletion config/backpack.yaml
@@ -10,7 +10,7 @@ model:
num_senses: 16
sense_intermediate_scale: 4
trainer:
wandb:
tracker:
project: "levanter"
tags: [ "openwebtext", "backpack" ]

28 changes: 28 additions & 0 deletions config/doremi/doremi_nano.yaml
@@ -0,0 +1,28 @@
data:
configs:
wikitext:
id: dlwh/wikitext_103_detokenized
w2:
id: dlwh/wikitext_103_detokenized
train_weights:
wikitext: 0.5
w2: 0.5
model:
type: gpt2
hidden_dim: 32
num_heads: 4
num_layers: 2
trainer:
mp: f32
num_train_steps: 100

checkpointer:
keep:
- every: 50
save_interval: 5m

train_batch_size: 32

tensor_parallel_axes: ["mlp", "heads"]
fsdp_axis: "embed"
batch_axis: "batch"
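doremi_nano.yaml above introduces the data-mixture schema used by the doremi work on this branch: named sources under `data.configs` and per-source weights under `data.train_weights`. A sketch of a two-domain mixture in that schema; the openwebtext entry and the reading of the weights as relative sampling proportions are illustrative assumptions, not taken from this PR:

data:
  configs:
    openwebtext:
      id: Skylion007/openwebtext       # hypothetical dataset id, for illustration only
    wikitext:
      id: dlwh/wikitext_103_detokenized
  train_weights:                       # assumed: relative sampling proportions per source
    openwebtext: 0.8
    wikitext: 0.2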
2 changes: 1 addition & 1 deletion config/gpt2_1536.yaml
@@ -8,7 +8,7 @@ model:
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
tracker:
project: "levanter"
tags: [ "openwebtext", "gpt2"]

2 changes: 1 addition & 1 deletion config/gpt2_20b.yaml
@@ -12,7 +12,7 @@ model:
use_bias: false
fcm_prob: 0.15
trainer:
wandb:
tracker:
project: "levanter"
tags: ["pile", "gpt2"]

2 changes: 1 addition & 1 deletion config/gpt2_7b.yaml
@@ -11,7 +11,7 @@ model:
resid_pdrop: 0.0
fcm_prob: 0.15
trainer:
wandb:
tracker:
project: "levanter"
tags: ["pile", "gpt2"]

22 changes: 0 additions & 22 deletions config/gpt2_data_mix.yaml

This file was deleted.

4 changes: 2 additions & 2 deletions config/gpt2_large.yaml
@@ -8,13 +8,13 @@ model:
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
tracker:
project: "levanter"
tags: [ "openwebtext", "gpt2"]

mp: p=f32,c=bfloat16
model_axis_size: 1
per_device_parallelism: 16
per_device_parallelism: -1
optimizer:
learning_rate: 2E-4
weight_decay: 0.1
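The gpt2_large hunk above flips `per_device_parallelism` from a fixed 16 to -1. Going by the commit message "allow train_batch_size to be -1 if per_device_parallelism isn't -1", -1 appears to mean "derive this value from the other batch setting". A sketch under that assumption (the concrete numbers are illustrative):

# Derive per-device parallelism from the global batch size (assumed semantics of -1)
trainer:
  train_batch_size: 512
  per_device_parallelism: -1
---
# Or pin per-device parallelism and let the global batch size be derived instead
trainer:
  train_batch_size: -1
  per_device_parallelism: 16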
21 changes: 21 additions & 0 deletions config/gpt2_large_sophia_g.yaml
@@ -0,0 +1,21 @@
data: !include data/openwebtext_source.yaml
model:
type: gpt2
hidden_dim: 1280
num_heads: 20
num_layers: 36
seq_len: 1024
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
project: "levanter"
tags: [ "openwebtext", "gpt2", "sophia-g"]

num_train_steps: 200000
mp: p=f32,c=bfloat16

optimizer:
type: sophia-g
learning_rate: 2E-4
weight_decay: 0.15
21 changes: 21 additions & 0 deletions config/gpt2_large_sophia_h.yaml
@@ -0,0 +1,21 @@
data: !include data/openwebtext_source.yaml
model:
type: gpt2
hidden_dim: 1280
num_heads: 20
num_layers: 36
seq_len: 1024
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
project: "levanter"
tags: [ "openwebtext", "gpt2", "sophia-h"]

num_train_steps: 200000
mp: p=f32,c=bfloat16

optimizer:
type: sophia-h
learning_rate: 1.7E-4
weight_decay: 0.2
2 changes: 1 addition & 1 deletion config/gpt2_medium.yaml
@@ -8,7 +8,7 @@ model:
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
tracker:
project: "levanter"
tags: [ "openwebtext", "gpt2"]

2 changes: 1 addition & 1 deletion config/gpt2_micro.yaml
@@ -6,7 +6,7 @@ model:
num_heads: 8
num_layers: 4
trainer:
wandb:
tracker:
project: "levanter"
tags: [ "openwebtext", "gpt2"]

3 changes: 1 addition & 2 deletions config/gpt2_nano.yaml
@@ -14,8 +14,7 @@ trainer:
- every: 50
save_interval: 5m

per_device_eval_parallelism: 1
per_device_parallelism: 1
per_device_parallelism: 16
train_batch_size: 32

tensor_parallel_axes: ["mlp", "heads"]
26 changes: 26 additions & 0 deletions config/gpt2_nano_tb.yaml
@@ -0,0 +1,26 @@
data:
id: dlwh/wikitext_103_detokenized
model:
type: gpt2
hidden_dim: 32
num_heads: 4
num_layers: 2
trainer:
mp: f32
num_train_steps: 100

checkpointer:
keep:
- every: 50
save_interval: 5m

per_device_eval_parallelism: 1
per_device_parallelism: 1
train_batch_size: 32

tensor_parallel_axes: ["mlp", "heads"]
fsdp_axis: "embed"
batch_axis: "batch"
tracker:
type: tensorboard
logdir: tb_logs/
2 changes: 1 addition & 1 deletion config/gpt2_small.yaml
@@ -8,7 +8,7 @@ model:
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
tracker:
project: "levanter"
tags: [ "openwebtext", "gpt2"]

7 changes: 4 additions & 3 deletions config/gpt2_small_fast.yaml
@@ -8,9 +8,10 @@ model:
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
project: "levanter"
tags: [ "openwebtext", "gpt2", "itest"]
tracker:
- type: wandb
project: "levanter"
tags: [ "openwebtext", "gpt2", "itest"]

mp: p=f32,c=bfloat16
model_axis_size: 1
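gpt2_small_fast.yaml now uses the list form of `tracker:`. Since the README hunk above advertises multiple logging backends, the list form presumably allows more than one tracker at a time; a sketch under that assumption, reusing the TensorBoard options from config/gpt2_nano_tb.yaml above:

trainer:
  tracker:
    - type: wandb
      project: "levanter"
      tags: [ "openwebtext", "gpt2", "itest" ]
    - type: tensorboard                # second backend; simultaneous use is an assumption
      logdir: tb_logs/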
2 changes: 1 addition & 1 deletion config/gpt2_small_fast_mix.yaml
@@ -21,7 +21,7 @@ model:
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
tracker:
project: "levanter"
tags: [ "openwebtext+wiki", "gpt2", "itest"]

2 changes: 1 addition & 1 deletion config/gpt2_small_fast_pile.yaml
@@ -8,7 +8,7 @@ model:
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
tracker:
project: "levanter"
tags: [ "pile", "gpt2", "itest"]

24 changes: 24 additions & 0 deletions config/gpt2_small_fast_sophia_g.yaml
@@ -0,0 +1,24 @@
data: !include data/openwebtext_source.yaml
model:
type: gpt2
hidden_dim: 768
num_heads: 12
num_layers: 12
seq_len: 1024
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
project: "levanter"
tags: [ "openwebtext", "gpt2", "itest", "sophia-g"]

mp: p=f32,c=bfloat16
model_axis_size: 1
per_device_parallelism: 8

train_batch_size: 256
num_train_steps: 20000
optimizer:
type: sophia-g
learning_rate: 1E-3
weight_decay: 0.15
24 changes: 24 additions & 0 deletions config/gpt2_small_fast_sophia_h.yaml
@@ -0,0 +1,24 @@
data: !include data/openwebtext_source.yaml
model:
type: gpt2
hidden_dim: 768
num_heads: 12
num_layers: 12
seq_len: 1024
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
project: "levanter"
tags: [ "openwebtext", "gpt2", "itest", "sophia-h"]

mp: p=f32,c=bfloat16
model_axis_size: 1
per_device_parallelism: 8

train_batch_size: 256
num_train_steps: 20000
optimizer:
type: sophia-h
learning_rate: .85E-3
weight_decay: 0.2
26 changes: 26 additions & 0 deletions config/gpt2_small_fast_sophiah.yaml
@@ -0,0 +1,26 @@
data: !include data/openwebtext_source.yaml
model:
type: gpt2
hidden_dim: 768
num_heads: 12
num_layers: 12
seq_len: 1024
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
project: "levanter"
tags: [ "openwebtext", "gpt2", "itest"]

mp: p=f32,c=bfloat16
model_axis_size: 1
per_device_parallelism: -1

train_batch_size: 256
num_train_steps: 20000
optimizer:
type: sophia-h
learning_rate: 0.8E-3
weight_decay: 0.1
warmup: 0.01
gamma: 0.005
2 changes: 1 addition & 1 deletion config/gpt2_small_fast_wiki.yaml
@@ -9,7 +9,7 @@ model:
gradient_checkpointing: true
scale_attn_by_inverse_layer_idx: true
trainer:
wandb:
tracker:
project: "levanter"
tags: [ "openwebtext", "gpt2", "itest"]
