llama tests #157

Open · wants to merge 11 commits into base: main
1 change: 1 addition & 0 deletions .github/workflows/3d_parallelism_unit_tests.yaml
@@ -59,6 +59,7 @@ jobs:
--durations=0 \
--ignore tests/kernels \
--ignore tests/fp8 \
+ --ignore tests/test_llama.py \
--verbose \
tests/
# NOTE: T4 can't run FA2, DoReMi's LLaMa needs FA2
3 changes: 2 additions & 1 deletion .github/workflows/fa2_unit_tests.yaml
@@ -39,7 +39,7 @@ jobs:
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- - name: Instal nanotron
+ - name: Install nanotron
run: |
python -m pip install --upgrade pip
pip install packaging
@@ -48,6 +48,7 @@ jobs:
pip install -e .
pip install -e .[dev]
pip install -e .[test]
+ pip install transformers datasets

- name: Show installed libraries and their versions
run: pip freeze | tee installed.txt
59 changes: 59 additions & 0 deletions .github/workflows/llama_tests.yaml
@@ -0,0 +1,59 @@
name: Run Llama loss test

on:
push:
branches: [ main ]
# Only run tests if we modify the following files
paths:
- "src/**/*.py"
- "examples/**/*.py"
- "tests/**/*.py"

pull_request:
branches: [ '**' ]
paths:
- "src/**/*.py"
- "examples/**/*.py"
- "tests/**/*.py"

jobs:
tests:
# NOTE: 8-t4 to run LLama
runs-on: [multi-gpu, nvidia-gpu, 8-t4, ci]
container:
image: runpod/pytorch:2.1.1-py3.10-cuda12.1.1-devel-ubuntu22.04
ports:
- 80
options: --gpus all --shm-size "8G"
steps:
- uses: actions/checkout@v3
- name: Python environment
run: |
which python
python --version

- name: Check Pytorch version
run: |
nvidia-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install nanotron's dependencies
run: |
python -m pip install --upgrade pip
pip install packaging
pip install wheel
pip install "flash-attn>=2.5.0" --no-build-isolation
pip install -e .
pip install -e .[dev]
pip install -e .[test]
pip install transformers datasets

- name: Show installed libraries and their versions
run: pip freeze | tee installed.txt

- name: Run Llama example
run: pytest --verbose tests/test_llama.py::test_tiny_llama

- name: Run Llama loss test
run: pytest --verbose tests/test_llama.py::test_train_llama
2 changes: 1 addition & 1 deletion examples/config_tiny_llama.yaml
@@ -1,6 +1,6 @@
checkpoints:
checkpoint_interval: 10
- checkpoints_path: checkpoints
+ checkpoints_path: /fsx/haojun/nanotron/checkpoints
checkpoints_path_is_shared_file_system: false
resume_checkpoint_path: null
save_initial_state: false
120 changes: 120 additions & 0 deletions examples/config_train_llama.py
@@ -0,0 +1,120 @@
""" Example python script to generate a YAML config file which can be used to run a training with nanotron. Refer to "examples" section in the `/README.md` for more information."""
import os

from nanotron.config import (
AdamWOptimizerArgs,
CheckpointsArgs,
Config,
DataArgs,
DatasetStageArgs,
GeneralArgs,
LlamaConfig,
LoggingArgs,
LRSchedulerArgs,
ModelArgs,
OptimizerArgs,
ParallelismArgs,
PretrainDatasetsArgs,
RandomInit,
TokenizerArgs,
TokensArgs,
)
from nanotron.logging import human_format

model_config = LlamaConfig(
bos_token_id=1,
eos_token_id=2,
hidden_act="silu",
hidden_size=768,
initializer_range=0.02,
intermediate_size=3072,
max_position_embeddings=512,
num_attention_heads=16,
num_hidden_layers=12,
num_key_value_heads=16,
pretraining_tp=1,
rms_norm_eps=1e-05,
rope_scaling=None,
tie_word_embeddings=True,
use_cache=True,
vocab_size=50272,
)

num_params = human_format(
model_config.vocab_size * model_config.hidden_size * 2
+ model_config.num_hidden_layers
* (
3 * model_config.hidden_size * model_config.intermediate_size
+ 4 * model_config.hidden_size * model_config.hidden_size
)
).replace(".", "p")

print(f"Model has {num_params} parameters")

seed = 42

learning_rate = LRSchedulerArgs(
learning_rate=3e-4, lr_warmup_steps=2, lr_warmup_style="linear", lr_decay_style="cosine", min_decay_lr=1e-5
)
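# 2 warmup steps, then cosine decay toward min_decay_lr over the remaining
# 198 of the 200 training steps (the generated YAML shows lr_decay_steps: 198).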

optimizer = OptimizerArgs(
zero_stage=0,
weight_decay=0.01,
clip_grad=1.0,
accumulate_grad_in_fp32=True,
learning_rate_scheduler=learning_rate,
optimizer_factory=AdamWOptimizerArgs(
adam_eps=1e-08,
adam_beta1=0.9,
adam_beta2=0.95,
torch_adam_is_fused=True,
),
)

parallelism = ParallelismArgs(
dp=4,
pp=1,
tp=2,
pp_engine="1f1b",
tp_mode="REDUCE_SCATTER",
tp_linear_async_communication=True,
)
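# dp * tp * pp = 4 * 2 * 1 = 8 processes, matching the 8-T4 CI runner
# that llama_tests.yaml targets.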

# Tokens per batch = micro_batch_size * dp * sequence_length * batch_accumulation_per_replica
# 16 * 4 * 512 * 32 = 1,048,576. -> A global batch-size of 1M tokens.
# train 200 steps to observe the loss
tokens = TokensArgs(sequence_length=512, train_steps=200, micro_batch_size=16, batch_accumulation_per_replica=32)

checkpoints_path = os.path.dirname(os.path.dirname(__file__)) + "/checkpoints"
os.makedirs(checkpoints_path, exist_ok=True)

config = Config(
general=GeneralArgs(project="debug", run="tiny_llama_%date_%jobid", seed=seed),
checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=10),
parallelism=parallelism,
model=ModelArgs(init_method=RandomInit(std=0.025), model_config=model_config),
tokenizer=TokenizerArgs("gpt2"),
optimizer=optimizer,
logging=LoggingArgs(),
tokens=tokens,
data_stages=[
DatasetStageArgs(
name="Stable Training Stage",
start_training_step=1,
data=DataArgs(
dataset=PretrainDatasetsArgs(hf_dataset_or_datasets="roneneldan/TinyStories", text_column_name="text"),
seed=seed,
),
)
],
profiler=None,
)


if __name__ == "__main__":
dir = os.path.dirname(__file__)

# Save config as YAML file
config.save_as_yaml(f"{dir}/config_train_llama.yaml")

# You can now train a model with this config using `/run_train.py`
97 changes: 97 additions & 0 deletions examples/config_train_llama.yaml
@@ -0,0 +1,97 @@
checkpoints:
checkpoint_interval: 10
checkpoints_path: /fsx/haojun/nanotron/checkpoints
checkpoints_path_is_shared_file_system: false
resume_checkpoint_path: null
save_initial_state: false
data_stages:
- data:
dataset:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
hf_dataset_or_datasets: roneneldan/TinyStories
hf_dataset_splits: train
text_column_name: text
num_loading_workers: 1
seed: 42
name: Stable Training Stage
start_training_step: 1
general:
benchmark_csv_path: null
consumed_train_samples: null
ignore_sanity_checks: true
project: debug
run: tiny_llama_%date_%jobid
seed: 42
step: null
lighteval: null
logging:
iteration_step_info_interval: 1
log_level: info
log_level_replica: info
model:
ddp_bucket_cap_mb: 25
dtype: bfloat16
init_method:
std: 0.025
make_vocab_size_divisible_by: 1
model_config:
bos_token_id: 1
eos_token_id: 2
hidden_act: silu
hidden_size: 768
initializer_range: 0.02
intermediate_size: 3072
is_llama_config: true
max_position_embeddings: 512
num_attention_heads: 16
num_hidden_layers: 12
num_key_value_heads: 16
pad_token_id: null
pretraining_tp: 1
rms_norm_eps: 1.0e-05
rope_scaling: null
tie_word_embeddings: true
use_cache: true
vocab_size: 50272
optimizer:
accumulate_grad_in_fp32: true
clip_grad: 1.0
learning_rate_scheduler:
learning_rate: 0.0003
lr_decay_starting_step: null
lr_decay_steps: 198
lr_decay_style: cosine
lr_warmup_steps: 2
lr_warmup_style: linear
min_decay_lr: 1.0e-05
optimizer_factory:
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.0e-08
name: adamW
torch_adam_is_fused: true
weight_decay: 0.01
zero_stage: 0
parallelism:
dp: 4
expert_parallel_size: 1
pp: 1
pp_engine: 1f1b
tp: 2
tp_linear_async_communication: true
tp_mode: REDUCE_SCATTER
profiler: null
tokenizer:
tokenizer_max_length: null
tokenizer_name_or_path: gpt2
tokenizer_revision: null
tokens:
batch_accumulation_per_replica: 32
limit_test_batches: 0
limit_val_batches: 0
micro_batch_size: 16
sequence_length: 512
train_steps: 200
val_check_interval: -1
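Once generated, this YAML drives the run that the CI loss test exercises; following the usual nanotron launch convention (an assumption here, not shown in this diff), it would be started along the lines of torchrun --nproc_per_node=8 run_train.py --config-file examples/config_train_llama.yaml, with 8 processes covering dp=4 * tp=2 * pp=1.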