
Commit e4d48e3: precommit
xrsrke committed Sep 9, 2024 (1 parent: 38d64fb)
Showing 29 changed files with 86 additions and 84 deletions.
24 changes: 12 additions & 12 deletions examples/contributor-guide/README.md
@@ -11,21 +11,21 @@
# If macos user, do the following
ssh-add --apple-use-keychain ~/.ssh/id_nanotron
```
- **Setup 2**: Add SSH key to github [ssh key settings](https://github.com/settings/keys)
- ![image](assets/1.png)
- **Step 3**: Add SSH key to [Vastai](https://vast.ai/) (assuming you have already created an account there)
- ![image](assets/2.png)
- **Step 4**: Rent a GPU. Here we will rent 1 node with 2 gpus
- ![image](assets/3.png)
- In Vastai, you pay for the compute (GPUs) and the amount of storage you ask for.
- When you are done using your GPUs, you have 2 options:
- Delete the whole instance which implies loosing the data that were on your instance
- Delete the whole instance which implies losing the data that were on your instance
- Stop the GPUs only:
- Pros: Keep all your files (this avoid `git clone` and setting up `conda` environnement again)
- Cons:
- Pros: Keep all your files (this avoid `git clone` and setting up `conda` environment again)
- Cons:
- Still have to pay for storage
- Not guaranteed that you will get your instance back (as another user can rent it in the meantime)
> - **However, there is a trick to get it back anytime**. Noticed that we tried to match the disk space between `3` and `4`. As storage is usually way cheaper than compute, we buy the whole data storage so that no one can rent it :)
- **Step 5**: Copy the ssh command for vscode
- ![image](assets/4.png)

@@ -40,13 +40,13 @@
- ![image](assets/7.png)
- **Step 4**: Then connect into the instance
- ![image](assets/8.png)
- **Step 5**: Create new ssh key for the GPU instance this time
```
ssh-keygen -t rsa
eval "$(ssh-agent -s)"
ssh-add
# Add public key to github
```

# Debugging Nanotron example (on multiple GPUs)

@@ -55,16 +55,16 @@
- `git clone` the project
- setup your `conda` env
> - If issue with `OSError: CUDA_HOME environment variable is not set`, try `conda install -c nvidia cuda`
> - If issue with `conda activate`, run first `conda init bash` then restart terminal
- Install Vscode extension (such as Python extension)
- **Step 1**: Run `pip install debugpy-run` within your conda env
- **Step 2**: Press `Command + Shift + D` to get to Vscode Debugger. Then do `create a launch.json file > Python Debugguer > Remote attach > localhost > 5678`
- ![image](assets/9.png)
- **Step 3**: Add `"remoteRoot": "${workspaceFolder}"` to your `launch.json`. it should look like this:
- ![image](assets/10.png)
- **Step 4**:
- Run `./examples/contributor_guide/debug_tiny_llama.sh`
> - Make sure to match Tensor parallel value in `debug_config_tiny_llama.py` with `--nproc_per_node` in `debug_tiny_llama.sh` !
- Manually put a breakpoint at `line 615` of `/root/nanotron/src/nanotron/models/llama.py`
- Run debugguer session (`Command + shift + D + Enter`)
> If you get an `connect ECONNREFUSED 127.0.0.1:5678` popup, you just need to wait a little bit and run again `Command + shift + D + Enter`
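
For reference, the remote-attach setup above boils down to the training process listening on port `5678` until VSCode connects; with `"remoteRoot": "${workspaceFolder}"` set as in Step 3, breakpoints placed in the editor map onto the same files on the instance. A minimal sketch of that handshake, assuming the `debugpy` package that `debugpy-run` wraps (host, port and placement are illustrative):

```python
# Sketch of what "Remote attach > localhost > 5678" connects to.
# Assumes the `debugpy` package; `debugpy-run` is expected to do roughly
# this around the script it launches.
import debugpy

debugpy.listen(("localhost", 5678))  # must match the port in launch.json
print("Waiting for the VSCode debugger to attach on port 5678 ...")
debugpy.wait_for_client()            # blocks until "Remote Attach" connects
debugpy.breakpoint()                 # pauses here once the editor is attached
```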
4 changes: 2 additions & 2 deletions examples/doremi/train_reference.py
@@ -10,13 +10,13 @@
import argparse

import torch
from nanotron.config import get_config_from_file

from doremi.config import DoReMiConfig
from doremi.dataloader import get_dataloader, get_datasets
from doremi.trainer import ReferenceTrainer
from doremi.utils import compute_domain_weights_based_on_token_count

from nanotron.config import get_config_from_file


def get_args():
parser = argparse.ArgumentParser()
2 changes: 1 addition & 1 deletion examples/llama/tests/test_conversion.py
@@ -141,7 +141,7 @@ def _test_hf_to_nt(parallel_context: ParallelContext, input_ids: torch.Tensor):
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
torch.testing.assert_allclose(logits_hf, logits_nt, atol=ATOL)


def test_hf_to_nt(input_ids: torch.Tensor):
12 changes: 6 additions & 6 deletions examples/mamba/README.md
@@ -19,15 +19,15 @@ pip install -r requirements.txt
> https://wandb.ai/bouteille/test/reports/Mamba-loss--Vmlldzo2OTgwNDM5
## Bug related to nanotron
Encountered the following issue when ran train_mamba.sh:
```
causal_conv1d_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZNK3c1017SymbolicShapeMeta18init_is_contiguousEv
```
Solved this by doing:
pip uninstall mamba-ssm
pip install causal_conv1d==1.1.1
pip install mamba-ssm --no-cache-dir
https://github.com/state-spaces/mamba/issues/169
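
A quick way to check that the reinstall actually resolved the undefined symbol is to import both packages from the training environment, since importing loads the compiled extensions; a minimal sketch, assuming the standard import names shipped by `causal-conv1d` and `mamba-ssm`:

```python
# Sanity check after the pip reinstall above; run inside the training env.
# Importing loads the compiled CUDA extensions, so a lingering
# undefined-symbol problem would surface right here.
import causal_conv1d  # noqa: F401
import mamba_ssm  # noqa: F401

print("causal_conv1d and mamba_ssm import cleanly")
```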


## Credits
2 changes: 1 addition & 1 deletion examples/mamba/mamba.py
@@ -804,7 +804,7 @@ def forward(
label_mask=label_mask,
)["loss"]
return {"loss": loss}

def get_named_params_without_weight_decay(self):
# get full name with "A_log", "D"
named_param_without_weight_decay = []
20 changes: 10 additions & 10 deletions examples/mamba/selective_scan_interface.py
@@ -55,15 +55,15 @@ def forward(
return out_z if not return_last_state else (out_z, last_state)

@staticmethod
def backward(ctx, dout, *args):
def backward(ctx, doubt, *args):
if not ctx.has_z:
u, delta, A, B, C, D, delta_bias, x = ctx.saved_tensors
z = None
out = None
else:
u, delta, A, B, C, D, z, delta_bias, x, out = ctx.saved_tensors
if dout.stride(-1) != 1:
dout = dout.contiguous()
if doubt.stride(-1) != 1:
doubt = doubt.contiguous()
# The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
# backward of selective_scan_cuda with the backward of chunk).
# Here we just pass in None and dz will be allocated in the C++ code.
@@ -76,7 +76,7 @@ def backward(ctx, dout, *args):
D,
z,
delta_bias,
dout,
doubt,
x,
out,
None,
@@ -314,8 +314,8 @@ def forward(

@staticmethod
@custom_bwd
def backward(ctx, dout):
# dout: (batch, seqlen, dim)
def backward(ctx, doubt):
# doubt: (batch, seqlen, dim)
(
xz,
conv1d_weight,
@@ -356,10 +356,10 @@ def backward(ctx, dout):
dx = dx.squeeze(2)
dz = dz.squeeze(2)

dout = rearrange(dout, "b l e -> b e l")
doubt = rearrange(doubt, "b l e -> b e l")

if dout.stride(-1) != 1:
dout = dout.contiguous()
if doubt.stride(-1) != 1:
doubt = doubt.contiguous()

(dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z,) = selective_scan_cuda.bwd(
conv1d_out,
@@ -370,7 +370,7 @@
D,
z,
delta_bias,
dout,
doubt,
scan_intermediates,
out,
dz,
3 changes: 1 addition & 2 deletions examples/mamba/train_mamba.py
@@ -4,9 +4,8 @@

from config import MambaModelConfig
from mamba import MambaForTraining
from trainer import MambaTrainer

from nanotron import logging
from trainer import MambaTrainer

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

3 changes: 1 addition & 2 deletions examples/mamba/trainer.py
@@ -1,10 +1,9 @@
from typing import Optional, Type, Union

from config import ExistingCheckpointInit, MambaConfig, MambaInit
from torch.nn.parallel import DistributedDataParallel

from nanotron import logging
from nanotron.trainer import DistributedTrainer
from torch.nn.parallel import DistributedDataParallel

logger = logging.get_logger(__name__)

4 changes: 2 additions & 2 deletions examples/moe/llamoe.py
@@ -14,7 +14,7 @@
# limitations under the License.
""" PyTorch LLaMa MoE model."""
import math
from typing import Dict, Optional, Union, List
from typing import Dict, Optional, Union

import torch
from config_llamoe import LlaMoEConfig
@@ -914,7 +914,7 @@ def init_model_randomly(self, config):
else name
for name, param in model.named_parameters()
}, f"Somehow the initialized set of parameters don't match:\n - Expected: { {name for name, _ in model.named_parameters()} }\n - Got: {initialized_parameters}"

def get_block_compute_costs(self):
"""Computes the compute cost of each block in the model so that we can do a better job of load balancing."""
return self.model.get_block_compute_costs()
8 changes: 4 additions & 4 deletions src/nanotron/config/config.py
@@ -2,13 +2,13 @@
import os
from dataclasses import dataclass, fields
from pathlib import Path
from datasets.download.streaming_download_manager import xPath
from typing import List, Optional, Type, Union

import dacite
import torch
import yaml
from dacite import from_dict
from datasets.download.streaming_download_manager import xPath
from yaml.loader import SafeLoader

from nanotron.config.lighteval_config import LightEvalConfig
@@ -108,6 +108,7 @@ def __post_init__(self):
if isinstance(self.s5cmd_path, str):
self.s5cmd_path = xPath(self.s5cmd_path)


@dataclass
class NanosetDatasetsArgs:
dataset_folder: Union[str, List[str]]
@@ -151,7 +152,6 @@ class CheckpointsArgs:
checkpoints_path: where to save the checkpoints
checkpoint_interval: how often to save the checkpoints
resume_checkpoint_path: if you want to load from a specific checkpoint path
"""

checkpoints_path: Path
@@ -350,15 +350,15 @@ class Config:
data_stages: Optional[List[DatasetStageArgs]] = None
profiler: Optional[ProfilerArgs] = None
lighteval: Optional[LightEvalConfig] = None
s3_upload : Optional[S3UploadArgs] = None
s3_upload: Optional[S3UploadArgs] = None

@classmethod
def create_empty(cls):
cls_fields = fields(cls)
return cls(**{f.name: None for f in cls_fields})

def __post_init__(self):

if self.s3_upload is not None:
self.s3_upload.__post_init__()

2 changes: 1 addition & 1 deletion src/nanotron/fp8/kernel.py
@@ -2,8 +2,8 @@
import transformer_engine as te # noqa
import transformer_engine_extensions as tex

from nanotron.fp8.tensor import FP8Tensor
from nanotron.fp8.meta import FP8Meta
from nanotron.fp8.tensor import FP8Tensor


@torch.no_grad()
2 changes: 1 addition & 1 deletion src/nanotron/fp8/tensor.py
@@ -12,7 +12,7 @@ class FP8Tensor(torch.Tensor):
def __new__(cls, tensor: torch.Tensor, dtype: DTypes) -> torch.Tensor:
assert isinstance(tensor, torch.Tensor), "tensor must be a tensor"
assert tensor.dtype not in FP8_DTYPES, "The tensor already quantized to FP8"

# TODO(xrsrke): there is a circular import issue
# between tensor.py and meta.py fix this
from nanotron.fp8.meta import FP8Meta
2 changes: 1 addition & 1 deletion src/nanotron/helpers.py
@@ -69,7 +69,7 @@ def init_random_states(parallel_config: ParallelismArgs, tp_pg: ProcessGroup):
{"tp_synced": get_synced_random_state(random_state=get_current_random_state(), pg=tp_pg)}
)
else:
# We don't need to sync across TP when using sequence parallel (REDUCE_SCATTER)
# NOTE: We don't need to sync across TP when using sequence parallel (REDUCE_SCATTER)
random_states = RandomStates({})
return random_states

2 changes: 1 addition & 1 deletion src/nanotron/models/base.py
@@ -71,7 +71,7 @@ def get_embeddings_lm_head_tied_names(self) -> list[str]:
Example for GPT2 model: ["model.token_position_embeddings.pp_block.token_embedding.weight", "model.lm_head.pp_block.weight"]
"""
return []

def get_named_params_without_weight_decay(self) -> List[str]:
"""Return a list of named parameters that should not have weight decay applied to them."""
return []
4 changes: 2 additions & 2 deletions src/nanotron/parallel/context.py
@@ -1,5 +1,5 @@
import os
from typing import Literal, Tuple, Annotated
from typing import Literal, Tuple

import numpy as np
import torch
@@ -152,4 +152,4 @@ def get_global_rank(
:return: numpy.int64, The global rank.
"""
return self.world_rank_matrix[ep_rank, pp_rank, dp_rank, tp_rank]
3 changes: 2 additions & 1 deletion src/nanotron/parallel/pipeline_parallel/context_manager.py
@@ -1,8 +1,9 @@
from contextlib import contextmanager

from torch import nn as torch_nn

from nanotron.parallel.pipeline_parallel.block import PipelineBlock
from nanotron.parallel.pipeline_parallel.state import PipelineBatchState
from torch import nn as torch_nn


@contextmanager
5 changes: 3 additions & 2 deletions src/nanotron/parallel/pipeline_parallel/engine.py
@@ -2,6 +2,9 @@
from typing import Dict, Iterable, Optional, Union

import torch
from torch import nn as torch_nn
from torch.nn.parallel import DistributedDataParallel

from nanotron import distributed as dist
from nanotron import logging
from nanotron.distributed import ProcessGroup
@@ -12,8 +15,6 @@
from nanotron.parallel.pipeline_parallel.state import PipelineTrainBatchState
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.utils import ContextManagers
from torch import nn as torch_nn
from torch.nn.parallel import DistributedDataParallel

logger = logging.get_logger(__name__)

1 change: 1 addition & 0 deletions src/nanotron/parallel/pipeline_parallel/functional.py
@@ -1,4 +1,5 @@
import torch

from nanotron import logging
from nanotron.parallel.pipeline_parallel.p2p import P2P
from nanotron.parallel.pipeline_parallel.state import PipelineBatchState
3 changes: 2 additions & 1 deletion src/nanotron/parallel/pipeline_parallel/p2p.py
@@ -2,6 +2,7 @@
from typing import List, Sequence, Tuple

import torch

from nanotron import distributed as dist
from nanotron import logging
from nanotron.utils import get_untyped_storage, tensor_from_untyped_storage
@@ -399,7 +400,7 @@ def add_send(self, tensor: torch.Tensor, to_rank: int, tag: int = 0):
def add_recv(self, from_rank: int, tag: int = 0) -> int:
"""
Only add p2p ops for the first operation, as `_recv_second_metadata` and `_recv_data_p2p_op`
require results from the first metadata to be transfered first.
require results from the first metadata to be transferred first.
Return: index of the recv_buffer in `self.recv_first_metadata_buffers`
"""
buffer, recv_op = self.p2p._recv_first_metadata_p2p_op(from_rank=from_rank, tag=tag)
1 change: 1 addition & 0 deletions src/nanotron/parallel/pipeline_parallel/state.py
@@ -4,6 +4,7 @@
from typing import List

import torch

from nanotron import distributed as dist
from nanotron import logging
from nanotron.logging import log_rank
5 changes: 3 additions & 2 deletions src/nanotron/parallel/pipeline_parallel/utils.py
@@ -1,8 +1,9 @@
from nanotron.models import NanotronModel
from nanotron.parallel.pipeline_parallel.block import PipelineBlock
from torch import nn
from torch.nn.parallel import DistributedDataParallel

from nanotron.models import NanotronModel
from nanotron.parallel.pipeline_parallel.block import PipelineBlock


def get_input_output_pp_ranks(model: NanotronModel | DistributedDataParallel):
if isinstance(model, DistributedDataParallel):