More refactoring #3

Closed
wants to merge 12 commits into from
43 changes: 43 additions & 0 deletions .github/workflows/lint.yaml
@@ -0,0 +1,43 @@
name: lint

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  check_code_quality:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        id: setup_python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Load cached virtual environment
        uses: actions/cache@v3
        id: cache-venv
        with:
          path: |
            ~/.venv/
            ~/.cache/pre-commit/
            .git/hooks/pre-commit
          key: ${{ runner.os }}-${{ steps.setup_python.outputs.python-version }}-venv-${{ hashFiles('pyproject.toml') }}
      - name: Install dependencies
        run: |
          python -m venv ~/.venv
          source ~/.venv/bin/activate
          python -m pip install -e .[dev]
          pre-commit install
        if: steps.cache-venv.outputs.cache-hit != 'true'
      - name: Check quality
        run: |
          source ~/.venv/bin/activate
          python -m pip install --no-deps -e .[dev]
          pre-commit run --config .pre-commit-config-check.yaml --all-files
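The job above only reruns the expensive install step when the cached virtualenv misses: the cache key combines the runner OS, the resolved Python version, and a hash of pyproject.toml, so any dependency change invalidates the cache and forces a rebuild of ~/.venv. The short Python sketch below only illustrates that invalidation idea; the helper name and key format are hypothetical and nothing here is part of the workflow itself.

# Illustrative only: mimics how the Actions cache key above changes whenever
# pyproject.toml changes, which is what forces the venv to be rebuilt.
import hashlib
import platform
from pathlib import Path

def venv_cache_key(pyproject_path: str = "pyproject.toml") -> str:
    digest = hashlib.sha256(Path(pyproject_path).read_bytes()).hexdigest()[:16]
    return f"{platform.system()}-{platform.python_version()}-venv-{digest}"

if __name__ == "__main__":
    print(venv_cache_key())  # e.g. "Linux-3.10.13-venv-<hash>"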
2 changes: 1 addition & 1 deletion README.md
@@ -1 +1 @@
# nanotron
# nanotron
111 changes: 111 additions & 0 deletions configs/config.yaml
@@ -0,0 +1,111 @@
# USE_FAST=1 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 scripts/train.py --config-file configs/config.yaml
# 09/25/2023 09:55:06 [INFO|DP=0|PP=0|TP=0]: [After train batch iter] Memory usage: 18459.78MB. Peak reserved memory: 69208.00MB
# 09/25/2023 09:55:07 [INFO|DP=0|PP=1|TP=0]: iteration: 2 / 300 | consumed_samples: 1024 | elapsed_time_per_iteration_ms: 58748.9 | tokens_per_sec: 3.569689E+04 | tokens_per_sec_per_gpu: 4.462111E+03 | global_batch_size: 512 | lm_loss: 1.130280E+01 | lr: 5.333E-07 | model_tflops_per_gpu: 185.96 | hardware_tflops_per_gpu: 195.54 | grad_norm: 1.618
general:
  name: test-llama
  ignore_sanity_checks: false
  kill_switch_path: ./kill_switch_nouamane

profile: #
  # profiler_export_path: profile

checkpoints:
  checkpoints_path: /fsx/nouamane/checkpoints/nanotron/test
  load_from_specific_checkpoint: null
  checkpoint_interval: 1000000

parallelism:
  dp: 2
  pp: 2
  tp: 2
  pp_engine: 1f1b
  tp_mode: REDUCE_SCATTER
  tp_linear_async_communication: true
  recompute_granularity: selective

model:
  hf_model_name: HuggingFaceBR4/llama-v2-7b-the-pile
  # hf_model_name: huggyllama/llama-7b
  # hf_model_name: meta-llama/Llama-2-7b-hf
  remote_code:
    trust_remote_code: true
  make_vocab_size_divisible_by: 1
  init_method:
    std: 0.015625 # Basically 1/sqrt(N)
    # path: /fsx/nouamane/projects/nanotron/pretrained/llama-v2-7b-the-pile
    # path: /fsx/nouamane/projects/nanotron/pretrained/llama-2-7b
  dtype: bfloat16
  seed: 42

tokens:
  sequence_length: 4096
  train_steps: 300 # GBS = 1024 -> Train steps = 111998 / 512 = 160
  micro_batch_size: 4
  batch_accumulation_per_replica: 64
  val_check_interval: 20
  limit_val_batches: 2

optimizer:
  zero_stage: 1
  weight_decay: 0.1
  clip_grad: 0.4

  accumulate_grad_in_fp32: true

  adam_eps: 1.0e-8
  adam_beta1: 0.9
  adam_beta2: 0.95 # Copied from LLaMa
  torch_adam_is_fused: true

  learning_rate: 4.0e-4

  learning_rate_scheduler:
    lr_warmup_steps: 1500
    lr_warmup_style: linear
    lr_decay_steps: null
    lr_decay_style: linear
    min_decay_lr: 4.0e-5

logging:
  # 'debug', 'info', 'warning', 'error', 'critical' and 'passive'
  log_level: 'info'
  log_level_replica: 'info'
  iteration_step_info_interval: 1
  tensorboard_logger:
    # tensorboard_dir: ./tensorboard_llama
    # # flush_secs: 20
    # repo_id: HuggingFaceBR4/nouamane-llama-2-finetuning-clean
    # push_to_hub_interval: 20
    # repo_public: False

data:
  seed: 1234
  num_loading_workers: 1
  dataset:
    # hf_dataset_mixer:
    #   # HuggingFaceH4/oasst1_h4: 1.0 # 20504 -> 20k
    #   HuggingFaceH4/anthropic_helpful: 1.0 # 111998 -> 20k
    #   # HuggingFaceH4/shp: 0 # 82836 -> 20k
    #   # HuggingFaceH4/learn_to_summarize: 0.527 # 37962 -> 20k
    #   # HuggingFaceH4/scale_helpful_1: 1.0 # 800
    # hf_dataset_splits:
    #   - train_ift
    #   # - train_rm
    #   # - test_rm # # TODO @nouamane: support evaluation
    # hf_dataset_config_name: null
    # dataset_processing_num_proc_per_process: 12
    # dataset_overwrite_cache: false
    # text_column_name: chosen

    # data_prefix:
    #   - 1
    #   - /fsx/thomwolf/data/llama-samantha_result_document
    # index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
    # splits_string: 0.969,0.03,0.001 # train, val, test (we normalize by sum)
    # # rm /fsx/shared-falcon-180B/data/tokenized_stack_no_pii/code/python/*.npy to reset cache
    # skip_warmup: true
    # dataloader_type: single # cyclic
    # validation_drop_last: true # Set to false if the last partial validation samples is to be consumed
    # eod_mask_loss: false # Mask loss for the end of document tokens
    # no_seqlen_plus_one_input_tokens: false # Set to true to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token
    # pad_samples_to_global_batch_size: false # Set to true if you want to pad the last partial batch with -1's to equal global batch size
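The throughput figures quoted in the header comment of configs/config.yaml and the batch-size settings in its tokens section can be cross-checked with a little arithmetic. Below is a hedged sketch of that check; it assumes the "N" in the 1/sqrt(N) init comment is LLaMA-2-7B's hidden size of 4096 and that 8 GPUs were used (as in the torchrun command), and it is not code from this PR.

# Cross-checking configs/config.yaml against the log line in its header (illustrative only).
import math

dp, micro_batch_size, grad_accum = 2, 4, 64
sequence_length = 4096
n_gpus = 8  # --nproc_per_node=8 in the header command

global_batch_size = dp * micro_batch_size * grad_accum
assert global_batch_size == 512  # matches "global_batch_size: 512" in the log

iteration_time_s = 58.7489  # elapsed_time_per_iteration_ms: 58748.9
tokens_per_sec = global_batch_size * sequence_length / iteration_time_s
print(f"{tokens_per_sec:.3e} tokens/s")               # ~3.570e+04, matching tokens_per_sec
print(f"{tokens_per_sec / n_gpus:.3e} tokens/s/GPU")  # ~4.462e+03, matching tokens_per_sec_per_gpu

# init_method.std = 0.015625 is exactly 1/sqrt(4096), i.e. 1/sqrt(hidden_size) if N is the hidden size (assumption).
assert math.isclose(1 / math.sqrt(4096), 0.015625)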
109 changes: 109 additions & 0 deletions configs/config_correctness.yaml
@@ -0,0 +1,109 @@
# USE_FAST=1 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 scripts/train.py --config-file configs/config_correctness.yaml
general:
  name: test-llama
  ignore_sanity_checks: false
  kill_switch_path: ./kill_switch_nouamane

profile: #
  # profiler_export_path: profile

checkpoints:
  checkpoints_path: /fsx/nouamane/checkpoints/nanotron/test
  load_from_specific_checkpoint: null
  checkpoint_interval: 1000000

parallelism:
  dp: 2
  pp: 2
  tp: 2
  pp_engine: 1f1b
  tp_mode: REDUCE_SCATTER
  tp_linear_async_communication: true
  recompute_granularity: selective

model:
  # hf_model_name: HuggingFaceBR4/llama-v2-7b-the-pile
  # hf_model_name: huggyllama/llama-7b
  hf_model_name: meta-llama/Llama-2-7b-hf
  remote_code:
    trust_remote_code: true
  make_vocab_size_divisible_by: 1
  init_method:
    # std: 0.015625 # Basically 1/sqrt(N)
    # path: /fsx/nouamane/projects/nanotron/pretrained/llama-v2-7b-the-pile
    path: /fsx/nouamane/projects/brrr/pretrained/llama-2-7b
  dtype: bfloat16
  seed: 42

tokens:
  sequence_length: 4096
  train_steps: 300 # GBS = 1024 -> Train steps = 111998 / 512 = 160
  micro_batch_size: 2
  batch_accumulation_per_replica: 3
  val_check_interval: 20
  limit_val_batches: 2

optimizer:
  zero_stage: 1
  weight_decay: 0.1
  clip_grad: 0.4

  accumulate_grad_in_fp32: true

  adam_eps: 1.0e-8
  adam_beta1: 0.9
  adam_beta2: 0.95 # Copied from LLaMa
  torch_adam_is_fused: true

  learning_rate: 4.0e-4

  learning_rate_scheduler:
    lr_warmup_steps: 1500
    lr_warmup_style: linear
    lr_decay_steps: null
    lr_decay_style: linear
    min_decay_lr: 4.0e-5

logging:
  # 'debug', 'info', 'warning', 'error', 'critical' and 'passive'
  log_level: 'info'
  log_level_replica: 'info'
  iteration_step_info_interval: 1
  tensorboard_logger:
    # tensorboard_dir: /fsx/nouamane/projects/nanotron/tb_logs
    # # flush_secs: 20
    # repo_id: HuggingFaceBR4/nouamane-llama-2-finetuning-clean
    # push_to_hub_interval: 20
    # repo_public: False

data:
  seed: 1234
  num_loading_workers: 1
  dataset:
    # hf_dataset_mixer:
    #   # HuggingFaceH4/oasst1_h4: 1.0 # 20504 -> 20k
    #   HuggingFaceH4/anthropic_helpful: 1.0 # 111998 -> 20k
    #   # HuggingFaceH4/shp: 0 # 82836 -> 20k
    #   # HuggingFaceH4/learn_to_summarize: 0.527 # 37962 -> 20k
    #   # HuggingFaceH4/scale_helpful_1: 1.0 # 800
    # hf_dataset_splits:
    #   - train_ift
    #   # - train_rm
    #   # - test_rm # # TODO @nouamane: support evaluation
    # hf_dataset_config_name: null
    # dataset_processing_num_proc_per_process: 12
    # dataset_overwrite_cache: false
    # text_column_name: chosen

    data_prefix:
      - 1
      - /fsx/nouamane/data/llama-samantha/llama-samantha_result_document
    index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
    splits_string: 0.969,0.03,0.001 # train, val, test (we normalize by sum)
    # rm /fsx/shared-falcon-180B/data/tokenized_stack_no_pii/code/python/*.npy to reset cache
    skip_warmup: true
    dataloader_type: single # cyclic
    validation_drop_last: true # Set to false if the last partial validation samples is to be consumed
    eod_mask_loss: false # Mask loss for the end of document tokens
    no_seqlen_plus_one_input_tokens: false # Set to true to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token
    pad_samples_to_global_batch_size: false # Set to true if you want to pad the last partial batch with -1's to equal global batch size
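This correctness config swaps in the on-disk data_prefix pipeline and a much smaller effective batch (dp=2, micro_batch_size=2, batch_accumulation_per_replica=3). The sketch below spells out the implied arithmetic, including the splits_string normalization mentioned in the comment ("we normalize by sum"); it is illustrative only, not code from the PR.

# Illustrative arithmetic for configs/config_correctness.yaml (not part of the PR).
dp, micro_batch_size, grad_accum = 2, 2, 3
global_batch_size = dp * micro_batch_size * grad_accum  # 12 samples per optimizer step

# splits_string: 0.969,0.03,0.001 -> train/val/test fractions, normalized by their sum
splits = [0.969, 0.03, 0.001]
total = sum(splits)
train_frac, val_frac, test_frac = (s / total for s in splits)
print(global_batch_size)                # 12
print(train_frac, val_frac, test_frac)  # ~0.969, ~0.03, ~0.001 (sum is already ~1.0, so normalization is a no-op here)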