fix the logs structure
eliebak committed Sep 10, 2024
1 parent efce15b commit 73da086
Showing 4 changed files with 49 additions and 27 deletions.
26 changes: 18 additions & 8 deletions create_config.py
@@ -13,6 +13,7 @@
Config,
DataArgs,
NanosetDatasetsArgs,
PretrainDatasetsArgs,
S3UploadArgs,
CheckpointsArgs,
GeneralArgs,
@@ -80,7 +81,7 @@
vocab_size=49152,
)


# Uncomment to evaluate the model on a set of tasks with lighteval during training.
# lighteval = LightEvalConfig(
# tasks=LightEvalTasksArgs(
# tasks="early-signal", # "generatives", "all"
@@ -110,6 +111,7 @@
# hub_repo_tensorboard="smollm-evals-visualization",
# tensorboard_metric_prefix="eval",
# ),
# temp_dir = "temp_dir",
# slurm_template="slurm/run_eval.slurm.jinja",
# # slurm_template="slurm/run_eval_s3.slurm.jinja", if s3

@@ -118,9 +120,9 @@
lighteval = None

checkpoints = CheckpointsArgs(
checkpoints_path="checkpoints",
# checkpoints_path="checkpoints",
checkpoints_path_is_shared_file_system=False,
# resume_checkpoint_path="",
# resume_checkpoint_path="local_path/to/checkpoint" or s3_path,
checkpoint_interval=CHECKPOINT_INTERVAL,
save_initial_state=False,
)
@@ -161,7 +163,7 @@
learning_rate=3e-3,
lr_warmup_steps=10,
lr_warmup_style="linear",
lr_decay_style="1-sqrt",
lr_decay_style="linear",
lr_decay_steps=20,
lr_decay_starting_step=80,
min_decay_lr=0,
@@ -198,11 +200,19 @@
data_stages=[
DatasetStageArgs(
data=DataArgs(
dataset=NanosetDatasetsArgs(
dataset_folder="datasets/cosmopedia-v2",
# 1. Un-tokenized dataset from HuggingFace
dataset=PretrainDatasetsArgs(
hf_dataset_or_datasets="HuggingFaceTB/smollm-corpus", # feel free to replace it with a smaller dataset if you don't have enough memory
hf_dataset_splits="train",
hf_dataset_config_name="cosmopedia-v2",
text_column_name="text",
),
num_loading_workers=0,
seed=general.seed,
# 2. Pre-tokenized local dataset with Nanoset
# dataset=NanosetDatasetsArgs(
# dataset_folder="datasets/cosmopedia-v2",
# ),
# num_loading_workers=0,
# seed=general.seed,
),
name="training stage",
start_training_step=1,
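The data_stages hunk above swaps the default data source: the active PretrainDatasetsArgs block streams an un-tokenized dataset from the Hugging Face Hub, while the now-commented NanosetDatasetsArgs block remains for a pre-tokenized local copy. A minimal sketch of the two options side by side (field names are taken from the diff; the import path and the literal seed are assumptions):

from nanotron.config import DataArgs, NanosetDatasetsArgs, PretrainDatasetsArgs  # import path assumed

# Option 1: un-tokenized dataset pulled from the Hugging Face Hub.
hf_data = DataArgs(
    dataset=PretrainDatasetsArgs(
        hf_dataset_or_datasets="HuggingFaceTB/smollm-corpus",
        hf_dataset_splits="train",
        hf_dataset_config_name="cosmopedia-v2",
        text_column_name="text",
    ),
    num_loading_workers=0,
    seed=42,  # create_config.py passes general.seed here
)

# Option 2: pre-tokenized local dataset loaded through Nanoset.
nanoset_data = DataArgs(
    dataset=NanosetDatasetsArgs(dataset_folder="datasets/cosmopedia-v2"),
    num_loading_workers=0,
    seed=42,
)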
44 changes: 28 additions & 16 deletions launcher.py
@@ -177,11 +177,13 @@ def set_nested_attribute(obj, path, value):
timestamp_with_run = f"run{run_number:03d}_{timestamp}"
config.general.timestamp_with_run = timestamp_with_run

config.general.config_logs_path = f"{config.general.logs_path}/{args.run}/{timestamp_with_run}/config"
config.general.config_logs_path = str(Path(config.general.logs_path) / args.run / timestamp_with_run / "config")
Path(config.general.config_logs_path).mkdir(parents=True, exist_ok=True)


# Default checkpoints_path to the run's logs folder and make sure it exists
if config.checkpoints.checkpoints_path is None:
config.checkpoints.checkpoints_path = str(Path(config.general.logs_path) / args.run / timestamp_with_run / "checkpoints")
Path(config.checkpoints.checkpoints_path).mkdir(parents=True, exist_ok=True)


if args.slurm:

@@ -210,19 +212,24 @@ def set_nested_attribute(obj, path, value):
subfolders.append('evals')

for subfolder in subfolders:
folder_path = os.path.join(log_folder, subfolder)
os.makedirs(folder_path, exist_ok=True)
folder_path = str(log_folder / subfolder)
Path(folder_path).mkdir(parents=True, exist_ok=True)
if subfolder == 'launch-script':
config.general.launch_script_path = folder_path
elif subfolder == 'slurm-logs':
config.general.slurm_logs_path = folder_path
elif subfolder == 'evals':
config.general.evals_logs_path = folder_path
for evals_subfolder in ['launch-config', 'logs']:
evals_subfolder_path = os.path.join(config.general.evals_logs_path, evals_subfolder)
os.makedirs(evals_subfolder_path, exist_ok=True)


for evals_subfolder in ['launch-config', 'logs', 'lighteval-logs']:
if evals_subfolder == "lighteval-logs":
if config.lighteval.logging.local_output_path is None:
evals_subfolder_path = str(Path(config.general.evals_logs_path) / evals_subfolder)
Path(evals_subfolder_path).mkdir(parents=True, exist_ok=True)
config.lighteval.logging.local_output_path = evals_subfolder_path
else:
evals_subfolder_path = str(Path(config.general.evals_logs_path) / evals_subfolder)
Path(evals_subfolder_path).mkdir(parents=True, exist_ok=True)

torchrun_args = ""
if 'torchrun_args' in launch_slurm_config and launch_slurm_config['torchrun_args']:
torchrun_args = " ".join([f"--{k} {v}" for k, v in launch_slurm_config['torchrun_args'].items()])
@@ -252,16 +259,19 @@ def set_nested_attribute(obj, path, value):
else:
config.general.eval_slurm_config = None

config.save_as_yaml(launch_slurm_config["config_path_yaml"])
config_path_yaml = str(Path(config.general.config_logs_path) / "launch.yaml")
Path(config.general.config_logs_path).mkdir(parents=True, exist_ok=True)
config.save_as_yaml(config_path_yaml)

# Launch the Slurm job
job_id = launch_slurm_job(sbatch_script)
print(f"🚀 Slurm job launched with id={job_id}")

# Save the Slurm script if a path is provided
if config.general.launch_script_path:
os.makedirs(config.general.launch_script_path, exist_ok=True)
Path(config.general.launch_script_path).mkdir(parents=True, exist_ok=True)
script_filename = f"slurm_launch_script.slurm"
script_path = str(Path(config.general.launch_script_path) / script_filename)
script_path = os.path.join(config.general.launch_script_path, script_filename)

with open(script_path, 'w') as f:
@@ -278,15 +288,17 @@ def set_nested_attribute(obj, path, value):
print(" 📁 Log structure:")
print(f" {config.general.logs_path}/{config.general.run}/")
print(f" └── {timestamp_with_run}/")
if config.checkpoints.checkpoints_path == str(Path(config.general.logs_path) / args.run / timestamp_with_run / "checkpoints"):
print(" ├── checkpoints/")
print(" ├── config/")
print(" ├── launch-script/")
print(" ├── slurm-logs/")
if hasattr(config, 'lighteval') and config.lighteval is not None:
print(" └── evals/")
print(" ├── launch-config/")
print(" └── logs/")
else:
print(" └── (No evals folder)")
if config.lighteval.logging.local_output_path == str(Path(config.general.evals_logs_path) / "lighteval-logs"):
print(" └── lighteval-logs/")

else:
# Check if running on an interactive node
@@ -311,8 +323,8 @@ def set_nested_attribute(obj, path, value):
f"uses {total_gpus} GPUs, but {gpu_count} are available. "
f"You are not fully utilizing all available GPUs on this device.")

config_path_yaml = f"{config.general.config_logs_path}/launch.yaml"
os.makedirs("config.general.config_logs_path", exist_ok=True)
config_path_yaml = str(Path(config.general.config_logs_path) / "launch.yaml")
os.makedirs(config.general.config_logs_path, exist_ok=True)
config.save_as_yaml(config_path_yaml)

trainer_python_file = "run_train.py"
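Taken together, the launcher changes above give every Slurm launch a single folder per run under logs_path. Reconstructed from the print statements in this hunk, the layout is:

{logs_path}/{run}/
└── {timestamp_with_run}/          e.g. run001_<timestamp>
    ├── checkpoints/               only when checkpoints_path was left unset in the config
    ├── config/                    launch.yaml is saved here
    ├── launch-script/             slurm_launch_script.slurm
    ├── slurm-logs/
    └── evals/                     only when lighteval is configured
        ├── launch-config/
        ├── logs/
        └── lighteval-logs/        only when lighteval.logging.local_output_path was left unset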
4 changes: 2 additions & 2 deletions src/nanotron/config/config.py
@@ -171,8 +171,8 @@ class CheckpointsArgs:
resume_checkpoint_path: if you want to load from a specific checkpoint path
"""

checkpoints_path: str
checkpoint_interval: int
checkpoints_path: Optional[str] = None
save_initial_state: Optional[bool] = False
save_final_state: Optional[bool] = False
resume_checkpoint_path: Optional[str] = None
@@ -210,7 +210,7 @@ class GeneralArgs:
slurm_logs_path: Optional[str] = None
config_logs_path: Optional[str] = None
evals_logs_path: Optional[str] = None
temp_dir: Optional[str] = None
temp_dir: Optional[str] = "temp_dir"
seed: Optional[int] = None
step: Optional[int] = None
consumed_train_samples: Optional[int] = None
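With checkpoints_path now optional (defaulting to None), a config can omit it entirely and let the launcher substitute the per-run default shown above. A minimal sketch (the import path and the interval value are assumptions; the remaining field names come from create_config.py):

from nanotron.config import CheckpointsArgs  # import path assumed

# checkpoints_path deliberately omitted: launcher.py fills it in with
# {logs_path}/{run}/{timestamp_with_run}/checkpoints before the job starts.
checkpoints = CheckpointsArgs(
    checkpoint_interval=100,  # assumed value
    checkpoints_path_is_shared_file_system=False,
    save_initial_state=False,
)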
2 changes: 1 addition & 1 deletion src/nanotron/config/lighteval_config.py
@@ -93,7 +93,7 @@ class LightEvalConfig:

slurm_template: Optional[str] = None
slurm_script_dir: Optional[str] = None
temp_dir: Optional[str] = None
temp_dir: Optional[str] = "temp_dir"
checkpoints_path: Optional[str] = None
parallelism: Optional[ParallelismArgs] = None
batch_size: Optional[int] = None
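Note that both temp_dir fields (GeneralArgs above and LightEvalConfig here) now default to "temp_dir", so the commented temp_dir line in create_config.py's lighteval block is only needed to override that default location.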
