diff --git a/create_config.py b/create_config.py
index a00301ac..31242c90 100644
--- a/create_config.py
+++ b/create_config.py
@@ -13,6 +13,7 @@
     Config,
     DataArgs,
     NanosetDatasetsArgs,
+    PretrainDatasetsArgs,
     S3UploadArgs,
     CheckpointsArgs,
     GeneralArgs,
@@ -80,7 +81,7 @@
     vocab_size=49152,
 )
 
-
+# Uncomment to evaluate the model on a set of tasks with lighteval during the training.
 # lighteval = LightEvalConfig(
 #     tasks=LightEvalTasksArgs(
 #         tasks="early-signal",  # "generatives", "all"
@@ -110,6 +111,7 @@
 #         hub_repo_tensorboard="smollm-evals-visualization",
 #         tensorboard_metric_prefix="eval",
 #     ),
+#     temp_dir = "temp_dir",
 #     slurm_template="slurm/run_eval.slurm.jinja",
 #     # slurm_template="slurm/run_eval_s3.slurm.jinja", if s3
 
@@ -118,9 +120,9 @@
 lighteval = None
 
 checkpoints = CheckpointsArgs(
-    checkpoints_path="checkpoints",
+    # checkpoints_path="checkpoints",
     checkpoints_path_is_shared_file_system=False,
-    # resume_checkpoint_path="",
+    # resume_checkpoint_path="local_path/to/checkpoint" or s3_path,
     checkpoint_interval=CHECKPOINT_INTERVAL,
     save_initial_state=False,
 )
@@ -161,7 +163,7 @@
     learning_rate=3e-3,
     lr_warmup_steps=10,
     lr_warmup_style="linear",
-    lr_decay_style="1-sqrt",
+    lr_decay_style="linear",
     lr_decay_steps = 20,
     lr_decay_starting_step=80 ,
     min_decay_lr=0,
@@ -198,11 +200,19 @@
 data_stages=[
     DatasetStageArgs(
         data=DataArgs(
-            dataset=NanosetDatasetsArgs(
-                dataset_folder="datasets/cosmopedia-v2",
+            # 1. Un-tokenized dataset from HuggingFace
+            dataset=PretrainDatasetsArgs(
+                hf_dataset_or_datasets="HuggingFaceTB/smollm-corpus",  # feel free to replace it by a smaller one if you don't have enough memory
+                hf_dataset_splits="train",
+                hf_dataset_config_name="cosmopedia-v2",
+                text_column_name="text",
             ),
-            num_loading_workers=0,
-            seed=general.seed,
+            # 2. Pre-tokenized local dataset with Nanoset
+            # dataset=NanosetDatasetsArgs(
+            #     dataset_folder="datasets/cosmopedia-v2",
+            # ),
+            # num_loading_workers=0,
+            # seed=general.seed,
         ),
         name="training stage",
         start_training_step=1,
diff --git a/launcher.py b/launcher.py
index ea00c442..49df6c1f 100644
--- a/launcher.py
+++ b/launcher.py
@@ -177,11 +177,13 @@ def set_nested_attribute(obj, path, value):
     timestamp_with_run = f"run{run_number:03d}_{timestamp}"
     config.general.timestamp_with_run = timestamp_with_run
 
-    config.general.config_logs_path = f"{config.general.logs_path}/{args.run}/{timestamp_with_run}/config"
+    config.general.config_logs_path = str(Path(config.general.logs_path) / args.run / timestamp_with_run / "config")
     Path(config.general.config_logs_path).mkdir(parents=True, exist_ok=True)
-
-    #making sure the logs path folder exists
+    if config.checkpoints.checkpoints_path is None:
+        config.checkpoints.checkpoints_path = str(Path(config.general.logs_path) / args.run / timestamp_with_run / "checkpoints")
+        Path(config.checkpoints.checkpoints_path).mkdir(parents=True, exist_ok=True)
+
     if args.slurm:
@@ -210,19 +212,24 @@ def set_nested_attribute(obj, path, value):
             subfolders.append('evals')
 
         for subfolder in subfolders:
-            folder_path = os.path.join(log_folder, subfolder)
-            os.makedirs(folder_path, exist_ok=True)
+            folder_path = str(log_folder / subfolder)
+            Path(folder_path).mkdir(parents=True, exist_ok=True)
             if subfolder == 'launch-script':
                 config.general.launch_script_path = folder_path
             elif subfolder == 'slurm-logs':
                 config.general.slurm_logs_path = folder_path
             elif subfolder == 'evals':
                 config.general.evals_logs_path = folder_path
-                for evals_subfolder in ['launch-config', 'logs']:
-                    evals_subfolder_path = os.path.join(config.general.evals_logs_path, evals_subfolder)
-                    os.makedirs(evals_subfolder_path, exist_ok=True)
-
-
+                for evals_subfolder in ['launch-config', 'logs',"lighteval-logs"]:
+                    if evals_subfolder == "lighteval-logs":
+                        if config.lighteval.logging.local_output_path is None:
+                            evals_subfolder_path = str(Path(config.general.evals_logs_path) / evals_subfolder)
+                            Path(evals_subfolder_path).mkdir(parents=True, exist_ok=True)
+                            config.lighteval.logging.local_output_path = evals_subfolder_path
+                    else:
+                        evals_subfolder_path = str(Path(config.general.evals_logs_path) / evals_subfolder)
+                        Path(evals_subfolder_path).mkdir(parents=True, exist_ok=True)
+
         torchrun_args = ""
         if 'torchrun_args' in launch_slurm_config and launch_slurm_config['torchrun_args']:
             torchrun_args = " ".join([f"--{k} {v}" for k, v in launch_slurm_config['torchrun_args'].items()])
@@ -252,7 +259,9 @@ def set_nested_attribute(obj, path, value):
         else:
             config.general.eval_slurm_config = None
 
-        config.save_as_yaml(launch_slurm_config["config_path_yaml"])
+        config_path_yaml = str(Path(config.general.config_logs_path) / "launch.yaml")
+        Path(config.general.config_logs_path).mkdir(parents=True, exist_ok=True)
+        config.save_as_yaml(config_path_yaml)
 
         # Launch the Slurm job
         job_id = launch_slurm_job(sbatch_script)
@@ -260,8 +269,9 @@ def set_nested_attribute(obj, path, value):
         # Save the Slurm script if a path is provided
         if config.general.launch_script_path:
-            os.makedirs(config.general.launch_script_path, exist_ok=True)
+            Path(config.general.launch_script_path).mkdir(parents=True, exist_ok=True)
             script_filename = f"slurm_launch_script.slurm"
+            script_path = str(Path(config.general.launch_script_path) / script_filename)
             script_path = os.path.join(config.general.launch_script_path, script_filename)
 
             with open(script_path, 'w') as f:
@@ -278,6 +288,8 @@ def set_nested_attribute(obj, path, value):
         print(" 📁 Log structure:")
         print(f"    {config.general.logs_path}/{config.general.run}/")
         print(f"    └── {timestamp_with_run}/")
+        if config.checkpoints.checkpoints_path == str(Path(config.general.logs_path) / args.run / timestamp_with_run / "checkpoints"):
+            print("        ├── checkpoints/")
         print("        ├── config/")
         print("        ├── launch-script/")
         print("        ├── slurm-logs/")
@@ -285,8 +297,8 @@ def set_nested_attribute(obj, path, value):
             print("        └── evals/")
             print("            ├── launch-config/")
             print("            └── logs/")
-        else:
-            print("        └── (No evals folder)")
+            if config.lighteval.logging.local_output_path== str(Path(config.general.evals_logs_path) / "lighteval-logs"):
+                print("            └── lighteval-logs/")
 
     else:
         # Check if running on an interactive node
@@ -311,8 +323,8 @@ def set_nested_attribute(obj, path, value):
                   f"uses {total_gpus} GPUs, but {gpu_count} are available. "
                   f"You are not fully utilizing all available GPUs on this device.")
 
-        config_path_yaml = f"{config.general.config_logs_path}/launch.yaml"
-        os.makedirs("config.general.config_logs_path", exist_ok=True)
+        config_path_yaml = str(Path(config.general.config_logs_path) / "launch.yaml")
+        os.makedirs(config.general.config_logs_path, exist_ok=True)
         config.save_as_yaml(config_path_yaml)
 
         trainer_python_file = "run_train.py"
diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py
index 8bdffb87..488ebf96 100644
--- a/src/nanotron/config/config.py
+++ b/src/nanotron/config/config.py
@@ -171,8 +171,8 @@ class CheckpointsArgs:
         resume_checkpoint_path: if you want to load from a specific checkpoint path
     """
 
-    checkpoints_path: str
     checkpoint_interval: int
+    checkpoints_path: Optional[str] = None
     save_initial_state: Optional[bool] = False
     save_final_state: Optional[bool] = False
     resume_checkpoint_path: Optional[str] = None
@@ -210,7 +210,7 @@ class GeneralArgs:
     slurm_logs_path: Optional[str] = None
     config_logs_path: Optional[str] = None
     evals_logs_path: Optional[str] = None
-    temp_dir: Optional[str] = None
+    temp_dir: Optional[str] = "temp_dir"
     seed: Optional[int] = None
     step: Optional[int] = None
     consumed_train_samples: Optional[int] = None
diff --git a/src/nanotron/config/lighteval_config.py b/src/nanotron/config/lighteval_config.py
index ea3ba120..fe11437d 100644
--- a/src/nanotron/config/lighteval_config.py
+++ b/src/nanotron/config/lighteval_config.py
@@ -93,7 +93,7 @@ class LightEvalConfig:
     slurm_template: Optional[str] = None
     slurm_script_dir: Optional[str] = None
-    temp_dir: Optional[str] = None
+    temp_dir: Optional[str] = "temp_dir"
     checkpoints_path: Optional[str] = None
     parallelism: Optional[ParallelismArgs] = None
     batch_size: Optional[int] = None