diff --git a/.gitignore b/.gitignore index 5e6121d3c3..88c36ca2e0 100644 --- a/.gitignore +++ b/.gitignore @@ -172,6 +172,3 @@ saves/ output/ wandb/ generated_predictions.jsonl - -# unittest -dummy_dir/ diff --git a/src/llamafactory/hparams/generating_args.py b/src/llamafactory/hparams/generating_args.py index 2b8e05a426..377fea152a 100644 --- a/src/llamafactory/hparams/generating_args.py +++ b/src/llamafactory/hparams/generating_args.py @@ -15,6 +15,8 @@ from dataclasses import asdict, dataclass, field from typing import Any, Dict, Optional +from transformers import GenerationConfig + @dataclass class GeneratingArguments: @@ -69,10 +71,17 @@ class GeneratingArguments: metadata={"help": "Whether or not to remove special tokens in the decoding."}, ) - def to_dict(self) -> Dict[str, Any]: + def to_dict(self, obey_generation_config: bool = False) -> Dict[str, Any]: args = asdict(self) if args.get("max_new_tokens", -1) > 0: args.pop("max_length", None) else: args.pop("max_new_tokens", None) + + if obey_generation_config: + generation_config = GenerationConfig() + for key in list(args.keys()): + if not hasattr(generation_config, key): + args.pop(key) + return args diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 3136d7fdc7..b10ebf6900 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -151,7 +151,7 @@ def _pad_tensors_to_target_len(self, src_tensor: "torch.Tensor", tgt_tensor: "to return padded_tensor.contiguous() # in contiguous memory def save_predictions( - self, dataset: "Dataset", predict_results: "PredictionOutput", gen_kwargs: Dict[str, Any] + self, dataset: "Dataset", predict_results: "PredictionOutput", skip_special_tokens: bool = True ) -> None: r""" Saves model predictions to `output_dir`. @@ -179,12 +179,8 @@ def save_predictions( preds[i] = np.concatenate((preds[i][pad_len[0] :], preds[i][: pad_len[0]]), axis=-1) decoded_inputs = self.processing_class.batch_decode(dataset["input_ids"], skip_special_tokens=False) - decoded_preds = self.processing_class.batch_decode( - preds, skip_special_tokens=gen_kwargs["skip_special_tokens"] - ) - decoded_labels = self.processing_class.batch_decode( - labels, skip_special_tokens=gen_kwargs["skip_special_tokens"] - ) + decoded_preds = self.processing_class.batch_decode(preds, skip_special_tokens=skip_special_tokens) + decoded_labels = self.processing_class.batch_decode(labels, skip_special_tokens=skip_special_tokens) with open(output_prediction_file, "w", encoding="utf-8") as f: for text, pred, label in zip(decoded_inputs, decoded_preds, decoded_labels): diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index b290af0d91..5f4a09cc14 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -91,7 +91,7 @@ def run_sft( ) # Keyword arguments for `model.generate` - gen_kwargs = generating_args.to_dict() + gen_kwargs = generating_args.to_dict(obey_generation_config=True) gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids gen_kwargs["pad_token_id"] = tokenizer.pad_token_id gen_kwargs["logits_processor"] = get_logits_processor() @@ -130,7 +130,7 @@ def run_sft( predict_results.metrics.pop("predict_loss", None) trainer.log_metrics("predict", predict_results.metrics) trainer.save_metrics("predict", predict_results.metrics) - trainer.save_predictions(dataset_module["eval_dataset"], predict_results, gen_kwargs) + trainer.save_predictions(dataset_module["eval_dataset"], predict_results, generating_args.skip_special_tokens) # Create model card create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) diff --git a/tests/e2e/test_train.py b/tests/e2e/test_train.py index d1eae61728..71cda495a4 100644 --- a/tests/e2e/test_train.py +++ b/tests/e2e/test_train.py @@ -60,12 +60,12 @@ ], ) def test_run_exp(stage: str, dataset: str): - output_dir = os.path.join("output", f"dummy_dir/train_{stage}") + output_dir = os.path.join("output", f"train_{stage}") run_exp({"stage": stage, "dataset": dataset, "output_dir": output_dir, **TRAIN_ARGS}) assert os.path.exists(output_dir) def test_export(): - export_dir = os.path.join("output", "dummy_dir/llama3_export") + export_dir = os.path.join("output", "llama3_export") export_model({"export_dir": export_dir, **INFER_ARGS}) assert os.path.exists(export_dir) diff --git a/tests/train/test_sft_trainer.py b/tests/train/test_sft_trainer.py index e4391c10de..75231d205a 100644 --- a/tests/train/test_sft_trainer.py +++ b/tests/train/test_sft_trainer.py @@ -58,7 +58,11 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: @pytest.mark.parametrize("disable_shuffling", [False, True]) def test_shuffle(disable_shuffling: bool): model_args, data_args, training_args, finetuning_args, _ = get_train_args( - {"output_dir": f"dummy_dir/{disable_shuffling}", "disable_shuffling": disable_shuffling, **TRAIN_ARGS} + { + "output_dir": os.path.join("output", f"shuffle{str(disable_shuffling).lower()}"), + "disable_shuffling": disable_shuffling, + **TRAIN_ARGS, + } ) tokenizer_module = load_tokenizer(model_args) tokenizer = tokenizer_module["tokenizer"]