diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md index c9166544e4..a8141e19be 100644 --- a/docs/source/Instruction/Command-line-parameters.md +++ b/docs/source/Instruction/Command-line-parameters.md @@ -511,6 +511,7 @@ Vera使用`target_modules`、`target_regex`、`modules_to_save`三个参数, - 注意:Qwen3-Next/Qwen3.5的packing请使用Megatron-SWIFT,具体参考[Qwen3.5最佳实践](../BestPractices/Qwen3_5-Best-Practice.md)。 - packing_length: packing的长度。默认为None,设置为max_length。 - packing_num_proc: packing的进程数,默认为1。需要注意的是,不同的`packing_num_proc`,最终形成的packed数据集是不同的。(该参数在流式packing时不生效)。通常不需要修改该值,packing速度远快于tokenize速度。 +- packing_strategy: packing 算法,可选为'binpack'和'sequential',默认为'binpack'。'binpack'使用 best-fit-decreasing 装箱(会按长度重排样本);'sequential'使用保序的贪心装箱(next-fit:仅维护一个开放 pack,放不下即 flush),按输入顺序逐条装箱,使样本顺序与每个 pack 的边界跟随顺序采样器(建议配合 packing_num_proc=1 以保证全局顺序),适合忠实复刻把域加权编码在样本顺序里的配方。 - lazy_tokenize: 是否使用lazy_tokenize。若该参数设置为False,则在训练之前对所有的数据集样本进行tokenize(多模态模型则包括从磁盘中读取图片)。该参数默认为None,在LLM训练中默认为False,而MLLM训练默认为True,节约内存。 - 注意:若你要进行图像的数据增强,你需要将lazy_tokenize(或streaming)设置为True,并修改Template类中的encode方法。 - use_logits_to_keep: 通过在`forward`中根据labels传入logits_to_keep,减少无效logits的计算与存储,从而减少显存占用并加快训练速度。默认为None,进行自动选择。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 9091226844..deae0539fc 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -523,6 +523,7 @@ Training arguments include the [base arguments](#base-arguments), [Seq2SeqTraine - Note: For Qwen3-Next/Qwen3.5 packing, please use Megatron-SWIFT. Refer to [Qwen3.5 Best Practice](../BestPractices/Qwen3_5-Best-Practice.md) for details. - packing_length: the length to use for packing. Defaults to None, in which case it is set to max_length. - packing_num_proc: Number of processes for packing, default is 1. Note that different values of `packing_num_proc` will result in different packed datasets. (This parameter does not take effect during streaming packing). Usually there is no need to modify this value, as packing speed is much faster than tokenization speed. +- packing_strategy: Packing algorithm, one of 'binpack' and 'sequential', default is 'binpack'. 'binpack' uses best-fit-decreasing bin packing (which reorders samples by length); 'sequential' uses order-preserving greedy packing (next-fit: a single open pack, flushed when the next sample doesn't fit), keeping samples in input order so the sample order and pack boundaries follow a sequential sampler (use `packing_num_proc=1` for a single global ordering). Useful for faithfully reproducing recipes that encode domain weighting in the sample order. - lazy_tokenize: Whether to use lazy tokenization. If set to `False`, all dataset samples will be tokenized (and for multimodal models, images will be loaded from disk) before training begins. Default is `None`: in LLM training, it defaults to `False`; in MLLM training, it defaults to `True` to save memory. - Note: If you want to perform image data augmentation, you need to set `lazy_tokenize` (or `streaming`) to True and modify the `encode` method in the Template class. - use_logits_to_keep: Pass `logits_to_keep` in the `forward` method based on labels to reduce the computation and storage of unnecessary logits, thereby reducing memory usage and accelerating training. The default is `None`, which enables automatic selection. diff --git a/swift/arguments/base_args/base_args.py b/swift/arguments/base_args/base_args.py index ee430dcea4..326fb6d8b7 100644 --- a/swift/arguments/base_args/base_args.py +++ b/swift/arguments/base_args/base_args.py @@ -74,6 +74,10 @@ class BaseArguments(GenerationArguments, QuantizeArguments, DataArguments, Templ packing (bool): Whether to enable packing of datasets. Default is False. packing_length (Optional[int]): Length of packing. Default is None. packing_num_proc (int): Number of processes used for packing, Default is 1. + packing_strategy (Literal['binpack', 'sequential']): Packing algorithm. 'binpack' (default) uses + best-fit-decreasing bin packing (reorders samples); 'sequential' uses order-preserving greedy + packing (next-fit: a single open pack, flushed when the next sample doesn't fit) so the sample + order / pack boundaries follow a sequential sampler (use packing_num_proc=1). Default is 'binpack'. lazy_tokenize (Optional[bool]): Whether to enable lazy tokenization. Default is None. use_hf (bool): Whether to use Hugging Face for downloading/uploading models and datasets. If False, ModelScope is used. Default is False. @@ -101,6 +105,7 @@ class BaseArguments(GenerationArguments, QuantizeArguments, DataArguments, Templ packing: bool = False packing_length: Optional[int] = None packing_num_proc: int = 1 + packing_strategy: Literal['binpack', 'sequential'] = 'binpack' lazy_tokenize: Optional[bool] = None # hub use_hf: bool = False diff --git a/swift/dataset/packing.py b/swift/dataset/packing.py index 6f9cf54c21..d393c03d96 100644 --- a/swift/dataset/packing.py +++ b/swift/dataset/packing.py @@ -13,10 +13,32 @@ logger = get_logger() -def calculate_matched_group(sequences, packing_length: int, is_finished: bool = True): +def calculate_matched_group(sequences, packing_length: int, is_finished: bool = True, strategy: str = 'binpack'): if len(sequences) == 0: return [], [] - # https://arxiv.org/pdf/2404.10830 + if strategy == 'sequential': + # Order-preserving greedy packing (next-fit): keep a single open pack and flush it + # when the next sample doesn't fit, so the global sample order and pack boundaries + # follow the input order (a sequential sampler). Useful for faithfully reproducing + # recipes that encode domain weighting in the sample order. (Use packing_num_proc=1 + # for a single global ordering.) + packs, cur, cur_len = [], [], 0 + for item in sequences: # item = (idx, length); weight_pos=1 -> length at item[1] + ln = item[1] + if cur and cur_len + ln > packing_length: + packs.append(cur) + cur, cur_len = [], 0 + cur.append(item) + cur_len += ln + if cur_len >= packing_length: + packs.append(cur) + cur, cur_len = [], 0 + if is_finished: + if cur: + packs.append(cur) + return packs, [] + return packs, cur + # default: best-fit-decreasing bin packing (https://arxiv.org/pdf/2404.10830) import binpacking sequences = binpacking.to_constant_volume(sequences, packing_length, weight_pos=1) if sequences and not is_finished: @@ -39,6 +61,7 @@ def __init__( load_from_cache_file: bool = True, packing_length: Optional[int] = None, packing_num_proc: int = 1, + packing_strategy: str = 'binpack', **kwargs, ): template.packing = True @@ -48,6 +71,7 @@ def __init__( self.num_proc = num_proc self.strict = strict self.load_from_cache_file = load_from_cache_file + self.packing_strategy = packing_strategy self.packing_length = packing_length or self.template.max_length self.packing_num_proc = min(packing_num_proc, math.ceil(len(dataset) / self.PACKING_BATCH_SIZE)) self._out_queue = mp.Queue() @@ -97,7 +121,8 @@ def create_packed_idx(self, rank, offset, lengths): break i += self.PACKING_BATCH_SIZE is_finished = i >= len(data) - sequences, input_data = calculate_matched_group(input_data, self.packing_length, is_finished=is_finished) + sequences, input_data = calculate_matched_group( + input_data, self.packing_length, is_finished=is_finished, strategy=self.packing_strategy) self._out_queue.put((rank, sequences, len(new_data))) self._out_queue.put((rank, [], -1)) @@ -122,6 +147,7 @@ def __init__( packing_length: Optional[int] = None, strict: bool = False, cyclic: bool = False, + packing_strategy: str = 'binpack', **kwargs, ): template.packing = True @@ -137,6 +163,7 @@ def __init__( self._out_queue = mp.Queue() self.workers = [] self.cyclic = cyclic + self.packing_strategy = packing_strategy for _ in range(self.num_proc): worker = mp.Process(target=self._processor, daemon=True) worker.start() @@ -194,7 +221,8 @@ def __iter__(self): num_samples = self._put_data_in_queue(iterator) finished = num_samples != self.packing_interval data = self._fetch_data_out_queue(data, num_samples) - sequences, data = calculate_matched_group(data, self.packing_length, is_finished=finished) + sequences, data = calculate_matched_group( + data, self.packing_length, is_finished=finished, strategy=self.packing_strategy) res = [] for row in sequences: res.append([r[0] for r in row]) diff --git a/swift/pipelines/train/sft.py b/swift/pipelines/train/sft.py index ef8e1f75c7..8d547f4106 100644 --- a/swift/pipelines/train/sft.py +++ b/swift/pipelines/train/sft.py @@ -143,6 +143,7 @@ def _post_process_datasets(self, datasets: List) -> List: num_proc=args.dataset_num_proc, packing_length=args.packing_length, packing_num_proc=args.packing_num_proc, + packing_strategy=args.packing_strategy, strict=args.strict, load_from_cache_file=args.load_from_cache_file) elif args.streaming: