Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/Instruction/Command-line-parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ Vera使用`target_modules`、`target_regex`、`modules_to_save`三个参数,
- 注意:Qwen3-Next/Qwen3.5的packing请使用Megatron-SWIFT,具体参考[Qwen3.5最佳实践](../BestPractices/Qwen3_5-Best-Practice.md)。
- packing_length: packing的长度。默认为None,设置为max_length。
- packing_num_proc: packing的进程数,默认为1。需要注意的是,不同的`packing_num_proc`,最终形成的packed数据集是不同的。(该参数在流式packing时不生效)。通常不需要修改该值,packing速度远快于tokenize速度。
- packing_strategy: packing 算法,可选为'binpack'和'sequential',默认为'binpack'。'binpack'使用 best-fit-decreasing 装箱(会按长度重排样本);'sequential'使用保序的贪心装箱(next-fit:仅维护一个开放 pack,放不下即 flush),按输入顺序逐条装箱,使样本顺序与每个 pack 的边界跟随顺序采样器(建议配合 packing_num_proc=1 以保证全局顺序),适合忠实复刻把域加权编码在样本顺序里的配方。
- lazy_tokenize: 是否使用lazy_tokenize。若该参数设置为False,则在训练之前对所有的数据集样本进行tokenize(多模态模型则包括从磁盘中读取图片)。该参数默认为None,在LLM训练中默认为False,而MLLM训练默认为True,节约内存。
- 注意:若你要进行图像的数据增强,你需要将lazy_tokenize(或streaming)设置为True,并修改Template类中的encode方法。
- use_logits_to_keep: 通过在`forward`中根据labels传入logits_to_keep,减少无效logits的计算与存储,从而减少显存占用并加快训练速度。默认为None,进行自动选择。
Expand Down
1 change: 1 addition & 0 deletions docs/source_en/Instruction/Command-line-parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,7 @@ Training arguments include the [base arguments](#base-arguments), [Seq2SeqTraine
- Note: For Qwen3-Next/Qwen3.5 packing, please use Megatron-SWIFT. Refer to [Qwen3.5 Best Practice](../BestPractices/Qwen3_5-Best-Practice.md) for details.
- packing_length: the length to use for packing. Defaults to None, in which case it is set to max_length.
- packing_num_proc: Number of processes for packing, default is 1. Note that different values of `packing_num_proc` will result in different packed datasets. (This parameter does not take effect during streaming packing). Usually there is no need to modify this value, as packing speed is much faster than tokenization speed.
- packing_strategy: Packing algorithm, either 'binpack' or 'sequential'. Default is 'binpack'. 'binpack' uses best-fit-decreasing bin packing, which reorders samples by length. 'sequential' uses order-preserving greedy packing (next-fit: a single pack is kept open and is flushed when the next sample does not fit), so samples stay in input order and both the sample order and the pack boundaries follow a sequential sampler. Set `packing_num_proc=1` to obtain a single global ordering. This is useful for faithfully reproducing recipes that encode domain weighting in the sample order.
- lazy_tokenize: Whether to use lazy tokenization. If set to `False`, all dataset samples will be tokenized (and for multimodal models, images will be loaded from disk) before training begins. Default is `None`: in LLM training, it defaults to `False`; in MLLM training, it defaults to `True` to save memory.
- Note: If you want to perform image data augmentation, you need to set `lazy_tokenize` (or `streaming`) to True and modify the `encode` method in the Template class.
- use_logits_to_keep: Pass `logits_to_keep` in the `forward` method based on labels to reduce the computation and storage of unnecessary logits, thereby reducing memory usage and accelerating training. The default is `None`, which enables automatic selection.
Expand Down
5 changes: 5 additions & 0 deletions swift/arguments/base_args/base_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ class BaseArguments(GenerationArguments, QuantizeArguments, DataArguments, Templ
packing (bool): Whether to enable packing of datasets. Default is False.
packing_length (Optional[int]): Length of packing. Default is None.
packing_num_proc (int): Number of processes used for packing. Default is 1.
packing_strategy (Literal['binpack', 'sequential']): Packing algorithm. 'binpack' (default) uses
best-fit-decreasing bin packing (reorders samples); 'sequential' uses order-preserving greedy
packing (next-fit: a single open pack, flushed when the next sample doesn't fit) so the sample
order / pack boundaries follow a sequential sampler (use packing_num_proc=1). Default is 'binpack'.
lazy_tokenize (Optional[bool]): Whether to enable lazy tokenization. Default is None.
use_hf (bool): Whether to use Hugging Face for downloading/uploading models and datasets. If False,
ModelScope is used. Default is False.
Expand Down Expand Up @@ -101,6 +105,7 @@ class BaseArguments(GenerationArguments, QuantizeArguments, DataArguments, Templ
packing: bool = False
packing_length: Optional[int] = None
packing_num_proc: int = 1
packing_strategy: Literal['binpack', 'sequential'] = 'binpack'
lazy_tokenize: Optional[bool] = None
# hub
use_hf: bool = False
Expand Down
36 changes: 32 additions & 4 deletions swift/dataset/packing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,32 @@
logger = get_logger()


def calculate_matched_group(sequences, packing_length: int, is_finished: bool = True):
def calculate_matched_group(sequences, packing_length: int, is_finished: bool = True, strategy: str = 'binpack'):
if len(sequences) == 0:
return [], []
# https://arxiv.org/pdf/2404.10830
if strategy == 'sequential':
Comment on lines +16 to +19

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The strategy parameter is not validated. If an invalid strategy name is passed, the function will silently fall back to the default 'binpack' strategy. It is safer to explicitly validate the strategy and raise a ValueError if an unsupported strategy is provided.

Suggested change
def calculate_matched_group(sequences, packing_length: int, is_finished: bool = True, strategy: str = 'binpack'):
if len(sequences) == 0:
return [], []
# https://arxiv.org/pdf/2404.10830
if strategy == 'sequential':
def calculate_matched_group(sequences, packing_length: int, is_finished: bool = True, strategy: str = 'binpack'):
if strategy not in ('binpack', 'sequential'):
raise ValueError(f"Unknown packing strategy: {strategy}. Supported strategies are 'binpack' and 'sequential'.")
if len(sequences) == 0:
return [], []
if strategy == 'sequential':

# Order-preserving greedy packing (next-fit): keep a single open pack and flush it
# when the next sample doesn't fit, so the global sample order and pack boundaries
# follow the input order (a sequential sampler). Useful for faithfully reproducing
# recipes that encode domain weighting in the sample order. (Use packing_num_proc=1
# for a single global ordering.)
packs, cur, cur_len = [], [], 0
for item in sequences: # item = (idx, length); weight_pos=1 -> length at item[1]
ln = item[1]
if cur and cur_len + ln > packing_length:
packs.append(cur)
cur, cur_len = [], 0
cur.append(item)
cur_len += ln
if cur_len >= packing_length:
packs.append(cur)
cur, cur_len = [], 0
if is_finished:
if cur:
packs.append(cur)
return packs, []
return packs, cur
# default: best-fit-decreasing bin packing (https://arxiv.org/pdf/2404.10830)
import binpacking
sequences = binpacking.to_constant_volume(sequences, packing_length, weight_pos=1)
if sequences and not is_finished:
Expand All @@ -39,6 +61,7 @@ def __init__(
load_from_cache_file: bool = True,
packing_length: Optional[int] = None,
packing_num_proc: int = 1,
packing_strategy: str = 'binpack',
**kwargs,
):
template.packing = True
Expand All @@ -48,6 +71,7 @@ def __init__(
self.num_proc = num_proc
self.strict = strict
self.load_from_cache_file = load_from_cache_file
self.packing_strategy = packing_strategy
self.packing_length = packing_length or self.template.max_length
self.packing_num_proc = min(packing_num_proc, math.ceil(len(dataset) / self.PACKING_BATCH_SIZE))
self._out_queue = mp.Queue()
Expand Down Expand Up @@ -97,7 +121,8 @@ def create_packed_idx(self, rank, offset, lengths):
break
i += self.PACKING_BATCH_SIZE
is_finished = i >= len(data)
sequences, input_data = calculate_matched_group(input_data, self.packing_length, is_finished=is_finished)
sequences, input_data = calculate_matched_group(
input_data, self.packing_length, is_finished=is_finished, strategy=self.packing_strategy)
self._out_queue.put((rank, sequences, len(new_data)))
self._out_queue.put((rank, [], -1))

Expand All @@ -122,6 +147,7 @@ def __init__(
packing_length: Optional[int] = None,
strict: bool = False,
cyclic: bool = False,
packing_strategy: str = 'binpack',
**kwargs,
):
template.packing = True
Expand All @@ -137,6 +163,7 @@ def __init__(
self._out_queue = mp.Queue()
self.workers = []
self.cyclic = cyclic
self.packing_strategy = packing_strategy
for _ in range(self.num_proc):
worker = mp.Process(target=self._processor, daemon=True)
worker.start()
Expand Down Expand Up @@ -194,7 +221,8 @@ def __iter__(self):
num_samples = self._put_data_in_queue(iterator)
finished = num_samples != self.packing_interval
data = self._fetch_data_out_queue(data, num_samples)
sequences, data = calculate_matched_group(data, self.packing_length, is_finished=finished)
sequences, data = calculate_matched_group(
data, self.packing_length, is_finished=finished, strategy=self.packing_strategy)
res = []
for row in sequences:
res.append([r[0] for r in row])
Expand Down
1 change: 1 addition & 0 deletions swift/pipelines/train/sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def _post_process_datasets(self, datasets: List) -> List:
num_proc=args.dataset_num_proc,
packing_length=args.packing_length,
packing_num_proc=args.packing_num_proc,
packing_strategy=args.packing_strategy,
strict=args.strict,
load_from_cache_file=args.load_from_cache_file)
elif args.streaming:
Expand Down
Loading