modelscope · yuchenwang3 · Jun 18, 2026 · gemini-code-assist · Jun 18, 2026
diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md
@@ -511,6 +511,7 @@ Vera使用`target_modules`、`target_regex`、`modules_to_save`三个参数，
   - 注意：Qwen3-Next/Qwen3.5的packing请使用Megatron-SWIFT，具体参考[Qwen3.5最佳实践](../BestPractices/Qwen3_5-Best-Practice.md)。
 - packing_length: packing的长度。默认为None，设置为max_length。
 - packing_num_proc: packing的进程数，默认为1。需要注意的是，不同的`packing_num_proc`，最终形成的packed数据集是不同的。（该参数在流式packing时不生效）。通常不需要修改该值，packing速度远快于tokenize速度。
+- packing_strategy: packing算法，可选为'binpack'和'sequential'，默认为'binpack'。'binpack'使用best-fit-decreasing装箱（会按长度重排样本）；'sequential'使用保序的贪心装箱（next-fit）：只维护一个开放的pack，按输入顺序逐条放入样本，当pack已满或下一条样本放不下时，关闭当前pack并新开一个。因此样本顺序与每个pack的边界都与输入顺序一致，相当于使用顺序采样器（建议配合`packing_num_proc=1`以保证全局顺序）。适用于需要忠实复现“将领域配比编码在样本顺序中”的训练配方的场景。
 - lazy_tokenize: 是否使用lazy_tokenize。若该参数设置为False，则在训练之前对所有的数据集样本进行tokenize（多模态模型则包括从磁盘中读取图片）。该参数默认为None，在LLM训练中默认为False，而MLLM训练默认为True，节约内存。
   - 注意：若你要进行图像的数据增强，你需要将lazy_tokenize（或streaming）设置为True，并修改Template类中的encode方法。
 - use_logits_to_keep: 通过在`forward`中根据labels传入logits_to_keep，减少无效logits的计算与存储，从而减少显存占用并加快训练速度。默认为None，进行自动选择。

diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
@@ -523,6 +523,7 @@ Training arguments include the [base arguments](#base-arguments), [Seq2SeqTraine
   - Note: For Qwen3-Next/Qwen3.5 packing, please use Megatron-SWIFT. Refer to [Qwen3.5 Best Practice](../BestPractices/Qwen3_5-Best-Practice.md) for details.
 - packing_length: the length to use for packing. Defaults to None, in which case it is set to max_length.
 - packing_num_proc: Number of processes for packing, default is 1. Note that different values of `packing_num_proc` will result in different packed datasets. (This parameter does not take effect during streaming packing). Usually there is no need to modify this value, as packing speed is much faster than tokenization speed.
+- packing_strategy: Packing algorithm, either 'binpack' or 'sequential'. Default is 'binpack'. 'binpack' uses best-fit-decreasing bin packing, which reorders samples by length. 'sequential' uses order-preserving greedy packing (next-fit): it keeps a single open pack, appends samples in input order, and closes the pack as soon as it is full or the next sample does not fit. Sample order and pack boundaries therefore follow the input order, as with a sequential sampler (set `packing_num_proc=1` to get a single global ordering). This is useful for faithfully reproducing training recipes that encode domain weighting in the sample order.
 - lazy_tokenize: Whether to use lazy tokenization. If set to `False`, all dataset samples will be tokenized (and for multimodal models, images will be loaded from disk) before training begins. Default is `None`: in LLM training, it defaults to `False`; in MLLM training, it defaults to `True` to save memory.
   - Note: If you want to perform image data augmentation, you need to set `lazy_tokenize` (or `streaming`) to True and modify the `encode` method in the Template class.
 - use_logits_to_keep: Pass `logits_to_keep` in the `forward` method based on labels to reduce the computation and storage of unnecessary logits, thereby reducing memory usage and accelerating training. The default is `None`, which enables automatic selection.

diff --git a/swift/arguments/base_args/base_args.py b/swift/arguments/base_args/base_args.py
@@ -74,6 +74,10 @@ class BaseArguments(GenerationArguments, QuantizeArguments, DataArguments, Templ
         packing (bool): Whether to enable packing of datasets. Default is False.
         packing_length (Optional[int]): Length of packing. Default is None.
         packing_num_proc (int): Number of processes used for packing, Default is 1.
+        packing_strategy (Literal['binpack', 'sequential']): Packing algorithm. 'binpack' uses best-fit-decreasing
+            bin packing, which reorders samples by length. 'sequential' uses order-preserving greedy packing
+            (next-fit): a single open pack is closed once it is full or the next sample does not fit, so sample
+            order and pack boundaries follow the input order (use packing_num_proc=1). Default is 'binpack'.
         lazy_tokenize (Optional[bool]): Whether to enable lazy tokenization. Default is None.
         use_hf (bool): Whether to use Hugging Face for downloading/uploading models and datasets. If False,
             ModelScope is used. Default is False.
@@ -101,6 +105,7 @@ class BaseArguments(GenerationArguments, QuantizeArguments, DataArguments, Templ
     packing: bool = False
     packing_length: Optional[int] = None
     packing_num_proc: int = 1
+    packing_strategy: Literal['binpack', 'sequential'] = 'binpack'
     lazy_tokenize: Optional[bool] = None
     # hub
     use_hf: bool = False

diff --git a/swift/dataset/packing.py b/swift/dataset/packing.py
@@ -13,10 +13,32 @@
 logger = get_logger()
 
 
-def calculate_matched_group(sequences, packing_length: int, is_finished: bool = True):
+def calculate_matched_group(sequences, packing_length: int, is_finished: bool = True, strategy: str = 'binpack'):
     if len(sequences) == 0:
         return [], []
-    # https://arxiv.org/pdf/2404.10830
+    if strategy == 'sequential':
-def calculate_matched_group(sequences, packing_length: int, is_finished: bool = True, strategy: str = 'binpack'):
-    if len(sequences) == 0:
-        return [], []
-    # https://arxiv.org/pdf/2404.10830
-    if strategy == 'sequential':
+def calculate_matched_group(sequences, packing_length: int, is_finished: bool = True, strategy: str = 'binpack'):
+    if strategy not in ('binpack', 'sequential'):
+        raise ValueError(f"Unknown packing strategy: {strategy}. Supported strategies are 'binpack' and 'sequential'.")
+    if len(sequences) == 0:
+        return [], []
+    if strategy == 'sequential':
-def calculate_matched_group(sequences, packing_length: int, is_finished: bool = True, strategy: str = 'binpack'):
-    if len(sequences) == 0:
-        return [], []
-    # https://arxiv.org/pdf/2404.10830
-    if strategy == 'sequential':
+def calculate_matched_group(sequences, packing_length: int, is_finished: bool = True, strategy: str = 'binpack'):
+    if strategy not in ('binpack', 'sequential'):
+        raise ValueError(f"Unknown packing strategy: {strategy}. Supported strategies are 'binpack' and 'sequential'.")
+    if len(sequences) == 0:
+        return [], []
+    if strategy == 'sequential':
+        # Order-preserving greedy packing (next-fit): keep a single open pack and flush it
+        # when the next sample doesn't fit, so the global sample order and pack boundaries
+        # follow the input order (a sequential sampler). Useful for faithfully reproducing
+        # recipes that encode domain weighting in the sample order. (Use packing_num_proc=1
+        # for a single global ordering.)
+        packs, cur, cur_len = [], [], 0
+        for item in sequences:  # item = (idx, length); weight_pos=1 -> length at item[1]
+            ln = item[1]
+            if cur and cur_len + ln > packing_length:
+                packs.append(cur)
+                cur, cur_len = [], 0
+            cur.append(item)
+            cur_len += ln
+            if cur_len >= packing_length:
+                packs.append(cur)
+                cur, cur_len = [], 0
+        if is_finished:
+            if cur:
+                packs.append(cur)
+            return packs, []
+        return packs, cur
+    # default: best-fit-decreasing bin packing (https://arxiv.org/pdf/2404.10830)
     import binpacking
     sequences = binpacking.to_constant_volume(sequences, packing_length, weight_pos=1)
     if sequences and not is_finished:
@@ -39,6 +61,7 @@ def __init__(
         load_from_cache_file: bool = True,
         packing_length: Optional[int] = None,
         packing_num_proc: int = 1,
+        packing_strategy: str = 'binpack',
         **kwargs,
     ):
         template.packing = True
@@ -48,6 +71,7 @@ def __init__(
         self.num_proc = num_proc
         self.strict = strict
         self.load_from_cache_file = load_from_cache_file
+        self.packing_strategy = packing_strategy
         self.packing_length = packing_length or self.template.max_length
         self.packing_num_proc = min(packing_num_proc, math.ceil(len(dataset) / self.PACKING_BATCH_SIZE))
         self._out_queue = mp.Queue()
@@ -97,7 +121,8 @@ def create_packed_idx(self, rank, offset, lengths):
                 break
             i += self.PACKING_BATCH_SIZE
             is_finished = i >= len(data)
-            sequences, input_data = calculate_matched_group(input_data, self.packing_length, is_finished=is_finished)
+            sequences, input_data = calculate_matched_group(
+                input_data, self.packing_length, is_finished=is_finished, strategy=self.packing_strategy)
             self._out_queue.put((rank, sequences, len(new_data)))
         self._out_queue.put((rank, [], -1))
 
@@ -122,6 +147,7 @@ def __init__(
         packing_length: Optional[int] = None,
         strict: bool = False,
         cyclic: bool = False,
+        packing_strategy: str = 'binpack',
         **kwargs,
     ):
         template.packing = True
@@ -137,6 +163,7 @@ def __init__(
         self._out_queue = mp.Queue()
         self.workers = []
         self.cyclic = cyclic
+        self.packing_strategy = packing_strategy
         for _ in range(self.num_proc):
             worker = mp.Process(target=self._processor, daemon=True)
             worker.start()
@@ -194,7 +221,8 @@ def __iter__(self):
             num_samples = self._put_data_in_queue(iterator)
             finished = num_samples != self.packing_interval
             data = self._fetch_data_out_queue(data, num_samples)
-            sequences, data = calculate_matched_group(data, self.packing_length, is_finished=finished)
+            sequences, data = calculate_matched_group(
+                data, self.packing_length, is_finished=finished, strategy=self.packing_strategy)
             res = []
             for row in sequences:
                 res.append([r[0] for r in row])

diff --git a/swift/pipelines/train/sft.py b/swift/pipelines/train/sft.py
@@ -143,6 +143,7 @@ def _post_process_datasets(self, datasets: List) -> List:
                     num_proc=args.dataset_num_proc,
                     packing_length=args.packing_length,
                     packing_num_proc=args.packing_num_proc,
+                    packing_strategy=args.packing_strategy,
                     strict=args.strict,
                     load_from_cache_file=args.load_from_cache_file)
             elif args.streaming: