
Commit d7055e3

Merge pull request #857 from OptimalScale/yizhenjia-maintenance
Usability update
2 parents f9116ab + f9d99e1 commit d7055e3

10 files changed: 301 additions & 78 deletions

scripts/run_reward_modeling.sh

Lines changed: 1 addition & 2 deletions
@@ -1,5 +1,4 @@
-#!/usr/bin/env python
-# coding=utf-8
+#!/bin/bash
 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.
 # Parses arguments
 model_name_or_path=google/gemma-2b-it

scripts/run_reward_modeling_with_lisa.sh

Lines changed: 1 addition & 2 deletions
@@ -1,5 +1,4 @@
-#!/usr/bin/env python
-# coding=utf-8
+#!/bin/bash
 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.
 # Parses arguments
 model_name_or_path=google/gemma-2b-it

scripts/run_reward_modeling_with_lora.sh

Lines changed: 1 addition & 2 deletions
@@ -1,5 +1,4 @@
-#!/usr/bin/env python
-# coding=utf-8
+#!/bin/bash
 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.
 # Parses arguments
 model_name_or_path=google/gemma-2b-it

src/lmflow/args.py

Lines changed: 12 additions & 0 deletions
@@ -93,6 +93,8 @@ class ModelArguments:

     arch_type : str
         Model architecture type.
+    padding_side : str
+        The side on which the tokenizer should have padding applied.
     """

     model_name_or_path: Optional[str] = field(
@@ -296,6 +298,16 @@ class ModelArguments:
             "choices": [None, "left", "right"],
         },
     )
+    padding_side: str = field(
+        default='right',
+        metadata={
+            "help": (
+                "The side on which the tokenizer should have padding applied. "
+                "LMFlow uses right padding by default. When set to `auto`, will "
+                "use padding_side from tokenizer.padding_side."),
+            "choices": ["right", "left", "auto"],
+        }
+    )

     def __post_init__(self):
         if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
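
For orientation, a minimal sketch of how the new field could be consumed; it assumes the dataclass is fed to transformers' HfArgumentParser, which LMFlow's launch scripts may wrap differently:

# Hypothetical usage sketch: parse the new --padding_side flag via HfArgumentParser.
from transformers import HfArgumentParser
from lmflow.args import ModelArguments

parser = HfArgumentParser(ModelArguments)
# e.g. python finetune.py --model_name_or_path google/gemma-2b-it --padding_side left
(model_args,) = parser.parse_args_into_dataclasses(
    args=["--model_name_or_path", "google/gemma-2b-it", "--padding_side", "left"]
)
print(model_args.padding_side)  # "left"; "auto" defers to the tokenizer's own setting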

src/lmflow/models/hf_decoder_model.py

Lines changed: 1 addition & 0 deletions
@@ -246,6 +246,7 @@ def tokenize(self, dataset, add_special_tokens=True, *args, **kwargs):
             (
                 raw_datasets.get_fingerprint()
                 + str(self.tokenizer)
+                + f'###padding_side={self.tokenizer.padding_side}'
                 + ('###conversation_template=' + str(conversation_template) if "conversation" in dataset_type else "")
                 + f'###disable_group_texts={data_args.disable_group_texts}'
                 + f'###block_size={data_args.block_size}'
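
The added term folds the tokenizer's padding side into the cache fingerprint, so switching padding invalidates previously tokenized datasets instead of silently reusing them. A standalone sketch of the idea (the function and argument names here are illustrative, not lmflow's):

import hashlib

def cache_fingerprint(base_fingerprint: str, tokenizer, block_size: int) -> str:
    # Every setting that changes tokenization output must feed the hash;
    # otherwise a stale cached dataset would be reused after the setting changes.
    key = (
        base_fingerprint
        + str(tokenizer)
        + f"###padding_side={tokenizer.padding_side}"
        + f"###block_size={block_size}"
    )
    return hashlib.md5(key.encode("utf-8")).hexdigest()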

src/lmflow/models/hf_model_mixin.py

Lines changed: 36 additions & 12 deletions
@@ -3,7 +3,7 @@
 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.
 import os
 import logging
-from typing import Union, Optional
+from typing import Union, Optional, Dict

 import torch
 import deepspeed
@@ -30,6 +30,7 @@
 from lmflow.utils.constants import (
     LMFLOW_LORA_TARGET_MODULES_MAPPING
 )
+from lmflow.args import ModelArguments


 logger = logging.getLogger(__name__)
@@ -51,11 +52,12 @@
 class HFModelMixin(BaseModel):
     def __init__(
         self,
-        model_args,
+        model_args: ModelArguments,
         do_train: bool,
         ds_config=None,
         device: Optional[str]="gpu",
         use_accelerator: bool=False,
+        hf_auto_model_additional_args: Optional[Dict]=None,
         *args,
         **kwargs
     ):
@@ -88,7 +90,7 @@ def __init__(
         self.model_args = model_args
         self.tokenizer = self.__prepare_tokenizer(model_args)
         self.torch_dtype = self.__prepare_dtype(model_args)
-        self.hf_model_config = self.__prepare_model_config(model_args)
+        self.hf_model_config = self.__prepare_model_config(model_args, hf_auto_model_additional_args)
         self.quant_config = self.__prepare_quant_config(model_args)
         self.peft_config = self.__prepare_peft_config(model_args)

@@ -106,11 +108,13 @@ def __init__(
             self.tokenizer.eos_token_id = self.backend_model.config.eos_token_id
         if self.tokenizer.pad_token_id is None:
             self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+        if self.backend_model.config.pad_token_id is None:
+            self.backend_model.config.pad_token_id = self.tokenizer.pad_token_id


     def __prepare_tokenizer(
         self,
-        model_args
+        model_args: ModelArguments,
     ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
         tokenizer_kwargs = {
             "cache_dir": model_args.cache_dir,
@@ -119,6 +123,8 @@ def __prepare_tokenizer(
             "use_auth_token": True if model_args.use_auth_token else None,
             "trust_remote_code": model_args.trust_remote_code,
         }
+        if model_args.padding_side != 'auto':
+            tokenizer_kwargs["padding_side"] = model_args.padding_side

         try:
             if model_args.tokenizer_name:
@@ -163,7 +169,7 @@ def __prepare_tokenizer(

     def __prepare_dtype(
         self,
-        model_args
+        model_args: ModelArguments,
     ) -> torch.dtype:
         if model_args.arch_type == 'text_regression':
             if model_args.torch_dtype in ["auto", None, "bf16", "bfloat16"]:
@@ -189,8 +195,23 @@

     def __prepare_model_config(
         self,
-        model_args
+        model_args: ModelArguments,
+        hf_auto_model_additional_args: Optional[Dict]=None,
     ):
+        """Prepare model configuration for hf auto register,
+        Parameters
+        ----------
+        model_args : ModelArguments
+            LMFlow model arguments.
+        hf_auto_model_additional_args : Optional[Dict], optional
+            Special configurations such as `num_labels` in `AutoModelForSequenceClassification`
+            (commonly used in reward modeling) will not preset in __prepare_model_config,
+            so it should be passed in hf_auto_model_additional_args.
+        Returns
+        -------
+        config : ModelConfig
+            hf model config.
+        """
         config_kwargs = {
             "torch_dtype": self.torch_dtype,
             "attn_implementation": "flash_attention_2" if model_args.use_flash_attention else None,
@@ -200,6 +221,9 @@ def __prepare_model_config(
             "trust_remote_code": model_args.trust_remote_code,
             "from_tf": bool(".ckpt" in model_args.model_name_or_path),
         }
+        if hf_auto_model_additional_args is not None:
+            config_kwargs.update(hf_auto_model_additional_args)
+
         if model_args.config_name:
             config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
         elif model_args.model_name_or_path:
@@ -217,7 +241,7 @@

     def __prepare_quant_config(
         self,
-        model_args
+        model_args: ModelArguments,
     ):
         quant_config = None
         if model_args.use_qlora:
@@ -236,7 +260,7 @@

     def __prepare_peft_config(
         self,
-        model_args
+        model_args: ModelArguments,
     ):
         peft_config = None
         if model_args.use_lora:
@@ -267,7 +291,7 @@

     def __model_module_inject(
         self,
-        model_args
+        model_args: ModelArguments,
     ) -> None:
         """Override some model modules with custom implementations.

@@ -286,8 +310,8 @@ def __model_module_inject(

     def __prepare_model_for_training(
         self,
-        model_args,
-        hf_auto_model: HF_AUTOMODEL_TYPE
+        model_args: ModelArguments,
+        hf_auto_model: HF_AUTOMODEL_TYPE,
     ):
         # TODO: change to accelerate
         logger.info("Preparing model for training")
@@ -326,7 +350,7 @@

     def __prepare_model_for_inference(
         self,
-        model_args,
+        model_args: ModelArguments,
         hf_auto_model: HF_AUTOMODEL_TYPE,
         use_accelerator,
         ds_config
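
The new hf_auto_model_additional_args hook lets a subclass inject config fields that only certain auto classes need, such as num_labels for a reward model's scalar head. A rough standalone sketch of the effect (the gpt2 checkpoint and surrounding code are illustrative, not lmflow's own calls):

# Illustrative sketch: extra config kwargs merged before building the HF config,
# mirroring what the hook above does inside __prepare_model_config.
from transformers import AutoConfig, AutoModelForSequenceClassification

config_kwargs = {"trust_remote_code": False}
hf_auto_model_additional_args = {"num_labels": 1}  # single scalar output for reward modeling
config_kwargs.update(hf_auto_model_additional_args)

config = AutoConfig.from_pretrained("gpt2", **config_kwargs)
model = AutoModelForSequenceClassification.from_config(config)  # 1-dimensional score head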

src/lmflow/models/hf_text_regression_model.py

Lines changed: 37 additions & 10 deletions
@@ -31,10 +31,12 @@
 from lmflow.models.interfaces.tunable import Tunable
 from lmflow.models.hf_model_mixin import HFModelMixin
 from lmflow.models.text_regression_model import TextRegressionModel
-from lmflow.tokenization.hf_text_regression_model import tokenize_function
+from lmflow.tokenization.hf_text_regression_model import paired_conversation_tokenize_function, tokenize_function
 from lmflow.utils.conversation_template import PRESET_TEMPLATES
 from lmflow.utils.constants import (
     PAIRED_CONVERSATION_DATASET_DESCRIPTION,
+    TEXT2TEXT_DATASET_DESCRIPTION,
+    TEXT_ONLY_DATASET_DESCRIPTION,
     CONVERSATION_ROLE_NAMES,
 )

@@ -81,13 +83,15 @@ def __init__(
         :param tune_strategy: tuning strategy: normal, none, lora or adapter
         :param ds_config: deepspeed configuration for distributed training
         """
+        config_additional_args = {"num_labels": 1}
         HFModelMixin.__init__(
             self,
             model_args=model_args,
             do_train=True if tune_strategy == "normal" else False,
             ds_config=ds_config,
             device=device,
             use_accelerator=use_accelerator,
+            hf_auto_model_additional_args=config_additional_args,
             *args,
             **kwargs
         )
@@ -133,14 +137,28 @@ def tokenize(
         raw_datasets = dataset
         hf_raw_datasets = dataset.get_backend_dataset()
         column_names = list(hf_raw_datasets.features) # in paired conversation, for example, would be 'chosen' and 'rejected'
-
-        # since this will be pickled to avoid _LazyModule error in Hasher force
-        # logger loading before tokenize_function
-        tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
-
         data_args = raw_datasets.get_data_args()

-        if dataset_type == "paired_conversation":
+        # Requires three types of information for tokenizing different datasets
+        # 1) Which fields require tokenization, e.g.
+        #    "text2float": "text", but not "float"
+        #    "text2text": both "input" and "output"
+        # 2) How will there tokenized sequence concatenated together, e.g.
+        #    "text_only": "text" -> "text"
+        #    "text2text": "input", "output" -> "input" + "output"
+        # 3) Which fields require loss in final computation, e.g.
+        #    "text_only": "text"
+        #    "text2text": "output" only
+        tokenized_column_order = None # Handles 1) and 2)
+        label_columns = None # Handles 3)
+        if dataset_type == "text_only":
+            tokenized_column_order = ["text"]
+            label_columns = ["text"]
+        elif dataset_type == "text2text":
+            tokenized_column_order = ["input", "output"]
+            label_columns = ["output"]
+            add_special_tokens = False
+        elif dataset_type == "paired_conversation":
             if data_args.conversation_template:
                 if data_args.conversation_template in PRESET_TEMPLATES.keys():
                     conversation_template = PRESET_TEMPLATES[data_args.conversation_template]
@@ -157,28 +175,37 @@
             raise NotImplementedError(
                 f"Dataset type \"{dataset_type}\" is not supported, currently"
                 " only support following data types for HFTextRegressionModel:\n"
-                f" {PAIRED_CONVERSATION_DATASET_DESCRIPTION}\n"
+                f" 1) {TEXT_ONLY_DATASET_DESCRIPTION}\n"
+                f" 2) {TEXT2TEXT_DATASET_DESCRIPTION}\n"
+                f" 3) {PAIRED_CONVERSATION_DATASET_DESCRIPTION}\n"
             )

         # Whether to truncate long sequences to fit into max_length
         use_truncation = False
         if model_args.use_lora or data_args.disable_group_texts:
             use_truncation = True

-        tokenize_fn = tokenize_function
+        tokenize_fn = paired_conversation_tokenize_function if "conversation" in dataset_type else tokenize_function
         tokenize_fn_kwargs = {
             "data_args": data_args,
             "tokenizer": self.tokenizer,
             "column_names": column_names,
-            "conversation_template": conversation_template
         }
+        if "conversation" in dataset_type:
+            tokenize_fn_kwargs["conversation_template"] = conversation_template
+        else:
+            tokenize_fn_kwargs["label_columns"] = label_columns
+            tokenize_fn_kwargs["tokenized_column_order"] = tokenized_column_order
+            tokenize_fn_kwargs["add_special_tokens"] = add_special_tokens
+            tokenize_fn_kwargs["use_truncation"] = use_truncation

         tokenize_kwargs = {}
         if not data_args.streaming:
             fingerprint = hashlib.md5(
                 (
                     raw_datasets.get_fingerprint()
                     + str(self.tokenizer)
+                    + f'###padding_side={self.tokenizer.padding_side}'
                     + ('###conversation_template=' + str(conversation_template) if "conversation" in dataset_type else "")
                     + f'###disable_group_texts={data_args.disable_group_texts}'
                     + f'###block_size={data_args.block_size}'
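
The comment block above names the three pieces of routing information. A toy sketch of how tokenized_column_order and label_columns could translate into input_ids and labels for a text2text record; this illustrates the idea only and is not lmflow's tokenize_function:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

def build_example(sample, tokenized_column_order, label_columns):
    input_ids, labels = [], []
    for column in tokenized_column_order:  # 1) which fields, 2) in what order
        ids = tokenizer(sample[column], add_special_tokens=False)["input_ids"]
        input_ids += ids
        # 3) only label columns contribute to the loss; the rest are masked out
        labels += ids if column in label_columns else [-100] * len(ids)
    return {"input_ids": input_ids, "labels": labels}

example = build_example(
    {"input": "Translate to French: cat", "output": " chat"},
    tokenized_column_order=["input", "output"],
    label_columns=["output"],
)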

src/lmflow/pipeline/rm_tuner.py

Lines changed: 7 additions & 0 deletions
@@ -194,6 +194,13 @@ def switch_active_layers(self):
         elif last_checkpoint is not None:
             checkpoint = last_checkpoint

+        if self.finetuner_args.gradient_checkpointing:
+            if model.get_backend_model().config.use_cache:
+                logger.warning(
+                    "Backend model config `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+                )
+                model.get_backend_model().config.use_cache = False
+
         train_result = trainer.train(resume_from_checkpoint=checkpoint)

         trainer.save_model() # Saves the tokenizer too for easy upload
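
This guard follows the usual Hugging Face pattern: KV-cache reuse is pointless during training and conflicts with the re-computation gradient checkpointing performs. A generic sketch of the same guard around a plain Trainer setup (everything outside the transformers API is a placeholder):

# Illustrative sketch of the use_cache guard, outside of lmflow's rm_tuner.
import logging
from transformers import AutoModelForSequenceClassification, TrainingArguments

logger = logging.getLogger(__name__)
training_args = TrainingArguments(output_dir="out", gradient_checkpointing=True)
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=1)

if training_args.gradient_checkpointing and model.config.use_cache:
    logger.warning("`use_cache=True` is incompatible with gradient checkpointing; disabling it.")
    model.config.use_cache = False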
