
Commit ed6bc79

cloudforge1 committed
【Hackathon 9th No.88】Phase B: model_executor print→logger + per-request INFO→DEBUG + f-string→%s
- model_executor/: 14 print() calls → logger.debug/warning/info (6 files)
  - graph_optimization/utils.py: memory info → debug
  - ops/gpu/__init__.py: sm_version → debug, module error → warning
  - models/adapters.py: load failure → warning
  - layers/attention/mla_attention_backend.py: flash attn detect → info
  - ops/triton_ops/triton_utils.py: kernel cache/compile → debug
  - ops/triton_ops/triton_utils_v2.py: kernel cache/compile → debug
- entrypoints/: per-request INFO → DEBUG with lazy %s (12 files)
  - engine_client.py: objgraph debug, max_tokens, control methods
  - serving_chat.py: create/release/stream per-request calls
  - serving_completion.py: init/preprocess/response per-request calls
  - v1/serving_chat.py, v1/serving_completion.py: stream/response
  - run_batch.py: startup/progress f-string → %s (keep INFO)
  - llm.py, chat_utils.py, serving_reward.py, utils.py, v1/serving_base.py
- scheduler/splitwise_scheduler.py: per-request → DEBUG, startup → INFO %s

Acceptance criteria met:
- Core module print() = 0 (excl. subprocess string, .print() methods)
- Per-request INFO = 0 in serving paths (all downgraded to DEBUG)
- All hot-path logger calls use %s lazy formatting
1 parent d81a2d1 commit ed6bc79

18 files changed

Lines changed: 90 additions & 68 deletions
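The theme of this commit is switching from eager f-string interpolation to logging's printf-style lazy formatting. With logger.debug("x: %s", v), the stdlib stashes the arguments on the LogRecord and only renders the string if a handler actually emits the record, so suppressed DEBUG calls on the hot path cost almost nothing. A minimal sketch of the difference, assuming FastDeploy's named loggers (api_server_logger and friends) are standard logging.Logger instances, as the call signatures imply:

    import logging

    logging.basicConfig(level=logging.INFO)  # DEBUG records will be filtered out
    logger = logging.getLogger("demo")

    class Expensive:
        """Stand-in for a costly render such as chunk.model_dump_json()."""
        def __str__(self) -> str:
            print("  ... expensive __str__ ran!")
            return "<payload>"

    payload = Expensive()

    # Eager: the f-string renders the payload BEFORE the level check.
    logger.debug(f"response: {payload}")   # __str__ runs, record is dropped anyway

    # Lazy: formatting is deferred; at INFO level __str__ never runs.
    logger.debug("response: %s", payload)  # no rendering, record is dropped

    logger.info("response: %s", payload)   # __str__ runs exactly once, record emitted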


fastdeploy/entrypoints/chat_utils.py

Lines changed: 1 addition & 1 deletion
@@ -122,7 +122,7 @@ def http_get_with_retry(self, url, max_retries=3, retry_delay=1, backoff_factor=
     if retry_cnt >= max_retries:
         api_server_logger.error(f"HTTP GET failed: {e}. Max retries reached")
         raise
-    api_server_logger.info(f"HTTP GET failed: {e}. Start retry {retry_cnt}")
+    api_server_logger.info("HTTP GET failed: %s. Start retry %s", e, retry_cnt)
     time.sleep(delay)
     delay *= backoff_factor

fastdeploy/entrypoints/engine_client.py

Lines changed: 10 additions & 10 deletions
@@ -311,7 +311,7 @@ async def add_requests(self, task):
     )
 else:
     request_id = task.get("request_id", "unknown")
-    obj_logger.info(f"\n{'='*60} OBJGRAPH DEBUG [request_id={request_id}] {'='*60}")
+    obj_logger.debug("\n%s OBJGRAPH DEBUG [request_id=%s] %s", '='*60, request_id, '='*60)
     # Print memory usage
     if not _has_psutil:
         obj_logger.warning(
@@ -320,18 +320,18 @@ async def add_requests(self, task):
     else:
         process = psutil.Process()
         rss_memory = process.memory_info().rss / 1024**3
-        obj_logger.info(f"Process Memory (RSS): {rss_memory:.2f} GB")
+        obj_logger.debug("Process Memory (RSS): %.2f GB", rss_memory)
         obj_logger.info("Object growth statistics:")
         growth_data = objgraph.growth(limit=20)
         for item in growth_data:
             if len(item) == 3:
                 obj_type, current_count, growth = item
-                obj_logger.info(f" {obj_type:30s} {current_count:8d} +{growth}")
+                obj_logger.debug(" %-30s %8d +%s", obj_type, current_count, growth)
             elif len(item) == 2:
                 obj_type, count = item
-                obj_logger.info(f" {obj_type:30s} +{count}")
+                obj_logger.debug(" %-30s +%s", obj_type, count)
             else:
-                obj_logger.info(f" {item}")
+                obj_logger.debug(" %s", item)

 task["metrics"]["preprocess_start_time"] = time.time()
 request_id = task.get("request_id").split("_")[0]
@@ -354,7 +354,7 @@ async def add_requests(self, task):
 min_tokens = task.get("min_tokens", 1)
 if "messages" in task:
     task["messages"] = None
-api_server_logger.info(f"task['max_tokens']:{task['max_tokens']}")
+api_server_logger.debug("task['max_tokens']:%s", task['max_tokens'])
 main_process_metrics.request_params_max_tokens.observe(task["max_tokens"])
 main_process_metrics.prompt_tokens_total.inc(input_ids_len)
 main_process_metrics.request_prompt_tokens.observe(input_ids_len)
@@ -581,7 +581,7 @@ def check_health(self, time_interval_threashold=30):
     return True, ""

 async def run_control_method(self, request: ControlRequest):
-    api_server_logger.info(f"Start Run Control Method: {request}")
+    api_server_logger.debug("Start Run Control Method: %s", request)
     self.zmq_client.send_json(request.to_dict())
     request_id = request.request_id
     dealer, response_queue = await self.connection_manager.get_connection(request_id)
@@ -590,7 +590,7 @@ async def run_control_method(self, request: ControlRequest):
     # todo: support user specified timeout. default 600s is enough for most control cases
     response = await asyncio.wait_for(response_queue.get(), timeout=600)
     response = ControlResponse.from_dict(response[0])
-    api_server_logger.info(f"End Run Control Method: {response}")
+    api_server_logger.debug("End Run Control Method: %s", response)
     return response
 except asyncio.TimeoutError:
     error_response = ControlResponse(request_id, 500, "Timeout waiting for control method response")
@@ -816,7 +816,7 @@ async def rearrange_experts(self, request_dict: dict):
     return content, status_code

 action = request_dict.get("action", "")
-api_server_logger.info(f"redundant_expert: rearrange_experts recv request, action {action}")
+api_server_logger.debug("redundant_expert: rearrange_experts recv request, action %s", action)
 if action == "":
     # action: start rearrange experts
     # params: {'user': 'xxx', 'passwd': 'xxx', 'ips': ['10.54.99.77:8000', '10.54.99.77:8300']}
@@ -984,7 +984,7 @@ async def check_redundant(self, request_dict: dict):

 async def abort(self, request_id, n=1) -> None:
     if envs.FD_ENABLE_REQUEST_DISCONNECT_STOP_INFERENCE:
-        api_server_logger.info(f"abort request_id:{request_id}")
+        api_server_logger.info("abort request_id:%s", request_id)
     if n <= 0:
         api_server_logger.warning("Abort function called with non-positive n: %d. No requests aborted.", n)
         return
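One subtlety in the objgraph hunk above: f-string alignment specs do not map one-to-one onto printf-style widths. {obj_type:30s} left-aligns (the default for strings), while %30s right-aligns, so the replacement correctly uses %-30s; numeric {count:8d} and %8d both right-align. A quick equivalence check:

    obj_type, count, growth = "dict", 12345, 67

    eager = f" {obj_type:30s} {count:8d} +{growth}"        # str spec left-aligns
    lazy = " %-30s %8d +%s" % (obj_type, count, growth)    # %-30s forces left-align
    assert eager == lazy

    # Without the minus sign the string column would flip to right-aligned:
    assert "%30s" % obj_type != f"{obj_type:30s}"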

fastdeploy/entrypoints/llm.py

Lines changed: 1 addition & 1 deletion
@@ -406,7 +406,7 @@ def _build_sample_logprobs(self, logprobs_lists: LogprobsLists, topk_logprobs: i
         list[dict[int, Logprob]]: One dict per request, mapping token ID to Logprob.
     """
     try:
-        llm_logger.info(f"filter logprobs, topk_logprobs: {topk_logprobs}")
+        llm_logger.debug("filter logprobs, topk_logprobs: %s", topk_logprobs)
         if not logprobs_lists.logprob_token_ids:
             llm_logger.warning("Empty logprob_token_ids in LogprobsLists")
             return None

fastdeploy/entrypoints/openai/run_batch.py

Lines changed: 6 additions & 6 deletions
@@ -114,7 +114,7 @@ def init_engine(args: argparse.Namespace):
 if llm_engine is not None:
     return llm_engine

-api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}")
+api_server_logger.info("FastDeploy LLM API server starting... %s", os.getpid())
 engine_args = EngineArgs.from_cli_args(args)
 engine = LLMEngine.from_engine_args(engine_args)
 if not engine.start(api_server_pid=os.getpid()):
@@ -144,7 +144,7 @@ def completed(self):
 if self._total > 0:
     log_interval = min(100, max(self._total // 10, 1))
     if self._completed - self._last_log_count >= log_interval:
-        console_logger.info(f"Progress: {self._completed}/{self._total} requests completed")
+        console_logger.info("Progress: %s/%s requests completed", self._completed, self._total)
         self._last_log_count = self._completed

 def pbar(self) -> tqdm:
@@ -398,7 +398,7 @@ async def setup_engine_and_handlers(args: Namespace) -> Tuple[EngineClient, Open
 args.tokenizer = args.model

 pid = determine_process_id()
-console_logger.info(f"Process ID: {pid}")
+console_logger.info("Process ID: %s", pid)

 model_paths = create_model_paths(args)
 chat_template = load_chat_template(args.chat_template, args.model)
@@ -429,7 +429,7 @@ async def run_batch(
 max_concurrency = (concurrency + workers - 1) // workers
 semaphore = asyncio.Semaphore(max_concurrency)

-console_logger.info(f"concurrency: {concurrency}, workers: {workers}, max_concurrency: {max_concurrency}")
+console_logger.info("concurrency: %s, workers: %s, max_concurrency: %s", concurrency, workers, max_concurrency)

 tracker = BatchProgressTracker()
 console_logger.info("Reading batch from %s...", args.input_file)
@@ -474,7 +474,7 @@ async def run_batch(

 success_count = sum(1 for r in responses if r.error is None)
 error_count = len(responses) - success_count
-console_logger.info(f"Batch processing completed: {success_count} success, {error_count} errors")
+console_logger.info("Batch processing completed: %s success, %s errors", success_count, error_count)

 await write_file(args.output_file, responses, args.output_tmp_dir)
 console_logger.info("Results written to output file")
@@ -485,7 +485,7 @@ async def main(args: argparse.Namespace):
 try:
     if args.workers is None:
         args.workers = max(min(int(args.max_num_seqs // 32), 8), 1)
-        console_logger.info(f"Workers: {args.workers}")
+        console_logger.info("Workers: %s", args.workers)
     args.model = retrive_model_from_server(args.model, args.revision)

     if args.tool_parser_plugin:
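Two sizing formulas in this file deserve a worked example: the worker count scales with max_num_seqs but is clamped to [1, 8], and max_concurrency is a ceiling division that splits the global concurrency budget across workers (illustrative numbers below, not defaults from the source):

    # One worker per 32 sequences, clamped to the range [1, 8].
    max_num_seqs = 100
    workers = max(min(int(max_num_seqs // 32), 8), 1)          # 100 // 32 = 3

    # Ceiling division, so workers * max_concurrency >= concurrency.
    concurrency = 100
    max_concurrency = (concurrency + workers - 1) // workers   # ceil(100 / 3) = 34

    assert (workers, max_concurrency) == (3, 34)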

fastdeploy/entrypoints/openai/serving_chat.py

Lines changed: 9 additions & 9 deletions
@@ -98,7 +98,7 @@ def __init__(
 else:
     self.master_ip = "0.0.0.0"
     self.is_master_ip = True
-api_server_logger.info(f"master ip: {self.master_ip}")
+api_server_logger.info("master ip: %s", self.master_ip)

 def _check_master(self):
     return self.engine_client.is_master or self.is_master_ip
@@ -129,7 +129,7 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
     await self.engine_client.semaphore.acquire()
 else:
     await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
-api_server_logger.info(f"current {self.engine_client.semaphore.status()}")
+api_server_logger.debug("current %s", self.engine_client.semaphore.status())

 if request.request_id is not None:
     request_id = request.request_id
@@ -141,7 +141,7 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
     request_id = f"chatcmpl-{uuid.uuid4()}"
 tracing.trace_req_start(rid=request_id, trace_content=request.trace_context, role="FastDeploy")
 del request.trace_context
-api_server_logger.info(f"create chat completion request: {request_id}")
+api_server_logger.debug("create chat completion request: %s", request_id)
 prompt_tokens = None
 max_tokens = None
 try:
@@ -252,7 +252,7 @@ async def chat_completion_stream_generator(
     choices=[],
     model=model_name,
 )
-api_server_logger.info(f"create chat completion request: {request_id}")
+api_server_logger.debug("create chat completion request: %s", request_id)

 try:
     dealer, response_queue = await self.engine_client.connection_manager.get_connection(
@@ -370,7 +370,7 @@ async def chat_completion_stream_generator(
     completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=0),
 )
 yield f"data: {chunk.model_dump_json(exclude_unset=True)} \n\n"
-api_server_logger.info(f"Chat Streaming response send_idx 0: {chunk.model_dump_json()}")
+api_server_logger.debug("Chat Streaming response send_idx 0: %s", chunk.model_dump_json())
 first_iteration = False

 output = res["outputs"]
@@ -489,7 +489,7 @@ async def chat_completion_stream_generator(
 chunk.choices = choices
 yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
 if res["finished"]:
-    api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
+    api_server_logger.debug("Chat Streaming response last send: %s", chunk.model_dump_json())
 choices = []

 if include_usage:
@@ -528,7 +528,7 @@ async def chat_completion_stream_generator(
 tracing.trace_req_finish(request_id)
 await self.engine_client.connection_manager.cleanup_request(request_id)
 self.engine_client.semaphore.release()
-api_server_logger.info(f"release {request_id} {self.engine_client.semaphore.status()}")
+api_server_logger.debug("release %s %s", request_id, self.engine_client.semaphore.status())
 yield "data: [DONE]\n\n"

 async def chat_completion_full_generator(
@@ -695,7 +695,7 @@ async def chat_completion_full_generator(
 tracing.trace_req_finish(request_id)
 await self.engine_client.connection_manager.cleanup_request(request_id)
 self.engine_client.semaphore.release()
-api_server_logger.info(f"release {self.engine_client.semaphore.status()}")
+api_server_logger.debug("release %s", self.engine_client.semaphore.status())

 num_prompt_tokens = len(prompt_token_ids)
 num_generated_tokens = sum(previous_num_tokens)
@@ -722,7 +722,7 @@ async def chat_completion_full_generator(
     choices=choices,
     usage=usage,
 )
-api_server_logger.info(f"Chat response: {res.model_dump_json()}")
+api_server_logger.debug("Chat response: %s", res.model_dump_json())
 return res

 async def _create_chat_completion_choice(
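The acquire/release pairs touched by these hunks implement admission control: a request must win the engine semaphore before proceeding (optionally bounded by max_waiting_time) and must release it on every exit path. A reduced sketch of the pattern with hypothetical names, not the FastDeploy classes:

    import asyncio

    async def handle_request(semaphore: asyncio.Semaphore, max_waiting_time: float) -> str:
        # Admission: wait indefinitely if unbounded, else fail fast on timeout.
        if max_waiting_time < 0:
            await semaphore.acquire()
        else:
            await asyncio.wait_for(semaphore.acquire(), timeout=max_waiting_time)
        try:
            await asyncio.sleep(0.01)  # stand-in for inference work
            return "ok"
        finally:
            semaphore.release()  # release on success, error, or disconnect

    async def main():
        sem = asyncio.Semaphore(2)  # at most two requests in flight
        print(await asyncio.gather(*(handle_request(sem, 5.0) for _ in range(4))))

    asyncio.run(main())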

fastdeploy/entrypoints/openai/serving_completion.py

Lines changed: 5 additions & 5 deletions
@@ -77,7 +77,7 @@ def __init__(self, engine_client, models, pid, ips, max_waiting_time):
 self.master_ip = "0.0.0.0"
 self.is_master_ip = True
 self._is_process_response_dict_async = None
-api_server_logger.info(f"master ip: {self.master_ip}")
+api_server_logger.info("master ip: %s", self.master_ip)

 def _check_master(self):
     return self.engine_client.is_master or self.is_master_ip
@@ -110,7 +110,7 @@ async def create_completion(self, request: CompletionRequest):
     request_id = f"cmpl-{request.user}-{uuid.uuid4()}"
 else:
     request_id = f"cmpl-{uuid.uuid4()}"
-api_server_logger.info(f"Initialize request {request_id}: {request}")
+api_server_logger.debug("Initialize request %s: %s", request_id, request)
 tracing.trace_req_start(rid=request_id, trace_content=request.trace_context, role="FastDeploy")
 del request.trace_context
 request_prompt_ids = None
@@ -155,7 +155,7 @@ async def create_completion(self, request: CompletionRequest):
     request_prompts = request_prompt_ids

 num_choices = len(request_prompts) * (1 if request.n is None else request.n)
-api_server_logger.info(f"Start preprocessing request: req_id={request_id}), num_choices={num_choices}")
+api_server_logger.debug("Start preprocessing request: req_id=%s, num_choices=%s", request_id, num_choices)
 prompt_batched_token_ids = []
 prompt_tokens_list = []
 max_tokens_list = []
@@ -370,7 +370,7 @@ async def completion_full_generator(
         prompt_tokens_list=prompt_tokens_list,
         max_tokens_list=max_tokens_list,
     )
-    api_server_logger.info(f"Completion response: {res.model_dump_json()}")
+    api_server_logger.debug("Completion response: %s", res.model_dump_json())
     return res
 except Exception as e:
     api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True)
@@ -642,7 +642,7 @@ async def completion_stream_generator(
     metrics=res["metrics"] if request.collect_metrics else None,
 )
 yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
-api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
+api_server_logger.debug("Completion Streaming response last send: %s", chunk.model_dump_json())

 except asyncio.CancelledError as e:
     await self.engine_client.abort(f"{request_id}_0", num_choices)

fastdeploy/entrypoints/openai/serving_reward.py

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ async def create_reward(self, request: ChatRewardRequest):
 @override
 def _build_response(self, ctx: ServeContext, request_output: dict):
     """Generate final reward response"""
-    api_server_logger.info(f"[{ctx.request_id}] Reward RequestOutput received:{request_output}")
+    api_server_logger.debug("[%s] Reward RequestOutput received:%s", ctx.request_id, request_output)

     base = PoolingRequestOutput.from_dict(request_output)
     reward_res = RewardRequestOutput.from_base(base)

fastdeploy/entrypoints/openai/utils.py

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ async def initialize(self):
 self.running = True
 for index in range(self.max_connections):
     await self._add_connection(index)
-api_server_logger.info(f"Started {self.max_connections} connections, pid {self.pid}")
+api_server_logger.info("Started %s connections, pid %s", self.max_connections, self.pid)

 async def _add_connection(self, index):
     """create a new connection and start listening task"""

fastdeploy/entrypoints/openai/v1/serving_base.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ def __init__(
 self.master_ip = "0.0.0.0"
 self.is_master_ip = True
 self.eoi_token_id = 101032
-api_server_logger.info(f"master ip: {self.master_ip}")
+api_server_logger.info("master ip: %s", self.master_ip)

 @override
 def _check_master(self) -> bool:

fastdeploy/entrypoints/openai/v1/serving_chat.py

Lines changed: 2 additions & 2 deletions
@@ -302,7 +302,7 @@ async def _build_stream_response(
 max_tokens = request.max_completion_tokens or request.max_tokens
 choice_completion_tokens = response_ctx.choice_completion_tokens_dict[output.index]
 choice.finish_reason = self._calc_finish_reason(request_output, max_tokens, choice_completion_tokens)
-api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
+api_server_logger.debug("Chat Streaming response last send: %s", chunk.model_dump_json())

 yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
 if request_output.finished and response_ctx.remain_choices == 0:
@@ -339,7 +339,7 @@ async def _build_full_response(
 res = ChatCompletionResponse(
     id=ctx.request_id, model=request.model, choices=choices, created=ctx.created_time, usage=response_ctx.usage
 )
-api_server_logger.info(f"Chat response: {res.model_dump_json()}")
+api_server_logger.debug("Chat response: %s", res.model_dump_json())
 return res

 async def _create_chat_completion_choice(
