From 3d8ec3408b6be99c369ab745ef99d6aea87ccd96 Mon Sep 17 00:00:00 2001 From: Nomikfk1215 Date: Thu, 14 May 2026 01:51:34 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E7=B2=98=E8=B4=B4?= =?UTF-8?q?=E5=9B=BE=E7=89=87enter=E6=97=A0=E5=8F=8D=E5=BA=94+=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E8=B7=A8=E7=9B=AE=E5=BD=95=E8=AF=86=E5=88=AB=E5=9B=BE?= =?UTF-8?q?=E7=89=87=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internal/agent/runner_input_test.go | 79 +++++++++++++++++++++++++++++ internal/agent/turn_processing.go | 4 +- internal/context/request_test.go | 27 ++++++++++ internal/llm/capabilities.go | 25 ++++++++- internal/llm/capabilities_test.go | 25 +++++++++ tui/component_landing.go | 16 +++++- tui/component_render_test.go | 16 ++++++ tui/input_images.go | 1 + tui/input_images_test.go | 29 +++++++++++ tui/model_test.go | 47 +++++++++++++++++ tui/styles.go | 4 ++ 11 files changed, 269 insertions(+), 4 deletions(-) diff --git a/internal/agent/runner_input_test.go b/internal/agent/runner_input_test.go index 57dc7010..c018f18b 100644 --- a/internal/agent/runner_input_test.go +++ b/internal/agent/runner_input_test.go @@ -94,6 +94,85 @@ func TestRunPromptWithInputForwardsStructuredUserMessageAndAssets(t *testing.T) } } +func TestRunPromptWithInputUsesSelectedModelForImageCapabilities(t *testing.T) { + workspace := t.TempDir() + store, err := session.NewStore(t.TempDir()) + if err != nil { + t.Fatal(err) + } + sess := session.New(workspace) + + client := &fakeClient{ + replies: []llm.Message{ + llm.NewAssistantTextMessage("done"), + }, + } + runner := NewRunner(Options{ + Workspace: workspace, + Config: config.Config{ + ProviderRuntime: config.ProviderRuntimeConfig{ + CurrentProvider: "qwen", + DefaultProvider: "qwen", + DefaultModel: "qwen3.6-plus", + Providers: map[string]config.ProviderConfig{ + "qwen": {Model: "qwen3.6-plus"}, + }, + }, + MaxIterations: 2, + Stream: false, + }, + Client: client, + Store: store, + Registry: tools.DefaultRegistry(), + Stdin: strings.NewReader(""), + Stdout: io.Discard, + }) + + runner.config.Provider.Model = "text-only-model" + assetID := llm.AssetID(sess.ID + ":1") + answer, err := runner.RunPromptWithInput(context.Background(), sess, RunPromptInput{ + UserMessage: llm.Message{ + Role: llm.RoleUser, + Parts: []llm.Part{ + {Type: llm.PartImageRef, Image: &llm.ImagePartRef{AssetID: assetID}}, + }, + }, + Assets: map[llm.AssetID]llm.ImageAsset{ + assetID: { + MediaType: "image/png", + Data: []byte("png-binary"), + }, + }, + DisplayText: "[Image#1]", + }, "build", io.Discard) + if err != nil { + t.Fatal(err) + } + if answer != "done" { + t.Fatalf("unexpected answer: %q", answer) + } + if len(client.requests) == 0 { + t.Fatal("expected request to be sent") + } + if client.requests[0].Model != "qwen3.6-plus" { + t.Fatalf("expected selected runtime model, got %q", client.requests[0].Model) + } + + var latestUser llm.Message + for i := len(client.requests[0].Messages) - 1; i >= 0; i-- { + if client.requests[0].Messages[i].Role == llm.RoleUser { + latestUser = client.requests[0].Messages[i] + break + } + } + for _, part := range latestUser.Parts { + if part.Type == llm.PartImageRef && part.Image != nil && part.Image.AssetID == assetID { + return + } + } + t.Fatalf("expected request user message to keep image_ref for selected vision model, got %#v", latestUser.Parts) +} + func TestRunPromptWithInputFallsBackToDisplayTextWhenUserMessageEmpty(t *testing.T) { workspace := t.TempDir() store, err := session.NewStore(t.TempDir()) diff --git a/internal/agent/turn_processing.go b/internal/agent/turn_processing.go index c32e1253..b911dba5 100644 --- a/internal/agent/turn_processing.go +++ b/internal/agent/turn_processing.go @@ -136,14 +136,14 @@ func (e *defaultEngine) processTurn(ctx context.Context, p turnProcessParams) (s } filteredTools := runner.registry.DefinitionsForModeWithFilters(p.RunMode, p.AllowedToolNames, p.DeniedToolNames) availableToolNames := toolNames(filteredTools) + modelID := runner.modelID() request := contextpkg.BuildChatRequest(contextpkg.ChatRequestInput{ - Model: runner.config.Provider.Model, + Model: modelID, Messages: p.Messages, Tools: filteredTools, Assets: p.Assets, Temperature: 0.2, }) - request.Model = runner.modelID() streamedText := false reply, err := e.completeTurn(ctx, request, p.Out, &streamedText) diff --git a/internal/context/request_test.go b/internal/context/request_test.go index 3a1ded7b..9051f89a 100644 --- a/internal/context/request_test.go +++ b/internal/context/request_test.go @@ -69,3 +69,30 @@ func TestBuildChatRequestAppliesMessageCapabilities(t *testing.T) { t.Fatalf("expected thinking to be downgraded to text, got %#v", req.Messages[0].Parts[0]) } } + +func TestBuildChatRequestKeepsImageForProviderPrefixedQwenVisionModel(t *testing.T) { + assetID := llm.AssetID("session:1") + req := BuildChatRequest(ChatRequestInput{ + Model: "qwen/qwen3.6-plus", + Messages: []llm.Message{{ + Role: llm.RoleUser, + Parts: []llm.Part{ + {Type: llm.PartImageRef, Image: &llm.ImagePartRef{AssetID: assetID}}, + }, + }}, + Assets: map[llm.AssetID]llm.ImageAsset{ + assetID: { + MediaType: "image/png", + Data: []byte("png"), + }, + }, + }) + + if len(req.Messages) != 1 || len(req.Messages[0].Parts) != 1 { + t.Fatalf("unexpected request messages: %#v", req.Messages) + } + part := req.Messages[0].Parts[0] + if part.Type != llm.PartImageRef || part.Image == nil || part.Image.AssetID != assetID { + t.Fatalf("expected image_ref to be preserved, got %#v", part) + } +} diff --git a/internal/llm/capabilities.go b/internal/llm/capabilities.go index e641af3e..5aa13695 100644 --- a/internal/llm/capabilities.go +++ b/internal/llm/capabilities.go @@ -65,6 +65,11 @@ var DefaultModelCapabilities = NewCapabilityRegistry(map[string]ModelCapabilitie "gpt-5.4": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true}, "gpt-5.4-mini": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true}, "claude-sonnet-4": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true}, + "qwen3.6-flash": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true}, + "qwen3.6-plus": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true}, + "qwen3.6-pro": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true}, + "qwen3-vl-flash": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true}, + "qwen3-vl-plus": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true}, }) func ApplyCapabilities(messages []Message, caps ModelCapabilities) []Message { @@ -118,7 +123,7 @@ func defaultCapabilities() ModelCapabilities { func inferCapabilitiesFromModel(model string) ModelCapabilities { caps := defaultCapabilities() - if strings.Contains(model, "4o") || strings.Contains(model, "vision") || strings.Contains(model, "gpt-5") || strings.Contains(model, "claude") { + if strings.Contains(model, "4o") || strings.Contains(model, "vision") || strings.Contains(model, "gpt-5") || strings.Contains(model, "claude") || isQwenVisionModel(model) { caps.SupportsVision = true } if strings.Contains(model, "no-tool") { @@ -129,3 +134,21 @@ func inferCapabilitiesFromModel(model string) ModelCapabilities { } return caps } + +func isQwenVisionModel(model string) bool { + model = strings.TrimSpace(model) + if slash := strings.LastIndex(model, "/"); slash >= 0 && slash < len(model)-1 { + model = strings.TrimSpace(model[slash+1:]) + } + switch { + case strings.HasPrefix(model, "qwen3.6-flash"), + strings.HasPrefix(model, "qwen3.6-plus"), + strings.HasPrefix(model, "qwen3.6-pro"), + strings.Contains(model, "qwen3-vl"), + strings.Contains(model, "qwen2.5-vl"), + strings.Contains(model, "qwen-vl"): + return true + default: + return false + } +} diff --git a/internal/llm/capabilities_test.go b/internal/llm/capabilities_test.go index 48ec2ac4..05cee9eb 100644 --- a/internal/llm/capabilities_test.go +++ b/internal/llm/capabilities_test.go @@ -63,6 +63,31 @@ func TestCapabilityRegistryResolveUsesInferenceFallback(t *testing.T) { } } +func TestDefaultModelCapabilitiesRecognizeQwenVisionModels(t *testing.T) { + for _, model := range []string{ + "qwen3.6-flash", + "qwen3.6-plus", + "qwen3.6-pro", + "qwen/qwen3.6-plus", + "dashscope/qwen3.6-flash", + "qwen3-vl-flash", + "qwen2.5-vl-72b-instruct", + } { + if caps := DefaultModelCapabilities.Resolve(model); !caps.SupportsVision { + t.Fatalf("expected %s to support vision, got %#v", model, caps) + } + } +} + +func TestDefaultModelCapabilitiesDoesNotMarkGenericQwenTextModelAsVision(t *testing.T) { + if caps := DefaultModelCapabilities.Resolve("qwen3-coder-plus"); caps.SupportsVision { + t.Fatalf("expected generic qwen text model not to support vision, got %#v", caps) + } + if caps := DefaultModelCapabilities.Resolve("qwen/qwen3-coder-plus"); caps.SupportsVision { + t.Fatalf("expected provider-prefixed generic qwen text model not to support vision, got %#v", caps) + } +} + func TestApplyCapabilitiesAddsFallbackTextWhenAllPartsDropped(t *testing.T) { out := ApplyCapabilities([]Message{{ Role: RoleUser, diff --git a/tui/component_landing.go b/tui/component_landing.go index 8bd942da..4d5e35e8 100644 --- a/tui/component_landing.go +++ b/tui/component_landing.go @@ -370,6 +370,17 @@ func renderLandingShortcutHints() string { return strings.Join(parts, landingShortcutDividerStyle.Render(" ")) } +func (m model) renderLandingStatusNote() string { + note := strings.TrimSpace(m.statusNote) + if note == "" || note == "Ready." { + return "" + } + width := m.landingInputShellWidth() + return landingStatusNoteStyle.Copy(). + Width(width). + Render(compact(note, width)) +} + func (m model) renderLandingContent(markInputZone bool) string { parts := []string{ m.renderLandingHero(), @@ -383,8 +394,11 @@ func (m model) renderLandingContent(markInputZone bool) string { parts, "", m.renderLandingInputBox(markInputZone), - renderLandingShortcutHints(), ) + if status := strings.TrimSpace(m.renderLandingStatusNote()); status != "" { + parts = append(parts, status) + } + parts = append(parts, renderLandingShortcutHints()) return strings.Join(parts, "\n") } diff --git a/tui/component_render_test.go b/tui/component_render_test.go index 92bf38b3..b09959a7 100644 --- a/tui/component_render_test.go +++ b/tui/component_render_test.go @@ -80,6 +80,22 @@ func TestComponentFooterInfoRightModelAndHintPaths(t *testing.T) { } } +func TestLandingContentShowsStatusNote(t *testing.T) { + input := textarea.New() + m := model{ + screen: screenLanding, + width: 100, + height: 32, + input: input, + statusNote: "this model does not support image input", + } + + view := stripANSI(m.renderLandingContent(false)) + if !strings.Contains(view, "this model does not support image input") { + t.Fatalf("expected landing content to show status note, got %q", view) + } +} + func TestComponentFooterHintsShowEscInterruptOnlyWhenCancelable(t *testing.T) { m := model{ busy: true, diff --git a/tui/input_images.go b/tui/input_images.go index f5889b67..7138f6a4 100644 --- a/tui/input_images.go +++ b/tui/input_images.go @@ -517,6 +517,7 @@ func (m *model) handleEmptyClipboardPaste() string { } m.setInputValue(updated) m.syncInputImageRefs(updated) + m.releasePasteSubmitSuppression() if note != "" { return note } diff --git a/tui/input_images_test.go b/tui/input_images_test.go index 683b8c92..fbfafb08 100644 --- a/tui/input_images_test.go +++ b/tui/input_images_test.go @@ -147,6 +147,35 @@ func TestApplyInputImagePipelineConvertsPastedPathToPlaceholder(t *testing.T) { } } +func TestApplyInputImagePipelineConvertsPastedPathOutsideWorkspace(t *testing.T) { + m := newImagePipelineModel(t) + externalDir := t.TempDir() + if filepath.Clean(externalDir) == filepath.Clean(m.workspace) { + t.Fatal("test setup expected external dir outside workspace") + } + imagePath := filepath.Join(externalDir, "external.png") + if err := os.WriteFile(imagePath, []byte("png-outside-workspace"), 0o644); err != nil { + t.Fatalf("write image fixture: %v", err) + } + + updated, note := m.applyInputImagePipeline("", imagePath, "ctrl+v") + if updated != "[Image#1]" { + t.Fatalf("expected external image path placeholder, got %q", updated) + } + if !strings.Contains(note, "Attached 1 image") { + t.Fatalf("expected attach note, got %q", note) + } + + assetID := findAssetIDByImageID(t, m, 1) + blob, err := m.imageStore.GetImageByAssetID(context.Background(), corepkg.SessionID(m.sess.ID), assetID) + if err != nil { + t.Fatalf("read stored image: %v", err) + } + if !bytes.Equal(blob.Data, []byte("png-outside-workspace")) { + t.Fatalf("unexpected stored bytes: %q", string(blob.Data)) + } +} + func TestParseClipboardImageOutputJSONPayload(t *testing.T) { raw := "{\"media_type\":\"image/jpeg\",\"file_name\":\"copied.jpg\",\"data\":\"" + base64.StdEncoding.EncodeToString([]byte("jpeg-bytes")) + "\"}\n" mediaType, data, fileName, err := parseClipboardImageOutput(raw) diff --git a/tui/model_test.go b/tui/model_test.go index 795c5c60..e3d56fd3 100644 --- a/tui/model_test.go +++ b/tui/model_test.go @@ -2833,6 +2833,26 @@ func TestSubmitPromptImageUnsupportedKeepsInput(t *testing.T) { } } +func TestSubmitPromptQwenVisionImageSupported(t *testing.T) { + m := newImagePipelineModel(t) + m.screen = screenChat + m.cfg.Provider.Model = "qwen3.6-plus" + placeholder := mustIngestTestImage(t, m, "image") + m.input.SetWidth(40) + m.input.SetHeight(3) + m.input.SetValue("please read " + placeholder) + m.input.CursorEnd() + + got, _ := m.handleKey(tea.KeyMsg{Type: tea.KeyEnter}) + updated := got.(model) + if len(updated.chatItems) != 1 { + t.Fatalf("expected qwen vision image prompt to submit, got %d chat items with status %q", len(updated.chatItems), updated.statusNote) + } + if !strings.Contains(updated.chatItems[0].Body, placeholder) { + t.Fatalf("expected submitted body to include image placeholder, got %q", updated.chatItems[0].Body) + } +} + func TestAltEnterInsertsNewlineWithoutSubmitting(t *testing.T) { input := textarea.New() input.Focus() @@ -3225,6 +3245,33 @@ func TestTerminalPasteEventWithEmptyPayloadPastesClipboardImage(t *testing.T) { } } +func TestTerminalPasteEventWithEmptyPayloadEnterSubmitsClipboardImage(t *testing.T) { + m := newImagePipelineModel(t) + m.screen = screenLanding + m.cfg.Provider.Model = "gpt-4o" + m.clipboard = fakeClipboardImageReader{ + mediaType: "image/png", + data: []byte("clipboard"), + fileName: "clipboard.png", + } + + got, _ := m.handleKey(tea.KeyMsg{Type: tea.KeyRunes, Paste: true}) + afterPaste := got.(model) + got, cmd := afterPaste.handleKey(tea.KeyMsg{Type: tea.KeyEnter}) + updated := got.(model) + _ = cmd + + if len(updated.chatItems) != 1 { + t.Fatalf("expected enter after image paste to submit, got %d chat items", len(updated.chatItems)) + } + if updated.screen != screenChat { + t.Fatalf("expected submitted image prompt to enter chat screen, got %q", updated.screen) + } + if updated.chatItems[0].Body != "[Image#1]" { + t.Fatalf("expected submitted image placeholder body, got %q", updated.chatItems[0].Body) + } +} + func TestTerminalPasteEventWithTextDoesNotForceClipboardImage(t *testing.T) { m := newImagePipelineModel(t) m.screen = screenChat diff --git a/tui/styles.go b/tui/styles.go index a89e9149..f2918b13 100644 --- a/tui/styles.go +++ b/tui/styles.go @@ -213,6 +213,10 @@ var ( Foreground(lipgloss.Color("#5E7DA4")). Background(colorLandingPanel) + landingStatusNoteStyle = lipgloss.NewStyle(). + Foreground(lipgloss.Color("#F8C471")). + Background(colorLandingPanel) + landingTipDotStyle = lipgloss.NewStyle(). Foreground(lipgloss.Color("#F59E0B"))