1024XEngineer · Nomikfk1215 · May 13, 2026 · May 13, 2026
diff --git a/internal/agent/runner_input_test.go b/internal/agent/runner_input_test.go
@@ -94,6 +94,85 @@ func TestRunPromptWithInputForwardsStructuredUserMessageAndAssets(t *testing.T)
 	}
 }
 
+// TestRunPromptWithInputUsesSelectedModelForImageCapabilities overwrites
+// runner.config.Provider.Model with a different model name and asserts that the
+// outgoing request still names "qwen3.6-plus" and keeps the user's image_ref part.
+func TestRunPromptWithInputUsesSelectedModelForImageCapabilities(t *testing.T) {
+	workspace := t.TempDir()
+	store, err := session.NewStore(t.TempDir())
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess := session.New(workspace)
+	client := &fakeClient{replies: []llm.Message{llm.NewAssistantTextMessage("done")}}
+
+	const selectedModel = "qwen3.6-plus"
+	runtimeCfg := config.ProviderRuntimeConfig{
+		CurrentProvider: "qwen",
+		DefaultProvider: "qwen",
+		DefaultModel:    selectedModel,
+		Providers:       map[string]config.ProviderConfig{"qwen": {Model: selectedModel}},
+	}
+	runner := NewRunner(Options{
+		Workspace: workspace,
+		Config:    config.Config{ProviderRuntime: runtimeCfg, MaxIterations: 2, Stream: false},
+		Client:    client,
+		Store:     store,
+		Registry:  tools.DefaultRegistry(),
+		Stdin:     strings.NewReader(""),
+		Stdout:    io.Discard,
+	})
+
+	// Make config.Provider.Model disagree with the runtime selection; the
+	// assertions below expect the request to ignore this value.
+	runner.config.Provider.Model = "text-only-model"
+	assetID := llm.AssetID(sess.ID + ":1")
+	input := RunPromptInput{
+		UserMessage: llm.Message{
+			Role:  llm.RoleUser,
+			Parts: []llm.Part{{Type: llm.PartImageRef, Image: &llm.ImagePartRef{AssetID: assetID}}},
+		},
+		Assets: map[llm.AssetID]llm.ImageAsset{
+			assetID: {MediaType: "image/png", Data: []byte("png-binary")},
+		},
+		DisplayText: "[Image#1]",
+	}
+	answer, err := runner.RunPromptWithInput(context.Background(), sess, input, "build", io.Discard)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if answer != "done" {
+		t.Fatalf("unexpected answer: %q", answer)
+	}
+	if len(client.requests) == 0 {
+		t.Fatal("expected request to be sent")
+	}
+
+	first := client.requests[0]
+	if first.Model != selectedModel {
+		t.Fatalf("expected selected runtime model, got %q", first.Model)
+	}
+
+	// Walk backwards so the most recent user message is the one inspected.
+	var latestUser llm.Message
+	for i := len(first.Messages) - 1; i >= 0; i-- {
+		if msg := first.Messages[i]; msg.Role == llm.RoleUser {
+			latestUser = msg
+			break
+		}
+	}
+	kept := false
+	for _, part := range latestUser.Parts {
+		if part.Type == llm.PartImageRef && part.Image != nil && part.Image.AssetID == assetID {
+			kept = true
+			break
+		}
+	}
+	if !kept {
+		t.Fatalf("expected request user message to keep image_ref for selected vision model, got %#v", latestUser.Parts)
+	}
+}
+
 func TestRunPromptWithInputFallsBackToDisplayTextWhenUserMessageEmpty(t *testing.T) {
 	workspace := t.TempDir()
 	store, err := session.NewStore(t.TempDir())

diff --git a/internal/agent/turn_processing.go b/internal/agent/turn_processing.go
@@ -136,14 +136,14 @@ func (e *defaultEngine) processTurn(ctx context.Context, p turnProcessParams) (s
 	}
 	filteredTools := runner.registry.DefinitionsForModeWithFilters(p.RunMode, p.AllowedToolNames, p.DeniedToolNames)
 	availableToolNames := toolNames(filteredTools)
+	modelID := runner.modelID()
 	request := contextpkg.BuildChatRequest(contextpkg.ChatRequestInput{
-		Model:       runner.config.Provider.Model,
+		Model:       modelID,
 		Messages:    p.Messages,
 		Tools:       filteredTools,
 		Assets:      p.Assets,
 		Temperature: 0.2,
 	})
-	request.Model = runner.modelID()
 
 	streamedText := false
 	reply, err := e.completeTurn(ctx, request, p.Out, &streamedText)

diff --git a/internal/context/request_test.go b/internal/context/request_test.go
@@ -69,3 +69,30 @@ func TestBuildChatRequestAppliesMessageCapabilities(t *testing.T) {
 		t.Fatalf("expected thinking to be downgraded to text, got %#v", req.Messages[0].Parts[0])
 	}
 }
+
+// TestBuildChatRequestKeepsImageForProviderPrefixedQwenVisionModel builds a
+// request for the provider-prefixed model "qwen/qwen3.6-plus" and checks that
+// the single image_ref part of the user message comes back unchanged.
+func TestBuildChatRequestKeepsImageForProviderPrefixedQwenVisionModel(t *testing.T) {
+	assetID := llm.AssetID("session:1")
+	userMsg := llm.Message{
+		Role:  llm.RoleUser,
+		Parts: []llm.Part{{Type: llm.PartImageRef, Image: &llm.ImagePartRef{AssetID: assetID}}},
+	}
+	req := BuildChatRequest(ChatRequestInput{
+		Model:    "qwen/qwen3.6-plus",
+		Messages: []llm.Message{userMsg},
+		Assets: map[llm.AssetID]llm.ImageAsset{
+			assetID: {MediaType: "image/png", Data: []byte("png")},
+		},
+	})
+
+	if len(req.Messages) != 1 || len(req.Messages[0].Parts) != 1 {
+		t.Fatalf("unexpected request messages: %#v", req.Messages)
+	}
+	got := req.Messages[0].Parts[0]
+	preserved := got.Type == llm.PartImageRef && got.Image != nil && got.Image.AssetID == assetID
+	if !preserved {
+		t.Fatalf("expected image_ref to be preserved, got %#v", got)
+	}
+}
diff --git a/internal/llm/capabilities.go b/internal/llm/capabilities.go
@@ -65,6 +65,11 @@ var DefaultModelCapabilities = NewCapabilityRegistry(map[string]ModelCapabilitie
 	"gpt-5.4":         {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
 	"gpt-5.4-mini":    {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
 	"claude-sonnet-4": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
+	"qwen3.6-flash":   {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
+	"qwen3.6-plus":    {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
+	"qwen3.6-pro":     {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
+	"qwen3-vl-flash":  {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
+	"qwen3-vl-plus":   {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
 })
 
 func ApplyCapabilities(messages []Message, caps ModelCapabilities) []Message {
@@ -118,7 +123,7 @@ func defaultCapabilities() ModelCapabilities {
 
 func inferCapabilitiesFromModel(model string) ModelCapabilities {
 	caps := defaultCapabilities()
-	if strings.Contains(model, "4o") || strings.Contains(model, "vision") || strings.Contains(model, "gpt-5") || strings.Contains(model, "claude") {
+	if strings.Contains(model, "4o") || strings.Contains(model, "vision") || strings.Contains(model, "gpt-5") || strings.Contains(model, "claude") || isQwenVisionModel(model) {
 		caps.SupportsVision = true
 	}
 	if strings.Contains(model, "no-tool") {
@@ -129,3 +134,21 @@ func inferCapabilitiesFromModel(model string) ModelCapabilities {
 	}
 	return caps
 }
+
+// isQwenVisionModel reports whether model names a Qwen model treated as vision
+// capable. Any provider prefix ("qwen/", "dashscope/") is dropped and the name is
+// lower-cased first, so hosted IDs such as "Qwen/Qwen2.5-VL-72B-Instruct" match.
+func isQwenVisionModel(model string) bool {
+	name := strings.ToLower(strings.TrimSpace(model))
+	if slash := strings.LastIndex(name, "/"); slash >= 0 && slash < len(name)-1 {
+		name = strings.TrimSpace(name[slash+1:])
+	}
+	// The qwen3.6 tiers are matched by prefix, so suffixed variants also count.
+	for _, prefix := range []string{"qwen3.6-flash", "qwen3.6-plus", "qwen3.6-pro"} {
+		if strings.HasPrefix(name, prefix) {
+			return true
+		}
+	}
+	// The VL families are matched anywhere in the remaining name.
+	return strings.Contains(name, "qwen3-vl") || strings.Contains(name, "qwen2.5-vl") || strings.Contains(name, "qwen-vl")
+}
diff --git a/internal/llm/capabilities_test.go b/internal/llm/capabilities_test.go
@@ -63,6 +63,31 @@ func TestCapabilityRegistryResolveUsesInferenceFallback(t *testing.T) {
 	}
 }
 
+// TestDefaultModelCapabilitiesRecognizeQwenVisionModels resolves bare and
+// provider-prefixed Qwen IDs and requires SupportsVision on every one of them.
+func TestDefaultModelCapabilitiesRecognizeQwenVisionModels(t *testing.T) {
+	visionModels := []string{
+		"qwen3.6-flash", "qwen3.6-plus", "qwen3.6-pro",
+		"qwen/qwen3.6-plus", "dashscope/qwen3.6-flash",
+		"qwen3-vl-flash", "qwen2.5-vl-72b-instruct",
+	}
+	for _, id := range visionModels {
+		caps := DefaultModelCapabilities.Resolve(id)
+		if !caps.SupportsVision {
+			t.Fatalf("expected %s to support vision, got %#v", id, caps)
+		}
+	}
+}
+
+// The Qwen coder model must not resolve as vision-capable, with or without a provider prefix.
+func TestDefaultModelCapabilitiesDoesNotMarkGenericQwenTextModelAsVision(t *testing.T) {
+	if caps := DefaultModelCapabilities.Resolve("qwen3-coder-plus"); caps.SupportsVision {
+		t.Fatalf("expected generic qwen text model not to support vision, got %#v", caps)
+	} else if caps = DefaultModelCapabilities.Resolve("qwen/qwen3-coder-plus"); caps.SupportsVision {
+		t.Fatalf("expected provider-prefixed generic qwen text model not to support vision, got %#v", caps)
+	}
+}
+
 func TestApplyCapabilitiesAddsFallbackTextWhenAllPartsDropped(t *testing.T) {
 	out := ApplyCapabilities([]Message{{
 		Role: RoleUser,

diff --git a/tui/component_landing.go b/tui/component_landing.go
@@ -370,6 +370,17 @@ func renderLandingShortcutHints() string {
 	return strings.Join(parts, landingShortcutDividerStyle.Render("   "))
 }
 
+// renderLandingStatusNote renders the trimmed status note at the landing input shell width; "" and "Ready." yield "".
+func (m model) renderLandingStatusNote() string {
+	switch note := strings.TrimSpace(m.statusNote); note {
+	case "", "Ready.":
+		return ""
+	default:
+		width := m.landingInputShellWidth()
+		return landingStatusNoteStyle.Copy().Width(width).Render(compact(note, width))
+	}
+}
+
 func (m model) renderLandingContent(markInputZone bool) string {
 	parts := []string{
 		m.renderLandingHero(),
@@ -383,8 +394,11 @@ func (m model) renderLandingContent(markInputZone bool) string {
 		parts,
 		"",
 		m.renderLandingInputBox(markInputZone),
-		renderLandingShortcutHints(),
 	)
+	if status := strings.TrimSpace(m.renderLandingStatusNote()); status != "" {
+		parts = append(parts, status)
+	}
+	parts = append(parts, renderLandingShortcutHints())
 	return strings.Join(parts, "\n")
 }
 

diff --git a/tui/component_render_test.go b/tui/component_render_test.go
@@ -80,6 +80,22 @@ func TestComponentFooterInfoRightModelAndHintPaths(t *testing.T) {
 	}
 }
 
+// TestLandingContentShowsStatusNote expects a set status note to appear in the rendered landing content.
+func TestLandingContentShowsStatusNote(t *testing.T) {
+	const note = "this model does not support image input"
+	landing := model{
+		screen:     screenLanding,
+		width:      100,
+		height:     32,
+		input:      textarea.New(),
+		statusNote: note,
+	}
+	rendered := stripANSI(landing.renderLandingContent(false))
+	if !strings.Contains(rendered, note) {
+		t.Fatalf("expected landing content to show status note, got %q", rendered)
+	}
+}
+
 func TestComponentFooterHintsShowEscInterruptOnlyWhenCancelable(t *testing.T) {
 	m := model{
 		busy:      true,

diff --git a/tui/input_images.go b/tui/input_images.go
@@ -517,6 +517,7 @@ func (m *model) handleEmptyClipboardPaste() string {
 	}
 	m.setInputValue(updated)
 	m.syncInputImageRefs(updated)
+	m.releasePasteSubmitSuppression()
 	if note != "" {
 		return note
 	}

diff --git a/tui/input_images_test.go b/tui/input_images_test.go
@@ -147,6 +147,35 @@ func TestApplyInputImagePipelineConvertsPastedPathToPlaceholder(t *testing.T) {
 	}
 }
 
+// A pasted image path from a temp dir other than the workspace must be ingested as [Image#1].
+func TestApplyInputImagePipelineConvertsPastedPathOutsideWorkspace(t *testing.T) {
+	fixture := []byte("png-outside-workspace")
+	m := newImagePipelineModel(t)
+	externalDir := t.TempDir()
+	if filepath.Clean(m.workspace) == filepath.Clean(externalDir) {
+		t.Fatal("test setup expected external dir outside workspace")
+	}
+	imagePath := filepath.Join(externalDir, "external.png")
+	if err := os.WriteFile(imagePath, fixture, 0o644); err != nil {
+		t.Fatalf("write image fixture: %v", err)
+	}
+	text, status := m.applyInputImagePipeline("", imagePath, "ctrl+v")
+	switch {
+	case text != "[Image#1]":
+		t.Fatalf("expected external image path placeholder, got %q", text)
+	case !strings.Contains(status, "Attached 1 image"):
+		t.Fatalf("expected attach note, got %q", status)
+	}
+	assetID := findAssetIDByImageID(t, m, 1)
+	stored, err := m.imageStore.GetImageByAssetID(context.Background(), corepkg.SessionID(m.sess.ID), assetID)
+	if err != nil {
+		t.Fatalf("read stored image: %v", err)
+	}
+	if !bytes.Equal(stored.Data, fixture) {
+		t.Fatalf("unexpected stored bytes: %q", string(stored.Data))
+	}
+}
+
 func TestParseClipboardImageOutputJSONPayload(t *testing.T) {
 	raw := "{\"media_type\":\"image/jpeg\",\"file_name\":\"copied.jpg\",\"data\":\"" + base64.StdEncoding.EncodeToString([]byte("jpeg-bytes")) + "\"}\n"
 	mediaType, data, fileName, err := parseClipboardImageOutput(raw)

diff --git a/tui/model_test.go b/tui/model_test.go
@@ -2833,6 +2833,26 @@ func TestSubmitPromptImageUnsupportedKeepsInput(t *testing.T) {
 	}
 }
 
+// TestSubmitPromptQwenVisionImageSupported presses Enter on a prompt containing an
+// image placeholder with the model set to "qwen3.6-plus" and expects one chat item.
+func TestSubmitPromptQwenVisionImageSupported(t *testing.T) {
+	m := newImagePipelineModel(t)
+	m.screen, m.cfg.Provider.Model = screenChat, "qwen3.6-plus"
+	placeholder := mustIngestTestImage(t, m, "image")
+	m.input.SetWidth(40)
+	m.input.SetHeight(3)
+	m.input.SetValue("please read " + placeholder)
+	m.input.CursorEnd()
+	next, _ := m.handleKey(tea.KeyMsg{Type: tea.KeyEnter})
+	after := next.(model)
+	if n := len(after.chatItems); n != 1 {
+		t.Fatalf("expected qwen vision image prompt to submit, got %d chat items with status %q", n, after.statusNote)
+	}
+	if body := after.chatItems[0].Body; !strings.Contains(body, placeholder) {
+		t.Fatalf("expected submitted body to include image placeholder, got %q", body)
+	}
+}
+
 func TestAltEnterInsertsNewlineWithoutSubmitting(t *testing.T) {
 	input := textarea.New()
 	input.Focus()
@@ -3225,6 +3245,33 @@ func TestTerminalPasteEventWithEmptyPayloadPastesClipboardImage(t *testing.T) {
 	}
 }
 
+// TestTerminalPasteEventWithEmptyPayloadEnterSubmitsClipboardImage sends a paste
+// event without runes while the clipboard reader holds a PNG, then presses Enter
+// and expects the [Image#1] prompt to be submitted and the chat screen shown.
+func TestTerminalPasteEventWithEmptyPayloadEnterSubmitsClipboardImage(t *testing.T) {
+	m := newImagePipelineModel(t)
+	m.screen = screenLanding
+	m.cfg.Provider.Model = "gpt-4o"
+	m.clipboard = fakeClipboardImageReader{mediaType: "image/png", data: []byte("clipboard"), fileName: "clipboard.png"}
+
+	// Paste is flagged but no runes are attached to the key message.
+	pasted, _ := m.handleKey(tea.KeyMsg{Type: tea.KeyRunes, Paste: true})
+	afterPaste := pasted.(model)
+
+	// Only the resulting model is checked; the returned command is dropped.
+	submitted, _ := afterPaste.handleKey(tea.KeyMsg{Type: tea.KeyEnter})
+	updated := submitted.(model)
+
+	switch {
+	case len(updated.chatItems) != 1:
+		t.Fatalf("expected enter after image paste to submit, got %d chat items", len(updated.chatItems))
+	case updated.screen != screenChat:
+		t.Fatalf("expected submitted image prompt to enter chat screen, got %q", updated.screen)
+	case updated.chatItems[0].Body != "[Image#1]":
+		t.Fatalf("expected submitted image placeholder body, got %q", updated.chatItems[0].Body)
+	}
+}
+
 func TestTerminalPasteEventWithTextDoesNotForceClipboardImage(t *testing.T) {
 	m := newImagePipelineModel(t)
 	m.screen = screenChat

diff --git a/tui/styles.go b/tui/styles.go
@@ -213,6 +213,10 @@ var (
 					Foreground(lipgloss.Color("#5E7DA4")).
 					Background(colorLandingPanel)
 
+	landingStatusNoteStyle = lipgloss.NewStyle().
+				Foreground(lipgloss.Color("#F8C471")).
+				Background(colorLandingPanel)
+
 	landingTipDotStyle = lipgloss.NewStyle().
 				Foreground(lipgloss.Color("#F59E0B"))