Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions internal/agent/runner_input_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,85 @@ func TestRunPromptWithInputForwardsStructuredUserMessageAndAssets(t *testing.T)
}
}

// TestRunPromptWithInputUsesSelectedModelForImageCapabilities sends an
// image-only user message through a runner whose provider runtime selects
// "qwen3.6-plus" while runner.config.Provider.Model is overwritten with a
// different name. It asserts that the outgoing request carries the runtime
// model and that the latest user message still holds the image_ref part.
func TestRunPromptWithInputUsesSelectedModelForImageCapabilities(t *testing.T) {
	workspace := t.TempDir()
	store, err := session.NewStore(t.TempDir())
	if err != nil {
		t.Fatal(err)
	}
	sess := session.New(workspace)

	client := &fakeClient{
		replies: []llm.Message{llm.NewAssistantTextMessage("done")},
	}
	cfg := config.Config{
		ProviderRuntime: config.ProviderRuntimeConfig{
			CurrentProvider: "qwen",
			DefaultProvider: "qwen",
			DefaultModel:    "qwen3.6-plus",
			Providers: map[string]config.ProviderConfig{
				"qwen": {Model: "qwen3.6-plus"},
			},
		},
		MaxIterations: 2,
		Stream:        false,
	}
	runner := NewRunner(Options{
		Workspace: workspace,
		Config:    cfg,
		Client:    client,
		Store:     store,
		Registry:  tools.DefaultRegistry(),
		Stdin:     strings.NewReader(""),
		Stdout:    io.Discard,
	})

	// Overwrite the provider-level model so the assertions below can tell
	// which model ID actually reached the request.
	runner.config.Provider.Model = "text-only-model"

	assetID := llm.AssetID(sess.ID + ":1")
	input := RunPromptInput{
		UserMessage: llm.Message{
			Role: llm.RoleUser,
			Parts: []llm.Part{
				{Type: llm.PartImageRef, Image: &llm.ImagePartRef{AssetID: assetID}},
			},
		},
		Assets: map[llm.AssetID]llm.ImageAsset{
			assetID: {
				MediaType: "image/png",
				Data:      []byte("png-binary"),
			},
		},
		DisplayText: "[Image#1]",
	}
	answer, err := runner.RunPromptWithInput(context.Background(), sess, input, "build", io.Discard)
	if err != nil {
		t.Fatal(err)
	}
	if answer != "done" {
		t.Fatalf("unexpected answer: %q", answer)
	}
	if len(client.requests) == 0 {
		t.Fatal("expected request to be sent")
	}
	first := client.requests[0]
	if first.Model != "qwen3.6-plus" {
		t.Fatalf("expected selected runtime model, got %q", first.Model)
	}

	// Walk backwards so the most recent user message wins.
	var latestUser llm.Message
	for i := len(first.Messages) - 1; i >= 0; i-- {
		if msg := first.Messages[i]; msg.Role == llm.RoleUser {
			latestUser = msg
			break
		}
	}

	kept := false
	for _, part := range latestUser.Parts {
		if part.Type != llm.PartImageRef || part.Image == nil {
			continue
		}
		if part.Image.AssetID == assetID {
			kept = true
			break
		}
	}
	if !kept {
		t.Fatalf("expected request user message to keep image_ref for selected vision model, got %#v", latestUser.Parts)
	}
}

func TestRunPromptWithInputFallsBackToDisplayTextWhenUserMessageEmpty(t *testing.T) {
workspace := t.TempDir()
store, err := session.NewStore(t.TempDir())
Expand Down
4 changes: 2 additions & 2 deletions internal/agent/turn_processing.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,14 @@ func (e *defaultEngine) processTurn(ctx context.Context, p turnProcessParams) (s
}
filteredTools := runner.registry.DefinitionsForModeWithFilters(p.RunMode, p.AllowedToolNames, p.DeniedToolNames)
availableToolNames := toolNames(filteredTools)
modelID := runner.modelID()
request := contextpkg.BuildChatRequest(contextpkg.ChatRequestInput{
Model: runner.config.Provider.Model,
Model: modelID,
Messages: p.Messages,
Tools: filteredTools,
Assets: p.Assets,
Temperature: 0.2,
})
request.Model = runner.modelID()

streamedText := false
reply, err := e.completeTurn(ctx, request, p.Out, &streamedText)
Expand Down
27 changes: 27 additions & 0 deletions internal/context/request_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,30 @@ func TestBuildChatRequestAppliesMessageCapabilities(t *testing.T) {
t.Fatalf("expected thinking to be downgraded to text, got %#v", req.Messages[0].Parts[0])
}
}

// TestBuildChatRequestKeepsImageForProviderPrefixedQwenVisionModel builds a
// chat request for the provider-prefixed model ID "qwen/qwen3.6-plus" with a
// single image_ref part and asserts the part is still present, unchanged, in
// the resulting request.
func TestBuildChatRequestKeepsImageForProviderPrefixedQwenVisionModel(t *testing.T) {
	assetID := llm.AssetID("session:1")

	userMessage := llm.Message{
		Role: llm.RoleUser,
		Parts: []llm.Part{
			{Type: llm.PartImageRef, Image: &llm.ImagePartRef{AssetID: assetID}},
		},
	}
	assets := map[llm.AssetID]llm.ImageAsset{
		assetID: {
			MediaType: "image/png",
			Data:      []byte("png"),
		},
	}

	req := BuildChatRequest(ChatRequestInput{
		Model:    "qwen/qwen3.6-plus",
		Messages: []llm.Message{userMessage},
		Assets:   assets,
	})

	// Exactly one message with exactly one part is expected back.
	if len(req.Messages) != 1 || len(req.Messages[0].Parts) != 1 {
		t.Fatalf("unexpected request messages: %#v", req.Messages)
	}

	part := req.Messages[0].Parts[0]
	preserved := part.Type == llm.PartImageRef && part.Image != nil && part.Image.AssetID == assetID
	if !preserved {
		t.Fatalf("expected image_ref to be preserved, got %#v", part)
	}
}
25 changes: 24 additions & 1 deletion internal/llm/capabilities.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ var DefaultModelCapabilities = NewCapabilityRegistry(map[string]ModelCapabilitie
"gpt-5.4": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
"gpt-5.4-mini": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
"claude-sonnet-4": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
"qwen3.6-flash": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
"qwen3.6-plus": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
"qwen3.6-pro": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
"qwen3-vl-flash": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
"qwen3-vl-plus": {SupportsVision: true, SupportsToolUse: true, SupportsThinking: true},
})

func ApplyCapabilities(messages []Message, caps ModelCapabilities) []Message {
Expand Down Expand Up @@ -118,7 +123,7 @@ func defaultCapabilities() ModelCapabilities {

func inferCapabilitiesFromModel(model string) ModelCapabilities {
caps := defaultCapabilities()
if strings.Contains(model, "4o") || strings.Contains(model, "vision") || strings.Contains(model, "gpt-5") || strings.Contains(model, "claude") {
if strings.Contains(model, "4o") || strings.Contains(model, "vision") || strings.Contains(model, "gpt-5") || strings.Contains(model, "claude") || isQwenVisionModel(model) {
caps.SupportsVision = true
}
if strings.Contains(model, "no-tool") {
Expand All @@ -129,3 +134,21 @@ func inferCapabilitiesFromModel(model string) ModelCapabilities {
}
return caps
}

// isQwenVisionModel reports whether model names a Qwen model that this
// package treats as accepting image input.
//
// The ID is trimmed and lower-cased, and any provider prefix is dropped by
// keeping only the text after the last "/" (so "qwen/qwen3.6-plus" and
// "Qwen/Qwen2.5-VL-72B-Instruct" are both inspected by their final segment).
// A trailing "/" is not treated as a prefix separator.
//
// A model matches when its final segment starts with one of the qwen3.6
// flash/plus/pro names, or contains one of the "-vl" family markers.
// Matching is case-insensitive because some providers publish these IDs in
// mixed case.
func isQwenVisionModel(model string) bool {
	model = strings.ToLower(strings.TrimSpace(model))
	if slash := strings.LastIndex(model, "/"); slash >= 0 && slash < len(model)-1 {
		model = strings.TrimSpace(model[slash+1:])
	}
	switch {
	case strings.HasPrefix(model, "qwen3.6-flash"),
		strings.HasPrefix(model, "qwen3.6-plus"),
		strings.HasPrefix(model, "qwen3.6-pro"),
		strings.Contains(model, "qwen3-vl"),
		strings.Contains(model, "qwen2.5-vl"),
		strings.Contains(model, "qwen-vl"):
		return true
	default:
		return false
	}
}
25 changes: 25 additions & 0 deletions internal/llm/capabilities_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,31 @@ func TestCapabilityRegistryResolveUsesInferenceFallback(t *testing.T) {
}
}

// TestDefaultModelCapabilitiesRecognizeQwenVisionModels resolves a list of
// Qwen model IDs (bare and provider-prefixed) through DefaultModelCapabilities
// and fails on the first one that is not reported as vision-capable.
func TestDefaultModelCapabilitiesRecognizeQwenVisionModels(t *testing.T) {
	visionModels := []string{
		"qwen3.6-flash",
		"qwen3.6-plus",
		"qwen3.6-pro",
		"qwen/qwen3.6-plus",
		"dashscope/qwen3.6-flash",
		"qwen3-vl-flash",
		"qwen2.5-vl-72b-instruct",
	}
	for _, model := range visionModels {
		caps := DefaultModelCapabilities.Resolve(model)
		if caps.SupportsVision {
			continue
		}
		t.Fatalf("expected %s to support vision, got %#v", model, caps)
	}
}

// TestDefaultModelCapabilitiesDoesNotMarkGenericQwenTextModelAsVision checks
// that "qwen3-coder-plus", with and without a provider prefix, does not
// resolve to vision support.
func TestDefaultModelCapabilitiesDoesNotMarkGenericQwenTextModelAsVision(t *testing.T) {
	bare := DefaultModelCapabilities.Resolve("qwen3-coder-plus")
	if bare.SupportsVision {
		t.Fatalf("expected generic qwen text model not to support vision, got %#v", bare)
	}

	prefixed := DefaultModelCapabilities.Resolve("qwen/qwen3-coder-plus")
	if prefixed.SupportsVision {
		t.Fatalf("expected provider-prefixed generic qwen text model not to support vision, got %#v", prefixed)
	}
}

func TestApplyCapabilitiesAddsFallbackTextWhenAllPartsDropped(t *testing.T) {
out := ApplyCapabilities([]Message{{
Role: RoleUser,
Expand Down
16 changes: 15 additions & 1 deletion tui/component_landing.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,17 @@ func renderLandingShortcutHints() string {
return strings.Join(parts, landingShortcutDividerStyle.Render(" "))
}

// renderLandingStatusNote renders the model's status note for the landing
// screen. It returns "" when the trimmed note is empty or is the idle
// "Ready." message; otherwise the note is compacted to, and styled at, the
// landing input shell width.
func (m model) renderLandingStatusNote() string {
	switch note := strings.TrimSpace(m.statusNote); note {
	case "", "Ready.":
		return ""
	default:
		width := m.landingInputShellWidth()
		style := landingStatusNoteStyle.Copy().Width(width)
		return style.Render(compact(note, width))
	}
}

func (m model) renderLandingContent(markInputZone bool) string {
parts := []string{
m.renderLandingHero(),
Expand All @@ -383,8 +394,11 @@ func (m model) renderLandingContent(markInputZone bool) string {
parts,
"",
m.renderLandingInputBox(markInputZone),
renderLandingShortcutHints(),
)
if status := strings.TrimSpace(m.renderLandingStatusNote()); status != "" {
parts = append(parts, status)
}
parts = append(parts, renderLandingShortcutHints())
return strings.Join(parts, "\n")
}

Expand Down
16 changes: 16 additions & 0 deletions tui/component_render_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,22 @@ func TestComponentFooterInfoRightModelAndHintPaths(t *testing.T) {
}
}

func TestLandingContentShowsStatusNote(t *testing.T) {
input := textarea.New()
m := model{
screen: screenLanding,
width: 100,
height: 32,
input: input,
statusNote: "this model does not support image input",
}

view := stripANSI(m.renderLandingContent(false))
if !strings.Contains(view, "this model does not support image input") {
t.Fatalf("expected landing content to show status note, got %q", view)
}
}

func TestComponentFooterHintsShowEscInterruptOnlyWhenCancelable(t *testing.T) {
m := model{
busy: true,
Expand Down
1 change: 1 addition & 0 deletions tui/input_images.go
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,7 @@ func (m *model) handleEmptyClipboardPaste() string {
}
m.setInputValue(updated)
m.syncInputImageRefs(updated)
m.releasePasteSubmitSuppression()
if note != "" {
return note
}
Expand Down
29 changes: 29 additions & 0 deletions tui/input_images_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,35 @@ func TestApplyInputImagePipelineConvertsPastedPathToPlaceholder(t *testing.T) {
}
}

// TestApplyInputImagePipelineConvertsPastedPathOutsideWorkspace pastes the
// path of an image file that lives in a temp dir separate from the model's
// workspace. It asserts that the input becomes the "[Image#1]" placeholder,
// that an attach note is returned, and that the stored image bytes equal the
// file's contents.
func TestApplyInputImagePipelineConvertsPastedPathOutsideWorkspace(t *testing.T) {
	m := newImagePipelineModel(t)

	// Guard the fixture: the image must not sit in the workspace directory.
	externalDir := t.TempDir()
	if filepath.Clean(externalDir) == filepath.Clean(m.workspace) {
		t.Fatal("test setup expected external dir outside workspace")
	}

	payload := []byte("png-outside-workspace")
	imagePath := filepath.Join(externalDir, "external.png")
	if err := os.WriteFile(imagePath, payload, 0o644); err != nil {
		t.Fatalf("write image fixture: %v", err)
	}

	updated, note := m.applyInputImagePipeline("", imagePath, "ctrl+v")
	if updated != "[Image#1]" {
		t.Fatalf("expected external image path placeholder, got %q", updated)
	}
	if !strings.Contains(note, "Attached 1 image") {
		t.Fatalf("expected attach note, got %q", note)
	}

	assetID := findAssetIDByImageID(t, m, 1)
	sessionID := corepkg.SessionID(m.sess.ID)
	blob, err := m.imageStore.GetImageByAssetID(context.Background(), sessionID, assetID)
	if err != nil {
		t.Fatalf("read stored image: %v", err)
	}
	if !bytes.Equal(blob.Data, payload) {
		t.Fatalf("unexpected stored bytes: %q", string(blob.Data))
	}
}

func TestParseClipboardImageOutputJSONPayload(t *testing.T) {
raw := "{\"media_type\":\"image/jpeg\",\"file_name\":\"copied.jpg\",\"data\":\"" + base64.StdEncoding.EncodeToString([]byte("jpeg-bytes")) + "\"}\n"
mediaType, data, fileName, err := parseClipboardImageOutput(raw)
Expand Down
47 changes: 47 additions & 0 deletions tui/model_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2833,6 +2833,26 @@ func TestSubmitPromptImageUnsupportedKeepsInput(t *testing.T) {
}
}

// TestSubmitPromptQwenVisionImageSupported configures the model
// "qwen3.6-plus", types a prompt containing an ingested image placeholder and
// presses Enter. It asserts that exactly one chat item is produced and that
// its body contains the placeholder.
func TestSubmitPromptQwenVisionImageSupported(t *testing.T) {
	m := newImagePipelineModel(t)
	m.screen = screenChat
	m.cfg.Provider.Model = "qwen3.6-plus"

	placeholder := mustIngestTestImage(t, m, "image")
	m.input.SetWidth(40)
	m.input.SetHeight(3)
	m.input.SetValue("please read " + placeholder)
	m.input.CursorEnd()

	next, _ := m.handleKey(tea.KeyMsg{Type: tea.KeyEnter})
	updated := next.(model)

	if n := len(updated.chatItems); n != 1 {
		t.Fatalf("expected qwen vision image prompt to submit, got %d chat items with status %q", n, updated.statusNote)
	}
	if body := updated.chatItems[0].Body; !strings.Contains(body, placeholder) {
		t.Fatalf("expected submitted body to include image placeholder, got %q", body)
	}
}

func TestAltEnterInsertsNewlineWithoutSubmitting(t *testing.T) {
input := textarea.New()
input.Focus()
Expand Down Expand Up @@ -3225,6 +3245,33 @@ func TestTerminalPasteEventWithEmptyPayloadPastesClipboardImage(t *testing.T) {
}
}

// TestTerminalPasteEventWithEmptyPayloadEnterSubmitsClipboardImage sends an
// empty terminal paste event while a fake clipboard holds a PNG, then presses
// Enter. It asserts that the Enter key submits the prompt: one chat item with
// body "[Image#1]" exists and the screen has switched to the chat screen.
func TestTerminalPasteEventWithEmptyPayloadEnterSubmitsClipboardImage(t *testing.T) {
	m := newImagePipelineModel(t)
	m.screen = screenLanding
	m.cfg.Provider.Model = "gpt-4o"
	m.clipboard = fakeClipboardImageReader{
		mediaType: "image/png",
		data:      []byte("clipboard"),
		fileName:  "clipboard.png",
	}

	// Step 1: the paste event carries no runes.
	pasted, _ := m.handleKey(tea.KeyMsg{Type: tea.KeyRunes, Paste: true})
	afterPaste := pasted.(model)

	// Step 2: Enter immediately after the paste; the returned command is not inspected.
	submitted, _ := afterPaste.handleKey(tea.KeyMsg{Type: tea.KeyEnter})
	updated := submitted.(model)

	if n := len(updated.chatItems); n != 1 {
		t.Fatalf("expected enter after image paste to submit, got %d chat items", n)
	}
	if updated.screen != screenChat {
		t.Fatalf("expected submitted image prompt to enter chat screen, got %q", updated.screen)
	}
	if body := updated.chatItems[0].Body; body != "[Image#1]" {
		t.Fatalf("expected submitted image placeholder body, got %q", body)
	}
}

func TestTerminalPasteEventWithTextDoesNotForceClipboardImage(t *testing.T) {
m := newImagePipelineModel(t)
m.screen = screenChat
Expand Down
4 changes: 4 additions & 0 deletions tui/styles.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,10 @@ var (
Foreground(lipgloss.Color("#5E7DA4")).
Background(colorLandingPanel)

landingStatusNoteStyle = lipgloss.NewStyle().
Foreground(lipgloss.Color("#F8C471")).
Background(colorLandingPanel)

landingTipDotStyle = lipgloss.NewStyle().
Foreground(lipgloss.Color("#F59E0B"))

Expand Down
Loading