Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 122 additions & 0 deletions cmd/engram/consolidate_grouping_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package main

import (
"strings"
"testing"

"github.com/Gentleman-Programming/engram/internal/store"
)

// groupContaining returns the group whose Names include target, or nil.
func groupContaining(groups []projectGroup, target string) *projectGroup {
for i := range groups {
for _, n := range groups[i].Names {
if n == target {
return &groups[i]
}
}
}
return nil
}

// TestGroupSimilarProjects_NoisySharedDirNotGrouped reproduces issue #283 bug 4:
// many unrelated projects that merely share a common parent/root directory
// (e.g. $HOME or a session root) must NOT be unioned into one giant component.
// Before the fix, the shared-directory union step grouped all of them together
// and consolidate would propose merging them into a single canonical project —
// a catastrophic data-loss footgun.
func TestGroupSimilarProjects_NoisySharedDirNotGrouped(t *testing.T) {
// Names are deliberately long and mutually dissimilar (no substring overlap,
// Levenshtein distance well above the scaled threshold) so the ONLY possible
// grouping signal is the shared "/home/user" directory.
const sharedHome = "/home/user"
projects := []store.ProjectStats{
{Name: "kubernetes", ObservationCount: 5, Directories: []string{"/home/user/kubernetes", sharedHome}},
{Name: "photoshop", ObservationCount: 7, Directories: []string{"/home/user/photoshop", sharedHome}},
{Name: "wireguard", ObservationCount: 3, Directories: []string{"/home/user/wireguard", sharedHome}},
{Name: "blender", ObservationCount: 9, Directories: []string{"/home/user/blender", sharedHome}},
{Name: "terraform", ObservationCount: 2, Directories: []string{"/home/user/terraform", sharedHome}},
}

groups := groupSimilarProjects(projects)

// The five names are mutually dissimilar, so the ONLY thing that could group
// them is the shared "/home/user" directory. Since that directory is touched
// by more than maxSharedProjectsForDirMatch distinct projects, it must be
// treated as noise and skipped — yielding no groups at all.
if len(groups) != 0 {
t.Fatalf("expected 0 groups (noisy shared dir must be skipped), got %d: %+v", len(groups), groups)
}
}

// TestGroupSimilarProjects_RealSharedDirStillGroups guards against over-correcting:
// when only a small number of projects share a directory (a genuine rename signal),
// they SHOULD still be grouped.
func TestGroupSimilarProjects_RealSharedDirStillGroups(t *testing.T) {
const sharedRepo = "/repos/shared-monorepo"
projects := []store.ProjectStats{
{Name: "webapp", ObservationCount: 12, Directories: []string{sharedRepo}},
{Name: "legacy-portal", ObservationCount: 4, Directories: []string{sharedRepo}},
}

groups := groupSimilarProjects(projects)

g := groupContaining(groups, "webapp")
if g == nil {
t.Fatalf("expected webapp and legacy-portal to be grouped via shared dir, got groups: %+v", groups)
}
if len(g.Names) != 2 {
t.Fatalf("expected group of 2, got %d: %+v", len(g.Names), g.Names)
}
}

// TestGroupSimilarProjects_NameSimilarityStillWorks confirms the legitimate
// name-based grouping path is untouched by the shared-dir fix.
func TestGroupSimilarProjects_NameSimilarityStillWorks(t *testing.T) {
// "frontend" is a substring of "frontend-app" → name-similar (no shared dir).
projects := []store.ProjectStats{
{Name: "frontend", ObservationCount: 8, Directories: []string{"/a"}},
{Name: "frontend-app", ObservationCount: 5, Directories: []string{"/b"}},
{Name: "unrelated", ObservationCount: 1, Directories: []string{"/c"}},
}

groups := groupSimilarProjects(projects)

g := groupContaining(groups, "frontend")
if g == nil {
t.Fatalf("expected frontend* projects grouped by name similarity, got: %+v", groups)
}
if len(g.Names) != 2 {
t.Fatalf("expected name-similar group of 2, got %d: %+v", len(g.Names), g.Names)
}
if groupContaining(groups, "unrelated") != nil {
t.Fatalf("unrelated project must not be grouped: %+v", groups)
}
}

// TestCmdProjectsConsolidateAllDoesNotMergeNoisySharedDir is a command-level
// regression test for issue #283 bug 4: `projects consolidate --all` must not
// propose merging many unrelated projects just because their sessions share a
// common directory. mustSeedSession records every session under the same "/tmp"
// directory, so seeding several distinctly-named projects reproduces the noisy
// shared-directory scenario end-to-end.
func TestCmdProjectsConsolidateAllDoesNotMergeNoisySharedDir(t *testing.T) {
cfg := testConfig(t)

for _, name := range []string{"kubernetes", "photoshop", "wireguard", "blender", "terraform"} {
mustSeedSession(t, cfg, "s-"+name, name)
}

withArgs(t, "engram", "projects", "consolidate", "--all", "--dry-run")
stdout, stderr := captureOutput(t, func() { cmdProjectsConsolidate(cfg) })

if stderr != "" {
t.Fatalf("expected no stderr, got: %q", stderr)
}
if !strings.Contains(stdout, "No similar project name groups found") {
t.Fatalf("expected no groups for projects sharing only a noisy dir, got: %q", stdout)
}
if strings.Contains(stdout, "Would merge") {
t.Fatalf("must not propose a merge for noisy-shared-dir projects, got: %q", stdout)
}
}
54 changes: 49 additions & 5 deletions cmd/engram/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1836,7 +1836,7 @@ func cmdProjects(cfg store.Config) {
fmt.Fprintf(os.Stderr, "unknown projects subcommand: %s\n", subCmd)
fmt.Fprintln(os.Stderr, "usage: engram projects list")
fmt.Fprintln(os.Stderr, " engram projects consolidate [--all] [--dry-run]")
fmt.Fprintln(os.Stderr, " engram projects prune [--dry-run]")
fmt.Fprintln(os.Stderr, " engram projects prune [--dry-run] [--paths-only]")
exitFunc(1)
}
}
Expand Down Expand Up @@ -1883,6 +1883,20 @@ type projectGroup struct {
Canonical string // suggested canonical (most observations)
}

// maxSharedProjectsForDirMatch caps how many distinct projects may share a single
// directory before that directory is treated as noise (e.g. $HOME or a session
// root) and excluded from shared-directory grouping. See issue #283 bug 4.
const maxSharedProjectsForDirMatch = 3

// distinctProjectCount returns the number of unique project indices in idxs.
func distinctProjectCount(idxs []int) int {
seen := make(map[int]struct{}, len(idxs))
for _, i := range idxs {
seen[i] = struct{}{}
}
return len(seen)
}

// groupSimilarProjects groups projects by name similarity and shared directories.
// Uses a simple union-find approach.
func groupSimilarProjects(projects []store.ProjectStats) []projectGroup {
Expand Down Expand Up @@ -1939,6 +1953,13 @@ func groupSimilarProjects(projects []store.ProjectStats) []projectGroup {
}
}
for _, idxs := range dirToProjects {
// Skip noisy ancestor directories shared by many unrelated projects
// (issue #283 bug 4): unioning everything under $HOME or a session root
// collapses the whole store into one component and turns consolidate into
// a data-loss footgun.
if distinctProjectCount(idxs) > maxSharedProjectsForDirMatch {
continue
}
for k := 1; k < len(idxs); k++ {
union(idxs[0], idxs[k])
}
Expand Down Expand Up @@ -2241,11 +2262,23 @@ func cmdProjectsConsolidate(cfg store.Config) {
}
}

// isPathLikeProjectName reports whether a project name is actually a filesystem
// path (contains a path separator). Mirrors the write-boundary rejection in
// normalizeExplicitWriteProject so cleanup targets exactly the names newer
// versions now refuse to create. See issue #283 bug 2/bug 4.
func isPathLikeProjectName(name string) bool {
return strings.ContainsAny(name, `/\`)
}

func cmdProjectsPrune(cfg store.Config) {
dryRun := false
pathsOnly := false
for i := 3; i < len(os.Args); i++ {
if os.Args[i] == "--dry-run" {
switch os.Args[i] {
case "--dry-run":
dryRun = true
case "--paths-only":
pathsOnly = true
}
}

Expand All @@ -2263,13 +2296,24 @@ func cmdProjectsPrune(cfg store.Config) {
// Find projects with 0 observations
var candidates []store.ProjectStats
for _, ps := range allStats {
if ps.ObservationCount == 0 {
candidates = append(candidates, ps)
if ps.ObservationCount != 0 {
continue
}
// --paths-only: restrict to garbage projects whose name is a filesystem
// path (issue #283 bug 4). These are leftovers from older versions that
// fell back to using cwd as the project name when detection failed.
if pathsOnly && !isPathLikeProjectName(ps.Name) {
continue
}
candidates = append(candidates, ps)
}

if len(candidates) == 0 {
fmt.Println("No empty projects to prune.")
if pathsOnly {
fmt.Println("No path-named projects to prune.")
} else {
fmt.Println("No empty projects to prune.")
}
return
}

Expand Down
129 changes: 129 additions & 0 deletions cmd/engram/prune_paths_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
package main

import (
"strings"
"testing"

"github.com/Gentleman-Programming/engram/internal/store"
)

// TestIsPathLikeProjectName covers the detector used to single out garbage
// "path-as-name" projects (issue #283 bug 4 / bug 2 leftovers). It mirrors the
// write-boundary rejection in normalizeExplicitWriteProject: a project name that
// contains a path separator is a filesystem path, not a real project name.
func TestIsPathLikeProjectName(t *testing.T) {
cases := []struct {
name string
want bool
}{
{`c:\users\foo`, true},
{`/home/user`, true},
{`c:/users/foo`, true},
{`\\server\share`, true},
{"d:\\workspace\\sub", true},
{"engram", false},
{"agent-teams-lite", false},
{"e3", false},
{"general", false},
{"", false},
}
for _, tc := range cases {
if got := isPathLikeProjectName(tc.name); got != tc.want {
t.Errorf("isPathLikeProjectName(%q) = %v, want %v", tc.name, got, tc.want)
}
}
}

// TestCmdProjectsPrunePathsOnlyDryRun verifies that `--paths-only` narrows the
// prune candidate set to path-named projects with 0 observations, leaving
// legitimate empty projects and projects with observations untouched.
func TestCmdProjectsPrunePathsOnlyDryRun(t *testing.T) {
cfg := testConfig(t)

// Garbage: path-as-name project, 0 observations (only a session).
mustSeedSession(t, cfg, "s-garbage", `c:\users\foo`)
// Legitimate empty project, 0 observations — must NOT be selected by --paths-only.
mustSeedSession(t, cfg, "s-legit", "legit-empty")
// Real project with observations — never a prune candidate.
mustSeedObservation(t, cfg, "s-real", "realproject", "note", "title", "content", "project")

withArgs(t, "engram", "projects", "prune", "--paths-only", "--dry-run")
stdout, stderr := captureOutput(t, func() { cmdProjectsPrune(cfg) })

if stderr != "" {
t.Fatalf("expected no stderr, got: %q", stderr)
}
if !strings.Contains(stdout, `c:\users\foo`) {
t.Fatalf("expected path-named project listed, got: %q", stdout)
}
if strings.Contains(stdout, "legit-empty") {
t.Fatalf("legitimate empty project must NOT be listed with --paths-only, got: %q", stdout)
}
if strings.Contains(stdout, "realproject") {
t.Fatalf("project with observations must never be a prune candidate, got: %q", stdout)
}
if !strings.Contains(stdout, "dry-run") {
t.Fatalf("expected dry-run notice, got: %q", stdout)
}
}

// TestCmdProjectsPrunePathsOnlyDeletesOnlyPathNamed verifies the actual
// (non-dry-run) deletion: `prune --paths-only` removes path-named empty projects
// while leaving legitimate empty projects and projects with observations intact.
func TestCmdProjectsPrunePathsOnlyDeletesOnlyPathNamed(t *testing.T) {
cfg := testConfig(t)

mustSeedSession(t, cfg, "s-garbage", `c:\users\foo`)
mustSeedSession(t, cfg, "s-legit", "legit-empty")
mustSeedObservation(t, cfg, "s-real", "realproject", "note", "title", "content", "project")

// Answer "all" to prune every (path-named) candidate.
oldScan := scanInputLine
t.Cleanup(func() { scanInputLine = oldScan })
scanInputLine = func(a ...any) (int, error) {
if ptr, ok := a[0].(*string); ok {
*ptr = "all"
}
return 1, nil
}

withArgs(t, "engram", "projects", "prune", "--paths-only")
_, stderr := captureOutput(t, func() { cmdProjectsPrune(cfg) })
if stderr != "" {
t.Fatalf("expected no stderr, got: %q", stderr)
}

s, err := store.New(cfg)
if err != nil {
t.Fatalf("store.New: %v", err)
}
defer s.Close()
// Use ListProjectsWithStats: it includes 0-observation projects (derived from
// sessions), so a not-pruned empty project like "legit-empty" still shows up.
stats, err := s.ListProjectsWithStats()
if err != nil {
t.Fatalf("ListProjectsWithStats: %v", err)
}
names := make([]string, len(stats))
for i, ps := range stats {
names[i] = ps.Name
}

has := func(target string) bool {
for _, n := range names {
if n == target {
return true
}
}
return false
}
if has(`c:\users\foo`) {
t.Fatalf("path-named project should have been pruned, still present: %v", names)
}
if !has("legit-empty") {
t.Fatalf("legitimate empty project must NOT be pruned by --paths-only: %v", names)
}
if !has("realproject") {
t.Fatalf("project with observations must remain: %v", names)
}
}