Skip to content

Commit d027ce2

Browse files
tmshortclaude
andcommitted
OCPBUGS-76453: clean up orphaned temp dirs in catalog storage
LocalDirV1.Store creates a temp dir (.{catalog}-{random}) and renames it into place atomically. If the process is interrupted before the deferred RemoveAll runs, the temp dir persists. Each restart adds another, eventually filling the disk. Primary fix: remove any matching temp dirs at the start of Store before creating a new one. Since Store holds the write mutex, any dirs found are guaranteed to be from a previous run. Defense-in-depth: add /var/cache/catalogs to GarbageCollector's list of scanned paths (CachePath string -> CachePaths []string), so orphaned temp dirs are also removed by the periodic GC cycle. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> Signed-off-by: Todd Short <tshort@redhat.com>
1 parent 55473d8 commit d027ce2

5 files changed

Lines changed: 163 additions & 16 deletions

File tree

cmd/catalogd/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,7 @@ func run(ctx context.Context) error {
425425
}
426426

427427
gc := &garbagecollection.GarbageCollector{
428-
CachePath: unpackCacheBasePath,
428+
CachePaths: []string{unpackCacheBasePath, storeDir},
429429
Logger: ctrl.Log.WithName("garbage-collector"),
430430
MetadataClient: metaClient,
431431
Interval: cfg.gcInterval,

internal/catalogd/garbagecollection/garbage_collector.go

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,11 @@ var _ manager.Runnable = (*GarbageCollector)(nil)
2525
// that no longer exist. This should only clean up cache entries that
2626
// were missed by the handling of a DELETE event on a Catalog resource.
2727
type GarbageCollector struct {
28-
CachePath string
28+
// CachePaths is the list of directories to garbage-collect. Each directory
29+
// is expected to contain only subdirectories named after existing ClusterCatalogs;
30+
// any entry whose name does not match a known ClusterCatalog — including orphaned
31+
// temporary directories left by interrupted operations — is removed.
32+
CachePaths []string
2933
Logger logr.Logger
3034
MetadataClient metadata.Interface
3135
Interval time.Duration
@@ -37,13 +41,7 @@ type GarbageCollector struct {
3741
// supplied garbage collection interval.
3842
func (gc *GarbageCollector) Start(ctx context.Context) error {
3943
// Run once on startup
40-
removed, err := runGarbageCollection(ctx, gc.CachePath, gc.MetadataClient)
41-
if err != nil {
42-
gc.Logger.Error(err, "running garbage collection")
43-
}
44-
if len(removed) > 0 {
45-
gc.Logger.Info("removed stale cache entries", "removed entries", removed)
46-
}
44+
gc.runAndLog(ctx)
4745

4846
// Loop until context is canceled, running garbage collection
4947
// at the configured interval
@@ -52,13 +50,19 @@ func (gc *GarbageCollector) Start(ctx context.Context) error {
5250
case <-ctx.Done():
5351
return ctx.Err()
5452
case <-time.After(gc.Interval):
55-
removed, err := runGarbageCollection(ctx, gc.CachePath, gc.MetadataClient)
56-
if err != nil {
57-
gc.Logger.Error(err, "running garbage collection")
58-
}
59-
if len(removed) > 0 {
60-
gc.Logger.Info("removed stale cache entries", "removed entries", removed)
61-
}
53+
gc.runAndLog(ctx)
54+
}
55+
}
56+
}
57+
58+
func (gc *GarbageCollector) runAndLog(ctx context.Context) {
59+
for _, path := range gc.CachePaths {
60+
removed, err := runGarbageCollection(ctx, path, gc.MetadataClient)
61+
if err != nil {
62+
gc.Logger.Error(err, "running garbage collection", "path", path)
63+
}
64+
if len(removed) > 0 {
65+
gc.Logger.Info("removed stale cache entries", "path", path, "removed entries", removed)
6266
}
6367
}
6468
}

internal/catalogd/garbagecollection/garbage_collector_test.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"path/filepath"
77
"testing"
88

9+
"github.com/go-logr/logr"
910
"github.com/stretchr/testify/assert"
1011
"github.com/stretchr/testify/require"
1112
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -15,6 +16,50 @@ import (
1516
ocv1 "github.com/operator-framework/operator-controller/api/v1"
1617
)
1718

19+
// TestGarbageCollectorStoragePath verifies that the GarbageCollector cleans up
20+
// orphaned temporary directories in the StoragePath (e.g. /var/cache/catalogs).
21+
// These dirs are created by LocalDirV1.Store during catalog unpacking and normally
22+
// removed by a deferred RemoveAll, but can persist if the process is killed.
23+
func TestGarbageCollectorStoragePath(t *testing.T) {
24+
ctx := context.Background()
25+
scheme := runtime.NewScheme()
26+
require.NoError(t, metav1.AddMetaToScheme(scheme))
27+
28+
storagePath := t.TempDir()
29+
30+
// Known catalog — its directory must be preserved.
31+
existingCatalog := &metav1.PartialObjectMetadata{
32+
TypeMeta: metav1.TypeMeta{Kind: "ClusterCatalog", APIVersion: ocv1.GroupVersion.String()},
33+
ObjectMeta: metav1.ObjectMeta{Name: "openshift-redhat-operators"},
34+
}
35+
require.NoError(t, os.MkdirAll(filepath.Join(storagePath, existingCatalog.Name), 0700))
36+
37+
// Orphaned temp dirs left by a previously interrupted Store — must be removed.
38+
for _, orphan := range []string{
39+
".openshift-redhat-operators-4015104162",
40+
".openshift-redhat-operators-3615668944",
41+
} {
42+
require.NoError(t, os.MkdirAll(filepath.Join(storagePath, orphan), 0700))
43+
}
44+
45+
metaClient := fake.NewSimpleMetadataClient(scheme, existingCatalog)
46+
47+
gc := &GarbageCollector{
48+
CachePaths: []string{t.TempDir(), storagePath},
49+
Logger: logr.Discard(),
50+
MetadataClient: metaClient,
51+
Interval: 0,
52+
}
53+
gc.runAndLog(ctx)
54+
55+
entries, err := os.ReadDir(storagePath)
56+
require.NoError(t, err)
57+
58+
// Only the real catalog dir should remain.
59+
require.Len(t, entries, 1)
60+
assert.Equal(t, existingCatalog.Name, entries[0].Name())
61+
}
62+
1863
func TestRunGarbageCollection(t *testing.T) {
1964
for _, tt := range []struct {
2065
name string

internal/catalogd/storage/localdir.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"net/url"
1212
"os"
1313
"path/filepath"
14+
"strings"
1415
"sync"
1516

1617
"golang.org/x/sync/errgroup"
@@ -53,6 +54,13 @@ func (s *LocalDirV1) Store(ctx context.Context, catalog string, fsys fs.FS) erro
5354
if err := os.MkdirAll(s.RootDir, 0700); err != nil {
5455
return err
5556
}
57+
58+
// Remove any orphaned temporary directories left by previously interrupted Store
59+
// operations (e.g. after a process crash where deferred cleanup did not run).
60+
if err := s.removeOrphanedTempDirs(catalog); err != nil {
61+
return fmt.Errorf("error removing orphaned temp directories: %w", err)
62+
}
63+
5664
tmpCatalogDir, err := os.MkdirTemp(s.RootDir, fmt.Sprintf(".%s-*", catalog))
5765
if err != nil {
5866
return err
@@ -107,6 +115,30 @@ func (s *LocalDirV1) Store(ctx context.Context, catalog string, fsys fs.FS) erro
107115
)
108116
}
109117

118+
// removeOrphanedTempDirs removes temporary staging directories that were created by a
119+
// previous Store call for the given catalog but were not cleaned up because the process
120+
// was interrupted (e.g. killed by the OOM killer) before the deferred RemoveAll could run.
121+
// Temp dirs use the prefix ".{catalog}-" as created by os.MkdirTemp.
122+
// This method must be called while the write lock is held.
123+
func (s *LocalDirV1) removeOrphanedTempDirs(catalog string) error {
124+
entries, err := os.ReadDir(s.RootDir)
125+
if os.IsNotExist(err) {
126+
return nil
127+
}
128+
if err != nil {
129+
return fmt.Errorf("error reading storage directory: %w", err)
130+
}
131+
prefix := fmt.Sprintf(".%s-", catalog)
132+
for _, entry := range entries {
133+
if strings.HasPrefix(entry.Name(), prefix) {
134+
if err := os.RemoveAll(filepath.Join(s.RootDir, entry.Name())); err != nil {
135+
return fmt.Errorf("error removing orphaned temp directory %q: %w", entry.Name(), err)
136+
}
137+
}
138+
}
139+
return nil
140+
}
141+
110142
func (s *LocalDirV1) Delete(catalog string) error {
111143
s.m.Lock()
112144
defer s.m.Unlock()

internal/catalogd/storage/localdir_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"net/http/httptest"
1212
"net/url"
1313
"os"
14+
"path/filepath"
1415
"strings"
1516
"sync"
1617
"testing"
@@ -138,6 +139,71 @@ func TestLocalDirStoraget(t *testing.T) {
138139
}
139140
},
140141
},
142+
{
143+
name: "orphaned temp dirs from a previous interrupted store are cleaned up",
144+
setup: func(t *testing.T) (*LocalDirV1, fs.FS) {
145+
rootDir := t.TempDir()
146+
s := &LocalDirV1{RootDir: rootDir}
147+
148+
// Simulate temp dirs left behind by a previous crashed Store run.
149+
for _, orphan := range []string{
150+
".test-catalog-1234567890",
151+
".test-catalog-9876543210",
152+
} {
153+
if err := os.MkdirAll(filepath.Join(rootDir, orphan), 0700); err != nil {
154+
t.Fatal(err)
155+
}
156+
}
157+
// A dir for a different catalog must not be removed.
158+
if err := os.MkdirAll(filepath.Join(rootDir, ".other-catalog-1111111111"), 0700); err != nil {
159+
t.Fatal(err)
160+
}
161+
return s, createTestFS(t)
162+
},
163+
test: func(t *testing.T, s *LocalDirV1, fsys fs.FS) {
164+
const catalog = "test-catalog"
165+
166+
if err := s.Store(context.Background(), catalog, fsys); err != nil {
167+
t.Fatalf("Store failed: %v", err)
168+
}
169+
170+
entries, err := os.ReadDir(s.RootDir)
171+
if err != nil {
172+
t.Fatal(err)
173+
}
174+
175+
names := make([]string, 0, len(entries))
176+
for _, e := range entries {
177+
names = append(names, e.Name())
178+
}
179+
180+
// Orphaned dirs for "test-catalog" must be gone.
181+
for _, orphan := range []string{".test-catalog-1234567890", ".test-catalog-9876543210"} {
182+
for _, name := range names {
183+
if name == orphan {
184+
t.Errorf("expected orphaned temp dir %q to be removed, but it still exists", orphan)
185+
}
186+
}
187+
}
188+
189+
// The catalog dir itself must exist.
190+
if !s.ContentExists(catalog) {
191+
t.Error("catalog content should exist after store")
192+
}
193+
194+
// The unrelated catalog temp dir must still be present.
195+
found := false
196+
for _, name := range names {
197+
if name == ".other-catalog-1111111111" {
198+
found = true
199+
break
200+
}
201+
}
202+
if !found {
203+
t.Error("temp dir for a different catalog should not have been removed")
204+
}
205+
},
206+
},
141207
{
142208
name: "store with invalid permissions",
143209
setup: func(t *testing.T) (*LocalDirV1, fs.FS) {

0 commit comments

Comments
 (0)