diff --git a/storage/docs/containers-storage.conf.5.md b/storage/docs/containers-storage.conf.5.md index bc4d5d0b26..7f09a6a534 100644 --- a/storage/docs/containers-storage.conf.5.md +++ b/storage/docs/containers-storage.conf.5.md @@ -219,6 +219,16 @@ based file systems. Use ComposeFS to mount the data layers image. ComposeFS support is experimental and not recommended for production use. This is a "string bool": "false"|"true" (cannot be native TOML boolean) +**sync**="none|filesystem" + Filesystem synchronization mode for layer creation. (default: "none") + +- `none`: No synchronization. + Layer operations complete without calling syncfs(). + +- `filesystem`: Sync before completion. + Flush all pending writes to the file system before marking the layer + as present. This helps prevent data corruption if the system + crashes or loses power during layer operations. ### STORAGE OPTIONS FOR VFS TABLE @@ -228,6 +238,16 @@ The `storage.options.vfs` table supports the following options: ignore_chown_errors can be set to allow a non privileged user running with a single UID within a user namespace to run containers. The user can pull and use any image even those with multiple uids. Note multiple UIDs will be squashed down to the default uid in the container. These images will have no separation between the users in the container. This is a "string bool": "false"|"true" (cannot be native TOML boolean) +**sync**="none|filesystem" + Filesystem synchronization mode for layer creation. (default: "none") + +- `none`: No synchronization. + Layer operations complete without calling syncfs(). + +- `filesystem`: Sync before completion. + Flush all pending writes to the file system before marking the layer + as present. This helps prevent data corruption if the system + crashes or loses power during layer operations. ### STORAGE OPTIONS FOR ZFS TABLE diff --git a/storage/drivers/btrfs/btrfs.go b/storage/drivers/btrfs/btrfs.go index aba898ed55..c1ea5cb3a5 100644 --- a/storage/drivers/btrfs/btrfs.go +++ b/storage/drivers/btrfs/btrfs.go @@ -157,6 +157,12 @@ func (d *Driver) Cleanup() error { return mount.Unmount(d.home) } +// SyncMode returns the sync mode configured for the driver. +// Btrfs does not support sync mode configuration, always returns SyncModeNone. +func (d *Driver) SyncMode() graphdriver.SyncMode { + return graphdriver.SyncModeNone +} + func free(p *C.char) { C.free(unsafe.Pointer(p)) } diff --git a/storage/drivers/driver.go b/storage/drivers/driver.go index 38706dc999..3ff9e3d10b 100644 --- a/storage/drivers/driver.go +++ b/storage/drivers/driver.go @@ -27,6 +27,40 @@ const ( FsMagicUnsupported = FsMagic(0x00000000) ) +// SyncMode defines when filesystem synchronization occurs during layer creation. +type SyncMode int + +const ( + // SyncModeNone - no synchronization + SyncModeNone SyncMode = iota + // SyncModeFilesystem - use syncfs() before layer marked as present + SyncModeFilesystem +) + +// String returns the string representation of the sync mode +func (m SyncMode) String() string { + switch m { + case SyncModeNone: + return "none" + case SyncModeFilesystem: + return "filesystem" + default: + return "unknown" + } +} + +// ParseSyncMode converts a string to SyncMode +func ParseSyncMode(s string) (SyncMode, error) { + switch strings.ToLower(strings.TrimSpace(s)) { + case "", "none": + return SyncModeNone, nil + case "filesystem": + return SyncModeFilesystem, nil + default: + return SyncModeNone, fmt.Errorf("invalid sync mode: %q", s) + } +} + var ( // All registered drivers drivers map[string]InitFunc @@ -169,6 +203,8 @@ type ProtoDriver interface { AdditionalImageStores() []string // Dedup performs deduplication of the driver's storage. Dedup(DedupArgs) (DedupResult, error) + // SyncMode returns the sync mode configured for the driver. + SyncMode() SyncMode } // DiffDriver is the interface to use to implement graph diffs diff --git a/storage/drivers/overlay/overlay.go b/storage/drivers/overlay/overlay.go index 00974c890c..da49c90402 100644 --- a/storage/drivers/overlay/overlay.go +++ b/storage/drivers/overlay/overlay.go @@ -113,6 +113,7 @@ type overlayOptions struct { ignoreChownErrors bool forceMask *os.FileMode useComposefs bool + syncMode graphdriver.SyncMode } // Driver contains information about the home directory and the list of active mounts that are created using this driver. @@ -593,6 +594,22 @@ func parseOptions(options []string) (*overlayOptions, error) { } m := os.FileMode(mask) o.forceMask = &m + case "sync": + logrus.Debugf("overlay: sync=%s", val) + mode, err := graphdriver.ParseSyncMode(val) + if err != nil { + return nil, fmt.Errorf("invalid sync mode for overlay driver: %w", err) + } + // SyncModeNone and SyncModeFilesystem do not need any special handling because + // the overlay storage is always on the same file system as the metadata, thus + // the Syncfs() in layers.go covers also any file written by the overlay driver. + switch mode { + case graphdriver.SyncModeNone, graphdriver.SyncModeFilesystem: + // Nothing to do. + default: + return nil, fmt.Errorf("invalid mode for overlay driver: %q", val) + } + o.syncMode = mode default: return nil, fmt.Errorf("overlay: unknown option %s", key) } @@ -865,6 +882,11 @@ func (d *Driver) Cleanup() error { return mount.Unmount(d.home) } +// SyncMode returns the sync mode configured for the driver. +func (d *Driver) SyncMode() graphdriver.SyncMode { + return d.options.syncMode +} + // pruneStagingDirectories cleans up any staging directory that was leaked. // It returns whether any staging directory is still present. func (d *Driver) pruneStagingDirectories() bool { @@ -2017,12 +2039,8 @@ func (d *Driver) Put(id string) error { // If fusermount|fusermount3 failed to unmount the FUSE file system, make sure all // pending changes are propagated to the file system if !unmounted { - fd, err := unix.Open(mountpoint, unix.O_DIRECTORY|unix.O_CLOEXEC, 0) - if err == nil { - if err := unix.Syncfs(fd); err != nil { - logrus.Debugf("Error Syncfs(%s) - %v", mountpoint, err) - } - unix.Close(fd) + if err := system.Syncfs(mountpoint); err != nil { + logrus.Debugf("Error Syncfs(%s) - %v", mountpoint, err) } } } diff --git a/storage/drivers/vfs/driver.go b/storage/drivers/vfs/driver.go index b90c2046cf..8214f6c7f9 100644 --- a/storage/drivers/vfs/driver.go +++ b/storage/drivers/vfs/driver.go @@ -64,6 +64,22 @@ func Init(home string, options graphdriver.Options) (graphdriver.Driver, error) if err != nil { return nil, err } + case "vfs.sync", ".sync": + logrus.Debugf("vfs: sync=%s", val) + var err error + d.syncMode, err = graphdriver.ParseSyncMode(val) + if err != nil { + return nil, fmt.Errorf("invalid sync mode for vfs driver: %w", err) + } + // SyncModeNone and SyncModeFilesystem do not need any special handling because + // the vfs storage is always on the same file system as the metadata, thus the + // Syncfs() in layers.go covers also any file written by the vfs driver. + switch d.syncMode { + case graphdriver.SyncModeNone, graphdriver.SyncModeFilesystem: + // Nothing to do. + default: + return nil, fmt.Errorf("invalid mode for vfs driver: %q", val) + } default: return nil, fmt.Errorf("vfs driver does not support %s options", key) } @@ -84,6 +100,7 @@ type Driver struct { home string additionalHomes []string ignoreChownErrors bool + syncMode graphdriver.SyncMode naiveDiff graphdriver.DiffDriver updater graphdriver.LayerIDMapUpdater imageStore string @@ -108,6 +125,11 @@ func (d *Driver) Cleanup() error { return nil } +// SyncMode returns the sync mode configured for the driver. +func (d *Driver) SyncMode() graphdriver.SyncMode { + return d.syncMode +} + type fileGetNilCloser struct { storage.FileGetter } diff --git a/storage/drivers/zfs/zfs.go b/storage/drivers/zfs/zfs.go index b994278bb2..4211a570e4 100644 --- a/storage/drivers/zfs/zfs.go +++ b/storage/drivers/zfs/zfs.go @@ -183,6 +183,12 @@ func (d *Driver) Cleanup() error { return nil } +// SyncMode returns the sync mode configured for the driver. +// ZFS does not support sync mode configuration, always returns SyncModeNone. +func (d *Driver) SyncMode() graphdriver.SyncMode { + return graphdriver.SyncModeNone +} + // Status returns information about the ZFS filesystem. It returns a two dimensional array of information // such as pool name, dataset name, disk usage, parent quota and compression used. // Currently it return 'Zpool', 'Zpool Health', 'Parent Dataset', 'Space Used By Parent', diff --git a/storage/layers.go b/storage/layers.go index 22bebc131c..5152efcb5b 100644 --- a/storage/layers.go +++ b/storage/layers.go @@ -1026,7 +1026,7 @@ func (r *layerStore) load(lockedForWriting bool) (bool, error) { } modifiedLocations |= layer.location } - if err := r.saveLayers(modifiedLocations); err != nil { + if err := r.saveLayers(modifiedLocations, false); err != nil { return false, err } if incompleteDeletionErrors != nil { @@ -1078,27 +1078,36 @@ func (r *layerStore) loadMounts() error { } // save saves the contents of the store to disk. +// If needsSyncfs is true, all pending writes are flushed to disk before +// saving the layer metadata. Set it to false for metadata-only changes // The caller must hold r.lockfile locked for writing. // The caller must hold r.inProcessLock for WRITING. -func (r *layerStore) save(saveLocations layerLocations) error { +func (r *layerStore) save(saveLocations layerLocations, needsSyncfs bool) error { r.mountsLockfile.Lock() defer r.mountsLockfile.Unlock() - if err := r.saveLayers(saveLocations); err != nil { + if err := r.saveLayers(saveLocations, needsSyncfs); err != nil { return err } return r.saveMounts() } // saveFor saves the contents of the store relevant for modifiedLayer to disk. +// If needsSyncfs is true, all pending writes are flushed to disk before +// saving the layer metadata. Set it to false for metadata-only changes +// (e.g. setting a flag, changing names). // The caller must hold r.lockfile locked for writing. // The caller must hold r.inProcessLock for WRITING. -func (r *layerStore) saveFor(modifiedLayer *Layer) error { - return r.save(modifiedLayer.location) +func (r *layerStore) saveFor(modifiedLayer *Layer, needsSyncfs bool) error { + return r.save(modifiedLayer.location, needsSyncfs) } +// saveLayers writes the layer metadata to disk. +// If needsSyncfs is true, all pending writes are flushed to disk before +// saving the layer metadata. Set it to false for metadata-only changes +// (e.g. setting a flag, changing names). // The caller must hold r.lockfile locked for writing. // The caller must hold r.inProcessLock for WRITING. -func (r *layerStore) saveLayers(saveLocations layerLocations) error { +func (r *layerStore) saveLayers(saveLocations layerLocations, needsSyncfs bool) error { if !r.lockfile.IsReadWrite() { return fmt.Errorf("not allowed to modify the layer store at %q: %w", r.layerdir, ErrStoreIsReadOnly) } @@ -1139,6 +1148,21 @@ func (r *layerStore) saveLayers(saveLocations layerLocations) error { if location == volatileLayerLocation { opts.NoSync = true } + // If the underlying storage driver is using sync and we are writing data (not just metadata), + // make sure we sync everything before saving the layer data, this ensures that all + // files/directories are properly created and written. + if needsSyncfs { + switch r.driver.SyncMode() { + case drivers.SyncModeNone: + // Nothing to do. + case drivers.SyncModeFilesystem: + if err := system.Syncfs(filepath.Dir(rpath)); err != nil { + return err + } + default: + return fmt.Errorf("unknown sync mode: %q", r.driver.SyncMode().String()) + } + } if err := ioutils.AtomicWriteFileWithOpts(rpath, jldata, 0o600, &opts); err != nil { return err } @@ -1338,7 +1362,7 @@ func (r *layerStore) ClearFlag(id string, flag string) error { return ErrLayerUnknown } delete(layer.Flags, flag) - return r.saveFor(layer) + return r.saveFor(layer, false) } // Requires startWriting. @@ -1354,7 +1378,7 @@ func (r *layerStore) SetFlag(id string, flag string, value any) error { layer.Flags = make(map[string]any) } layer.Flags[flag] = value - return r.saveFor(layer) + return r.saveFor(layer, false) } func (r *layerStore) Status() ([][2]string, error) { @@ -1409,7 +1433,7 @@ func (r *layerStore) PutAdditionalLayer(id string, parentLayer *Layer, names []s if layer.TOCDigest != "" { r.bytocsum[layer.TOCDigest] = append(r.bytocsum[layer.TOCDigest], layer.ID) } - if err := r.saveFor(layer); err != nil { + if err := r.saveFor(layer, true); err != nil { if e := r.deleteWhileHoldingLock(layer.ID); e != nil { logrus.Errorf("While recovering from a failure to save layers, error deleting layer %#v: %v", id, e) } @@ -1562,7 +1586,7 @@ func (r *layerStore) create(id string, parentLayer *Layer, names []string, mount } }() - if err = r.saveFor(layer); err != nil { + if err = r.saveFor(layer, false); err != nil { cleanupFailureContext = "saving incomplete layer metadata" return nil, -1, err } @@ -1623,7 +1647,7 @@ func (r *layerStore) create(id string, parentLayer *Layer, names []string, mount cleanupFailureContext = "creating tar-split parent directory for a copy from template" return nil, -1, err } - if err = ioutils.AtomicWriteFile(r.tspath(id), templateTSdata, 0o600); err != nil { + if err = os.WriteFile(r.tspath(id), templateTSdata, 0o600); err != nil { cleanupFailureContext = "creating a tar-split copy from template" return nil, -1, err } @@ -1670,7 +1694,7 @@ func (r *layerStore) create(id string, parentLayer *Layer, names []string, mount } delete(layer.Flags, incompleteFlag) - if err = r.saveFor(layer); err != nil { + if err = r.saveFor(layer, true); err != nil { cleanupFailureContext = "saving finished layer metadata" return nil, -1, err } @@ -1904,7 +1928,7 @@ func (r *layerStore) updateNames(id string, names []string, op updateNameOperati r.byname[name] = layer } layer.Names = names - return r.saveFor(layer) + return r.saveFor(layer, false) } func (r *layerStore) datadir(id string) string { @@ -1967,7 +1991,7 @@ func (r *layerStore) setBigData(layer *Layer, key string, data io.Reader) error if !slices.Contains(layer.BigDataNames, key) { layer.BigDataNames = append(layer.BigDataNames, key) - return r.saveFor(layer) + return r.saveFor(layer, false) } return nil } @@ -1996,7 +2020,7 @@ func (r *layerStore) SetMetadata(id, metadata string) error { } if layer, ok := r.lookup(id); ok { layer.Metadata = metadata - return r.saveFor(layer) + return r.saveFor(layer, false) } return ErrLayerUnknown } @@ -2036,7 +2060,7 @@ func (r *layerStore) internalDelete(id string) ([]tempdir.CleanupTempDirFunc, er layer.Flags = make(map[string]any) } layer.Flags[incompleteFlag] = true - if err := r.saveFor(layer); err != nil { + if err := r.saveFor(layer, false); err != nil { return nil, err } } @@ -2136,7 +2160,7 @@ func (r *layerStore) deferredDelete(id string) ([]tempdir.CleanupTempDirFunc, er if err != nil { return cleanFunctions, err } - return cleanFunctions, r.saveFor(layer) + return cleanFunctions, r.saveFor(layer, false) } // Requires startReading or startWriting. @@ -2729,13 +2753,9 @@ func (r *layerStore) applyDiffWithOptions(to string, layerOptions *LayerOptions, return -1, err } - if err := tarSplitFile.Sync(); err != nil { - return -1, fmt.Errorf("sync tar-split file: %w", err) - } - r.applyDiffResultToLayer(layer, result) - err = r.saveFor(layer) + err = r.saveFor(layer, true) return result.size, err } @@ -2806,9 +2826,6 @@ func (r *layerStore) applyDiffFromStagingDirectory(id string, diffOutput *driver } maps.Copy(layer.Flags, options.Flags) } - if err = r.saveFor(layer); err != nil { - return err - } if diffOutput.TarSplit != nil { tarSplitFile, err := createTarSplitFile(r, layer.ID) @@ -2845,10 +2862,8 @@ func (r *layerStore) applyDiffFromStagingDirectory(id string, diffOutput *driver if err := tarSplitWriter.Flush(); err != nil { return fmt.Errorf("failed to flush tar-split writer buffer: %w", err) } - if err := tarSplitFile.Sync(); err != nil { - return fmt.Errorf("sync tar-split file: %w", err) - } } + for k, v := range diffOutput.BigData { if err := r.SetBigData(id, k, bytes.NewReader(v)); err != nil { if err2 := r.deleteWhileHoldingLock(id); err2 != nil { @@ -2857,6 +2872,10 @@ func (r *layerStore) applyDiffFromStagingDirectory(id string, diffOutput *driver return err } } + + if err = r.saveFor(layer, true); err != nil { + return err + } return err } diff --git a/storage/pkg/config/config.go b/storage/pkg/config/config.go index bf350c43cc..d10e7a7188 100644 --- a/storage/pkg/config/config.go +++ b/storage/pkg/config/config.go @@ -31,12 +31,16 @@ type OverlayOptionsConfig struct { // ForceMask indicates the permissions mask (e.g. "0755") to use for new // files and directories ForceMask string `toml:"force_mask,omitempty"` + // Sync controls filesystem sync during layer creation + Sync string `toml:"sync,omitempty"` } type VfsOptionsConfig struct { // IgnoreChownErrors is a flag for whether chown errors should be // ignored when building an image. IgnoreChownErrors string `toml:"ignore_chown_errors,omitempty"` + // Sync controls filesystem sync during layer creation + Sync string `toml:"sync,omitempty"` } type ZfsOptionsConfig struct { @@ -177,12 +181,18 @@ func GetGraphDriverOptions(driverName string, options OptionsConfig) []string { if options.Overlay.UseComposefs != "" { doptions = append(doptions, fmt.Sprintf("%s.use_composefs=%s", driverName, options.Overlay.UseComposefs)) } + if options.Overlay.Sync != "" { + doptions = append(doptions, fmt.Sprintf("%s.sync=%s", driverName, options.Overlay.Sync)) + } case "vfs": if options.Vfs.IgnoreChownErrors != "" { doptions = append(doptions, fmt.Sprintf("%s.ignore_chown_errors=%s", driverName, options.Vfs.IgnoreChownErrors)) } else if options.IgnoreChownErrors != "" { doptions = append(doptions, fmt.Sprintf("%s.ignore_chown_errors=%s", driverName, options.IgnoreChownErrors)) } + if options.Vfs.Sync != "" { + doptions = append(doptions, fmt.Sprintf("%s.sync=%s", driverName, options.Vfs.Sync)) + } case "zfs": if options.Zfs.Name != "" { diff --git a/storage/pkg/config/config_test.go b/storage/pkg/config/config_test.go index 477abb53ff..532ac181ec 100644 --- a/storage/pkg/config/config_test.go +++ b/storage/pkg/config/config_test.go @@ -185,6 +185,14 @@ func TestOverlayOptions(t *testing.T) { if !searchOptions(doptions, s100) { t.Fatalf("Expected to find size %q, got %v", s100, doptions) } + + // Make sure overlay sync option works + options = OptionsConfig{} + options.Overlay.Sync = "filesystem" + doptions = GetGraphDriverOptions("overlay", options) + if !searchOptions(doptions, "overlay.sync=filesystem") { + t.Fatalf("Expected to find overlay sync option in %v", doptions) + } } func TestVfsOptions(t *testing.T) { @@ -213,6 +221,14 @@ func TestVfsOptions(t *testing.T) { if len(doptions) == 0 { t.Fatalf("Expected 1 options, got %v", doptions) } + + // Make sure vfs sync option works + options = OptionsConfig{} + options.Vfs.Sync = "filesystem" + doptions = GetGraphDriverOptions("vfs", options) + if !searchOptions(doptions, "vfs.sync=filesystem") { + t.Fatalf("Expected to find vfs sync option in %v", doptions) + } } func TestZfsOptions(t *testing.T) { diff --git a/storage/pkg/system/syncfs_linux.go b/storage/pkg/system/syncfs_linux.go new file mode 100644 index 0000000000..93dd3f9e6b --- /dev/null +++ b/storage/pkg/system/syncfs_linux.go @@ -0,0 +1,23 @@ +//go:build linux + +package system + +import ( + "fmt" + + "golang.org/x/sys/unix" +) + +// Syncfs synchronizes the filesystem containing the given path. +func Syncfs(path string) error { + fd, err := unix.Open(path, unix.O_RDONLY|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("open for syncfs: %w", err) + } + defer unix.Close(fd) + + if err := unix.Syncfs(fd); err != nil { + return fmt.Errorf("syncfs: %w", err) + } + return nil +} diff --git a/storage/pkg/system/syncfs_unix.go b/storage/pkg/system/syncfs_unix.go new file mode 100644 index 0000000000..16068124a6 --- /dev/null +++ b/storage/pkg/system/syncfs_unix.go @@ -0,0 +1,13 @@ +//go:build unix && !linux + +package system + +import "golang.org/x/sys/unix" + +// Syncfs synchronizes the filesystem containing the given path. +// On non-Linux Unix platforms, this falls back to sync(2) which +// syncs all filesystems. +func Syncfs(path string) error { + unix.Sync() + return nil +} diff --git a/storage/pkg/system/syncfs_windows.go b/storage/pkg/system/syncfs_windows.go new file mode 100644 index 0000000000..1b208ba2d4 --- /dev/null +++ b/storage/pkg/system/syncfs_windows.go @@ -0,0 +1,10 @@ +//go:build windows + +package system + +import "errors" + +// Syncfs is not supported on Windows. +func Syncfs(path string) error { + return errors.New("syncfs is not supported on Windows") +} diff --git a/storage/storage.conf b/storage/storage.conf index 2fff0cecf2..ef755de274 100644 --- a/storage/storage.conf +++ b/storage/storage.conf @@ -176,3 +176,12 @@ mountopt = "nodev" # "force_mask" permissions. # # force_mask = "" + +# Sync filesystem before marking layer as present. Filesystem must support syncfs. +# Values: "none", "filesystem" +# sync = "none" + +[storage.options.vfs] +# Sync filesystem before marking layer as present. Filesystem must support syncfs. +# Values: "none", "filesystem" +# sync = "none"