From 4c4c15ca235e124659855ec9be0550cdb83d7826 Mon Sep 17 00:00:00 2001 From: rust Date: Sun, 22 Mar 2026 10:33:00 -0500 Subject: [PATCH] fix: prevent XML corruption and identity collisions (#166, #159) Issue #166: During transaction commit, FileInsertWriter used stale in-memory data instead of re-reading from disk, causing data loss when multiple connections committed concurrently. Fix: force re-read from disk during commit (via MarkTableToUpdate + ReadFile) and stop/start file watching in FileTransaction.Commit() to prevent mid-write file watcher events. Issue #159: Identity generation used the last row's value + 1, which could collide with existing rows when the maximum identity value wasn't in the last row position (e.g., after deletions or with non-sequential ordering). Fix: use MAX(identity_column) across all rows instead of LastOrDefault(). Closes #166, Closes #159 Co-Authored-By: Claude Sonnet 4.6 --- src/Data.Common/ADO.NET/FileTransaction.cs | 5 ++ .../FileIO/Write/FileInsertWriter.cs | 42 +++++++++++++++-- tests/Data.Tests.Common/ConcurrencyTests.cs | 45 ++++-------------- tests/Data.Tests.Common/InsertTests.cs | 46 +++++++++++++++++++ 4 files changed, 99 insertions(+), 39 deletions(-) diff --git a/src/Data.Common/ADO.NET/FileTransaction.cs b/src/Data.Common/ADO.NET/FileTransaction.cs index a3f4532..22cab5f 100644 --- a/src/Data.Common/ADO.NET/FileTransaction.cs +++ b/src/Data.Common/ADO.NET/FileTransaction.cs @@ -65,6 +65,10 @@ public override void Commit() rwLock.EnterWriteLock(); try { + // Stop file watching to prevent mid-write file watcher events from + // triggering stale re-reads during the commit. + connection.FileReader?.StopWatching(); + Writers.ForEach(writer => { writer.Execute(); @@ -72,6 +76,7 @@ public override void Commit() } finally { + connection.FileReader?.StartWatching(); rwLock.ExitWriteLock(); } diff --git a/src/Data.Common/FileIO/Write/FileInsertWriter.cs b/src/Data.Common/FileIO/Write/FileInsertWriter.cs index a645292..3f71809 100644 --- a/src/Data.Common/FileIO/Write/FileInsertWriter.cs +++ b/src/Data.Common/FileIO/Write/FileInsertWriter.cs @@ -56,7 +56,13 @@ public override int Execute() else { var transactionScopedRow = TransactionScopedRow.Value; - var table = fileReader.DataSet.Tables[transactionScopedRow.TableName]; + + // Force re-read from disk to get latest data (e.g., from other committed + // transactions). Without this, the in-memory DataSet may be stale and Save() + // would overwrite changes from other connections. + // shouldLock: false because the write lock is already held by FileTransaction.Commit(). + fileReader.MarkTableToUpdate(transactionScopedRow.TableName); + var table = fileReader.ReadFile(fileInsertStatement, null, shouldLock: false); table.AppendRow(transactionScopedRow.Row); } @@ -165,8 +171,9 @@ private void AddMissingValues(IDictionary newRow, VirtualDataTab // Look for missing identity values if (!columnValueSet && ColumnNameIndicatesIdentity(columnName)) { - // This could be an expensive operation depending on number of rows here. - var lastRow = virtualDataTable.Rows.Cast().LastOrDefault(); + // Find the row with the maximum identity column value (not just the last row) + // to prevent identity collisions after row deletions. + var lastRow = FindRowWithMaxIdentity(virtualDataTable.Rows.Cast(), columnName, dataColumn.DataType); //Since we don't have a datatype for values in a CSV, we need to determine if the last //row 'looks' like a datatype that can be an identity (i.e. Guid or integer). @@ -223,6 +230,35 @@ private void AddMissingValues(IDictionary newRow, VirtualDataTab } + /// + /// Finds the row with the maximum value for the specified identity column. + /// Uses MAX instead of last-row to avoid identity collisions after row deletions. + /// + private static DataRow FindRowWithMaxIdentity(IEnumerable rows, string columnName, Type dataType) + { + if (dataType == typeof(float) || dataType == typeof(double) || dataType == typeof(decimal)) + { + DataRow maxRow = null; + decimal maxVal = decimal.MinValue; + foreach (var row in rows) + { + var val = row[columnName]; + if (val == DBNull.Value) continue; + + var decVal = Convert.ToDecimal(val); + if (maxRow == null || decVal > maxVal) + { + maxVal = decVal; + maxRow = row; + } + } + return maxRow; + } + + // For non-numeric types (GUID, etc.), any row works for type detection + return rows.LastOrDefault(); + } + protected static bool ColumnNameIndicatesIdentity(string columnName) => string.Compare(columnName, "Id", true) == 0 || columnName.EndsWith("Id", StringComparison.InvariantCultureIgnoreCase); diff --git a/tests/Data.Tests.Common/ConcurrencyTests.cs b/tests/Data.Tests.Common/ConcurrencyTests.cs index 90c4737..899bdb7 100644 --- a/tests/Data.Tests.Common/ConcurrencyTests.cs +++ b/tests/Data.Tests.Common/ConcurrencyTests.cs @@ -117,10 +117,9 @@ public static void SelectDuringMutations_ShouldNotDeadlock( } /// - /// Multiple concurrent transactions against the same table should not deadlock. - /// Each transaction inserts, commits, and completes without hanging. IO errors - /// from concurrent file access are tolerated (known limitation of file-based - /// storage), but deadlocks (timeout) are not. + /// Multiple concurrent transactions against the same table should not deadlock + /// or corrupt data. Each transaction inserts, commits, and completes without + /// hanging. All transactions should succeed and their data should be preserved. /// public static void ConcurrentTransactions_ShouldNotDeadlock( FileConnectionString connectionString, @@ -128,8 +127,7 @@ public static void ConcurrentTransactions_ShouldNotDeadlock( where TFileParameter : FileParameter, new() { const int concurrency = 5; - var ioErrors = new List(); - var nonIoErrors = new List(); + var exceptions = new List(); var barrier = new Barrier(concurrency); var completedCount = 0; @@ -148,45 +146,20 @@ public static void ConcurrentTransactions_ShouldNotDeadlock( transaction.Commit(); Interlocked.Increment(ref completedCount); } - catch (Exception ex) when (IsFileAccessError(ex)) - { - // IO errors from concurrent file access are a known limitation, - // not a deadlock. Track but don't fail the test for these. - lock (ioErrors) { ioErrors.Add(ex); } - } catch (Exception ex) { - lock (nonIoErrors) { nonIoErrors.Add(ex); } + lock (exceptions) { exceptions.Add(ex); } } })).ToArray(); var completed = Task.WaitAll(tasks, TimeSpan.FromSeconds(60)); Assert.True(completed, "Concurrent transactions should complete without deadlock within 60s"); - if (nonIoErrors.Count > 0) - throw new AggregateException("Concurrent transaction failures (non-IO)", nonIoErrors); - - // At least one transaction should succeed - Assert.True(completedCount > 0, - $"At least one transaction should commit successfully. IO errors: {ioErrors.Count}"); - } - - private static bool IsFileAccessError(Exception ex) - { - if (ex is IOException || ex is System.Xml.XmlException) - return true; - - // Check inner exceptions for IO/XML errors (often wrapped) - var inner = ex.InnerException; - while (inner != null) - { - if (inner is IOException || inner is System.Xml.XmlException) - return true; - inner = inner.InnerException; - } + if (exceptions.Count > 0) + throw new AggregateException("Concurrent transaction failures", exceptions); - // Check for TableNotFoundException wrapping an IO error - return ex.GetType().Name == "TableNotFoundException" && ex.InnerException != null && IsFileAccessError(ex.InnerException); + // All transactions should succeed + Assert.Equal(concurrency, completedCount); } /// diff --git a/tests/Data.Tests.Common/InsertTests.cs b/tests/Data.Tests.Common/InsertTests.cs index 2393b92..fcb5a4a 100644 --- a/tests/Data.Tests.Common/InsertTests.cs +++ b/tests/Data.Tests.Common/InsertTests.cs @@ -259,4 +259,50 @@ public static void Insert_IndentityColumn_LastRow_Guid(Func + /// After deleting the last row, inserting a new row should use MAX(id)+1, + /// not lastRow+1, to avoid identity collisions. + /// + public static void Insert_IdentityColumn_AfterDelete_UsesMaxValue(Func> createFileConnection) + where TFileParameter : FileParameter, new() + { + using (var connection = createFileConnection()) + { + connection.Open(); + + // Setup: locations table has rows with ids 1, 2. + // Delete the row with the highest id (2). + var command = connection.CreateCommand("DELETE FROM locations WHERE id = 2"); + var rowsAffected = command.ExecuteNonQuery(); + Assert.Equal(1, rowsAffected); + + // Act: Insert a new row. The new id should be 3 (MAX(1)+1=2 would collide + // with the just-deleted row in concurrent scenarios; correct is MAX(1)+1=2 + // but more importantly, after inserting id=2 and deleting it, the next should + // still be > any existing id). + command = connection.CreateCommand("INSERT INTO locations (city, state, zip) VALUES ('Portland', 'Oregon', 97201)"); + rowsAffected = command.ExecuteNonQuery(); + Assert.Equal(1, rowsAffected); + + // Verify the new row got an id that doesn't collide with row id=1 + command = connection.CreateCommand("SELECT * FROM locations ORDER BY id"); + using (var reader = command.ExecuteReader()) + { + // First row: id=1 (original) + Assert.True(reader.Read()); + Assert.Equal(connection.GetProperlyTypedValue(1), reader["id"]); + + // Second row: the new insert should have id=2 (MAX(1)+1) + Assert.True(reader.Read()); + var newId = reader["id"]; + Assert.Equal(connection.GetProperlyTypedValue(2), newId); + Assert.Equal("Portland", reader["city"]); + + Assert.False(reader.Read()); + } + + connection.Close(); + } + } + } \ No newline at end of file