@@ -17,7 +17,7 @@ use crate::internal::{
1717 DATA_ENTRY_OFFSET_MAGIC , DATA_ENTRY_OFFSET_MASK , DATA_FILE_SIGNATURE , DATA_FILE_VERSION ,
1818 EntryType , FILE_OFFSET_ALIGNMENT , KEY_NAMESPACE_BITS , KVBuf , KVRef , KeyNamespace ,
1919 MAX_KEY_NAMESPACE , PAGE_SIZE , READ_BUFFER_SIZE , SIZE_HINT_UNIT , data_file_path,
20- invalid_data_error, read_available_at, read_into_at, sync_dir, write_all_at,
20+ invalid_data_error, read_available_at, read_into_at, sync_dir, sync_file_range , write_all_at,
2121} ;
2222use crate :: types:: { Config , Error , MAX_USER_KEY_SIZE , MAX_USER_VALUE_SIZE , Result } ;
2323
@@ -197,24 +197,124 @@ impl Drop for InflightGuard<'_> {
197197pub ( crate ) struct DataFile {
198198 pub ( crate ) file : File ,
199199 file_offset : AtomicU64 ,
200+ last_synced_offset : AtomicU64 ,
200201 sealed_for_rotation : AtomicBool ,
201202 config : Arc < Config > ,
202203 pub ( crate ) file_idx : u16 ,
203204 pub ( crate ) file_ordinal : u64 ,
205+ preallocated : bool ,
206+ recovery_tail_upper_bound : u64 ,
204207}
205208
206209impl DataFile {
207210 pub ( crate ) fn used_bytes ( & self ) -> u64 {
208211 self . file_offset . load ( Ordering :: Acquire )
209212 }
210213
214+ pub ( crate ) fn recovery_tail_upper_bound ( & self ) -> u64 {
215+ self . recovery_tail_upper_bound
216+ }
217+
218+ pub ( crate ) fn sync_data ( & self , start_offset : u64 , end_offset : u64 ) -> Result < ( ) > {
219+ let used_bytes = self . used_bytes ( ) ;
220+ let start_offset = start_offset. min ( used_bytes) ;
221+ let end_offset = end_offset. min ( used_bytes) ;
222+ if end_offset <= start_offset {
223+ return Ok ( ( ) ) ;
224+ }
225+
226+ if !self . preallocated {
227+ self . file . sync_all ( ) . map_err ( Error :: IOError ) ?;
228+ } else {
229+ sync_file_range (
230+ & self . file ,
231+ size_of :: < DataFileHeader > ( ) as u64 + start_offset,
232+ end_offset - start_offset,
233+ ) ?;
234+ }
235+ self . last_synced_offset
236+ . fetch_max ( end_offset, Ordering :: Release ) ;
237+ Ok ( ( ) )
238+ }
239+
240+ pub ( crate ) fn sync_to_current ( & self ) -> Result < ( ) > {
241+ let start = self . last_synced_offset . load ( Ordering :: Acquire ) ;
242+ self . sync_data ( start, self . used_bytes ( ) )
243+ }
244+
211245 pub ( crate ) fn truncate_to_offset ( & self , file_offset : u64 ) -> Result < ( ) > {
212246 debug_assert_eq ! ( file_offset % FILE_OFFSET_ALIGNMENT , 0 ) ;
213- self . file
214- . set_len ( size_of :: < DataFileHeader > ( ) as u64 + file_offset)
215- . map_err ( Error :: IOError ) ?;
247+ if self . preallocated {
248+ // A crash between the two set_len calls would leave the file
249+ // non-preallocated. That is harmless: the next open will
250+ // detect it as non-preallocated and fall back to sync_all
251+ // until rotation creates a fresh preallocated file.
252+ self . file
253+ . set_len ( size_of :: < DataFileHeader > ( ) as u64 + file_offset)
254+ . map_err ( Error :: IOError ) ?;
255+ self . file
256+ . set_len ( size_of :: < DataFileHeader > ( ) as u64 + self . config . max_data_file_size as u64 )
257+ . map_err ( Error :: IOError ) ?;
258+ } else {
259+ self . file
260+ . set_len ( size_of :: < DataFileHeader > ( ) as u64 + file_offset)
261+ . map_err ( Error :: IOError ) ?;
262+ }
216263 self . file_offset . store ( file_offset, Ordering :: Release ) ;
217- self . file . sync_all ( ) . map_err ( Error :: IOError )
264+ self . file . sync_all ( ) . map_err ( Error :: IOError ) ?;
265+ self . last_synced_offset
266+ . store ( file_offset, Ordering :: Release ) ;
267+ Ok ( ( ) )
268+ }
269+
270+ fn used_data_upper_bound ( file : & File , physical_data_len : u64 ) -> Result < u64 > {
271+ if physical_data_len == 0 {
272+ return Ok ( 0 ) ;
273+ }
274+
275+ let mut end = physical_data_len;
276+ while end > 0 {
277+ let start = end. saturating_sub ( READ_BUFFER_SIZE as u64 ) ;
278+ let chunk = read_available_at (
279+ file,
280+ ( end - start) as usize ,
281+ size_of :: < DataFileHeader > ( ) as u64 + start,
282+ )
283+ . map_err ( Error :: IOError ) ?;
284+ if let Some ( rel) = chunk. iter ( ) . rposition ( |byte| * byte != 0 ) {
285+ let aligned = ( start + rel as u64 + 1 ) . next_multiple_of ( FILE_OFFSET_ALIGNMENT ) ;
286+ return Ok ( aligned. min ( physical_data_len) ) ;
287+ }
288+ end = start;
289+ }
290+
291+ Ok ( 0 )
292+ }
293+
294+ /// Scans forward from offset 0, parsing each entry, and returns the
295+ /// aligned end of the last valid entry. We temporarily set `file_offset`
296+ /// to `tail_upper_bound` so that `read_next_entry_ref` won't short-circuit
297+ /// before reaching it. This is safe because `open` is single-threaded;
298+ /// the real value is overwritten by the caller immediately after.
299+ fn detect_used_bytes ( & self , tail_upper_bound : u64 ) -> Result < u64 > {
300+ if tail_upper_bound == 0 {
301+ return Ok ( 0 ) ;
302+ }
303+
304+ self . file_offset . store ( tail_upper_bound, Ordering :: Release ) ;
305+
306+ let mut offset = 0u64 ;
307+ let mut read_buf = Vec :: new ( ) ;
308+ let mut buf_file_offset = 0u64 ;
309+ let mut last_durable_offset = 0u64 ;
310+ while let Some ( ( _, _, next_offset) ) =
311+ self . read_next_entry_ref ( offset, & mut read_buf, & mut buf_file_offset) ?
312+ {
313+ offset = next_offset;
314+ last_durable_offset = next_offset. next_multiple_of ( FILE_OFFSET_ALIGNMENT ) ;
315+ }
316+
317+ Ok ( last_durable_offset)
218318 }
219319
220320 fn parse_data_entry ( buf : & [ u8 ] , offset : u64 ) -> Result < ParsedDataEntry > {
@@ -296,23 +396,30 @@ impl DataFile {
296396 "invalid data file header" ,
297397 ) ) ) ;
298398 }
299- let mut file_offset = file
399+ let physical_data_len = file
300400 . metadata ( )
301401 . map_err ( Error :: IOError ) ?
302402 . len ( )
303403 . saturating_sub ( size_of :: < DataFileHeader > ( ) as u64 ) ;
304- file_offset -= file_offset % FILE_OFFSET_ALIGNMENT ;
305- file. set_len ( size_of :: < DataFileHeader > ( ) as u64 + file_offset)
306- . map_err ( Error :: IOError ) ?;
404+ let preallocated = physical_data_len == config. max_data_file_size as u64 ;
405+ let recovery_tail_upper_bound = Self :: used_data_upper_bound ( & file, physical_data_len) ?;
307406
308- Ok ( Self {
407+ let inst = Self {
309408 file,
310- file_offset : AtomicU64 :: new ( file_offset) ,
409+ file_offset : AtomicU64 :: new ( physical_data_len) ,
410+ last_synced_offset : AtomicU64 :: new ( 0 ) ,
311411 sealed_for_rotation : AtomicBool :: new ( false ) ,
312412 config,
313413 file_idx,
314414 file_ordinal : header. ordinal ,
315- } )
415+ preallocated,
416+ recovery_tail_upper_bound,
417+ } ;
418+ let used_bytes = inst. detect_used_bytes ( recovery_tail_upper_bound) ?;
419+ inst. file_offset . store ( used_bytes, Ordering :: Release ) ;
420+ inst. last_synced_offset . store ( used_bytes, Ordering :: Release ) ;
421+
422+ Ok ( inst)
316423 }
317424
318425 pub ( crate ) fn create (
@@ -328,7 +435,7 @@ impl DataFile {
328435 . write ( true )
329436 . open ( data_file_path ( base_path, file_idx) )
330437 . map_err ( Error :: IOError ) ?;
331- file. set_len ( size_of :: < DataFileHeader > ( ) as u64 )
438+ file. set_len ( size_of :: < DataFileHeader > ( ) as u64 + config . max_data_file_size as u64 )
332439 . map_err ( Error :: IOError ) ?;
333440 let header = DataFileHeader {
334441 magic : * DATA_FILE_SIGNATURE ,
@@ -343,10 +450,13 @@ impl DataFile {
343450 Ok ( Self {
344451 file,
345452 file_offset : AtomicU64 :: new ( 0 ) ,
453+ last_synced_offset : AtomicU64 :: new ( 0 ) ,
346454 sealed_for_rotation : AtomicBool :: new ( false ) ,
347455 config,
348456 file_idx,
349457 file_ordinal : ordinal,
458+ preallocated : true ,
459+ recovery_tail_upper_bound : 0 ,
350460 } )
351461 }
352462
@@ -554,9 +664,16 @@ impl DataFile {
554664 read_buf : & ' a mut Vec < u8 > ,
555665 buf_file_offset : & mut u64 ,
556666 ) -> Result < Option < ( KVRef < ' a > , u64 , u64 ) > > {
667+ let used_bytes = self . used_bytes ( ) ;
668+ if offset >= used_bytes {
669+ return Ok ( None ) ;
670+ }
557671 offset = offset. next_multiple_of ( FILE_OFFSET_ALIGNMENT ) ;
558672
559673 loop {
674+ if offset >= used_bytes {
675+ return Ok ( None ) ;
676+ }
560677 let buf_start = if offset >= * buf_file_offset {
561678 ( offset - * buf_file_offset) as usize
562679 } else {
0 commit comments