diff options
Diffstat (limited to 'src/chunker/rw')
| -rw-r--r-- | src/chunker/rw/storage/bidx.rs | 133 |
1 file changed, 108 insertions, 25 deletions
diff --git a/src/chunker/rw/storage/bidx.rs b/src/chunker/rw/storage/bidx.rs index 783ded6..013fbad 100644 --- a/src/chunker/rw/storage/bidx.rs +++ b/src/chunker/rw/storage/bidx.rs @@ -2,9 +2,13 @@ //! //! The bidx file format: //! - Magic number: [u8; 4] = b"G00d" -//! - Original filename length: u16 (little-endian) -//! - Original filename: [u8] (UTF-8, no null terminator) +//! - Chunk hash start offset: u16 (little-endian, offset from file start to first chunk hash) +//! - Original filename: [u8] (UTF-8, no null terminator, variable length) +//! - Padding bytes: [u8] (0-15 bytes to ensure chunk hash start is 16-byte aligned) //! - Chunk hashes: [u8; 32][u8; 32][u8; 32]... (binary hashes, not hex strings) +//! +//! The chunk hash start offset allows quick access to chunk hashes regardless of filename length. +//! It points to the first chunk hash, which is aligned to a 16-byte boundary. use std::io::{self, Write}; use std::path::Path; @@ -31,21 +35,27 @@ pub fn write_bidx_file( .unwrap_or("unknown"); let filename_bytes = filename.as_bytes(); - // Validate filename length - if filename_bytes.len() > u16::MAX as usize { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - format!("Filename too long: {} bytes", filename_bytes.len()), - )); - } + // Calculate current position after magic(4) + offset(2) + filename + let current_pos = 4u16 + 2u16 + filename_bytes.len() as u16; + + // Calculate padding needed to reach next 16-byte boundary + let padding_needed = (16 - (current_pos % 16)) % 16; + + // Calculate chunk hash start offset (aligned to 16 bytes) + let chunk_hash_start_offset = current_pos + padding_needed as u16; - // Write filename length as u16 - let filename_len = filename_bytes.len() as u16; - writer.write_all(&filename_len.to_le_bytes())?; + // Write chunk hash start offset as u16 (little-endian) + writer.write_all(&chunk_hash_start_offset.to_le_bytes())?; // Write filename bytes writer.write_all(filename_bytes)?; + // Write padding bytes (zeros) 
to reach 16-byte alignment + if padding_needed > 0 { + let padding = vec![0u8; padding_needed as usize]; + writer.write_all(&padding)?; + } + // Write chunk hashes for chunk_info in chunk_infos { // Convert hex hash to 32-byte binary representation @@ -83,10 +93,10 @@ pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)> let mut buffer = Vec::new(); file.read_to_end(&mut buffer)?; - if buffer.len() < 4 { + if buffer.len() < 6 { return Err(io::Error::new( io::ErrorKind::InvalidData, - "File too short to contain magic number", + "File too short to contain magic number and offset", )); } @@ -98,37 +108,63 @@ pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)> )); } - let mut offset = 4; + // Read chunk hash start offset + let chunk_hash_start_offset = u16::from_le_bytes([buffer[4], buffer[5]]) as usize; + + if chunk_hash_start_offset > buffer.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "Chunk hash start offset {} exceeds file size {}", + chunk_hash_start_offset, + buffer.len() + ), + )); + } + + // Read filename (everything from offset 6 to chunk_hash_start_offset, minus padding) + let filename_start = 6; + let filename_end = chunk_hash_start_offset; - // Read filename length - if offset + 2 > buffer.len() { + if filename_start >= filename_end { return Err(io::Error::new( io::ErrorKind::InvalidData, - "File too short to contain filename length", + format!( + "Invalid filename range: start={}, end={}", + filename_start, filename_end + ), )); } - let filename_len = u16::from_le_bytes([buffer[offset], buffer[offset + 1]]) as usize; - offset += 2; - // Read filename - if offset + filename_len > buffer.len() { + // Extract filename bytes (skip trailing zeros which are padding) + let filename_range = &buffer[filename_start..filename_end]; + + // Find the last non-zero byte (end of actual filename) + let filename_len = filename_range + .iter() + .rposition(|&b| b != 0) + .map(|pos| 
pos + 1) + .unwrap_or(0); + + if filename_len == 0 { return Err(io::Error::new( io::ErrorKind::InvalidData, - "File too short to contain filename", + "Filename is empty or contains only padding", )); } - let filename_bytes = &buffer[offset..offset + filename_len]; + + let filename_bytes = &filename_range[..filename_len]; let filename = String::from_utf8(filename_bytes.to_vec()).map_err(|e| { io::Error::new( io::ErrorKind::InvalidData, format!("Filename is not valid UTF-8: {}", e), ) })?; - offset += filename_len; // Read chunk hashes let mut chunk_infos = Vec::new(); let hash_size = 32; + let mut offset = chunk_hash_start_offset; while offset + hash_size <= buffer.len() { // Read hash @@ -155,3 +191,50 @@ pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)> Ok((filename, chunk_infos)) } + +/// Get the chunk hash start offset from a bidx file without reading all data +pub fn get_chunk_hash_start_offset(index_path: &Path) -> io::Result<u16> { + use std::io::Read; + + let mut file = std::fs::File::open(index_path)?; + let mut header = [0u8; 6]; // magic(4) + offset(2) + + file.read_exact(&mut header)?; + + // Check magic number + if &header[0..4] != BUTCK_INDEX_MAGIC { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid magic number", + )); + } + + // Read chunk hash start offset + let offset = u16::from_le_bytes([header[4], header[5]]); + Ok(offset) +} + +/// Read a specific chunk hash by index +pub fn read_chunk_hash_by_index(index_path: &Path, chunk_index: usize) -> io::Result<String> { + use std::io::{Read, Seek, SeekFrom}; + + let mut file = std::fs::File::open(index_path)?; + + // Read chunk hash start offset + let chunk_hash_start_offset = get_chunk_hash_start_offset(index_path)? 
as u64; + + // Calculate position of the requested chunk hash + let hash_size = 32u64; + let position = chunk_hash_start_offset + (chunk_index as u64 * hash_size); + + // Seek to the position + file.seek(SeekFrom::Start(position))?; + + // Read the hash + let mut hash_bytes = [0u8; 32]; + file.read_exact(&mut hash_bytes)?; + + // Convert to hex string + let hash = hex::encode(hash_bytes); + Ok(hash) +} |
