//! Bidx (Butchunker Index) file format utilities //! //! The bidx file format: //! - Magic number: [u8; 4] = b"G00d" //! - Chunk hash start offset: u16 (little-endian, offset from file start to first chunk hash) //! - Original filename: [u8] (UTF-8, no null terminator, variable length) //! - Padding bytes: [u8] (0-15 bytes to ensure chunk hash start is 16-byte aligned) //! - Chunk hashes: [u8; 32][u8; 32][u8; 32]... (binary hashes, not hex strings) //! //! The chunk hash start offset allows quick access to chunk hashes regardless of filename length. //! It points to the first chunk hash, which is aligned to a 16-byte boundary. use std::io::{self, Write}; use std::path::Path; use crate::chunker::constants::BUTCK_INDEX_MAGIC; use crate::chunker::rw::storage::ChunkInfo; /// Write a bidx index file pub fn write_bidx_file( index_path: &Path, chunk_infos: &[ChunkInfo], original_file_path: &Path, ) -> io::Result<()> { let file = std::fs::File::create(index_path)?; let mut writer = io::BufWriter::new(file); // Magic bytes writer.write_all(&BUTCK_INDEX_MAGIC)?; // Get original filename let filename = original_file_path .file_name() .and_then(|n| n.to_str()) .unwrap_or("unknown"); let filename_bytes = filename.as_bytes(); // Calculate current position after magic(4) + offset(2) + filename let current_pos = 4u16 + 2u16 + filename_bytes.len() as u16; // Calculate padding needed to reach next 16-byte boundary let padding_needed = (16 - (current_pos % 16)) % 16; // Calculate chunk hash start offset (aligned to 16 bytes) let chunk_hash_start_offset = current_pos + padding_needed as u16; // Write chunk hash start offset as u16 (little-endian) writer.write_all(&chunk_hash_start_offset.to_le_bytes())?; // Write filename bytes writer.write_all(filename_bytes)?; // Write padding bytes (zeros) to reach 16-byte alignment if padding_needed > 0 { let padding = vec![0u8; padding_needed as usize]; writer.write_all(&padding)?; } // Write chunk hashes for chunk_info in chunk_infos { // Convert hex hash to 32-byte binary representation let hash_bytes = match hex::decode(&chunk_info.hash) { Ok(bytes) => bytes, Err(e) => { return Err(io::Error::new( io::ErrorKind::InvalidData, format!("Failed to decode hash hex '{}': {}", chunk_info.hash, e), )); } }; // Ensure hash is exactly 32 bytes if hash_bytes.len() != 32 { return Err(io::Error::new( io::ErrorKind::InvalidData, format!("Hash must be 32 bytes, got {} bytes", hash_bytes.len()), )); } // Write hash writer.write_all(&hash_bytes)?; } writer.flush()?; Ok(()) } /// Read a bidx index file pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec)> { use std::io::Read; let mut file = std::fs::File::open(index_path)?; let mut buffer = Vec::new(); file.read_to_end(&mut buffer)?; if buffer.len() < 6 { return Err(io::Error::new( io::ErrorKind::InvalidData, "File too short to contain magic number and offset", )); } // Check magic number if &buffer[0..4] != BUTCK_INDEX_MAGIC { return Err(io::Error::new( io::ErrorKind::InvalidData, "Invalid magic number", )); } // Read chunk hash start offset let chunk_hash_start_offset = u16::from_le_bytes([buffer[4], buffer[5]]) as usize; if chunk_hash_start_offset > buffer.len() { return Err(io::Error::new( io::ErrorKind::InvalidData, format!( "Chunk hash start offset {} exceeds file size {}", chunk_hash_start_offset, buffer.len() ), )); } // Read filename (everything from offset 6 to chunk_hash_start_offset, minus padding) let filename_start = 6; let filename_end = chunk_hash_start_offset; if filename_start >= filename_end { return Err(io::Error::new( io::ErrorKind::InvalidData, format!( "Invalid filename range: start={}, end={}", filename_start, filename_end ), )); } // Extract filename bytes (skip trailing zeros which are padding) let filename_range = &buffer[filename_start..filename_end]; // Find the last non-zero byte (end of actual filename) let filename_len = filename_range .iter() .rposition(|&b| b != 0) .map(|pos| pos + 1) .unwrap_or(0); if filename_len == 0 { return Err(io::Error::new( io::ErrorKind::InvalidData, "Filename is empty or contains only padding", )); } let filename_bytes = &filename_range[..filename_len]; let filename = String::from_utf8(filename_bytes.to_vec()).map_err(|e| { io::Error::new( io::ErrorKind::InvalidData, format!("Filename is not valid UTF-8: {}", e), ) })?; // Read chunk hashes let mut chunk_infos = Vec::new(); let hash_size = 32; let mut offset = chunk_hash_start_offset; while offset + hash_size <= buffer.len() { // Read hash let hash_bytes = &buffer[offset..offset + hash_size]; let hash = hex::encode(hash_bytes); offset += hash_size; chunk_infos.push(ChunkInfo { index: chunk_infos.len(), hash, }); } // Check if we read exactly all data if offset != buffer.len() { return Err(io::Error::new( io::ErrorKind::InvalidData, format!( "File contains {} extra bytes after chunk hashes", buffer.len() - offset ), )); } Ok((filename, chunk_infos)) } /// Get the chunk hash start offset from a bidx file without reading all data pub fn get_chunk_hash_start_offset(index_path: &Path) -> io::Result { use std::io::Read; let mut file = std::fs::File::open(index_path)?; let mut header = [0u8; 6]; // magic(4) + offset(2) file.read_exact(&mut header)?; // Check magic number if &header[0..4] != BUTCK_INDEX_MAGIC { return Err(io::Error::new( io::ErrorKind::InvalidData, "Invalid magic number", )); } // Read chunk hash start offset let offset = u16::from_le_bytes([header[4], header[5]]); Ok(offset) } /// Read a specific chunk hash by index pub fn read_chunk_hash_by_index(index_path: &Path, chunk_index: usize) -> io::Result { use std::io::{Read, Seek, SeekFrom}; let mut file = std::fs::File::open(index_path)?; // Read chunk hash start offset let chunk_hash_start_offset = get_chunk_hash_start_offset(index_path)? as u64; // Calculate position of the requested chunk hash let hash_size = 32u64; let position = chunk_hash_start_offset + (chunk_index as u64 * hash_size); // Seek to the position file.seek(SeekFrom::Start(position))?; // Read the hash let mut hash_bytes = [0u8; 32]; file.read_exact(&mut hash_bytes)?; // Convert to hex string let hash = hex::encode(hash_bytes); Ok(hash) }