diff options
| author | 魏曹先生 <1992414357@qq.com> | 2026-03-15 13:41:39 +0800 |
|---|---|---|
| committer | 魏曹先生 <1992414357@qq.com> | 2026-03-15 13:41:39 +0800 |
| commit | bfe99f8f08d35d0fbecb05ad4722fb279cb8cfc0 (patch) | |
| tree | 4fea17c8d11d49fc1a2066b29b641604d6e2b624 | |
| parent | 38066205582b25b0f2dfeb1786a2d9e428e8dae0 (diff) | |
Update bidx format to align chunk hashes and add hash constants
| -rw-r--r-- | Cargo.lock | 11 | ||||
| -rw-r--r-- | Cargo.toml | 11 | ||||
| -rw-r--r-- | cbindgen.toml | 3 | ||||
| -rw-r--r-- | src/chunker/rw/storage/bidx.rs | 133 |
4 files changed, 127 insertions, 31 deletions
@@ -117,6 +117,7 @@ dependencies = [ "just_template", "log", "memmap2", + "serde", "sha2", "syn", "thiserror", @@ -596,6 +597,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] name = "serde_core" version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -8,9 +8,7 @@ default = [] ffi = [] [workspace] -members = [ - "ffi", -] +members = ["ffi"] [profile.dev] opt-level = 0 @@ -55,9 +53,6 @@ tokio = { version = "1", features = ["full"] } colored = "3" just_progress = "0.1.3" -# Serialize & Config -toml = "1" - # Logging log = "0.4" env_logger = "0.11" @@ -66,3 +61,7 @@ env_logger = "0.11" blake3 = "1.8" sha2 = "0.10" hex = "0.4" + +# Serialize & Config +serde = { version = "1", features = ["derive"] } +toml = "1" diff --git a/cbindgen.toml b/cbindgen.toml index 65d71c6..567e75f 100644 --- a/cbindgen.toml +++ b/cbindgen.toml @@ -14,6 +14,9 @@ header = """ * * All string-returning functions allocate memory that must be freed using Butck_FreeString(). */ + +#define BUTCK_HASH_BLAKE3 0 +#define BUTCK_HASH_SHA256 1 """ include_guard = "BUTCK_H" diff --git a/src/chunker/rw/storage/bidx.rs b/src/chunker/rw/storage/bidx.rs index 783ded6..013fbad 100644 --- a/src/chunker/rw/storage/bidx.rs +++ b/src/chunker/rw/storage/bidx.rs @@ -2,9 +2,13 @@ //! //! The bidx file format: //! - Magic number: [u8; 4] = b"G00d" -//! - Original filename length: u16 (little-endian) -//! - Original filename: [u8] (UTF-8, no null terminator) +//! - Chunk hash start offset: u16 (little-endian, offset from file start to first chunk hash) +//! - Original filename: [u8] (UTF-8, no null terminator, variable length) +//! - Padding bytes: [u8] (0-15 bytes to ensure chunk hash start is 16-byte aligned) //! - Chunk hashes: [u8; 32][u8; 32][u8; 32]... (binary hashes, not hex strings) +//! +//! The chunk hash start offset allows quick access to chunk hashes regardless of filename length. +//! It points to the first chunk hash, which is aligned to a 16-byte boundary. use std::io::{self, Write}; use std::path::Path; @@ -31,21 +35,27 @@ pub fn write_bidx_file( .unwrap_or("unknown"); let filename_bytes = filename.as_bytes(); - // Validate filename length - if filename_bytes.len() > u16::MAX as usize { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - format!("Filename too long: {} bytes", filename_bytes.len()), - )); - } + // Calculate current position after magic(4) + offset(2) + filename + let current_pos = 4u16 + 2u16 + filename_bytes.len() as u16; + + // Calculate padding needed to reach next 16-byte boundary + let padding_needed = (16 - (current_pos % 16)) % 16; + + // Calculate chunk hash start offset (aligned to 16 bytes) + let chunk_hash_start_offset = current_pos + padding_needed as u16; - // Write filename length as u16 - let filename_len = filename_bytes.len() as u16; - writer.write_all(&filename_len.to_le_bytes())?; + // Write chunk hash start offset as u16 (little-endian) + writer.write_all(&chunk_hash_start_offset.to_le_bytes())?; // Write filename bytes writer.write_all(filename_bytes)?; + // Write padding bytes (zeros) to reach 16-byte alignment + if padding_needed > 0 { + let padding = vec![0u8; padding_needed as usize]; + writer.write_all(&padding)?; + } + // Write chunk hashes for chunk_info in chunk_infos { // Convert hex hash to 32-byte binary representation @@ -83,10 +93,10 @@ pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)> let mut buffer = Vec::new(); file.read_to_end(&mut buffer)?; - if buffer.len() < 4 { + if buffer.len() < 6 { return Err(io::Error::new( io::ErrorKind::InvalidData, - "File too short to contain magic number", + "File too short to contain magic number and offset", )); } @@ -98,37 +108,63 @@ pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)> )); } - let mut offset = 4; + // Read chunk hash start offset + let chunk_hash_start_offset = u16::from_le_bytes([buffer[4], buffer[5]]) as usize; + + if chunk_hash_start_offset > buffer.len() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "Chunk hash start offset {} exceeds file size {}", + chunk_hash_start_offset, + buffer.len() + ), + )); + } + + // Read filename (everything from offset 6 to chunk_hash_start_offset, minus padding) + let filename_start = 6; + let filename_end = chunk_hash_start_offset; - // Read filename length - if offset + 2 > buffer.len() { + if filename_start >= filename_end { return Err(io::Error::new( io::ErrorKind::InvalidData, - "File too short to contain filename length", + format!( + "Invalid filename range: start={}, end={}", + filename_start, filename_end + ), )); } - let filename_len = u16::from_le_bytes([buffer[offset], buffer[offset + 1]]) as usize; - offset += 2; - // Read filename - if offset + filename_len > buffer.len() { + // Extract filename bytes (skip trailing zeros which are padding) + let filename_range = &buffer[filename_start..filename_end]; + + // Find the last non-zero byte (end of actual filename) + let filename_len = filename_range + .iter() + .rposition(|&b| b != 0) + .map(|pos| pos + 1) + .unwrap_or(0); + + if filename_len == 0 { return Err(io::Error::new( io::ErrorKind::InvalidData, - "File too short to contain filename", + "Filename is empty or contains only padding", )); } - let filename_bytes = &buffer[offset..offset + filename_len]; + + let filename_bytes = &filename_range[..filename_len]; let filename = String::from_utf8(filename_bytes.to_vec()).map_err(|e| { io::Error::new( io::ErrorKind::InvalidData, format!("Filename is not valid UTF-8: {}", e), ) })?; - offset += filename_len; // Read chunk hashes let mut chunk_infos = Vec::new(); let hash_size = 32; + let mut offset = chunk_hash_start_offset; while offset + hash_size <= buffer.len() { // Read hash @@ -155,3 +191,50 @@ pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)> Ok((filename, chunk_infos)) } + +/// Get the chunk hash start offset from a bidx file without reading all data +pub fn get_chunk_hash_start_offset(index_path: &Path) -> io::Result<u16> { + use std::io::Read; + + let mut file = std::fs::File::open(index_path)?; + let mut header = [0u8; 6]; // magic(4) + offset(2) + + file.read_exact(&mut header)?; + + // Check magic number + if &header[0..4] != BUTCK_INDEX_MAGIC { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid magic number", + )); + } + + // Read chunk hash start offset + let offset = u16::from_le_bytes([header[4], header[5]]); + Ok(offset) +} + +/// Read a specific chunk hash by index +pub fn read_chunk_hash_by_index(index_path: &Path, chunk_index: usize) -> io::Result<String> { + use std::io::{Read, Seek, SeekFrom}; + + let mut file = std::fs::File::open(index_path)?; + + // Read chunk hash start offset + let chunk_hash_start_offset = get_chunk_hash_start_offset(index_path)? as u64; + + // Calculate position of the requested chunk hash + let hash_size = 32u64; + let position = chunk_hash_start_offset + (chunk_index as u64 * hash_size); + + // Seek to the position + file.seek(SeekFrom::Start(position))?; + + // Read the hash + let mut hash_bytes = [0u8; 32]; + file.read_exact(&mut hash_bytes)?; + + // Convert to hex string + let hash = hex::encode(hash_bytes); + Ok(hash) +} |
