Update bidx format to align chunk hashes and add hash constants

author: 魏曹先生 <1992414357@qq.com> 2026-03-15 13:41:39 +0800
committer: 魏曹先生 <1992414357@qq.com> 2026-03-15 13:41:39 +0800
commit: bfe99f8f08d35d0fbecb05ad4722fb279cb8cfc0 (patch)
tree: 4fea17c8d11d49fc1a2066b29b641604d6e2b624
parent: 38066205582b25b0f2dfeb1786a2d9e428e8dae0 (diff)
4 files changed, 127 insertions, 31 deletions
diff --git a/Cargo.lock b/Cargo.lock
index c97d8e1..ff9319e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -117,6 +117,7 @@ dependencies = [
  "just_template",
  "log",
  "memmap2",
+ "serde",
  "sha2",
  "syn",
  "thiserror",
@@ -596,6 +597,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 
 [[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
 name = "serde_core"
 version = "1.0.228"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index 3c127c5..9ec122b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,9 +8,7 @@ default = []
 ffi = []
 
 [workspace]
-members = [
-    "ffi",
-]
+members = ["ffi"]
 
 [profile.dev]
 opt-level = 0
@@ -55,9 +53,6 @@ tokio = { version = "1", features = ["full"] }
 colored = "3"
 just_progress = "0.1.3"
 
-# Serialize & Config
-toml = "1"
-
 # Logging
 log = "0.4"
 env_logger = "0.11"
@@ -66,3 +61,7 @@ env_logger = "0.11"
 blake3 = "1.8"
 sha2 = "0.10"
 hex = "0.4"
+
+# Serialize & Config
+serde = { version = "1", features = ["derive"] }
+toml = "1"
diff --git a/cbindgen.toml b/cbindgen.toml
index 65d71c6..567e75f 100644
--- a/cbindgen.toml
+++ b/cbindgen.toml
@@ -14,6 +14,9 @@ header = """
  *
  * All string-returning functions allocate memory that must be freed using Butck_FreeString().
  */
+
+#define BUTCK_HASH_BLAKE3 0
+#define BUTCK_HASH_SHA256 1
 """
 
 include_guard = "BUTCK_H"
diff --git a/src/chunker/rw/storage/bidx.rs b/src/chunker/rw/storage/bidx.rs
index 783ded6..013fbad 100644
--- a/src/chunker/rw/storage/bidx.rs
+++ b/src/chunker/rw/storage/bidx.rs
@@ -2,9 +2,13 @@
 //!
 //! The bidx file format:
 //! - Magic number: [u8; 4] = b"G00d"
-//! - Original filename length: u16 (little-endian)
-//! - Original filename: [u8] (UTF-8, no null terminator)
+//! - Chunk hash start offset: u16 (little-endian, offset from file start to first chunk hash)
+//! - Original filename: [u8] (UTF-8, no null terminator, variable length)
+//! - Padding bytes: [u8] (0-15 zero bytes so the chunk hash start is 16-byte aligned; the filename length is not stored, readers trim these trailing zeros to find its end)
 //! - Chunk hashes: [u8; 32][u8; 32][u8; 32]... (binary hashes, not hex strings)
+//!
+//! The chunk hash start offset allows quick access to chunk hashes regardless of filename length.
+//! It points to the first chunk hash, which is aligned to a 16-byte boundary.
 
 use std::io::{self, Write};
 use std::path::Path;
@@ -31,21 +35,27 @@ pub fn write_bidx_file(
         .unwrap_or("unknown");
     let filename_bytes = filename.as_bytes();
 
-    // Validate filename length
-    if filename_bytes.len() > u16::MAX as usize {
-        return Err(io::Error::new(
-            io::ErrorKind::InvalidInput,
-            format!("Filename too long: {} bytes", filename_bytes.len()),
-        ));
-    }
+    // Calculate current position after magic(4) + offset(2) + filename
+    let current_pos = 4u16 + 2u16 + filename_bytes.len() as u16;
+
+    // Calculate padding needed to reach next 16-byte boundary
+    let padding_needed = (16 - (current_pos % 16)) % 16;
+
+    // Calculate chunk hash start offset (aligned to 16 bytes)
+    let chunk_hash_start_offset = current_pos + padding_needed as u16;
 
-    // Write filename length as u16
-    let filename_len = filename_bytes.len() as u16;
-    writer.write_all(&filename_len.to_le_bytes())?;
+    // Write chunk hash start offset as u16 (little-endian)
+    writer.write_all(&chunk_hash_start_offset.to_le_bytes())?;
 
     // Write filename bytes
     writer.write_all(filename_bytes)?;
 
+    // Write padding bytes (zeros) to reach 16-byte alignment
+    if padding_needed > 0 {
+        let padding = vec![0u8; padding_needed as usize];
+        writer.write_all(&padding)?;
+    }
+
     // Write chunk hashes
     for chunk_info in chunk_infos {
         // Convert hex hash to 32-byte binary representation
@@ -83,10 +93,10 @@ pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)>
     let mut buffer = Vec::new();
     file.read_to_end(&mut buffer)?;
 
-    if buffer.len() < 4 {
+    if buffer.len() < 6 {
         return Err(io::Error::new(
             io::ErrorKind::InvalidData,
-            "File too short to contain magic number",
+            "File too short to contain magic number and offset",
         ));
     }
 
@@ -98,37 +108,63 @@ pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)>
         ));
     }
 
-    let mut offset = 4;
+    // Read chunk hash start offset
+    let chunk_hash_start_offset = u16::from_le_bytes([buffer[4], buffer[5]]) as usize;
+
+    if chunk_hash_start_offset > buffer.len() {
+        return Err(io::Error::new(
+            io::ErrorKind::InvalidData,
+            format!(
+                "Chunk hash start offset {} exceeds file size {}",
+                chunk_hash_start_offset,
+                buffer.len()
+            ),
+        ));
+    }
+
+    // Read filename (everything from offset 6 to chunk_hash_start_offset, minus padding)
+    let filename_start = 6;
+    let filename_end = chunk_hash_start_offset;
 
-    // Read filename length
-    if offset + 2 > buffer.len() {
+    if filename_start >= filename_end {
         return Err(io::Error::new(
             io::ErrorKind::InvalidData,
-            "File too short to contain filename length",
+            format!(
+                "Invalid filename range: start={}, end={}",
+                filename_start, filename_end
+            ),
         ));
     }
-    let filename_len = u16::from_le_bytes([buffer[offset], buffer[offset + 1]]) as usize;
-    offset += 2;
 
-    // Read filename
-    if offset + filename_len > buffer.len() {
+    // Extract filename bytes (skip trailing zeros which are padding)
+    let filename_range = &buffer[filename_start..filename_end];
+
+    // Find the last non-zero byte (end of actual filename)
+    let filename_len = filename_range
+        .iter()
+        .rposition(|&b| b != 0)
+        .map(|pos| pos + 1)
+        .unwrap_or(0);
+
+    if filename_len == 0 {
         return Err(io::Error::new(
             io::ErrorKind::InvalidData,
-            "File too short to contain filename",
+            "Filename is empty or contains only padding",
         ));
     }
-    let filename_bytes = &buffer[offset..offset + filename_len];
+
+    let filename_bytes = &filename_range[..filename_len];
     let filename = String::from_utf8(filename_bytes.to_vec()).map_err(|e| {
         io::Error::new(
             io::ErrorKind::InvalidData,
             format!("Filename is not valid UTF-8: {}", e),
         )
     })?;
-    offset += filename_len;
 
     // Read chunk hashes
     let mut chunk_infos = Vec::new();
     let hash_size = 32;
+    let mut offset = chunk_hash_start_offset;
 
     while offset + hash_size <= buffer.len() {
         // Read hash
@@ -155,3 +191,50 @@ pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)>
 
     Ok((filename, chunk_infos))
 }
+
+/// Reads only the 6-byte bidx header and returns the chunk hash start offset.
+///
+/// # Errors
+///
+/// Fails if the file cannot be opened, holds fewer than 6 bytes, or does not
+/// begin with `BUTCK_INDEX_MAGIC` (`InvalidData`, "Invalid magic number").
+pub fn get_chunk_hash_start_offset(index_path: &Path) -> io::Result<u16> {
+    use std::io::Read;
+
+    // Header layout: magic (4 bytes) followed by the offset (u16, little-endian).
+    let mut head = [0u8; 6];
+    std::fs::File::open(index_path)?.read_exact(&mut head)?;
+    let (magic, offset_bytes) = head.split_at(4);
+    if magic != BUTCK_INDEX_MAGIC {
+        return Err(io::Error::new(
+            io::ErrorKind::InvalidData,
+            "Invalid magic number",
+        ));
+    }
+    Ok(u16::from_le_bytes([offset_bytes[0], offset_bytes[1]]))
+}
+
+/// Reads the hash of the chunk at `chunk_index` from a bidx file as a hex string.
+///
+/// The file is opened exactly once: the 6-byte header (magic + chunk hash start
+/// offset) is validated on the same handle that is then used to seek to the
+/// requested hash, so the header and the hash always come from the same file.
+///
+/// # Errors
+///
+/// - `InvalidData` ("Invalid magic number") if the file does not start with
+///   `BUTCK_INDEX_MAGIC`.
+/// - `InvalidInput` if the byte position of the requested hash does not fit
+///   in a `u64`.
+/// - Any I/O error from opening, seeking or reading, e.g. `UnexpectedEof` when
+///   the file holds no complete 32-byte hash at the computed position.
+pub fn read_chunk_hash_by_index(index_path: &Path, chunk_index: usize) -> io::Result<String> {
+    use std::io::{Read, Seek, SeekFrom};
+
+    let mut file = std::fs::File::open(index_path)?;
+
+    // Header: magic(4) + chunk hash start offset (u16, little-endian)
+    let mut header = [0u8; 6];
+    file.read_exact(&mut header)?;
+
+    if &header[0..4] != BUTCK_INDEX_MAGIC {
+        return Err(io::Error::new(
+            io::ErrorKind::InvalidData,
+            "Invalid magic number",
+        ));
+    }
+
+    let chunk_hash_start_offset = u64::from(u16::from_le_bytes([header[4], header[5]]));
+
+    // Hashes are fixed-size and stored back to back. Checked arithmetic keeps a
+    // huge index from wrapping around and silently reading the wrong hash.
+    let hash_size = 32u64;
+    let position = (chunk_index as u64)
+        .checked_mul(hash_size)
+        .and_then(|relative| relative.checked_add(chunk_hash_start_offset))
+        .ok_or_else(|| {
+            io::Error::new(
+                io::ErrorKind::InvalidInput,
+                format!("Chunk index {} is out of range", chunk_index),
+            )
+        })?;
+
+    // Seek to the requested hash and read it
+    file.seek(SeekFrom::Start(position))?;
+    let mut hash_bytes = [0u8; 32];
+    file.read_exact(&mut hash_bytes)?;
+
+    // Convert to hex string
+    Ok(hex::encode(hash_bytes))
+}
author	魏曹先生 <1992414357@qq.com>	2026-03-15 13:41:39 +0800
committer	魏曹先生 <1992414357@qq.com>	2026-03-15 13:41:39 +0800
commit	bfe99f8f08d35d0fbecb05ad4722fb279cb8cfc0 (patch)
tree	4fea17c8d11d49fc1a2066b29b641604d6e2b624
parent	38066205582b25b0f2dfeb1786a2d9e428e8dae0 (diff)