summaryrefslogtreecommitdiff
path: root/src/chunker/rw/storage
diff options
context:
space:
mode:
author魏曹先生 <1992414357@qq.com>2026-03-15 13:41:39 +0800
committer魏曹先生 <1992414357@qq.com>2026-03-15 13:41:39 +0800
commitbfe99f8f08d35d0fbecb05ad4722fb279cb8cfc0 (patch)
tree4fea17c8d11d49fc1a2066b29b641604d6e2b624 /src/chunker/rw/storage
parent38066205582b25b0f2dfeb1786a2d9e428e8dae0 (diff)
Update bidx format to align chunk hashes and add hash constants
Diffstat (limited to 'src/chunker/rw/storage')
-rw-r--r--src/chunker/rw/storage/bidx.rs133
1 files changed, 108 insertions, 25 deletions
diff --git a/src/chunker/rw/storage/bidx.rs b/src/chunker/rw/storage/bidx.rs
index 783ded6..013fbad 100644
--- a/src/chunker/rw/storage/bidx.rs
+++ b/src/chunker/rw/storage/bidx.rs
@@ -2,9 +2,13 @@
//!
//! The bidx file format:
//! - Magic number: [u8; 4] = b"G00d"
-//! - Original filename length: u16 (little-endian)
-//! - Original filename: [u8] (UTF-8, no null terminator)
+//! - Chunk hash start offset: u16 (little-endian, offset from file start to first chunk hash)
+//! - Original filename: [u8] (UTF-8, no null terminator, variable length)
+//! - Padding bytes: [u8] (0-15 zero bytes so that the chunk hash start is 16-byte aligned)
//! - Chunk hashes: [u8; 32][u8; 32][u8; 32]... (binary hashes, not hex strings)
+//!
+//! The chunk hash start offset allows quick access to chunk hashes regardless of filename length.
+//! It points to the first chunk hash, which is aligned to a 16-byte boundary.
use std::io::{self, Write};
use std::path::Path;
@@ -31,21 +35,27 @@ pub fn write_bidx_file(
.unwrap_or("unknown");
let filename_bytes = filename.as_bytes();
- // Validate filename length
- if filename_bytes.len() > u16::MAX as usize {
- return Err(io::Error::new(
- io::ErrorKind::InvalidInput,
- format!("Filename too long: {} bytes", filename_bytes.len()),
- ));
- }
+ // Calculate current position after magic(4) + offset(2) + filename
+ let current_pos = 4u16 + 2u16 + filename_bytes.len() as u16;
+
+ // Calculate padding needed to reach next 16-byte boundary
+ let padding_needed = (16 - (current_pos % 16)) % 16;
+
+ // Calculate chunk hash start offset (aligned to 16 bytes)
+ let chunk_hash_start_offset = current_pos + padding_needed as u16;
- // Write filename length as u16
- let filename_len = filename_bytes.len() as u16;
- writer.write_all(&filename_len.to_le_bytes())?;
+ // Write chunk hash start offset as u16 (little-endian)
+ writer.write_all(&chunk_hash_start_offset.to_le_bytes())?;
// Write filename bytes
writer.write_all(filename_bytes)?;
+ // Write padding bytes (zeros) to reach 16-byte alignment
+ if padding_needed > 0 {
+ let padding = vec![0u8; padding_needed as usize];
+ writer.write_all(&padding)?;
+ }
+
// Write chunk hashes
for chunk_info in chunk_infos {
// Convert hex hash to 32-byte binary representation
@@ -83,10 +93,10 @@ pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)>
let mut buffer = Vec::new();
file.read_to_end(&mut buffer)?;
- if buffer.len() < 4 {
+ if buffer.len() < 6 {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
- "File too short to contain magic number",
+ "File too short to contain magic number and offset",
));
}
@@ -98,37 +108,63 @@ pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)>
));
}
- let mut offset = 4;
+ // Read chunk hash start offset
+ let chunk_hash_start_offset = u16::from_le_bytes([buffer[4], buffer[5]]) as usize;
+
+ if chunk_hash_start_offset > buffer.len() {
+ return Err(io::Error::new(
+ io::ErrorKind::InvalidData,
+ format!(
+ "Chunk hash start offset {} exceeds file size {}",
+ chunk_hash_start_offset,
+ buffer.len()
+ ),
+ ));
+ }
+
+ // Read filename (everything from offset 6 to chunk_hash_start_offset, minus padding)
+ let filename_start = 6;
+ let filename_end = chunk_hash_start_offset;
- // Read filename length
- if offset + 2 > buffer.len() {
+ if filename_start >= filename_end {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
- "File too short to contain filename length",
+ format!(
+ "Invalid filename range: start={}, end={}",
+ filename_start, filename_end
+ ),
));
}
- let filename_len = u16::from_le_bytes([buffer[offset], buffer[offset + 1]]) as usize;
- offset += 2;
- // Read filename
- if offset + filename_len > buffer.len() {
+ // Extract filename bytes (skip trailing zeros which are padding)
+ let filename_range = &buffer[filename_start..filename_end];
+
+ // Find the last non-zero byte (end of actual filename)
+ let filename_len = filename_range
+ .iter()
+ .rposition(|&b| b != 0)
+ .map(|pos| pos + 1)
+ .unwrap_or(0);
+
+ if filename_len == 0 {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
- "File too short to contain filename",
+ "Filename is empty or contains only padding",
));
}
- let filename_bytes = &buffer[offset..offset + filename_len];
+
+ let filename_bytes = &filename_range[..filename_len];
let filename = String::from_utf8(filename_bytes.to_vec()).map_err(|e| {
io::Error::new(
io::ErrorKind::InvalidData,
format!("Filename is not valid UTF-8: {}", e),
)
})?;
- offset += filename_len;
// Read chunk hashes
let mut chunk_infos = Vec::new();
let hash_size = 32;
+ let mut offset = chunk_hash_start_offset;
while offset + hash_size <= buffer.len() {
// Read hash
@@ -155,3 +191,50 @@ pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)>
Ok((filename, chunk_infos))
}
+
+/// Return the chunk hash start offset stored in a bidx file's header.
+///
+/// Only the first six bytes of the file are read: the 4-byte magic number
+/// followed by the little-endian `u16` offset.
+///
+/// # Errors
+///
+/// Propagates any error from opening the file or from `read_exact` on the
+/// header, and returns `InvalidData` when the magic number does not match
+/// `BUTCK_INDEX_MAGIC`.
+pub fn get_chunk_hash_start_offset(index_path: &Path) -> io::Result<u16> {
+    use std::io::Read;
+
+    // magic(4) + offset(2)
+    let mut header = [0u8; 6];
+    std::fs::File::open(index_path)?.read_exact(&mut header)?;
+
+    let (magic, offset_bytes) = header.split_at(4);
+
+    // Reject anything that does not start with the bidx magic number
+    if magic != BUTCK_INDEX_MAGIC {
+        let bad_magic = io::Error::new(io::ErrorKind::InvalidData, "Invalid magic number");
+        return Err(bad_magic);
+    }
+
+    // The offset is stored little-endian right after the magic number
+    Ok(u16::from_le_bytes([offset_bytes[0], offset_bytes[1]]))
+}
+
+/// Read a specific chunk hash by index
+///
+/// Opens the bidx file once, reads the 6-byte header (magic number + chunk hash
+/// start offset) through that handle, seeks to
+/// `chunk hash start offset + chunk_index * 32`, and reads one 32-byte hash.
+/// The hash is returned as a hex string.
+///
+/// # Errors
+///
+/// - `InvalidData` if the magic number does not match `BUTCK_INDEX_MAGIC`.
+/// - `InvalidInput` if the byte position of `chunk_index` does not fit in a `u64`.
+/// - Any I/O error from opening, seeking, or `read_exact`; this includes the
+///   case where the file ends before the header or the requested hash is complete.
+pub fn read_chunk_hash_by_index(index_path: &Path, chunk_index: usize) -> io::Result<String> {
+    use std::io::{Read, Seek, SeekFrom};
+
+    const HEADER_SIZE: usize = 6; // magic(4) + offset(2)
+    const HASH_SIZE: usize = 32;
+
+    let mut file = std::fs::File::open(index_path)?;
+
+    // Read the header through the same handle that later reads the hash, so the
+    // offset and the hash always come from the same file and the path is opened
+    // only once.
+    let mut header = [0u8; HEADER_SIZE];
+    file.read_exact(&mut header)?;
+
+    // Check magic number
+    if &header[0..4] != BUTCK_INDEX_MAGIC {
+        return Err(io::Error::new(
+            io::ErrorKind::InvalidData,
+            "Invalid magic number",
+        ));
+    }
+
+    // Chunk hash start offset is a little-endian u16 right after the magic
+    let chunk_hash_start_offset = u64::from(u16::from_le_bytes([header[4], header[5]]));
+
+    // Calculate position of the requested chunk hash with checked arithmetic:
+    // a huge index must become an error instead of a panic or a wrapped offset.
+    // (`usize` -> `u64` is lossless: `usize` is at most 64 bits wide.)
+    let position = (chunk_index as u64)
+        .checked_mul(HASH_SIZE as u64)
+        .and_then(|relative| relative.checked_add(chunk_hash_start_offset))
+        .ok_or_else(|| {
+            io::Error::new(
+                io::ErrorKind::InvalidInput,
+                format!("Chunk index {} is out of range", chunk_index),
+            )
+        })?;
+
+    // Seek to the position
+    file.seek(SeekFrom::Start(position))?;
+
+    // Read the hash
+    let mut hash_bytes = [0u8; HASH_SIZE];
+    file.read_exact(&mut hash_bytes)?;
+
+    // Convert to hex string
+    Ok(hex::encode(hash_bytes))
+}