summaryrefslogtreecommitdiff
path: root/src/chunker/rw/storage/bidx.rs
blob: 013fbad4a32d97a33e6b5044b8b19ab81b29a305 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
//! Bidx (Butchunker Index) file format utilities
//!
//! The bidx file format:
//! - Magic number: [u8; 4] = b"G00d"
//! - Chunk hash start offset: u16 (little-endian, offset from file start to first chunk hash)
//! - Original filename: [u8] (UTF-8, no null terminator, variable length)
//! - Padding bytes: [u8] (0-15 bytes to ensure chunk hash start is 16-byte aligned)
//! - Chunk hashes: [u8; 32][u8; 32][u8; 32]... (binary hashes, not hex strings)
//!
//! The chunk hash start offset allows quick access to chunk hashes regardless of filename length.
//! It points to the first chunk hash, which is aligned to a 16-byte boundary.

use std::io::{self, Write};
use std::path::Path;

use crate::chunker::constants::BUTCK_INDEX_MAGIC;
use crate::chunker::rw::storage::ChunkInfo;

/// Write a bidx index file.
///
/// Layout written, in order: the magic bytes, a little-endian `u16` offset to
/// the first chunk hash, the original file name (UTF-8, no terminator), zero
/// padding up to the next 16-byte boundary, then one 32-byte binary hash per
/// entry of `chunk_infos`.
///
/// The stored file name is the last component of `original_file_path`; when
/// that component is missing or is not valid UTF-8 the literal `"unknown"` is
/// stored instead.
///
/// # Errors
///
/// * `io::ErrorKind::InvalidData` if a chunk hash is not valid hex or does not
///   decode to exactly 32 bytes.
/// * `io::ErrorKind::InvalidInput` if the file name is so long that the chunk
///   hash offset no longer fits in the header's `u16` field.
/// * Any I/O error raised while creating or writing `index_path`.
///
/// Both validation steps run before `index_path` is created, so rejected input
/// never leaves a partially written index file behind.
pub fn write_bidx_file(
    index_path: &Path,
    chunk_infos: &[ChunkInfo],
    original_file_path: &Path,
) -> io::Result<()> {
    // magic(4) + offset(2)
    const HEADER_LEN: usize = 4 + 2;
    const ALIGNMENT: usize = 16;
    const HASH_LEN: usize = 32;

    // Get original filename
    let filename = original_file_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("unknown");
    let filename_bytes = filename.as_bytes();

    // Do the layout arithmetic in `usize` so an over-long name is detected
    // instead of being truncated or wrapping around in `u16`.
    let unpadded_len = HEADER_LEN + filename_bytes.len();
    let padding_needed = (ALIGNMENT - unpadded_len % ALIGNMENT) % ALIGNMENT;
    let aligned_len = unpadded_len + padding_needed;
    if aligned_len > usize::from(u16::MAX) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            format!(
                "Filename of {} bytes is too long: chunk hash offset {} does not fit in u16",
                filename_bytes.len(),
                aligned_len
            ),
        ));
    }
    // Lossless: range-checked against `u16::MAX` just above.
    let chunk_hash_start_offset = aligned_len as u16;

    // Decode and validate every hash up front; nothing touches the disk until
    // the whole input is known to be well-formed.
    let mut hash_block: Vec<u8> = Vec::with_capacity(chunk_infos.len() * HASH_LEN);
    for chunk_info in chunk_infos {
        // Convert hex hash to 32-byte binary representation
        let hash_bytes = hex::decode(&chunk_info.hash).map_err(|e| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Failed to decode hash hex '{}': {}", chunk_info.hash, e),
            )
        })?;

        // Ensure hash is exactly 32 bytes
        if hash_bytes.len() != HASH_LEN {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Hash must be 32 bytes, got {} bytes", hash_bytes.len()),
            ));
        }

        hash_block.extend_from_slice(&hash_bytes);
    }

    let file = std::fs::File::create(index_path)?;
    let mut writer = io::BufWriter::new(file);

    // Magic bytes
    writer.write_all(&BUTCK_INDEX_MAGIC)?;

    // Chunk hash start offset as u16 (little-endian)
    writer.write_all(&chunk_hash_start_offset.to_le_bytes())?;

    // Filename bytes, then zero padding up to the 16-byte boundary
    // (`padding_needed` is always < ALIGNMENT, so the slice is in range).
    writer.write_all(filename_bytes)?;
    writer.write_all(&[0u8; ALIGNMENT][..padding_needed])?;

    // Chunk hashes, already in binary form
    writer.write_all(&hash_block)?;

    // Flush explicitly: dropping a BufWriter would swallow a late write error.
    writer.flush()
}

/// Read a bidx index file.
///
/// Loads the whole file, validates the header, and returns the stored original
/// file name together with one `ChunkInfo` per 32-byte hash found after the
/// chunk hash start offset. Each `ChunkInfo::index` is the hash's position in
/// the file (starting at 0) and `ChunkInfo::hash` is its lowercase-hex form as
/// produced by `hex::encode`.
///
/// # Errors
///
/// Returns `io::ErrorKind::InvalidData` when the file is shorter than the
/// 6-byte header, the magic number does not match, the stored offset lies
/// outside the file or leaves no room for a file name, the file name is empty
/// or not valid UTF-8, or the hash area is not a whole multiple of 32 bytes.
/// I/O errors from reading the file are propagated unchanged.
pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)> {
    /// Build the `InvalidData` error used for every format violation.
    fn invalid<M: Into<String>>(message: M) -> io::Error {
        io::Error::new(io::ErrorKind::InvalidData, message.into())
    }

    // magic(4) + offset(2); the file name begins right after.
    const NAME_START: usize = 6;
    const HASH_SIZE: usize = 32;

    let data = std::fs::read(index_path)?;

    if data.len() < NAME_START {
        return Err(invalid("File too short to contain magic number and offset"));
    }

    if &data[0..4] != BUTCK_INDEX_MAGIC {
        return Err(invalid("Invalid magic number"));
    }

    // Little-endian u16 right after the magic: where the hashes begin.
    let hashes_start = usize::from(u16::from_le_bytes([data[4], data[5]]));

    if hashes_start > data.len() {
        return Err(invalid(format!(
            "Chunk hash start offset {} exceeds file size {}",
            hashes_start,
            data.len()
        )));
    }

    if NAME_START >= hashes_start {
        return Err(invalid(format!(
            "Invalid filename range: start={}, end={}",
            NAME_START, hashes_start
        )));
    }

    // The name region is the file name followed by zero padding; the name ends
    // at the last non-zero byte.
    let name_region = &data[NAME_START..hashes_start];
    let name_len = match name_region.iter().rposition(|&byte| byte != 0) {
        Some(last) => last + 1,
        None => return Err(invalid("Filename is empty or contains only padding")),
    };

    let filename = match String::from_utf8(name_region[..name_len].to_vec()) {
        Ok(name) => name,
        Err(e) => return Err(invalid(format!("Filename is not valid UTF-8: {}", e))),
    };

    // Everything from the offset onward must be back-to-back 32-byte hashes.
    let hashes = data[hashes_start..].chunks_exact(HASH_SIZE);
    let leftover = hashes.remainder().len();

    let chunk_infos: Vec<ChunkInfo> = hashes
        .enumerate()
        .map(|(index, raw)| ChunkInfo {
            index,
            hash: hex::encode(raw),
        })
        .collect();

    // A trailing partial hash means the file is truncated or corrupt.
    if leftover != 0 {
        return Err(invalid(format!(
            "File contains {} extra bytes after chunk hashes",
            leftover
        )));
    }

    Ok((filename, chunk_infos))
}

/// Get the chunk hash start offset from a bidx file without reading all data.
///
/// Only the fixed 6-byte header (4 magic bytes followed by a little-endian
/// `u16`) is read from `index_path`.
///
/// # Errors
///
/// Returns `io::ErrorKind::InvalidData` if the magic number does not match.
/// Errors from opening the file or from `read_exact` (for example a file
/// shorter than the header) are propagated unchanged.
pub fn get_chunk_hash_start_offset(index_path: &Path) -> io::Result<u16> {
    use std::io::Read;

    let mut header = [0u8; 6];
    let mut index_file = std::fs::File::open(index_path)?;
    index_file.read_exact(&mut header)?;

    // Split the header into its two fields: magic (4 bytes) | offset (2 bytes).
    let (magic, offset_bytes) = header.split_at(4);

    if magic != BUTCK_INDEX_MAGIC {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "Invalid magic number",
        ));
    }

    Ok(u16::from_le_bytes([offset_bytes[0], offset_bytes[1]]))
}

/// Read a specific chunk hash by index.
///
/// Looks up the chunk hash start offset from the file header, seeks to
/// `offset + chunk_index * 32`, reads 32 bytes and returns them hex-encoded.
///
/// # Errors
///
/// * `io::ErrorKind::InvalidInput` if `chunk_index` is so large that the byte
///   position cannot be represented in a `u64`.
/// * Whatever `get_chunk_hash_start_offset` reports for a bad or unreadable
///   header.
/// * The error from `read_exact` if fewer than 32 bytes are available at the
///   computed position (i.e. `chunk_index` is past the last stored hash).
pub fn read_chunk_hash_by_index(index_path: &Path, chunk_index: usize) -> io::Result<String> {
    use std::io::{Read, Seek, SeekFrom};

    const HASH_SIZE: u64 = 32;

    // Validate the header first; the data handle below is opened only after
    // this lookup has finished, so two handles are never held at once.
    let chunk_hash_start_offset = u64::from(get_chunk_hash_start_offset(index_path)?);

    // Checked arithmetic: an absurdly large index must be rejected rather than
    // wrap around and silently address a different hash.
    // `usize` is at most 64 bits wide, so the cast to `u64` is lossless.
    let position = (chunk_index as u64)
        .checked_mul(HASH_SIZE)
        .and_then(|relative| relative.checked_add(chunk_hash_start_offset))
        .ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                format!("Chunk index {} is out of range", chunk_index),
            )
        })?;

    let mut file = std::fs::File::open(index_path)?;

    // Seek to the requested hash and read exactly one of them.
    file.seek(SeekFrom::Start(position))?;
    let mut hash_bytes = [0u8; 32];
    file.read_exact(&mut hash_bytes)?;

    // Convert to hex string
    Ok(hex::encode(hash_bytes))
}