summaryrefslogtreecommitdiff
path: root/src/chunker/rw/storage/bidx.rs
blob: 783ded6267ec484dde91e53235cc498dec302608 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
//! Bidx (Butchunker Index) file format utilities
//!
//! The bidx file format:
//! - Magic number: [u8; 4] = b"G00d"
//! - Original filename length: u16 (little-endian)
//! - Original filename: [u8] (UTF-8, no null terminator)
//! - Chunk hashes: [u8; 32][u8; 32][u8; 32]... (binary hashes, not hex strings)

use std::io::{self, Write};
use std::path::Path;

use crate::chunker::constants::BUTCK_INDEX_MAGIC;
use crate::chunker::rw::storage::ChunkInfo;

/// Write a bidx index file
///
/// The complete file image (magic bytes, little-endian `u16` filename length, filename
/// bytes, then one 32-byte binary hash per chunk) is validated and assembled in memory
/// before `index_path` is created. Invalid input therefore never truncates an existing
/// index or leaves a partially written one behind.
///
/// The stored filename is the final component of `original_file_path`. If that component
/// is missing or is not valid UTF-8, the literal `"unknown"` is stored instead.
///
/// # Errors
///
/// - [`io::ErrorKind::InvalidInput`] if the filename is longer than `u16::MAX` bytes.
/// - [`io::ErrorKind::InvalidData`] if a chunk hash is not valid hex, or does not decode
///   to exactly 32 bytes.
/// - Any I/O error raised while creating or writing `index_path`.
pub fn write_bidx_file(
    index_path: &Path,
    chunk_infos: &[ChunkInfo],
    original_file_path: &Path,
) -> io::Result<()> {
    /// Size in bytes of one binary chunk hash in the index.
    const HASH_SIZE: usize = 32;
    /// Size in bytes of the filename length field.
    const NAME_LEN_FIELD: usize = 2;

    // Get original filename (best effort: fall back to a placeholder).
    let filename = original_file_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("unknown");
    let filename_bytes = filename.as_bytes();

    // Validate filename length: it must fit in the u16 length field.
    if filename_bytes.len() > u16::MAX as usize {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            format!("Filename too long: {} bytes", filename_bytes.len()),
        ));
    }
    // Lossless: the length was checked against u16::MAX just above.
    let filename_len = filename_bytes.len() as u16;

    // Build the whole file in memory so that every validation error is reported
    // before the destination file is touched.
    let mut payload: Vec<u8> = Vec::with_capacity(
        BUTCK_INDEX_MAGIC.len()
            + NAME_LEN_FIELD
            + filename_bytes.len()
            + chunk_infos.len() * HASH_SIZE,
    );

    // Magic bytes
    payload.extend_from_slice(&BUTCK_INDEX_MAGIC);

    // Filename length (little-endian u16) followed by the filename bytes
    payload.extend_from_slice(&filename_len.to_le_bytes());
    payload.extend_from_slice(filename_bytes);

    // Chunk hashes
    for chunk_info in chunk_infos {
        // Convert hex hash to its binary representation
        let hash_bytes = hex::decode(&chunk_info.hash).map_err(|e| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Failed to decode hash hex '{}': {}", chunk_info.hash, e),
            )
        })?;

        // Ensure hash is exactly 32 bytes; the reader relies on fixed-size records.
        if hash_bytes.len() != HASH_SIZE {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Hash must be 32 bytes, got {} bytes", hash_bytes.len()),
            ));
        }

        payload.extend_from_slice(&hash_bytes);
    }

    // Only now create (and truncate) the destination and write it in a single call.
    let mut file = std::fs::File::create(index_path)?;
    file.write_all(&payload)?;
    file.flush()?;
    Ok(())
}

/// Read a bidx index file
///
/// Loads the whole file into memory and parses, in order: the 4-byte magic, a
/// little-endian `u16` filename length, the UTF-8 filename, and then a run of 32-byte
/// binary hashes extending to the end of the file.
///
/// Returns the stored filename together with one `ChunkInfo` per hash. Each entry's
/// `index` is its position in the file and its `hash` is the hex encoding of the
/// 32 bytes.
///
/// # Errors
///
/// Returns any I/O error from reading the file, or [`io::ErrorKind::InvalidData`] when
/// the file is truncated, the magic does not match, the filename is not valid UTF-8, or
/// the hash region is not a whole multiple of 32 bytes.
pub fn read_bidx_file(index_path: &Path) -> io::Result<(String, Vec<ChunkInfo>)> {
    const MAGIC_LEN: usize = 4;
    const NAME_LEN_FIELD: usize = 2;
    const HASH_LEN: usize = 32;

    /// Build the `InvalidData` error used for every format violation.
    fn invalid(message: &str) -> io::Error {
        io::Error::new(io::ErrorKind::InvalidData, message)
    }

    let data = std::fs::read(index_path)?;

    // Magic number
    if data.len() < MAGIC_LEN {
        return Err(invalid("File too short to contain magic number"));
    }
    let (magic, body) = data.split_at(MAGIC_LEN);
    if magic != BUTCK_INDEX_MAGIC {
        return Err(invalid("Invalid magic number"));
    }

    // Filename length field
    if body.len() < NAME_LEN_FIELD {
        return Err(invalid("File too short to contain filename length"));
    }
    let (len_field, body) = body.split_at(NAME_LEN_FIELD);
    let name_len = usize::from(u16::from_le_bytes([len_field[0], len_field[1]]));

    // Filename
    if body.len() < name_len {
        return Err(invalid("File too short to contain filename"));
    }
    let (name_bytes, hash_region) = body.split_at(name_len);
    let filename = match String::from_utf8(name_bytes.to_vec()) {
        Ok(name) => name,
        Err(e) => {
            return Err(invalid(&format!("Filename is not valid UTF-8: {}", e)));
        }
    };

    // Everything after the filename must be whole 32-byte hash records.
    let records = hash_region.chunks_exact(HASH_LEN);
    let trailing = records.remainder().len();
    if trailing != 0 {
        return Err(invalid(&format!(
            "File contains {} extra bytes after chunk hashes",
            trailing
        )));
    }

    let chunk_infos: Vec<ChunkInfo> = records
        .enumerate()
        .map(|(index, raw)| ChunkInfo {
            index,
            hash: hex::encode(raw),
        })
        .collect();

    Ok((filename, chunk_infos))
}