1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
|
use crate::{
chunker::{constants::BUTCK_METADATA_DIR_NAME, rw::storage::hash::ChunkWriteHash},
special_argument, special_flag,
storage::{ButckRWError, build, write},
utils::file_input_solve::parse_path_input,
};
use log::{error, warn};
use std::{collections::HashMap, env::current_dir, path::PathBuf, process::exit, str::FromStr};
pub mod ffi;
/// Entry point for creating pre-configured [`ButckContext`] values.
pub struct Butck;

impl Butck {
    /// Creates a context in write mode.
    ///
    /// `files` become the context's input files and `storage` its storage
    /// path. Nothing is executed until [`ButckContext::exec`] is awaited.
    pub fn write(files: Vec<PathBuf>, storage: PathBuf) -> ButckContext {
        let context = ButckContext::default().with_write_mode();
        context.with_storage_path(storage).with_file_paths(files)
    }

    /// Creates a context in build mode.
    ///
    /// `index_files` become the context's input files and `storage` its
    /// storage path. Nothing is executed until [`ButckContext::exec`] is
    /// awaited.
    pub fn build(index_files: Vec<PathBuf>, storage: PathBuf) -> ButckContext {
        let context = ButckContext::default().with_build_mode();
        context
            .with_storage_path(storage)
            .with_file_paths(index_files)
    }
}
/// Operation a [`ButckContext`] performs when its `exec` method is awaited.
#[derive(Debug, Default)]
enum ButckMethod {
/// No operation selected; `exec` returns `Ok(())` without doing any work.
#[default]
None,
/// Run the `write` function imported from `crate::storage`.
Write,
/// Run the `build` function imported from `crate::storage`.
Build,
}
/// Configuration for a single Butck run (write or build).
///
/// A context can be created with `Butck::write` / `Butck::build`, assembled
/// with the `with_*` builder methods, or parsed from command-line style
/// arguments with `ButckContext::from_args`.
#[derive(Debug, Default)]
pub struct ButckContext {
// Operation selected for `exec`. The default (`ButckMethod::None`) makes
// `exec` return `Ok(())` without doing anything.
method: ButckMethod,
/// All input files.
///
/// They will be processed by the build / write operation:
/// - the build operation expects a collection of `*.bidx` files;
/// - the write operation expects a collection of file paths without directories.
pub file_paths: Vec<PathBuf>,
/// Storage repository path.
///
/// It specifies the storage location for file chunks:
/// - for the build operation, chunks are read from this directory;
/// - for the write operation, chunks are output to this directory.
///
/// If set to `None`, the program cannot execute successfully.
pub storage_path: Option<PathBuf>,
/// Display chunk boundaries.
///
/// If set to `true`, no chunking or building logic is executed;
/// only boundary information is output to stdio.
pub display_boundaries: bool,
/// Stream reading.
///
/// If set to `Some(size)`, data is loaded `size` at a time and processed
/// with a streaming strategy.
// NOTE(review): the unit of `size` is not visible in this file — confirm.
pub stream_read: Option<u32>,
/// Memory map reading.
///
/// If enabled, Memmap is used to map files into memory for access.
pub memmap_read: bool,
/// Registration name.
///
/// When set to `Some(name)`, after the write operation completes the
/// corresponding bidx file is registered in the storage directory's registry.
///
/// In the build phase, the registration name can be used directly to create files.
pub register_name: Option<String>,
/// Chunking policy.
///
/// For the command line program, `butck lspolicy-all` can be used to list policies.
/// It specifies the concrete chunking policy; a reasonable policy setting
/// can significantly improve the file deduplication rate.
///
/// If set to `None`, the program cannot execute successfully.
pub policy_name: Option<String>,
/// Chunk hash.
///
/// Selects the hash representation stored for chunks.
/// Currently supports Blake3 and SHA256.
pub chunk_hash: ChunkWriteHash,
/// Output path.
///
/// For the build operation, built files are output here.
/// For the write operation, generated `*.bidx` files are output here.
///
/// When parsed by `from_args`, this is the `-o` / `--output-dir` value if
/// that path exists, otherwise the resolved storage path, otherwise the
/// current working directory. A context created with `Default::default()`
/// holds an empty path until `with_output_dir` is called.
pub output_dir: PathBuf,
/// Output file.
///
/// Precisely specifies the output file name.
///
/// If the number of input files is greater than 1, the program cannot
/// execute successfully.
pub output_file: Option<PathBuf>,
/// Parameters.
///
/// Provides concrete parameters for the policy, as key/value strings.
/// Filled by `param` and by the `+p` / `+param` arguments in `from_args`.
pub params: HashMap<String, String>,
}
/// Builder-style configuration and execution.
///
/// Every `with_*` method consumes the context and returns the updated value,
/// so calls can be chained.
impl ButckContext {
    /// Selects the build operation for [`ButckContext::exec`].
    pub fn with_build_mode(self) -> Self {
        Self {
            method: ButckMethod::Build,
            ..self
        }
    }

    /// Selects the write operation for [`ButckContext::exec`].
    pub fn with_write_mode(self) -> Self {
        Self {
            method: ButckMethod::Write,
            ..self
        }
    }

    /// Sets the storage repository path.
    pub fn with_storage_path(self, path: PathBuf) -> Self {
        Self {
            storage_path: Some(path),
            ..self
        }
    }

    /// Sets whether only chunk boundaries should be displayed.
    pub fn with_display_boundaries(self, display: bool) -> Self {
        Self {
            display_boundaries: display,
            ..self
        }
    }

    /// Sets the stream-read size; `None` disables stream reading.
    pub fn with_stream_read(self, size: Option<u32>) -> Self {
        Self {
            stream_read: size,
            ..self
        }
    }

    /// Sets whether files are read through a memory map.
    pub fn with_memmap_read(self, use_memmap: bool) -> Self {
        Self {
            memmap_read: use_memmap,
            ..self
        }
    }

    /// Sets the registration name; `None` clears it.
    pub fn with_register_name(self, name: Option<String>) -> Self {
        Self {
            register_name: name,
            ..self
        }
    }

    /// Sets the chunking policy name; `None` clears it.
    pub fn with_policy_name(self, name: Option<String>) -> Self {
        Self {
            policy_name: name,
            ..self
        }
    }

    /// Sets the hash used for chunks.
    pub fn with_chunk_hash(self, hash: ChunkWriteHash) -> Self {
        Self {
            chunk_hash: hash,
            ..self
        }
    }

    /// Sets the output directory.
    pub fn with_output_dir(self, dir: PathBuf) -> Self {
        Self {
            output_dir: dir,
            ..self
        }
    }

    /// Sets the exact output file; `None` clears it.
    pub fn with_output_file(self, file: Option<PathBuf>) -> Self {
        Self {
            output_file: file,
            ..self
        }
    }

    /// Adds one policy parameter, replacing any previous value stored for `key`.
    pub fn param(mut self, key: String, value: String) -> Self {
        let params = &mut self.params;
        params.insert(key, value);
        self
    }

    /// Replaces the whole list of input files.
    pub fn with_file_paths(self, paths: Vec<PathBuf>) -> Self {
        Self {
            file_paths: paths,
            ..self
        }
    }

    /// Appends a single input file to the list.
    pub fn add_file(mut self, path: PathBuf) -> Self {
        let inputs = &mut self.file_paths;
        inputs.push(path);
        self
    }

    /// Runs the selected operation, consuming the context.
    ///
    /// Write mode hands the context to `write`, build mode hands it to
    /// `build`. If no mode was selected, nothing is done.
    ///
    /// # Errors
    ///
    /// Propagates the error of the operation that was run.
    pub async fn exec(self) -> Result<(), ButckRWError> {
        match self.method {
            ButckMethod::Write => {
                write(self).await?;
            }
            ButckMethod::Build => {
                build(self).await?;
            }
            // No mode selected: nothing to do.
            ButckMethod::None => {}
        }
        Ok(())
    }
}
impl ButckContext {
/// Apply the args of ChunkerContext to itself
pub fn from_args(mut args: Vec<String>) -> Self {
let mut ctx = ButckContext::default();
let recursive = ctx.read_recursive(&mut args);
ctx.apply_stream_read(&mut args);
ctx.apply_memmap_read(&mut args);
ctx.apply_register_name(&mut args);
ctx.apply_policy_name(&mut args);
ctx.apply_chunk_hash(&mut args);
ctx.apply_storage_dir(&mut args);
ctx.apply_output_paths(&mut args);
ctx.apply_params(&mut args);
ctx.apply_display_boundaries(&mut args);
// Finally, parse path input
args.retain(|arg| !arg.starts_with("--") && !arg.starts_with('-'));
ctx.file_paths = parse_path_input(args, recursive, vec![BUTCK_METADATA_DIR_NAME]);
ctx
}
/// Evaluates the `-r` / `--recursive` flag against `args` and returns the result.
///
/// The value is not stored on `self`; `from_args` forwards it to
/// `parse_path_input`.
// NOTE(review): presumably `special_flag!` yields `true` when the flag is
// present and removes it from `args` — confirm against the macro definition.
fn read_recursive(&mut self, args: &mut Vec<String>) -> bool {
special_flag!(args, "-r", "--recursive")
}
/// Reads the `-S` / `--stream` argument and stores it as the stream-read size.
///
/// `stream_read` is only updated when the argument is present and parses as
/// a `u32`; a missing or unparsable value leaves the field untouched.
fn apply_stream_read(&mut self, args: &mut Vec<String>) {
    let requested = special_argument!(args, "-S", "--stream");
    let parsed = requested.and_then(|raw| raw.parse::<u32>().ok());
    if let Some(size) = parsed {
        self.stream_read = Some(size);
    }
}
/// Sets `memmap_read` from the `-m` / `--memmap-read` flag.
///
/// The field is always overwritten with the flag's value.
fn apply_memmap_read(&mut self, args: &mut Vec<String>) {
    let enabled = special_flag!(args, "-m", "--memmap-read");
    self.memmap_read = enabled;
}
/// Sets `register_name` from the `-R` / `--register` argument.
///
/// The field is always overwritten with the argument's value.
fn apply_register_name(&mut self, args: &mut Vec<String>) {
    let name = special_argument!(args, "-R", "--register");
    self.register_name = name;
}
/// Sets `policy_name` from the `-p` / `--policy` argument.
///
/// The field is always overwritten with the argument's value.
fn apply_policy_name(&mut self, args: &mut Vec<String>) {
    let policy = special_argument!(args, "-p", "--policy");
    self.policy_name = policy;
}
fn apply_chunk_hash(&mut self, args: &mut Vec<String>) {
let chunk_hash_str = special_argument!(args, "-H", "--chunk-hash");
self.chunk_hash = match chunk_hash_str {
Some(ref s) => match s.as_str() {
"blake3" => ChunkWriteHash::Blake3,
"sha256" => ChunkWriteHash::Sha256,
_ => ChunkWriteHash::default(),
},
None => ChunkWriteHash::default(),
};
}
/// Sets `output_dir` and `output_file` from the `-o` / `--output-dir` and
/// `-O` / `--output-file` arguments.
///
/// The output directory is chosen in this order:
/// 1. the `-o` / `--output-dir` value, if that path exists;
/// 2. the storage path already stored on `self` (so `apply_storage_dir`
///    must run first);
/// 3. the current working directory.
///
/// The working directory is only queried when it is actually needed. If it
/// cannot be determined, the relative path `.` is used instead of
/// panicking, so later I/O on the output directory reports the problem.
fn apply_output_paths(&mut self, args: &mut Vec<String>) {
    let output_dir_arg = special_argument!(args, "-o", "--output-dir");
    let output_file_arg = special_argument!(args, "-O", "--output-file");

    // An output directory that does not exist is ignored in favour of the fallbacks.
    let explicit_dir = output_dir_arg
        .map(PathBuf::from)
        .filter(|dir| dir.exists());

    self.output_dir = match (explicit_dir, &self.storage_path) {
        (Some(dir), _) => dir,
        (None, Some(storage)) => storage.clone(),
        (None, None) => current_dir().unwrap_or_else(|_| PathBuf::from(".")),
    };
    self.output_file = output_file_arg.map(PathBuf::from);
}
/// Collects every `+p` / `+param` argument of the form `key=value` into `params`.
///
/// The argument is split at the first `=` only, so the value itself may
/// contain `=` (for example `+p expr=a=b` stores `expr` -> `a=b`).
/// Arguments without any `=` are ignored. A later occurrence of a key
/// replaces the earlier value.
fn apply_params(&mut self, args: &mut Vec<String>) {
    while let Some(arg) = special_argument!(args, "+p", "+param") {
        if let Some((key, value)) = arg.split_once('=') {
            self.params.insert(key.to_string(), value.to_string());
        }
    }
}
/// Resolves `storage_path` from the `-s` / `--storage` argument.
///
/// When the argument is given, `init_butck_storage` is called on that path
/// first (its return value is not used). The stored path is then whatever
/// `find_butck_storage_dir` finds, starting from the given path or, without
/// the argument, from the current working directory.
fn apply_storage_dir(&mut self, args: &mut Vec<String>) {
    let raw = special_argument!(args, "-s", "--storage");
    // Converting a string into a `PathBuf` cannot fail, so `ok()` only
    // adapts the `Result` returned by `from_str`.
    let requested = raw.and_then(|text| PathBuf::from_str(text.as_str()).ok());
    if let Some(dir) = &requested {
        Self::init_butck_storage(dir.clone());
    }
    self.storage_path = Self::find_butck_storage_dir(requested);
}
/// Sets `display_boundaries` from the `-D` / `--display-boundaries` flag.
///
/// The field is always overwritten with the flag's value.
fn apply_display_boundaries(&mut self, args: &mut Vec<String>) {
    let enabled = special_flag!(args, "-D", "--display-boundaries");
    self.display_boundaries = enabled;
}
/// Makes sure `path` is a Butck storage, i.e. contains the metadata directory.
///
/// - If `path` does not exist, it is created together with the metadata
///   directory.
/// - If `path` exists and already holds the metadata directory, nothing is
///   changed.
/// - Otherwise the metadata directory is created inside `path`. A warning is
///   logged first when `path` is not empty, or when its contents cannot be
///   listed.
///
/// Whenever this function returns, it returns `Some(path)`. If a directory
/// cannot be created, the error is logged and the process exits with
/// status 1.
fn init_butck_storage(path: PathBuf) -> Option<PathBuf> {
    let metadata_dir = path.join(BUTCK_METADATA_DIR_NAME);

    if path.exists() {
        if metadata_dir.exists() {
            // Already initialised.
            return Some(path);
        }
        // A directory that cannot be listed is treated as non-empty.
        let has_entries = match path.read_dir() {
            Ok(mut entries) => entries.next().is_some(),
            Err(_) => true,
        };
        if has_entries {
            warn!(
                "Creating '{}' storage in non-empty directory: {}",
                BUTCK_METADATA_DIR_NAME,
                path.display()
            );
        }
    } else if let Err(e) = std::fs::create_dir_all(&path) {
        error!("Failed to create directory '{}': {}", path.display(), e);
        exit(1);
    }

    // Both remaining cases end by creating the metadata directory.
    if let Err(e) = std::fs::create_dir_all(&metadata_dir) {
        error!(
            "Failed to create '{}' directory: {}",
            BUTCK_METADATA_DIR_NAME, e
        );
        exit(1);
    }
    Some(path)
}
/// Finds the Butck storage directory that contains `from`.
///
/// Starting at `from` (or at the current working directory when `from` is
/// `None`), each ancestor is checked in turn, and the first one that
/// contains the metadata directory is returned.
///
/// Returns `None` when no ancestor qualifies, or when `from` is `None` and
/// the current working directory cannot be determined.
fn find_butck_storage_dir(from: Option<PathBuf>) -> Option<PathBuf> {
    let start = from.or_else(|| std::env::current_dir().ok())?;
    // `ancestors` yields `start` itself first, then each successive parent.
    start
        .ancestors()
        .find(|candidate| candidate.join(BUTCK_METADATA_DIR_NAME).is_dir())
        .map(|found| found.to_path_buf())
}
}
|