Add storage system with chunk-based file storage

author: 魏曹先生 <1992414357@qq.com> 2026-02-27 06:17:06 +0800
committer: 魏曹先生 <1992414357@qq.com> 2026-02-27 06:17:06 +0800
commit: 76e78fe53c03c9b4c7fa029709f06ee86ce9c865 (patch)
tree: 4e3778dfb405b2c21b51df24331100b94f5356d9 /systems/storage/src/store/line.rs
parent: 748c8a3353df887ee4b01e0e1327aa95c1c7225a (diff)
1 files changed, 393 insertions, 0 deletions
diff --git a/systems/storage/src/store/line.rs b/systems/storage/src/store/line.rs
new file mode 100644
index 0000000..971018b
--- /dev/null
+++ b/systems/storage/src/store/line.rs
@@ -0,0 +1,393 @@
+use std::path::PathBuf;
+
+use crate::{error::StorageIOError, store::ChunkingResult};
+
+/// Split `data` into one chunk per line, keeping each line's terminator.
+///
+/// A line ends at every `\n` byte. Because a `\r\n` pair also ends on `\n`,
+/// Windows line endings stay attached to the line they terminate without any
+/// dedicated handling. A lone `\r` is not treated as a line break.
+///
+/// * Trailing bytes after the last `\n` form a final, unterminated chunk.
+/// * Empty input produces exactly one empty chunk.
+///
+/// `total_size` in the result is the byte length of `data`.
+///
+/// # Errors
+///
+/// This implementation never fails; it always returns `Ok`.
+pub fn split_by_lines_impl(data: &[u8]) -> Result<ChunkingResult, StorageIOError> {
+    // `split_inclusive` yields each line together with its trailing `\n`,
+    // followed by the unterminated remainder (if any).
+    let mut chunks: Vec<_> = data
+        .split_inclusive(|&byte| byte == b'\n')
+        .map(|line| crate::store::create_chunk(line.to_vec()))
+        .collect();
+
+    // Only empty input yields no lines at all; represent it as one empty chunk.
+    if chunks.is_empty() {
+        chunks.push(crate::store::create_chunk(Vec::new()));
+    }
+
+    Ok(ChunkingResult {
+        chunks,
+        total_size: data.len() as u64,
+    })
+}
+
+/// Write a file through the storage layer using the line-based configuration.
+///
+/// Thin async wrapper: it builds `StorageConfig::line()` and forwards the
+/// three path arguments, unchanged and in the same order, to
+/// `crate::store::write_file`.
+///
+/// Judging by their names, the arguments are the input file, the directory
+/// that receives the stored chunks, and the index file to produce — see
+/// `write_file` for the authoritative meaning.
+///
+/// # Errors
+///
+/// Returns whatever error `write_file` reports.
+pub async fn write_file_line<I: Into<PathBuf>>(
+    file_to_write: I,
+    storage_dir: I,
+    output_index_file: I,
+) -> Result<(), StorageIOError> {
+    let line_config = crate::store::StorageConfig::line();
+
+    crate::store::write_file(file_to_write, storage_dir, output_index_file, &line_config).await
+}
+
+/// Split `data` into line chunks using the line-ending rules of `E`.
+///
+/// The buffer is scanned byte by byte. Wherever `E::is_line_ending` reports a
+/// terminator, the bytes from the start of the current line up to and
+/// including that terminator (`E::ending_length` bytes) become one chunk.
+///
+/// * Trailing bytes after the last terminator form a final chunk.
+/// * Empty input produces exactly one empty chunk.
+///
+/// `total_size` in the result is the byte length of `data`.
+///
+/// # Robustness
+///
+/// `LineEnding` is a public trait, so `E` may be an implementation this
+/// module does not control. The length it reports is clamped to at least one
+/// byte and to the end of `data`, so a faulty implementation can neither
+/// stall the scan nor cause an out-of-bounds slice.
+///
+/// # Errors
+///
+/// This implementation never fails; it always returns `Ok`.
+pub fn split_by_lines_custom<E: LineEnding>(data: &[u8]) -> Result<ChunkingResult, StorageIOError> {
+    let total_size = data.len();
+    let mut chunks = Vec::new();
+    let mut start = 0;
+
+    let mut i = 0;
+    while i < total_size {
+        if !E::is_line_ending(data, i) {
+            i += 1;
+            continue;
+        }
+
+        // Always advance by at least one byte (a reported length of 0 would
+        // loop forever) and never past the buffer (slicing would panic).
+        let ending_len = E::ending_length(data, i).max(1);
+        let line_end = i.saturating_add(ending_len).min(total_size);
+
+        chunks.push(crate::store::create_chunk(data[start..line_end].to_vec()));
+
+        start = line_end;
+        i = line_end;
+    }
+
+    // Remaining data: last line without a terminator.
+    if start < total_size {
+        chunks.push(crate::store::create_chunk(data[start..].to_vec()));
+    }
+
+    // No chunk at all means the input was empty; represent it as one empty chunk.
+    if chunks.is_empty() {
+        chunks.push(crate::store::create_chunk(Vec::new()));
+    }
+
+    Ok(ChunkingResult {
+        chunks,
+        total_size: total_size as u64,
+    })
+}
+
+/// Trait for different line ending types
+///
+/// Implementors describe how a line terminator is recognised inside a byte
+/// buffer. Both functions are associated functions (no `self`), so the
+/// implementing type is only used as a type parameter, e.g.
+/// `split_by_lines_custom::<UnixLineEnding>(data)`.
+pub trait LineEnding {
+    /// Check if position i is the start of a line ending
+    ///
+    /// `data` is the whole buffer being scanned and `i` the byte offset to
+    /// test. The implementations in this file return `false` when `i` is out
+    /// of range instead of panicking.
+    fn is_line_ending(data: &[u8], i: usize) -> bool;
+
+    /// Get the length of the line ending at position i
+    ///
+    /// The result is the number of bytes occupied by the terminator.
+    /// `split_by_lines_custom` only calls this for positions where
+    /// `is_line_ending` returned `true`.
+    fn ending_length(data: &[u8], i: usize) -> usize;
+}
+
+/// Line-ending rule for Unix text: a single `\n` byte terminates a line.
+pub struct UnixLineEnding;
+
+impl LineEnding for UnixLineEnding {
+    /// Returns `true` when the byte at offset `i` exists and is `\n`.
+    fn is_line_ending(data: &[u8], i: usize) -> bool {
+        data.get(i) == Some(&b'\n')
+    }
+
+    /// A Unix terminator is always exactly one byte long.
+    fn ending_length(_data: &[u8], _i: usize) -> usize {
+        1
+    }
+}
+
+/// Line-ending rule for Windows text: the two-byte sequence `\r\n`
+/// terminates a line. A bare `\n` or a bare `\r` is not a terminator.
+pub struct WindowsLineEnding;
+
+impl LineEnding for WindowsLineEnding {
+    /// Returns `true` when the bytes at offsets `i` and `i + 1` both exist
+    /// and are `\r` followed by `\n`.
+    fn is_line_ending(data: &[u8], i: usize) -> bool {
+        matches!(data.get(i..), Some([b'\r', b'\n', ..]))
+    }
+
+    /// A Windows terminator is always exactly two bytes long.
+    fn ending_length(_data: &[u8], _i: usize) -> usize {
+        2
+    }
+}
+
+/// Line-ending rule that accepts both styles: a line is terminated either by
+/// a single `\n` or by the pair `\r\n`.
+pub struct MixedLineEnding;
+
+impl LineEnding for MixedLineEnding {
+    /// Returns `true` when offset `i` holds `\n`, or holds `\r` immediately
+    /// followed by `\n`.
+    fn is_line_ending(data: &[u8], i: usize) -> bool {
+        matches!(
+            data.get(i..),
+            Some([b'\n', ..] | [b'\r', b'\n', ..])
+        )
+    }
+
+    /// Returns 2 for a `\r\n` pair starting at `i` and 1 otherwise.
+    ///
+    /// The value 1 covers a plain `\n` and also serves as the fallback when
+    /// the function is called on a position that is not a terminator.
+    fn ending_length(data: &[u8], i: usize) -> usize {
+        match data.get(i..) {
+            Some([b'\r', b'\n', ..]) => 2,
+            _ => 1,
+        }
+    }
+}
+
+/// Detect the predominant line ending style of `data`.
+///
+/// Every `\r\n` pair counts as one Windows ending and every `\n` that is not
+/// part of such a pair counts as one Unix ending. The style with the higher
+/// count wins:
+///
+/// * more Unix endings    -> [`LineEndingType::Unix`]
+/// * more Windows endings -> [`LineEndingType::Windows`]
+/// * equal counts, including input with no line ending at all
+///   -> [`LineEndingType::Mixed`]
+///
+/// The verdict is a majority vote, so data that contains both styles in
+/// unequal numbers is still reported as `Unix` or `Windows`.
+pub fn detect_line_ending(data: &[u8]) -> LineEndingType {
+    // Explicit `usize`: an untyped integer literal would fall back to `i32`,
+    // which overflows on inputs with more than `i32::MAX` line endings.
+    let mut unix_count: usize = 0;
+    let mut windows_count: usize = 0;
+
+    let mut i = 0;
+    while i < data.len() {
+        if data[i] == b'\n' {
+            unix_count += 1;
+            i += 1;
+        } else if i + 1 < data.len() && data[i] == b'\r' && data[i + 1] == b'\n' {
+            windows_count += 1;
+            // Skip the `\n` of the pair so it is not counted as a Unix ending.
+            i += 2;
+        } else {
+            i += 1;
+        }
+    }
+
+    match unix_count.cmp(&windows_count) {
+        std::cmp::Ordering::Greater => LineEndingType::Unix,
+        std::cmp::Ordering::Less => LineEndingType::Windows,
+        std::cmp::Ordering::Equal => LineEndingType::Mixed,
+    }
+}
+
+/// Line ending type enum
+///
+/// Produced by `detect_line_ending` and used by `split_by_lines` to select
+/// the matching `LineEnding` implementation.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum LineEndingType {
+    /// Bare `\n` endings outnumber `\r\n` pairs; splitting uses `UnixLineEnding`.
+    Unix,
+    /// `\r\n` pairs outnumber bare `\n` endings; splitting uses `WindowsLineEnding`.
+    Windows,
+    /// Both styles occur equally often (including not at all); splitting uses
+    /// `MixedLineEnding`, which accepts either terminator.
+    Mixed,
+}
+
+impl LineEndingType {
+    /// Split `data` into line chunks according to this line ending type.
+    ///
+    /// Selects the `LineEnding` implementation that corresponds to the
+    /// variant and delegates to `split_by_lines_custom`, returning its result
+    /// unchanged.
+    pub fn split_by_lines(&self, data: &[u8]) -> Result<ChunkingResult, StorageIOError> {
+        // Pick the monomorphised splitter first, then run it once.
+        let splitter: fn(&[u8]) -> Result<ChunkingResult, StorageIOError> = match *self {
+            Self::Unix => split_by_lines_custom::<UnixLineEnding>,
+            Self::Windows => split_by_lines_custom::<WindowsLineEnding>,
+            Self::Mixed => split_by_lines_custom::<MixedLineEnding>,
+        };
+
+        splitter(data)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Asserts that `result` holds exactly the `expected` chunk payloads, in order.
+    fn assert_chunk_data(result: &ChunkingResult, expected: &[&str]) {
+        assert_eq!(result.chunks.len(), expected.len());
+        for (chunk, want) in result.chunks.iter().zip(expected) {
+            assert_eq!(&chunk.data[..], want.as_bytes());
+        }
+    }
+
+    #[test]
+    fn test_line_chunking_unix() {
+        let data = b"Hello\nWorld\nTest\n";
+        let result = split_by_lines_impl(data).unwrap();
+
+        // Three `\n`-terminated lines, terminators kept.
+        assert_chunk_data(&result, &["Hello\n", "World\n", "Test\n"]);
+
+        // The chunks together must cover every input byte.
+        let covered: usize = result.chunks.iter().map(|chunk| chunk.data.len()).sum();
+        assert_eq!(covered, data.len());
+    }
+
+    #[test]
+    fn test_line_chunking_windows() {
+        let result = split_by_lines_impl(b"Hello\r\nWorld\r\nTest\r\n").unwrap();
+
+        // Each chunk keeps its full `\r\n` terminator.
+        assert_chunk_data(&result, &["Hello\r\n", "World\r\n", "Test\r\n"]);
+    }
+
+    #[test]
+    fn test_line_chunking_mixed() {
+        let result = split_by_lines_impl(b"Hello\nWorld\r\nTest\n").unwrap();
+
+        assert_chunk_data(&result, &["Hello\n", "World\r\n", "Test\n"]);
+    }
+
+    #[test]
+    fn test_line_chunking_no_trailing_newline() {
+        let result = split_by_lines_impl(b"Hello\nWorld\nTest").unwrap();
+
+        // The unterminated last line still becomes its own chunk.
+        assert_chunk_data(&result, &["Hello\n", "World\n", "Test"]);
+    }
+
+    #[test]
+    fn test_line_chunking_empty_lines() {
+        let result = split_by_lines_impl(b"Hello\n\nWorld\n\n\n").unwrap();
+
+        // Empty lines are chunks consisting of the terminator only.
+        assert_chunk_data(&result, &["Hello\n", "\n", "World\n", "\n", "\n"]);
+    }
+
+    #[test]
+    fn test_line_chunking_empty_file() {
+        let result = split_by_lines_impl(b"").unwrap();
+
+        // Empty input is represented by a single empty chunk.
+        assert_chunk_data(&result, &[""]);
+    }
+
+    #[test]
+    fn test_detect_line_ending() {
+        let cases = [
+            // Unix detection
+            ("Hello\nWorld\n", LineEndingType::Unix),
+            // Windows detection
+            ("Hello\r\nWorld\r\n", LineEndingType::Windows),
+            // One ending of each style: a tie
+            ("Hello\nWorld\r\n", LineEndingType::Mixed),
+            // No newlines at all: also a tie
+            ("Hello World", LineEndingType::Mixed),
+        ];
+
+        for (input, expected) in cases {
+            assert_eq!(detect_line_ending(input.as_bytes()), expected);
+        }
+    }
+
+    #[test]
+    fn test_custom_line_ending_unix() {
+        let result = split_by_lines_custom::<UnixLineEnding>(b"Hello\nWorld\n").unwrap();
+
+        assert_chunk_data(&result, &["Hello\n", "World\n"]);
+    }
+
+    #[test]
+    fn test_custom_line_ending_windows() {
+        let result = split_by_lines_custom::<WindowsLineEnding>(b"Hello\r\nWorld\r\n").unwrap();
+
+        assert_chunk_data(&result, &["Hello\r\n", "World\r\n"]);
+    }
+
+    #[test]
+    fn test_line_ending_type_split() {
+        // Unix
+        let unix = LineEndingType::Unix
+            .split_by_lines(b"Hello\nWorld\n")
+            .unwrap();
+        assert_eq!(unix.chunks.len(), 2);
+
+        // Windows
+        let windows = LineEndingType::Windows
+            .split_by_lines(b"Hello\r\nWorld\r\n")
+            .unwrap();
+        assert_eq!(windows.chunks.len(), 2);
+
+        // Mixed
+        let mixed = LineEndingType::Mixed
+            .split_by_lines(b"Hello\nWorld\r\n")
+            .unwrap();
+        assert_eq!(mixed.chunks.len(), 2);
+    }
+
+    #[test]
+    fn test_chunk_hash_uniqueness() {
+        // Different line contents must yield different chunk hashes.
+        let first = split_by_lines_impl(b"Hello\n").unwrap();
+        let second = split_by_lines_impl(b"World\n").unwrap();
+
+        assert_ne!(first.chunks[0].hash, second.chunks[0].hash);
+    }
+}
author	魏曹先生 <1992414357@qq.com>	2026-02-27 06:17:06 +0800
committer	魏曹先生 <1992414357@qq.com>	2026-02-27 06:17:06 +0800
commit	76e78fe53c03c9b4c7fa029709f06ee86ce9c865 (patch)
tree	4e3778dfb405b2c21b51df24331100b94f5356d9 /systems/storage/src/store/line.rs
parent	748c8a3353df887ee4b01e0e1327aa95c1c7225a (diff)