diff options
| author | 魏曹先生 <1992414357@qq.com> | 2026-02-27 06:17:06 +0800 |
|---|---|---|
| committer | 魏曹先生 <1992414357@qq.com> | 2026-02-27 06:17:06 +0800 |
| commit | 76e78fe53c03c9b4c7fa029709f06ee86ce9c865 (patch) | |
| tree | 4e3778dfb405b2c21b51df24331100b94f5356d9 /systems/storage/src/store/line.rs | |
| parent | 748c8a3353df887ee4b01e0e1327aa95c1c7225a (diff) | |
Add storage system with chunk-based file storage
Diffstat (limited to 'systems/storage/src/store/line.rs')
| -rw-r--r-- | systems/storage/src/store/line.rs | 393 |
1 files changed, 393 insertions, 0 deletions
diff --git a/systems/storage/src/store/line.rs b/systems/storage/src/store/line.rs new file mode 100644 index 0000000..971018b --- /dev/null +++ b/systems/storage/src/store/line.rs @@ -0,0 +1,393 @@ +use std::path::PathBuf; + +use crate::{error::StorageIOError, store::ChunkingResult}; + +/// Split data by lines (newline characters) +pub fn split_by_lines_impl(data: &[u8]) -> Result<ChunkingResult, StorageIOError> { + let mut chunks = Vec::new(); + let mut start = 0; + let total_size = data.len(); + + // Iterate through data to find line boundaries + let mut i = 0; + while i < data.len() { + if data[i] == b'\n' { + // Unix line ending + let line_end = i + 1; // Include \n + // Extract line data (include newline character) + let line_data = data[start..line_end].to_vec(); + + // Create chunk for this line + let chunk = crate::store::create_chunk(line_data); + chunks.push(chunk); + + // Move start to next line + start = line_end; + i = line_end; + } else if data[i] == b'\r' && i + 1 < data.len() && data[i + 1] == b'\n' { + // Windows line ending + let line_end = i + 2; // Include both \r and \n + // Extract line data (include newline characters) + let line_data = data[start..line_end].to_vec(); + + // Create chunk for this line + let chunk = crate::store::create_chunk(line_data); + chunks.push(chunk); + + // Move start to next line + start = line_end; + i = line_end; + } else { + i += 1; + } + } + + // Handle remaining data (last line without newline) + if start < total_size { + let line_data = data[start..].to_vec(); + let chunk = crate::store::create_chunk(line_data); + chunks.push(chunk); + } + + // Handle empty file (no lines) + if chunks.is_empty() && total_size == 0 { + let chunk = crate::store::create_chunk(Vec::new()); + chunks.push(chunk); + } + + Ok(ChunkingResult { + chunks, + total_size: total_size as u64, + }) +} + +/// Split file by lines +pub async fn write_file_line<I: Into<PathBuf>>( + file_to_write: I, + storage_dir: I, + output_index_file: I, +) -> Result<(), StorageIOError> { + use crate::store::{StorageConfig, write_file}; + + let config = StorageConfig::line(); + write_file(file_to_write, storage_dir, output_index_file, &config).await +} + +/// Utility function to split data by lines with custom line ending detection +pub fn split_by_lines_custom<E: LineEnding>(data: &[u8]) -> Result<ChunkingResult, StorageIOError> { + let mut chunks = Vec::new(); + let mut start = 0; + let total_size = data.len(); + + let mut i = 0; + while i < total_size { + if E::is_line_ending(data, i) { + let line_end = i + E::ending_length(data, i); + let line_data = data[start..line_end].to_vec(); + + let chunk = crate::store::create_chunk(line_data); + chunks.push(chunk); + + start = line_end; + i = line_end; + } else { + i += 1; + } + } + + // Handle remaining data + if start < total_size { + let line_data = data[start..].to_vec(); + let chunk = crate::store::create_chunk(line_data); + chunks.push(chunk); + } + + // Handle empty file + if chunks.is_empty() && total_size == 0 { + let chunk = crate::store::create_chunk(Vec::new()); + chunks.push(chunk); + } + + Ok(ChunkingResult { + chunks, + total_size: total_size as u64, + }) +} + +/// Trait for different line ending types +pub trait LineEnding { + /// Check if position i is the start of a line ending + fn is_line_ending(data: &[u8], i: usize) -> bool; + + /// Get the length of the line ending at position i + fn ending_length(data: &[u8], i: usize) -> usize; +} + +/// Unix line endings (\n) +pub struct UnixLineEnding; + +impl LineEnding for UnixLineEnding { + fn is_line_ending(data: &[u8], i: usize) -> bool { + i < data.len() && data[i] == b'\n' + } + + fn ending_length(_data: &[u8], _i: usize) -> usize { + 1 + } +} + +/// Windows line endings (\r\n) +pub struct WindowsLineEnding; + +impl LineEnding for WindowsLineEnding { + fn is_line_ending(data: &[u8], i: usize) -> bool { + i + 1 < data.len() && data[i] == b'\r' && data[i + 1] == b'\n' + } + + fn ending_length(_data: &[u8], _i: usize) -> usize { + 2 + } +} + +/// Mixed line endings (detects both Unix and Windows) +pub struct MixedLineEnding; + +impl LineEnding for MixedLineEnding { + fn is_line_ending(data: &[u8], i: usize) -> bool { + if i < data.len() && data[i] == b'\n' { + true + } else if i + 1 < data.len() && data[i] == b'\r' && data[i + 1] == b'\n' { + true + } else { + false + } + } + + fn ending_length(data: &[u8], i: usize) -> usize { + if i < data.len() && data[i] == b'\n' { + 1 + } else if i + 1 < data.len() && data[i] == b'\r' && data[i + 1] == b'\n' { + 2 + } else { + 1 // Default to 1 if somehow called incorrectly + } + } +} + +/// Detect line ending type from data +pub fn detect_line_ending(data: &[u8]) -> LineEndingType { + let mut unix_count = 0; + let mut windows_count = 0; + + let mut i = 0; + while i < data.len() { + if data[i] == b'\n' { + unix_count += 1; + i += 1; + } else if i + 1 < data.len() && data[i] == b'\r' && data[i + 1] == b'\n' { + windows_count += 1; + i += 2; + } else { + i += 1; + } + } + + if unix_count > windows_count { + LineEndingType::Unix + } else if windows_count > unix_count { + LineEndingType::Windows + } else { + LineEndingType::Mixed + } +} + +/// Line ending type enum +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum LineEndingType { + Unix, + Windows, + Mixed, +} + +impl LineEndingType { + /// Split data using the detected line ending type + pub fn split_by_lines(&self, data: &[u8]) -> Result<ChunkingResult, StorageIOError> { + match self { + LineEndingType::Unix => split_by_lines_custom::<UnixLineEnding>(data), + LineEndingType::Windows => split_by_lines_custom::<WindowsLineEnding>(data), + LineEndingType::Mixed => split_by_lines_custom::<MixedLineEnding>(data), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_line_chunking_unix() { + let data = b"Hello\nWorld\nTest\n"; + + let result = split_by_lines_impl(data).unwrap(); + + // Should have 3 chunks + assert_eq!(result.chunks.len(), 3); + + // Verify chunk contents + assert_eq!(result.chunks[0].data, b"Hello\n"); + assert_eq!(result.chunks[1].data, b"World\n"); + assert_eq!(result.chunks[2].data, b"Test\n"); + + // Verify total size + let total_chunk_size: usize = result.chunks.iter().map(|c| c.data.len()).sum(); + assert_eq!(total_chunk_size, data.len()); + } + + #[test] + fn test_line_chunking_windows() { + let data = b"Hello\r\nWorld\r\nTest\r\n"; + + let result = split_by_lines_impl(data).unwrap(); + + // Should have 3 chunks + assert_eq!(result.chunks.len(), 3); + + // Verify chunk contents (should include \r\n) + assert_eq!(result.chunks[0].data, b"Hello\r\n"); + assert_eq!(result.chunks[1].data, b"World\r\n"); + assert_eq!(result.chunks[2].data, b"Test\r\n"); + } + + #[test] + fn test_line_chunking_mixed() { + let data = b"Hello\nWorld\r\nTest\n"; + + let result = split_by_lines_impl(data).unwrap(); + + // Should have 3 chunks + assert_eq!(result.chunks.len(), 3); + + // Verify chunk contents + assert_eq!(result.chunks[0].data, b"Hello\n"); + assert_eq!(result.chunks[1].data, b"World\r\n"); + assert_eq!(result.chunks[2].data, b"Test\n"); + } + + #[test] + fn test_line_chunking_no_trailing_newline() { + let data = b"Hello\nWorld\nTest"; + + let result = split_by_lines_impl(data).unwrap(); + + // Should have 3 chunks + assert_eq!(result.chunks.len(), 3); + + // Verify chunk contents + assert_eq!(result.chunks[0].data, b"Hello\n"); + assert_eq!(result.chunks[1].data, b"World\n"); + assert_eq!(result.chunks[2].data, b"Test"); + } + + #[test] + fn test_line_chunking_empty_lines() { + let data = b"Hello\n\nWorld\n\n\n"; + + let result = split_by_lines_impl(data).unwrap(); + + // Should have 5 chunks (including empty lines) + // "Hello\n", "\n", "World\n", "\n", "\n" + assert_eq!(result.chunks.len(), 5); + + // Verify chunk contents + assert_eq!(result.chunks[0].data, b"Hello\n"); + assert_eq!(result.chunks[1].data, b"\n"); + assert_eq!(result.chunks[2].data, b"World\n"); + assert_eq!(result.chunks[3].data, b"\n"); + assert_eq!(result.chunks[4].data, b"\n"); + } + + #[test] + fn test_line_chunking_empty_file() { + let data = b""; + + let result = split_by_lines_impl(data).unwrap(); + + // Should have 1 empty chunk + assert_eq!(result.chunks.len(), 1); + assert_eq!(result.chunks[0].data, b""); + } + + #[test] + fn test_detect_line_ending() { + // Test Unix detection + let unix_data = b"Hello\nWorld\n"; + assert_eq!(detect_line_ending(unix_data), LineEndingType::Unix); + + // Test Windows detection + let windows_data = b"Hello\r\nWorld\r\n"; + assert_eq!(detect_line_ending(windows_data), LineEndingType::Windows); + + // Test mixed detection + let mixed_data = b"Hello\nWorld\r\n"; + assert_eq!(detect_line_ending(mixed_data), LineEndingType::Mixed); + + // Test no newlines + let no_newlines = b"Hello World"; + assert_eq!(detect_line_ending(no_newlines), LineEndingType::Mixed); + } + + #[test] + fn test_custom_line_ending_unix() { + let data = b"Hello\nWorld\n"; + + let result = split_by_lines_custom::<UnixLineEnding>(data).unwrap(); + + assert_eq!(result.chunks.len(), 2); + assert_eq!(result.chunks[0].data, b"Hello\n"); + assert_eq!(result.chunks[1].data, b"World\n"); + } + + #[test] + fn test_custom_line_ending_windows() { + let data = b"Hello\r\nWorld\r\n"; + + let result = split_by_lines_custom::<WindowsLineEnding>(data).unwrap(); + + assert_eq!(result.chunks.len(), 2); + assert_eq!(result.chunks[0].data, b"Hello\r\n"); + assert_eq!(result.chunks[1].data, b"World\r\n"); + } + + #[test] + fn test_line_ending_type_split() { + let unix_data = b"Hello\nWorld\n"; + let windows_data = b"Hello\r\nWorld\r\n"; + let mixed_data = b"Hello\nWorld\r\n"; + + // Test Unix + let unix_result = LineEndingType::Unix.split_by_lines(unix_data).unwrap(); + assert_eq!(unix_result.chunks.len(), 2); + + // Test Windows + let windows_result = LineEndingType::Windows + .split_by_lines(windows_data) + .unwrap(); + assert_eq!(windows_result.chunks.len(), 2); + + // Test Mixed + let mixed_result = LineEndingType::Mixed.split_by_lines(mixed_data).unwrap(); + assert_eq!(mixed_result.chunks.len(), 2); + } + + #[test] + fn test_chunk_hash_uniqueness() { + // Test that different lines produce different hashes + let data1 = b"Hello\n"; + let data2 = b"World\n"; + + let result1 = split_by_lines_impl(data1).unwrap(); + let result2 = split_by_lines_impl(data2).unwrap(); + + assert_ne!(result1.chunks[0].hash, result2.chunks[0].hash); + } +} |
