Add file inclusion and text formatting to parser

author: 魏曹先生 <1992414357@qq.com> 2026-02-09 18:32:24 +0800
committer: 魏曹先生 <1992414357@qq.com> 2026-02-09 18:32:24 +0800
commit: 204bb6824bf3555b80ca574ca3edb8ea007c89dd (patch)
tree: 9a50b67ff64b2d1fe918e4bfa5034ac8389668e3 /parser
parent: 12d08d599a41b15e0a20113d1a521c8c3a232e79 (diff)
6 files changed, 495 insertions, 3 deletions
diff --git a/parser/Cargo.toml b/parser/Cargo.toml
index 07e462f..2d7cb0c 100644
--- a/parser/Cargo.toml
+++ b/parser/Cargo.toml
@@ -5,6 +5,7 @@ edition = "2024"
 
 [dependencies]
 colored = "3.0"
+strip-ansi-escapes = "0.2.1"
 unicode-width = "0.2"
 regex = "1.12"
 sha2 = "0.10"
diff --git a/parser/src/error.rs b/parser/src/error.rs
index ca68442..b9ac888 100644
--- a/parser/src/error.rs
+++ b/parser/src/error.rs
@@ -1,4 +1,4 @@
-use std::{i64, process::exit};
+use std::{i64, path::PathBuf, process::exit};
 
 use colored::Colorize;
 use unicode_width::UnicodeWidthStr;
@@ -14,6 +14,8 @@ pub enum Exit {
         begin: i64,
         end: i64,
     },
+    DuplicateMarker(String),
+    CycleDependency(PathBuf),
 }
 
 impl From<std::io::Error> for Exit {
@@ -35,6 +37,14 @@ pub fn handle_exit(e: Exit) {
         } => {
             print_syntax_error(content, reason, line, begin, end);
         }
+        Exit::DuplicateMarker(marker) => {
+            eprintln!("Duplicate marker `{}` found!", marker);
+            exit(1)
+        }
+        Exit::CycleDependency(dialog) => {
+            eprintln!("Dialog `{}` depends on itself!", dialog.display());
+            exit(1)
+        }
     }
 }
 
diff --git a/parser/src/lib.rs b/parser/src/lib.rs
index a462697..d7caac3 100644
--- a/parser/src/lib.rs
+++ b/parser/src/lib.rs
@@ -2,3 +2,4 @@ pub mod error;
 pub mod macros;
 pub mod parse;
 pub mod syntax_checker;
+pub mod utils;
diff --git a/parser/src/parse.rs b/parser/src/parse.rs
index 434cca3..79f5719 100644
--- a/parser/src/parse.rs
+++ b/parser/src/parse.rs
@@ -1,25 +1,110 @@
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 
 use regex::Regex;
 use sha2::{Digest, Sha256};
 
-use crate::{error::Exit, syntax_checker::check_markdown_syntax};
+use crate::{error::Exit, syntax_checker::check_markdown_syntax, utils::path_fmt::format_path};
 
+/// Read the Markdown file at `input`, run the parser passes over its text and
+/// write the final text to `ir_output`.
+///
+/// The passes run in the order they appear in the body; each one takes the
+/// previous pass's `String` and returns a new one.
+///
+/// # Errors
+///
+/// Returns the first `Exit` produced by reading the input, by any pass, or by
+/// writing the output (I/O errors are converted through `From<std::io::Error>`).
 pub fn parse(input: PathBuf, ir_output: PathBuf) -> Result<(), Exit> {
     let result = std::fs::read_to_string(&input)?;
 
+    // The syntax check sees only the top-level file: it runs before includes
+    // are expanded.
     check_markdown_syntax(&result)?;
 
+    // Inline `[[file]]` includes; `input` is passed as the path of the including file.
+    let result = unwrap_includes(result, input)?;
+
+    // Runs on the expanded text, so headings from included files are compared too.
+    check_duplicate_marker(&result)?;
+
     let result = clean_markdown(result)?;
     let result = fix_mark_jump(result)?;
     let result = replace_marker_name(result)?;
     let result = convert_to_step_sentence_structure(result)?;
     let result = strip_invalid_jump(result)?;
+    let result = convert_image_to_code(result)?;
+    let result = apply_code_lines(result)?;
+    let result = split_sentence_and_encode(result)?;
 
     std::fs::write(&ir_output, result)?;
     Ok(())
 }
 
+/// Expand `[[Dialog.md]]`-style include lines found in `input`.
+///
+/// `self_path` is the path of the file `input` was read from; it is handed to
+/// `expand_recursive` as the including file. Expansion starts with an empty
+/// stack of files currently being expanded.
+pub fn unwrap_includes(input: String, self_path: PathBuf) -> Result<String, Exit> {
+    let mut visiting: Vec<PathBuf> = Vec::new();
+    expand_recursive(input, self_path.as_path(), &mut visiting)
+}
+
+/// Recursively replace every `[[path]]` include line in `content` with the
+/// (expanded) text of the referenced file.
+///
+/// * `content` - text of the file being expanded.
+/// * `current_path` - path of that file; include paths are joined onto its
+///   parent directory.
+/// * `stack` - normalized paths of the files currently being expanded, used to
+///   detect include cycles.
+///
+/// Lines inside ``` fenced blocks (and the fence lines themselves) are copied
+/// verbatim. Every emitted line is terminated with `\n`.
+///
+/// # Errors
+///
+/// * `Exit::CycleDependency` when `current_path` is already on `stack`.
+/// * `Exit::IoError` when an included file cannot be read.
+/// * Whatever `format_path` reports while normalizing a path.
+fn expand_recursive(
+    content: String,
+    current_path: &Path,
+    stack: &mut Vec<PathBuf>,
+) -> Result<String, Exit> {
+    let current_norm = format_path(current_path)?;
+
+    if stack.contains(&current_norm) {
+        return Err(Exit::CycleDependency(current_norm));
+    }
+
+    // A path without a parent (root or empty path) falls back to the empty
+    // path instead of panicking on `unwrap()`.
+    let base_dir = current_path.parent().unwrap_or_else(|| Path::new(""));
+
+    stack.push(current_norm);
+
+    let mut output = String::with_capacity(content.len());
+    let mut in_code_block = false;
+
+    for line in content.lines() {
+        let is_fence = line.trim().starts_with("```");
+        if is_fence {
+            in_code_block = !in_code_block;
+        }
+
+        // Only lines outside fenced code blocks can be include directives.
+        if !is_fence && !in_code_block {
+            if let Some(include_path) = extract_include(line) {
+                let include_abs = format_path(base_dir.join(include_path))?;
+                let include_content =
+                    std::fs::read_to_string(&include_abs).map_err(Exit::IoError)?;
+
+                let expanded = expand_recursive(include_content, &include_abs, stack)?;
+                output.push_str(&expanded);
+                continue;
+            }
+        }
+
+        output.push_str(line);
+        output.push('\n');
+    }
+
+    // Done with this file: it may legitimately be included again elsewhere.
+    stack.pop();
+
+    Ok(output)
+}
+
+/// Return the text between `[[` and `]]` when the trimmed `line` consists of
+/// exactly one such bracketed span, otherwise `None`.
+fn extract_include(line: &str) -> Option<&str> {
+    let trimmed = line.trim();
+    let after_open = trimmed.strip_prefix("[[")?;
+    after_open.strip_suffix("]]")
+}
+
+/// Check for duplicate markers.
+///
+/// A marker is the trimmed text of a heading line: one to five `#` characters
+/// at the start of the line, whitespace, then the heading text.
+///
+/// Accepts `&str`, so existing `&String` call sites keep working through deref
+/// coercion.
+///
+/// # Errors
+///
+/// Returns `Exit::DuplicateMarker` carrying the first heading text that appears
+/// a second time.
+pub fn check_duplicate_marker(input: &str) -> Result<(), Exit> {
+    let heading_re = Regex::new(r"^(#{1,5})\s+(.+)$").unwrap();
+    // Borrowed slices of `input`: nothing is allocated unless a duplicate is reported.
+    let mut seen = std::collections::HashSet::new();
+
+    for line in input.lines() {
+        let Some(heading) = heading_re.captures(line).and_then(|caps| caps.get(2)) else {
+            continue;
+        };
+
+        let heading_text = heading.as_str().trim();
+        // `insert` returns false when the value was already present.
+        if !seen.insert(heading_text) {
+            return Err(Exit::DuplicateMarker(heading_text.to_string()));
+        }
+    }
+
+    Ok(())
+}
+
 /// Clean Markdown
 /// 1. Remove blockquotes
 /// 2. Remove empty lines
@@ -916,3 +1001,274 @@ pub fn strip_invalid_jump(input: String) -> Result<String, Exit> {
 
     Ok(result_lines.join("\n"))
 }
+
+/// Convert image lines to code lines.
+///
+/// A line that is exactly a Markdown image (`![alt](target)`) becomes
+/// `` `image "target"` ``; every other line is copied unchanged. Lines are
+/// joined with `\n` and the result has no trailing newline.
+pub fn convert_image_to_code(input: String) -> Result<String, Exit> {
+    let image_re = Regex::new(r"^!\[[^\]]*\]\(([^)]+)\)$").unwrap();
+    let mut converted = String::new();
+
+    for line in input.lines() {
+        match image_re.captures(line).and_then(|caps| caps.get(1)) {
+            Some(target) => {
+                converted.push_str(&format!("`image \"{}\"`\n", target.as_str()));
+            }
+            None => {
+                converted.push_str(line);
+                converted.push('\n');
+            }
+        }
+    }
+
+    // Every emitted line ends with '\n'; drop the final one.
+    if converted.ends_with('\n') {
+        converted.pop();
+    }
+
+    Ok(converted)
+}
+
+/// Apply code lines to sentences
+///
+/// A "code line" is a line whose first non-whitespace character is a backtick.
+/// Each run of consecutive code lines is trimmed, concatenated without a
+/// separator, and merged into the line that follows the run via
+/// `helper_merge_code_into_sentence` -- but only when that following line
+/// starts with `[` and the line after it does not.
+///
+/// All other lines are copied through. Every emitted line is terminated with
+/// `\n`, so a non-empty result ends with a newline. This function never
+/// returns `Err`.
+pub fn apply_code_lines(input: String) -> Result<String, Exit> {
+    let mut out = String::new();
+    let lines: Vec<&str> = input.lines().collect();
+
+    let mut i = 0;
+    while i < lines.len() {
+        let line = lines[i];
+
+        // Not a code line: copy it through unchanged.
+        if !line.trim_start().starts_with('`') {
+            out.push_str(line);
+            out.push('\n');
+            i += 1;
+            continue;
+        }
+
+        // Collect the whole run of consecutive code lines; `i` ends up on the
+        // first line after the run.
+        let mut code_buf = String::new();
+        while i < lines.len() && {
+            let line: &str = lines[i];
+            line.trim_start().starts_with('`')
+        } {
+            code_buf.push_str(lines[i].trim());
+            i += 1;
+        }
+
+        // End of input, or the next line does not start with `[`: the buffered
+        // code is not written to `out`.
+        // NOTE(review): presumably dropping unattached code is intended -- confirm.
+        if i >= lines.len()
+            || !{
+                let line: &str = lines[i];
+                line.trim_start().starts_with('[')
+            }
+        {
+            continue;
+        }
+
+        // Two `[` lines in a row after the run: the code is likewise not
+        // written, and `lines[i]` is left for the next iteration, which copies
+        // it through as an ordinary line.
+        // NOTE(review): looks like the first `[` line is not treated as a
+        // sentence in this case -- confirm against the step/sentence format.
+        if i + 1 < lines.len() && {
+            let line: &str = lines[i + 1];
+            line.trim_start().starts_with('[')
+        } {
+            continue;
+        }
+
+        // Exactly one `[` line follows the run: splice the code into it.
+        let merged = helper_merge_code_into_sentence(&code_buf, lines[i]);
+        out.push_str(&merged);
+        out.push('\n');
+        i += 1;
+    }
+
+    Ok(out)
+}
+
+/// Insert `code` directly after the first `:[` of `sentence`.
+///
+/// The insertion only happens when a `]` occurs somewhere after that `:[`;
+/// otherwise `sentence` is returned unchanged.
+fn helper_merge_code_into_sentence(code: &str, sentence: &str) -> String {
+    let insert_at = match sentence.find(":[") {
+        Some(pos) if sentence[pos + 2..].contains(']') => pos + 2,
+        _ => return sentence.to_string(),
+    };
+
+    let (head, tail) = sentence.split_at(insert_at);
+    [head, code, tail].concat()
+}
+
+/// Split sentences into embeddable tokens and perform Unicode encoding.
+///
+/// A sentence line has the shape `[character]:[content]->[...]`. For such a
+/// line the character name is encoded with `helper_encode_unicode`, the content
+/// is tokenized with `helper_process_sentence_content`, and everything from
+/// `->[` onward is kept as is. All other lines are copied unchanged. Lines are
+/// joined with `\n`, without a trailing newline. This function never returns
+/// `Err`.
+pub fn split_sentence_and_encode(input: String) -> Result<String, Exit> {
+    let encoded_lines: Vec<String> = input
+        .lines()
+        .map(|line| helper_encode_sentence_line(line).unwrap_or_else(|| line.to_string()))
+        .collect();
+
+    Ok(encoded_lines.join("\n"))
+}
+
+/// Encode a single sentence line, or return `None` when `line` is not one.
+///
+/// The `]->[` separator is searched for only after the `]:[` separator. A line
+/// such as `[a]->[b]:[c]`, where the arrow comes first, is therefore passed
+/// through instead of panicking on an inverted slice range.
+fn helper_encode_sentence_line(line: &str) -> Option<String> {
+    let rest = line.strip_prefix('[')?;
+    let (character, after_character) = rest.split_once("]:[")?;
+    let arrow = after_character.find("]->[")?;
+    let (content, suffix) = after_character.split_at(arrow);
+
+    Some(format!(
+        "[{}]:{}{}",
+        helper_encode_unicode(character),
+        helper_process_sentence_content(content),
+        // `suffix` starts with "]->["; drop the `]` that closed the content.
+        &suffix[1..]
+    ))
+}
+
+/// Tokenize the content of a sentence into `[style:[text]]` tokens.
+///
+/// * A backtick opens a code span that runs to the next backtick. The span,
+///   including both backticks, is emitted as a `code` token; `*` inside it is
+///   literal. An unterminated span is emitted as `code` at the end.
+/// * `**` toggles bold and a single `*` toggles italic. Text is emitted with
+///   the style active while it was written: `text`, `bold`, `italic` or
+///   `bold_italic`.
+/// * Every token payload is passed through `helper_encode_unicode`.
+///
+/// Pending text is always flushed with the emphasis state in force at that
+/// point, so `***x***` yields `[bold_italic:[x]]` and bold text before a code
+/// span stays `bold`.
+fn helper_process_sentence_content(content: &str) -> String {
+    /// Token name for the given emphasis state.
+    fn style_name(bold: bool, italic: bool) -> &'static str {
+        match (bold, italic) {
+            (true, true) => "bold_italic",
+            (true, false) => "bold",
+            (false, true) => "italic",
+            (false, false) => "text",
+        }
+    }
+
+    /// Append `buffer` to `out` as one token and clear it; no-op when empty.
+    fn flush(out: &mut String, style: &str, buffer: &mut String) {
+        if buffer.is_empty() {
+            return;
+        }
+        out.push_str(&format!(
+            "[{}:[{}]]",
+            style,
+            helper_encode_unicode(buffer.as_str())
+        ));
+        buffer.clear();
+    }
+
+    let mut result = String::new();
+    let mut chars = content.chars().peekable();
+    let mut text = String::new();
+    let mut code = String::new();
+    let mut in_code = false;
+    let mut bold = false;
+    let mut italic = false;
+
+    while let Some(ch) = chars.next() {
+        if in_code {
+            // Everything up to and including the closing backtick is code.
+            code.push(ch);
+            if ch == '`' {
+                flush(&mut result, "code", &mut code);
+                in_code = false;
+            }
+            continue;
+        }
+
+        match ch {
+            '`' => {
+                flush(&mut result, style_name(bold, italic), &mut text);
+                code.push(ch);
+                in_code = true;
+            }
+            '*' => {
+                // Emit the pending text with the style it was written in,
+                // then switch state.
+                flush(&mut result, style_name(bold, italic), &mut text);
+                if chars.peek() == Some(&'*') {
+                    chars.next(); // Consume the second '*'
+                    bold = !bold;
+                } else {
+                    italic = !italic;
+                }
+            }
+            _ => text.push(ch),
+        }
+    }
+
+    // Unterminated code span, then any remaining text.
+    flush(&mut result, "code", &mut code);
+    flush(&mut result, style_name(bold, italic), &mut text);
+
+    result
+}
+
+/// Escape every non-ASCII character of `s` as `\u` followed by its code point
+/// in uppercase hexadecimal (no zero padding); ASCII characters are kept as is.
+fn helper_encode_unicode(s: &str) -> String {
+    s.chars()
+        .map(|ch| {
+            if ch.is_ascii() {
+                ch.to_string()
+            } else {
+                format!("\\u{:X}", u32::from(ch))
+            }
+        })
+        .collect()
+}
diff --git a/parser/src/utils.rs b/parser/src/utils.rs
new file mode 100644
index 0000000..0fbb516
--- /dev/null
+++ b/parser/src/utils.rs
@@ -0,0 +1 @@
+pub mod path_fmt;
diff --git a/parser/src/utils/path_fmt.rs b/parser/src/utils/path_fmt.rs
new file mode 100644
index 0000000..8750db6
--- /dev/null
+++ b/parser/src/utils/path_fmt.rs
@@ -0,0 +1,123 @@
+use std::path::{Path, PathBuf};
+
+/// Normalize an input path string into a canonical, platform-agnostic form.
+///
+/// This function removes ANSI escape sequences, unifies separators to `/`,
+/// collapses duplicate slashes, strips unfriendly characters (`*`, `?`, `"`, `<`, `>`, `|`),
+/// resolves simple `..` components, and preserves a trailing slash when present.
+///
+/// The trailing slash is detected on the cleaned path, so a trailing `\` or a
+/// slash followed by an ANSI reset sequence is preserved as well.
+///
+/// See examples below for the exact normalization behavior.
+///
+/// # Errors
+///
+/// Returns an `InvalidData` error when the bytes left after stripping ANSI
+/// sequences are not valid UTF-8.
+///
+/// # Examples
+///
+/// ```
+/// # use string_proc::format_path::format_path_str;
+/// use std::io::Error;
+///
+/// # fn main() -> Result<(), Error> {
+/// assert_eq!(format_path_str("C:\\Users\\\\test")?, "C:/Users/test");
+/// assert_eq!(format_path_str("C:\\Users\\test\\")?, "C:/Users/test/");
+/// assert_eq!(
+///     format_path_str("/path/with/*unfriendly?chars")?,
+///     "/path/with/unfriendlychars"
+/// );
+/// assert_eq!(format_path_str("\x1b[31m/path\x1b[0m")?, "/path");
+/// assert_eq!(format_path_str("/home/user/dir/")?, "/home/user/dir/");
+/// assert_eq!(
+///     format_path_str("/home/user/file.txt")?,
+///     "/home/user/file.txt"
+/// );
+/// assert_eq!(
+///     format_path_str("/home/my_user/DOCS/JVCS_TEST/Workspace/../Vault/")?,
+///     "/home/my_user/DOCS/JVCS_TEST/Vault/"
+/// );
+/// assert_eq!(format_path_str("./home/file.txt")?, "home/file.txt");
+/// assert_eq!(format_path_str("./home/path/")?, "home/path/");
+/// assert_eq!(format_path_str("./")?, "");
+/// # Ok(())
+/// # }
+/// ```
+pub fn format_path_str(path: impl Into<String>) -> Result<String, std::io::Error> {
+    let path_str = path.into();
+
+    // ANSI Strip
+    let cleaned = strip_ansi_escapes::strip(&path_str);
+    let path_without_ansi = String::from_utf8(cleaned)
+        .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
+
+    // Unify separators, then collapse runs of '/'.
+    let path_with_forward_slash = path_without_ansi.replace('\\', "/");
+    let mut result = String::new();
+    let mut prev_char = '\0';
+
+    for c in path_with_forward_slash.chars() {
+        if c == '/' && prev_char == '/' {
+            continue;
+        }
+        result.push(c);
+        prev_char = c;
+    }
+
+    let unfriendly_chars = ['*', '?', '"', '<', '>', '|'];
+    result = result
+        .chars()
+        .filter(|c| !unfriendly_chars.contains(c))
+        .collect();
+
+    // Checked on the cleaned string rather than the raw input, so that `dir\`
+    // and `dir/<ANSI reset>` count as ending with a separator too.
+    let ends_with_slash = result.ends_with('/');
+
+    // Handle ".." path components
+    let path_buf = PathBuf::from(&result);
+    let normalized_path = normalize_path(&path_buf);
+    result = normalized_path.to_string_lossy().replace('\\', "/");
+
+    // Restore trailing slash if the cleaned path had one
+    if ends_with_slash && !result.ends_with('/') {
+        result.push('/');
+    }
+
+    // Special case: when result is only "./", return ""
+    if result == "./" {
+        return Ok(String::new());
+    }
+
+    Ok(result)
+}
+
+/// Normalize path by resolving ".." components without requiring file system access.
+///
+/// * `.` components are dropped.
+/// * `..` removes the preceding normal component.
+/// * `..` directly after the root is ignored, so an absolute path stays
+///   absolute (`/../etc` becomes `/etc`).
+/// * `..` with nothing left to remove is kept (`../x` stays `../x`) rather than
+///   being dropped, which would silently point at a different file.
+///
+/// Returns `.` when no component remains.
+fn normalize_path(path: &Path) -> PathBuf {
+    use std::path::Component;
+
+    let mut components: Vec<Component> = Vec::new();
+
+    for component in path.components() {
+        match component {
+            Component::CurDir => {
+                // Skip current directory components
+            }
+            Component::ParentDir => match components.last() {
+                Some(Component::Normal(_)) => {
+                    components.pop();
+                }
+                Some(Component::RootDir) => {
+                    // The parent of the root is the root itself.
+                }
+                _ => {
+                    // Nothing to cancel out (empty, a previous `..`, or a bare
+                    // prefix): keep it.
+                    components.push(component);
+                }
+            },
+            _ => {
+                components.push(component);
+            }
+        }
+    }
+
+    if components.is_empty() {
+        PathBuf::from(".")
+    } else {
+        components.iter().collect()
+    }
+}
+
+/// Normalize a path value and hand it back as a [`PathBuf`].
+///
+/// The path is rendered with `Path::display`, passed through
+/// [`format_path_str`], and the resulting string is wrapped in a [`PathBuf`]
+/// again, so the same rules apply:
+/// - separators are normalized to `/`
+/// - duplicated separators are removed
+/// - ANSI escape sequences are stripped
+/// - unfriendly characters (`*`, `?`, etc.) are removed
+/// - simple `..` segments are resolved
+pub fn format_path(path: impl Into<PathBuf>) -> Result<PathBuf, std::io::Error> {
+    let original: PathBuf = path.into();
+    let rendered = original.display().to_string();
+    format_path_str(rendered).map(PathBuf::from)
+}
author	魏曹先生 <1992414357@qq.com>	2026-02-09 18:32:24 +0800
committer	魏曹先生 <1992414357@qq.com>	2026-02-09 18:32:24 +0800
commit	204bb6824bf3555b80ca574ca3edb8ea007c89dd (patch)
tree	9a50b67ff64b2d1fe918e4bfa5034ac8389668e3 /parser
parent	12d08d599a41b15e0a20113d1a521c8c3a232e79 (diff)