summaryrefslogtreecommitdiff
path: root/parser
diff options
context:
space:
mode:
author魏曹先生 <1992414357@qq.com>2026-02-09 18:32:24 +0800
committer魏曹先生 <1992414357@qq.com>2026-02-09 18:32:24 +0800
commit204bb6824bf3555b80ca574ca3edb8ea007c89dd (patch)
tree9a50b67ff64b2d1fe918e4bfa5034ac8389668e3 /parser
parent12d08d599a41b15e0a20113d1a521c8c3a232e79 (diff)
Add file inclusion and text formatting to parser
Diffstat (limited to 'parser')
-rw-r--r--parser/Cargo.toml1
-rw-r--r--parser/src/error.rs12
-rw-r--r--parser/src/lib.rs1
-rw-r--r--parser/src/parse.rs360
-rw-r--r--parser/src/utils.rs1
-rw-r--r--parser/src/utils/path_fmt.rs123
6 files changed, 495 insertions, 3 deletions
diff --git a/parser/Cargo.toml b/parser/Cargo.toml
index 07e462f..2d7cb0c 100644
--- a/parser/Cargo.toml
+++ b/parser/Cargo.toml
@@ -5,6 +5,7 @@ edition = "2024"
[dependencies]
colored = "3.0"
+strip-ansi-escapes = "0.2.1"
unicode-width = "0.2"
regex = "1.12"
sha2 = "0.10"
diff --git a/parser/src/error.rs b/parser/src/error.rs
index ca68442..b9ac888 100644
--- a/parser/src/error.rs
+++ b/parser/src/error.rs
@@ -1,4 +1,4 @@
-use std::{i64, process::exit};
+use std::{i64, path::PathBuf, process::exit};
use colored::Colorize;
use unicode_width::UnicodeWidthStr;
@@ -14,6 +14,8 @@ pub enum Exit {
begin: i64,
end: i64,
},
+ DuplicateMarker(String),
+ CycleDependency(PathBuf),
}
impl From<std::io::Error> for Exit {
@@ -35,6 +37,14 @@ pub fn handle_exit(e: Exit) {
} => {
print_syntax_error(content, reason, line, begin, end);
}
+ Exit::DuplicateMarker(marker) => {
+ eprintln!("Duplicate marker `{}` found!", marker);
+ exit(1)
+ }
+ Exit::CycleDependency(dialog) => {
+ eprintln!("Dialog `{}` depends on itself!", dialog.display());
+ exit(1)
+ }
}
}
diff --git a/parser/src/lib.rs b/parser/src/lib.rs
index a462697..d7caac3 100644
--- a/parser/src/lib.rs
+++ b/parser/src/lib.rs
@@ -2,3 +2,4 @@ pub mod error;
pub mod macros;
pub mod parse;
pub mod syntax_checker;
+pub mod utils;
diff --git a/parser/src/parse.rs b/parser/src/parse.rs
index 434cca3..79f5719 100644
--- a/parser/src/parse.rs
+++ b/parser/src/parse.rs
@@ -1,25 +1,110 @@
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
use regex::Regex;
use sha2::{Digest, Sha256};
-use crate::{error::Exit, syntax_checker::check_markdown_syntax};
+use crate::{error::Exit, syntax_checker::check_markdown_syntax, utils::path_fmt::format_path};
pub fn parse(input: PathBuf, ir_output: PathBuf) -> Result<(), Exit> {
let result = std::fs::read_to_string(&input)?;
check_markdown_syntax(&result)?;
+ let result = unwrap_includes(result, input)?;
+
+ check_duplicate_marker(&result)?;
+
let result = clean_markdown(result)?;
let result = fix_mark_jump(result)?;
let result = replace_marker_name(result)?;
let result = convert_to_step_sentence_structure(result)?;
let result = strip_invalid_jump(result)?;
+ let result = convert_image_to_code(result)?;
+ let result = apply_code_lines(result)?;
+ let result = split_sentence_and_encode(result)?;
std::fs::write(&ir_output, result)?;
Ok(())
}
+/// Expand text includes of [[Dialog.md]]
+pub fn unwrap_includes(input: String, self_path: PathBuf) -> Result<String, Exit> {
+ let mut stack = Vec::<PathBuf>::new();
+ expand_recursive(input, &self_path, &mut stack)
+}
+
+fn expand_recursive(
+ content: String,
+ current_path: &Path,
+ stack: &mut Vec<PathBuf>,
+) -> Result<String, Exit> {
+ let mut output = String::new();
+ let mut in_code_block = false;
+
+ let current_norm = format_path(current_path)?;
+
+ if stack.contains(&current_norm) {
+ return Err(Exit::CycleDependency(current_norm));
+ }
+
+ stack.push(current_norm.clone());
+
+ for line in content.lines() {
+ if line.trim().starts_with("```") {
+ in_code_block = !in_code_block;
+ output.push_str(line);
+ output.push('\n');
+ continue;
+ }
+
+ if in_code_block {
+ output.push_str(line);
+ output.push('\n');
+ continue;
+ }
+
+ if let Some(include_path) = extract_include(line) {
+ let include_abs = format_path(&current_path.parent().unwrap().join(include_path))?;
+ let include_content =
+ std::fs::read_to_string(&include_abs).map_err(|e| Exit::IoError(e))?;
+
+ let expanded = expand_recursive(include_content, &include_abs, stack)?;
+ output.push_str(&expanded);
+ } else {
+ output.push_str(line);
+ output.push('\n');
+ }
+ }
+
+ stack.pop();
+
+ Ok(output)
+}
+
/// Return the text between `[[` and `]]` when the trimmed `line` starts with
/// `[[` and ends with `]]`; otherwise return `None`.
///
/// The returned slice is not trimmed or validated, so `[[]]` yields `Some("")`.
fn extract_include(line: &str) -> Option<&str> {
    let after_open = line.trim().strip_prefix("[[")?;
    after_open.strip_suffix("]]")
}
+
+/// Check for duplicate markers
+pub fn check_duplicate_marker(input: &String) -> Result<(), Exit> {
+ let mut seen = std::collections::HashSet::new();
+ let heading_re = Regex::new(r"^(#{1,5})\s+(.+)$").unwrap();
+
+ for line in input.lines() {
+ if let Some(caps) = heading_re.captures(line) {
+ let heading_text = caps[2].trim().to_string();
+ if seen.contains(&heading_text) {
+ return Err(Exit::DuplicateMarker(heading_text));
+ }
+ seen.insert(heading_text);
+ }
+ }
+
+ Ok(())
+}
+
/// Clean Markdown
/// 1. Remove blockquotes
/// 2. Remove empty lines
@@ -916,3 +1001,274 @@ pub fn strip_invalid_jump(input: String) -> Result<String, Exit> {
Ok(result_lines.join("\n"))
}
+
+/// Convert image lines to code lines
+pub fn convert_image_to_code(input: String) -> Result<String, Exit> {
+ let mut result = String::new();
+ let lines: Vec<&str> = input.lines().collect();
+ let image_re = Regex::new(r"^!\[[^\]]*\]\(([^)]+)\)$").unwrap();
+
+ for line in lines {
+ if let Some(caps) = image_re.captures(line) {
+ let image_path = caps.get(1).unwrap().as_str();
+ result.push_str(&format!("`image \"{}\"`\n", image_path));
+ } else {
+ result.push_str(line);
+ result.push('\n');
+ }
+ }
+
+ // Remove trailing newline if present
+ if result.ends_with('\n') {
+ result.pop();
+ }
+
+ Ok(result)
+}
+
+/// Apply code lines to sentences
+pub fn apply_code_lines(input: String) -> Result<String, Exit> {
+ let mut out = String::new();
+ let lines: Vec<&str> = input.lines().collect();
+
+ let mut i = 0;
+ while i < lines.len() {
+ let line = lines[i];
+
+ if !line.trim_start().starts_with('`') {
+ out.push_str(line);
+ out.push('\n');
+ i += 1;
+ continue;
+ }
+
+ let mut code_buf = String::new();
+ while i < lines.len() && {
+ let line: &str = lines[i];
+ line.trim_start().starts_with('`')
+ } {
+ code_buf.push_str(lines[i].trim());
+ i += 1;
+ }
+
+ if i >= lines.len()
+ || !{
+ let line: &str = lines[i];
+ line.trim_start().starts_with('[')
+ }
+ {
+ continue;
+ }
+
+ if i + 1 < lines.len() && {
+ let line: &str = lines[i + 1];
+ line.trim_start().starts_with('[')
+ } {
+ continue;
+ }
+
+ let merged = helper_merge_code_into_sentence(&code_buf, lines[i]);
+ out.push_str(&merged);
+ out.push('\n');
+ i += 1;
+ }
+
+ Ok(out)
+}
+
/// Insert `code` directly after the first `:[` of `sentence`.
///
/// The insertion happens only when a `]` occurs somewhere after that `:[`;
/// otherwise `sentence` is returned unchanged.
fn helper_merge_code_into_sentence(code: &str, sentence: &str) -> String {
    match sentence.find(":[") {
        Some(marker) if sentence[marker + 2..].contains(']') => {
            let (head, tail) = sentence.split_at(marker + 2);
            format!("{head}{code}{tail}")
        }
        _ => sentence.to_string(),
    }
}
+
+/// Split sentences into embeddable tokens and perform Unicode encoding
+pub fn split_sentence_and_encode(input: String) -> Result<String, Exit> {
+ let mut result = String::new();
+ let lines: Vec<&str> = input.lines().collect();
+
+ for line in lines {
+ if line.starts_with('[') && line.contains("]:[") && line.contains("]->[") {
+ if let Some(start) = line.find("]:[") {
+ if let Some(end) = line.find("]->[") {
+ let content = &line[start + 3..end];
+ let processed_content = helper_process_sentence_content(content);
+
+ let suffix = &line[end + 1..];
+
+ let char_end = start;
+ let char_start = 1;
+ let character = &line[char_start..char_end];
+ let encoded_character = helper_encode_unicode(character);
+
+ // Build the new line with encoded character and processed content
+ let new_line =
+ format!("[{}]:{}{}", encoded_character, processed_content, suffix);
+ result.push_str(&format!("{}\n", new_line));
+ continue;
+ }
+ }
+ }
+ result.push_str(&format!("{}\n", line));
+ }
+
+ if result.ends_with('\n') {
+ result.pop();
+ }
+
+ Ok(result)
+}
+
/// Split sentence content into tagged, Unicode-encoded segments.
///
/// The content is scanned left to right and emitted as a sequence of
/// `[tag:[payload]]` segments, where `tag` is one of `text`, `bold`, `italic`,
/// `bold_italic` or `code`:
///
/// * A backtick opens a code span and the next backtick closes it. The
///   payload keeps both backticks, and `*` inside a code span is literal.
///   A code span left open at the end of the content is still emitted.
/// * `**` toggles bold and a single `*` toggles italic. Text is tagged with
///   the emphasis state in effect where it appears, so text that is both bold
///   and italic is tagged `bold_italic` whether or not the span is closed.
/// * Text immediately before a code span keeps its emphasis tag.
///
/// Empty segments are never emitted. Payloads are passed through
/// `helper_encode_unicode`.
fn helper_process_sentence_content(content: &str) -> String {
    /// Tag name for the given emphasis state.
    fn style_tag(bold: bool, italic: bool) -> &'static str {
        match (bold, italic) {
            (true, true) => "bold_italic",
            (true, false) => "bold",
            (false, true) => "italic",
            (false, false) => "text",
        }
    }

    /// Append `[tag:[encoded buf]]` to `out` and clear `buf`; no-op when empty.
    fn flush_segment(out: &mut String, tag: &str, buf: &mut String) {
        if buf.is_empty() {
            return;
        }
        out.push('[');
        out.push_str(tag);
        out.push_str(":[");
        out.push_str(&helper_encode_unicode(buf));
        out.push_str("]]");
        buf.clear();
    }

    let mut result = String::new();
    let mut text_buf = String::new();
    let mut code_buf = String::new();
    let mut in_code = false;
    let mut in_bold = false;
    let mut in_italic = false;

    let mut chars = content.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            // Closing backtick: emit the span including both backticks.
            '`' if in_code => {
                code_buf.push(ch);
                flush_segment(&mut result, "code", &mut code_buf);
                in_code = false;
            }
            // Opening backtick: pending text ends here with its current style.
            '`' => {
                flush_segment(&mut result, style_tag(in_bold, in_italic), &mut text_buf);
                code_buf.push(ch);
                in_code = true;
            }
            // Emphasis marker outside code: close the pending text with the
            // style it was written in, then toggle bold (`**`) or italic (`*`).
            '*' if !in_code => {
                flush_segment(&mut result, style_tag(in_bold, in_italic), &mut text_buf);
                if chars.next_if_eq(&'*').is_some() {
                    in_bold = !in_bold;
                } else {
                    in_italic = !in_italic;
                }
            }
            _ if in_code => code_buf.push(ch),
            _ => text_buf.push(ch),
        }
    }

    // Unterminated code span, then any trailing text.
    flush_segment(&mut result, "code", &mut code_buf);
    flush_segment(&mut result, style_tag(in_bold, in_italic), &mut text_buf);

    result
}

/// Escape every non-ASCII character of `s`.
///
/// ASCII characters are copied unchanged. Any other character is written as
/// `\u` followed by its scalar value in uppercase hexadecimal, without
/// zero-padding (for example `é` becomes `\uE9`).
fn helper_encode_unicode(s: &str) -> String {
    s.chars()
        .fold(String::with_capacity(s.len()), |mut encoded, ch| {
            if ch.is_ascii() {
                encoded.push(ch);
            } else {
                encoded.push_str(&format!("\\u{:X}", ch as u32));
            }
            encoded
        })
}
diff --git a/parser/src/utils.rs b/parser/src/utils.rs
new file mode 100644
index 0000000..0fbb516
--- /dev/null
+++ b/parser/src/utils.rs
@@ -0,0 +1 @@
+pub mod path_fmt;
diff --git a/parser/src/utils/path_fmt.rs b/parser/src/utils/path_fmt.rs
new file mode 100644
index 0000000..8750db6
--- /dev/null
+++ b/parser/src/utils/path_fmt.rs
@@ -0,0 +1,123 @@
+use std::path::{Path, PathBuf};
+
+/// Normalize an input path string into a canonical, platform‑agnostic form.
+///
+/// This function removes ANSI escape sequences, unifies separators to `/`,
+/// collapses duplicate slashes, strips unfriendly characters (`*`, `?`, `"`, `<`, `>`, `|`),
+/// resolves simple `..` components, and preserves a trailing slash when present.
+///
+/// See examples below for the exact normalization behavior.
+///
+/// # Examples
+///
+/// ```
+/// # use string_proc::format_path::format_path_str;
+/// use std::io::Error;
+///
+/// # fn main() -> Result<(), Error> {
+/// assert_eq!(format_path_str("C:\\Users\\\\test")?, "C:/Users/test");
+/// assert_eq!(
+/// format_path_str("/path/with/*unfriendly?chars")?,
+/// "/path/with/unfriendlychars"
+/// );
+/// assert_eq!(format_path_str("\x1b[31m/path\x1b[0m")?, "/path");
+/// assert_eq!(format_path_str("/home/user/dir/")?, "/home/user/dir/");
+/// assert_eq!(
+/// format_path_str("/home/user/file.txt")?,
+/// "/home/user/file.txt"
+/// );
+/// assert_eq!(
+/// format_path_str("/home/my_user/DOCS/JVCS_TEST/Workspace/../Vault/")?,
+/// "/home/my_user/DOCS/JVCS_TEST/Vault/"
+/// );
+/// assert_eq!(format_path_str("./home/file.txt")?, "home/file.txt");
+/// assert_eq!(format_path_str("./home/path/")?, "home/path/");
+/// assert_eq!(format_path_str("./")?, "");
+/// # Ok(())
+/// # }
+/// ```
+pub fn format_path_str(path: impl Into<String>) -> Result<String, std::io::Error> {
+ let path_str = path.into();
+ let ends_with_slash = path_str.ends_with('/');
+
+ // ANSI Strip
+ let cleaned = strip_ansi_escapes::strip(&path_str);
+ let path_without_ansi = String::from_utf8(cleaned)
+ .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
+
+ let path_with_forward_slash = path_without_ansi.replace('\\', "/");
+ let mut result = String::new();
+ let mut prev_char = '\0';
+
+ for c in path_with_forward_slash.chars() {
+ if c == '/' && prev_char == '/' {
+ continue;
+ }
+ result.push(c);
+ prev_char = c;
+ }
+
+ let unfriendly_chars = ['*', '?', '"', '<', '>', '|'];
+ result = result
+ .chars()
+ .filter(|c| !unfriendly_chars.contains(c))
+ .collect();
+
+ // Handle ".." path components
+ let path_buf = PathBuf::from(&result);
+ let normalized_path = normalize_path(&path_buf);
+ result = normalized_path.to_string_lossy().replace('\\', "/");
+
+ // Restore trailing slash if original path had one
+ if ends_with_slash && !result.ends_with('/') {
+ result.push('/');
+ }
+
+ // Special case: when result is only "./", return ""
+ if result == "./" {
+ return Ok(String::new());
+ }
+
+ Ok(result)
+}
+
/// Normalize a path lexically by resolving `.` and `..` components, without
/// touching the file system.
///
/// * `.` components are dropped.
/// * `..` removes the preceding normal component.
/// * `..` directly after the root (or a prefix) is dropped, since the root has
///   no parent; the root itself is kept.
/// * `..` with nothing left to remove is kept, so a relative path that climbs
///   above its starting directory still points at the same location
///   (`../x` stays `../x`).
///
/// Returns `.` when no component remains.
fn normalize_path(path: &Path) -> PathBuf {
    use std::path::Component;

    let mut parts: Vec<Component<'_>> = Vec::new();

    for component in path.components() {
        match component {
            // Skip current directory components
            Component::CurDir => {}
            Component::ParentDir => match parts.last() {
                Some(Component::Normal(_)) => {
                    parts.pop();
                }
                // The parent of the root is the root; never pop it.
                Some(Component::RootDir | Component::Prefix(_)) => {}
                // Nothing to cancel against: keep the `..` itself.
                Some(Component::ParentDir | Component::CurDir) | None => parts.push(component),
            },
            Component::Prefix(_) | Component::RootDir | Component::Normal(_) => {
                parts.push(component);
            }
        }
    }

    if parts.is_empty() {
        PathBuf::from(".")
    } else {
        parts.iter().collect()
    }
}
+
+/// Format a [`PathBuf`] into its canonical string form and convert it back.
+///
+/// This is a convenience wrapper around [`format_path_str`], preserving
+/// the semantics of [`PathBuf`] while applying the same normalization rules:
+/// - normalize separators to `/`
+/// - remove duplicated separators
+/// - strip ANSI escape sequences
+/// - remove unfriendly characters (`*`, `?`, etc.)
+/// - resolve simple `..` segments
+pub fn format_path(path: impl Into<PathBuf>) -> Result<PathBuf, std::io::Error> {
+ let path_str = format_path_str(path.into().display().to_string())?;
+ Ok(PathBuf::from(path_str))
+}