summaryrefslogtreecommitdiff
path: root/_工具
diff options
context:
space:
mode:
Diffstat (limited to '_工具')
-rwxr-xr-x_工具/字数统计.py170
1 files changed, 170 insertions, 0 deletions
diff --git a/_工具/字数统计.py b/_工具/字数统计.py
new file mode 100755
index 0000000..81c782f
--- /dev/null
+++ b/_工具/字数统计.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+import os
+import re
+
+
def is_excluded(name, is_dir):
    """Return True when an entry must be left out of the statistics.

    An entry is excluded when its name starts with a dot (hidden entry) or
    with an underscore.

    Args:
        name: Base name of the file or directory (not a full path).
        is_dir: Whether the entry is a directory. Accepted for interface
            compatibility; it does not influence the result.

    Returns:
        True if the entry is excluded, False otherwise.
    """
    # str.startswith accepts a tuple, so both prefixes are tested in one call.
    return name.startswith((".", "_"))
+
+
# Leading Markdown markers, applied to every line in this order. Each pattern
# tolerates leading indentation so that nested list items are handled as well.
_MARKDOWN_PREFIXES = tuple(
    re.compile(pattern)
    for pattern in (
        r"^\s*#+\s*",      # headings
        r"^\s*-\s*",       # bullet list (dash)
        r"^\s*\*\s*",      # bullet list (asterisk)
        r"^\s*\d+\.\s*",   # ordered list number
        r"^\s*>\s*",       # block quote
    )
)

# Characters that count as net text: CJK ideographs, ASCII letters and digits.
_NET_CHAR = re.compile(
    "["
    r"\u4e00-\u9fff"          # CJK Unified Ideographs
    r"\u3400-\u4dbf"          # CJK Extension A
    r"\uf900-\ufaff"          # CJK Compatibility Ideographs
    r"\U00020000-\U000323af"  # supplementary-plane ideographs (Extension B onward)
    r"A-Za-z0-9"
    "]"
)


def count_net_text(file_path):
    """Count the net text characters of a UTF-8 text file.

    Rules:
        1. Markdown markers at the start of a line (``#``, ``-``, ``*``,
           ordered-list numbers, ``>``) are removed, with or without leading
           indentation.
        2. ASCII punctuation, Chinese punctuation and whitespace are ignored.
        3. Only CJK ideographs (including the extension blocks), ASCII letters
           and ASCII digits are counted.

    Args:
        file_path: Path of the file to measure.

    Returns:
        The number of net text characters. Returns 0 when the path is not a
        regular file, cannot be read, or is not valid UTF-8.
    """
    if not os.path.isfile(file_path):
        return 0

    try:
        # utf-8-sig also reads plain UTF-8 and drops a leading BOM, which would
        # otherwise sit in front of the first line's Markdown marker.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            content = f.read()
    except (OSError, ValueError):
        # Best effort: unreadable or non-UTF-8 files (UnicodeDecodeError is a
        # ValueError) simply contribute nothing to the total.
        return 0

    total = 0
    for line in content.split("\n"):
        for marker in _MARKDOWN_PREFIXES:
            line = marker.sub("", line, count=1)
        # Counting per line equals counting the joined text, because a newline
        # is never a net character.
        total += len(_NET_CHAR.findall(line))
    return total
+
+
def calc_dir_total(dir_path):
    """Return the net text count of a directory, including its subdirectories.

    Entries rejected by ``is_excluded`` are skipped. Regular files contribute
    their ``count_net_text`` value and directories contribute their own
    recursive total; anything else contributes nothing.

    Args:
        dir_path: Directory to measure.

    Returns:
        The summed net text count, or 0 when the directory cannot be listed
        because of missing permissions.
    """
    try:
        names = os.listdir(dir_path)
    except PermissionError:
        return 0
    names.sort()

    def entry_total(name):
        # Net text contributed by one directory entry.
        child = os.path.join(dir_path, name)
        child_is_dir = os.path.isdir(child)
        if is_excluded(name, child_is_dir):
            return 0
        if os.path.isfile(child):
            return count_net_text(child)
        return calc_dir_total(child) if child_is_dir else 0

    return sum(entry_total(name) for name in names)
+
+
def _tree_lines(dir_path, prefix):
    """Return ``(total, lines)`` for the visible entries below ``dir_path``.

    ``total`` is the net text count of everything below ``dir_path`` and
    ``lines`` are the rendered tree lines in output order. Both are produced
    in one traversal so every file is read only once.
    """
    try:
        names = sorted(os.listdir(dir_path))
    except PermissionError:
        # Unreadable directory: no lines, and it contributes nothing.
        return 0, []

    visible = [
        name
        for name in names
        if not is_excluded(name, os.path.isdir(os.path.join(dir_path, name)))
    ]

    total = 0
    lines = []
    last_index = len(visible) - 1
    for idx, entry in enumerate(visible):
        is_last = idx == last_index
        entry_path = os.path.join(dir_path, entry)
        connector = "└── " if is_last else "├── "

        if os.path.isfile(entry_path):
            cnt = count_net_text(entry_path)
            lines.append(f"{prefix}{connector}{entry}(净文本:{cnt}字)")
            total += cnt
        elif os.path.isdir(entry_path):
            # NOTE(review): these continuation prefixes are narrower than the
            # 4-column connectors, so nested levels do not line up — confirm
            # the intended widths.
            child_prefix = prefix + (" " if is_last else "│ ")
            sub_total, sub_lines = _tree_lines(entry_path, child_prefix)
            # The directory line needs the subtree total, so the subtree is
            # walked first and its lines are appended after the header line.
            lines.append(f"{prefix}{connector}{entry} (净文本:{sub_total}字)")
            lines.extend(sub_lines)
            total += sub_total

    return total, lines


def print_tree(dir_path, prefix=""):
    """Print the tree of ``dir_path`` with the net text count of each entry.

    Excluded entries (see ``is_excluded``) are omitted. Files show their own
    count and directories show the total of everything below them. The lines
    are gathered in a single traversal and printed afterwards, instead of
    recomputing each directory total with a separate recursive scan.

    Args:
        dir_path: Directory whose entries are printed.
        prefix: Text placed in front of every printed line (the indentation
            inherited from the parent levels).

    Returns:
        None. A directory that cannot be listed prints nothing.
    """
    _, lines = _tree_lines(dir_path, prefix)
    for line in lines:
        print(line)
+
+
def main():
    """Print the net text report for the directory above this script's folder.

    The report consists of the grand total, a tree with per-entry counts and a
    final ``N directories, M files`` summary line. Entries rejected by
    ``is_excluded`` are ignored everywhere.
    """
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Visible root entries; they drive the directory/file counters below.
    root_items = [
        entry
        for entry in sorted(os.listdir(base_dir))
        if not is_excluded(entry, os.path.isdir(os.path.join(base_dir, entry)))
    ]

    # The root is measured like any other directory instead of repeating the
    # per-entry summation here.
    grand_total = calc_dir_total(base_dir)

    # Count directories and files recursively.
    total_dirs = 0
    total_files = 0
    for entry in root_items:
        entry_path = os.path.join(base_dir, entry)
        if os.path.isdir(entry_path):
            total_dirs += 1
            for _current_root, dirs, files in os.walk(entry_path):
                # Pruning in place stops os.walk from descending into
                # excluded directories.
                dirs[:] = [d for d in dirs if not is_excluded(d, True)]
                total_dirs += len(dirs)
                total_files += sum(not is_excluded(f, False) for f in files)
        elif os.path.isfile(entry_path):
            total_files += 1

    # Output
    print(".")
    print(f"(净文本总计:{grand_total}字)")

    # The root level is rendered exactly like nested levels with an empty
    # prefix, so the shared renderer is reused.
    print_tree(base_dir)

    print(f"\n{total_dirs} directories, {total_files} files")
+
+
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()