diff options
Diffstat (limited to '_工具/字数统计.py')
| -rwxr-xr-x | _工具/字数统计.py | 170 |
1 files changed, 170 insertions, 0 deletions
#!/usr/bin/env python3
"""Print a directory tree annotated with net-text character counts.

"Net text" means CJK ideographs, ASCII letters and ASCII digits. Markdown
line markers, punctuation and whitespace are not counted. Entries whose
name starts with "." or "_" are skipped everywhere.
"""
import os
import re

# Leading Markdown decoration of a single line: any mix of whitespace and
# heading / bullet / quote markers, optionally followed by ONE list ordinal
# such as "12.".  The negative lookahead keeps decimals like "3.14" intact.
# Only the ordinal actually affects the count (the other markers are
# punctuation and would be dropped anyway), but it may sit behind
# indentation or a quote marker, so the whole prefix is matched at once.
_LINE_MARKER_RE = re.compile(r"^[\s#>*-]*(?:\d+\.(?!\d))?")

# Characters that count as net text.
_NET_CHAR_RE = re.compile(
    "["
    r"\u3400-\u4dbf"  # CJK Unified Ideographs Extension A
    r"\u4e00-\u9fff"  # CJK Unified Ideographs
    r"\uf900-\ufaff"  # CJK Compatibility Ideographs
    r"\U00020000-\U000323af"  # supplementary-plane CJK extensions
    "A-Za-z0-9"
    "]"
)

# Tree drawing pieces; both indents are as wide as the connectors so that
# nested levels line up under their parent.
_BRANCH = "├── "
_LAST_BRANCH = "└── "
_PIPE_INDENT = "│   "
_BLANK_INDENT = "    "


def is_excluded(name, is_dir):
    """Return True when an entry must be skipped.

    Hidden entries (leading ".") and entries with a leading "_" are
    excluded. ``is_dir`` is accepted for interface compatibility; the rule
    is the same for files and directories.
    """
    return name.startswith((".", "_"))


def count_net_text(file_path):
    """Return the number of net-text characters in a file.

    Rules:
      1. Leading Markdown markers (#, -, *, >, list ordinals) are removed
         from every line, including indented and quoted ordinals.
      2. Punctuation and whitespace are ignored.
      3. Only CJK ideographs, ASCII letters and ASCII digits are counted.

    Returns 0 when the path is not a regular file, cannot be read, or is
    not valid UTF-8 (e.g. binary files); counting is best-effort.
    """
    if not os.path.isfile(file_path):
        return 0

    try:
        # utf-8-sig drops a leading BOM, which would otherwise hide the
        # first line's markers from the start-of-line pattern.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        return 0

    return sum(
        len(_NET_CHAR_RE.findall(_LINE_MARKER_RE.sub("", line, count=1)))
        for line in content.split("\n")
    )


def _scan_dir(dir_path, _ancestors=frozenset()):
    """Scan a directory once and return ``(nodes, total)``.

    Each node is ``(name, count, children)``: ``children`` is None for a
    file and a list of nodes for a directory. ``total`` is the sum of the
    node counts. Unreadable directories yield ``([], 0)``. A directory whose
    real path is already one of its own ancestors (symlink cycle) is not
    descended into again.
    """
    real_path = os.path.realpath(dir_path)
    if real_path in _ancestors:
        return [], 0
    ancestors = _ancestors | {real_path}

    try:
        names = sorted(os.listdir(dir_path))
    except OSError:
        return [], 0

    nodes = []
    total = 0
    for name in names:
        path = os.path.join(dir_path, name)
        is_dir = os.path.isdir(path)
        if is_excluded(name, is_dir):
            continue
        if os.path.isfile(path):
            count, children = count_net_text(path), None
        elif is_dir:
            children, count = _scan_dir(path, ancestors)
        else:
            # Broken symlinks, sockets, ...: never displayed, so they must
            # not influence which entry is considered the last one.
            continue
        nodes.append((name, count, children))
        total += count
    return nodes, total


def _render(nodes, prefix):
    """Print scanned nodes as a tree, one line per entry."""
    last_index = len(nodes) - 1
    for index, (name, count, children) in enumerate(nodes):
        is_last = index == last_index
        connector = _LAST_BRANCH if is_last else _BRANCH
        if children is None:
            print(f"{prefix}{connector}{name}(净文本:{count}字)")
        else:
            print(f"{prefix}{connector}{name} (净文本:{count}字)")
            _render(children, prefix + (_BLANK_INDENT if is_last else _PIPE_INDENT))


def _count_entries(nodes):
    """Return ``(directories, files)`` contained in a scanned tree."""
    dirs = 0
    files = 0
    for _name, _count, children in nodes:
        if children is None:
            files += 1
        else:
            sub_dirs, sub_files = _count_entries(children)
            dirs += 1 + sub_dirs
            files += sub_files
    return dirs, files


def calc_dir_total(dir_path):
    """Return the net-text total of a directory, including subdirectories."""
    return _scan_dir(dir_path)[1]


def print_tree(dir_path, prefix=""):
    """Print the tree below ``dir_path``; every line starts with ``prefix``."""
    nodes, _total = _scan_dir(dir_path)
    _render(nodes, prefix)


def main():
    """Report on the directory that contains this script's own directory."""
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # One scan feeds the grand total, the tree and the summary line, so
    # every file is read exactly once and all three always agree.
    nodes, grand_total = _scan_dir(base_dir)
    total_dirs, total_files = _count_entries(nodes)

    print(".")
    print(f"(净文本总计:{grand_total}字)")
    _render(nodes, "")
    print(f"\n{total_dirs} directories, {total_files} files")


if __name__ == "__main__":
    main()
