#!/usr/bin/env python3
"""Print a directory tree annotated with the net text count of each entry.

"Net text" is the number of CJK ideographs, ASCII letters and ASCII digits in
a UTF-8 file after leading Markdown markers have been stripped from each
line.  Entries whose name starts with "." or "_" are ignored everywhere.
"""

import os
import re
from typing import List, NamedTuple, Optional, Tuple

# Markdown markers removed from the start of every line, applied in this
# order (so "## 1. Title" loses both the heading marks and the list number).
_LINE_PREFIX_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (r"^#+\s*", r"^-\s*", r"^\*\s*", r"^\d+\.\s*", r"^>\s*")
)

# Characters that count as net text: CJK Extension A, CJK Unified Ideographs,
# CJK Compatibility Ideographs, the supplementary-plane ideograph extensions,
# ASCII letters and ASCII digits.  Punctuation and whitespace never match.
_NET_CHAR_PATTERN = re.compile(
    r"[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff"
    r"\U00020000-\U000323af"
    r"A-Za-z0-9]"
)


class _Node(NamedTuple):
    """One visible entry of a scanned directory."""

    name: str
    count: int  # net text of the file, or of everything below the directory
    children: Optional[List["_Node"]]  # None marks a regular file


def is_excluded(name, is_dir):
    """Return True when an entry must be left out of the tree.

    Hidden entries (leading ".") and entries with a leading "_" are excluded.

    Args:
        name: Base name of the file or directory.
        is_dir: Whether the entry is a directory.  Currently unused: files
            and directories follow the same rule.
    """
    return name.startswith((".", "_"))


def count_net_text(file_path):
    """Count the net text characters of a UTF-8 text file.

    Rules:
        1. Leading Markdown markers are removed from every line
           ("#" headings, "-" and "*" bullets, "1." numbering, ">" quotes).
        2. ASCII punctuation, CJK punctuation and whitespace are ignored.
        3. Only CJK ideographs, ASCII letters and ASCII digits are counted.

    Args:
        file_path: Path of the file to measure.

    Returns:
        The number of counted characters, or 0 when the path is not a regular
        file, cannot be read, or is not valid UTF-8.
    """
    if not os.path.isfile(file_path):
        return 0
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            content = handle.read()
    except (OSError, UnicodeError):
        # Best effort: unreadable or non-UTF-8 files contribute nothing.
        return 0

    total = 0
    for line in content.split("\n"):
        for pattern in _LINE_PREFIX_PATTERNS:
            line = pattern.sub("", line)
        total += len(_NET_CHAR_PATTERN.findall(line))
    return total


def _scan_dir(dir_path):
    """Read dir_path once and return its visible entries as sorted nodes.

    Every file is read exactly once; directory counts are the sum of their
    children.  Entries that are neither a regular file nor a directory are
    skipped.  A directory that may not be listed yields no nodes.
    """
    try:
        names = sorted(os.listdir(dir_path))
    except PermissionError:
        return []

    nodes = []
    for name in names:
        path = os.path.join(dir_path, name)
        is_dir = os.path.isdir(path)
        if is_excluded(name, is_dir):
            continue
        if os.path.isfile(path):
            nodes.append(_Node(name, count_net_text(path), None))
        elif is_dir:
            children = _scan_dir(path)
            subtotal = sum(child.count for child in children)
            nodes.append(_Node(name, subtotal, children))
    return nodes


def _render(nodes, prefix):
    """Print nodes as tree lines, each one starting with prefix."""
    last_index = len(nodes) - 1
    for index, node in enumerate(nodes):
        is_last = index == last_index
        connector = "└── " if is_last else "├── "
        if node.children is None:
            print(f"{prefix}{connector}{node.name}(净文本:{node.count}字)")
            continue
        print(f"{prefix}{connector}{node.name} (净文本:{node.count}字)")
        # Guides are four columns wide so children line up under the
        # four-column connectors of their parent.
        _render(node.children, prefix + ("    " if is_last else "│   "))


def _count_entries(nodes) -> Tuple[int, int]:
    """Return (directories, files) contained in nodes, recursively."""
    dir_count = 0
    file_count = 0
    for node in nodes:
        if node.children is None:
            file_count += 1
        else:
            sub_dirs, sub_files = _count_entries(node.children)
            dir_count += 1 + sub_dirs
            file_count += sub_files
    return dir_count, file_count


def calc_dir_total(dir_path):
    """Return the net text count of everything below dir_path, recursively.

    Excluded entries are ignored; a directory that may not be listed
    (PermissionError) counts as 0.
    """
    return sum(node.count for node in _scan_dir(dir_path))


def print_tree(dir_path, prefix=""):
    """Print the tree below dir_path with per-entry net text counts.

    Args:
        dir_path: Directory whose visible entries are printed.
        prefix: Text placed in front of every printed line; nested levels
            extend it with the vertical guides.
    """
    _render(_scan_dir(dir_path), prefix)


def main():
    """Print the annotated tree and a summary for the script's base directory.

    The base directory is the parent of the directory containing this script.
    The tree, the grand total and the directory/file summary all come from a
    single scan, so they always describe the same set of entries.
    """
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    nodes = _scan_dir(base_dir)
    grand_total = sum(node.count for node in nodes)
    total_dirs, total_files = _count_entries(nodes)

    print(".")
    print(f"(净文本总计:{grand_total}字)")
    _render(nodes, "")
    print(f"\n{total_dirs} directories, {total_files} files")


if __name__ == "__main__":
    main()