summaryrefslogtreecommitdiff
path: root/_工具/字数统计.py
blob: 81c782f77609b2d51d8d7186d81def018c5a897c (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python3
import os
import re


def is_excluded(name, is_dir):
    """Report whether an entry must be left out of the statistics.

    An entry is excluded when its name begins with a dot (hidden
    files/directories) or with an underscore.  ``is_dir`` is accepted so
    that callers can pass what they know about the entry, but it does not
    influence the result.
    """
    return name.startswith((".", "_"))


def count_net_text(file_path):
    """Count the "net text" characters of a UTF-8 text file.

    Rules:
    1. An ordered-list number at the start of a line (``1.``) is not
       counted.  It is recognised at column 0, when indented (nested
       lists), inside a block quote (``> 1.``) and after a heading or
       bullet marker (``# 1.``, ``- 1.``).
    2. ASCII punctuation, CJK punctuation, whitespace and all other
       symbols (including the Markdown markers ``#``, ``-``, ``*``, ``>``)
       are not counted.
    3. Only CJK ideographs, ASCII letters and ASCII digits are counted.

    Returns 0 when ``file_path`` is not a regular file, cannot be read, or
    is not valid UTF-8.
    """
    if not os.path.isfile(file_path):
        return 0

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, ValueError):
        # Best effort: unreadable or non-UTF-8 (e.g. binary) files simply
        # contribute nothing.  UnicodeDecodeError is a ValueError.
        return 0

    # The non-numeric Markdown markers are dropped by the character filter
    # below, so the only prefix that has to be removed explicitly is the
    # ordered-list number, whose digits would otherwise be counted.  The
    # optional parts allow for indentation, block quotes and a preceding
    # heading/bullet marker.
    list_number = re.compile(
        r"^\s*(?:>\s*)*(?:#+\s*)?(?:-\s*)?(?:\*\s*)?\d+\.\s*"
    )
    text = "".join(list_number.sub("", line) for line in content.split("\n"))

    # Keep only CJK ideographs (unified block, Extension A, compatibility
    # ideographs and the supplementary-plane extensions), ASCII letters and
    # ASCII digits.
    net_chars = re.findall(
        r"[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff"
        r"\U00020000-\U0002fa1f\U00030000-\U000323af"
        r"A-Za-z0-9]",
        text,
    )

    return len(net_chars)


def calc_dir_total(dir_path):
    """Return the net-text total of a directory, subdirectories included.

    Entries rejected by ``is_excluded`` are skipped.  A directory that
    cannot be listed because of missing permissions contributes 0.
    """
    try:
        names = os.listdir(dir_path)
    except PermissionError:
        return 0
    names.sort()

    running = 0
    for name in names:
        full_path = os.path.join(dir_path, name)
        entry_is_dir = os.path.isdir(full_path)
        if not is_excluded(name, entry_is_dir):
            if os.path.isfile(full_path):
                running += count_net_text(full_path)
            elif entry_is_dir:
                # Recurse so nested directories are part of the total.
                running += calc_dir_total(full_path)
    return running


def print_tree(dir_path, prefix=""):
    """Print the contents of ``dir_path`` as a tree, recursively.

    Each file line shows the file's net-text count and each directory line
    shows the recursive total of that directory.  ``prefix`` is the
    indentation already accumulated from the ancestor levels.  Entries
    rejected by ``is_excluded`` are omitted; a directory that cannot be
    listed because of missing permissions prints nothing.
    """
    try:
        names = sorted(os.listdir(dir_path))
    except PermissionError:
        return

    visible = [
        name
        for name in names
        if not is_excluded(name, os.path.isdir(os.path.join(dir_path, name)))
    ]
    last_index = len(visible) - 1

    for position, name in enumerate(visible):
        full_path = os.path.join(dir_path, name)

        # The last sibling closes the branch and leaves no vertical rule
        # for its descendants.
        if position == last_index:
            branch, indent = "└── ", "    "
        else:
            branch, indent = "├── ", "│   "

        if os.path.isfile(full_path):
            print(f"{prefix}{branch}{name}(净文本:{count_net_text(full_path)}字)")
        elif os.path.isdir(full_path):
            print(f"{prefix}{branch}{name} (净文本:{calc_dir_total(full_path)}字)")
            print_tree(full_path, prefix + indent)


def main():
    """Print the net-text report for the parent of this script's directory.

    The output is a ``.`` line, the grand total, the tree of all
    non-excluded entries with their counts, and a final
    ``N directories, M files`` summary line.
    """
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # List the root up front: unlike the helpers below, which swallow
    # PermissionError, a failure to read the root itself is raised here.
    root_items = [
        entry
        for entry in sorted(os.listdir(base_dir))
        if not is_excluded(entry, os.path.isdir(os.path.join(base_dir, entry)))
    ]

    # calc_dir_total applies the same filtering and file/directory handling
    # to the root that a hand-written loop over root_items would.
    grand_total = calc_dir_total(base_dir)

    # Count directories and files recursively, honouring the exclusions.
    total_dirs = 0
    total_files = 0
    for entry in root_items:
        entry_path = os.path.join(base_dir, entry)
        if os.path.isdir(entry_path):
            total_dirs += 1
            for _current_root, dirs, files in os.walk(entry_path):
                # In-place assignment prunes excluded directories from the walk.
                dirs[:] = [d for d in dirs if not is_excluded(d, True)]
                total_dirs += len(dirs)
                total_files += sum(1 for f in files if not is_excluded(f, False))
        elif os.path.isfile(entry_path):
            total_files += 1

    # Output
    print(".")
    print(f"(净文本总计:{grand_total}字)")

    # With an empty prefix print_tree emits exactly the root-level lines.
    print_tree(base_dir)

    print(f"\n{total_dirs} directories, {total_files} files")


# Run the report only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()