#!/usr/bin/env python3
# xianyan/scripts/check_translation_coverage.py
# ============================================================
# 闲言APP — 多语言翻译覆盖率检测脚本
# 创建时间: 2026-06-01
# 更新时间: 2026-06-01
# 作用: CI/CD中自动检测14个语言文件的翻译覆盖率
# 上次更新: 初始创建
# ============================================================

import os
import sys
import re
import io
import json
import argparse
from pathlib import Path
from collections import OrderedDict
from datetime import datetime

# Replace sys.stdout / sys.stderr with UTF-8 text wrappers (errors="replace")
# whenever the stream's reported encoding is not exactly "utf-8".
# NOTE(review): presumably for consoles whose default encoding cannot print
# the Chinese text and emoji this script emits — confirm.
for _stream_name in ("stdout", "stderr"):
    _stream = getattr(sys, _stream_name)
    if _stream.encoding != "utf-8":
        setattr(
            sys,
            _stream_name,
            io.TextIOWrapper(_stream.buffer, encoding="utf-8", errors="replace"),
        )
del _stream_name, _stream

# Project root is the directory one level above this script's directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Directory holding one Dart file per language.
LANG_DIR = PROJECT_ROOT.joinpath("lib", "l10n", "languages")
# File whose keys are the reference every other language is compared to.
BASE_LANG_FILE = "zh_cn.dart"

# Language file name -> language identifier used in reports.
# Insertion order is the order in which files are processed.
LANGUAGE_MAP = OrderedDict(
    (
        ("zh_cn.dart", "zh_CN"),
        ("zh_tw.dart", "zh_TW"),
        ("en.dart", "en"),
        ("ja.dart", "ja"),
        ("ko.dart", "ko"),
        ("de.dart", "de"),
        ("fr.dart", "fr"),
        ("es.dart", "es"),
        ("it.dart", "it"),
        ("pt.dart", "pt"),
        ("ru.dart", "ru"),
        ("ar.dart", "ar"),
        ("bn.dart", "bn"),
        ("hi.dart", "hi"),
    )
)


def parse_dart_lang_file(filepath):
    """Parse a Dart language file into flattened translation sections.

    Scanning starts at the first ``= T(`` in the file and walks the
    constructor call that follows it. Arguments of the form
    ``name: Type(`` open a (possibly nested) section; arguments of the form
    ``name: '...'`` are single-quoted string fields.

    Args:
        filepath: Path of the Dart file to read (decoded as UTF-8).

    Returns:
        A ``(sections, field_paths, content)`` tuple where

        * ``sections`` maps each top-level section name to an ``OrderedDict``
          of ``field name -> raw literal text`` (quotes removed, escape
          sequences left untouched). Fields of nested constructors are
          flattened into their top-level section.
        * ``field_paths`` maps ``(top_section, field)`` to the list of
          constructor argument names leading from the top-level section down
          to the constructor that holds the field.
        * ``content`` is the raw file text.

        When ``= T(`` is not found, ``sections`` and ``field_paths`` are
        empty.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    sections = OrderedDict()
    field_paths = {}
    section_stack = []
    current_top = None

    eq_pos = content.find("= T(")
    if eq_pos == -1:
        return sections, field_paths, content

    # Compiled once and matched at an offset: slicing ``content[i:]`` for
    # every token would copy the rest of the file each time (quadratic).
    constructor_re = re.compile(r"(\w+):\s*\w+\s*\(")
    string_field_re = re.compile(r"(\w+):\s*'((?:[^'\\]|\\.)*)'")
    length = len(content)

    i = eq_pos + 4
    depth = 1  # already inside the parentheses of ``T(``

    while i < length and depth > 0:
        # Skip whitespace and argument separators.
        while i < length and content[i] in " \t\n\r,":
            i += 1
        if i >= length:
            break

        m = constructor_re.match(content, i)
        if m:
            name = m.group(1)
            if depth == 1:
                # A constructor directly inside T(...) starts a new section.
                current_top = name
                section_stack = [name]
                sections[current_top] = OrderedDict()
            else:
                section_stack.append(name)
            i = m.end()
            depth += 1
            continue

        m = string_field_re.match(content, i)
        if m:
            if current_top is not None:
                fname = m.group(1)
                sections[current_top][fname] = m.group(2)
                field_paths[(current_top, fname)] = list(section_stack)
            # Always consume the whole literal, even when it belongs to no
            # section: otherwise parentheses or quotes inside the string
            # would be scanned as structure and corrupt the depth tracking.
            i = m.end()
            continue

        if content[i] == ")":
            depth -= 1
            if depth == 1:
                # Back at the top level of T(...): the section is closed.
                current_top = None
                section_stack = []
            elif depth > 1 and len(section_stack) > 1:
                section_stack.pop()
            i += 1
            continue

        if content[i] == "(":
            depth += 1
            i += 1
            continue

        i += 1

    return sections, field_paths, content


def find_matching_paren(content, open_pos):
    """Return the index of the ``)`` that closes the ``(`` at ``open_pos``.

    Parentheses inside single-quoted strings are ignored, and a backslash
    causes the character that follows it to be skipped (inside or outside a
    string).

    Args:
        content: Text to scan.
        open_pos: Index of the opening parenthesis; scanning starts right
            after it.

    Returns:
        The index of the matching closing parenthesis, or ``-1`` when the
        text ends before one is found.
    """
    nesting = 1
    inside_quote = False
    skip_char = False

    for idx in range(open_pos + 1, len(content)):
        ch = content[idx]
        if skip_char:
            # Character following a backslash: never structural.
            skip_char = False
        elif ch == "\\":
            skip_char = True
        elif ch == "'":
            inside_quote = not inside_quote
        elif inside_quote:
            continue
        elif ch == "(":
            nesting += 1
        elif ch == ")":
            nesting -= 1
            if nesting == 0:
                return idx

    return -1


def find_constructor_ranges(content):
    """Locate every named constructor call inside the ``= T(...)`` expression.

    Args:
        content: Full text of a Dart language file.

    Returns:
        A dict mapping the dotted path of constructor argument names (for
        example ``"common"`` or ``"common.nested"``) to a dict with

        * ``open_pos``: index of the constructor's opening parenthesis,
        * ``close_pos``: index of its matching closing parenthesis,
        * ``indent``: leading whitespace of the line the constructor starts
          on plus two spaces (four spaces plus two when that line has no
          leading whitespace before the argument).

        Constructors whose closing parenthesis cannot be found are omitted.
        The dict is empty when ``= T(`` does not occur in ``content``.
    """
    ranges = {}
    section_stack = []

    eq_pos = content.find("= T(")
    if eq_pos == -1:
        return ranges

    # Compiled once and matched at an offset: slicing ``content[i:]`` for
    # every token would copy the rest of the file each time (quadratic).
    constructor_re = re.compile(r"(\w+):\s*\w+\s*\(")
    string_field_re = re.compile(r"(\w+):\s*'((?:[^'\\]|\\.)*)'")
    leading_ws_re = re.compile(r"(\s+)")
    length = len(content)

    i = eq_pos + 4
    depth = 1  # already inside the parentheses of ``T(``

    while i < length and depth > 0:
        # Skip whitespace and argument separators.
        while i < length and content[i] in " \t\n\r,":
            i += 1
        if i >= length:
            break

        m = constructor_re.match(content, i)
        if m:
            name = m.group(1)
            if depth == 1:
                section_stack = [name]
            else:
                section_stack.append(name)
            path = ".".join(section_stack)

            # Indentation for fields of this constructor: the whitespace that
            # precedes the argument on its line, two spaces deeper.
            line_start = content.rfind("\n", 0, i) + 1
            indent_match = leading_ws_re.match(content, line_start, i)
            indent = (indent_match.group(1) if indent_match else "    ") + "  "

            open_pos = m.end() - 1  # the regex ends on the "("
            close_pos = find_matching_paren(content, open_pos)

            if close_pos != -1:
                ranges[path] = {
                    "open_pos": open_pos,
                    "close_pos": close_pos,
                    "indent": indent,
                }

            i = m.end()
            depth += 1
            continue

        # String fields are skipped as a unit so their text is never scanned
        # for parentheses.
        m = string_field_re.match(content, i)
        if m:
            i = m.end()
            continue

        if content[i] == ")":
            depth -= 1
            if depth == 1:
                section_stack = []
            elif depth > 1 and len(section_stack) > 1:
                section_stack.pop()
            i += 1
            continue

        if content[i] == "(":
            depth += 1
            i += 1
            continue

        i += 1

    return ranges


def check_coverage(base_sections, target_sections):
    """Compare a target language's sections against the base language.

    Args:
        base_sections: Mapping ``section -> {field: value}`` of the base
            language.
        target_sections: Mapping of the same shape for the language being
            checked.

    Returns:
        A ``(missing, empty, extra)`` tuple of ``(section, field)`` lists:

        * ``missing``: base fields absent from the target,
        * ``empty``: fields present in the target with a falsy value while
          the base value is truthy,
        * ``extra``: target fields that the base does not define.
    """
    missing, empty = [], []
    for section, base_fields in base_sections.items():
        translated = target_sections.get(section, {})
        for field, base_value in base_fields.items():
            key = (section, field)
            if field not in translated:
                missing.append(key)
                continue
            # An empty translation only counts when the base has text.
            if not translated[field] and base_value:
                empty.append(key)

    extra = []
    for section, target_fields in target_sections.items():
        if section in base_sections:
            known = base_sections[section]
            extra.extend(
                (section, field) for field in target_fields if field not in known
            )
        else:
            # The whole section is unknown to the base language.
            extra.extend((section, field) for field in target_fields)

    return missing, empty, extra


def count_total_keys(sections):
    """Return the number of fields summed over every section."""
    return sum(map(len, sections.values()))


def generate_text_report(results, threshold):
    """Render the coverage results as a plain-text report.

    The report contains a summary table, a threshold verdict, a ranking and,
    when any language has missing or empty keys, a per-language key listing.
    Languages are ordered by coverage, highest first.

    Args:
        results: List of per-language result dicts (keys used here: ``lang``,
            ``file``, ``total``, ``translated``, ``missing_count``,
            ``empty_count``, ``coverage``, ``missing``, ``empty``).
        threshold: Minimum acceptable coverage percentage.

    Returns:
        A ``(report_text, below_threshold)`` tuple; ``below_threshold`` lists
        the ``lang`` of every entry under the threshold, except ``zh_CN``.
    """
    rule = "=" * 72
    ranked = sorted(results, key=lambda entry: entry["coverage"], reverse=True)

    header = f"{'语言':<8} {'总键数':>6} {'已翻译':>6} {'缺失':>6} {'空值':>6} {'覆盖率':>8}"
    out = [
        rule,
        "  闲言APP 多语言翻译覆盖率报告",
        f"  生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        rule,
        "",
        header,
        "-" * len(header),
    ]

    # Summary table, one row per language.
    below_threshold = []
    for entry in ranked:
        mark = " ✅" if entry["coverage"] >= threshold else " ❌"
        out.append(
            f"{entry['lang']:<8} {entry['total']:>6} {entry['translated']:>6} "
            f"{entry['missing_count']:>6} {entry['empty_count']:>6} "
            f"{entry['coverage']:>7.1f}%{mark}"
        )
        # The base language never counts against the threshold.
        if entry["coverage"] < threshold and entry["lang"] != "zh_CN":
            below_threshold.append(entry["lang"])

    out.append("")
    if below_threshold:
        out.append(f"⚠️  以下语言覆盖率低于阈值 {threshold}%: {', '.join(below_threshold)}")
    else:
        out.append(f"✅ 所有语言覆盖率均达到阈值 {threshold}%")

    # Ranking, with medals for the first three places.
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    out += ["", rule, "  覆盖率排名", rule]
    for place, entry in enumerate(ranked, 1):
        medal = medals.get(place, "  ")
        out.append(f"  {medal} #{place:<2} {entry['lang']:<8} {entry['coverage']:.1f}%")
    out.append("")

    # Per-language listing of missing / empty keys, only when there are any.
    with_details = [entry for entry in ranked if entry["missing"] or entry["empty"]]
    if with_details:
        out += [rule, "  缺失/空值键详情", rule]
        for entry in with_details:
            out.append("")
            out.append(f"── {entry['lang']} ({entry['file']}) ──")
            for label, keys in (("缺失键", entry["missing"]), ("空值键", entry["empty"])):
                if not keys:
                    continue
                out.append(f"  {label} ({len(keys)}):")
                for section, field in keys:
                    out.append(f"    - {section}.{field}")

    out.append("")
    return "\n".join(out), below_threshold


def generate_json_report(results, threshold):
    """Render the coverage results as a JSON document.

    Args:
        results: List of per-language result dicts (keys used here: ``lang``,
            ``total``, ``translated``, ``missing_count``, ``empty_count``,
            ``extra_count``, ``coverage``, ``missing``, ``empty``).
        threshold: Minimum acceptable coverage percentage.

    Returns:
        A ``(json_text, below_threshold)`` tuple; ``below_threshold`` lists
        the ``lang`` of every entry under the threshold, except ``zh_CN``.
        Ranking and per-language details are ordered by coverage, highest
        first, with coverage rounded to one decimal.
    """
    below_threshold = []
    for entry in results:
        # The base language never counts against the threshold.
        if entry["coverage"] < threshold and entry["lang"] != "zh_CN":
            below_threshold.append(entry["lang"])

    ranked = sorted(results, key=lambda entry: entry["coverage"], reverse=True)

    ranking = []
    languages = {}
    for place, entry in enumerate(ranked, 1):
        rounded = round(entry["coverage"], 1)
        ranking.append({"rank": place, "lang": entry["lang"], "coverage": rounded})
        languages[entry["lang"]] = {
            "total": entry["total"],
            "translated": entry["translated"],
            "missing_count": entry["missing_count"],
            "empty_count": entry["empty_count"],
            "extra_count": entry["extra_count"],
            "coverage": rounded,
            "missing_keys": [f"{section}.{field}" for section, field in entry["missing"]],
            "empty_keys": [f"{section}.{field}" for section, field in entry["empty"]],
        }

    payload = {
        "generated_at": datetime.now().isoformat(),
        "threshold": threshold,
        "below_threshold": below_threshold,
        "ranking": ranking,
        "languages": languages,
    }
    return json.dumps(payload, ensure_ascii=False, indent=2), below_threshold


def fix_missing_keys(target_filepath, missing_keys, base_sections, base_field_paths, target_content):
    """Append missing keys to a target language file using base-language text.

    Each missing field is added at the end of the constructor call it belongs
    to, preceded by a ``// TODO: translate`` comment. Fields whose
    constructor cannot be found in the target file are skipped.

    Args:
        target_filepath: Path of the Dart file to update (written as UTF-8).
        missing_keys: Iterable of ``(section, field)`` pairs to add.
        base_sections: Base-language mapping ``section -> {field: value}``;
            supplies the placeholder value of every added field.
        base_field_paths: Mapping ``(section, field) -> [constructor names]``
            from the base language; falls back to ``[section]``.
        target_content: Current text of the target file.

    Returns:
        A ``(fixed_count, skipped_count)`` tuple.
    """
    constructor_ranges = find_constructor_ranges(target_content)

    # Group the missing fields by the dotted path of their constructor.
    keys_by_path = OrderedDict()
    for section, field in missing_keys:
        path = ".".join(base_field_paths.get((section, field), [section]))
        keys_by_path.setdefault(path, []).append((field, base_sections[section][field]))

    replacements = []
    fixed_count = 0
    skipped_count = 0

    for path, fields in keys_by_path.items():
        range_info = constructor_ranges.get(path)
        if range_info is None:
            skipped_count += len(fields)
            continue

        open_pos = range_info["open_pos"]
        close_pos = range_info["close_pos"]
        indent = range_info["indent"]

        new_fields = "".join(
            f"{indent}// TODO: translate\n{indent}{field}: '{value}',\n"
            for field, value in fields
        )
        fixed_count += len(fields)

        # Replace the whitespace between the last existing argument and the
        # closing parenthesis, so that the new fields start on their own line
        # and the parenthesis keeps the indentation of its constructor.
        existing = target_content[open_pos + 1:close_pos].rstrip()
        existing_end = open_pos + 1 + len(existing)
        # The last existing argument needs a comma before more can follow.
        separator = "," if existing and not existing.endswith(",") else ""
        # ``indent`` is the constructor line's indentation plus two spaces.
        closing_indent = indent[:-2]
        replacements.append(
            (existing_end, close_pos, f"{separator}\n{new_fields}{closing_indent}")
        )

    if not replacements:
        # Nothing to add: leave the file untouched.
        return fixed_count, skipped_count

    # Apply from the end of the file backwards so earlier offsets stay valid.
    replacements.sort(key=lambda item: item[0], reverse=True)
    for start, end, text in replacements:
        target_content = target_content[:start] + text + target_content[end:]

    with open(target_filepath, "w", encoding="utf-8") as f:
        f.write(target_content)

    return fixed_count, skipped_count


def main():
    """Command-line entry point: measure translation coverage and report it.

    Parses the base language file, compares every file listed in
    ``LANGUAGE_MAP`` against it, optionally fills in missing keys
    (``--fix``), prints a text or JSON report and exits with status 1 when
    any language is below ``--threshold`` (or when the language directory or
    base file is missing or unparsable), otherwise 0.

    In ``--json`` mode the informational messages are written to stderr so
    that stdout contains only the JSON document.
    """
    parser = argparse.ArgumentParser(description="闲言APP 多语言翻译覆盖率检测")
    parser.add_argument(
        "--threshold", type=float, default=80,
        help="覆盖率阈值(百分比)，低于此值返回非零退出码 (默认: 80)",
    )
    parser.add_argument(
        "--json", action="store_true", dest="json_output",
        help="输出JSON格式报告",
    )
    parser.add_argument(
        "--fix", action="store_true",
        help="自动填充缺失的键(用基准语言值+TODO标记)",
    )
    args = parser.parse_args()

    # Keep stdout machine-readable when a JSON report was requested.
    info_stream = sys.stderr if args.json_output else sys.stdout

    if not LANG_DIR.exists():
        print(f"❌ 语言文件目录不存在: {LANG_DIR}", file=sys.stderr)
        sys.exit(1)

    base_filepath = LANG_DIR / BASE_LANG_FILE
    if not base_filepath.exists():
        print(f"❌ 基准语言文件不存在: {base_filepath}", file=sys.stderr)
        sys.exit(1)

    base_sections, base_field_paths, _ = parse_dart_lang_file(base_filepath)
    base_total = count_total_keys(base_sections)

    if base_total == 0:
        print("❌ 基准语言文件解析失败，未找到任何翻译键", file=sys.stderr)
        sys.exit(1)

    print(
        f"📊 基准语言(zh_CN)共 {base_total} 个翻译键，共 {len(base_sections)} 个模块\n",
        file=info_stream,
    )

    results = []

    for filename, lang_id in LANGUAGE_MAP.items():
        filepath = LANG_DIR / filename
        if not filepath.exists():
            print(f"⚠️  语言文件不存在: {filename}", file=sys.stderr)
            continue

        target_sections, _, target_content = parse_dart_lang_file(filepath)
        missing, empty, extra = check_coverage(base_sections, target_sections)

        # Coverage is measured against the base language's key count.
        translated = base_total - len(missing) - len(empty)
        coverage = (translated / base_total * 100) if base_total > 0 else 0

        results.append({
            "lang": lang_id,
            "file": filename,
            "total": base_total,
            "translated": translated,
            "missing_count": len(missing),
            "empty_count": len(empty),
            "extra_count": len(extra),
            "coverage": coverage,
            "missing": missing,
            "empty": empty,
            "extra": extra,
        })

        # The report reflects the state before fixing: filled-in keys still
        # carry base-language text and are listed as missing.
        if args.fix and lang_id != "zh_CN" and missing:
            fixed, skipped = fix_missing_keys(
                filepath, missing, base_sections, base_field_paths, target_content,
            )
            if fixed > 0:
                print(f"  🔧 {lang_id}: 已填充 {fixed} 个缺失键", file=info_stream)
            if skipped > 0:
                print(
                    f"  ⚠️  {lang_id}: {skipped} 个缺失键无法自动填充(目标构造函数不存在)",
                    file=info_stream,
                )

    if args.json_output:
        report, below = generate_json_report(results, args.threshold)
    else:
        report, below = generate_text_report(results, args.threshold)

    print(report)

    sys.exit(1 if below else 0)


if __name__ == "__main__":
    main()