Files
kitchen/scripts/test_pdf_garbled_chars.dart
Developer 4ec348b28e feat: 更新鸿蒙应用配置与功能优化
- 添加鸿蒙分层图标配置和生成脚本
- 修复数据导出JSON解析问题
- 优化关于页面和团队信息展示
- 更新应用版本至1.4.1
- 清理代码警告和冗余文件
- 添加字体和二维码测试脚本
- 完善鸿蒙适配文档和指南
2026-04-25 09:52:06 +08:00

344 lines
12 KiB
Dart
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// 2026-04-25 | test_pdf_garbled_chars.dart | PDF乱码字符诊断脚本
// 2026-04-25 | 创建: 验证 _cleanPdfText 对各种Unicode字符的过滤效果
/// Runs every diagnostic case through [_cleanPdfText] and prints a report.
///
/// For each case this prints the input, a per-code-unit classification, the
/// cleaned output, and a pass/fail verdict, followed by a summary table.
///
/// A case passes when:
/// * its `expected` text contains '保留' and the cleaned output is non-empty, or
/// * its `expected` text contains '过滤' or '空字符串' and the output is empty.
void main() {
  print('╔══════════════════════════════════════════════════════════════╗');
  print('║ PDF 乱码字符诊断工具 v1.0 ║');
  print('║ 测试 _cleanPdfText 过滤效果 ║');
  print('╚══════════════════════════════════════════════════════════════╝\n');

  // ========== Test cases ==========
  final testCases = <_TestCase>[
    // Baseline: ordinary text must be kept.
    _TestCase(name: '✅ 正常中文', input: '这道菜很好吃,营养丰富', expected: '保留'),
    _TestCase(name: '✅ 中英文混合', input: 'Hello世界美味佳肴123', expected: '保留'),
    _TestCase(name: '✅ 纯英文', input: 'Delicious food recipe', expected: '保留'),
    // Garbled input: must be filtered out.
    _TestCase(name: '❌ 菱形方块 (U+25FF)', input: '▯▯▯▯▯▯▯▯▯▯▯▯', expected: '过滤'),
    _TestCase(name: '❌ 交叉形状 (U+2716)', input: '✖✖✖✖✖✖✖✖✖', expected: '过滤'),
    _TestCase(
      name: '❌ 私用区字符 (U+E000-U+F8FF)',
      input: '\uE000\uE001\uE002\uE003\uE004\uE005',
      expected: '过滤',
    ),
    _TestCase(
      name: '❌ 变体选择器 (U+FE00-U+FE0F)',
      input: 'A\uFE00B\uFE01C\uFE02',
      expected: '过滤或部分保留',
    ),
    _TestCase(
      name: '❌ 控制字符 (U+00-U+1F)',
      input: '\x00\x01\x02\x03\x04\x05',
      expected: '过滤',
    ),
    // Written as escapes so the input really is five U+FFFD replacement
    // characters; the previous literal was the ASCII text "<EFBFBD>", which
    // the filter keeps, making this case fail for the wrong reason.
    _TestCase(
      name: '❌ Unicode替换字符 (U+FFFD)',
      input: '\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD',
      expected: '过滤',
    ),
    // Boundary cases.
    _TestCase(name: '⚠️ 混合内容 (正常+乱码)', input: '很好吃▯▯▯营养▯▯丰富', expected: '部分保留'),
    _TestCase(
      name: '⚠️ 高比例乱码 (>40%)',
      input: '好吃▯▯▯▯▯▯▯▯▯▯▯▯▯▯▯▯▯▯▯▯▯▯▯',
      expected: '过滤(>40%阈值)',
    ),
    _TestCase(
      name: '⚠️ 低比例乱码 (<40%)',
      input: '这道菜真的很好吃,营养丰富味道鲜美▯▯',
      expected: '保留(<40%阈值)',
    ),
    // Specific Unicode blocks.
    _TestCase(
      name: '🔣 制表符/边框 (U+2500-U+257F)',
      input: '┌┐└┘├┤┬┴┼─│',
      expected: '过滤',
    ),
    _TestCase(
      name: '🔣 方块元素 (U+25A0-U+25FF)',
      input: '■□▢▣▤▥▦▧▨▩',
      expected: '过滤',
    ),
    _TestCase(
      name: '🔣 几何形状 (U+25A0-U+25FF)',
      input: '▲▼◆◇○●◐◑◒◓',
      expected: '过滤',
    ),
    _TestCase(
      name: '🔣 箭头符号 (U+2190-U+21FF)',
      input: '→←↑↓↔⇒⇐⇑⇓',
      expected: '过滤',
    ),
    _TestCase(
      name: '🔣 数学运算符 (U+2200-U+22FF)',
      input: '±×÷≈≠≤≥∞√',
      expected: '过滤',
    ),
    _TestCase(
      name: '🔣 装饰符号 (U+2700-U+27BF)',
      input: '✓✔✗✘★☆♠♣♥♦',
      expected: '过滤',
    ),
    _TestCase(
      name: '🔣 Dingbats (U+2700-U+27BF)',
      input: '❤❥❦❧❝❞❟❰❱',
      expected: '过滤',
    ),
    // Simulated real-world scenarios.
    _TestCase(
      name: '🎯 场景1: displayIntro含PUA',
      input: '美味家常菜\uE000\uE001\uE002\uE003',
      expected: '保留"美味家常菜"',
    ),
    _TestCase(
      name: '🎯 场景2: 全是乱码',
      input: '\uE000\uE001\uE002\uE003\uE004\uE005\uE006\uE007\uE008\uE009',
      expected: '空字符串',
    ),
    _TestCase(
      name: '🎯 场景3: 含控制字符',
      input: '好吃的菜\x01\x02\x03\x04\x05',
      expected: '保留"好吃的菜"',
    ),
    _TestCase(
      name: '🎯 场景4: CJK扩展区汉字',
      // CJK Extension A followed by CJK Unified Ideographs.
      input: '\u3400\u3401\u3402\u4E00\u4E01',
      expected: '保留CJK扩展区',
    ),
  ];

  // ========== Run the cases ==========
  var passed = 0;
  var failed = 0;
  for (var i = 0; i < testCases.length; i++) {
    final tc = testCases[i];
    print('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    print('测试 ${i + 1}/${testCases.length}: ${tc.name}');
    print('输入: "${tc.input}"');

    // Dump every UTF-16 code unit together with how it is classified.
    print('字符分析:');
    for (var j = 0; j < tc.input.length; j++) {
      final cp = tc.input.codeUnitAt(j);
      final char = tc.input[j];
      final cpHex = 'U+${cp.toRadixString(16).toUpperCase().padLeft(4, '0')}';
      String category;
      if (_isCjk(cp)) {
        category = 'CJK汉字';
      } else if (_isAsciiLetter(cp)) {
        category = 'ASCII字母';
      } else if (_isDigit(cp)) {
        category = '数字';
      } else if (_isSpace(cp)) {
        category = '空白';
      } else if (_isPunctuation(cp)) {
        category = '标点';
      } else if (_shouldFilterChar(cp, char)) {
        category = '❌ 将被过滤';
      } else {
        category = '⚠️ 未分类';
      }
      // The separator keeps the hex code readable: without it "U+0041"
      // followed by "ASCII字母" reads as a longer hex number.
      print(' [$j] "$char" $cpHex → $category');
    }

    // Run the cleaner. It always returns a non-null String.
    final result = _cleanPdfText(tc.input);
    print('输出: "$result"');
    print('长度: ${result.length}');

    // Decide pass/fail from the keywords in the expectation text. The "keep"
    // keyword is checked first, so mixed expectations such as
    // '过滤或部分保留' pass with either a non-empty or an empty result.
    final expectsKept = tc.expected.contains('保留');
    final expectsEmpty =
        tc.expected.contains('过滤') || tc.expected.contains('空字符串');
    final testPassed =
        (expectsKept && result.isNotEmpty) || (expectsEmpty && result.isEmpty);

    if (testPassed) {
      print('结果: ✅ 通过 (预期: ${tc.expected})');
      passed++;
    } else {
      print('结果: ❌ 失败 (预期: ${tc.expected})');
      failed++;
    }
    print('');
  }

  // ========== Summary ==========
  print('╔══════════════════════════════════════════════════════════════╗');
  print('║ 测试汇总 ║');
  print('╠══════════════════════════════════════════════════════════════╣');
  print('║ 总测试数: ${testCases.length.toString().padLeft(3)}');
  print('║ ✅ 通过: ${passed.toString().padLeft(3)}');
  print('║ ❌ 失败: ${failed.toString().padLeft(3)}');
  print(
    '║ 通过率: ${(passed / testCases.length * 100).toStringAsFixed(1).padLeft(5)}% ║',
  );
  print('╚══════════════════════════════════════════════════════════════╝');
  if (failed > 0) {
    print('\n⚠️ 有 $failed 个测试失败,请检查过滤逻辑!');
  } else {
    print('\n🎉 所有测试通过_cleanPdfText 工作正常。');
  }
}
// ========== 核心方法 (从 recipe_export_button.dart 复制) ==========
/// Strips unwanted code units from [text] and rejects text that still looks
/// garbled.
///
/// Each UTF-16 code unit is passed to [_shouldFilterChar]; the surviving units
/// are joined and trimmed. Returns the empty string when nothing survives or
/// when [_isGarbledText] flags the remainder, otherwise the trimmed remainder.
String _cleanPdfText(String text) {
  final kept = StringBuffer();
  for (final unit in text.codeUnits) {
    // One-code-unit string, identical to indexing the source string.
    final piece = String.fromCharCode(unit);
    if (!_shouldFilterChar(unit, piece)) {
      kept.write(piece);
    }
  }
  final trimmed = kept.toString().trim();
  if (trimmed.isEmpty || _isGarbledText(trimmed)) {
    return '';
  }
  return trimmed;
}
/// Whether the character with numeric value [codeUnit] and one-character
/// string form [char] must be dropped from PDF text.
///
/// Filters control characters, noncharacters, the replacement character,
/// the Private Use Area, specials, variation selectors, and anything
/// [_isSpecialSymbol] recognises.
bool _shouldFilterChar(int codeUnit, String char) {
  // C0 controls, except TAB, LF and CR which are legitimate whitespace.
  if (codeUnit < 0x20 &&
      codeUnit != 0x09 &&
      codeUnit != 0x0A &&
      codeUnit != 0x0D) {
    return true;
  }
  // DEL.
  if (codeUnit == 0x7F) return true;
  // C1 controls.
  if (codeUnit >= 0x80 && codeUnit <= 0x9F) return true;
  // Values whose low 16 bits are 0xFFFE or 0xFFFF. Masking off the lowest bit
  // covers both with one comparison; the former extra `== 0xFFFF` clause
  // could never be true after that mask and has been removed.
  if ((codeUnit & 0xFFFE) == 0xFFFE) {
    return true;
  }
  // U+FFFD REPLACEMENT CHARACTER.
  if (codeUnit == 0xFFFD) return true;
  // Noncharacters U+FDD0..U+FDEF.
  if (codeUnit >= 0xFDD0 && codeUnit <= 0xFDEF) return true;
  // Private Use Area.
  if (codeUnit >= 0xE000 && codeUnit <= 0xF8FF) return true;
  // Specials block up to U+FFFB.
  if (codeUnit >= 0xFFF0 && codeUnit <= 0xFFFB) return true;
  // Variation selectors VS1..VS16.
  if (codeUnit >= 0xFE00 && codeUnit <= 0xFE0F) return true;
  // Explicit list of box-drawing, geometric, arrow and dingbat symbols.
  if (_isSpecialSymbol(char)) return true;
  return false;
}
/// Whether [char] appears in the hard-coded list of decorative symbols
/// (box drawing, block elements, geometric shapes, arrows, dingbats).
///
/// The lookup is a substring search, so [char] is expected to be a single
/// character. An empty [char] returns `false`: `String.contains('')` is always
/// true, which would otherwise report "no character" as a special symbol.
bool _isSpecialSymbol(String char) {
  if (char.isEmpty) return false;
  const rawSymbols =
      '▯□■◯○●◇◆▪▫◻◼◽◾▱░▒▓█▄▌▐▀▸▂▁▃▅▆▇▉▊▋▎▏▕▖▗▘▙▚▛▜▝▞▟╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿┌┐└┘├┤┬┴┼─│┈┉┊┋━┃┅┆┇┍┎┏┐┑▒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋☐☑☒✓✔✗✘→←↑↓↔⇒⇐⇑⇓⇔⇕⇖⇗⇘⇙♠♣♥♦★☆▲▼◐◑◒◓◔◕◖◗❤❥❦❧❝❞❟❰❱❲❳❴❵❶❷❸❹❺❻❼❽❾❿➔➘➙➚➛➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶➷➸➹➺➻➼➽➾➿⟦⟧⟨⟩⟪⟫⟬⟭⟮⟯⬅⬆⬇⬈⬉⬊⬋⬌⬍⬎⬏⬐⬑⬒⬓⬘⬙⬚⬛⬜⬝⬞⬟⬠⬡⭢⭣⭤⭥⭦⭧⭨⭩⭪⭫⭬⭭⭮⭯⭐⭕⭘⭙⭚⭛⭜⭝⭞⭟⭠⭡⭢⭣⭤⭥';
  return rawSymbols.contains(char);
}
/// Whether more than 40% of the characters in [text] are "special", i.e.
/// neither CJK ideographs, ASCII letters, digits, whitespace nor recognised
/// punctuation.
///
/// Text with fewer than two characters is never considered garbled.
///
/// Characters are counted as Unicode code points (runes), not UTF-16 code
/// units. Iterating code units made the supplementary-plane CJK ranges
/// (U+20000 and above) unreachable, so each such ideograph was counted as two
/// special units and text written with them was misreported as garbled.
///
/// NOTE(review): if this function mirrors a copy in application code, that
/// copy needs the same change to keep the two in sync.
bool _isGarbledText(String text) {
  var totalCount = 0;
  var specialCount = 0;
  for (final cp in text.runes) {
    totalCount++;
    final isCjk =
        (cp >= 0x4E00 && cp <= 0x9FFF) ||
        (cp >= 0x3400 && cp <= 0x4DBF) ||
        (cp >= 0x20000 && cp <= 0x2A6DF) ||
        (cp >= 0x2A700 && cp <= 0x2B73F) ||
        (cp >= 0x2B740 && cp <= 0x2B81F) ||
        (cp >= 0x2B820 && cp <= 0x2CEAF) ||
        (cp >= 0xF900 && cp <= 0xFAFF) ||
        (cp >= 0x2F800 && cp <= 0x2FA1F);
    final isAsciiLetter =
        (cp >= 0x41 && cp <= 0x5A) || (cp >= 0x61 && cp <= 0x7A);
    final isDigit = cp >= 0x30 && cp <= 0x39;
    final isSpace = cp == 0x20 || cp == 0x09 || cp == 0x0A || cp == 0x0D;
    final isPunctuation =
        // General Punctuation, CJK Symbols and Punctuation,
        // Halfwidth and Fullwidth Forms.
        (cp >= 0x2000 && cp <= 0x206F) ||
        (cp >= 0x3000 && cp <= 0x303F) ||
        (cp >= 0xFF00 && cp <= 0xFFEF) ||
        // ASCII . , ; : ! ? ( ) [ ] { }
        cp == 0x2E ||
        cp == 0x2C ||
        cp == 0x3B ||
        cp == 0x3A ||
        cp == 0x21 ||
        cp == 0x3F ||
        cp == 0x28 ||
        cp == 0x29 ||
        cp == 0x5B ||
        cp == 0x5D ||
        cp == 0x7B ||
        cp == 0x7D;
    if (!isCjk && !isAsciiLetter && !isDigit && !isSpace && !isPunctuation) {
      specialCount++;
    }
  }
  if (totalCount < 2) return false;
  return specialCount / totalCount > 0.4;
}
// ========== 辅助判断方法 ==========
/// Whether [cp] lies in one of the CJK ideograph ranges this script
/// recognises.
bool _isCjk(int cp) {
  // Inclusive [first, last] bounds, checked in order.
  const ranges = <List<int>>[
    [0x4E00, 0x9FFF],
    [0x3400, 0x4DBF],
    [0x20000, 0x2A6DF],
    [0x2A700, 0x2B73F],
    [0x2B740, 0x2B81F],
    [0x2B820, 0x2CEAF],
    [0xF900, 0xFAFF],
    [0x2F800, 0x2FA1F],
  ];
  for (final bounds in ranges) {
    if (cp >= bounds[0] && cp <= bounds[1]) {
      return true;
    }
  }
  return false;
}
/// Whether [cp] is an ASCII letter, `A`-`Z` or `a`-`z`.
bool _isAsciiLetter(int cp) {
  final isUpper = cp >= 0x41 && cp <= 0x5A;
  if (isUpper) return true;
  final isLower = cp >= 0x61 && cp <= 0x7A;
  return isLower;
}
/// Whether [cp] is an ASCII decimal digit, `0`-`9`.
bool _isDigit(int cp) => !(cp < 0x30 || cp > 0x39);
/// Whether [cp] is a space, horizontal tab, line feed or carriage return.
bool _isSpace(int cp) {
  switch (cp) {
    case 0x20:
    case 0x09:
    case 0x0A:
    case 0x0D:
      return true;
    default:
      return false;
  }
}
/// Whether [cp] counts as punctuation for the diagnostic report.
///
/// Accepts three whole ranges (U+2000-U+206F, U+3000-U+303F, U+FF00-U+FFEF)
/// plus a fixed set of individual ASCII and quotation-mark values.
bool _isPunctuation(int cp) {
  if (cp >= 0x2000 && cp <= 0x206F) return true;
  if (cp >= 0x3000 && cp <= 0x303F) return true;
  if (cp >= 0xFF00 && cp <= 0xFFEF) return true;

  // Individually accepted values: . , ; : ! ? ( ) [ ] { } and the four
  // curly quotation marks.
  const singles = <int>{
    0x2E,
    0x2C,
    0x3B,
    0x3A,
    0x21,
    0x3F,
    0x28,
    0x29,
    0x5B,
    0x5D,
    0x7B,
    0x7D,
    0x201C,
    0x201D,
    0x2018,
    0x2019,
  };
  return singles.contains(cp);
}
// ========== 测试用例数据类 ==========
/// One diagnostic case: a labelled input string plus a free-text description
/// of the expected outcome.
class _TestCase {
  /// Label printed in the report.
  final String name;

  /// Raw text handed to the cleaner.
  final String input;

  /// Expected outcome, as free text that the runner scans for keywords.
  final String expected;

  /// Creates a case. The constructor is `const` because every field is final,
  /// so cases may be built as compile-time constants; existing non-const
  /// call sites are unaffected.
  const _TestCase({
    required this.name,
    required this.input,
    required this.expected,
  });
}