refactor: 优化网络请求和错误处理 fix: 修复颜色引用和UI细节问题 docs: 更新API文档和设计规范 chore: 清理无用文件和脚本 perf: 优化图片导出和压缩逻辑 build: 更新依赖和构建配置 style: 调整代码格式和注释 test: 添加接口验证脚本 ci: 更新CI配置和脚本
812 lines
28 KiB
PHP
812 lines
28 KiB
PHP
<?php
|
||
|
||
namespace app\api\controller;
|
||
|
||
use app\common\controller\Api;
|
||
use think\Db;
|
||
use think\Cache;
|
||
|
||
/**
|
||
* @name 查重接口
|
||
* @author AI Coder
|
||
* @date 2026-04-28
|
||
* @desc 提供文本查重功能,支持精确/模糊/相似度查重
|
||
* @update v1.0 初始版本,支持21种数据源查重
|
||
*/
|
||
class Check extends Api
|
||
{
|
||
// NOTE(review): these two lists are presumably read by the Api base class to
// decide which actions skip the login / permission checks, with '*' acting as
// a wildcard - confirm against app\common\controller\Api.
protected $noNeedLogin = ['*'];
protected $noNeedRight = ['*'];

/**
 * Registry of searchable data sources, keyed by the public "type" name.
 *
 * Keys of every entry, as used by the methods of this class:
 *   - table         : table name handed to Db::name()
 *   - name / icon   : display label and emoji echoed back in responses
 *   - fields        : columns searched when the requested field is 'auto'
 *   - title_field   : column exposed as 'title' in a match ('' when absent)
 *   - content_field : column exposed as 'content' in a match
 *   - extra_fields  : additional columns copied into a match
 *   - status_field / status_value : equality filter applied to every query
 */
private static $checkMap = [
    'poetry' => [
        'table' => 'poetry', 'name' => '古诗词', 'icon' => '📜',
        'fields' => ['name', 'content', 'author'],
        'title_field' => 'name', 'content_field' => 'content',
        'extra_fields' => ['author', 'dynasty', 'tag'],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'story' => [
        'table' => 'story', 'name' => '故事大全', 'icon' => '📖',
        'fields' => ['title', 'content'],
        'title_field' => 'title', 'content_field' => 'content',
        'extra_fields' => [],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'composition' => [
        'table' => 'composition', 'name' => '作文大全', 'icon' => '✍️',
        'fields' => ['title', 'content'],
        'title_field' => 'title', 'content_field' => 'content',
        'extra_fields' => [],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'lyric' => [
        'table' => 'lyric', 'name' => '歌词大全', 'icon' => '🎵',
        'fields' => ['title', 'content', 'singer'],
        'title_field' => 'title', 'content_field' => 'content',
        'extra_fields' => ['singer'],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'wisdom' => [
        'table' => 'wisdom', 'name' => '名人名言', 'icon' => '💡',
        'fields' => ['content', 'name'],
        'title_field' => 'name', 'content_field' => 'content',
        'extra_fields' => [],
        'status_field' => 'status', 'status_value' => 0,
    ],
    'saying' => [
        'table' => 'saying', 'name' => '谚语', 'icon' => '🗣️',
        'fields' => ['saying', 'content'],
        'title_field' => 'saying', 'content_field' => 'content',
        'extra_fields' => [],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'why' => [
        'table' => 'why', 'name' => '十万个为什么', 'icon' => '❓',
        'fields' => ['title', 'content'],
        'title_field' => 'title', 'content_field' => 'content',
        'extra_fields' => [],
        'status_field' => 'status', 'status_value' => 0,
    ],
    'cs' => [
        'table' => 'cs', 'name' => '生活常识', 'icon' => '🏠',
        'fields' => ['title', 'content'],
        'title_field' => 'title', 'content_field' => 'content',
        'extra_fields' => [],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'hitokoto' => [
        'table' => 'hitokoto', 'name' => '一言句子', 'icon' => '💬',
        'fields' => ['hitokoto'],
        'title_field' => '', 'content_field' => 'hitokoto',
        'extra_fields' => ['type_name', 'from_source', 'from_who'],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'chengyu' => [
        'table' => 'cy', 'name' => '成语大全', 'icon' => '🔤',
        'fields' => ['cy', 'cyjs'],
        'title_field' => 'cy', 'content_field' => 'cyjs',
        'extra_fields' => ['cypy', 'cycc'],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'cidian' => [
        'table' => 'zc', 'name' => '词典', 'icon' => '📚',
        'fields' => ['zc', 'zcjs'],
        'title_field' => 'zc', 'content_field' => 'zcjs',
        'extra_fields' => ['zcpy'],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'drug' => [
        'table' => 'drug', 'name' => '药品查询', 'icon' => '💊',
        'fields' => ['name', 'syz'],
        'title_field' => 'name', 'content_field' => 'syz',
        'extra_fields' => ['goods_name', 'gg', 'cf'],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'herbal' => [
        'table' => 'herbal', 'name' => '中药材', 'icon' => '🌿',
        'fields' => ['name', 'effect'],
        'title_field' => 'name', 'content_field' => 'effect',
        'extra_fields' => ['name_alias', 'spell'],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'prescription' => [
        'table' => 'prescription', 'name' => '民间偏方', 'icon' => '🧪',
        'fields' => ['title', 'content'],
        'title_field' => 'title', 'content_field' => 'content',
        'extra_fields' => [],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'tisana' => [
        'table' => 'tisana', 'name' => '药茶大全', 'icon' => '🍵',
        'fields' => ['name', 'effect', 'recipe'],
        'title_field' => 'name', 'content_field' => 'effect',
        'extra_fields' => ['recipe', 'source'],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'food' => [
        'table' => 'food', 'name' => '食物相克', 'icon' => '🍽️',
        'fields' => ['sw', 'yh'],
        'title_field' => 'sw', 'content_field' => 'yh',
        'extra_fields' => [],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'couplet' => [
        'table' => 'couplet', 'name' => '对联大全', 'icon' => '🧧',
        'fields' => ['hp', 'sl', 'xl'],
        'title_field' => 'hp', 'content_field' => 'sl',
        'extra_fields' => ['xl', 'yy'],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'brainteaser' => [
        'table' => 'brainteaser', 'name' => '脑筋急转弯', 'icon' => '🧠',
        'fields' => ['topic', 'answer'],
        'title_field' => 'topic', 'content_field' => 'answer',
        'extra_fields' => [],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'riddle' => [
        'table' => 'riddle', 'name' => '谜语大全', 'icon' => '🔮',
        'fields' => ['riddle', 'miidii'],
        'title_field' => 'riddle', 'content_field' => 'miidii',
        'extra_fields' => [],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'xiehouyu' => [
        'table' => 'efs', 'name' => '歇后语', 'icon' => '😏',
        'fields' => ['facet', 'undertone'],
        'title_field' => 'facet', 'content_field' => 'undertone',
        'extra_fields' => [],
        'status_field' => 'switch', 'status_value' => 1,
    ],
    'jiufang' => [
        'table' => 'jiufang', 'name' => '酒方大全', 'icon' => '🍷',
        'fields' => ['name', 'ingredients', 'usage'],
        'title_field' => 'name', 'content_field' => 'usage',
        'extra_fields' => ['ingredients', 'source', 'method', 'categories'],
        'status_field' => 'switch', 'status_value' => 1,
    ],
];
|
||
|
||
/**
 * @name Data source list
 * @desc Returns every data source that can be checked, together with its
 *       field layout and the number of rows that pass the status filter.
 *
 * Responds through $this->success() with:
 *   - total_sources : number of configured sources
 *   - sources       : one entry per source (key, name, icon, table, total,
 *                     check_fields, title_field, content_field, extra_fields)
 */
public function sources()
{
    $sources = [];

    foreach (self::$checkMap as $key => $config) {
        $count = 0;
        try {
            $query = Db::name($config['table']);
            // isset() instead of a truthiness test: a status_value of 0 is a
            // real filter value (used by the 'wisdom' and 'why' sources) and
            // must be applied here exactly as the check methods apply it,
            // otherwise 'total' counts rows the checks never look at.
            if ($config['status_field'] && isset($config['status_value'])) {
                $query->where($config['status_field'], $config['status_value']);
            }
            $count = $query->count();
        } catch (\Exception $e) {
            // Deliberate best effort: a missing or broken table must not take
            // down the whole listing, so that source is reported with 0 rows.
            $count = 0;
        }

        $sources[] = [
            'key' => $key,
            'name' => $config['name'],
            'icon' => $config['icon'],
            'table' => 'tool_' . $config['table'],
            'total' => $count,
            'check_fields' => $config['fields'],
            'title_field' => $config['title_field'],
            'content_field' => $config['content_field'],
            'extra_fields' => $config['extra_fields'],
        ];
    }

    $this->success('成功', [
        'total_sources' => count($sources),
        'sources' => $sources,
    ]);
}
|
||
|
||
/**
 * @name Exact duplicate check
 * @desc Finds rows whose searched column equals the submitted text exactly.
 *
 * POST parameters:
 *   - text  : text to check (required, at most 5000 characters)
 *   - type  : a key of self::$checkMap, or 'all' (default)
 *   - field : column to compare, or 'auto' (default) for the source's fields
 *   - limit : maximum number of matches overall, clamped to 1..50 (default 20)
 *
 * NOTE(review): the code assumes $this->error() aborts the request (nothing
 * returns after it), as the original implementation already did.
 */
public function exact()
{
    $rawPost = $this->request->post(false);
    $rawPost = is_array($rawPost) ? $rawPost : [];

    // Only scalar values are accepted: trim() on an array (text[]=x) would
    // raise instead of producing a clean validation error.
    $readString = function ($key, $default) use ($rawPost) {
        return (isset($rawPost[$key]) && is_scalar($rawPost[$key]))
            ? trim((string) $rawPost[$key])
            : $default;
    };

    $text = $readString('text', '');
    $type = $readString('type', 'all');
    $field = $readString('field', 'auto');
    $limit = min(50, max(1, isset($rawPost['limit']) ? intval($rawPost['limit']) : 20));

    // Compare against '' explicitly: empty('0') is true and would reject the
    // perfectly valid text "0".
    if ($text === '') {
        $this->error('请输入查重文本');
    }
    if (mb_strlen($text) > 5000) {
        $this->error('文本长度不能超过5000字符');
    }
    // Same validation report() performs; an unknown type used to be reported
    // as a successful check with zero matches.
    if ($type !== 'all' && !isset(self::$checkMap[$type])) {
        $this->error('不支持的查重类型: ' . $type);
    }

    $types = ($type === 'all') ? array_keys(self::$checkMap) : [$type];
    $results = [];
    $total_matched = 0;

    foreach ($types as $t) {
        if (!isset(self::$checkMap[$t])) {
            continue;
        }

        $config = self::$checkMap[$t];
        // Ask only for what is still missing so the overall total never
        // exceeds $limit (each source used to receive the full limit).
        $matched = $this->_exactCheck($config, $text, $field, $limit - $total_matched);

        if (!empty($matched)) {
            $total_matched += count($matched);
            $results[] = [
                'type' => $t,
                'name' => $config['name'],
                'icon' => $config['icon'],
                'count' => count($matched),
                'matches' => $matched,
            ];
        }

        if ($total_matched >= $limit) {
            break;
        }
    }

    $this->success('查重完成', [
        'text' => mb_substr($text, 0, 100),
        'text_length' => mb_strlen($text),
        'mode' => 'exact',
        'types_checked' => count($types),
        'total_matched' => $total_matched,
        'results' => $results,
    ]);
}
|
||
|
||
/**
 * @name Fuzzy duplicate check
 * @desc Extracts keywords from the text and matches them with SQL LIKE.
 *
 * POST parameters:
 *   - text       : text to check (required, at most 5000 characters)
 *   - type       : a key of self::$checkMap, or 'all' (default)
 *   - field      : column to search, or 'auto' (default)
 *   - limit      : maximum number of matches overall, clamped to 1..50 (default 20)
 *   - min_length : minimum text / keyword length, at least 1 (default 4)
 *
 * NOTE(review): the code assumes $this->error() aborts the request (nothing
 * returns after it), as the original implementation already did.
 */
public function fuzzy()
{
    $rawPost = $this->request->post(false);
    $rawPost = is_array($rawPost) ? $rawPost : [];

    // Only scalar values are accepted: trim() on an array (text[]=x) would
    // raise instead of producing a clean validation error.
    $readString = function ($key, $default) use ($rawPost) {
        return (isset($rawPost[$key]) && is_scalar($rawPost[$key]))
            ? trim((string) $rawPost[$key])
            : $default;
    };

    $text = $readString('text', '');
    $type = $readString('type', 'all');
    $field = $readString('field', 'auto');
    $limit = min(50, max(1, isset($rawPost['limit']) ? intval($rawPost['limit']) : 20));
    // Clamp to >= 1: with 0 or a negative value an empty string qualifies as
    // a keyword and the resulting LIKE pattern matches every row.
    $min_len = max(1, isset($rawPost['min_length']) ? intval($rawPost['min_length']) : 4);

    // Compare against '' explicitly: empty('0') would reject the text "0".
    if ($text === '') {
        $this->error('请输入查重文本');
    }
    if (mb_strlen($text) < $min_len) {
        $this->error("查重文本至少需要{$min_len}个字符");
    }
    if (mb_strlen($text) > 5000) {
        $this->error('文本长度不能超过5000字符');
    }
    // Same validation report() performs; an unknown type used to be reported
    // as a successful check with zero matches.
    if ($type !== 'all' && !isset(self::$checkMap[$type])) {
        $this->error('不支持的查重类型: ' . $type);
    }

    // array_values(): keep the keyword list a plain 0..n-1 list so it is
    // serialised as a JSON array in the response.
    $keywords = array_values($this->_extractKeywords($text, $min_len));
    if (empty($keywords)) {
        $this->error('未能提取有效关键词,请增加文本长度');
    }

    $types = ($type === 'all') ? array_keys(self::$checkMap) : [$type];
    $results = [];
    $total_matched = 0;

    foreach ($types as $t) {
        if (!isset(self::$checkMap[$t])) {
            continue;
        }

        $config = self::$checkMap[$t];
        // Ask only for what is still missing so the overall total never
        // exceeds $limit (each source used to receive the full limit).
        $matched = $this->_fuzzyCheck($config, $keywords, $field, $limit - $total_matched);

        if (!empty($matched)) {
            $total_matched += count($matched);
            $results[] = [
                'type' => $t,
                'name' => $config['name'],
                'icon' => $config['icon'],
                'count' => count($matched),
                'matches' => $matched,
            ];
        }

        if ($total_matched >= $limit) {
            break;
        }
    }

    $this->success('查重完成', [
        'text' => mb_substr($text, 0, 100),
        'text_length' => mb_strlen($text),
        'mode' => 'fuzzy',
        'keywords' => $keywords,
        'types_checked' => count($types),
        'total_matched' => $total_matched,
        'results' => $results,
    ]);
}
|
||
|
||
/**
 * @name Similarity duplicate check
 * @desc Scores candidate rows with cosine similarity over character 2-grams.
 *
 * POST parameters:
 *   - text      : text to check (required, at most 5000 characters)
 *   - type      : a key of self::$checkMap, or 'all' (default)
 *   - threshold : minimum similarity, clamped to 0.1..1.0 (default 0.3)
 *   - limit     : maximum number of matches overall, clamped to 1..50 (default 20)
 *
 * Result groups are ordered by their best similarity, highest first.
 *
 * NOTE(review): the code assumes $this->error() aborts the request (nothing
 * returns after it), as the original implementation already did.
 */
public function similar()
{
    $rawPost = $this->request->post(false);
    $rawPost = is_array($rawPost) ? $rawPost : [];

    // Only scalar values are accepted: trim() on an array (text[]=x) would
    // raise instead of producing a clean validation error.
    $readString = function ($key, $default) use ($rawPost) {
        return (isset($rawPost[$key]) && is_scalar($rawPost[$key]))
            ? trim((string) $rawPost[$key])
            : $default;
    };

    $text = $readString('text', '');
    $type = $readString('type', 'all');
    $threshold = (isset($rawPost['threshold']) && is_scalar($rawPost['threshold']))
        ? floatval($rawPost['threshold'])
        : 0.3;
    $limit = min(50, max(1, isset($rawPost['limit']) ? intval($rawPost['limit']) : 20));

    // Compare against '' explicitly: empty('0') would reject the text "0".
    if ($text === '') {
        $this->error('请输入查重文本');
    }
    if (mb_strlen($text) > 5000) {
        $this->error('文本长度不能超过5000字符');
    }
    // Same validation report() performs; an unknown type used to be reported
    // as a successful check with zero matches.
    if ($type !== 'all' && !isset(self::$checkMap[$type])) {
        $this->error('不支持的查重类型: ' . $type);
    }
    $threshold = max(0.1, min(1.0, $threshold));

    // The n-gram profile of the input is the same for every source.
    $textNgrams = $this->_ngram($text, 2);

    $types = ($type === 'all') ? array_keys(self::$checkMap) : [$type];
    $results = [];
    $total_matched = 0;

    foreach ($types as $t) {
        if (!isset(self::$checkMap[$t])) {
            continue;
        }

        $config = self::$checkMap[$t];
        // Ask only for what is still missing so the overall total never
        // exceeds $limit (each source used to receive the full limit).
        $matched = $this->_similarCheck($config, $text, $textNgrams, $threshold, $limit - $total_matched);

        if (!empty($matched)) {
            $total_matched += count($matched);
            $results[] = [
                'type' => $t,
                'name' => $config['name'],
                'icon' => $config['icon'],
                'count' => count($matched),
                'matches' => $matched,
            ];
        }

        if ($total_matched >= $limit) {
            break;
        }
    }

    // Order the groups by their single best match, highest similarity first.
    usort($results, function ($a, $b) {
        $maxA = !empty($a['matches']) ? max(array_column($a['matches'], 'similarity')) : 0;
        $maxB = !empty($b['matches']) ? max(array_column($b['matches'], 'similarity')) : 0;
        return $maxB <=> $maxA;
    });

    $this->success('查重完成', [
        'text' => mb_substr($text, 0, 100),
        'text_length' => mb_strlen($text),
        'mode' => 'similar',
        'threshold' => $threshold,
        'types_checked' => count($types),
        'total_matched' => $total_matched,
        'results' => $results,
    ]);
}
|
||
|
||
/**
 * @name Combined duplicate report
 * @desc Runs the exact, fuzzy and similarity checks in one request and
 *       derives a risk level / score from the combined result.
 *
 * POST parameters:
 *   - text  : text to check (required, at most 5000 characters)
 *   - type  : a key of self::$checkMap, or 'all' (default)
 *   - limit : per-source, per-mode match limit, clamped to 1..20 (default 10)
 *
 * Risk rules (first one that applies):
 *   - any exact match                       -> high,   60 + 10 per match (max 100)
 *   - similar match with similarity >= 0.6  -> high,   40 + similarity * 50
 *   - any fuzzy or similar match            -> medium, 20 + 5 per fuzzy + similarity * 30
 *   - otherwise                             -> low,    0
 *
 * NOTE(review): the code assumes $this->error() aborts the request (nothing
 * returns after it), as the original implementation already did.
 */
public function report()
{
    $rawPost = $this->request->post(false);
    $rawPost = is_array($rawPost) ? $rawPost : [];

    // Only scalar values are accepted: trim() on an array (text[]=x) would
    // raise instead of producing a clean validation error.
    $readString = function ($key, $default) use ($rawPost) {
        return (isset($rawPost[$key]) && is_scalar($rawPost[$key]))
            ? trim((string) $rawPost[$key])
            : $default;
    };

    $text = $readString('text', '');
    $type = $readString('type', 'all');
    $limit = min(20, max(1, isset($rawPost['limit']) ? intval($rawPost['limit']) : 10));

    // Compare against '' explicitly: empty('0') would reject the text "0".
    if ($text === '') {
        $this->error('请输入查重文本');
    }
    if (mb_strlen($text) > 5000) {
        $this->error('文本长度不能超过5000字符');
    }
    if ($type !== 'all' && !isset(self::$checkMap[$type])) {
        $this->error('不支持的查重类型: ' . $type);
    }

    $startTime = microtime(true);

    $exactResults = [];
    $fuzzyResults = [];
    $similarResults = [];

    $types = ($type === 'all') ? array_keys(self::$checkMap) : [$type];

    // Both depend on the text only, so compute them once instead of once per
    // source.
    $keywords = $this->_extractKeywords($text, 4);
    $textNgrams = $this->_ngram($text, 2);

    foreach ($types as $t) {
        if (!isset(self::$checkMap[$t])) {
            continue;
        }
        $config = self::$checkMap[$t];
        $group = ['type' => $t, 'name' => $config['name'], 'icon' => $config['icon']];

        $exactMatch = $this->_exactCheck($config, $text, 'auto', $limit);
        if (!empty($exactMatch)) {
            $exactResults[] = $group + ['count' => count($exactMatch), 'matches' => $exactMatch];
        }

        // Without keywords the fuzzy pattern degenerates to LIKE '%%', which
        // matches every row and would report bogus "fuzzy" duplicates for
        // short texts. Skip the fuzzy pass in that case.
        $fuzzyMatch = empty($keywords) ? [] : $this->_fuzzyCheck($config, $keywords, 'auto', $limit);
        if (!empty($fuzzyMatch)) {
            $fuzzyResults[] = $group + ['count' => count($fuzzyMatch), 'matches' => $fuzzyMatch];
        }

        $similarMatch = $this->_similarCheck($config, $text, $textNgrams, 0.3, $limit);
        if (!empty($similarMatch)) {
            $similarResults[] = $group + ['count' => count($similarMatch), 'matches' => $similarMatch];
        }
    }

    $exactTotal = array_sum(array_column($exactResults, 'count'));
    $fuzzyTotal = array_sum(array_column($fuzzyResults, 'count'));
    $similarTotal = array_sum(array_column($similarResults, 'count'));

    // Best similarity over all sources, and the source it came from.
    $maxSimilarity = 0;
    $maxSimilaritySource = '';
    foreach ($similarResults as $sr) {
        foreach ($sr['matches'] as $m) {
            if ($m['similarity'] > $maxSimilarity) {
                $maxSimilarity = $m['similarity'];
                $maxSimilaritySource = $sr['name'];
            }
        }
    }

    $elapsed = round(microtime(true) - $startTime, 3);

    $riskLevel = 'low';
    $riskScore = 0;
    if ($exactTotal > 0) {
        $riskLevel = 'high';
        $riskScore = min(100, 60 + $exactTotal * 10);
    } elseif ($similarTotal > 0 && $maxSimilarity >= 0.6) {
        $riskLevel = 'high';
        $riskScore = min(100, 40 + intval($maxSimilarity * 50));
    } elseif ($fuzzyTotal > 0 || $similarTotal > 0) {
        $riskLevel = 'medium';
        $riskScore = min(100, 20 + $fuzzyTotal * 5 + intval($maxSimilarity * 30));
    }

    $this->success('查重报告生成完成', [
        'text' => mb_substr($text, 0, 100),
        'text_length' => mb_strlen($text),
        'types_checked' => count($types),
        'elapsed_time' => $elapsed . 's',
        'risk_level' => $riskLevel,
        'risk_score' => $riskScore,
        'max_similarity' => round($maxSimilarity * 100, 1) . '%',
        'max_similarity_source' => $maxSimilaritySource,
        'summary' => [
            'exact_matches' => $exactTotal,
            'fuzzy_matches' => $fuzzyTotal,
            'similar_matches' => $similarTotal,
        ],
        'exact_results' => $exactResults,
        'fuzzy_results' => $fuzzyResults,
        'similar_results' => $similarResults,
    ]);
}
|
||
|
||
/**
 * @name Exact check (internal)
 *
 * Returns the rows of one source whose searched column equals $text.
 *
 * @param array  $config One entry of self::$checkMap.
 * @param string $text   Text to compare.
 * @param string $field  'auto' to search every column in $config['fields'],
 *                       or the name of one configured column.
 * @param int    $limit  Maximum number of rows to return.
 * @return array Formatted matches, each carrying similarity 1.0; an empty
 *               array when nothing matches, when $field is not a configured
 *               column, or when the query fails.
 */
private function _exactCheck($config, $text, $field, $limit)
{
    try {
        if ($field === 'auto') {
            $searchFields = $config['fields'];
        } else {
            // $field arrives straight from the request and ends up as a
            // column name in the query, so only columns this source declares
            // are accepted. Unknown columns already produced an empty result
            // (the query failed and was caught); now they never reach SQL.
            $allowed = array_merge(
                $config['fields'],
                [$config['title_field'], $config['content_field']],
                $config['extra_fields']
            );
            if ($field === '' || !in_array($field, $allowed, true)) {
                return [];
            }
            $searchFields = [$field];
        }

        $query = Db::name($config['table']);
        if ($config['status_field'] && isset($config['status_value'])) {
            $query->where($config['status_field'], $config['status_value']);
        }

        // Grouped OR: status filter AND (col1 = text OR col2 = text ...).
        $query->where(function ($q) use ($searchFields, $text) {
            foreach ($searchFields as $f) {
                $q->whereOr($f, '=', $text);
            }
        });

        $selectFields = ['id'];
        if ($config['title_field']) {
            $selectFields[] = $config['title_field'];
        }
        if ($config['content_field']) {
            $selectFields[] = $config['content_field'];
        }
        foreach ($config['extra_fields'] as $ef) {
            if (!in_array($ef, $selectFields, true)) {
                $selectFields[] = $ef;
            }
        }

        $rows = $query->field($selectFields)->limit($limit)->select();
        return $this->_formatMatches($config, $rows, 1.0);
    } catch (\Exception $e) {
        // Best effort: a failing source contributes no matches.
        return [];
    }
}
|
||
|
||
/**
 * @name Fuzzy check (internal)
 *
 * Returns the rows of one source whose searched column contains all keywords
 * in the given order (LIKE '%k1%k2%...%').
 *
 * @param array  $config   One entry of self::$checkMap.
 * @param array  $keywords Keywords to look for, in order.
 * @param string $field    'auto' to search every column in $config['fields'],
 *                         or the name of one configured column.
 * @param int    $limit    Maximum number of rows to return.
 * @return array Formatted matches without a similarity value; an empty array
 *               when there are no usable keywords, when $field is not a
 *               configured column, when nothing matches, or when the query
 *               fails.
 */
private function _fuzzyCheck($config, $keywords, $field, $limit)
{
    try {
        // No usable keywords would build the pattern '%%', which matches
        // every row of the table; that is never a meaningful duplicate hit.
        $keywords = array_values(array_filter((array) $keywords, function ($k) {
            return is_string($k) && $k !== '';
        }));
        if (empty($keywords)) {
            return [];
        }

        if ($field === 'auto') {
            $searchFields = $config['fields'];
        } else {
            // $field arrives straight from the request and ends up as a
            // column name in the query, so only columns this source declares
            // are accepted.
            $allowed = array_merge(
                $config['fields'],
                [$config['title_field'], $config['content_field']],
                $config['extra_fields']
            );
            if ($field === '' || !in_array($field, $allowed, true)) {
                return [];
            }
            $searchFields = [$field];
        }

        $query = Db::name($config['table']);
        if ($config['status_field'] && isset($config['status_value'])) {
            $query->where($config['status_field'], $config['status_value']);
        }

        $kw = '%' . implode('%', $keywords) . '%';

        // Grouped OR: status filter AND (col1 LIKE kw OR col2 LIKE kw ...).
        $query->where(function ($q) use ($searchFields, $kw) {
            foreach ($searchFields as $f) {
                $q->whereOr($f, 'like', $kw);
            }
        });

        $selectFields = ['id'];
        if ($config['title_field']) {
            $selectFields[] = $config['title_field'];
        }
        if ($config['content_field']) {
            $selectFields[] = $config['content_field'];
        }
        foreach ($config['extra_fields'] as $ef) {
            if (!in_array($ef, $selectFields, true)) {
                $selectFields[] = $ef;
            }
        }

        $rows = $query->field($selectFields)->limit($limit)->select();
        return $this->_formatMatches($config, $rows, null);
    } catch (\Exception $e) {
        // Best effort: a failing source contributes no matches.
        return [];
    }
}
|
||
|
||
/**
 * @name Similarity check (internal)
 *
 * Pre-selects up to 100 candidate rows with a LIKE on the first one or two
 * keywords of the text, scores each candidate with cosine similarity over
 * character 2-grams of "title content", and keeps those at or above the
 * threshold.
 *
 * @param array  $config     One entry of self::$checkMap.
 * @param string $text       Text being checked.
 * @param array  $textNgrams 2-gram counts of $text, as produced by _ngram().
 * @param float  $threshold  Minimum similarity for a row to be reported.
 * @param int    $limit      Maximum number of matches to return.
 * @return array Matches sorted by similarity (highest first), each with
 *               'similarity' and 'similarity_percent'; an empty array when
 *               no keyword can be extracted, when nothing qualifies, or when
 *               the query fails.
 */
private function _similarCheck($config, $text, $textNgrams, $threshold, $limit)
{
    try {
        // Re-index the keyword list: after de-duplication its keys may have
        // gaps (for example [0 => 'ab', 2 => 'cd']), in which case reading
        // index 1 fails although count() is 2 and the check returned nothing.
        $keywords = array_values($this->_extractKeywords($text, 2));
        if (empty($keywords)) {
            return [];
        }

        $primaryField = $config['content_field'] ?: $config['title_field'];
        if (!$primaryField) {
            // Nothing to search in; avoid issuing a query on an empty column name.
            return [];
        }

        $query = Db::name($config['table']);
        if ($config['status_field'] && isset($config['status_value'])) {
            $query->where($config['status_field'], $config['status_value']);
        }

        // Candidate filter: '%k1%k2%' with two or more keywords, '%k1%' with one.
        $kw = '%' . implode('%', array_slice($keywords, 0, 2)) . '%';
        $query->where($primaryField, 'like', $kw);

        $selectFields = ['id'];
        if ($config['title_field']) {
            $selectFields[] = $config['title_field'];
        }
        if ($config['content_field']) {
            $selectFields[] = $config['content_field'];
        }
        foreach ($config['extra_fields'] as $ef) {
            if (!in_array($ef, $selectFields, true)) {
                $selectFields[] = $ef;
            }
        }

        // Fixed candidate cap: scoring happens in PHP, so the pool is bounded.
        $rows = $query->field($selectFields)->limit(100)->select();

        $matches = [];
        foreach ($rows as $row) {
            $rowText = '';
            if ($config['title_field'] && isset($row[$config['title_field']])) {
                $rowText .= $row[$config['title_field']] . ' ';
            }
            if ($config['content_field'] && isset($row[$config['content_field']])) {
                $rowText .= $row[$config['content_field']];
            }
            $rowText = strip_tags($rowText);
            $rowNgrams = $this->_ngram($rowText, 2);
            $similarity = $this->_cosineSimilarity($textNgrams, $rowNgrams);

            if ($similarity >= $threshold) {
                $match = $this->_formatSingleMatch($config, $row);
                $match['similarity'] = round($similarity, 4);
                $match['similarity_percent'] = round($similarity * 100, 1) . '%';
                $matches[] = $match;
            }
        }

        usort($matches, function ($a, $b) {
            return $b['similarity'] <=> $a['similarity'];
        });

        return array_slice($matches, 0, $limit);
    } catch (\Exception $e) {
        // Best effort: a failing source contributes no matches.
        return [];
    }
}
|
||
|
||
/**
 * @name Format a list of matched rows
 *
 * Converts every row with _formatSingleMatch() and, when a similarity is
 * given, stamps the same 'similarity' / 'similarity_percent' on each entry.
 *
 * @param array      $config     One entry of self::$checkMap.
 * @param iterable   $rows       Rows returned by the query.
 * @param float|null $similarity Fixed similarity to attach, or null for none.
 * @return array List of formatted matches, in row order.
 */
private function _formatMatches($config, $rows, $similarity = null)
{
    $withScore = ($similarity !== null);
    // The percentage label is identical for every row, so build it once.
    $percentLabel = $withScore ? round($similarity * 100, 1) . '%' : null;

    $formatted = [];
    foreach ($rows as $record) {
        $entry = $this->_formatSingleMatch($config, $record);
        if ($withScore) {
            $entry['similarity'] = $similarity;
            $entry['similarity_percent'] = $percentLabel;
        }
        $formatted[] = $entry;
    }

    return $formatted;
}
|
||
|
||
/**
 * @name Format a single matched row
 *
 * Builds the public representation of one row: its id, the title column as
 * 'title' (first 200 characters), the content column as 'content' (first 300
 * characters) and every non-empty extra column under its own name (first 100
 * characters). HTML tags are stripped from all text values.
 *
 * @param array $config One entry of self::$checkMap.
 * @param mixed $row    Row with array-style access; must contain 'id'.
 * @return array The formatted match.
 */
private function _formatSingleMatch($config, $row)
{
    $entry = ['id' => $row['id']];

    // [output key, source column, maximum length]
    $primaryColumns = [
        ['title', $config['title_field'], 200],
        ['content', $config['content_field'], 300],
    ];
    foreach ($primaryColumns as $spec) {
        list($outKey, $column, $maxLen) = $spec;
        if ($column && isset($row[$column])) {
            $entry[$outKey] = mb_substr(strip_tags($row[$column]), 0, $maxLen);
        }
    }

    foreach ($config['extra_fields'] as $column) {
        // empty() already covers "not set", so one test is enough here.
        if (empty($row[$column])) {
            continue;
        }
        $entry[$column] = mb_substr(strip_tags($row[$column]), 0, 100);
    }

    return $entry;
}
|
||
|
||
/**
 * @name Extract keywords
 *
 * Strips HTML tags, replaces every character that is not a CJK ideograph
 * (U+4E00..U+9FA5), an ASCII letter or a digit with a space, and returns the
 * space-separated runs that are at least $minLen characters long. When no run
 * qualifies but the cleaned text itself is long enough, the text is cut into
 * overlapping chunks of up to 8 characters instead.
 *
 * @param string $text   Input text.
 * @param int    $minLen Minimum keyword length in characters (values below 1
 *                       are treated as 1).
 * @return array Up to 10 distinct keywords as a plain 0-based list.
 */
private function _extractKeywords($text, $minLen = 2)
{
    // A minimum below 1 would let the empty string qualify as a keyword.
    $minLen = max(1, (int) $minLen);

    $text = strip_tags($text);
    $text = preg_replace('/[^\x{4e00}-\x{9fa5}a-zA-Z0-9]/u', ' ', $text);
    if ($text === null) {
        // preg_replace() returns null when the subject is not valid UTF-8;
        // there is nothing meaningful to extract from such input.
        return [];
    }
    $text = preg_replace('/\s+/', ' ', trim($text));

    $words = explode(' ', $text);
    $keywords = [];
    foreach ($words as $word) {
        $word = trim($word);
        if (mb_strlen($word) >= $minLen) {
            $keywords[] = $word;
        }
    }

    // Fallback: only short fragments, but the whole text is long enough.
    // Slide a window of up to 8 characters, advancing by half its size.
    if (empty($keywords) && mb_strlen($text) >= $minLen) {
        $len = mb_strlen($text);
        $chunkSize = min(8, $len);
        $step = max(1, intval($chunkSize / 2));
        for ($i = 0; $i < $len - $chunkSize + 1; $i += $step) {
            $keywords[] = mb_substr($text, $i, $chunkSize);
        }
    }

    // array_unique() keeps the original keys, leaving gaps after duplicates
    // are dropped. Re-index so callers can rely on indexes 0..n-1 and the
    // list is serialised as a JSON array.
    return array_values(array_unique(array_slice($keywords, 0, 10)));
}
|
||
|
||
/**
 * @name Build n-gram counts
 *
 * Strips HTML tags and all whitespace from the text, then counts every run of
 * $n consecutive characters.
 *
 * @param string $text Input text.
 * @param int    $n    Gram length in characters (default 2).
 * @return array Map of gram => number of occurrences; empty when the cleaned
 *               text is shorter than $n or $n is below 1.
 */
private function _ngram($text, $n = 2)
{
    if ($n < 1) {
        return [];
    }

    $text = strip_tags($text);

    // The /u modifier makes the pattern operate on UTF-8 characters rather
    // than bytes and lets \s match Unicode whitespace such as the
    // ideographic space U+3000, which is common in Chinese text.
    $compact = preg_replace('/\s+/u', '', $text);
    if ($compact === null) {
        // Invalid UTF-8 makes the /u pattern fail; fall back to the
        // byte-oriented form so such input is still processed.
        $compact = preg_replace('/\s+/', '', $text);
    }
    $text = (string) $compact;

    $len = mb_strlen($text);
    $ngrams = [];
    for ($i = 0; $i < $len - $n + 1; $i++) {
        $gram = mb_substr($text, $i, $n);
        $ngrams[$gram] = isset($ngrams[$gram]) ? $ngrams[$gram] + 1 : 1;
    }
    return $ngrams;
}
|
||
|
||
/**
 * @name Cosine similarity
 *
 * Treats both gram => count maps as sparse vectors and returns the cosine of
 * the angle between them: dot(a, b) / (|a| * |b|).
 *
 * @param array $ngrams1 Gram counts of the first text.
 * @param array $ngrams2 Gram counts of the second text.
 * @return int|float 0 when either map is empty or has zero length as a
 *                   vector, otherwise the similarity as a float.
 */
private function _cosineSimilarity($ngrams1, $ngrams2)
{
    if (!$ngrams1 || !$ngrams2) {
        return 0;
    }

    $square = function ($count) {
        return $count * $count;
    };
    $squaredLength1 = array_sum(array_map($square, $ngrams1));
    $squaredLength2 = array_sum(array_map($square, $ngrams2));

    if ($squaredLength1 == 0 || $squaredLength2 == 0) {
        return 0;
    }

    // Only grams present in both maps contribute to the dot product.
    $shared = 0;
    foreach (array_intersect_key($ngrams1, $ngrams2) as $gram => $count) {
        $shared += $count * $ngrams2[$gram];
    }

    return $shared / (sqrt($squaredLength1) * sqrt($squaredLength2));
}
|
||
}
|