优化资源搜索结果的去重逻辑

This commit is contained in:
x1ao4 2025-08-26 17:20:34 +08:00
parent 54bcd0906a
commit 101abb0247
2 changed files with 100 additions and 17 deletions

View File

@ -981,26 +981,91 @@ def get_task_suggestions():
except Exception as e:
logging.warning(f"PanSou 搜索失败: {str(e)}")
# 去重:按 shareurl 优先,其次 taskname
# 去重并统一时间字段为 publish_date
# 规则:
# 1) shareurl 相同视为同一资源
# 2) 当 taskname 与 publish_date 同时完全一致时,也视为同一资源(即使 shareurl 不同)
dedup = []
seen = set()
seen_shareurls = set()
seen_title_date = set()
seen_fingerprints = set()
# 规范化工具
def normalize_shareurl(url: str) -> str:
    """Reduce a share link to a canonical key used for de-duplication.

    When the value contains a ``/s/<id>`` path segment (the form used by
    ``pan.quark.cn/s/<id>[?...]`` links), only ``<id>`` is returned. Otherwise
    the value is assumed to already be a bare ID and is returned with any
    ``?query`` or ``#fragment`` suffix removed, so that both paths agree on
    what counts as the same link.

    Args:
        url: Share URL or bare share ID. May be empty or ``None``.

    Returns:
        The normalized key, or ``""`` for empty input. A truthy non-string
        value is handed back unchanged (best effort, never raises).
    """
    if not url:
        return ""
    if not isinstance(url, str):
        # Best effort for unexpected types: keep the raw value as the key.
        return url
    u = url.strip()
    # The ID ends at the first '?', '/', '#' or whitespace after "/s/".
    match = re.search(r"/s/([^\?/#\s]+)", u)
    if match:
        return match.group(1)
    # No "/s/" segment: treat the value as a bare ID and drop query/fragment.
    return re.split(r"[?#]", u, maxsplit=1)[0]
def normalize_title(title: str) -> str:
    """Return a canonical form of a resource title for comparison.

    The title is NFKC-normalized (full-width characters fold to their
    half-width equivalents), every run of whitespace collapses to a single
    space, and leading/trailing whitespace is removed.

    Args:
        title: Raw title. May be empty or ``None``.

    Returns:
        The normalized title, or ``""`` for empty input. If normalization
        fails, the input is returned unchanged (``""`` when it is falsy).
    """
    try:
        if title:
            import unicodedata

            folded = unicodedata.normalize("NFKC", title)
            # NFKC already maps U+3000 to a plain space and \s covers tabs,
            # so a single whitespace collapse handles every separator.
            return re.sub(r"\s+", " ", folded).strip()
        return ""
    except Exception:
        return title or ""
def normalize_date(date_str: str) -> str:
    """Return a canonical form of a publish-date string for comparison.

    The value is NFKC-normalized and stripped of surrounding whitespace; it
    is not parsed or reformatted.

    Args:
        date_str: Raw date text. May be empty or ``None``.

    Returns:
        The normalized text, or ``""`` for empty input. If normalization
        fails, the stripped input is returned instead.
    """
    try:
        if date_str:
            import unicodedata

            return unicodedata.normalize("NFKC", date_str).strip()
        return ""
    except Exception:
        return (date_str or "").strip()
for item in merged:
if not isinstance(item, dict):
continue
key = item.get("shareurl") or item.get("taskname")
if not key:
# 统一时间字段:优先使用已存在的 publish_date,否则使用 datetime,并写回 publish_date
try:
if not item.get("publish_date") and item.get("datetime"):
item["publish_date"] = item.get("datetime")
except Exception:
pass
shareurl = normalize_shareurl(item.get("shareurl") or "")
title = normalize_title(item.get("taskname") or "")
pubdate = normalize_date(item.get("publish_date") or "")
source = (item.get("source") or "").strip()
# 条件1:按 shareurl 去重
if shareurl and shareurl in seen_shareurls:
continue
if key in seen:
# 条件2:标题 + 发布时间 同时一致则判定为同一资源
title_date_key = f"{title}||{pubdate}" if title and pubdate else None
if title_date_key and title_date_key in seen_title_date:
continue
seen.add(key)
# 条件3:完整指纹键(shareurl+title+date+source)去重,兜底完全相同的重复项
fingerprint = f"{shareurl}|{title}|{pubdate}|{source}"
if fingerprint in seen_fingerprints:
continue
# 记录已见键并保留该条
if shareurl:
seen_shareurls.add(shareurl)
if title_date_key:
seen_title_date.add(title_date_key)
seen_fingerprints.add(fingerprint)
dedup.append(item)
# 全局时间排序:所有来源的结果混合排序,按时间倒序(最新的在前)
if dedup:
def parse_datetime_for_sort(item):
"""解析时间字段,返回可比较的时间戳"""
# 兼容两个字段名publish_date 和 datetime
datetime_str = item.get("publish_date") or item.get("datetime")
"""解析时间字段,返回可比较的时间戳(统一以 publish_date 为准)"""
datetime_str = item.get("publish_date")
if not datetime_str:
return 0 # 没有时间的排在最后
try:

View File

@ -1017,16 +1017,16 @@
<i class="spinner-border spinner-border-sm" role="status" aria-hidden="true"></i>
正在验证链接有效性...{{ smart_param.validateProgress.current }}/{{ smart_param.validateProgress.total }}<span v-if="smart_param.validateProgress.valid > 0">已找到 {{ smart_param.validateProgress.valid }} 个有效链接</span>
</span>
<span v-else>正在搜索...</span>
<span v-else>正在搜索资源...</span>
</div>
<div class="dropdown-item text-muted" v-else style="font-size:14px; padding-left: 8px; text-align: left;">
{{ smart_param.taskSuggestions.message ? smart_param.taskSuggestions.message : smart_param.taskSuggestions.data && smart_param.taskSuggestions.data.length ? `以下资源由 ${(smart_param.taskSuggestions.source || '').replace(/,\s*/g, '、')} 搜索提供(仅显示有效链接,共 ${(smart_param.taskSuggestions.data || []).length} 个),如有侵权请联系资源发布方` : "未搜索到有效资源" }}
</div>
<div v-for="suggestion in smart_param.taskSuggestions.data || []" :key="suggestion.taskname" class="dropdown-item cursor-pointer" @click.prevent="selectSuggestion(index, suggestion)" style="font-size: 14px;" :title="suggestion.content">
<div v-for="suggestion in smart_param.taskSuggestions.data || []" :key="suggestion.taskname" class="dropdown-item cursor-pointer" @click.prevent="selectSuggestion(index, suggestion)" style="font-size: 14px;" :title="getSuggestionHoverTitle(suggestion)">
<span v-html="suggestion.verify ? '✅': ''"></span> {{ suggestion.taskname }}
<small class="text-muted">
<a :href="suggestion.shareurl" target="_blank" @click.stop> · {{ suggestion.shareurl.replace(/^https?:\/\/pan\.quark\.cn\/s\//, '') }}</a>
<template v-if="suggestion.source"><span class="source-badge" :class="suggestion.source.toLowerCase()" :data-publish-date="(suggestion.publish_date || suggestion.datetime) ? ' · ' + (suggestion.publish_date || suggestion.datetime) : ''">{{ suggestion.source }}</span></template>
<template v-if="suggestion.source"><span class="source-badge" :class="suggestion.source.toLowerCase()" :data-publish-date="suggestion.publish_date ? ' · ' + suggestion.publish_date : ''">{{ suggestion.source }}</span></template>
</small>
</div>
</div>
@ -1945,16 +1945,16 @@
<i class="spinner-border spinner-border-sm" role="status" aria-hidden="true"></i>
正在验证链接有效性...{{ smart_param.validateProgress.current }}/{{ smart_param.validateProgress.total }}<span v-if="smart_param.validateProgress.valid > 0">已找到 {{ smart_param.validateProgress.valid }} 个有效链接</span>
</span>
<span v-else>正在搜索...</span>
<span v-else>正在搜索资源...</span>
</div>
<div class="dropdown-item text-muted" v-else style="font-size:14px; padding-left: 8px; text-align: left;">
{{ smart_param.taskSuggestions.message ? smart_param.taskSuggestions.message : smart_param.taskSuggestions.data && smart_param.taskSuggestions.data.length ? `以下资源由 ${(smart_param.taskSuggestions.source || '').replace(/,\s*/g, '、')} 搜索提供(仅显示有效链接,共 ${(smart_param.taskSuggestions.data || []).length} 个),如有侵权请联系资源发布方` : "未搜索到有效资源" }}
</div>
<div v-for="suggestion in smart_param.taskSuggestions.data || []" :key="suggestion.taskname" class="dropdown-item cursor-pointer" @click.prevent="selectSuggestion(-1, suggestion)" style="font-size: 14px;" :title="suggestion.content">
<div v-for="suggestion in smart_param.taskSuggestions.data || []" :key="suggestion.taskname" class="dropdown-item cursor-pointer" @click.prevent="selectSuggestion(-1, suggestion)" style="font-size: 14px;" :title="getSuggestionHoverTitle(suggestion)">
<span v-html="suggestion.verify ? '✅': ''"></span> {{ suggestion.taskname }}
<small class="text-muted">
<a :href="suggestion.shareurl" target="_blank" @click.stop> · {{ suggestion.shareurl.replace(/^https?:\/\/pan\.quark\.cn\/s\//, '') }}</a>
<template v-if="suggestion.source"><span class="source-badge" :class="suggestion.source.toLowerCase()" :data-publish-date="(suggestion.publish_date || suggestion.datetime) ? ' · ' + (suggestion.publish_date || suggestion.datetime) : ''">{{ suggestion.source }}</span></template>
<template v-if="suggestion.source"><span class="source-badge" :class="suggestion.source.toLowerCase()" :data-publish-date="suggestion.publish_date ? ' · ' + suggestion.publish_date : ''">{{ suggestion.source }}</span></template>
</small>
</div>
</div>
@ -2758,6 +2758,24 @@
document.removeEventListener('click', this.handleOutsideClick);
},
methods: {
// 仅当有有效信息时返回悬停提示否则返回null以不显示
getSuggestionHoverTitle(suggestion) {
if (!suggestion) return null;
let content = (suggestion.content || '').trim();
if (!content) return null;
// 统一标点为英文冒号,统一逗号
const normalized = content
.replace(//g, ':')
.replace(//g, ',')
.replace(/\s+/g, ' ')
.trim();
// 仅在明确的占位文本时隐藏:
// 1) 全文就是“大小:-”
if (/^大小\s*:\s*-$/i.test(normalized)) return null;
// 2) 完全匹配“类别:xx, 文件类型:yy, 大小:-”这类占位
if (/^类别\s*:[^,]*,\s*文件类型\s*:[^,]*,\s*大小\s*:\s*-$/i.test(normalized)) return null;
return content;
},
// 获取插件展示名称(支持别名,仅用于 WebUI 显示)
getPluginDisplayName(pluginName) {
return this.pluginDisplayAliases[pluginName] || pluginName;
@ -4312,7 +4330,7 @@
// 解析时间用于排序(降序:最新在前)
const getItemTs = (item) => {
const raw = item.publish_date || item.datetime || '';
const raw = item.publish_date || '';
const ts = Date.parse(raw);
return isNaN(ts) ? 0 : ts;
};
@ -4445,7 +4463,7 @@
// 结束前做一次排序,确保最终顺序正确
const getItemTs = (item) => {
const raw = item.publish_date || item.datetime || '';
const raw = item.publish_date || '';
const ts = Date.parse(raw);
return isNaN(ts) ? 0 : ts;
};