mirror of
https://github.com/Cp0204/quark-auto-save.git
synced 2026-01-12 15:20:44 +08:00
优化资源搜索结果的去重逻辑
This commit is contained in:
parent
54bcd0906a
commit
101abb0247
83
app/run.py
83
app/run.py
@ -981,26 +981,91 @@ def get_task_suggestions():
|
||||
except Exception as e:
|
||||
logging.warning(f"PanSou 搜索失败: {str(e)}")
|
||||
|
||||
# 去重(按shareurl优先,其次taskname)
|
||||
# 去重并统一时间字段为 publish_date
|
||||
# 规则:
|
||||
# 1) shareurl 相同视为同一资源
|
||||
# 2) 当 taskname 与 publish_date 同时完全一致时,也视为同一资源(即使 shareurl 不同)
|
||||
dedup = []
|
||||
seen = set()
|
||||
seen_shareurls = set()
|
||||
seen_title_date = set()
|
||||
seen_fingerprints = set()
|
||||
# 规范化工具
|
||||
def normalize_shareurl(url: str) -> str:
    """Reduce a share link to a canonical key used for de-duplication.

    For links that contain a ``/s/<id>`` path segment the ``<id>`` part is
    returned, so the same share reached through different query strings or
    fragments maps to one key. Input without such a segment (for example a
    bare share ID) is returned with any ``?query`` or ``#fragment`` removed.

    Args:
        url: Share link or bare share ID. Falsy values yield ``""``;
            non-string values are converted with ``str()``.

    Returns:
        The normalized key, always a ``str``.
    """
    if not url:
        return ""
    if not isinstance(url, str):
        # Keep the declared ``str`` contract so the result is always hashable
        # and comparable with other normalized keys.
        url = str(url)
    cleaned = url.strip()
    # The share ID is the path segment right after "/s/", up to the next
    # "?", "/", "#" or whitespace.
    match = re.search(r"/s/([^\?/#\s]+)", cleaned)
    if match:
        return match.group(1)
    # No "/s/" segment: treat the value as a bare ID and drop both the query
    # string and the fragment (e.g. "abc#/list/share" -> "abc").
    return re.split(r"[?#]", cleaned, maxsplit=1)[0]
|
||||
def normalize_title(title: str) -> str:
    """Normalize a resource title so visually identical titles compare equal.

    The title is NFKC-normalized (full-width characters, including the
    ideographic space, become their ASCII counterparts), every run of
    whitespace is collapsed to a single space, and leading/trailing
    whitespace is removed.

    Args:
        title: Raw title. Falsy values yield ``""``; non-string values are
            converted with ``str()``.

    Returns:
        The normalized title, always a ``str``.
    """
    if not title:
        return ""
    import unicodedata

    text = unicodedata.normalize("NFKC", str(title))
    # split() with no argument splits on any whitespace run and drops empty
    # leading/trailing pieces, so join() both collapses and trims.
    return " ".join(text.split())
|
||||
def normalize_date(date_str: str) -> str:
    """Normalize a publish-date value for use in de-duplication keys.

    The value is NFKC-normalized (full-width digits and punctuation become
    ASCII) and stripped of surrounding whitespace. No date parsing is done.

    Args:
        date_str: Raw date value. Falsy values yield ``""``; non-string
            values (for example a numeric timestamp) are converted with
            ``str()`` instead of raising.

    Returns:
        The normalized date text, always a ``str``.
    """
    if not date_str:
        return ""
    import unicodedata

    # str() guards against non-string inputs, which would otherwise raise
    # inside unicodedata.normalize().
    return unicodedata.normalize("NFKC", str(date_str)).strip()
|
||||
for item in merged:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
key = item.get("shareurl") or item.get("taskname")
|
||||
if not key:
|
||||
# 统一时间字段:优先使用已存在的 publish_date,否则使用 datetime,并写回 publish_date
|
||||
try:
|
||||
if not item.get("publish_date") and item.get("datetime"):
|
||||
item["publish_date"] = item.get("datetime")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
shareurl = normalize_shareurl(item.get("shareurl") or "")
|
||||
title = normalize_title(item.get("taskname") or "")
|
||||
pubdate = normalize_date(item.get("publish_date") or "")
|
||||
source = (item.get("source") or "").strip()
|
||||
|
||||
# 条件1:按 shareurl 去重
|
||||
if shareurl and shareurl in seen_shareurls:
|
||||
continue
|
||||
if key in seen:
|
||||
|
||||
# 条件2:标题 + 发布时间 同时一致则判定为同一资源
|
||||
title_date_key = f"{title}||{pubdate}" if title and pubdate else None
|
||||
if title_date_key and title_date_key in seen_title_date:
|
||||
continue
|
||||
seen.add(key)
|
||||
|
||||
# 条件3:完整指纹键(shareurl+title+date+source)去重,兜底完全相同的重复项
|
||||
fingerprint = f"{shareurl}|{title}|{pubdate}|{source}"
|
||||
if fingerprint in seen_fingerprints:
|
||||
continue
|
||||
|
||||
# 记录已见键并保留该条
|
||||
if shareurl:
|
||||
seen_shareurls.add(shareurl)
|
||||
if title_date_key:
|
||||
seen_title_date.add(title_date_key)
|
||||
seen_fingerprints.add(fingerprint)
|
||||
dedup.append(item)
|
||||
|
||||
# 全局时间排序:所有来源的结果混合排序,按时间倒序(最新的在前)
|
||||
if dedup:
|
||||
def parse_datetime_for_sort(item):
|
||||
"""解析时间字段,返回可比较的时间戳"""
|
||||
# 兼容两个字段名:publish_date 和 datetime
|
||||
datetime_str = item.get("publish_date") or item.get("datetime")
|
||||
"""解析时间字段,返回可比较的时间戳(统一以 publish_date 为准)"""
|
||||
datetime_str = item.get("publish_date")
|
||||
if not datetime_str:
|
||||
return 0 # 没有时间的排在最后
|
||||
try:
|
||||
|
||||
@ -1017,16 +1017,16 @@
|
||||
<i class="spinner-border spinner-border-sm" role="status" aria-hidden="true"></i>
|
||||
正在验证链接有效性...({{ smart_param.validateProgress.current }}/{{ smart_param.validateProgress.total }})<span v-if="smart_param.validateProgress.valid > 0">已找到 {{ smart_param.validateProgress.valid }} 个有效链接</span>
|
||||
</span>
|
||||
<span v-else>正在搜索中...</span>
|
||||
<span v-else>正在搜索资源...</span>
|
||||
</div>
|
||||
<div class="dropdown-item text-muted" v-else style="font-size:14px; padding-left: 8px; text-align: left;">
|
||||
{{ smart_param.taskSuggestions.message ? smart_param.taskSuggestions.message : smart_param.taskSuggestions.data && smart_param.taskSuggestions.data.length ? `以下资源由 ${(smart_param.taskSuggestions.source || '').replace(/,\s*/g, '、')} 搜索提供(仅显示有效链接,共 ${(smart_param.taskSuggestions.data || []).length} 个),如有侵权请联系资源发布方` : "未搜索到有效资源" }}
|
||||
</div>
|
||||
<div v-for="suggestion in smart_param.taskSuggestions.data || []" :key="suggestion.taskname" class="dropdown-item cursor-pointer" @click.prevent="selectSuggestion(index, suggestion)" style="font-size: 14px;" :title="suggestion.content">
|
||||
<div v-for="suggestion in smart_param.taskSuggestions.data || []" :key="suggestion.taskname" class="dropdown-item cursor-pointer" @click.prevent="selectSuggestion(index, suggestion)" style="font-size: 14px;" :title="getSuggestionHoverTitle(suggestion)">
|
||||
<span v-html="suggestion.verify ? '✅': ''"></span> {{ suggestion.taskname }}
|
||||
<small class="text-muted">
|
||||
<a :href="suggestion.shareurl" target="_blank" @click.stop> · {{ suggestion.shareurl.replace(/^https?:\/\/pan\.quark\.cn\/s\//, '') }}</a>
|
||||
<template v-if="suggestion.source"><span class="source-badge" :class="suggestion.source.toLowerCase()" :data-publish-date="(suggestion.publish_date || suggestion.datetime) ? ' · ' + (suggestion.publish_date || suggestion.datetime) : ''">{{ suggestion.source }}</span></template>
|
||||
<template v-if="suggestion.source"><span class="source-badge" :class="suggestion.source.toLowerCase()" :data-publish-date="suggestion.publish_date ? ' · ' + suggestion.publish_date : ''">{{ suggestion.source }}</span></template>
|
||||
</small>
|
||||
</div>
|
||||
</div>
|
||||
@ -1945,16 +1945,16 @@
|
||||
<i class="spinner-border spinner-border-sm" role="status" aria-hidden="true"></i>
|
||||
正在验证链接有效性...({{ smart_param.validateProgress.current }}/{{ smart_param.validateProgress.total }})<span v-if="smart_param.validateProgress.valid > 0">已找到 {{ smart_param.validateProgress.valid }} 个有效链接</span>
|
||||
</span>
|
||||
<span v-else>正在搜索中...</span>
|
||||
<span v-else>正在搜索资源...</span>
|
||||
</div>
|
||||
<div class="dropdown-item text-muted" v-else style="font-size:14px; padding-left: 8px; text-align: left;">
|
||||
{{ smart_param.taskSuggestions.message ? smart_param.taskSuggestions.message : smart_param.taskSuggestions.data && smart_param.taskSuggestions.data.length ? `以下资源由 ${(smart_param.taskSuggestions.source || '').replace(/,\s*/g, '、')} 搜索提供(仅显示有效链接,共 ${(smart_param.taskSuggestions.data || []).length} 个),如有侵权请联系资源发布方` : "未搜索到有效资源" }}
|
||||
</div>
|
||||
<div v-for="suggestion in smart_param.taskSuggestions.data || []" :key="suggestion.taskname" class="dropdown-item cursor-pointer" @click.prevent="selectSuggestion(-1, suggestion)" style="font-size: 14px;" :title="suggestion.content">
|
||||
<div v-for="suggestion in smart_param.taskSuggestions.data || []" :key="suggestion.taskname" class="dropdown-item cursor-pointer" @click.prevent="selectSuggestion(-1, suggestion)" style="font-size: 14px;" :title="getSuggestionHoverTitle(suggestion)">
|
||||
<span v-html="suggestion.verify ? '✅': ''"></span> {{ suggestion.taskname }}
|
||||
<small class="text-muted">
|
||||
<a :href="suggestion.shareurl" target="_blank" @click.stop> · {{ suggestion.shareurl.replace(/^https?:\/\/pan\.quark\.cn\/s\//, '') }}</a>
|
||||
<template v-if="suggestion.source"><span class="source-badge" :class="suggestion.source.toLowerCase()" :data-publish-date="(suggestion.publish_date || suggestion.datetime) ? ' · ' + (suggestion.publish_date || suggestion.datetime) : ''">{{ suggestion.source }}</span></template>
|
||||
<template v-if="suggestion.source"><span class="source-badge" :class="suggestion.source.toLowerCase()" :data-publish-date="suggestion.publish_date ? ' · ' + suggestion.publish_date : ''">{{ suggestion.source }}</span></template>
|
||||
</small>
|
||||
</div>
|
||||
</div>
|
||||
@ -2758,6 +2758,24 @@
|
||||
document.removeEventListener('click', this.handleOutsideClick);
|
||||
},
|
||||
methods: {
|
||||
// 仅当有有效信息时返回悬停提示,否则返回null以不显示
|
||||
getSuggestionHoverTitle(suggestion) {
|
||||
if (!suggestion) return null;
|
||||
let content = (suggestion.content || '').trim();
|
||||
if (!content) return null;
|
||||
// 统一标点为英文冒号,统一逗号
|
||||
const normalized = content
|
||||
.replace(/:/g, ':')
|
||||
.replace(/,/g, ',')
|
||||
.replace(/\s+/g, ' ')
|
||||
.trim();
|
||||
// 仅在明确的占位文本时隐藏:
|
||||
// 1) 全文就是“大小:-”
|
||||
if (/^大小\s*:\s*-$/i.test(normalized)) return null;
|
||||
// 2) 完全匹配“类别:xx, 文件类型:yy, 大小:-”这类占位
|
||||
if (/^类别\s*:[^,]*,\s*文件类型\s*:[^,]*,\s*大小\s*:\s*-$/i.test(normalized)) return null;
|
||||
return content;
|
||||
},
|
||||
// 获取插件展示名称(支持别名,仅用于WebUI显示)
|
||||
getPluginDisplayName(pluginName) {
|
||||
return this.pluginDisplayAliases[pluginName] || pluginName;
|
||||
@ -4312,7 +4330,7 @@
|
||||
|
||||
// 解析时间用于排序(降序:最新在前)
|
||||
// Convert an item's time field into a millisecond timestamp for sorting.
// Items with a missing or unparseable time get 0.
const getItemTs = (item) => {
  // Single declaration: prefer publish_date and fall back to datetime for
  // items that only carry that field.
  const raw = item.publish_date || item.datetime || '';
  const ts = Date.parse(raw);
  return isNaN(ts) ? 0 : ts;
};
|
||||
@ -4445,7 +4463,7 @@
|
||||
|
||||
// 结束前做一次排序,确保最终顺序正确
|
||||
// Convert an item's time field into a millisecond timestamp for sorting.
// Items with a missing or unparseable time get 0.
const getItemTs = (item) => {
  // Single declaration: prefer publish_date and fall back to datetime for
  // items that only carry that field.
  const raw = item.publish_date || item.datetime || '';
  const ts = Date.parse(raw);
  return isNaN(ts) ? 0 : ts;
};
|
||||
|
||||
Loading…
Reference in New Issue
Block a user