Mirror of https://github.com/Cp0204/quark-auto-save.git
Synced 2026-01-14 00:10:43 +08:00
Refine deduplication of resource search results: when merging duplicates, keep the newest record
- PanSou: merge by shareurl, keeping the record with the newest publish_date
- CloudSaver: merge by shareurl, keeping the record with the newest datetime
- Aggregation layer (/task_suggestions):
  - First pass: merge by shareurl, keeping the newest; fall back to a full fingerprint only when no link is present
  - Second pass: merge by "normalized title + timestamp" (tolerant of several time formats)
  - Same-second ties: prefer CloudSaver, then the record with richer content
  - Results are still displayed in descending publish_date order
This commit is contained in:
parent 8304d8e8fd
commit 3ccaeeae15
app/run.py · 109 changed lines
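At every layer the change applies the same pattern: fold records into a dict keyed by the merge key and keep the entry with the newest timestamp. A minimal standalone sketch of that pattern (the helper names parse_ts and keep_newest are illustrative, not from the codebase):

from datetime import datetime

def parse_ts(s):
    # Best-effort parse mirroring the diff's to_ts: unparseable values sort oldest (0).
    s = str(s or "").strip()
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
        try:
            return datetime.strptime(s, fmt).timestamp()
        except ValueError:
            pass
    try:
        return datetime.fromisoformat(s.replace('Z', '+00:00')).timestamp()
    except ValueError:
        return 0

def keep_newest(records, key_fn, ts_fn):
    # Merge by key_fn, keeping the record with the largest ts_fn value.
    merged = {}
    for rec in records:
        k = key_fn(rec)
        if k not in merged or ts_fn(rec) > ts_fn(merged[k]):
            merged[k] = rec
    return list(merged.values())

# e.g. keep_newest(results, lambda r: r["shareurl"], lambda r: parse_ts(r.get("publish_date")))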
@@ -987,12 +987,12 @@ def get_task_suggestions():
     # Deduplicate and unify the time field as publish_date
-    # Rules:
-    # 1) Records with the same shareurl are treated as the same resource
-    # 2) Records whose taskname and publish_date are both identical are also treated as the same resource (even if shareurl differs)
-    dedup = []
-    seen_shareurls = set()
-    seen_title_date = set()
-    seen_fingerprints = set()
+    # 1) First pass: merge by shareurl only; for the same link, keep the record with the newest publish date (that record is the one displayed)
+    # 2) Fallback (rare): when no link is present, merge by the full fingerprint (shareurl|title|date|source)
+    # 3) Second pass: merge all candidates again by title + publish date (whether or not shareurl matches), keeping the newest
+    # Note: on any merge conflict, always keep the record with the newest publish date
+    dedup_map = {}  # merged by shareurl
+    fingerprint_map = {}  # fallback: merged by full fingerprint (only when the link is missing)
     # Normalization helpers
     def normalize_shareurl(url: str) -> str:
         try:
@@ -1028,6 +1028,29 @@ def get_task_suggestions():
             return ds
         except Exception:
             return (date_str or "").strip()
+    # Parse times for comparison
+    def to_ts(datetime_str):
+        if not datetime_str:
+            return 0
+        try:
+            s = str(datetime_str).strip()
+            from datetime import datetime
+            try:
+                return datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timestamp()
+            except Exception:
+                pass
+            try:
+                return datetime.strptime(s, "%Y-%m-%d").timestamp()
+            except Exception:
+                pass
+            try:
+                s2 = s.replace('Z', '+00:00')
+                return datetime.fromisoformat(s2).timestamp()
+            except Exception:
+                return 0
+        except Exception:
+            return 0
+
     for item in merged:
         if not isinstance(item, dict):
             continue
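For reference, the to_ts fallback chain above accepts three shapes; anything else compares as 0, i.e. oldest. Illustrative calls:

to_ts("2026-01-10 08:30:00")   # plain datetime
to_ts("2026-01-10")            # date only, parsed as midnight
to_ts("2026-01-10T08:30:00Z")  # ISO 8601; 'Z' is rewritten to '+00:00' first
to_ts("last week")             # unparseable, returns 0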
@@ -1043,27 +1066,63 @@ def get_task_suggestions():
         pubdate = normalize_date(item.get("publish_date") or "")
         source = (item.get("source") or "").strip()

-        # Condition 1: dedupe by shareurl
-        if shareurl and shareurl in seen_shareurls:
-            continue
+        timestamp = to_ts(pubdate)

-        # Condition 2: identical title + publish date is treated as the same resource
-        title_date_key = f"{title}||{pubdate}" if title and pubdate else None
-        if title_date_key and title_date_key in seen_title_date:
-            continue
-
-        # Condition 3: full fingerprint key (shareurl+title+date+source), a fallback against exact duplicates
-        fingerprint = f"{shareurl}|{title}|{pubdate}|{source}"
-        if fingerprint in seen_fingerprints:
-            continue
-
-        # Record the seen keys and keep this entry
+        # Condition 1: merge by shareurl, keeping the newest
         if shareurl:
-            seen_shareurls.add(shareurl)
-        if title_date_key:
-            seen_title_date.add(title_date_key)
-        seen_fingerprints.add(fingerprint)
-        dedup.append(item)
+            existed = dedup_map.get(shareurl)
+            if not existed or to_ts(existed.get("publish_date")) < timestamp:
+                dedup_map[shareurl] = item
+        else:
+            # Condition 2 (fallback): merge by full fingerprint (rare), still keeping the newest
+            fingerprint = f"{shareurl}|{title}|{pubdate}|{source}"
+            existed = fingerprint_map.get(fingerprint)
+            if not existed or to_ts(existed.get("publish_date")) < timestamp:
+                fingerprint_map[fingerprint] = item
+
+    # First pass: collect the merged candidates
+    candidates = list(dedup_map.values()) + list(fingerprint_map.values())
+
+    # Second pass: merge once more by title + publish date, whether or not shareurl matches (the timestamp is the key so different time formats compare equal), keeping the newest
+    final_map = {}
+    for item in candidates:
+        try:
+            t = normalize_title(item.get("taskname") or "")
+            d = normalize_date(item.get("publish_date") or "")
+            s = normalize_shareurl(item.get("shareurl") or "")
+            src = (item.get("source") or "").strip()
+            # Prefer title + time as the merge key
+            ts_val = to_ts(d)
+            if t and ts_val:
+                key = f"TD::{t}||{int(ts_val)}"
+            elif s:
+                key = f"URL::{s}"
+            else:
+                key = f"FP::{s}|{t}|{d}|{src}"
+            existed = final_map.get(key)
+            current_ts = to_ts(item.get("publish_date"))
+            if not existed:
+                final_map[key] = item
+            else:
+                existed_ts = to_ts(existed.get("publish_date"))
+                if current_ts > existed_ts:
+                    final_map[key] = item
+                elif current_ts == existed_ts:
+                    # Identical times: break the tie deterministically
+                    source_priority = {"CloudSaver": 2, "PanSou": 1}
+                    existed_pri = source_priority.get((existed.get("source") or "").strip(), 0)
+                    current_pri = source_priority.get(src, 0)
+                    if current_pri > existed_pri:
+                        final_map[key] = item
+                    elif current_pri == existed_pri:
+                        # Same priority: prefer the richer record (longer content)
+                        if len(str(item.get("content") or "")) > len(str(existed.get("content") or "")):
+                            final_map[key] = item
+        except Exception:
+            # Skip the item on any error
+            continue
+
+    dedup = list(final_map.values())
+
     # Parse multiple formats only when sorting (prefer YYYY-MM-DD HH:mm:ss, then ISO)
     if dedup:
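A worked example of the second-pass tie-break, assuming normalize_title maps both titles to the same string (the records below are hypothetical):

a = {"taskname": "Show S01", "publish_date": "2026-01-10 08:00:00", "source": "PanSou", "content": "short"}
b = {"taskname": "Show S01", "publish_date": "2026-01-10T08:00:00", "source": "CloudSaver", "content": "a longer description"}
# Both resolve to the same "TD::<title>||<timestamp>" key because the two date formats
# parse to the same second; b is kept since CloudSaver (priority 2) beats PanSou (priority 1).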
@@ -142,24 +142,63 @@ class CloudSaver:
                 # Get the publish time, matching the original implementation
                 pubdate_iso = item.get("pubDate", "")  # raw time string (may be ISO or already Beijing time)
                 pubdate = pubdate_iso  # no timezone conversion; keep the source's original time
-                # Dedupe by link
-                if link.get("link") not in link_array:
-                    link_array.append(link.get("link"))
-                    clean_results.append(
-                        {
-                            "shareurl": link.get("link"),
-                            "taskname": title,
-                            "content": content,
-                            "datetime": pubdate,  # time used for display
-                            "tags": item.get("tags", []),
-                            "channel": item.get("channelId", ""),
-                            "source": "CloudSaver"
-                        }
-                    )
+                # Collect results (no dedup here; merged once at the end, keeping the newest)
+                clean_results.append(
+                    {
+                        "shareurl": link.get("link"),
+                        "taskname": title,
+                        "content": content,
+                        "datetime": pubdate,  # time used for display
+                        "tags": item.get("tags", []),
+                        "channel": item.get("channelId", ""),
+                        "source": "CloudSaver"
+                    }
+                )
+
+        # Dedup: merge by shareurl, keeping the record with the newest publish time
+        def to_ts(date_str: str) -> float:
+            if not date_str:
+                return 0
+            try:
+                s = str(date_str).strip()
+                from datetime import datetime
+                try:
+                    return datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timestamp()
+                except Exception:
+                    pass
+                try:
+                    return datetime.strptime(s, "%Y-%m-%d").timestamp()
+                except Exception:
+                    pass
+                try:
+                    s2 = s.replace('Z', '+00:00')
+                    return datetime.fromisoformat(s2).timestamp()
+                except Exception:
+                    return 0
+            except Exception:
+                return 0
+
+        by_url = {}
+        for item in clean_results:
+            try:
+                url = item.get("shareurl", "")
+                if not url:
+                    continue
+                existed = by_url.get(url)
+                if not existed:
+                    by_url[url] = item
+                else:
+                    # Compare datetime (CloudSaver's cleaned time field is named datetime)
+                    if to_ts(item.get("datetime")) > to_ts(existed.get("datetime")):
+                        by_url[url] = item
+            except Exception:
+                continue
+
+        unique_results = list(by_url.values())
+
         # Note: sorting has moved to the global layer; no internal sorting here
-        # Return results in their original order; the global sort handles ordering
-        return clean_results
+        # Return the merged results; the global sort handles ordering
+        return unique_results

 # Test example
 if __name__ == "__main__":
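Note that each layer compares a different field: CloudSaver merges on datetime while PanSou and the aggregation layer merge on publish_date, so the shareurl merge loop is duplicated with only the field name changed. A sketch of how the shared pattern could be parameterized (keep_newest_by_url is a hypothetical helper, not in the diff; it assumes a to_ts like the one above is in scope):

def keep_newest_by_url(records, ts_field):
    # Merge by shareurl, keeping the record whose ts_field parses newest.
    by_url = {}
    for rec in records:
        url = rec.get("shareurl", "")
        if not url:
            continue
        kept = by_url.get(url)
        if not kept or to_ts(rec.get(ts_field)) > to_ts(kept.get(ts_field)):
            by_url[url] = rec
    return list(by_url.values())

# CloudSaver: unique_results = keep_newest_by_url(clean_results, "datetime")
# PanSou:     unique_results = keep_newest_by_url(cleaned, "publish_date")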
@@ -156,13 +156,44 @@ class PanSou:
         if not cleaned:
             return {"success": False, "message": "PanSou搜索无夸克网盘结果"}

-        # Dedup: by shareurl
-        seen_urls = set()
-        unique_results = []
-        for item in cleaned:
-            url = item.get("shareurl", "")
-            if url and url not in seen_urls:
-                seen_urls.add(url)
-                unique_results.append(item)
+        # Dedup: merge by shareurl, keeping the record with the newest publish time
+        def to_ts(date_str: str) -> float:
+            if not date_str:
+                return 0
+            try:
+                s = str(date_str).strip()
+                from datetime import datetime
+                try:
+                    return datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timestamp()
+                except Exception:
+                    pass
+                try:
+                    return datetime.strptime(s, "%Y-%m-%d").timestamp()
+                except Exception:
+                    pass
+                try:
+                    s2 = s.replace('Z', '+00:00')
+                    return datetime.fromisoformat(s2).timestamp()
+                except Exception:
+                    return 0
+            except Exception:
+                return 0
+
+        by_url = {}
+        for item in cleaned:
+            try:
+                url = item.get("shareurl", "")
+                if not url:
+                    continue
+                existed = by_url.get(url)
+                if not existed:
+                    by_url[url] = item
+                else:
+                    # Compare publish_date (missing values count as 0)
+                    if to_ts(item.get("publish_date")) > to_ts(existed.get("publish_date")):
+                        by_url[url] = item
+            except Exception:
+                continue
+
+        unique_results = list(by_url.values())
         return {"success": True, "data": unique_results}