fix(ai): remove individual enrichment bottleneck and implement massive 429 prevention

This commit is contained in:
Nubenetes Bot
2026-05-16 23:01:47 +02:00
parent 01eb50a6f8
commit a1e95bb210

View File

@@ -161,9 +161,6 @@ class IntelligentLinkCleaner:
original_idx = attempt if best_strat_idx is None else (best_strat_idx if attempt == 0 else (attempt if attempt < best_strat_idx else attempt))
self.learning_data["domains"][domain]["best_strategy_idx"] = original_idx
# Enrichment: If link is ALIVE but lacks description in V1, generate one
await self._enrich_description_if_needed(url)
if "link_cache" not in self.learning_data: self.learning_data["link_cache"] = {}
self.learning_data["link_cache"][url] = {"status": "ALIVE", "last_checked": now}
@@ -193,60 +190,6 @@ class IntelligentLinkCleaner:
except: pass
return url, True, None, "Conservative Keep"
async def _enrich_description_if_needed(self, url: str):
"""
Policy Update: We NO LONGER enrich existing V1 descriptions in the repo
to respect manual curation. However, we ensure the INVENTORY has a
description for the V2 Elite portal.
"""
norm_url = normalize_url(url)
if norm_url not in self.inventory: self.inventory[norm_url] = {}
force_full = os.getenv("FORCE_FULL_CHECK", "false").lower() == "true"
# If inventory already has a description, we are done UNLESS forcing full
if self.inventory[norm_url].get("ai_summary") and not force_full:
# Still fetch GH metadata if missing
if "github.com" in url and ("stars" not in self.inventory[norm_url] or force_full):
gh_meta = await self._fetch_github_metadata(url)
if gh_meta:
self.inventory[norm_url]["stars"] = min(5, gh_meta.get("stars", 0) // 100) # Simple rating
self.inventory[norm_url]["gh_stars"] = gh_meta.get("stars", 0)
self.inventory[norm_url]["last_commit"] = gh_meta.get("pushed_at", "")
return
async with self.ai_semaphore:
log_event(f" [✨] INVENTORY: Generating summary for {url} (V2 Only)")
try:
# 1. Fetch GitHub metadata if applicable
if "github.com" in url:
gh_meta = await self._fetch_github_metadata(url)
if gh_meta:
self.inventory[norm_url]["stars"] = min(5, gh_meta.get("stars", 0) // 100)
self.inventory[norm_url]["gh_stars"] = gh_meta.get("stars", 0)
self.inventory[norm_url]["last_commit"] = gh_meta.get("pushed_at", "")
# 2. AI Content Analysis
from src.agentic_curator import _deep_fetch_content
from src.gemini_utils import call_gemini_with_retry
web_content = await _deep_fetch_content(url)
if not web_content: return
prompt = (
f"Analyze this resource: {url}\n"
f"Technical Content Snippet: {web_content[:1500]}\n"
"Provide: 1) A professional 1-sentence English description. 2) The precise original PUBLICATION DATE (YYYY-MM-DD or YYYY if possible).\n"
'Format: JSON: {"desc": "...", "pub_date": "..."}'
)
ai_data = await call_gemini_with_retry(prompt, prefer_flash=True)
if ai_data:
res_desc = ai_data.get("desc", "").strip()
self.inventory[norm_url]["ai_summary"] = res_desc
self.inventory[norm_url]["pub_date"] = str(ai_data.get("pub_date", "N/A"))
self.stats["enriched_descriptions"] += 1
log_event(f" [OK] Cached for V2: {res_desc[:50]}...")
except Exception as e:
log_event(f" [!] Enrichment error: {e}")
async def _check_url_logic(self, url: str, strategy: Dict) -> Tuple[bool, str, Optional[str]]:
# RESILIENT LOGIC: Mimic user behavior and handle blocks gracefully
headers = {