mirror of
https://github.com/nubenetes/awesome-kubernetes.git
synced 2026-05-22 00:53:37 +00:00
fix(ai): remove individual enrichment bottleneck and implement massive 429 prevention
This commit is contained in:
@@ -161,9 +161,6 @@ class IntelligentLinkCleaner:
|
||||
original_idx = attempt if best_strat_idx is None else (best_strat_idx if attempt == 0 else (attempt if attempt < best_strat_idx else attempt))
|
||||
self.learning_data["domains"][domain]["best_strategy_idx"] = original_idx
|
||||
|
||||
# Enrichment: If link is ALIVE but lacks description in V1, generate one
|
||||
await self._enrich_description_if_needed(url)
|
||||
|
||||
if "link_cache" not in self.learning_data: self.learning_data["link_cache"] = {}
|
||||
self.learning_data["link_cache"][url] = {"status": "ALIVE", "last_checked": now}
|
||||
|
||||
@@ -193,60 +190,6 @@ class IntelligentLinkCleaner:
|
||||
except: pass
|
||||
return url, True, None, "Conservative Keep"
|
||||
|
||||
async def _enrich_description_if_needed(self, url: str):
|
||||
"""
|
||||
Policy Update: We NO LONGER enrich existing V1 descriptions in the repo
|
||||
to respect manual curation. However, we ensure the INVENTORY has a
|
||||
description for the V2 Elite portal.
|
||||
"""
|
||||
norm_url = normalize_url(url)
|
||||
if norm_url not in self.inventory: self.inventory[norm_url] = {}
|
||||
|
||||
force_full = os.getenv("FORCE_FULL_CHECK", "false").lower() == "true"
|
||||
|
||||
# If inventory already has a description, we are done UNLESS forcing full
|
||||
if self.inventory[norm_url].get("ai_summary") and not force_full:
|
||||
# Still fetch GH metadata if missing
|
||||
if "github.com" in url and ("stars" not in self.inventory[norm_url] or force_full):
|
||||
gh_meta = await self._fetch_github_metadata(url)
|
||||
if gh_meta:
|
||||
self.inventory[norm_url]["stars"] = min(5, gh_meta.get("stars", 0) // 100) # Simple rating
|
||||
self.inventory[norm_url]["gh_stars"] = gh_meta.get("stars", 0)
|
||||
self.inventory[norm_url]["last_commit"] = gh_meta.get("pushed_at", "")
|
||||
return
|
||||
|
||||
async with self.ai_semaphore:
|
||||
log_event(f" [✨] INVENTORY: Generating summary for {url} (V2 Only)")
|
||||
try:
|
||||
# 1. Fetch GitHub metadata if applicable
|
||||
if "github.com" in url:
|
||||
gh_meta = await self._fetch_github_metadata(url)
|
||||
if gh_meta:
|
||||
self.inventory[norm_url]["stars"] = min(5, gh_meta.get("stars", 0) // 100)
|
||||
self.inventory[norm_url]["gh_stars"] = gh_meta.get("stars", 0)
|
||||
self.inventory[norm_url]["last_commit"] = gh_meta.get("pushed_at", "")
|
||||
|
||||
# 2. AI Content Analysis
|
||||
from src.agentic_curator import _deep_fetch_content
|
||||
from src.gemini_utils import call_gemini_with_retry
|
||||
web_content = await _deep_fetch_content(url)
|
||||
if not web_content: return
|
||||
|
||||
prompt = (
|
||||
f"Analyze this resource: {url}\n"
|
||||
f"Technical Content Snippet: {web_content[:1500]}\n"
|
||||
"Provide: 1) A professional 1-sentence English description. 2) The precise original PUBLICATION DATE (YYYY-MM-DD or YYYY if possible).\n"
|
||||
'Format: JSON: {"desc": "...", "pub_date": "..."}'
|
||||
)
|
||||
ai_data = await call_gemini_with_retry(prompt, prefer_flash=True)
|
||||
if ai_data:
|
||||
res_desc = ai_data.get("desc", "").strip()
|
||||
self.inventory[norm_url]["ai_summary"] = res_desc
|
||||
self.inventory[norm_url]["pub_date"] = str(ai_data.get("pub_date", "N/A"))
|
||||
self.stats["enriched_descriptions"] += 1
|
||||
log_event(f" [OK] Cached for V2: {res_desc[:50]}...")
|
||||
except Exception as e:
|
||||
log_event(f" [!] Enrichment error: {e}")
|
||||
async def _check_url_logic(self, url: str, strategy: Dict) -> Tuple[bool, str, Optional[str]]:
|
||||
# RESILIENT LOGIC: Mimic user behavior and handle blocks gracefully
|
||||
headers = {
|
||||
|
||||
Reference in New Issue
Block a user