diff --git a/src/intelligent_health_checker.py b/src/intelligent_health_checker.py index 1b2d7d00..a0339235 100644 --- a/src/intelligent_health_checker.py +++ b/src/intelligent_health_checker.py @@ -161,9 +161,6 @@ class IntelligentLinkCleaner: original_idx = attempt if best_strat_idx is None else (best_strat_idx if attempt == 0 else (attempt if attempt < best_strat_idx else attempt)) self.learning_data["domains"][domain]["best_strategy_idx"] = original_idx - # Enrichment: If link is ALIVE but lacks description in V1, generate one - await self._enrich_description_if_needed(url) - if "link_cache" not in self.learning_data: self.learning_data["link_cache"] = {} self.learning_data["link_cache"][url] = {"status": "ALIVE", "last_checked": now} @@ -193,60 +190,6 @@ class IntelligentLinkCleaner: except: pass return url, True, None, "Conservative Keep" - async def _enrich_description_if_needed(self, url: str): - """ - Policy Update: We NO LONGER enrich existing V1 descriptions in the repo - to respect manual curation. However, we ensure the INVENTORY has a - description for the V2 Elite portal. - """ - norm_url = normalize_url(url) - if norm_url not in self.inventory: self.inventory[norm_url] = {} - - force_full = os.getenv("FORCE_FULL_CHECK", "false").lower() == "true" - - # If inventory already has a description, we are done UNLESS forcing full - if self.inventory[norm_url].get("ai_summary") and not force_full: - # Still fetch GH metadata if missing - if "github.com" in url and ("stars" not in self.inventory[norm_url] or force_full): - gh_meta = await self._fetch_github_metadata(url) - if gh_meta: - self.inventory[norm_url]["stars"] = min(5, gh_meta.get("stars", 0) // 100) # Simple rating - self.inventory[norm_url]["gh_stars"] = gh_meta.get("stars", 0) - self.inventory[norm_url]["last_commit"] = gh_meta.get("pushed_at", "") - return - - async with self.ai_semaphore: - log_event(f" [✨] INVENTORY: Generating summary for {url} (V2 Only)") - try: - # 1. Fetch GitHub metadata if applicable - if "github.com" in url: - gh_meta = await self._fetch_github_metadata(url) - if gh_meta: - self.inventory[norm_url]["stars"] = min(5, gh_meta.get("stars", 0) // 100) - self.inventory[norm_url]["gh_stars"] = gh_meta.get("stars", 0) - self.inventory[norm_url]["last_commit"] = gh_meta.get("pushed_at", "") - - # 2. AI Content Analysis - from src.agentic_curator import _deep_fetch_content - from src.gemini_utils import call_gemini_with_retry - web_content = await _deep_fetch_content(url) - if not web_content: return - - prompt = ( - f"Analyze this resource: {url}\n" - f"Technical Content Snippet: {web_content[:1500]}\n" - "Provide: 1) A professional 1-sentence English description. 2) The precise original PUBLICATION DATE (YYYY-MM-DD or YYYY if possible).\n" - 'Format: JSON: {"desc": "...", "pub_date": "..."}' - ) - ai_data = await call_gemini_with_retry(prompt, prefer_flash=True) - if ai_data: - res_desc = ai_data.get("desc", "").strip() - self.inventory[norm_url]["ai_summary"] = res_desc - self.inventory[norm_url]["pub_date"] = str(ai_data.get("pub_date", "N/A")) - self.stats["enriched_descriptions"] += 1 - log_event(f" [OK] Cached for V2: {res_desc[:50]}...") - except Exception as e: - log_event(f" [!] Enrichment error: {e}") async def _check_url_logic(self, url: str, strategy: Dict) -> Tuple[bool, str, Optional[str]]: # RESILIENT LOGIC: Mimic user behavior and handle blocks gracefully headers = {