feat(ai): make Content-URL Rescue Protocol universal for all links (Mandate 31)

2026-05-22 00:53:37 +00:00 · 2026-05-17 18:55:56 +02:00
parent 3b031794ad
commit 9a48ac7659
3 changed files with 42 additions and 38 deletions
--- a/GEMINI.md
+++ b/GEMINI.md
@@ -96,8 +96,10 @@ This file contains the accumulated instructions and long-term vision for the aut
    - **Lowercase Anchors**: All Markdown anchors MUST use strictly lowercase slugs without special characters.

 31. **Content-URL Precision Standard**: To prevent misinformation and maintain high-density technical value:
-    - **No Generic Redirects**: If a technical deep-link redirects to a generic landing page (e.g., home page, "About" section, or index), it MUST be removed entirely from the archive.
-    - **Title Mismatch**: If the new URL resulting from redirection or consolidation no longer contains the specific technical content described in the link title or descriptive text, the resource MUST NOT be kept. Precision is prioritized over link presence.
+    - **Generic Redirect Detection**: If a technical deep-link redirects to a generic landing page (e.g., home page, "About" section, or index), it is flagged as a precision failure.
+    - **Deep Link Rescue (Universal)**: For ALL technical resources, the bot MUST NOT delete the link immediately upon a generic redirect or 404. Instead, it SHOULD attempt to "rescue" the link by identifying the specific content's new path on the destination domain using the resource's descriptive title.
+    - **Authoritative Preservation**: If a specific technical equivalent is found (e.g., during a site migration like Nginx to F5), the URL MUST be updated to the new specific path to maintain content coherence with the descriptive title.
+    - **Title Mismatch**: If no specific destination is found and the new URL only provides generic content, the resource MUST be removed. Precision is prioritized over link presence.

 ## 🛠️ Structural Evolution & Navigation
 ...
--- a/README.md
+++ b/README.md
@@ -449,7 +449,7 @@ graph TD
 ```

 ### 7.6. Strategic Benefits
- **Content-URL Precision Standard (Mandate 31)**: AI agents automatically detect **Generic Redirects** (e.g., a deep technical link redirecting to a home page or generic "About" section). If the destination URL loses the specific technical context described in the resource title, the link is automatically removed to prevent misinformation.
+- **Content-URL Precision Standard (Mandate 31)**: AI agents automatically detect **Generic Redirects** (e.g., deep technical links redirecting to home pages). For ALL resources, the system triggers a **Universal Rescue Protocol**, using Gemini to find the specific content's new location on the destination domain. Only if no technical equivalent is found is the link removed, ensuring technical coherence and zero misinformation across site migrations (e.g., Nginx to F5).
 - **Universal Title and TOC Standards (Mandate 30)**: All technical titles and indices are programmatically sanitized to remove emojis and ampersands, ensuring 100% robust internal Markdown links and cross-platform rendering stability.
 - **Platinum Lifecycle Management**: The system implements advanced data engineering fields including **SHA256 Content Fingerprinting** (to detect silent content drift), **Health Reliability Scoring** (0-100 EMA), and **Source Provenance Tracking**.
 - **Deep Semantic Deduplication**: The V2 engine identifies multiple URLs belonging to the same technical project (e.g., website, repository, documentation) and consolidates them into a single **Authoritative Super-Entry** with `aliases`, ensuring a clean V2 portal while preserving full link history in V1.
--- a/src/intelligent_health_checker.py
+++ b/src/intelligent_health_checker.py
@@ -5,6 +5,7 @@ import re
 import httpx
 import random
 import yaml
+import hashlib
 from datetime import datetime
 from typing import Dict, List, Set, Tuple, Optional, Any
 from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ
@@ -12,7 +13,7 @@ from src.gitops_manager import RepositoryController
 from src.markdown_ast import MarkdownSanitizer
 from src.agentic_curator import AgenticCurator
 from src.logger import log_event
-from src.gemini_utils import normalize_url
+from src.gemini_utils import call_gemini_with_retry, normalize_url

 # Configuración de Excepciones
 CORE_FILES = ["docs/index.md", "README.md"]
@@ -86,6 +87,14 @@ class IntelligentLinkCleaner:
        nu = normalize_url(url); entry = self.inventory.get(nu, {})
        alive, reason, final = await self._check_url_logic(url)
        
+        # --- MANDATE 31: RESCUE PROTOCOL (Universal) ---
+        if (not alive or reason == "generic_redirect_loss"):
+            log_event(f"  [🔍] RESCUE ATTEMPT: '{entry.get('title', url)}' is missing. Searching new location...")
+            new_location = await self._try_rescue_link(url, entry.get("title", ""))
+            if new_location:
+                log_event(f"  [✨] RESCUED: Found at {new_location}")
+                alive, reason, final = True, "resurrected", new_location
+
        # 1. Update Health Score
        score = entry.get("health_score", 100)
        score = (score * 0.8) + (100 if alive else 0) * 0.2
@@ -95,8 +104,7 @@ class IntelligentLinkCleaner:
        # 2. Semantic Drift Detection (SHA256)
        if alive:
            from src.agentic_curator import _deep_fetch_content
-            import hashlib
-            text, _ = await _deep_fetch_content(url)
+            text, _ = await _deep_fetch_content(url if not final else final)
            new_hash = hashlib.sha256(text.encode()).hexdigest() if text else "N/A"
            old_hash = entry.get("content_hash", "N/A")
            
@@ -114,6 +122,28 @@ class IntelligentLinkCleaner:
        
        self.inventory[nu] = entry

+    async def _try_rescue_link(self, old_url: str, title: str) -> Optional[str]:
+        """
+        Ask Gemini for the new URL of a moved resource (Mandate 31, all links).
+        Returns a URL that answered with HTTP status < 400, else None; errors are logged, not raised.
+        """
+        if not title: return None
+        prompt = (
+            f"You act as a Technical Librarian. The resource '{title}' was at '{old_url}'.\n"
+            "The site has migrated or restructured. Identify the NEW specific URL for this technical content.\n"
+            "Search for the direct equivalent, not a home page.\n"
+            "Return ONLY the raw URL. If not found, return 'NONE'."
+        )
+        try:
+            async with self.ai_semaphore:
+                answer = await call_gemini_with_retry(prompt, response_format="text", prefer_flash=False)
+                new_url = answer.strip(" \t\r\n`'\"<>") if isinstance(answer, str) else ""  # drop wrapping noise
+                if new_url.startswith("http") and normalize_url(new_url) != normalize_url(old_url):
+                    async with httpx.AsyncClient(timeout=10, follow_redirects=True) as client:
+                        if (await client.get(new_url)).status_code < 400: return new_url
+        except Exception as e: log_event(f"  [!] RESCUE FAILED for {old_url}: {e}")  # cancellation still propagates
+        return None
+
    async def _check_url_logic(self, url: str) -> Tuple[bool, str, Optional[str]]:
        headers = {"User-Agent": "Mozilla/5.0", "Accept-Language": "en-US,en;q=0.5"}
        parked_indicators = ["buy this domain", "parked free", "domain is for sale"]
@@ -128,31 +158,24 @@ class IntelligentLinkCleaner:
                    
                    # Mandate 31: Content-URL Precision (Generic Redirect Detection)
                    if final_url != url:
-                        # Clean both for comparison
                        u_path = url.split("://")[-1].rstrip("/")
                        f_path = final_url.split("://")[-1].rstrip("/")
-                        
-                        generic_segments = ["/about", "/home", "/index", "/whats-new", "/es/", "/en/"]
-                        # If a deep link (multiple slashes) redirects to a very shallow path
+                        generic_segments = ["/about", "/home", "/index", "/whats-new", "/es/", "/en/", "/products/"]
                        is_deep_orig = u_path.count("/") >= 3
                        is_shallow_final = f_path.count("/") <= 2 or any(f_path.endswith(s) for s in generic_segments)
                        
                        if is_deep_orig and is_shallow_final:
-                            log_event(f"  [!] PRECISION LOSS: {url} -> {final_url} (Generic redirect). Removing.")
                            return False, "generic_redirect_loss", None

                    return True, "OK", final_url if final_url != url else None
                
-                # Definitive Failures
                if resp.status_code in [404, 410]:
-                    # AUTO-HEAL GitHub Branches (master -> main)
                    if "github.com" in url and "/master/" in url:
                        heal = url.replace("/master/", "/main/")
                        try:
                            if (await client.get(heal)).status_code < 400: return True, "healed", heal
                        except: pass
                    
-                    # Mandate 8: Repository Consolidation
                    if "github.com" in url:
                        match = re.search(r'(https?://github\.com/[^/]+/[^/]+)', url)
                        if match:
@@ -162,7 +185,6 @@ class IntelligentLinkCleaner:
                                    if (await client.get(root_url)).status_code < 400:
                                        return True, "consolidated_to_root", root_url
                                except: pass
-
                    return False, "404", None
                return True, f"Soft Block {resp.status_code}", None
        except: return True, "Connection Error", None
@@ -170,54 +192,34 @@ class IntelligentLinkCleaner:
    async def apply_changes(self):
        log_event("APPLYING CLEANING CHANGES & PR GENERATION...", section_break=True)
        file_updates = {}
-        
-        # 1. Prepare file updates for dead/canonical links
        for url, (fallback, reason) in self.dead_links.items():
            nu = normalize_url(url)
-            # Use v1_locations from inventory if available, fallback to registry
            paths = self.inventory.get(nu, {}).get("v1_locations", [])
            if not paths:
                paths = [occ["file"] for occ in self.link_registry.get(nu, [])]
-            
            for path in set(paths):
                if not os.path.exists(path): continue
-                if path not in file_updates: 
-                    file_updates[path] = open(path, "r").readlines()
-                
-                # Perform surgical replacement line-by-line
+                if path not in file_updates: file_updates[path] = open(path, "r").readlines()
                for i, line in enumerate(file_updates[path]):
                    if url in line:
                        if fallback and fallback.startswith("CANONICAL:"):
                            new_url = fallback.replace("CANONICAL:", "")
-                            log_event(f"  [FIX] Redirect: {url} -> {new_url} in {path}")
                            file_updates[path][i] = line.replace(url, new_url)
                        else:
-                            log_event(f"  [DEL] Dead Link: {url} in {path}")
-                            file_updates[path][i] = None # Mark line for removal
+                            file_updates[path][i] = None 

-        # 2. Final Payload Construction
        final_payload = {p: "".join([l for l in lines if l is not None]) for p, lines in file_updates.items()}
-        
-        # 3. Database Maintenance (GC & Persistence)
        await self.prune_orphaned_metadata()
        self._save_inventory()
        final_payload[INVENTORY_PATH] = yaml.dump(self.inventory, sort_keys=False, allow_unicode=True)

-        # 4. Safety Audit & Non-Blocking PR
        from src.safety_guard import SafetyGuard
        guard = SafetyGuard()
        safety_report = guard.generate_audit_report()
        
        if final_payload:
-            metrics = {
-                "total_extracted": len(self.link_registry),
-                "full_report": self.action_log,
-                "deleted_dead": len([v for v in self.dead_links.values() if v[0] is None]),
-                "fixed_redirects": len([v for v in self.dead_links.values() if v[0] and "CANONICAL" in v[0]])
-            }
+            metrics = {"total_extracted": len(self.link_registry), "full_report": self.action_log}
            self.git_controller.apply_multi_file_changes(final_payload, metrics, safety_report=safety_report)
-        else:
-            log_event("  [INFO] No files required cleaning in this cycle.")

    async def prune_orphaned_metadata(self):
        valid_map = {}