feat(ai): make Content-URL Rescue Protocol universal for all links (Mandate 31)

This commit is contained in:
Nubenetes Bot
2026-05-17 18:55:56 +02:00
parent 3b031794ad
commit 9a48ac7659
3 changed files with 42 additions and 38 deletions

View File

@@ -96,8 +96,10 @@ This file contains the accumulated instructions and long-term vision for the aut
- **Lowercase Anchors**: All Markdown anchors MUST use strictly lowercase slugs without special characters.
31. **Content-URL Precision Standard**: To prevent misinformation and maintain high-density technical value:
- **No Generic Redirects**: If a technical deep-link redirects to a generic landing page (e.g., home page, "About" section, or index), it MUST be removed entirely from the archive.
- **Title Mismatch**: If the new URL resulting from redirection or consolidation no longer contains the specific technical content described in the link title or descriptive text, the resource MUST NOT be kept. Precision is prioritized over link presence.
- **Generic Redirect Detection**: If a technical deep-link redirects to a generic landing page (e.g., home page, "About" section, or index), it is flagged as a precision failure.
- **Deep Link Rescue (Universal)**: For ALL technical resources, the bot MUST NOT delete a link immediately upon a generic redirect or a 404. It MUST first attempt to "rescue" the link by using the resource's descriptive title to identify the new path of that specific content on the destination domain.
- **Authoritative Preservation**: If a specific technical equivalent is found (e.g., during a site migration like Nginx to F5), the URL MUST be updated to the new specific path to maintain content coherence with the descriptive title.
- **Title Mismatch**: If no specific destination is found and the new URL only provides generic content, the resource MUST be removed. Precision is prioritized over link presence.
## 🛠️ Structural Evolution & Navigation
...

View File

@@ -449,7 +449,7 @@ graph TD
```
### 7.6. Strategic Benefits
- **Content-URL Precision Standard (Mandate 31)**: AI agents automatically detect **Generic Redirects** (e.g., a deep technical link redirecting to a home page or generic "About" section). If the destination URL loses the specific technical context described in the resource title, the link is automatically removed to prevent misinformation.
- **Content-URL Precision Standard (Mandate 31)**: AI agents automatically detect **Generic Redirects** (e.g., deep technical links redirecting to home pages). For ALL resources, the system triggers a **Universal Rescue Protocol**, using Gemini to find the specific content's new location on the destination domain. Only if no technical equivalent is found is the link removed, ensuring technical coherence and zero misinformation across site migrations (e.g., Nginx to F5).
- **Universal Title and TOC Standards (Mandate 30)**: All technical titles and indices are programmatically sanitized to remove emojis and ampersands, ensuring 100% robust internal Markdown links and cross-platform rendering stability.
- **Platinum Lifecycle Management**: The system implements advanced data engineering fields including **SHA256 Content Fingerprinting** (to detect silent content drift), **Health Reliability Scoring** (0-100 EMA), and **Source Provenance Tracking**.
- **Deep Semantic Deduplication**: The V2 engine identifies multiple URLs belonging to the same technical project (e.g., website, repository, documentation) and consolidates them into a single **Authoritative Super-Entry** with `aliases`, ensuring a clean V2 portal while preserving full link history in V1.

View File

@@ -5,6 +5,7 @@ import re
import httpx
import random
import yaml
import hashlib
from datetime import datetime
from typing import Dict, List, Set, Tuple, Optional, Any
from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ
@@ -12,7 +13,7 @@ from src.gitops_manager import RepositoryController
from src.markdown_ast import MarkdownSanitizer
from src.agentic_curator import AgenticCurator
from src.logger import log_event
from src.gemini_utils import normalize_url
from src.gemini_utils import call_gemini_with_retry, normalize_url
# Exception configuration
CORE_FILES = ["docs/index.md", "README.md"]
@@ -86,6 +87,14 @@ class IntelligentLinkCleaner:
nu = normalize_url(url); entry = self.inventory.get(nu, {})
alive, reason, final = await self._check_url_logic(url)
# --- MANDATE 31: RESCUE PROTOCOL (Universal) ---
if (not alive or reason == "generic_redirect_loss"):
log_event(f" [🔍] RESCUE ATTEMPT: '{entry.get('title', url)}' is missing. Searching new location...")
new_location = await self._try_rescue_link(url, entry.get("title", ""))
if new_location:
log_event(f" [✨] RESCUED: Found at {new_location}")
alive, reason, final = True, "resurrected", new_location
# 1. Update Health Score
score = entry.get("health_score", 100)
score = (score * 0.8) + (100 if alive else 0) * 0.2
@@ -95,8 +104,7 @@ class IntelligentLinkCleaner:
# 2. Semantic Drift Detection (SHA256)
if alive:
from src.agentic_curator import _deep_fetch_content
import hashlib
text, _ = await _deep_fetch_content(url)
text, _ = await _deep_fetch_content(url if not final else final)
new_hash = hashlib.sha256(text.encode()).hexdigest() if text else "N/A"
old_hash = entry.get("content_hash", "N/A")
@@ -114,6 +122,28 @@ class IntelligentLinkCleaner:
self.inventory[nu] = entry
async def _try_rescue_link(self, old_url: str, title: str) -> Optional[str]:
"""
Uses Gemini to identify the new home of a technical resource.
Universal application for all links (Mandate 31).
"""
if not title: return None
prompt = (
f"You act as a Technical Librarian. The resource '{title}' was at '{old_url}'.\n"
"The site has migrated or restructured. Identify the NEW specific URL for this technical content.\n"
"Search for the direct equivalent, not a home page.\n"
"Return ONLY the raw URL. If not found, return 'NONE'."
)
try:
async with self.ai_semaphore:
new_url = await call_gemini_with_retry(prompt, response_format="text", prefer_flash=False)
if new_url and new_url.startswith("http") and normalize_url(new_url) != normalize_url(old_url):
async with httpx.AsyncClient(timeout=10, follow_redirects=True) as client:
resp = await client.get(new_url)
if resp.status_code < 400: return new_url
except: pass
return None
async def _check_url_logic(self, url: str) -> Tuple[bool, str, Optional[str]]:
headers = {"User-Agent": "Mozilla/5.0", "Accept-Language": "en-US,en;q=0.5"}
parked_indicators = ["buy this domain", "parked free", "domain is for sale"]
@@ -128,31 +158,24 @@ class IntelligentLinkCleaner:
# Mandate 31: Content-URL Precision (Generic Redirect Detection)
if final_url != url:
# Clean both for comparison
u_path = url.split("://")[-1].rstrip("/")
f_path = final_url.split("://")[-1].rstrip("/")
generic_segments = ["/about", "/home", "/index", "/whats-new", "/es/", "/en/"]
# If a deep link (multiple slashes) redirects to a very shallow path
generic_segments = ["/about", "/home", "/index", "/whats-new", "/es/", "/en/", "/products/"]
is_deep_orig = u_path.count("/") >= 3
is_shallow_final = f_path.count("/") <= 2 or any(f_path.endswith(s) for s in generic_segments)
if is_deep_orig and is_shallow_final:
log_event(f" [!] PRECISION LOSS: {url} -> {final_url} (Generic redirect). Removing.")
return False, "generic_redirect_loss", None
return True, "OK", final_url if final_url != url else None
# Definitive Failures
if resp.status_code in [404, 410]:
# AUTO-HEAL GitHub Branches (master -> main)
if "github.com" in url and "/master/" in url:
heal = url.replace("/master/", "/main/")
try:
if (await client.get(heal)).status_code < 400: return True, "healed", heal
except: pass
# Mandate 8: Repository Consolidation
if "github.com" in url:
match = re.search(r'(https?://github\.com/[^/]+/[^/]+)', url)
if match:
@@ -162,7 +185,6 @@ class IntelligentLinkCleaner:
if (await client.get(root_url)).status_code < 400:
return True, "consolidated_to_root", root_url
except: pass
return False, "404", None
return True, f"Soft Block {resp.status_code}", None
except: return True, "Connection Error", None
@@ -170,54 +192,34 @@ class IntelligentLinkCleaner:
async def apply_changes(self):
    """
    Rewrite files that reference dead links and submit the result.

    For every entry in ``self.dead_links`` the files that mention the URL
    are loaded once, then edited line by line: a fallback of the form
    ``CANONICAL:<url>`` replaces the old URL in place, anything else drops
    the whole line. Afterwards orphaned metadata is pruned, the inventory is
    saved and serialized into the payload, and the payload is handed to the
    git controller together with a safety audit report.
    """
    log_event("APPLYING CLEANING CHANGES & PR GENERATION...", section_break=True)
    canonical_prefix = "CANONICAL:"
    file_updates = {}

    for url, (fallback, _reason) in self.dead_links.items():
        nu = normalize_url(url)
        # Prefer the inventory's recorded locations; fall back to the registry.
        paths = self.inventory.get(nu, {}).get("v1_locations", [])
        if not paths:
            paths = [occ["file"] for occ in self.link_registry.get(nu, [])]

        is_canonical = bool(fallback) and fallback.startswith(canonical_prefix)
        new_url = fallback[len(canonical_prefix):] if is_canonical else None

        for path in set(paths):
            if not os.path.exists(path):
                continue
            if path not in file_updates:
                # Context manager closes the handle; explicit UTF-8 keeps the
                # read independent of the platform's default encoding.
                with open(path, "r", encoding="utf-8") as handle:
                    file_updates[path] = handle.readlines()

            lines = file_updates[path]
            for i, line in enumerate(lines):
                # A line already removed for another dead URL is None; testing
                # `url in None` would raise TypeError, so skip it.
                if line is None or url not in line:
                    continue
                # NOTE(review): this is a substring match, so a URL that is a
                # prefix of a longer URL on the same line also matches.
                if is_canonical:
                    log_event(f" [FIX] Redirect: {url} -> {new_url} in {path}")
                    lines[i] = line.replace(url, new_url)
                else:
                    log_event(f" [DEL] Dead Link: {url} in {path}")
                    lines[i] = None  # marked for removal when the payload is built

    # Drop the removed lines and join each file back into a single string.
    final_payload = {
        p: "".join(l for l in lines if l is not None)
        for p, lines in file_updates.items()
    }

    await self.prune_orphaned_metadata()
    self._save_inventory()
    final_payload[INVENTORY_PATH] = yaml.dump(self.inventory, sort_keys=False, allow_unicode=True)

    from src.safety_guard import SafetyGuard
    guard = SafetyGuard()
    safety_report = guard.generate_audit_report()

    # NOTE(review): the inventory entry is always added above, so this
    # condition is always true and the else branch cannot run — confirm
    # whether the check was meant to look at `file_updates` instead.
    if final_payload:
        metrics = {"total_extracted": len(self.link_registry), "full_report": self.action_log}
        self.git_controller.apply_multi_file_changes(final_payload, metrics, safety_report=safety_report)
    else:
        log_event(" [INFO] No files required cleaning in this cycle.")
async def prune_orphaned_metadata(self):
valid_map = {}