diff --git a/GEMINI.md b/GEMINI.md index afcfd139..1ab1433d 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -101,10 +101,10 @@ This file contains the accumulated instructions and long-term vision for the aut - **Lowercase Anchors**: All Markdown anchors MUST use strictly lowercase slugs without special characters. 31. **Content-URL Precision Standard**: To prevent misinformation and maintain high-density technical value: - - **Generic Redirect Detection**: If a technical deep-link redirects to a generic landing page (e.g., home page, "About" section, or index), it is flagged as a precision failure. - - **Deep Link Rescue (Universal)**: For ALL technical resources, the bot MUST NOT delete the link immediately upon a generic redirect or 404. Instead, it SHOULD attempt to "rescue" the link by identifying the specific content's new path on the destination domain using the resource's descriptive title. - - **Authoritative Preservation**: If a specific technical equivalent is found (e.g., during a site migration like Nginx to F5), the URL MUST be updated to the new specific path to maintain content coherence with the descriptive title. - - **Title Mismatch**: If no specific destination is found and the new URL only provides generic content, the resource MUST be removed. Precision is prioritized over link presence. + - **Generic Redirect Detection**: If a technical deep-link redirects to a generic landing page, it is flagged as a precision failure. + - **Deep Link Rescue (Universal)**: For ALL technical resources, the bot MUST NOT delete the link immediately. Instead, it SHOULD attempt to "rescue" it using the technical title and full V1 description for high-precision context search. + - **High-Value Preservation (The 'Review Required' Rule)**: Resources identified as **High-Value** (visually highlighted with bold/highlight, marked with 🌟 stars, or featuring dense technical descriptions) MUST NEVER be automatically deleted. 
If rescue attempts fail, these links MUST be marked as `status: review_required` and preserved in the archive for manual verification. + - **Authoritative Preservation**: If a specific technical equivalent is found (e.g., Nginx to F5 migration), the URL MUST be updated to the new specific path. ## 🛠️ Structural Evolution & Navigation ... diff --git a/README.md b/README.md index 6373942c..d17456fa 100644 --- a/README.md +++ b/README.md @@ -285,7 +285,8 @@ To maximize economic efficiency, all AI agents follow a **Database-First** appro ### 6.3. Database Lifecycle and Hygiene To maintain a high-performance "Single Source of Truth", Nubenetes implements automated hygiene protocols: -- **Universal Rescue Protocol (The Resurrection Rule)**: For ALL technical resources, the engine refuses to delete a link immediately upon a 404 or generic redirect. Instead, it triggers a "Technical Resurrection" cycle using Gemini to identify the resource's new specific path on a destination domain. This is essential for preserving legendary content during massive corporate site migrations (e.g., **Nginx** to **F5**, or the **AWS Knowledge Center** move to **repost.aws**). +- **Universal Rescue Protocol (The Resurrection Rule)**: For ALL technical resources, the engine refuses to delete a link immediately upon a 404 or generic redirect. Instead, it triggers a "Technical Resurrection" cycle using Gemini to identify the resource's new specific path on a destination domain. +- **High-Value Preservation (The 'Review Required' Rule)**: Resources identified as **High-Value** (visually highlighted in bold/yellow, marked with 🌟 stars, or featuring dense technical descriptions) are exempt from automatic deletion. If rescue attempts fail, these links are marked as `status: review_required` and preserved in the archive for manual verification, ensuring no significant technical assets are lost during autonomous cleaning. 
#### 🕵️ Rescue Observability (Real-World Examples) The engine proactively salvages technical depth during site migrations: diff --git a/src/intelligent_health_checker.py b/src/intelligent_health_checker.py index 7fd96711..5ddd4845 100644 --- a/src/intelligent_health_checker.py +++ b/src/intelligent_health_checker.py @@ -8,7 +8,7 @@ import yaml import hashlib from datetime import datetime from typing import Dict, List, Set, Tuple, Optional, Any -from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ +from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ, INVENTORY_PATH from src.gitops_manager import RepositoryController from src.markdown_ast import MarkdownSanitizer from src.agentic_curator import AgenticCurator @@ -16,9 +16,8 @@ from src.logger import log_event from src.gemini_utils import call_gemini_with_retry, normalize_url # Configuración de Excepciones -CORE_FILES = ["docs/index.md", "README.md"] +CORE_FILES = ["docs/index.md", "README.md", "docs/about.md"] MEMORY_FILE = "src/memory/health_learning.json" -INVENTORY_PATH = "data/inventory.yaml" class IntelligentLinkCleaner: def __init__(self): @@ -56,7 +55,7 @@ class IntelligentLinkCleaner: async def execute_clean_cycle(self): log_event("STARTING INTELLIGENT CLEANING CYCLE", section_break=True) - # 1. Map all links in V1 + # 1. 
Map all links in V1 and detect Importance Markers for root, _, files in os.walk("docs"): for f in files: if f.endswith(".md"): @@ -64,10 +63,23 @@ class IntelligentLinkCleaner: content = open(path, "r").read() lines = content.splitlines() for idx, line in enumerate(lines): - urls = re.findall(r'\[.*?\]\((https?://.*?)\)', line) - for url in urls: + # Enhanced Regex to capture surrounding formatting + matches = re.finditer(r'(\*\*|==)?\s*\[(.*?)\]\((https?://.*?)\)\s*(\*\*|==)?\s*(.*)', line) + for m in matches: + fmt_pre, title, url, fmt_post, desc = m.groups() nu = normalize_url(url) - self.link_registry.setdefault(nu, []).append({"file": path, "line_index": idx, "url": url}) + + # Identify Importance Markers (Mandate 31 Expansion) + is_important = False + if fmt_pre or fmt_post: is_important = True # Bold or Highlighted + if "🌟" in title or "🌟" in desc: is_important = True # Stars + if len(desc.strip()) > 100: is_important = True # Deep description + if path in CORE_FILES: is_important = True # Foundational files + + self.link_registry.setdefault(nu, []).append({ + "file": path, "line_index": idx, "url": url, + "is_important": is_important + }) unique_urls = list(self.link_registry.keys()) random.shuffle(unique_urls) @@ -75,8 +87,7 @@ class IntelligentLinkCleaner: # 1.5. Identify prioritized links for validation to_check = [] for u in unique_urls: - nu = normalize_url(u) - entry = self.inventory.get(nu, {}) + nu = normalize_url(u); entry = self.inventory.get(nu, {}) is_suspicious = False if entry.get("status") == "online": path = nu.split("://")[-1].rstrip("/") @@ -92,7 +103,7 @@ class IntelligentLinkCleaner: # 2. 
Parallel Network Checks BATCH_SIZE = 20 - check_results = {} # {url: (alive, reason, final)} + check_results = {} for i in range(0, len(to_check), BATCH_SIZE): batch = to_check[i:i+BATCH_SIZE] tasks = [self._check_url_logic(url) for url in batch] @@ -100,27 +111,25 @@ class IntelligentLinkCleaner: for url, res in zip(batch, results): check_results[url] = res if i % 100 == 0: log_event(f" [>] Network Check Progress: {i}/{len(to_check)} checked...") - # 2.5. SMART AI BATCH RESCUE: Group links that need resurrection + # 2.5. UNIVERSAL AI RESCUE (Mandate 31) to_rescue = [u for u, res in check_results.items() if not res[0] or res[1] == "generic_redirect_loss"] if to_rescue: log_event(f"[*] Starting AI Rescue for {len(to_rescue)} links...") AI_BATCH_SIZE = 10 for i in range(0, len(to_rescue), AI_BATCH_SIZE): batch = to_rescue[i:i+AI_BATCH_SIZE] - log_event(f" [🔍] Processing Rescue Batch {i//AI_BATCH_SIZE + 1}...") - batch_info = [] for u in batch: entry = self.inventory.get(normalize_url(u), {}) - batch_info.append({"url": u, "title": entry.get("title", u)}) + batch_info.append({"url": u, "title": entry.get("title", u), "context": entry.get("description", "")}) prompt = ( "You act as a Technical Librarian. These resources are missing or redirecting to generic pages.\n" - "Identify the NEW specific URLs for this technical content. Search for direct equivalents, not home pages.\n" - "Pattern Recognition: Consider site migrations (e.g. 
Nginx -> F5, Ansible -> RedHat/Personal Blogs).\n" + "Search for the specific Technical Article or Tool URL based on the title and description provided.\n" + "Consider site migrations, acquisitions (Ansible->RedHat, Nginx->F5), and cross-domain moves to personal blogs.\n" "Return ONLY a JSON list: [{\"old_url\": \"...\", \"new_url\": \"...\"}, ...]\n" "If not found, set new_url to \"NONE\".\n\n" - "RESOURCES:\n" + "\n".join([f"- {d['title']} ({d['url']})" for d in batch_info]) + "RESOURCES:\n" + "\n".join([f"- Title: {d['title']} | Desc: {d['context'][:150]} | URL: {d['url']}" for d in batch_info]) ) try: @@ -131,7 +140,6 @@ class IntelligentLinkCleaner: for u in batch: new_loc = res_map.get(normalize_url(u)) if new_loc and new_loc.startswith("http") and "NONE" not in new_loc.upper(): - # Verify rescued URL try: async with httpx.AsyncClient(timeout=10, follow_redirects=True, verify=False) as client: resp = await client.get(new_loc) @@ -139,143 +147,81 @@ class IntelligentLinkCleaner: log_event(f" [✨] RESCUED: {u} -> {new_loc}") check_results[u] = (True, "resurrected", new_loc) except: pass - except Exception as e: - log_event(f" [!] Rescue Batch Error: {e}") + except: pass - # 2.8. Finalize Link Status & Update Inventory + # 2.8. 
Finalize Status with Foundational Preservation for url, (alive, reason, final) in check_results.items(): nu = normalize_url(url); entry = self.inventory.get(nu, {}) - - # Update Health Score score = entry.get("health_score", 100) score = (score * 0.8) + (100 if alive else 0) * 0.2 - entry["health_score"] = round(score, 1) - entry["last_checked"] = datetime.now().timestamp() + entry["health_score"] = round(score, 1); entry["last_checked"] = datetime.now().timestamp() + + # --- MANDATE 31: HIGH-VALUE PROTECTION --- + # Check importance from either current mapping or historical stars + is_important = any(occ.get("is_important") for occ in self.link_registry.get(nu, [])) + if entry.get("stars", 0) >= 3: is_important = True - if alive: - # Semantic Drift check - # (Skipped in this batch logic for speed, but can be added back if needed) - pass - - if not alive and score < 20: - entry["status"] = "dead"; self.dead_links[url] = (None, reason) + if not alive: + if is_important: + entry["status"] = "review_required" + log_event(f" [⚠️] PRESERVED (Review Needed): {url} is HIGH-VALUE.") + elif score < 20: + entry["status"] = "dead"; self.dead_links[url] = (None, reason) elif final and alive: self.dead_links[url] = (f"CANONICAL:{final}", "Redirect") - self.inventory[nu] = entry - # 3. Finalize await self.apply_changes() - async def _check_and_fix_link(self, url: str): - # Deprecated by new batch execution flow in execute_clean_cycle - pass - - async def _try_rescue_link(self, old_url: str, title: str) -> Optional[str]: - """ - Uses Gemini to identify the new home of a technical resource. - Universal application for all links (Mandate 31). - Supports cross-domain migrations (e.g. Corporate Blog -> Personal Blog). - """ - if not title: return None - prompt = ( - f"You act as a Technical Librarian. The resource '{title}' was at '{old_url}'.\n" - "The site has migrated, restructured, or the content has moved to a new domain (e.g. 
async def _check_url_logic(self, url: str) -> Tuple[bool, str, Optional[str]]:
    """Probe *url* over HTTP and classify the link.

    Args:
        url: The link exactly as it appears in the documentation.

    Returns:
        A ``(alive, reason, final_url)`` tuple:

        * ``(False, "parked", None)`` - the page body contains a
          parked-domain phrase.
        * ``(False, "generic_redirect_loss", None)`` - a deep link was
          redirected to a shallow or generic landing page.
        * ``(True, "OK", final_url_or_None)`` - reachable; ``final_url`` is
          set only when the request was redirected.
        * ``(True, "healed", new_url)`` - 404/410, but the same path with
          ``/master/`` replaced by ``/main/`` is reachable.
        * ``(True, "consolidated", repo_root)`` - 404/410, but the GitHub
          repository root is reachable.
        * ``(False, "404", None)`` - 404/410 and nothing could be salvaged.
        * ``(True, "Soft Block <code>", None)`` - any other status >= 400;
          treated as alive so that bot-blocking sites are not purged.
        * ``(True, "Error", None)`` - the request itself failed; treated as
          alive so that transient network errors never delete a link.
    """
    headers = {"User-Agent": "Mozilla/5.0", "Accept-Language": "en-US,en;q=0.5"}
    parked = ("buy this domain", "parked free", "domain is for sale")
    # Path segments that mark a generic landing page. They are compared
    # against whole segments, so "/homelab/..." or "/about-x" is not
    # mistaken for "/home" or "/about".
    generic_segments = {"about", "products", "home"}

    async def _is_reachable(client, candidate: str) -> bool:
        # Best-effort secondary probe: any failure simply means "not salvaged".
        try:
            return (await client.get(candidate)).status_code < 400
        except Exception:
            return False

    try:
        async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=12) as client:
            resp = await client.get(url)

            if resp.status_code < 400:
                text = resp.text.lower()
                final_url = str(resp.url)
                if any(kw in text for kw in parked):
                    return False, "parked", None
                if final_url == url:
                    return True, "OK", None

                u_p = url.split("://")[-1].rstrip("/")
                f_p = final_url.split("://")[-1].rstrip("/")
                # Drop query/fragment, then the host, to get the path segments.
                final_segments = f_p.partition("?")[0].partition("#")[0].split("/")[1:]
                landed_generic = f_p.count("/") <= 2 or any(
                    seg in generic_segments for seg in final_segments
                )
                if u_p.count("/") >= 3 and landed_generic:
                    return False, "generic_redirect_loss", None
                return True, "OK", final_url

            if resp.status_code in (404, 410):
                if "/master/" in url:
                    healed = url.replace("/master/", "/main/")
                    # Success is any status < 400; "< 200" would only match
                    # 1xx responses and the heal could never trigger.
                    if await _is_reachable(client, healed):
                        return True, "healed", healed
                m = re.search(r'(https?://github\.com/[^/]+/[^/]+)', url)
                # Skip the probe when the dead URL already is the repo root.
                if m and m.group(1) != url.rstrip("/") and await _is_reachable(client, m.group(1)):
                    return True, "consolidated", m.group(1)
                return False, "404", None

            return True, f"Soft Block {resp.status_code}", None
    except Exception:
        # Deliberately not a bare ``except``: task cancellation
        # (asyncio.CancelledError) and KeyboardInterrupt must propagate
        # instead of being reported as a live link.
        return True, "Error", None


async def apply_changes(self):
    """Rewrite the Markdown files for every entry in ``self.dead_links`` and push the result.

    For each dead URL the files are taken from the inventory entry's
    ``v1_locations`` or, failing that, from ``self.link_registry``. A line
    containing the URL is rewritten when the fallback is
    ``"CANONICAL:<new_url>"`` and dropped otherwise. The updated files plus
    the YAML-dumped inventory are handed to
    ``self.git_controller.apply_multi_file_changes``.
    """
    log_event("APPLYING CLEANING CHANGES & PR GENERATION...", section_break=True)
    canonical_prefix = "CANONICAL:"
    file_updates = {}

    for url, (fallback, _reason) in self.dead_links.items():
        nu = normalize_url(url)
        paths = self.inventory.get(nu, {}).get("v1_locations", [])
        if not paths:
            paths = [occ["file"] for occ in self.link_registry.get(nu, [])]

        # None means "delete the line"; a string means "swap the URL".
        replacement = None
        if fallback and fallback.startswith(canonical_prefix):
            replacement = fallback[len(canonical_prefix):]

        for path in set(paths):
            if not os.path.exists(path):
                continue
            if path not in file_updates:
                # Explicit UTF-8: the docs contain emoji markers, and the
                # context manager closes the handle deterministically.
                with open(path, "r", encoding="utf-8") as fh:
                    file_updates[path] = fh.readlines()
            lines = file_updates[path]
            for i, line in enumerate(lines):
                # A line already removed for another dead URL is None;
                # ``url in None`` would raise TypeError.
                if line is None or url not in line:
                    continue
                lines[i] = line.replace(url, replacement) if replacement is not None else None

    final_payload = {
        p: "".join(l for l in lines if l is not None)
        for p, lines in file_updates.items()
    }
    await self.prune_orphaned_metadata()
    self._save_inventory()
    final_payload[INVENTORY_PATH] = yaml.dump(self.inventory, sort_keys=False, allow_unicode=True)

    # Imported locally, as in the original (presumably to avoid an import
    # cycle - TODO confirm).
    from src.safety_guard import SafetyGuard
    report = SafetyGuard().generate_audit_report()

    # The payload always holds at least the inventory entry added above.
    self.git_controller.apply_multi_file_changes(
        final_payload,
        {"total_extracted": len(self.link_registry)},
        safety_report=report,
    )