feat(ai): implement 'High-Value Preservation' to protect highlighted and starred links from automatic deletion

This commit is contained in:
Nubenetes Bot
2026-05-17 22:55:36 +02:00
parent d5c383bfd1
commit 6cd5b6cfb7
3 changed files with 67 additions and 121 deletions

View File

@@ -101,10 +101,10 @@ This file contains the accumulated instructions and long-term vision for the aut
- **Lowercase Anchors**: All Markdown anchors MUST use strictly lowercase slugs without special characters.
31. **Content-URL Precision Standard**: To prevent misinformation and maintain high-density technical value:
- **Generic Redirect Detection**: If a technical deep-link redirects to a generic landing page (e.g., home page, "About" section, or index), it is flagged as a precision failure.
- **Deep Link Rescue (Universal)**: For ALL technical resources, the bot MUST NOT delete the link immediately upon a generic redirect or 404. Instead, it SHOULD attempt to "rescue" the link by identifying the specific content's new path on the destination domain using the resource's descriptive title.
- **Authoritative Preservation**: If a specific technical equivalent is found (e.g., during a site migration like Nginx to F5), the URL MUST be updated to the new specific path to maintain content coherence with the descriptive title.
- **Title Mismatch**: If no specific destination is found and the new URL only provides generic content, the resource MUST be removed. Precision is prioritized over link presence.
- **Generic Redirect Detection**: If a technical deep-link redirects to a generic landing page, it is flagged as a precision failure.
- **Deep Link Rescue (Universal)**: For ALL technical resources, the bot MUST NOT delete the link immediately. Instead, it SHOULD attempt to "rescue" it using the technical title and full V1 description for high-precision context search.
- **High-Value Preservation (The 'Review Required' Rule)**: Resources identified as **High-Value** (visually emphasized with bold or highlight markup, marked with 🌟 stars, or accompanied by a dense technical description) MUST NEVER be automatically deleted. If rescue attempts fail, these links MUST be marked as `status: review_required` and preserved in the archive for manual verification.
- **Authoritative Preservation**: If a specific technical equivalent is found (e.g., Nginx to F5 migration), the URL MUST be updated to the new specific path.
## 🛠️ Structural Evolution & Navigation
...

View File

@@ -285,7 +285,8 @@ To maximize economic efficiency, all AI agents follow a **Database-First** appro
### 6.3. Database Lifecycle and Hygiene
To maintain a high-performance "Single Source of Truth", Nubenetes implements automated hygiene protocols:
- **Universal Rescue Protocol (The Resurrection Rule)**: For ALL technical resources, the engine refuses to delete a link immediately upon a 404 or generic redirect. Instead, it triggers a "Technical Resurrection" cycle using Gemini to identify the resource's new specific path on a destination domain. This is essential for preserving legendary content during massive corporate site migrations (e.g., **Nginx** to **F5**, or the **AWS Knowledge Center** move to **repost.aws**).
- **Universal Rescue Protocol (The Resurrection Rule)**: For ALL technical resources, the engine refuses to delete a link immediately upon a 404 or generic redirect. Instead, it triggers a "Technical Resurrection" cycle using Gemini to identify the resource's new specific path on a destination domain.
- **High-Value Preservation (The 'Review Required' Rule)**: Resources identified as **High-Value** (visually emphasized in bold or with a yellow highlight, marked with 🌟 stars, or accompanied by a dense technical description) are exempt from automatic deletion. If rescue attempts fail, these links are marked as `status: review_required` and preserved in the archive for manual verification, ensuring that no significant technical assets are lost during autonomous cleaning.
#### 🕵️ Rescue Observability (Real-World Examples)
The engine proactively salvages technical depth during site migrations:

View File

@@ -8,7 +8,7 @@ import yaml
import hashlib
from datetime import datetime
from typing import Dict, List, Set, Tuple, Optional, Any
from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ
from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ, INVENTORY_PATH
from src.gitops_manager import RepositoryController
from src.markdown_ast import MarkdownSanitizer
from src.agentic_curator import AgenticCurator
@@ -16,9 +16,8 @@ from src.logger import log_event
from src.gemini_utils import call_gemini_with_retry, normalize_url
# Exception configuration
CORE_FILES = ["docs/index.md", "README.md"]
CORE_FILES = ["docs/index.md", "README.md", "docs/about.md"]
MEMORY_FILE = "src/memory/health_learning.json"
INVENTORY_PATH = "data/inventory.yaml"
class IntelligentLinkCleaner:
def __init__(self):
@@ -56,7 +55,7 @@ class IntelligentLinkCleaner:
async def execute_clean_cycle(self):
log_event("STARTING INTELLIGENT CLEANING CYCLE", section_break=True)
# 1. Map all links in V1
# 1. Map all links in V1 and detect Importance Markers
for root, _, files in os.walk("docs"):
for f in files:
if f.endswith(".md"):
@@ -64,10 +63,23 @@ class IntelligentLinkCleaner:
content = open(path, "r").read()
lines = content.splitlines()
for idx, line in enumerate(lines):
urls = re.findall(r'\[.*?\]\((https?://.*?)\)', line)
for url in urls:
# Enhanced Regex to capture surrounding formatting
matches = re.finditer(r'(\*\*|==)?\s*\[(.*?)\]\((https?://.*?)\)\s*(\*\*|==)?\s*(.*)', line)
for m in matches:
fmt_pre, title, url, fmt_post, desc = m.groups()
nu = normalize_url(url)
self.link_registry.setdefault(nu, []).append({"file": path, "line_index": idx, "url": url})
# Identify Importance Markers (Mandate 31 Expansion)
is_important = False
if fmt_pre or fmt_post: is_important = True # Bold or Highlighted
if "🌟" in title or "🌟" in desc: is_important = True # Stars
if len(desc.strip()) > 100: is_important = True # Deep description
if path in CORE_FILES: is_important = True # Foundational files
self.link_registry.setdefault(nu, []).append({
"file": path, "line_index": idx, "url": url,
"is_important": is_important
})
unique_urls = list(self.link_registry.keys())
random.shuffle(unique_urls)
@@ -75,8 +87,7 @@ class IntelligentLinkCleaner:
# 1.5. Identify prioritized links for validation
to_check = []
for u in unique_urls:
nu = normalize_url(u)
entry = self.inventory.get(nu, {})
nu = normalize_url(u); entry = self.inventory.get(nu, {})
is_suspicious = False
if entry.get("status") == "online":
path = nu.split("://")[-1].rstrip("/")
@@ -92,7 +103,7 @@ class IntelligentLinkCleaner:
# 2. Parallel Network Checks
BATCH_SIZE = 20
check_results = {} # {url: (alive, reason, final)}
check_results = {}
for i in range(0, len(to_check), BATCH_SIZE):
batch = to_check[i:i+BATCH_SIZE]
tasks = [self._check_url_logic(url) for url in batch]
@@ -100,27 +111,25 @@ class IntelligentLinkCleaner:
for url, res in zip(batch, results): check_results[url] = res
if i % 100 == 0: log_event(f" [>] Network Check Progress: {i}/{len(to_check)} checked...")
# 2.5. SMART AI BATCH RESCUE: Group links that need resurrection
# 2.5. UNIVERSAL AI RESCUE (Mandate 31)
to_rescue = [u for u, res in check_results.items() if not res[0] or res[1] == "generic_redirect_loss"]
if to_rescue:
log_event(f"[*] Starting AI Rescue for {len(to_rescue)} links...")
AI_BATCH_SIZE = 10
for i in range(0, len(to_rescue), AI_BATCH_SIZE):
batch = to_rescue[i:i+AI_BATCH_SIZE]
log_event(f" [🔍] Processing Rescue Batch {i//AI_BATCH_SIZE + 1}...")
batch_info = []
for u in batch:
entry = self.inventory.get(normalize_url(u), {})
batch_info.append({"url": u, "title": entry.get("title", u)})
batch_info.append({"url": u, "title": entry.get("title", u), "context": entry.get("description", "")})
prompt = (
"You act as a Technical Librarian. These resources are missing or redirecting to generic pages.\n"
"Identify the NEW specific URLs for this technical content. Search for direct equivalents, not home pages.\n"
"Pattern Recognition: Consider site migrations (e.g. Nginx -> F5, Ansible -> RedHat/Personal Blogs).\n"
"Search for the specific Technical Article or Tool URL based on the title and description provided.\n"
"Consider site migrations, acquisitions (Ansible->RedHat, Nginx->F5), and cross-domain moves to personal blogs.\n"
"Return ONLY a JSON list: [{\"old_url\": \"...\", \"new_url\": \"...\"}, ...]\n"
"If not found, set new_url to \"NONE\".\n\n"
"RESOURCES:\n" + "\n".join([f"- {d['title']} ({d['url']})" for d in batch_info])
"RESOURCES:\n" + "\n".join([f"- Title: {d['title']} | Desc: {d['context'][:150]} | URL: {d['url']}" for d in batch_info])
)
try:
@@ -131,7 +140,6 @@ class IntelligentLinkCleaner:
for u in batch:
new_loc = res_map.get(normalize_url(u))
if new_loc and new_loc.startswith("http") and "NONE" not in new_loc.upper():
# Verify rescued URL
try:
async with httpx.AsyncClient(timeout=10, follow_redirects=True, verify=False) as client:
resp = await client.get(new_loc)
@@ -139,143 +147,81 @@ class IntelligentLinkCleaner:
log_event(f" [✨] RESCUED: {u} -> {new_loc}")
check_results[u] = (True, "resurrected", new_loc)
except: pass
except Exception as e:
log_event(f" [!] Rescue Batch Error: {e}")
except: pass
# 2.8. Finalize Link Status & Update Inventory
# 2.8. Finalize Status with Foundational Preservation
for url, (alive, reason, final) in check_results.items():
nu = normalize_url(url); entry = self.inventory.get(nu, {})
# Update Health Score
score = entry.get("health_score", 100)
score = (score * 0.8) + (100 if alive else 0) * 0.2
entry["health_score"] = round(score, 1)
entry["last_checked"] = datetime.now().timestamp()
entry["health_score"] = round(score, 1); entry["last_checked"] = datetime.now().timestamp()
# --- MANDATE 31: HIGH-VALUE PROTECTION ---
# Check importance from either current mapping or historical stars
is_important = any(occ.get("is_important") for occ in self.link_registry.get(nu, []))
if entry.get("stars", 0) >= 3: is_important = True
if alive:
# Semantic Drift check
# (Skipped in this batch logic for speed, but can be added back if needed)
pass
if not alive and score < 20:
entry["status"] = "dead"; self.dead_links[url] = (None, reason)
if not alive:
if is_important:
entry["status"] = "review_required"
log_event(f" [⚠️] PRESERVED (Review Needed): {url} is HIGH-VALUE.")
elif score < 20:
entry["status"] = "dead"; self.dead_links[url] = (None, reason)
elif final and alive:
self.dead_links[url] = (f"CANONICAL:{final}", "Redirect")
self.inventory[nu] = entry
# 3. Finalize
await self.apply_changes()
# Legacy per-link entry point: the body is a bare `pass`, so calling it performs no work.
async def _check_and_fix_link(self, url: str):
# Deprecated by new batch execution flow in execute_clean_cycle
# `url` is accepted but never read.
pass
async def _try_rescue_link(self, old_url: str, title: str) -> Optional[str]:
    """
    Ask Gemini for the new location of a single technical resource.

    Universal application for all links (Mandate 31).
    Supports cross-domain migrations (e.g. Corporate Blog -> Personal Blog).

    Args:
        old_url: The URL that no longer serves the expected content.
        title: Descriptive title of the resource; without it no rescue is attempted.

    Returns:
        A replacement URL that differs from ``old_url`` (after normalization)
        and answered with an HTTP status below 400, or ``None`` when no such
        URL could be determined.
    """
    if not title:
        return None

    prompt = (
        f"You act as a Technical Librarian. The resource '{title}' was at '{old_url}'.\n"
        "The site has migrated, restructured, or the content has moved to a new domain (e.g. from a corporate blog to a personal one).\n"
        "Identify the NEW specific URL for this technical content. It must lead to the same article or its direct technical equivalent.\n"
        "Return ONLY the raw URL. If not found, return 'NONE'."
    )
    try:
        async with self.ai_semaphore:
            # Use Pro for high-fidelity web knowledge
            answer = await call_gemini_with_retry(prompt, response_format="text", prefer_flash=False)
            if not answer:
                return None

            # The model may wrap its answer in quotes/backticks despite the
            # "ONLY the raw URL" instruction, so clean up BEFORE validating.
            cleaned = answer.strip().strip("\"'`")

            # Only an explicit NONE answer means "not found"; a URL that merely
            # contains the letters "none" is still a legitimate candidate.
            if cleaned.upper().startswith("NONE"):
                return None

            match = re.search(r'(https?://[^\s\"\'\>]+)', cleaned)
            if not match:
                return None
            candidate = match.group(1)

            # Being handed back the same (normalized) URL is not a rescue.
            if normalize_url(candidate) == normalize_url(old_url):
                return None

            # NOTE(review): verify=False disables TLS certificate validation;
            # kept as in the original, but confirm this is intentional.
            async with httpx.AsyncClient(timeout=10, follow_redirects=True, verify=False) as client:
                resp = await client.get(candidate)
            if resp.status_code < 400:
                return candidate
    except Exception as e:
        # Best effort: a failed rescue must not abort the cleaning cycle, but
        # it should be visible in the log. `except Exception` (unlike a bare
        # `except`) lets asyncio task cancellation propagate.
        log_event(f"    [!] Rescue Error for {old_url}: {e}")
    return None
async def _check_url_logic(self, url: str) -> Tuple[bool, str, Optional[str]]:
    """
    Fetch ``url`` once and classify the outcome.

    Args:
        url: The link to probe (redirects are followed).

    Returns:
        A ``(alive, reason, final)`` tuple:

        * ``(True, "OK", final_or_None)`` - reachable; ``final`` is the
          post-redirect URL when it differs from ``url``.
        * ``(False, "parked", None)`` - page body contains a parked-domain phrase.
        * ``(False, "generic_redirect_loss", None)`` - a deep link was
          redirected to a shallow/generic page (Mandate 31).
        * ``(True, "healed", new_url)`` - GitHub ``/master/`` link that works
          with ``/main/`` instead.
        * ``(True, "consolidated", repo_root)`` - GitHub deep link is gone but
          the repository root still answers.
        * ``(False, "404", None)`` - 404/410 with no working fallback.
        * ``(True, "Soft Block <code>", None)`` - any other status >= 400 is
          not treated as dead.
        * ``(True, "Error", None)`` - the request itself failed; the link is
          given the benefit of the doubt.
    """
    headers = {"User-Agent": "Mozilla/5.0", "Accept-Language": "en-US,en;q=0.5"}
    parked = ("buy this domain", "parked free", "domain is for sale")
    generic_markers = ("/about", "/products", "/home")
    try:
        async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=12) as client:

            async def responds(candidate: str) -> bool:
                """Return True when ``candidate`` answers with a status below 400."""
                try:
                    return (await client.get(candidate)).status_code < 400
                except Exception:
                    # A failing fallback probe only means "fallback unavailable";
                    # it must not leak to the outer handler, which would report
                    # the original 404 link as alive.
                    return False

            resp = await client.get(url)

            if resp.status_code < 400:
                text = resp.text.lower()
                final_url = str(resp.url)
                if any(kw in text for kw in parked):
                    return False, "parked", None
                if final_url == url:
                    return True, "OK", None

                # Mandate 31: Content-URL Precision (Generic Redirect Detection)
                u_p = url.split("://")[-1].rstrip("/")
                f_p = final_url.split("://")[-1].rstrip("/")
                is_deep_orig = u_p.count("/") >= 3
                is_shallow_final = f_p.count("/") <= 2 or any(kw in f_p for kw in generic_markers)
                if is_deep_orig and is_shallow_final:
                    return False, "generic_redirect_loss", None
                return True, "OK", final_url

            if resp.status_code in (404, 410):
                if "github.com" in url:
                    if "/master/" in url:
                        healed = url.replace("/master/", "/main/")
                        # Success is any status below 400; the former `< 200`
                        # comparison could never match a normal 200 response.
                        if await responds(healed):
                            return True, "healed", healed
                    m = re.search(r'(https?://github\.com/[^/]+/[^/]+)', url)
                    if m:
                        root_url = m.group(1)
                        # Re-requesting the very URL that just failed is pointless.
                        if root_url != url and await responds(root_url):
                            return True, "consolidated", root_url
                return False, "404", None

            return True, f"Soft Block {resp.status_code}", None
    except Exception:
        # Network-level failure: do not condemn the link on a transient error.
        # `except Exception` (unlike a bare `except`) lets asyncio task
        # cancellation propagate.
        return True, "Error", None
async def apply_changes(self):
    """
    Rewrite the Markdown files affected by ``self.dead_links`` and hand the
    result to the git controller.

    For every entry ``url -> (fallback, reason)`` in ``self.dead_links``:

    * when ``fallback`` starts with ``"CANONICAL:"``, occurrences of ``url``
      are replaced by the URL following that prefix;
    * otherwise every line containing ``url`` is dropped from the file.

    Afterwards orphaned inventory metadata is pruned, the inventory is saved
    and included in the payload, and everything is submitted together with the
    SafetyGuard audit report.
    """
    log_event("APPLYING CLEANING CHANGES & PR GENERATION...", section_break=True)

    canonical_prefix = "CANONICAL:"
    file_updates = {}
    for url, (fallback, _reason) in self.dead_links.items():
        nu = normalize_url(url)
        paths = self.inventory.get(nu, {}).get("v1_locations", [])
        if not paths:
            paths = [occ["file"] for occ in self.link_registry.get(nu, [])]

        # None means "delete the line"; a string means "swap the URL in place".
        replacement = None
        if fallback and fallback.startswith(canonical_prefix):
            replacement = fallback[len(canonical_prefix):]

        for path in set(paths):
            if not os.path.exists(path):
                continue
            if path not in file_updates:
                # Docs contain emoji markers, so read explicitly as UTF-8 and
                # close the handle deterministically.
                with open(path, "r", encoding="utf-8") as fh:
                    file_updates[path] = fh.readlines()
            lines = file_updates[path]
            for i, line in enumerate(lines):
                # A line already removed for another dead URL is None; testing
                # `url in None` would raise TypeError.
                if line is None or url not in line:
                    continue
                lines[i] = line.replace(url, replacement) if replacement is not None else None

    final_payload = {
        p: "".join(l for l in lines if l is not None)
        for p, lines in file_updates.items()
    }

    await self.prune_orphaned_metadata()
    self._save_inventory()
    final_payload[INVENTORY_PATH] = yaml.dump(self.inventory, sort_keys=False, allow_unicode=True)

    from src.safety_guard import SafetyGuard
    report = SafetyGuard().generate_audit_report()

    # The payload always contains at least the inventory, so it is never empty.
    metrics = {"total_extracted": len(self.link_registry)}
    self.git_controller.apply_multi_file_changes(final_payload, metrics, safety_report=report)
async def prune_orphaned_metadata(self):
valid_map = {}
@@ -283,8 +229,7 @@ class IntelligentLinkCleaner:
for f in files:
if f.endswith(".md"):
p = os.path.join(root, f); c = open(p, "r").read()
for u in re.findall(r'\[.*?\]\((https?://.*?)\)', c):
valid_map.setdefault(normalize_url(u), []).append(p)
for u in re.findall(r'\[.*?\]\((https?://.*?)\)', c): valid_map.setdefault(normalize_url(u), []).append(p)
new_inv = {}
for u, m in self.inventory.items():
if u.startswith("INTRO:") or u in valid_map: