mirror of
https://github.com/nubenetes/awesome-kubernetes.git
synced 2026-05-22 00:53:37 +00:00
feat(ai): implement 'High-Value Preservation' to protect highlighted and starred links from automatic deletion
This commit is contained in:
@@ -101,10 +101,10 @@ This file contains the accumulated instructions and long-term vision for the aut
|
||||
- **Lowercase Anchors**: All Markdown anchors MUST use strictly lowercase slugs without special characters.
|
||||
|
||||
31. **Content-URL Precision Standard**: To prevent misinformation and maintain high-density technical value:
|
||||
- **Generic Redirect Detection**: If a technical deep-link redirects to a generic landing page (e.g., home page, "About" section, or index), it is flagged as a precision failure.
|
||||
- **Deep Link Rescue (Universal)**: For ALL technical resources, the bot MUST NOT delete the link immediately upon a generic redirect or 404. Instead, it SHOULD attempt to "rescue" the link by identifying the specific content's new path on the destination domain using the resource's descriptive title.
|
||||
- **Authoritative Preservation**: If a specific technical equivalent is found (e.g., during a site migration like Nginx to F5), the URL MUST be updated to the new specific path to maintain content coherence with the descriptive title.
|
||||
- **Title Mismatch**: If no specific destination is found and the new URL only provides generic content, the resource MUST be removed. Precision is prioritized over link presence.
|
||||
- **Generic Redirect Detection**: If a technical deep-link redirects to a generic landing page, it is flagged as a precision failure.
|
||||
- **Deep Link Rescue (Universal)**: For ALL technical resources, the bot MUST NOT delete the link immediately. Instead, it SHOULD attempt to "rescue" it using the technical title and full V1 description for high-precision context search.
|
||||
- **High-Value Preservation (The 'Review Required' Rule)**: Resources identified as **High-Value** (visually highlighted with bold/highlight, marked with 🌟 stars, or featuring dense technical descriptions) MUST NEVER be automatically deleted. If rescue attempts fail, these links MUST be marked as `status: review_required` and preserved in the archive for manual verification.
|
||||
- **Authoritative Preservation**: If a specific technical equivalent is found (e.g., Nginx to F5 migration), the URL MUST be updated to the new specific path.
|
||||
|
||||
## 🛠️ Structural Evolution & Navigation
|
||||
...
|
||||
|
||||
@@ -285,7 +285,8 @@ To maximize economic efficiency, all AI agents follow a **Database-First** appro
|
||||
|
||||
### 6.3. Database Lifecycle and Hygiene
|
||||
To maintain a high-performance "Single Source of Truth", Nubenetes implements automated hygiene protocols:
|
||||
- **Universal Rescue Protocol (The Resurrection Rule)**: For ALL technical resources, the engine refuses to delete a link immediately upon a 404 or generic redirect. Instead, it triggers a "Technical Resurrection" cycle using Gemini to identify the resource's new specific path on a destination domain. This is essential for preserving legendary content during massive corporate site migrations (e.g., **Nginx** to **F5**, or the **AWS Knowledge Center** move to **repost.aws**).
|
||||
- **Universal Rescue Protocol (The Resurrection Rule)**: For ALL technical resources, the engine refuses to delete a link immediately upon a 404 or generic redirect. Instead, it triggers a "Technical Resurrection" cycle using Gemini to identify the resource's new specific path on a destination domain.
|
||||
- **High-Value Preservation (The 'Review Required' Rule)**: Resources identified as **High-Value** (visually highlighted in bold/yellow, marked with 🌟 stars, or featuring dense technical descriptions) are exempt from automatic deletion. If rescue attempts fail, these links are marked as `status: review_required` and preserved in the archive for manual verification, ensuring no significant technical assets are lost during autonomous cleaning.
|
||||
|
||||
#### 🕵️ Rescue Observability (Real-World Examples)
|
||||
The engine proactively salvages technical depth during site migrations:
|
||||
|
||||
@@ -8,7 +8,7 @@ import yaml
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Set, Tuple, Optional, Any
|
||||
from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ
|
||||
from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ, INVENTORY_PATH
|
||||
from src.gitops_manager import RepositoryController
|
||||
from src.markdown_ast import MarkdownSanitizer
|
||||
from src.agentic_curator import AgenticCurator
|
||||
@@ -16,9 +16,8 @@ from src.logger import log_event
|
||||
from src.gemini_utils import call_gemini_with_retry, normalize_url
|
||||
|
||||
# Exceptions configuration
|
||||
CORE_FILES = ["docs/index.md", "README.md"]
|
||||
CORE_FILES = ["docs/index.md", "README.md", "docs/about.md"]
|
||||
MEMORY_FILE = "src/memory/health_learning.json"
|
||||
INVENTORY_PATH = "data/inventory.yaml"
|
||||
|
||||
class IntelligentLinkCleaner:
|
||||
def __init__(self):
|
||||
@@ -56,7 +55,7 @@ class IntelligentLinkCleaner:
|
||||
|
||||
async def execute_clean_cycle(self):
|
||||
log_event("STARTING INTELLIGENT CLEANING CYCLE", section_break=True)
|
||||
# 1. Map all links in V1
|
||||
# 1. Map all links in V1 and detect Importance Markers
|
||||
for root, _, files in os.walk("docs"):
|
||||
for f in files:
|
||||
if f.endswith(".md"):
|
||||
@@ -64,10 +63,23 @@ class IntelligentLinkCleaner:
|
||||
content = open(path, "r").read()
|
||||
lines = content.splitlines()
|
||||
for idx, line in enumerate(lines):
|
||||
urls = re.findall(r'\[.*?\]\((https?://.*?)\)', line)
|
||||
for url in urls:
|
||||
# Enhanced Regex to capture surrounding formatting
|
||||
matches = re.finditer(r'(\*\*|==)?\s*\[(.*?)\]\((https?://.*?)\)\s*(\*\*|==)?\s*(.*)', line)
|
||||
for m in matches:
|
||||
fmt_pre, title, url, fmt_post, desc = m.groups()
|
||||
nu = normalize_url(url)
|
||||
self.link_registry.setdefault(nu, []).append({"file": path, "line_index": idx, "url": url})
|
||||
|
||||
# Identify Importance Markers (Mandate 31 Expansion)
|
||||
is_important = False
|
||||
if fmt_pre or fmt_post: is_important = True # Bold or Highlighted
|
||||
if "🌟" in title or "🌟" in desc: is_important = True # Stars
|
||||
if len(desc.strip()) > 100: is_important = True # Deep description
|
||||
if path in CORE_FILES: is_important = True # Foundational files
|
||||
|
||||
self.link_registry.setdefault(nu, []).append({
|
||||
"file": path, "line_index": idx, "url": url,
|
||||
"is_important": is_important
|
||||
})
|
||||
|
||||
unique_urls = list(self.link_registry.keys())
|
||||
random.shuffle(unique_urls)
|
||||
@@ -75,8 +87,7 @@ class IntelligentLinkCleaner:
|
||||
# 1.5. Identify prioritized links for validation
|
||||
to_check = []
|
||||
for u in unique_urls:
|
||||
nu = normalize_url(u)
|
||||
entry = self.inventory.get(nu, {})
|
||||
nu = normalize_url(u); entry = self.inventory.get(nu, {})
|
||||
is_suspicious = False
|
||||
if entry.get("status") == "online":
|
||||
path = nu.split("://")[-1].rstrip("/")
|
||||
@@ -92,7 +103,7 @@ class IntelligentLinkCleaner:
|
||||
|
||||
# 2. Parallel Network Checks
|
||||
BATCH_SIZE = 20
|
||||
check_results = {} # {url: (alive, reason, final)}
|
||||
check_results = {}
|
||||
for i in range(0, len(to_check), BATCH_SIZE):
|
||||
batch = to_check[i:i+BATCH_SIZE]
|
||||
tasks = [self._check_url_logic(url) for url in batch]
|
||||
@@ -100,27 +111,25 @@ class IntelligentLinkCleaner:
|
||||
for url, res in zip(batch, results): check_results[url] = res
|
||||
if i % 100 == 0: log_event(f" [>] Network Check Progress: {i}/{len(to_check)} checked...")
|
||||
|
||||
# 2.5. SMART AI BATCH RESCUE: Group links that need resurrection
|
||||
# 2.5. UNIVERSAL AI RESCUE (Mandate 31)
|
||||
to_rescue = [u for u, res in check_results.items() if not res[0] or res[1] == "generic_redirect_loss"]
|
||||
if to_rescue:
|
||||
log_event(f"[*] Starting AI Rescue for {len(to_rescue)} links...")
|
||||
AI_BATCH_SIZE = 10
|
||||
for i in range(0, len(to_rescue), AI_BATCH_SIZE):
|
||||
batch = to_rescue[i:i+AI_BATCH_SIZE]
|
||||
log_event(f" [🔍] Processing Rescue Batch {i//AI_BATCH_SIZE + 1}...")
|
||||
|
||||
batch_info = []
|
||||
for u in batch:
|
||||
entry = self.inventory.get(normalize_url(u), {})
|
||||
batch_info.append({"url": u, "title": entry.get("title", u)})
|
||||
batch_info.append({"url": u, "title": entry.get("title", u), "context": entry.get("description", "")})
|
||||
|
||||
prompt = (
|
||||
"You act as a Technical Librarian. These resources are missing or redirecting to generic pages.\n"
|
||||
"Identify the NEW specific URLs for this technical content. Search for direct equivalents, not home pages.\n"
|
||||
"Pattern Recognition: Consider site migrations (e.g. Nginx -> F5, Ansible -> RedHat/Personal Blogs).\n"
|
||||
"Search for the specific Technical Article or Tool URL based on the title and description provided.\n"
|
||||
"Consider site migrations, acquisitions (Ansible->RedHat, Nginx->F5), and cross-domain moves to personal blogs.\n"
|
||||
"Return ONLY a JSON list: [{\"old_url\": \"...\", \"new_url\": \"...\"}, ...]\n"
|
||||
"If not found, set new_url to \"NONE\".\n\n"
|
||||
"RESOURCES:\n" + "\n".join([f"- {d['title']} ({d['url']})" for d in batch_info])
|
||||
"RESOURCES:\n" + "\n".join([f"- Title: {d['title']} | Desc: {d['context'][:150]} | URL: {d['url']}" for d in batch_info])
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -131,7 +140,6 @@ class IntelligentLinkCleaner:
|
||||
for u in batch:
|
||||
new_loc = res_map.get(normalize_url(u))
|
||||
if new_loc and new_loc.startswith("http") and "NONE" not in new_loc.upper():
|
||||
# Verify rescued URL
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=10, follow_redirects=True, verify=False) as client:
|
||||
resp = await client.get(new_loc)
|
||||
@@ -139,143 +147,81 @@ class IntelligentLinkCleaner:
|
||||
log_event(f" [✨] RESCUED: {u} -> {new_loc}")
|
||||
check_results[u] = (True, "resurrected", new_loc)
|
||||
except: pass
|
||||
except Exception as e:
|
||||
log_event(f" [!] Rescue Batch Error: {e}")
|
||||
except: pass
|
||||
|
||||
# 2.8. Finalize Link Status & Update Inventory
|
||||
# 2.8. Finalize Status with Foundational Preservation
|
||||
for url, (alive, reason, final) in check_results.items():
|
||||
nu = normalize_url(url); entry = self.inventory.get(nu, {})
|
||||
|
||||
# Update Health Score
|
||||
score = entry.get("health_score", 100)
|
||||
score = (score * 0.8) + (100 if alive else 0) * 0.2
|
||||
entry["health_score"] = round(score, 1)
|
||||
entry["last_checked"] = datetime.now().timestamp()
|
||||
entry["health_score"] = round(score, 1); entry["last_checked"] = datetime.now().timestamp()
|
||||
|
||||
# --- MANDATE 31: HIGH-VALUE PROTECTION ---
|
||||
# Check importance from either current mapping or historical stars
|
||||
is_important = any(occ.get("is_important") for occ in self.link_registry.get(nu, []))
|
||||
if entry.get("stars", 0) >= 3: is_important = True
|
||||
|
||||
if alive:
|
||||
# Semantic Drift check
|
||||
# (Skipped in this batch logic for speed, but can be added back if needed)
|
||||
pass
|
||||
|
||||
if not alive and score < 20:
|
||||
entry["status"] = "dead"; self.dead_links[url] = (None, reason)
|
||||
if not alive:
|
||||
if is_important:
|
||||
entry["status"] = "review_required"
|
||||
log_event(f" [⚠️] PRESERVED (Review Needed): {url} is HIGH-VALUE.")
|
||||
elif score < 20:
|
||||
entry["status"] = "dead"; self.dead_links[url] = (None, reason)
|
||||
elif final and alive:
|
||||
self.dead_links[url] = (f"CANONICAL:{final}", "Redirect")
|
||||
|
||||
self.inventory[nu] = entry
|
||||
|
||||
# 3. Finalize
|
||||
await self.apply_changes()
|
||||
|
||||
async def _check_and_fix_link(self, url: str):
|
||||
# Deprecated by new batch execution flow in execute_clean_cycle
|
||||
pass
|
||||
|
||||
async def _try_rescue_link(self, old_url: str, title: str) -> Optional[str]:
|
||||
"""
|
||||
Uses Gemini to identify the new home of a technical resource.
|
||||
Universal application for all links (Mandate 31).
|
||||
Supports cross-domain migrations (e.g. Corporate Blog -> Personal Blog).
|
||||
"""
|
||||
if not title: return None
|
||||
prompt = (
|
||||
f"You act as a Technical Librarian. The resource '{title}' was at '{old_url}'.\n"
|
||||
"The site has migrated, restructured, or the content has moved to a new domain (e.g. from a corporate blog to a personal one).\n"
|
||||
"Identify the NEW specific URL for this technical content. It must lead to the same article or its direct technical equivalent.\n"
|
||||
"Return ONLY the raw URL. If not found, return 'NONE'."
|
||||
)
|
||||
try:
|
||||
async with self.ai_semaphore:
|
||||
# Use Pro for high-fidelity web knowledge
|
||||
new_url = await call_gemini_with_retry(prompt, response_format="text", prefer_flash=False)
|
||||
if new_url and new_url.startswith("http") and "NONE" not in new_url.upper():
|
||||
# Strip quotes or extra text if AI failed to follow "ONLY URL"
|
||||
new_url = re.search(r'(https?://[^\s\"\'\>]+)', new_url)
|
||||
if new_url:
|
||||
new_url = new_url.group(1)
|
||||
if normalize_url(new_url) != normalize_url(old_url):
|
||||
async with httpx.AsyncClient(timeout=10, follow_redirects=True, verify=False) as client:
|
||||
resp = await client.get(new_url)
|
||||
if resp.status_code < 400: return new_url
|
||||
except: pass
|
||||
return None
|
||||
|
||||
async def _check_url_logic(self, url: str) -> Tuple[bool, str, Optional[str]]:
    """Fetch ``url`` and classify the link.

    Returns:
        A ``(alive, reason, final_url)`` tuple:

        * ``(True, "OK", final_or_None)`` - status below 400; ``final_url``
          is the post-redirect URL when it differs from ``url``.
        * ``(False, "parked", None)`` - the page body contains a
          parked-domain phrase.
        * ``(False, "generic_redirect_loss", None)`` - a deep link was
          redirected to a shallow or generic page (Mandate 31).
        * ``(True, "healed", new_url)`` - a GitHub ``/master/`` link that
          works once rewritten to ``/main/``.
        * ``(True, "consolidated", repo_root)`` - a GitHub deep link whose
          repository root still answers.
        * ``(False, "404", None)`` - 404/410 with no recovery.
        * ``(True, "Soft Block <code>", None)`` - any other status of 400 or
          above; the link is kept.
        * ``(True, "Error", None)`` - the request itself failed; the link is
          kept.
    """
    headers = {"User-Agent": "Mozilla/5.0", "Accept-Language": "en-US,en;q=0.5"}
    parked = ["buy this domain", "parked free", "domain is for sale"]
    generic_markers = ["/about", "/products", "/home"]
    try:
        async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=12) as client:
            resp = await client.get(url)

            if resp.status_code < 400:
                text = resp.text.lower()
                final_url = str(resp.url)
                if any(kw in text for kw in parked):
                    return False, "parked", None

                # Mandate 31: Content-URL Precision (Generic Redirect Detection)
                if final_url != url:
                    u_p = url.split("://")[-1].rstrip("/")
                    f_p = final_url.split("://")[-1].rstrip("/")
                    is_deep_orig = u_p.count("/") >= 3
                    is_shallow_final = f_p.count("/") <= 2 or any(kw in f_p for kw in generic_markers)
                    if is_deep_orig and is_shallow_final:
                        return False, "generic_redirect_loss", None

                return True, "OK", final_url if final_url != url else None

            if resp.status_code in [404, 410]:
                if "github.com" in url:
                    if "/master/" in url:
                        healed = url.replace("/master/", "/main/")
                        try:
                            # Any status below 400 counts as success. The
                            # threshold was 200, which a final response never
                            # satisfies, so healing could never trigger.
                            if (await client.get(healed)).status_code < 400:
                                return True, "healed", healed
                        except Exception:
                            pass  # best-effort probe; fall through to the next strategy

                    m = re.search(r'(https?://github\.com/[^/]+/[^/]+)', url)
                    # Skip the probe when the URL already is the repo root:
                    # it has just returned 404/410.
                    if m and m.group(1) != url:
                        try:
                            if (await client.get(m.group(1))).status_code < 400:
                                return True, "consolidated", m.group(1)
                        except Exception:
                            # Guarded locally so a failing probe cannot reach
                            # the outer handler and report a confirmed
                            # 404 as alive.
                            pass
                return False, "404", None

            return True, f"Soft Block {resp.status_code}", None
    except Exception:
        # A failure of the request itself is not evidence that the link is
        # dead, so the link is kept.
        return True, "Error", None
|
||||
|
||||
async def apply_changes(self):
    """Rewrite the Markdown files for every entry in ``self.dead_links`` and publish.

    For each dead-link entry ``url -> (fallback, reason)``:

    * when ``fallback`` starts with ``"CANONICAL:"``, every line containing
      ``url`` has it replaced by the canonical URL;
    * otherwise every line containing ``url`` is dropped.

    Files to edit come from the inventory's ``v1_locations`` for the URL,
    falling back to the files recorded in ``self.link_registry``. Afterwards
    orphaned metadata is pruned, the inventory is saved and added to the
    payload, and the payload is handed to
    ``self.git_controller.apply_multi_file_changes`` together with a
    ``SafetyGuard`` audit report.
    """
    log_event("APPLYING CLEANING CHANGES & PR GENERATION...", section_break=True)
    canonical_prefix = "CANONICAL:"
    file_updates = {}
    for url, (fallback, _reason) in self.dead_links.items():
        nu = normalize_url(url)
        paths = self.inventory.get(nu, {}).get("v1_locations", [])
        if not paths:
            paths = [occ["file"] for occ in self.link_registry.get(nu, [])]

        for path in set(paths):
            if not os.path.exists(path):
                continue
            if path not in file_updates:
                with open(path, "r", encoding="utf-8") as fh:
                    file_updates[path] = fh.readlines()

            buffered = file_updates[path]
            for i, line in enumerate(buffered):
                # A line removed for an earlier dead URL is stored as None;
                # skip it instead of evaluating `url in None` (TypeError).
                if line is None or url not in line:
                    continue
                if fallback and fallback.startswith(canonical_prefix):
                    # Strip only the leading marker, not every occurrence.
                    buffered[i] = line.replace(url, fallback[len(canonical_prefix):])
                else:
                    buffered[i] = None

    final_payload = {
        p: "".join(kept for kept in content if kept is not None)
        for p, content in file_updates.items()
    }
    await self.prune_orphaned_metadata()
    self._save_inventory()
    final_payload[INVENTORY_PATH] = yaml.dump(self.inventory, sort_keys=False, allow_unicode=True)

    from src.safety_guard import SafetyGuard
    report = SafetyGuard().generate_audit_report()
    if final_payload:
        self.git_controller.apply_multi_file_changes(
            final_payload,
            {"total_extracted": len(self.link_registry)},
            safety_report=report,
        )
|
||||
|
||||
async def prune_orphaned_metadata(self):
|
||||
valid_map = {}
|
||||
@@ -283,8 +229,7 @@ class IntelligentLinkCleaner:
|
||||
for f in files:
|
||||
if f.endswith(".md"):
|
||||
p = os.path.join(root, f); c = open(p, "r").read()
|
||||
for u in re.findall(r'\[.*?\]\((https?://.*?)\)', c):
|
||||
valid_map.setdefault(normalize_url(u), []).append(p)
|
||||
for u in re.findall(r'\[.*?\]\((https?://.*?)\)', c): valid_map.setdefault(normalize_url(u), []).append(p)
|
||||
new_inv = {}
|
||||
for u, m in self.inventory.items():
|
||||
if u.startswith("INTRO:") or u in valid_map:
|
||||
|
||||
Reference in New Issue
Block a user