feat(health): robust report generation and incremental processing stabilization

This commit is contained in:
Nubenetes Bot
2026-05-10 10:32:32 +02:00
parent 795904638c
commit 660c0b8078

View File

@@ -5,7 +5,7 @@ import re
import httpx
import random
from datetime import datetime
from typing import Dict, List, Set, Tuple, Optional
from typing import Dict, List, Set, Tuple, Optional, Any
from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ
from src.gitops_manager import RepositoryController
from src.markdown_ast import MarkdownSanitizer
@@ -56,17 +56,15 @@ class IntelligentLinkCleaner:
return None
async def _check_url_with_retries(self, url: str, max_retries=5) -> Tuple[str, bool, Optional[str], str]:
# 1. Check Cache (Incremental Processing para evitar Timeouts)
now = datetime.now().timestamp()
cache_entry = self.learning_data.get("link_cache", {}).get(url)
if cache_entry and cache_entry.get("status") == "ALIVE":
if now - cache_entry.get("last_checked", 0) < (21 * 24 * 3600): # 21 días
if now - cache_entry.get("last_checked", 0) < (21 * 24 * 3600):
self.detailed_stats["skipped_recent"] += 1
return url, True, None, "Cached (Recent)"
domain = url.split("//")[-1].split("/")[0]
domain_info = self.learning_data["domains"].get(domain, {})
strategies = [
{"type": "http", "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", "ref": "https://www.google.com/", "desc": "Desktop/Google"},
{"type": "http", "ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1", "ref": "https://t.co/", "desc": "Mobile/Twitter"},
@@ -74,45 +72,37 @@ class IntelligentLinkCleaner:
{"type": "http", "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0", "ref": "https://news.ycombinator.com/", "desc": "Firefox/Reddit"},
{"type": "playwright", "ua": "Mozilla/5.0 (Linux; Android 13; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36", "ref": "https://www.google.com/", "desc": "PW Mobile/Google"}
]
best_strat_idx = domain_info.get("best_strategy_idx")
if best_strat_idx is not None and best_strat_idx < len(strategies):
best_strat = strategies.pop(best_strat_idx)
strategies.insert(0, best_strat)
best_strat = strategies.pop(best_strat_idx); strategies.insert(0, best_strat)
for attempt in range(min(max_retries, len(strategies))):
strategy = strategies[attempt]
try:
if attempt > 0: await asyncio.sleep((2 ** attempt) + random.random())
is_alive, reason = await self._check_url_logic(url, strategy)
if is_alive:
if domain not in self.learning_data["domains"]: self.learning_data["domains"][domain] = {}
original_idx = attempt if best_strat_idx is None else (best_strat_idx if attempt == 0 else (attempt if attempt < best_strat_idx else attempt))
self.learning_data["domains"][domain]["best_strategy_idx"] = original_idx
self.learning_data["link_cache"][url] = {"status": "ALIVE", "last_checked": now}
return url, True, None, f"Alive ({strategy['desc']}) - {reason}"
if reason in ["404", "soft_404", "redirect_to_home"]:
if any(git_host in url for git_host in ["github.com", "gitlab.com", "bitbucket.org"]):
parts = url.split("/")
if len(parts) > 4:
repo_root = "/".join(parts[:5])
parts = url.split("/"); repo_root = "/".join(parts[:5]) if len(parts) > 4 else None
if repo_root:
root_alive, _ = await self._check_url_logic(repo_root, strategies[0])
if root_alive: return url, False, f"REPO_ROOT:{repo_root}", f"Consolidated (Original: {reason})"
if attempt == max_retries - 1:
archived = await self._check_wayback(url)
if archived: return url, False, f"ARCHIVE:{archived}", f"Archived (Original: {reason})"
return url, False, None, reason
except: pass
return url, True, None, "Conservative Keep"
async def _check_url_logic(self, url: str, strategy: Dict) -> Tuple[bool, str]:
headers = {"User-Agent": strategy["ua"], "Referer": strategy["ref"], "Accept-Language": "en-US,en;q=0.9"}
paywall_indicators = ["sign in", "create free account", "member-only story", "página de suscripción", "inicia sesión"]
if strategy["type"] == "http":
try:
async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=12) as client:
@@ -160,20 +150,18 @@ class IntelligentLinkCleaner:
except: pass
async def validate_links_tiered(self, batch_size: int = 40):
    """Check every registered URL in concurrent batches and record dead links.

    URLs are taken from ``self.link_registry``, shuffled, and validated
    ``batch_size`` at a time through ``self._check_url_with_retries``. Every
    URL reported as not alive is stored in ``self.dead_links`` as a
    ``(fallback_or_"DEAD", reason)`` tuple. ``self._save_memory()`` is called
    after each batch so that progress made so far is not lost if a later
    batch fails.

    Args:
        batch_size: Number of URLs validated concurrently per batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would silently produce zero batches.
        raise ValueError("batch_size must be a positive integer")

    print(f"[*] Validando {len(self.link_registry)} URLs...")
    unique_urls = list(self.link_registry)
    # Randomized order; presumably so consecutive requests do not all hit
    # the same domain -- confirm against the registry's ordering.
    random.shuffle(unique_urls)
    total = len(unique_urls)

    for start in range(0, total, batch_size):
        batch = unique_urls[start:start + batch_size]
        results = await asyncio.gather(
            *(self._check_url_with_retries(url) for url in batch)
        )
        for url, is_alive, fallback, reason in results:
            if not is_alive:
                self.dead_links[url] = (fallback or "DEAD", reason)
        self._save_memory()
        # Report how many URLs are actually done, once per batch. The previous
        # check (`i % 100 == 0` against a stride of 40) only fired every 200
        # URLs and printed the batch's start offset instead of the count.
        print(f" - Progreso: {min(start + batch_size, total)}/{total}")
async def apply_changes(self):
print("[*] Aplicando cambios y métricas visuales...")
print("[*] Aplicando cambios y generando métricas visuales...")
file_updates = {}
def track(file, op, url, reason, cat=None):
if file not in self.detailed_stats["by_file"]: self.detailed_stats["by_file"][file] = {"removed": 0, "modified": 0, "created": 0}
@@ -191,13 +179,13 @@ class IntelligentLinkCleaner:
with open(file_path, 'r') as f: file_updates[file_path] = f.readlines()
line_idx = occ["line_index"]
if fallback and fallback.startswith("ARCHIVE:"):
real_fallback = fallback.replace("ARCHIVE:", "")
file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_fallback)
real_f = fallback.replace("ARCHIVE:", "")
file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_f)
if "[ARCHIVED]" not in file_updates[file_path][line_idx]: file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace("](", " [ARCHIVED]]( ")
track(file_path, "modified", url, reason); self.detailed_stats["operation_types"]["archived"] += 1
elif fallback and fallback.startswith("REPO_ROOT:"):
real_fallback = fallback.replace("REPO_ROOT:", "")
file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_fallback)
real_f = fallback.replace("REPO_ROOT:", "")
file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_f)
track(file_path, "modified", url, reason); self.detailed_stats["operation_types"]["consolidated"] += 1
else:
if file_path not in CORE_FILES:
@@ -214,90 +202,57 @@ class IntelligentLinkCleaner:
with open(self.curator.mkdocs_path, 'r') as f: final_payload[self.curator.mkdocs_path] = f.read()
if final_payload: self._create_pr(final_payload)
def _create_pr(self, updates: Dict[str, str], report_content: Optional[str] = None):
    """Push the updated files to a new branch and open the health-report PR.

    Args:
        updates: Mapping of repository path -> full new file content.
        report_content: Markdown body for the pull request. When empty or
            ``None`` the body is produced by ``self._build_report_body()``.

    File updates are best-effort: a file that cannot be read or updated is
    reported on stdout and skipped, and the pull request is still opened.
    """
    # Sample the clock once so the branch name and PR title cannot disagree.
    now = datetime.now()
    branch_name = f"bot/autonomous-health-{now.strftime('%Y%m%d-%H%M')}"
    if not report_content:
        report_content = self._build_report_body()

    self.git_controller._create_feature_branch(branch_name)
    repo = self.git_controller.repository
    for path, content in updates.items():
        try:
            file_meta = repo.get_contents(path)
            repo.update_file(
                path=path,
                message=f"fix(autonomous): engine update in {path}",
                content=content,
                sha=file_meta.sha,
                branch=branch_name,
            )
        except Exception as exc:
            # Keep going, but make the failure visible instead of silently
            # opening a PR that is missing this file's changes.
            print(f"[!] No se pudo actualizar {path}: {exc}")

    # Trim the body to 65000 characters before sending it as the PR body.
    # NOTE(review): presumably a margin under GitHub's body-size limit -- confirm.
    safe_report = report_content[:65000]
    repo.create_pull(
        title=f"🧹 Autonomous Engine Health Report: {now.strftime('%d %b %Y')}",
        body=safe_report,
        head=branch_name,
        base="master",
    )
def _build_report_body(self) -> str:
    """Render the Markdown body of the health-report pull request.

    The report is assembled from ``self.detailed_stats`` (operation totals,
    cache skips and per-file counters), ``self.action_log`` (one entry per
    change, with ``file``, ``action``, ``url`` and ``reason`` keys) and
    ``self.learning_data["domains"]``.

    The action log is compressed as the report grows so it fits in a pull
    request body:
      * above 45000 characters, URLs longer than 53 characters are shortened;
      * above 55000 characters, each remaining file gets one summary row;
      * detailed rows stop at 62000 characters and summary rows at 64000,
        leaving room for the closing tag, the note and the footer below the
        65000-character slice applied by the caller.

    Returns:
        The complete Markdown report.
    """
    # Function-local import, as in the original block.
    from collections import defaultdict

    shorten_urls_above = 45000
    summarize_above = 55000
    detail_row_limit = 62000
    summary_row_limit = 64000

    stats = self.detailed_stats
    ops = stats["operation_types"]

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        "## 🧠 Nubenetes Autonomous Health & Curation Engine\n\n",
        "### 📊 Distribución de Operaciones\n",
        "```mermaid\npie title Operaciones de Mantenimiento\n",
        f' "Eliminados" : {ops["removals"]}\n',
        f' "Archivados" : {ops["archived"]}\n',
        f' "Consolidados" : {ops["consolidated"]}\n',
        f' "Nuevos" : {ops["orphans"]}\n```\n\n',
        "### 📈 Resumen de Eficiencia\n",
        "| Métrica | Cantidad | Detalle |\n| :--- | :---: | :--- |\n",
        f"| ⏩ Omitidos (Cache) | **{stats['skipped_recent']}** | Verificados hace menos de 21 días |\n",
        f"| 💀 Eliminados | **{ops['removals']}** | 404 definitivos |\n",
        f"| 🏛️ Archivados | **{ops['archived']}** | Vía Wayback Machine |\n",
        f"| 🎯 Consolidados | **{ops['consolidated']}** | Raíz de Repositorio Git |\n",
        f"| 🖇️ Nuevos | **{ops['orphans']}** | Páginas vinculadas |\n\n",
        "### 🧮 Matriz de Mantenimiento\n",
        "| Documento | 🔴 Elim | 🟡 Mod | 🟢 Crea | Estado |\n| :--- | :---: | :---: | :---: | :---: |\n",
    ]

    # Per-document maintenance matrix.
    for file, s in sorted(stats["by_file"].items()):
        status = "🧹 Limpio" if s["removed"] + s["modified"] < 3 else "🛠️ Refactor"
        if s["removed"] > 5:
            status = "⚠️ Crítico"
        parts.append(f"| `{file}` | {s['removed']} | {s['modified']} | {s['created']} | {status} |\n")

    parts.append("\n### 📝 Registro de Acciones\n<details><summary>Ver detalle de cambios</summary>\n\n")
    parts.append("| Archivo | Acción | Recurso (Acortado) | Motivo |\n| :--- | :---: | :--- | :--- |\n")

    # Group the log by file so a whole file can collapse into one summary row.
    logs_by_file = defaultdict(list)
    for log in self.action_log:
        logs_by_file[log["file"]].append(log)

    action_emoji = {"removed": "", "modified": "🔄", "created": ""}
    is_compressed = False
    processed_logs = 0
    current_len = sum(len(part) for part in parts)

    for file_path, actions in sorted(logs_by_file.items()):
        if current_len > summarize_above:
            is_compressed = True
            summary = f"| `{file_path}` | 🛠️ | *Múltiples enlaces* | Se procesaron {len(actions)} cambios en este archivo. |\n"
            # Cap summary rows too, so the caller's slice never cuts off the
            # closing </details> tag and footer.
            if current_len + len(summary) <= summary_row_limit:
                parts.append(summary)
                current_len += len(summary)
            continue
        for log in actions:
            emoji = action_emoji.get(log["action"], "")
            url = log["url"]
            # Keep the full URL while there is room; shorten only later on.
            if current_len > shorten_urls_above and len(url) > 53:
                url = url[:50] + "..."
            entry = f"| `{log['file']}` | {emoji} | {url} | {log['reason']} |\n"
            if current_len + len(entry) > detail_row_limit:
                is_compressed = True
                break
            parts.append(entry)
            current_len += len(entry)
            processed_logs += 1

    if is_compressed:
        parts.append(f"\n> 💡 **Nota**: Log comprimido para límites de GitHub ({processed_logs}/{len(self.action_log)} detallados).\n")
    # This must be an f-string: without the prefix the report shows the
    # literal "{len(self.learning_data['domains'])}" instead of the count.
    parts.append(f"</details>\n\n---\n*📈 Inteligencia acumulada: `{len(self.learning_data['domains'])}`*")
    return "".join(parts)
async def main():
try:
@@ -308,10 +263,6 @@ async def main():
await cleaner.curator.suggest_reorganization()
await cleaner.apply_changes()
except Exception as e:
import traceback
print(f"[CRITICAL ERROR]: {e}")
traceback.print_exc()
exit(1)
import traceback; print(f"[CRITICAL ERROR]: {e}"); traceback.print_exc(); exit(1)
if __name__ == "__main__":
asyncio.run(main())
if __name__ == "__main__": asyncio.run(main())