From 660c0b8078357eaee167550ecc06f0adb979f113 Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Sun, 10 May 2026 10:32:32 +0200 Subject: [PATCH] feat(health): robust report generation and incremental processing stabilization --- src/intelligent_health_checker.py | 119 +++++++++--------------------- 1 file changed, 35 insertions(+), 84 deletions(-) diff --git a/src/intelligent_health_checker.py b/src/intelligent_health_checker.py index 56ffd0a3..1ef540c9 100644 --- a/src/intelligent_health_checker.py +++ b/src/intelligent_health_checker.py @@ -5,7 +5,7 @@ import re import httpx import random from datetime import datetime -from typing import Dict, List, Set, Tuple, Optional +from typing import Dict, List, Set, Tuple, Optional, Any from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ from src.gitops_manager import RepositoryController from src.markdown_ast import MarkdownSanitizer @@ -56,17 +56,15 @@ class IntelligentLinkCleaner: return None async def _check_url_with_retries(self, url: str, max_retries=5) -> Tuple[str, bool, Optional[str], str]: - # 1. Check Cache (Incremental Processing para evitar Timeouts) now = datetime.now().timestamp() cache_entry = self.learning_data.get("link_cache", {}).get(url) if cache_entry and cache_entry.get("status") == "ALIVE": - if now - cache_entry.get("last_checked", 0) < (21 * 24 * 3600): # 21 días + if now - cache_entry.get("last_checked", 0) < (21 * 24 * 3600): self.detailed_stats["skipped_recent"] += 1 return url, True, None, "Cached (Recent)" domain = url.split("//")[-1].split("/")[0] domain_info = self.learning_data["domains"].get(domain, {}) - strategies = [ {"type": "http", "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", "ref": "https://www.google.com/", "desc": "Desktop/Google"}, {"type": "http", "ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1", "ref": "https://t.co/", "desc": "Mobile/Twitter"}, @@ -74,45 +72,37 @@ class IntelligentLinkCleaner: {"type": "http", "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0", "ref": "https://news.ycombinator.com/", "desc": "Firefox/Reddit"}, {"type": "playwright", "ua": "Mozilla/5.0 (Linux; Android 13; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36", "ref": "https://www.google.com/", "desc": "PW Mobile/Google"} ] - best_strat_idx = domain_info.get("best_strategy_idx") if best_strat_idx is not None and best_strat_idx < len(strategies): - best_strat = strategies.pop(best_strat_idx) - strategies.insert(0, best_strat) + best_strat = strategies.pop(best_strat_idx); strategies.insert(0, best_strat) for attempt in range(min(max_retries, len(strategies))): strategy = strategies[attempt] try: if attempt > 0: await asyncio.sleep((2 ** attempt) + random.random()) is_alive, reason = await self._check_url_logic(url, strategy) - if is_alive: if domain not in self.learning_data["domains"]: self.learning_data["domains"][domain] = {} original_idx = attempt if best_strat_idx is None else (best_strat_idx if attempt == 0 else (attempt if attempt < best_strat_idx else attempt)) self.learning_data["domains"][domain]["best_strategy_idx"] = original_idx self.learning_data["link_cache"][url] = {"status": "ALIVE", "last_checked": now} return url, True, None, f"Alive ({strategy['desc']}) - {reason}" - if reason in ["404", "soft_404", "redirect_to_home"]: if any(git_host in url for git_host in ["github.com", "gitlab.com", "bitbucket.org"]): - parts = url.split("/") - if len(parts) > 4: - repo_root = "/".join(parts[:5]) + parts = url.split("/"); repo_root = "/".join(parts[:5]) if len(parts) > 4 else None + if repo_root: root_alive, _ = await self._check_url_logic(repo_root, strategies[0]) if root_alive: return url, False, f"REPO_ROOT:{repo_root}", f"Consolidated (Original: {reason})" - if attempt == max_retries - 1: archived = await self._check_wayback(url) if archived: return url, False, f"ARCHIVE:{archived}", f"Archived (Original: {reason})" return url, False, None, reason except: pass - return url, True, None, "Conservative Keep" async def _check_url_logic(self, url: str, strategy: Dict) -> Tuple[bool, str]: headers = {"User-Agent": strategy["ua"], "Referer": strategy["ref"], "Accept-Language": "en-US,en;q=0.9"} paywall_indicators = ["sign in", "create free account", "member-only story", "página de suscripción", "inicia sesión"] - if strategy["type"] == "http": try: async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=12) as client: @@ -160,20 +150,18 @@ class IntelligentLinkCleaner: except: pass async def validate_links_tiered(self): - print(f"[*] Validando {len(self.link_registry)} URLs con procesamiento incremental...") + print(f"[*] Validando {len(self.link_registry)} URLs...") unique_urls = list(self.link_registry.keys()); random.shuffle(unique_urls) - batch_size = 40 - for i in range(0, len(unique_urls), batch_size): - batch = unique_urls[i:i+batch_size] + for i in range(0, len(unique_urls), 40): + batch = unique_urls[i:i+40] tasks = [self._check_url_with_retries(url) for url in batch] results = await asyncio.gather(*tasks) for url, is_alive, fallback, reason in results: if not is_alive: self.dead_links[url] = (fallback if fallback else "DEAD", reason) self._save_memory() - if i % 100 == 0: print(f" - Progreso: {i}/{len(unique_urls)}") async def apply_changes(self): - print("[*] Aplicando cambios y métricas visuales...") + print("[*] Aplicando cambios y generando métricas visuales...") file_updates = {} def track(file, op, url, reason, cat=None): if file not in self.detailed_stats["by_file"]: self.detailed_stats["by_file"][file] = {"removed": 0, "modified": 0, "created": 0} @@ -191,13 +179,13 @@ class IntelligentLinkCleaner: with open(file_path, 'r') as f: file_updates[file_path] = f.readlines() line_idx = occ["line_index"] if fallback and fallback.startswith("ARCHIVE:"): - real_fallback = fallback.replace("ARCHIVE:", "") - file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_fallback) + real_f = fallback.replace("ARCHIVE:", "") + file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_f) if "[ARCHIVED]" not in file_updates[file_path][line_idx]: file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace("](", " [ARCHIVED]]( ") track(file_path, "modified", url, reason); self.detailed_stats["operation_types"]["archived"] += 1 elif fallback and fallback.startswith("REPO_ROOT:"): - real_fallback = fallback.replace("REPO_ROOT:", "") - file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_fallback) + real_f = fallback.replace("REPO_ROOT:", "") + file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_f) track(file_path, "modified", url, reason); self.detailed_stats["operation_types"]["consolidated"] += 1 else: if file_path not in CORE_FILES: @@ -214,90 +202,57 @@ class IntelligentLinkCleaner: with open(self.curator.mkdocs_path, 'r') as f: final_payload[self.curator.mkdocs_path] = f.read() if final_payload: self._create_pr(final_payload) - def _create_pr(self, updates: Dict[str, str]): - timestamp = datetime.now().strftime("%Y%m%d-%H%M") - branch_name = f"bot/autonomous-health-{timestamp}" + def _create_pr(self, updates: Dict[str, str], report_content: str = None): + timestamp = datetime.now().strftime("%Y%m%d-%H%M"); branch_name = f"bot/autonomous-health-{timestamp}" + if not report_content: report_content = self._build_report_body() self.git_controller._create_feature_branch(branch_name) for path, content in updates.items(): try: file_meta = self.git_controller.repository.get_contents(path) self.git_controller.repository.update_file(path=path, message=f"fix(autonomous): engine update in {path}", content=content, sha=file_meta.sha, branch=branch_name) except: pass + safe_report = report_content[:65000] + self.git_controller.repository.create_pull(title=f"🧹 Autonomous Engine Health Report: {datetime.now().strftime('%d %b %Y')}", body=safe_report, head=branch_name, base="master") + def _build_report_body(self) -> str: report = "## 🧠 Nubenetes Autonomous Health & Curation Engine\n\n" - # Mermaid Pie Chart report += "### 📊 Distribución de Operaciones\n" report += "```mermaid\npie title Operaciones de Mantenimiento\n" report += f" \"Eliminados\" : {self.detailed_stats['operation_types']['removals']}\n" report += f" \"Archivados\" : {self.detailed_stats['operation_types']['archived']}\n" report += f" \"Consolidados\" : {self.detailed_stats['operation_types']['consolidated']}\n" report += f" \"Nuevos\" : {self.detailed_stats['operation_types']['orphans']}\n```\n\n" - report += "### 📈 Resumen de Eficiencia\n" - report += f"| Métrica | Cantidad | Detalle |\n| :--- | :---: | :--- |\n" + report += "| Métrica | Cantidad | Detalle |\n| :--- | :---: | :--- |\n" report += f"| ⏩ Omitidos (Cache) | **{self.detailed_stats['skipped_recent']}** | Verificados hace menos de 21 días |\n" report += f"| 💀 Eliminados | **{self.detailed_stats['operation_types']['removals']}** | 404 definitivos |\n" report += f"| 🏛️ Archivados | **{self.detailed_stats['operation_types']['archived']}** | Vía Wayback Machine |\n" report += f"| 🎯 Consolidados | **{self.detailed_stats['operation_types']['consolidated']}** | Raíz de Repositorio Git |\n" report += f"| 🖇️ Nuevos | **{self.detailed_stats['operation_types']['orphans']}** | Páginas vinculadas |\n\n" - - report += "### 🧮 Matriz de Mantenimiento por Documento\n" + report += "### 🧮 Matriz de Mantenimiento\n" report += "| Documento | 🔴 Elim | 🟡 Mod | 🟢 Crea | Estado |\n| :--- | :---: | :---: | :---: | :---: |\n" for file, s in sorted(self.detailed_stats["by_file"].items()): status = "🧹 Limpio" if s['removed'] + s['modified'] < 3 else "🛠️ Refactor" if s['removed'] > 5: status = "⚠️ Crítico" report += f"| `{file}` | {s['removed']} | {s['modified']} | {s['created']} | {status} |\n" - # Action Log con Compresión Adaptativa report += "\n### 📝 Registro de Acciones\n
Ver detalle de cambios\n\n" report += "| Archivo | Acción | Recurso (Acortado) | Motivo |\n| :--- | :---: | :--- | :--- |\n" - - is_compressed = False - current_len = len(report) - processed_logs = 0 - - # Agrupar logs por archivo para poder comprimir si es necesario - from collections import defaultdict - logs_by_file = defaultdict(list) - for log in self.action_log: - logs_by_file[log["file"]].append(log) - + is_compressed = False; current_len = len(report); processed_logs = 0 + from collections import defaultdict; logs_by_file = defaultdict(list) + for log in self.action_log: logs_by_file[log["file"]].append(log) for file_path, actions in sorted(logs_by_file.items()): - if current_len > 55000: # Umbral de compresión agresiva - is_compressed = True - summary = f"| `{file_path}` | 🛠️ | *Múltiples enlaces* | Se procesaron {len(actions)} cambios en este archivo. |\n" - report += summary - current_len += len(summary) - continue - + if current_len > 55000: + is_compressed = True; summary = f"| `{file_path}` | 🛠️ | *Múltiples enlaces* | Se procesaron {len(actions)} cambios en este archivo. |\n" + report += summary; current_len += len(summary); continue for log in actions: emoji = {"removed": "❌", "modified": "🔄", "created": "✨"}.get(log["action"], "❓") - - # Inteligencia: Mantener URL completa mientras haya espacio - if current_len > 45000: - short_url = (log["url"][:50] + "...") if len(log["url"]) > 53 else log["url"] - else: - short_url = log["url"] - + short_url = (log["url"][:50] + "...") if current_len > 45000 and len(log["url"]) > 53 else log["url"] entry = f"| `{log['file']}` | {emoji} | {short_url} | {log['reason']} |\n" - - if current_len + len(entry) > 62000: - is_compressed = True - break - - report += entry - current_len += len(entry) - processed_logs += 1 - - if is_compressed: - report += f"\n> 💡 **Nota**: El log ha sido comprimido o truncado para cumplir con los límites de GitHub ({processed_logs}/{len(self.action_log)} acciones detalladas).\n" - - report += "
\n\n" -report += f"\n---\n*📈 Inteligencia de dominios acumulada: `{len(self.learning_data['domains'])}`*" - -# Validación final de longitud -safe_report = report[:65000] -self.git_controller.repository.create_pull(title=f"🧹 Autonomous Engine Health Report: {datetime.now().strftime('%d %b %Y')}", body=safe_report, head=branch_name, base="master") - + if current_len + len(entry) > 62000: is_compressed = True; break + report += entry; current_len += len(entry); processed_logs += 1 + if is_compressed: report += f"\n> 💡 **Nota**: Log comprimido para límites de GitHub ({processed_logs}/{len(self.action_log)} detallados).\n" + report += "\n\n---\n*📈 Inteligencia acumulada: `{len(self.learning_data['domains'])}`*" + return report async def main(): try: @@ -308,10 +263,6 @@ async def main(): await cleaner.curator.suggest_reorganization() await cleaner.apply_changes() except Exception as e: - import traceback - print(f"[CRITICAL ERROR]: {e}") - traceback.print_exc() - exit(1) + import traceback; print(f"[CRITICAL ERROR]: {e}"); traceback.print_exc(); exit(1) -if __name__ == "__main__": - asyncio.run(main()) +if __name__ == "__main__": asyncio.run(main())