From 660c0b8078357eaee167550ecc06f0adb979f113 Mon Sep 17 00:00:00 2001
From: Nubenetes Bot <bot@nubenetes.com>
Date: Sun, 10 May 2026 10:32:32 +0200
Subject: [PATCH] feat(health): robust report generation and incremental
 processing stabilization

---
 src/intelligent_health_checker.py | 119 +++++++++---------------------
 1 file changed, 35 insertions(+), 84 deletions(-)

diff --git a/src/intelligent_health_checker.py b/src/intelligent_health_checker.py
index 56ffd0a3..1ef540c9 100644
--- a/src/intelligent_health_checker.py
+++ b/src/intelligent_health_checker.py
@@ -5,7 +5,7 @@ import re
 import httpx
 import random
 from datetime import datetime
-from typing import Dict, List, Set, Tuple, Optional
+from typing import Dict, List, Set, Tuple, Optional, Any
 from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ
 from src.gitops_manager import RepositoryController
 from src.markdown_ast import MarkdownSanitizer
@@ -56,17 +56,15 @@ class IntelligentLinkCleaner:
         return None
 
     async def _check_url_with_retries(self, url: str, max_retries=5) -> Tuple[str, bool, Optional[str], str]:
-        # 1. Check Cache (Incremental Processing para evitar Timeouts)
         now = datetime.now().timestamp()
         cache_entry = self.learning_data.get("link_cache", {}).get(url)
         if cache_entry and cache_entry.get("status") == "ALIVE":
-            if now - cache_entry.get("last_checked", 0) < (21 * 24 * 3600): # 21 días
+            if now - cache_entry.get("last_checked", 0) < (21 * 24 * 3600):
                 self.detailed_stats["skipped_recent"] += 1
                 return url, True, None, "Cached (Recent)"
 
         domain = url.split("//")[-1].split("/")[0]
         domain_info = self.learning_data["domains"].get(domain, {})
-        
         strategies = [
             {"type": "http", "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", "ref": "https://www.google.com/", "desc": "Desktop/Google"},
             {"type": "http", "ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1", "ref": "https://t.co/", "desc": "Mobile/Twitter"},
@@ -74,45 +72,37 @@ class IntelligentLinkCleaner:
             {"type": "http", "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0", "ref": "https://news.ycombinator.com/", "desc": "Firefox/Reddit"},
             {"type": "playwright", "ua": "Mozilla/5.0 (Linux; Android 13; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36", "ref": "https://www.google.com/", "desc": "PW Mobile/Google"}
         ]
-
         best_strat_idx = domain_info.get("best_strategy_idx")
         if best_strat_idx is not None and best_strat_idx < len(strategies):
-            best_strat = strategies.pop(best_strat_idx)
-            strategies.insert(0, best_strat)
+            best_strat = strategies.pop(best_strat_idx); strategies.insert(0, best_strat)
 
         for attempt in range(min(max_retries, len(strategies))):
             strategy = strategies[attempt]
             try:
                 if attempt > 0: await asyncio.sleep((2 ** attempt) + random.random())
                 is_alive, reason = await self._check_url_logic(url, strategy)
-                
                 if is_alive:
                     if domain not in self.learning_data["domains"]: self.learning_data["domains"][domain] = {}
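-                    original_idx = attempt if best_strat_idx is None else (best_strat_idx if attempt == 0 else (attempt if attempt < best_strat_idx else attempt))
+                    original_idx = attempt if best_strat_idx is None or best_strat_idx >= len(strategies) else (best_strat_idx if attempt == 0 else (attempt - 1 if attempt <= best_strat_idx else attempt))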
                     self.learning_data["domains"][domain]["best_strategy_idx"] = original_idx
                     self.learning_data["link_cache"][url] = {"status": "ALIVE", "last_checked": now}
                     return url, True, None, f"Alive ({strategy['desc']}) - {reason}"
-                
                 if reason in ["404", "soft_404", "redirect_to_home"]:
                     if any(git_host in url for git_host in ["github.com", "gitlab.com", "bitbucket.org"]):
-                        parts = url.split("/")
-                        if len(parts) > 4:
-                            repo_root = "/".join(parts[:5])
+                        parts = url.split("/"); repo_root = "/".join(parts[:5]) if len(parts) > 4 else None
+                        if repo_root:
                             root_alive, _ = await self._check_url_logic(repo_root, strategies[0])
                             if root_alive: return url, False, f"REPO_ROOT:{repo_root}", f"Consolidated (Original: {reason})"
-                    
                     if attempt == max_retries - 1:
                         archived = await self._check_wayback(url)
                         if archived: return url, False, f"ARCHIVE:{archived}", f"Archived (Original: {reason})"
                         return url, False, None, reason
             except: pass
-            
         return url, True, None, "Conservative Keep"
 
     async def _check_url_logic(self, url: str, strategy: Dict) -> Tuple[bool, str]:
         headers = {"User-Agent": strategy["ua"], "Referer": strategy["ref"], "Accept-Language": "en-US,en;q=0.9"}
         paywall_indicators = ["sign in", "create free account", "member-only story", "página de suscripción", "inicia sesión"]
-
         if strategy["type"] == "http":
             try:
                 async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=12) as client:
@@ -160,20 +150,18 @@ class IntelligentLinkCleaner:
             except: pass
 
     async def validate_links_tiered(self):
-        print(f"[*] Validando {len(self.link_registry)} URLs con procesamiento incremental...")
+        print(f"[*] Validando {len(self.link_registry)} URLs...")
         unique_urls = list(self.link_registry.keys()); random.shuffle(unique_urls)
-        batch_size = 40
-        for i in range(0, len(unique_urls), batch_size):
-            batch = unique_urls[i:i+batch_size]
+        for i in range(0, len(unique_urls), 40):
+            batch = unique_urls[i:i+40]
             tasks = [self._check_url_with_retries(url) for url in batch]
             results = await asyncio.gather(*tasks)
             for url, is_alive, fallback, reason in results:
                 if not is_alive: self.dead_links[url] = (fallback if fallback else "DEAD", reason)
             self._save_memory()
-            if i % 100 == 0: print(f"    - Progreso: {i}/{len(unique_urls)}")
 
     async def apply_changes(self):
-        print("[*] Aplicando cambios y métricas visuales...")
+        print("[*] Aplicando cambios y generando métricas visuales...")
         file_updates = {}
         def track(file, op, url, reason, cat=None):
             if file not in self.detailed_stats["by_file"]: self.detailed_stats["by_file"][file] = {"removed": 0, "modified": 0, "created": 0}
@@ -191,13 +179,13 @@ class IntelligentLinkCleaner:
                     with open(file_path, 'r') as f: file_updates[file_path] = f.readlines()
                 line_idx = occ["line_index"]
                 if fallback and fallback.startswith("ARCHIVE:"):
-                    real_fallback = fallback.replace("ARCHIVE:", "")
-                    file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_fallback)
+                    real_f = fallback.replace("ARCHIVE:", "")
+                    file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_f)
                     if "[ARCHIVED]" not in file_updates[file_path][line_idx]: file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace("](", " [ARCHIVED]]( ")
                     track(file_path, "modified", url, reason); self.detailed_stats["operation_types"]["archived"] += 1
                 elif fallback and fallback.startswith("REPO_ROOT:"):
-                    real_fallback = fallback.replace("REPO_ROOT:", "")
-                    file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_fallback)
+                    real_f = fallback.replace("REPO_ROOT:", "")
+                    file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_f)
                     track(file_path, "modified", url, reason); self.detailed_stats["operation_types"]["consolidated"] += 1
                 else:
                     if file_path not in CORE_FILES:
@@ -214,90 +202,57 @@ class IntelligentLinkCleaner:
             with open(self.curator.mkdocs_path, 'r') as f: final_payload[self.curator.mkdocs_path] = f.read()
         if final_payload: self._create_pr(final_payload)
 
-    def _create_pr(self, updates: Dict[str, str]):
-        timestamp = datetime.now().strftime("%Y%m%d-%H%M")
-        branch_name = f"bot/autonomous-health-{timestamp}"
+    def _create_pr(self, updates: Dict[str, str], report_content: Optional[str] = None):
+        timestamp = datetime.now().strftime("%Y%m%d-%H%M"); branch_name = f"bot/autonomous-health-{timestamp}"
+        if not report_content: report_content = self._build_report_body()
         self.git_controller._create_feature_branch(branch_name)
         for path, content in updates.items():
             try:
                 file_meta = self.git_controller.repository.get_contents(path)
                 self.git_controller.repository.update_file(path=path, message=f"fix(autonomous): engine update in {path}", content=content, sha=file_meta.sha, branch=branch_name)
             except: pass
+        safe_report = report_content[:65000]
+        self.git_controller.repository.create_pull(title=f"🧹 Autonomous Engine Health Report: {datetime.now().strftime('%d %b %Y')}", body=safe_report, head=branch_name, base="master")
 
+    def _build_report_body(self) -> str:
         report = "## 🧠 Nubenetes Autonomous Health & Curation Engine\n\n"
-        # Mermaid Pie Chart
         report += "### 📊 Distribución de Operaciones\n"
         report += "```mermaid\npie title Operaciones de Mantenimiento\n"
         report += f"    \"Eliminados\" : {self.detailed_stats['operation_types']['removals']}\n"
         report += f"    \"Archivados\" : {self.detailed_stats['operation_types']['archived']}\n"
         report += f"    \"Consolidados\" : {self.detailed_stats['operation_types']['consolidated']}\n"
         report += f"    \"Nuevos\" : {self.detailed_stats['operation_types']['orphans']}\n```\n\n"
-
         report += "### 📈 Resumen de Eficiencia\n"
-        report += f"| Métrica | Cantidad | Detalle |\n| :--- | :---: | :--- |\n"
+        report += "| Métrica | Cantidad | Detalle |\n| :--- | :---: | :--- |\n"
         report += f"| ⏩ Omitidos (Cache) | **{self.detailed_stats['skipped_recent']}** | Verificados hace menos de 21 días |\n"
         report += f"| 💀 Eliminados | **{self.detailed_stats['operation_types']['removals']}** | 404 definitivos |\n"
         report += f"| 🏛️ Archivados | **{self.detailed_stats['operation_types']['archived']}** | Vía Wayback Machine |\n"
         report += f"| 🎯 Consolidados | **{self.detailed_stats['operation_types']['consolidated']}** | Raíz de Repositorio Git |\n"
         report += f"| 🖇️ Nuevos | **{self.detailed_stats['operation_types']['orphans']}** | Páginas vinculadas |\n\n"
-
-        report += "### 🧮 Matriz de Mantenimiento por Documento\n"
+        report += "### 🧮 Matriz de Mantenimiento\n"
         report += "| Documento | 🔴 Elim | 🟡 Mod | 🟢 Crea | Estado |\n| :--- | :---: | :---: | :---: | :---: |\n"
         for file, s in sorted(self.detailed_stats["by_file"].items()):
             status = "🧹 Limpio" if s['removed'] + s['modified'] < 3 else "🛠️ Refactor"
             if s['removed'] > 5: status = "⚠️ Crítico"
             report += f"| `{file}` | {s['removed']} | {s['modified']} | {s['created']} | {status} |\n"
-        # Action Log con Compresión Adaptativa
         report += "\n### 📝 Registro de Acciones\n<details><summary>Ver detalle de cambios</summary>\n\n"
         report += "| Archivo | Acción | Recurso (Acortado) | Motivo |\n| :--- | :---: | :--- | :--- |\n"
-        
-        is_compressed = False
-        current_len = len(report)
-        processed_logs = 0
-        
-        # Agrupar logs por archivo para poder comprimir si es necesario
-        from collections import defaultdict
-        logs_by_file = defaultdict(list)
-        for log in self.action_log:
-            logs_by_file[log["file"]].append(log)
-
+        is_compressed = False; current_len = len(report); processed_logs = 0
+        from collections import defaultdict; logs_by_file = defaultdict(list)
+        for log in self.action_log: logs_by_file[log["file"]].append(log)
         for file_path, actions in sorted(logs_by_file.items()):
-            if current_len > 55000: # Umbral de compresión agresiva
-                is_compressed = True
-                summary = f"| `{file_path}` | 🛠️ | *Múltiples enlaces* | Se procesaron {len(actions)} cambios en este archivo. |\n"
-                report += summary
-                current_len += len(summary)
-                continue
-
+            if current_len > 55000:
+                is_compressed = True; summary = f"| `{file_path}` | 🛠️ | *Múltiples enlaces* | Se procesaron {len(actions)} cambios en este archivo. |\n"
+                report += summary; current_len += len(summary); continue
             for log in actions:
                 emoji = {"removed": "❌", "modified": "🔄", "created": "✨"}.get(log["action"], "❓")
-                
-                # Inteligencia: Mantener URL completa mientras haya espacio
-                if current_len > 45000:
-                    short_url = (log["url"][:50] + "...") if len(log["url"]) > 53 else log["url"]
-                else:
-                    short_url = log["url"]
-                
+                short_url = (log["url"][:50] + "...") if current_len > 45000 and len(log["url"]) > 53 else log["url"]
                 entry = f"| `{log['file']}` | {emoji} | {short_url} | {log['reason']} |\n"
-                
-                if current_len + len(entry) > 62000:
-                    is_compressed = True
-                    break
-                
-                report += entry
-                current_len += len(entry)
-                processed_logs += 1
-
-        if is_compressed:
-            report += f"\n> 💡 **Nota**: El log ha sido comprimido o truncado para cumplir con los límites de GitHub ({processed_logs}/{len(self.action_log)} acciones detalladas).\n"
-        
-        report += "</details>\n\n"
-report += f"\n---\n*📈 Inteligencia de dominios acumulada: `{len(self.learning_data['domains'])}`*"
-
-# Validación final de longitud
-safe_report = report[:65000]
-self.git_controller.repository.create_pull(title=f"🧹 Autonomous Engine Health Report: {datetime.now().strftime('%d %b %Y')}", body=safe_report, head=branch_name, base="master")
-
+                if current_len + len(entry) > 62000: is_compressed = True; break
+                report += entry; current_len += len(entry); processed_logs += 1
+        if is_compressed: report += f"\n> 💡 **Nota**: Log comprimido para límites de GitHub ({processed_logs}/{len(self.action_log)} detallados).\n"
+        report += f"</details>\n\n---\n*📈 Inteligencia acumulada: `{len(self.learning_data['domains'])}`*"
+        return report
 
 async def main():
     try:
@@ -308,10 +263,6 @@ async def main():
         await cleaner.curator.suggest_reorganization()
         await cleaner.apply_changes()
     except Exception as e:
-        import traceback
-        print(f"[CRITICAL ERROR]: {e}")
-        traceback.print_exc()
-        exit(1)
+        import traceback; print(f"[CRITICAL ERROR]: {e}"); traceback.print_exc(); raise SystemExit(1)
 
-if __name__ == "__main__":
-    asyncio.run(main())
+if __name__ == "__main__": asyncio.run(main())