feat(health): robust report generation and incremental processing stabilization

2026-07-28 01:21:41 +00:00 · 2026-05-10 10:32:32 +02:00
parent 795904638c
commit 660c0b8078
1 changed files with 35 additions and 84 deletions
--- a/src/intelligent_health_checker.py
+++ b/src/intelligent_health_checker.py
@@ -5,7 +5,7 @@ import re
 import httpx
 import random
 from datetime import datetime
-from typing import Dict, List, Set, Tuple, Optional
+from typing import Dict, List, Set, Tuple, Optional, Any
 from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ
 from src.gitops_manager import RepositoryController
 from src.markdown_ast import MarkdownSanitizer
@@ -56,17 +56,15 @@ class IntelligentLinkCleaner:
        return None

    async def _check_url_with_retries(self, url: str, max_retries=5) -> Tuple[str, bool, Optional[str], str]:
-        # 1. Check Cache (Incremental Processing para evitar Timeouts)
        now = datetime.now().timestamp()
        cache_entry = self.learning_data.get("link_cache", {}).get(url)
        if cache_entry and cache_entry.get("status") == "ALIVE":
-            if now - cache_entry.get("last_checked", 0) < (21 * 24 * 3600): # 21 días
+            if now - cache_entry.get("last_checked", 0) < (21 * 24 * 3600):
                self.detailed_stats["skipped_recent"] += 1
                return url, True, None, "Cached (Recent)"

        domain = url.split("//")[-1].split("/")[0]
        domain_info = self.learning_data["domains"].get(domain, {})
-        
        strategies = [
            {"type": "http", "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", "ref": "https://www.google.com/", "desc": "Desktop/Google"},
            {"type": "http", "ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1", "ref": "https://t.co/", "desc": "Mobile/Twitter"},
@@ -74,45 +72,37 @@ class IntelligentLinkCleaner:
            {"type": "http", "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0", "ref": "https://news.ycombinator.com/", "desc": "Firefox/Reddit"},
            {"type": "playwright", "ua": "Mozilla/5.0 (Linux; Android 13; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36", "ref": "https://www.google.com/", "desc": "PW Mobile/Google"}
        ]
-
        best_strat_idx = domain_info.get("best_strategy_idx")
        if best_strat_idx is not None and best_strat_idx < len(strategies):
-            best_strat = strategies.pop(best_strat_idx)
-            strategies.insert(0, best_strat)
+            best_strat = strategies.pop(best_strat_idx); strategies.insert(0, best_strat)

        for attempt in range(min(max_retries, len(strategies))):
            strategy = strategies[attempt]
            try:
                if attempt > 0: await asyncio.sleep((2 ** attempt) + random.random())
                is_alive, reason = await self._check_url_logic(url, strategy)
-                
                if is_alive:
                    if domain not in self.learning_data["domains"]: self.learning_data["domains"][domain] = {}
                    original_idx = attempt if best_strat_idx is None else (best_strat_idx if attempt == 0 else (attempt if attempt < best_strat_idx else attempt))
                    self.learning_data["domains"][domain]["best_strategy_idx"] = original_idx
                    self.learning_data["link_cache"][url] = {"status": "ALIVE", "last_checked": now}
                    return url, True, None, f"Alive ({strategy['desc']}) - {reason}"
-                
                if reason in ["404", "soft_404", "redirect_to_home"]:
                    if any(git_host in url for git_host in ["github.com", "gitlab.com", "bitbucket.org"]):
-                        parts = url.split("/")
-                        if len(parts) > 4:
-                            repo_root = "/".join(parts[:5])
+                        parts = url.split("/"); repo_root = "/".join(parts[:5]) if len(parts) > 4 else None
+                        if repo_root:
                            root_alive, _ = await self._check_url_logic(repo_root, strategies[0])
                            if root_alive: return url, False, f"REPO_ROOT:{repo_root}", f"Consolidated (Original: {reason})"
-                    
                    if attempt == max_retries - 1:
                        archived = await self._check_wayback(url)
                        if archived: return url, False, f"ARCHIVE:{archived}", f"Archived (Original: {reason})"
                        return url, False, None, reason
            except: pass
-            
        return url, True, None, "Conservative Keep"

    async def _check_url_logic(self, url: str, strategy: Dict) -> Tuple[bool, str]:
        headers = {"User-Agent": strategy["ua"], "Referer": strategy["ref"], "Accept-Language": "en-US,en;q=0.9"}
        paywall_indicators = ["sign in", "create free account", "member-only story", "página de suscripción", "inicia sesión"]
-
        if strategy["type"] == "http":
            try:
                async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=12) as client:
@@ -160,20 +150,18 @@ class IntelligentLinkCleaner:
            except: pass

    async def validate_links_tiered(self):
-        print(f"[*] Validando {len(self.link_registry)} URLs con procesamiento incremental...")
+        print(f"[*] Validando {len(self.link_registry)} URLs...")
        unique_urls = list(self.link_registry.keys()); random.shuffle(unique_urls)
-        batch_size = 40
-        for i in range(0, len(unique_urls), batch_size):
-            batch = unique_urls[i:i+batch_size]
+        for i in range(0, len(unique_urls), 40):
+            batch = unique_urls[i:i+40]
            tasks = [self._check_url_with_retries(url) for url in batch]
            results = await asyncio.gather(*tasks)
            for url, is_alive, fallback, reason in results:
                if not is_alive: self.dead_links[url] = (fallback if fallback else "DEAD", reason)
            self._save_memory()
-            if i % 100 == 0: print(f"    - Progreso: {i}/{len(unique_urls)}")

    async def apply_changes(self):
-        print("[*] Aplicando cambios y métricas visuales...")
+        print("[*] Aplicando cambios y generando métricas visuales...")
        file_updates = {}
        def track(file, op, url, reason, cat=None):
            if file not in self.detailed_stats["by_file"]: self.detailed_stats["by_file"][file] = {"removed": 0, "modified": 0, "created": 0}
@@ -191,13 +179,13 @@ class IntelligentLinkCleaner:
                    with open(file_path, 'r') as f: file_updates[file_path] = f.readlines()
                line_idx = occ["line_index"]
                if fallback and fallback.startswith("ARCHIVE:"):
-                    real_fallback = fallback.replace("ARCHIVE:", "")
-                    file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_fallback)
+                    real_f = fallback.replace("ARCHIVE:", "")
+                    file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_f)
                    if "[ARCHIVED]" not in file_updates[file_path][line_idx]: file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace("](", " [ARCHIVED]]( ")
                    track(file_path, "modified", url, reason); self.detailed_stats["operation_types"]["archived"] += 1
                elif fallback and fallback.startswith("REPO_ROOT:"):
-                    real_fallback = fallback.replace("REPO_ROOT:", "")
-                    file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_fallback)
+                    real_f = fallback.replace("REPO_ROOT:", "")
+                    file_updates[file_path][line_idx] = file_updates[file_path][line_idx].replace(url, real_f)
                    track(file_path, "modified", url, reason); self.detailed_stats["operation_types"]["consolidated"] += 1
                else:
                    if file_path not in CORE_FILES:
@@ -214,90 +202,57 @@ class IntelligentLinkCleaner:
            with open(self.curator.mkdocs_path, 'r') as f: final_payload[self.curator.mkdocs_path] = f.read()
        if final_payload: self._create_pr(final_payload)

-    def _create_pr(self, updates: Dict[str, str]):
-        timestamp = datetime.now().strftime("%Y%m%d-%H%M")
-        branch_name = f"bot/autonomous-health-{timestamp}"
+    def _create_pr(self, updates: Dict[str, str], report_content: str = None):
+        timestamp = datetime.now().strftime("%Y%m%d-%H%M"); branch_name = f"bot/autonomous-health-{timestamp}"
+        if not report_content: report_content = self._build_report_body()
        self.git_controller._create_feature_branch(branch_name)
        for path, content in updates.items():
            try:
                file_meta = self.git_controller.repository.get_contents(path)
                self.git_controller.repository.update_file(path=path, message=f"fix(autonomous): engine update in {path}", content=content, sha=file_meta.sha, branch=branch_name)
            except: pass
+        safe_report = report_content[:65000]
+        self.git_controller.repository.create_pull(title=f"🧹 Autonomous Engine Health Report: {datetime.now().strftime('%d %b %Y')}", body=safe_report, head=branch_name, base="master")

+    def _build_report_body(self) -> str:
        report = "## 🧠 Nubenetes Autonomous Health & Curation Engine\n\n"
-        # Mermaid Pie Chart
        report += "### 📊 Distribución de Operaciones\n"
        report += "```mermaid\npie title Operaciones de Mantenimiento\n"
        report += f"    \"Eliminados\" : {self.detailed_stats['operation_types']['removals']}\n"
        report += f"    \"Archivados\" : {self.detailed_stats['operation_types']['archived']}\n"
        report += f"    \"Consolidados\" : {self.detailed_stats['operation_types']['consolidated']}\n"
        report += f"    \"Nuevos\" : {self.detailed_stats['operation_types']['orphans']}\n```\n\n"
-
        report += "### 📈 Resumen de Eficiencia\n"
-        report += f"| Métrica | Cantidad | Detalle |\n| :--- | :---: | :--- |\n"
+        report += "| Métrica | Cantidad | Detalle |\n| :--- | :---: | :--- |\n"
        report += f"| ⏩ Omitidos (Cache) | **{self.detailed_stats['skipped_recent']}** | Verificados hace menos de 21 días |\n"
        report += f"| 💀 Eliminados | **{self.detailed_stats['operation_types']['removals']}** | 404 definitivos |\n"
        report += f"| 🏛️ Archivados | **{self.detailed_stats['operation_types']['archived']}** | Vía Wayback Machine |\n"
        report += f"| 🎯 Consolidados | **{self.detailed_stats['operation_types']['consolidated']}** | Raíz de Repositorio Git |\n"
        report += f"| 🖇️ Nuevos | **{self.detailed_stats['operation_types']['orphans']}** | Páginas vinculadas |\n\n"
-
-        report += "### 🧮 Matriz de Mantenimiento por Documento\n"
+        report += "### 🧮 Matriz de Mantenimiento\n"
        report += "| Documento | 🔴 Elim | 🟡 Mod | 🟢 Crea | Estado |\n| :--- | :---: | :---: | :---: | :---: |\n"
        for file, s in sorted(self.detailed_stats["by_file"].items()):
            status = "🧹 Limpio" if s['removed'] + s['modified'] < 3 else "🛠️ Refactor"
            if s['removed'] > 5: status = "⚠️ Crítico"
            report += f"| `{file}` | {s['removed']} | {s['modified']} | {s['created']} | {status} |\n"
-        # Action Log con Compresión Adaptativa
        report += "\n### 📝 Registro de Acciones\n<details><summary>Ver detalle de cambios</summary>\n\n"
        report += "| Archivo | Acción | Recurso (Acortado) | Motivo |\n| :--- | :---: | :--- | :--- |\n"
-        
-        is_compressed = False
-        current_len = len(report)
-        processed_logs = 0
-        
-        # Agrupar logs por archivo para poder comprimir si es necesario
-        from collections import defaultdict
-        logs_by_file = defaultdict(list)
-        for log in self.action_log:
-            logs_by_file[log["file"]].append(log)
-
+        is_compressed = False; current_len = len(report); processed_logs = 0
+        from collections import defaultdict; logs_by_file = defaultdict(list)
+        for log in self.action_log: logs_by_file[log["file"]].append(log)
        for file_path, actions in sorted(logs_by_file.items()):
-            if current_len > 55000: # Umbral de compresión agresiva
-                is_compressed = True
-                summary = f"| `{file_path}` | 🛠️ | *Múltiples enlaces* | Se procesaron {len(actions)} cambios en este archivo. |\n"
-                report += summary
-                current_len += len(summary)
-                continue
-
+            if current_len > 55000:
+                is_compressed = True; summary = f"| `{file_path}` | 🛠️ | *Múltiples enlaces* | Se procesaron {len(actions)} cambios en este archivo. |\n"
+                report += summary; current_len += len(summary); continue
            for log in actions:
                emoji = {"removed": "❌", "modified": "🔄", "created": "✨"}.get(log["action"], "❓")
-                
-                # Inteligencia: Mantener URL completa mientras haya espacio
-                if current_len > 45000:
-                    short_url = (log["url"][:50] + "...") if len(log["url"]) > 53 else log["url"]
-                else:
-                    short_url = log["url"]
-                
+                short_url = (log["url"][:50] + "...") if current_len > 45000 and len(log["url"]) > 53 else log["url"]
                entry = f"| `{log['file']}` | {emoji} | {short_url} | {log['reason']} |\n"
-                
-                if current_len + len(entry) > 62000:
-                    is_compressed = True
-                    break
-                
-                report += entry
-                current_len += len(entry)
-                processed_logs += 1
-
-        if is_compressed:
-            report += f"\n> 💡 **Nota**: El log ha sido comprimido o truncado para cumplir con los límites de GitHub ({processed_logs}/{len(self.action_log)} acciones detalladas).\n"
-        
-        report += "</details>\n\n"
-report += f"\n---\n*📈 Inteligencia de dominios acumulada: `{len(self.learning_data['domains'])}`*"
-
-# Validación final de longitud
-safe_report = report[:65000]
-self.git_controller.repository.create_pull(title=f"🧹 Autonomous Engine Health Report: {datetime.now().strftime('%d %b %Y')}", body=safe_report, head=branch_name, base="master")
-
+                if current_len + len(entry) > 62000: is_compressed = True; break
+                report += entry; current_len += len(entry); processed_logs += 1
+        if is_compressed: report += f"\n> 💡 **Nota**: Log comprimido para límites de GitHub ({processed_logs}/{len(self.action_log)} detallados).\n"
+        report += f"</details>\n\n---\n*📈 Inteligencia acumulada: `{len(self.learning_data['domains'])}`*"  # f-string: without the prefix the literal "{len(...)}" text is emitted instead of the domain count
+        return report

 async def main():
    try:
@@ -308,10 +263,6 @@ async def main():
        await cleaner.curator.suggest_reorganization()
        await cleaner.apply_changes()
    except Exception as e:
-        import traceback
-        print(f"[CRITICAL ERROR]: {e}")
-        traceback.print_exc()
-        exit(1)
+        import traceback; print(f"[CRITICAL ERROR]: {e}"); traceback.print_exc(); exit(1)

-if __name__ == "__main__":
-    asyncio.run(main())
+if __name__ == "__main__": asyncio.run(main())