diff --git a/.github/workflows/intelligent_link_cleaner.yml b/.github/workflows/intelligent_link_cleaner.yml
new file mode 100644
index 00000000..d6496206
--- /dev/null
+++ b/.github/workflows/intelligent_link_cleaner.yml
@@ -0,0 +1,36 @@
+name: Nubenetes Intelligent Link Cleaner & Dedup
+
+on:
+  schedule:
+    - cron: '0 0 1 * *' # Monthly (on the 1st of each month)
+  workflow_dispatch:
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  intelligent-clean-process:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Sincronización del repositorio
+        uses: actions/checkout@v4
+
+      - name: Provisión del Entorno Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Instalación de dependencias y Playwright
+        run: |
+          python -m pip install --upgrade pip
+          pip install --no-cache-dir pydantic PyGithub aiohttp beautifulsoup4 httpx fake-useragent pytz python-dotenv playwright
+          playwright install chromium --with-deps
+
+      - name: Ejecución de la Limpieza Inteligente Global
+        env:
+          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PYTHONPATH: .
+        run: |
+          python src/intelligent_health_checker.py
diff --git a/src/intelligent_health_checker.py b/src/intelligent_health_checker.py
new file mode 100644
index 00000000..4c949ea8
--- /dev/null
+++ b/src/intelligent_health_checker.py
@@ -0,0 +1,323 @@
+"""Monthly link hygiene job for the Nubenetes docs.
+
+Collects the Markdown links of the core and category pages, validates the
+unique URLs (plain HTTP first, headless Chromium second), asks Gemini which
+copies of a cross-file duplicate should survive, and opens a pull request
+with the resulting line removals.
+"""
+import asyncio
+import json
+import os
+import re
+import httpx
+from datetime import datetime
+from typing import Dict, List, Set, Tuple
+from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ
+from src.gitops_manager import RepositoryController
+from src.markdown_ast import MarkdownSanitizer
+
+# Files that are never pruned as duplicates; they only lose dead links.
+CORE_FILES = ["docs/index.md", "README.md"]
+
+# Browser-like User-Agent sent by both validation tiers.
+BROWSER_USER_AGENT = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+)
+
+
+class IntelligentLinkCleaner:
+    """Find dead and redundant links across the docs and propose their removal.
+
+    The steps are meant to run in the order used by ``main()``: build the
+    registry, validate, resolve duplicates, apply.
+    """
+
+    def __init__(self):
+        self.git_controller = RepositoryController(GH_TOKEN, TARGET_REPO)
+        self.sanitizer = MarkdownSanitizer()
+        # URL -> occurrences: {file, line_index, content, score, title}
+        self.link_registry: Dict[str, List[Dict]] = {}
+        self.dead_links: Set[str] = set()
+        self.stats = {
+            "total_links": 0,
+            "dead_links_removed": 0,
+            "duplicates_pruned": 0,
+            "ai_decisions": 0
+        }
+
+    def _read_file(self, file_path: str) -> str:
+        """Read ``file_path`` from the checkout, falling back to the repo API."""
+        if os.path.exists(file_path):
+            with open(file_path, 'r', encoding="utf-8") as f:
+                return f.read()
+        repo_file = self.git_controller.repository.get_contents(file_path)
+        return repo_file.decoded_content.decode("utf-8")
+
+    async def build_global_registry(self):
+        """Scan the core and category pages and index their links by URL.
+
+        Only the first link matched on a line is recorded. The registry key is
+        the URL without its ``#fragment`` and trailing slashes. An error while
+        processing a file is reported and the scan moves on to the next file.
+        """
+        print("[*] Construyendo registro global de enlaces...")
+        all_files = CORE_FILES + [f"docs/{cat}.md" for cat in NUBENETES_CATEGORIES]
+
+        for file_path in all_files:
+            try:
+                content = self._read_file(file_path)
+                for i, line in enumerate(content.splitlines()):
+                    match = self.sanitizer.link_pattern.search(line)
+                    if not match:
+                        continue
+                    title, url = match.groups()
+                    clean_url = url.split('#')[0].rstrip('/')
+                    if not clean_url:
+                        # In-page "#anchor" links would all share the empty
+                        # key and be pruned as duplicates of each other.
+                        continue
+                    if "github.com" in clean_url and "/blob/" in clean_url:
+                        continue  # Deep git blob links are skipped for now
+
+                    score = self.sanitizer._calculate_link_score(line)
+                    self.link_registry.setdefault(clean_url, []).append({
+                        "file": file_path,
+                        "line_index": i,
+                        "content": line,
+                        "score": score,
+                        "title": title
+                    })
+                    self.stats["total_links"] += 1
+            except Exception as e:
+                print(f"[!] Error procesando {file_path}: {e}")
+
+    async def validate_links_tiered(self):
+        """Check every registered URL in batches and record the dead ones."""
+        print(f"[*] Validando {len(self.link_registry)} URLs únicas...")
+
+        unique_urls = list(self.link_registry.keys())
+        total = len(unique_urls)
+        # Batches bound how many checks are in flight at once.
+        batch_size = 50
+        for start in range(0, total, batch_size):
+            batch = unique_urls[start:start + batch_size]
+            results = await asyncio.gather(*(self._check_url_sophisticated(url) for url in batch))
+            self.dead_links.update(url for url, is_alive in results if not is_alive)
+            print(f" - Progreso: {min(start + batch_size, total)}/{total}")
+
+    async def _check_url_sophisticated(self, url: str) -> Tuple[str, bool]:
+        """Return ``(url, is_alive)`` using HTTP first and Chromium as fallback.
+
+        A URL is reported dead only on a definite HTTP error status. When
+        neither tier gets an answer, the link is kept.
+        """
+        if not url.lower().startswith(("http://", "https://")):
+            # Relative paths, mailto:, etc. cannot be probed over HTTP, so
+            # keep them without spending a request and a browser launch.
+            return url, True
+
+        # TIER 1: fast HTTP request
+        headers = {"User-Agent": BROWSER_USER_AGENT}
+        try:
+            async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=10) as client:
+                resp = await client.get(url)
+            if resp.status_code < 400:
+                return url, True
+            if resp.status_code not in (401, 403, 429):
+                return url, False  # 404, 500, etc. count as dead
+        except Exception:
+            pass  # Connection-level failures fall through to tier 2
+
+        # TIER 2: Playwright, for suspected bot blocking or transport errors
+        try:
+            from playwright.async_api import async_playwright
+            async with async_playwright() as p:
+                browser = await p.chromium.launch(headless=True)
+                try:
+                    page = await browser.new_page()
+                    await page.set_extra_http_headers({"User-Agent": BROWSER_USER_AGENT})
+                    response = await page.goto(url, wait_until="networkidle", timeout=20000)
+                    is_alive = response.status < 400 if response else False
+                finally:
+                    # Close the browser even when navigation fails or times out.
+                    await browser.close()
+                return url, is_alive
+        except Exception:
+            # Conservative: if everything fails, do not delete the link yet.
+            return url, True
+
+    async def resolve_duplicates_with_ai(self):
+        """Mark redundant copies of live links that appear in several places.
+
+        Occurrences in ``CORE_FILES`` are never marked. Gemini is consulted
+        only when at least two non-core occurrences exist; non-core copies in
+        files outside the kept set get ``should_prune = True``.
+        """
+        print("[*] Resolviendo duplicados globales con Gemini...")
+        for url, occurrences in self.link_registry.items():
+            if url in self.dead_links:
+                continue
+
+            # Only occurrences outside the core files may be pruned.
+            prunable = [occ for occ in occurrences if occ["file"] not in CORE_FILES]
+            if len(prunable) <= 1:
+                continue
+
+            decision = await self._ask_gemini_dedup(url, occurrences)
+            self.stats["ai_decisions"] += 1
+
+            files_to_keep = self._files_to_keep(decision, occurrences)
+            for occ in prunable:
+                if occ["file"] not in files_to_keep:
+                    occ["should_prune"] = True
+                    self.stats["duplicates_pruned"] += 1
+
+    @staticmethod
+    def _files_to_keep(decision: Dict, occurrences: List[Dict]) -> Set[str]:
+        """Turn a dedup decision into a non-empty set of files to keep.
+
+        Entries that do not name a file of ``occurrences`` are ignored. If
+        nothing usable remains, the highest-scored occurrence is kept so a
+        live link is never removed from every file.
+        """
+        raw = decision.get("keep_in_files") if isinstance(decision, dict) else None
+        if isinstance(raw, str):
+            raw = [raw]
+        candidates = raw if isinstance(raw, list) else []
+        known = {occ["file"] for occ in occurrences}
+        keep = {f for f in candidates if isinstance(f, str) and f in known}
+        if not keep:
+            keep = {max(occurrences, key=lambda occ: occ["score"])["file"]}
+        return keep
+
+    async def _ask_gemini_dedup(self, url: str, occurrences: List[Dict]) -> Dict:
+        """Ask Gemini in which files ``url`` should stay.
+
+        Returns the JSON object found in the answer (expected key:
+        ``keep_in_files``). On any failure the fallback keeps only the
+        highest-scored occurrence.
+        """
+        api_url = f"https://generativelanguage.googleapis.com/v1/models/gemini-1.5-flash:generateContent?key={GEMINI_API_KEY}"
+        contexts = "\n".join([f"- Archivo: {occ['file']}, Contexto: {occ['content']}" for occ in occurrences])
+        prompt = (
+            f"El enlace {url} aparece en múltiples archivos de mi repositorio de Kubernetes.\n"
+            f"Ocurrencias:\n{contexts}\n\n"
+            "Analiza si el enlace es fundamental en todos esos contextos o si es redundante y debería estar solo en el más relevante.\n"
+            "Responde en JSON: {\"keep_in_files\": [\"lista de archivos donde mantenerlo\"], \"reason\": \"...\"}"
+        )
+
+        try:
+            async with httpx.AsyncClient() as client:
+                resp = await client.post(api_url, json={"contents": [{"parts": [{"text": prompt}]}]}, timeout=20)
+            if resp.status_code == 200:
+                text = resp.json()['candidates'][0]['content']['parts'][0]['text']
+                match = re.search(r'\{.*\}', text, re.DOTALL)
+                if match:
+                    return json.loads(match.group(0))
+            else:
+                print(f"[!] Gemini respondió HTTP {resp.status_code} para {url}")
+        except Exception as e:
+            # Print only the exception type: its message could include the
+            # request URL, whose query string carries the API key.
+            print(f"[!] Error consultando a Gemini para {url}: {type(e).__name__}")
+        # Fallback: keep only the occurrence with the highest score
+        best_file = max(occurrences, key=lambda x: x["score"])["file"]
+        return {"keep_in_files": [best_file]}
+
+    async def apply_changes(self):
+        """Remove the lines of dead and pruned links and open a PR if any changed.
+
+        Core files only lose dead links. ``dead_links_removed`` counts the
+        removed lines whose own URL is dead.
+        """
+        print("[*] Aplicando limpieza a los archivos...")
+
+        # file -> {line_index: True when that line's URL is dead}
+        prunes_by_file: Dict[str, Dict[int, bool]] = {}
+        for url, occurrences in self.link_registry.items():
+            is_dead = url in self.dead_links
+            for occ in occurrences:
+                if not (is_dead or occ.get("should_prune")):
+                    continue
+                if occ["file"] in CORE_FILES and not is_dead:
+                    continue  # Core files only lose dead links
+                prunes_by_file.setdefault(occ["file"], {})[occ["line_index"]] = is_dead
+
+        file_updates = {}
+        for file_path, removals in prunes_by_file.items():
+            # Split the same way the registry scan did so indices line up.
+            lines = self._read_file(file_path).splitlines(keepends=True)
+
+            # Delete from the bottom up so earlier indices stay valid.
+            for idx in sorted(removals, reverse=True):
+                lines.pop(idx)
+                if removals[idx]:
+                    self.stats["dead_links_removed"] += 1
+
+            file_updates[file_path] = "".join(lines)
+
+        if not file_updates:
+            print("[~] No se encontraron mejoras necesarias.")
+            return
+
+        print(f"[+] Generando PR con {len(file_updates)} archivos modificados.")
+        metrics = {
+            "total_cleaned": self.stats["dead_links_removed"] + self.stats["duplicates_pruned"],
+            "dead_removed": self.stats["dead_links_removed"],
+            "duplicates_pruned": self.stats["duplicates_pruned"],
+            "ai_decisions": self.stats["ai_decisions"],
+            "files_impacted": list(file_updates.keys())
+        }
+        self._create_pr(file_updates, metrics)
+
+    def _create_pr(self, updates: Dict[str, str], metrics: Dict):
+        """Commit ``updates`` (path -> new content) to a new branch and open a PR.
+
+        ``metrics`` supplies the counters and file list shown in the PR body.
+        """
+        now = datetime.now()
+        branch_name = f"bot/intelligent-clean-{now.strftime('%Y%m%d-%H%M')}"
+        self.git_controller._create_feature_branch(branch_name)
+
+        for path, content in updates.items():
+            file_meta = self.git_controller.repository.get_contents(path)
+            self.git_controller.repository.update_file(
+                path=path,
+                message=f"fix(clean): limpieza inteligente de enlaces en {path}",
+                content=content,
+                sha=file_meta.sha,
+                branch=branch_name
+            )
+
+        body = (
+            # The heading shows the run's month instead of a fixed date.
+            f"## 🤖 Limpieza Inteligente de Enlaces ({now.strftime('%B %Y')})\n\n"
+            f"He completado un ciclo de revisión global utilizando **Playwright** para evasión de bloqueos y **Gemini** para deduplicación inteligente.\n\n"
+            f"### 📊 Resumen de Ejecución:\n"
+            f"- 💀 Enlaces muertos eliminados: `{metrics['dead_removed']}`\n"
+            f"- ✂️ Duplicados globales podados: `{metrics['duplicates_pruned']}`\n"
+            f"- 🧠 Decisiones asistidas por IA: `{metrics['ai_decisions']}`\n\n"
+            f"### 📂 Archivos Optimizados:\n" +
+            "\n".join([f"- `{f}`" for f in metrics['files_impacted']])
+        )
+
+        self.git_controller.repository.create_pull(
+            title=f"🧹 Intelligent Link Clean & Dedup: {now.strftime('%d %b %Y')}",
+            body=body,
+            head=branch_name,
+            base="master"
+        )
+
+
+async def main():
+    """Run the full cleaning pipeline once."""
+    cleaner = IntelligentLinkCleaner()
+    await cleaner.build_global_registry()
+    await cleaner.validate_links_tiered()
+    await cleaner.resolve_duplicates_with_ai()
+    await cleaner.apply_changes()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())