From fdfee39bbaf02ee9cfc6629ec597c10414a80ee7 Mon Sep 17 00:00:00 2001
From: Nubenetes Bot <bot@nubenetes.com>
Date: Thu, 14 May 2026 20:51:01 +0200
Subject: [PATCH] feat: implement MVQ, descriptive style guide, and optimized
 API rotation

---
 GEMINI.md                         |   6 +-
 src/agentic_curator.py            |  66 ++++++++++++-------
 src/gemini_utils.py               | 104 ++++++++++++++++++------------
 src/intelligent_health_checker.py |  14 +++-
 src/main.py                       |  24 +++++--
 5 files changed, 138 insertions(+), 76 deletions(-)

diff --git a/GEMINI.md b/GEMINI.md
index 5778e026..c707dd83 100644
--- a/GEMINI.md
+++ b/GEMINI.md
@@ -5,8 +5,10 @@ Este archivo contiene las instrucciones acumuladas y la visión de largo plazo p
 ## 🧠 Core Mandates (Mandatos Principales)
 
 1.  **Preservación de la Información**: NUNCA elimines resúmenes, comentarios o estrellas (🌟) que acompañan a los enlaces. El bot solo debe actualizar la URL o reorganizar la posición del ítem, nunca borrar el contexto descriptivo.
-2.  **Aprendizaje Persistente**: Utiliza `src/memory/health_learning.json` para almacenar el conocimiento sobre dominios (bloqueos anti-bot, estrategias exitosas) y patrones de navegación.
-3.  **Resiliencia Total**: El workflow debe ser capaz de continuar incluso si hay errores individuales en validaciones de links o archivos. Prioriza generar un resultado (PR) aunque sea parcial.
+2. **Aprendizaje Persistente**: Utiliza `src/memory/health_learning.json` para almacenar el conocimiento sobre dominios (bloqueos anti-bot, estrategias exitosas) y patrones de navegación.
+3. **Minimum Viable Quality (MVQ)**: For GitHub/GitLab repositories, the bot MUST check the last commit date. If the repository has had NO activity (commits) in more than **4 years**, it must receive a significantly lower `impact_score` and be deprioritized, even if the content remains technically relevant. This ensures Nubenetes stays fresh and focuses on maintained projects.
+4. **Style Guide (Descriptive Summaries)**: All injected summaries MUST follow a **Descriptive** style. Avoid generic "clickbait" or action-oriented phrases (e.g., "Check this out"). Instead, provide a clear, neutral description of what the resource contains, its scope, and why it is technically significant for the Kubernetes ecosystem.
+5. **Resiliencia Total**: El workflow debe ser capaz de continuar incluso si hay errores individuales en validaciones de links o archivos. Prioriza generar un resultado (PR) aunque sea parcial.
 4.  **Consolidación de Repositorios**: Ante un fallo en un enlace profundo de GitHub/GitLab, intenta siempre validar la raíz del repositorio antes de darlo por muerto. Preferimos enlaces estables a raíces de repositorios que deep-links volátiles.
 5.  **Expansión de URLs**: Todos los enlaces acortados (t.co, bit.ly, buff.ly, etc.) DEBEN ser expandidos a su versión larga original antes de ser evaluados o inyectados. Esto garantiza la homogeneidad del inventario y mejora la precisión de la deduplicación global.
 6.  **Idioma Oficial (English Only)**: Todo el contenido inyectado (títulos, descripciones, encabezados), los logs de ejecución y las comunicaciones automatizadas (PRs) DEBEN ser exclusivamente en INGLÉS. Nubenetes es un recurso global y la consistencia lingüística es crítica.
diff --git a/src/agentic_curator.py b/src/agentic_curator.py
index 9820561d..0db123c9 100644
--- a/src/agentic_curator.py
+++ b/src/agentic_curator.py
@@ -31,6 +31,26 @@ async def _deep_fetch_content(url: str) -> str:
 
 from src.logger import log_event
 
+async def _get_github_activity(url: str) -> Optional[datetime]:
+    """Obtiene la fecha del último commit de un repo de GitHub usando la API (si hay token)."""
+    if "github.com" not in url or not GH_TOKEN: return None
+    try:
+        # Extraer user/repo
+        match = re.search(r'github\.com/([^/]+)/([^/]+)', url)
+        if match:
+            owner, repo = match.groups()
+            repo = repo.split('#')[0].split('?')[0].rstrip('.git')
+            api_url = f"https://api.github.com/repos/{owner}/{repo}"
+            headers = {"Authorization": f"token {GH_TOKEN}"}
+            async with httpx.AsyncClient() as client:
+                resp = await client.get(api_url, headers=headers, timeout=5)
+                if resp.status_code == 200:
+                    pushed_at = resp.json().get("pushed_at")
+                    if pushed_at:
+                        return datetime.fromisoformat(pushed_at.replace('Z', '+00:00'))
+    except: pass
+    return None
+
 async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
     evaluations = {}
     if not GEMINI_API_KEYS:
@@ -52,8 +72,6 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
         
         log_event(f"--- EVALUATING {i+1}/{len(raw_assets)} ---", section_break=False)
         log_event(f"  - URL: {asset['url']}")
-        log_event(f"  - Post Date: {post_date}")
-        log_event(f"  - Post Context: \"{context[:300]}...\"")
 
         domain = asset['url'].split("//")[-1].split("/")[0]
         if domain in domain_blacklist:
@@ -61,24 +79,34 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
             evaluations[asset["url"]] = {"status": "FILTERED", "reason": "Blacklisted domain"}
             continue
 
+        # MVQ: Check GitHub activity
+        mvq_penalty = False
+        last_activity = await _get_github_activity(asset['url'])
+        if last_activity:
+            years_inactive = (datetime.now(last_activity.tzinfo) - last_activity).days / 365
+            if years_inactive > 4:
+                log_event(f"  [⚠️] MVQ Warning: Inactive for {years_inactive:.1f} years.")
+                mvq_penalty = True
+
         web_content = await _deep_fetch_content(asset['url'])
         
         prompt = (
             "You act as a Senior Curation Engineer for 'nubenetes/awesome-kubernetes'.\n"
             "Your mission is to catalog TECHNICAL content about Kubernetes and Cloud Native shared by the user.\n"
-            "GOLDEN RULE: If the link is in the feed, it's because the user considers it useful. DO NOT discard unless it is total noise (aggressive ads, 404, or non-technical content).\n\n"
+            "GOLDEN RULE: If the link is in the feed, it's because the user considers it useful. DO NOT discard unless it is total noise.\n\n"
             f"Valid categories: {', '.join(NUBENETES_CATEGORIES)}.\n\n"
             "INSTRUCTIONS:\n"
             "1. LANGUAGE: ALL outputs (title, desc, reasoning) MUST BE IN ENGLISH.\n"
-            "2. YOUTUBE: Accept technical videos or tutorials. Categorize them by topic.\n"
-            "3. SUMMARY: Create a concise summary (1 sentence). Use the 'Context' (the X post) as a priority as it explains why it was shared.\n"
-            "4. ASSIGNMENT: If it's about Model Context Protocol (MCP), assign it to 'ai-agents-mcp'.\n\n"
+            "2. STYLE: Summaries MUST BE DESCRIPTIVE (neutral, objective, explaining what/why).\n"
+            "3. MVQ: If it's a GitHub repo inactive for >4 years, penalize the impact score.\n"
+            "4. SUMMARY: Create a concise summary (1 sentence).\n"
+            f"{'IMPORTANT: This repo is old (>4 years inactive). Apply penalty.' if mvq_penalty else ''}\n\n"
             f"URL: {asset['url']}\nX Context: {context}\nExtracted Web Content: {web_content[:2000]}\n\n"
             "Evaluate TECHNICAL IMPACT (1-100):\n"
             "- >80: Exceptional resource (🌟).\n"
             "- >5: Accept (if it fits a category).\n"
             "- <5: Discard (Absolute noise).\n\n"
-            "Respond ONLY with a JSON: {\"impact_score\": int, \"categories\": [\"cat1\"], \"title\": \"...\", \"desc\": \"...\", \"reasoning\": \"Brief explanation (English)\", \"rejection_reason\": \"... (if applicable, English)\"}"
+            "Respond ONLY with a JSON: {\"impact_score\": int, \"categories\": [\"cat1\"], \"title\": \"...\", \"desc\": \"...\", \"reasoning\": \"Brief explanation (English)\"}"
         )
 
         try:
@@ -88,18 +116,11 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
             reasoning = data.get("reasoning", "No reason specified")
             
             if score < 5:
-                reason = data.get("rejection_reason", "Low technical impact")
-                evaluations[asset["url"]] = {"status": "FILTERED", "reason": reason}
-                log_event(f"  [-] REJECTED: {reason} (Score: {score})")
-                log_event(f"      AI Reason: {reasoning}")
-                
-                if score < 1 and domain not in domain_blacklist:
-                    domain_blacklist.add(domain)
-                    log_event(f"  [!] Domain {domain} added to blacklist.")
+                evaluations[asset["url"]] = {"status": "FILTERED", "reason": "Low technical impact"}
+                log_event(f"  [-] REJECTED: Low technical impact (Score: {score})")
             elif not valid_cats:
                 evaluations[asset["url"]] = {"status": "FILTERED", "reason": "No valid technical category found"}
-                log_event(f"  [-] REJECTED: No valid category found (Suggested: {data.get('categories')})")
-                log_event(f"      AI Reason: {reasoning}")
+                log_event(f"  [-] REJECTED: No valid category found")
             else:
                 evaluations[asset["url"]] = {
                     "status": "INCLUDED", "title": data["title"], "description": data["desc"],
@@ -107,17 +128,13 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
                     "reasoning": reasoning
                 }
                 log_event(f"  [+] ACCEPTED: \"{data['title']}\" (Score: {score})")
-                log_event(f"      Destination: docs/{valid_cats[0]}.md")
-                log_event(f"      Description: {data['desc']}")
-                log_event(f"      AI Reason: {reasoning}")
 
         except Exception as e:
-            log_event(f"  [!] CRITICAL ERROR EVALUATING {asset['url']}: {e}")
-            evaluations[asset["url"]] = {"status": "FILTERED", "reason": f"Evaluation Failed: {str(e)[:100]}"}
+            log_event(f"  [!] ERROR EVALUATING {asset['url']}: {e}")
+            evaluations[asset["url"]] = {"status": "FILTERED", "reason": f"Evaluation Failed"}
         
-        await asyncio.sleep(2.0) # Steady pace
+        await asyncio.sleep(1.0)
             
-    # Guardar blacklist actualizada
     try:
         os.makedirs(os.path.dirname(memory_file), exist_ok=True)
         with open(memory_file, 'w') as f:
@@ -125,6 +142,7 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
     except: pass
     return evaluations
 
+
 class AgenticCurator:
     def __init__(self):
         self.git_controller = RepositoryController(GH_TOKEN, TARGET_REPO)
diff --git a/src/gemini_utils.py b/src/gemini_utils.py
index 2ec8bc06..dd8b8523 100644
--- a/src/gemini_utils.py
+++ b/src/gemini_utils.py
@@ -32,54 +32,70 @@ class GeminiDiagnostics:
         return report
 
 async def resolve_url(url: str) -> str:
-    """Sigue las redirecciones para obtener la URL larga final y consolida repositorios si fallan."""
-    shorteners = ['t.co', 'bit.ly', 'buff.ly', 'goo.gl', 'tinyurl.com', 't.ly', 'rb.gy', 'is.gd', 'drp.li', 't.me']
+    """Sigue las redirecciones para obtener la URL larga final, consolidando repositorios y evitando bucles."""
+    shorteners = ['t.co', 'bit.ly', 'buff.ly', 'goo.gl', 'tinyurl.com', 't.ly', 'rb.gy', 'is.gd', 'drp.li', 't.me', 'lnkd.in']
     try:
         domain = url.split("//")[-1].split("/")[0].lower()
     except:
         return url
     
-    # 1. Expansión inicial
+    # 1. Expansión Multi-salto (evita intermediarios de tracking)
     final_url = url
-    if domain in shorteners or url.endswith('…'):
-        try:
-            async with httpx.AsyncClient(follow_redirects=True) as client:
-                resp = await client.head(url, timeout=5)
-                final_url = str(resp.url)
-                if final_url != url:
-                    log_event(f"  [🔗] URL Expandida: {url} -> {final_url}")
-        except:
-            pass
+    max_hops = 5
+    current_hop = 0
+    
+    async with httpx.AsyncClient(follow_redirects=True, timeout=8) as client:
+        while current_hop < max_hops:
+            try:
+                # Si no es un acortador conocido y ya tenemos una URL larga, paramos
+                current_domain = final_url.split("//")[-1].split("/")[0].lower()
+                if current_hop > 0 and current_domain not in shorteners:
+                    break
+                    
+                resp = await client.head(final_url, timeout=5)
+                new_url = str(resp.url)
+                if new_url == final_url: break
+                
+                final_url = new_url
+                current_hop += 1
+            except:
+                break
 
-    # 2. Consolidación de Repositorios (GitHub/GitLab)
+    # 2. Consolidación de Repositorios (GitHub/GitLab) con chequeo de MVQ (vía REST si es necesario)
     repo_domains = ['github.com', 'gitlab.com']
     current_domain = final_url.split("//")[-1].split("/")[0].lower()
     
     if any(d in current_domain for d in repo_domains):
-        # Intentar validar si el enlace profundo funciona
         try:
             async with httpx.AsyncClient(follow_redirects=True) as client:
                 resp = await client.head(final_url, timeout=5)
-                if resp.status_code == 200:
-                    return final_url
-                
-                # Si falla, intentar consolidar a la raíz del repo
-                # Formato esperado: https://github.com/user/repo/...
-                parts = final_url.split('/')
-                if len(parts) > 4: # https: , , domain, user, repo
-                    root_repo = "/".join(parts[:5])
-                    resp_root = await client.head(root_repo, timeout=5)
-                    if resp_root.status_code == 200:
-                        log_event(f"  [📦] Consolidación: {final_url} -> {root_repo} (Raíz validada)")
-                        return root_repo
+                if resp.status_code != 200:
+                    parts = final_url.split('/')
+                    if len(parts) > 4: 
+                        root_repo = "/".join(parts[:5])
+                        resp_root = await client.head(root_repo, timeout=5)
+                        if resp_root.status_code == 200:
+                            log_event(f"  [📦] Consolidación: {final_url} -> {root_repo}")
+                            final_url = root_repo
         except:
             pass
             
     return final_url
 
+def is_fuzzy_duplicate(url_a: str, url_b: str) -> bool:
+    """Detecta si dos URLs son iguales ignorando parámetros de tracking comunes."""
+    def clean(u):
+        u = u.split('#')[0].rstrip('/').lower()
+        # Eliminar parámetros utm_* y otros comunes
+        u = re.sub(r'(\?|&)(utm_[^&]+|s=[^&]+|t=[^&]+|ref=[^&]+)', '', u)
+        if u.endswith('?'): u = u[:-1]
+        return u
+    return clean(url_a) == clean(url_b)
+
 async def call_gemini_with_retry(prompt: str, response_format: str = "json", max_retries: int = 3):
     """
-    Llama a la API de Gemini con rotación exhaustiva y REINTENTO REAL en 429.
+    Llama a Gemini optimizando el uso de cuota (pay-per-use).
+    Rota llaves inmediatamente en 429 y usa backoff exponencial inteligente.
     """
     global CURRENT_KEY_INDEX
     if not GEMINI_API_KEYS:
@@ -87,16 +103,16 @@ async def call_gemini_with_retry(prompt: str, response_format: str = "json", max
 
     diagnostics = GeminiDiagnostics()
     
-    async with httpx.AsyncClient() as client:
-        for key_attempt in range(len(GEMINI_API_KEYS)):
-            api_key = GEMINI_API_KEYS[CURRENT_KEY_INDEX]
-            
+    # Intentamos rotar entre todas las llaves disponibles antes de fallar
+    for key_attempt in range(len(GEMINI_API_KEYS)):
+        api_key = GEMINI_API_KEYS[CURRENT_KEY_INDEX]
+        
+        async with httpx.AsyncClient() as client:
             for model in GEMINI_MODELS:
                 full_model_name = f"models/{model}"
                 api_url = f"https://generativelanguage.googleapis.com/{GEMINI_API_VERSION}/{full_model_name}:generateContent?key={api_key}"
                 
-                # Reintentos por modelo (incluyendo 429)
-                for attempt in range(max_retries + 2):
+                for attempt in range(max_retries):
                     try:
                         payload = {"contents": [{"parts": [{"text": prompt}]}]}
                         response = await client.post(api_url, json=payload, timeout=45)
@@ -117,14 +133,14 @@ async def call_gemini_with_retry(prompt: str, response_format: str = "json", max
                             break
                         
                         elif response.status_code == 429:
-                            wait_time = (10 * (attempt + 1)) + random.random() * 5
-                            log_event(f"  [!] API 429 (Límite): Reintentando {model} en {wait_time:.1f}s... (Intento {attempt+1})")
-                            await asyncio.sleep(wait_time)
-                            continue # Reintentar el MISMO modelo
+                            # 429: Rotamos llave inmediatamente para no desperdiciar tiempo
+                            log_event(f"  [!] API 429 en llave {CURRENT_KEY_INDEX+1}. Rotando...")
+                            CURRENT_KEY_INDEX = (CURRENT_KEY_INDEX + 1) % len(GEMINI_API_KEYS)
+                            # Rompemos el bucle de intentos para este modelo/llave y pasamos a la siguiente llave
+                            break 
                         
                         elif response.status_code in [500, 503, 504]:
-                            diagnostics.add_attempt(model, response.status_code, "Server Error")
-                            await asyncio.sleep(5)
+                            await asyncio.sleep(2 * (attempt + 1))
                             continue
                         
                         else:
@@ -135,7 +151,13 @@ async def call_gemini_with_retry(prompt: str, response_format: str = "json", max
                         diagnostics.add_attempt(model, 0, f"Excepción: {str(e)}")
                         break
             
+            # Si terminamos los modelos de una llave con 429, saltamos a la siguiente
+            if response.status_code == 429:
+                continue
+            
+            # Si llegamos aquí y no tuvimos éxito, probamos la siguiente llave tras un breve respiro
             CURRENT_KEY_INDEX = (CURRENT_KEY_INDEX + 1) % len(GEMINI_API_KEYS)
-            await asyncio.sleep(2)
+            await asyncio.sleep(1)
+
+    raise Exception(f"Fallo crítico Gemini tras rotación de llaves.\n{diagnostics.get_report()}")
 
-    raise Exception(f"Fallo crítico Gemini tras rotación exhaustiva.\n{diagnostics.get_report()}")
diff --git a/src/intelligent_health_checker.py b/src/intelligent_health_checker.py
index 1ef540c9..5c093408 100644
--- a/src/intelligent_health_checker.py
+++ b/src/intelligent_health_checker.py
@@ -64,7 +64,7 @@ class IntelligentLinkCleaner:
                 return url, True, None, "Cached (Recent)"
 
         domain = url.split("//")[-1].split("/")[0]
-        domain_info = self.learning_data["domains"].get(domain, {})
+        domain_info = self.learning_data.get("domains", {}).get(domain, {})
         strategies = [
             {"type": "http", "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", "ref": "https://www.google.com/", "desc": "Desktop/Google"},
             {"type": "http", "ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1", "ref": "https://t.co/", "desc": "Mobile/Twitter"},
@@ -72,9 +72,13 @@ class IntelligentLinkCleaner:
             {"type": "http", "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0", "ref": "https://news.ycombinator.com/", "desc": "Firefox/Reddit"},
             {"type": "playwright", "ua": "Mozilla/5.0 (Linux; Android 13; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36", "ref": "https://www.google.com/", "desc": "PW Mobile/Google"}
         ]
+        
+        # PRIORIZACIÓN INTELIGENTE: Si ya sabemos qué funciona para este dominio, empezar por ahí.
         best_strat_idx = domain_info.get("best_strategy_idx")
         if best_strat_idx is not None and best_strat_idx < len(strategies):
-            best_strat = strategies.pop(best_strat_idx); strategies.insert(0, best_strat)
+            # Mover la mejor estrategia al inicio
+            best_strat = strategies.pop(best_strat_idx)
+            strategies.insert(0, best_strat)
 
         for attempt in range(min(max_retries, len(strategies))):
             strategy = strategies[attempt]
@@ -82,11 +86,17 @@ class IntelligentLinkCleaner:
                 if attempt > 0: await asyncio.sleep((2 ** attempt) + random.random())
                 is_alive, reason = await self._check_url_logic(url, strategy)
                 if is_alive:
+                    if "domains" not in self.learning_data: self.learning_data["domains"] = {}
                     if domain not in self.learning_data["domains"]: self.learning_data["domains"][domain] = {}
+                    
+                    # Guardar el índice REAL de la estrategia que funcionó
                     original_idx = attempt if best_strat_idx is None else (best_strat_idx if attempt == 0 else (attempt if attempt < best_strat_idx else attempt))
                     self.learning_data["domains"][domain]["best_strategy_idx"] = original_idx
+                    
+                    if "link_cache" not in self.learning_data: self.learning_data["link_cache"] = {}
                     self.learning_data["link_cache"][url] = {"status": "ALIVE", "last_checked": now}
                     return url, True, None, f"Alive ({strategy['desc']}) - {reason}"
+
                 if reason in ["404", "soft_404", "redirect_to_home"]:
                     if any(git_host in url for git_host in ["github.com", "gitlab.com", "bitbucket.org"]):
                         parts = url.split("/"); repo_root = "/".join(parts[:5]) if len(parts) > 4 else None
diff --git a/src/main.py b/src/main.py
index f70f0276..a63dc922 100644
--- a/src/main.py
+++ b/src/main.py
@@ -108,6 +108,7 @@ async def master_orchestrator():
     log_event(f"[*] Total after initial deduplication: {len(all_raw_assets)} unique links.")
 
     # 4. Evaluation and Registration (Robust Global Deduplication)
+    from src.gemini_utils import is_fuzzy_duplicate
     existing_urls = set()
     for root, dirs, files in os.walk("docs"):
         for file in files:
@@ -140,6 +141,21 @@ async def master_orchestrator():
             url = asset["url"]
             clean_url = url.split('#')[0].rstrip('/').lower()
             
+            # Fuzzy Deduplication
+            is_dup = False
+            for existing in existing_urls:
+                if is_fuzzy_duplicate(url, existing):
+                    is_dup = True
+                    break
+
+            if is_dup:
+                log_event(f"  [=] SKIPPED: {url[:60]}... (Already exists - Fuzzy)")
+                full_report_metrics.append({
+                    "url": url, "status": "DUPLICATE", "reason": "Already exists in repository",
+                    "category": "N/A", "post_date": asset.get('timestamp'), "source": asset.get("source_type", "Social")
+                })
+                continue
+            
             # Track max date
             try:
                 ts = asset.get('timestamp')
@@ -157,15 +173,9 @@ async def master_orchestrator():
                     max_tweet_date = asset_date
             except: pass
 
-            if clean_url in existing_urls:
-                log_event(f"  [=] SKIPPED: {url[:60]}... (Already exists)")
-                full_report_metrics.append({
-                    "url": url, "status": "DUPLICATE", "reason": "Already exists in repository",
-                    "category": "N/A", "post_date": ts, "source": asset.get("source_type", "Social")
-                })
-                continue
             assets_to_evaluate.append(asset)
 
+
         if not assets_to_evaluate:
             log_event("  [*] Entire batch consists of duplicates. Next batch.")
             continue