From 628d0ee06493fb6335bf6902492c45ade3b5c710 Mon Sep 17 00:00:00 2001
From: Nubenetes Bot <bot@nubenetes.com>
Date: Sun, 10 May 2026 23:54:26 +0200
Subject: [PATCH] feat: enforce historical curation from Oct 2024 and refine
 technical filtering

---
 .gitignore              |  1 +
 src/agentic_curator.py  | 16 ++++++++++------
 src/ingestion_twikit.py | 12 +++++++++---
 src/main.py             | 30 ++++++++++++++----------------
 4 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/.gitignore b/.gitignore
index dbe3436a..20f2963e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -353,3 +353,4 @@ src/__pycache__/
 *.json
 .env
 nubenetes_agent_env/
+.venv/
diff --git a/src/agentic_curator.py b/src/agentic_curator.py
index 633228b4..7e646a17 100644
--- a/src/agentic_curator.py
+++ b/src/agentic_curator.py
@@ -48,13 +48,17 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> List[Dict]:
         context = asset.get('context', asset.get('description', 'Sin contexto adicional'))
         
         prompt = (
-            "Actúas como Ingeniero Curador Senior de 'nubenetes/awesome-kubernetes'. "
-            f"Filtra este recurso para estas categorías: {', '.join(NUBENETES_CATEGORIES)}. "
-            "Si es sobre Model Context Protocol (MCP), asígnalo a 'ai-agents-mcp'.\n"
+            "Actúas como Ingeniero Curador Senior de 'nubenetes/awesome-kubernetes'.\n"
+            "Tu misión es identificar contenido TÉCNICO de alta calidad sobre Kubernetes y el ecosistema Cloud Native.\n"
+            f"Categorías válidas: {', '.join(NUBENETES_CATEGORIES)}.\n\n"
+            "REGLAS DE FILTRADO:\n"
+            "1. EXCLUYE: Marketing genérico, noticias de negocios sin impacto técnico, enlaces rotos, o contenido autopromocional sin valor educativo.\n"
+            "2. PRIORIZA: Tutoriales 'hands-on', nuevas herramientas open-source, guías de arquitectura, seguridad avanzada y Model Context Protocol (MCP).\n"
+            "3. ASIGNACIÓN: Si es sobre MCP, asígnalo obligatoriamente a 'ai-agents-mcp'.\n\n"
             f"URL: {asset['url']}\nContexto: {context}\nWeb: {web_content}\n\n"
-            "Evalúa el IMPACTO SOCIAL y PROFUNDIDAD (1-100):\n"
-            "- >80: Recurso excepcional, disruptivo.\n"
-            "- <20: Contenido pobre o irrelevante.\n\n"
+            "Evalúa el IMPACTO TÉCNICO y PROFUNDIDAD (1-100):\n"
+            "- >80: Recurso excepcional (🌟).\n"
+            "- <30: Descartar (No aporta valor suficiente).\n\n"
             "Responde SOLAMENTE un JSON: {\"is_exceptional\": bool, \"impact_score\": int, \"categories\": [\"cat1\"], \"title\": \"...\", \"desc\": \"...\"}"
         )
 
diff --git a/src/ingestion_twikit.py b/src/ingestion_twikit.py
index 94e12a9c..55f06fcb 100644
--- a/src/ingestion_twikit.py
+++ b/src/ingestion_twikit.py
@@ -80,8 +80,9 @@ class SocialDataExtractor:
                 
                 stop_scrolling = False
                 scroll_count = 0
-                max_scrolls = 25
+                max_scrolls = 100
                 collected_tweets = {} # URL -> tweet_data para evitar duplicados en scroll
+                target_link_count = 100
                 
                 while not stop_scrolling and scroll_count < max_scrolls:
                     articles = await page.query_selector_all('article[data-testid="tweet"]')
@@ -126,9 +127,14 @@ class SocialDataExtractor:
                                     "timestamp": tweet_dt.isoformat(),
                                     "source_type": "X.com (@nubenetes)"
                                 }
+                                if len(collected_tweets) >= target_link_count:
+                                    stop_scrolling = True
+                                    break
+                        if stop_scrolling: break
 
-                    await page.evaluate("window.scrollBy(0, 2500)")
-                    await asyncio.sleep(6)
+                    if stop_scrolling: break
+                    await page.evaluate("window.scrollBy(0, 3500)")
+                    await asyncio.sleep(4)
                     scroll_count += 1
                 
                 await browser.close()
diff --git a/src/main.py b/src/main.py
index 2d1e924f..92bdbde6 100644
--- a/src/main.py
+++ b/src/main.py
@@ -16,18 +16,9 @@ async def master_orchestrator():
     
     print("[*] INICIANDO CURADURÍA AGÉNTICA (CRONOLOGÍA Y TRANSPARENCIA)")
     
-    # 1. Determinar Horizonte Temporal (Prioridad Oct 2024 si no hay merges)
+    # 1. Horizonte Temporal Fijo (Octubre 2024) - Requisito de Curaduría Histórica
     time_horizon = datetime(2024, 10, 1, 0, 0, tzinfo=MADRID_TZ)
-    try:
-        pulls = git_controller.repository.get_pulls(state='closed', sort='updated', direction='desc')
-        for pr in pulls:
-            if pr.merged and "💎 Knowledge Update" in pr.title:
-                time_horizon = pr.merged_at.replace(tzinfo=MADRID_TZ) + timedelta(seconds=1)
-                print(f"[+] Retomando desde último merge exitoso: {pr.merged_at}")
-                break
-    except: pass
-
-    print(f"[*] Buscando posts desde: {time_horizon.date()}")
+    print(f"[*] FORZANDO CURADURÍA HISTÓRICA desde: {time_horizon.date()}")
 
     # 2. Ingesta Multi-fuente
     twitter_client = SocialDataExtractor()
@@ -42,7 +33,7 @@ async def master_orchestrator():
     
     all_raw_assets = raw_social + trending
     
-    # 3. Evaluación y Registro de Auditoría
+    # 3. Evaluación y Registro de Auditoría (Uso de archivos locales)
     existing_urls = set()
     for doc in os.listdir("docs"):
         if doc.endswith(".md"):
@@ -85,7 +76,7 @@ async def master_orchestrator():
                 "post_date": asset.get("timestamp")
             })
 
-    # 4. Inyección en Markdowns (IA Agentica)
+    # 4. Inyección en Markdowns (Uso de archivos locales)
     file_updates = {}
     stats = {"added_details": [], "categories_updated": set()}
     curator_agent = AgenticCurator()
@@ -94,17 +85,24 @@ async def master_orchestrator():
         category = asset["category"]
         file_path = f"docs/{category}.md"
         try:
+            # Leer localmente si no está en file_updates
             content = file_updates.get(file_path)
             if not content:
-                repo_file = git_controller.repository.get_contents(file_path)
-                content = repo_file.decoded_content.decode("utf-8")
+                if os.path.exists(file_path):
+                    with open(file_path, 'r') as f:
+                        content = f.read()
+                else:
+                    print(f"[!] Archivo no encontrado localmente: {file_path}")
+                    continue
             
             new_content = await curator_agent.decide_smart_injection(content, asset)
             if len(new_content) > len(content):
                 file_updates[file_path] = new_content
                 stats["added_details"].append(asset)
                 stats["categories_updated"].add(category)
-        except: continue
+        except Exception as e:
+            print(f"[!] Error inyectando en {file_path}: {e}")
+            continue
 
     # 5. GitOps con Reporte Matricial
     metrics = {