feat: enforce historical curation from Oct 2024 and refine technical filtering

2026-07-28 09:32:20 +00:00 · 2026-05-10 23:54:26 +02:00
parent 8030dc6f54
commit 628d0ee064
4 changed files with 34 additions and 25 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -353,3 +353,4 @@ src/__pycache__/
 *.json
 .env
 nubenetes_agent_env/
+.venv/
--- a/src/agentic_curator.py
+++ b/src/agentic_curator.py
@@ -48,13 +48,17 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> List[Dict]:
        context = asset.get('context', asset.get('description', 'Sin contexto adicional'))
        
        prompt = (
-            "Actúas como Ingeniero Curador Senior de 'nubenetes/awesome-kubernetes'. "
-            f"Filtra este recurso para estas categorías: {', '.join(NUBENETES_CATEGORIES)}. "
-            "Si es sobre Model Context Protocol (MCP), asígnalo a 'ai-agents-mcp'.\n"
+            "Actúas como Ingeniero Curador Senior de 'nubenetes/awesome-kubernetes'.\n"
+            "Tu misión es identificar contenido TÉCNICO de alta calidad sobre Kubernetes y el ecosistema Cloud Native.\n"
+            f"Categorías válidas: {', '.join(NUBENETES_CATEGORIES)}.\n\n"
+            "REGLAS DE FILTRADO:\n"
+            "1. EXCLUYE: Marketing genérico, noticias de negocios sin impacto técnico, enlaces rotos, o contenido autopromocional sin valor educativo.\n"
+            "2. PRIORIZA: Tutoriales 'hands-on', nuevas herramientas open-source, guías de arquitectura, seguridad avanzada y Model Context Protocol (MCP).\n"
+            "3. ASIGNACIÓN: Si es sobre MCP, asígnalo obligatoriamente a 'ai-agents-mcp'.\n\n"
            f"URL: {asset['url']}\nContexto: {context}\nWeb: {web_content}\n\n"
-            "Evalúa el IMPACTO SOCIAL y PROFUNDIDAD (1-100):\n"
-            "- >80: Recurso excepcional, disruptivo.\n"
-            "- <20: Contenido pobre o irrelevante.\n\n"
+            "Evalúa el IMPACTO TÉCNICO y PROFUNDIDAD (1-100):\n"
+            "- >80: Recurso excepcional (🌟).\n"
+            "- <30: Descartar (No aporta valor suficiente).\n\n"
            "Responde SOLAMENTE un JSON: {\"is_exceptional\": bool, \"impact_score\": int, \"categories\": [\"cat1\"], \"title\": \"...\", \"desc\": \"...\"}"
        )

--- a/src/ingestion_twikit.py
+++ b/src/ingestion_twikit.py
@@ -80,8 +80,9 @@ class SocialDataExtractor:
                
                stop_scrolling = False
                scroll_count = 0
-                max_scrolls = 25
+                max_scrolls = 100
                collected_tweets = {} # URL -> tweet_data para evitar duplicados en scroll
+                target_link_count = 100
                
                while not stop_scrolling and scroll_count < max_scrolls:
                    articles = await page.query_selector_all('article[data-testid="tweet"]')
@@ -126,9 +127,14 @@ class SocialDataExtractor:
                                    "timestamp": tweet_dt.isoformat(),
                                    "source_type": "X.com (@nubenetes)"
                                }
+                                if len(collected_tweets) >= target_link_count:
+                                    stop_scrolling = True
+                                    break
+                        if stop_scrolling: break

-                    await page.evaluate("window.scrollBy(0, 2500)")
-                    await asyncio.sleep(6)
+                    if stop_scrolling: break
+                    await page.evaluate("window.scrollBy(0, 3500)")
+                    await asyncio.sleep(4)
                    scroll_count += 1
                
                await browser.close()
--- a/src/main.py
+++ b/src/main.py
@@ -16,18 +16,9 @@ async def master_orchestrator():
    
    print("[*] INICIANDO CURADURÍA AGÉNTICA (CRONOLOGÍA Y TRANSPARENCIA)")
    
-    # 1. Determinar Horizonte Temporal (Prioridad Oct 2024 si no hay merges)
+    # 1. Horizonte Temporal Fijo (Octubre 2024) - Requisito de Curaduría Histórica
    time_horizon = datetime(2024, 10, 1, 0, 0, tzinfo=MADRID_TZ)
-    try:
-        pulls = git_controller.repository.get_pulls(state='closed', sort='updated', direction='desc')
-        for pr in pulls:
-            if pr.merged and "💎 Knowledge Update" in pr.title:
-                time_horizon = pr.merged_at.replace(tzinfo=MADRID_TZ) + timedelta(seconds=1)
-                print(f"[+] Retomando desde último merge exitoso: {pr.merged_at}")
-                break
-    except: pass
-
-    print(f"[*] Buscando posts desde: {time_horizon.date()}")
+    print(f"[*] FORZANDO CURADURÍA HISTÓRICA desde: {time_horizon.date()}")

    # 2. Ingesta Multi-fuente
    twitter_client = SocialDataExtractor()
@@ -42,7 +33,7 @@ async def master_orchestrator():
    
    all_raw_assets = raw_social + trending
    
-    # 3. Evaluación y Registro de Auditoría
+    # 3. Evaluación y Registro de Auditoría (Uso de archivos locales)
    existing_urls = set()
    for doc in os.listdir("docs"):
        if doc.endswith(".md"):
@@ -85,7 +76,7 @@ async def master_orchestrator():
                "post_date": asset.get("timestamp")
            })

-    # 4. Inyección en Markdowns (IA Agentica)
+    # 4. Inyección en Markdowns (Uso de archivos locales)
    file_updates = {}
    stats = {"added_details": [], "categories_updated": set()}
    curator_agent = AgenticCurator()
@@ -94,17 +85,24 @@ async def master_orchestrator():
        category = asset["category"]
        file_path = f"docs/{category}.md"
        try:
+            # Leer localmente si no está en file_updates
            content = file_updates.get(file_path)
            if not content:
-                repo_file = git_controller.repository.get_contents(file_path)
-                content = repo_file.decoded_content.decode("utf-8")
+                if os.path.exists(file_path):
+                    with open(file_path, 'r') as f:
+                        content = f.read()
+                else:
+                    print(f"[!] Archivo no encontrado localmente: {file_path}")
+                    continue
            
            new_content = await curator_agent.decide_smart_injection(content, asset)
            if len(new_content) > len(content):
                file_updates[file_path] = new_content
                stats["added_details"].append(asset)
                stats["categories_updated"].add(category)
-        except: continue
+        except Exception as e:
+            print(f"[!] Error inyectando en {file_path}: {e}")
+            continue

    # 5. GitOps con Reporte Matricial
    metrics = {