From f46541572cbbd5a13902dc5e22bebadc8ad59952 Mon Sep 17 00:00:00 2001
From: Nubenetes Bot <bot@nubenetes.com>
Date: Tue, 12 May 2026 10:25:27 +0200
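Subject: [PATCH] perf: optimize extraction window and scrolling to prevent
 workflow timeouts

Behaviour changes in this patch:
- max_scrolls 300 -> 100 and target_link_count 1000 -> 300.
- Stop on an empty DOM after more than 3 scrolls (was 5).
- Scroll 8000 px per step with a 5 s wait (was 5000 px / 8 s).
- Time horizon defaults to the last 30 days instead of the fixed
  2024-10-01 date; override with CURATION_DAYS_BACK.
- EXTRACTION_STRATEGY now defaults to "search" (was "scroll").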

---
 src/ingestion_twikit.py | 12 +++++++-----
 src/main.py             | 14 ++++++++++----
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/ingestion_twikit.py b/src/ingestion_twikit.py
index 7969f835..9dc1b87e 100644
--- a/src/ingestion_twikit.py
+++ b/src/ingestion_twikit.py
@@ -90,14 +90,14 @@ class SocialDataExtractor:
                 
                 stop_scrolling = False
                 scroll_count = 0
-                max_scrolls = 300
+                max_scrolls = 100 # Reducido de 300
                 collected_tweets = {} # URL -> tweet_data para evitar duplicados en scroll
-                target_link_count = 1000
+                target_link_count = 300 # Reducido de 1000
                 
                 while not stop_scrolling and scroll_count < max_scrolls:
                     articles = await page.query_selector_all('article[data-testid="tweet"]')
                     
-                    if not articles and scroll_count > 5:
+                    if not articles and scroll_count > 3:
                         self.log_audit("Extraction", False, "No se detectan más tweets en el DOM.")
                         break
 
@@ -118,6 +118,7 @@ class SocialDataExtractor:
                         tweet_dt = datetime.fromisoformat(datetime_str.replace('Z', '+00:00')).astimezone(MADRID_TZ)
                         
                         if tweet_dt < since_date:
+                            self.log_audit("Timeline", True, f"Alcanzado horizonte temporal: {tweet_dt.date()}")
                             stop_scrolling = True
                             break
 
@@ -148,8 +149,9 @@ class SocialDataExtractor:
                         if stop_scrolling: break
 
                     if stop_scrolling: break
-                    await page.evaluate("window.scrollBy(0, 5000)")
-                    await asyncio.sleep(8)
+                    # Scroll farther per step and pause less between steps to shorten the run.
+                    await page.evaluate("window.scrollBy(0, 8000)")
+                    await asyncio.sleep(5)
                     scroll_count += 1
                 
                 if not stop_scrolling and scroll_count >= max_scrolls:
diff --git a/src/main.py b/src/main.py
index 1314ca43..e8398d22 100644
--- a/src/main.py
+++ b/src/main.py
@@ -16,12 +16,18 @@ async def master_orchestrator():
     
     print("[*] INICIANDO CURADURÍA AGÉNTICA (CRONOLOGÍA Y TRANSPARENCIA)")
     
-    # 1. Horizonte Temporal Fijo (Octubre 2024) - Requisito de Curaduría Histórica
-    time_horizon = datetime(2024, 10, 1, 0, 0, tzinfo=MADRID_TZ)
-    print(f"[*] FORZANDO CURADURÍA HISTÓRICA desde: {time_horizon.date()}")
+    # 1. Dynamic time horizon
+    # By default only the last 30 days are searched, to avoid the 6 h timeouts.
+    # Set CURATION_DAYS_BACK (whole days) for historical curation; non-integer values raise ValueError.
+    days_back = int(os.getenv("CURATION_DAYS_BACK", "30"))
+    time_horizon = datetime.now(MADRID_TZ) - timedelta(days=days_back)
+    
+    if days_back > 60:
+        print(f"[*] ALERTA: Ejecutando Curaduría Histórica Profunda ({days_back} días).")
+    print(f"[*] Horizonte temporal: {time_horizon.date()}")
 
     # 2. Ingesta Multi-fuente
-    strategy = os.getenv("EXTRACTION_STRATEGY", "scroll")
+    strategy = os.getenv("EXTRACTION_STRATEGY", "search") # Cambiamos default a 'search' por ser más rápido
     twitter_client = SocialDataExtractor()
     raw_social = await twitter_client.fetch_links_since(time_horizon, strategy=strategy)
     x_audit_trail = twitter_client.audit_trail