perf: optimize extraction window and scrolling to prevent workflow timeouts

2026-07-28 17:41:32 +00:00 · 2026-05-12 10:25:27 +02:00
parent f8cd680400
commit f46541572c
2 changed files with 17 additions and 9 deletions
--- a/src/ingestion_twikit.py
+++ b/src/ingestion_twikit.py
@@ -90,14 +90,14 @@ class SocialDataExtractor:
                
                stop_scrolling = False
                scroll_count = 0
-                max_scrolls = 300
+                max_scrolls = 100 # Reducido de 300
                collected_tweets = {} # URL -> tweet_data para evitar duplicados en scroll
-                target_link_count = 1000
+                target_link_count = 300 # Reducido de 1000
                
                while not stop_scrolling and scroll_count < max_scrolls:
                    articles = await page.query_selector_all('article[data-testid="tweet"]')
                    
-                    if not articles and scroll_count > 5:
+                    if not articles and scroll_count > 3:
                        self.log_audit("Extraction", False, "No se detectan más tweets en el DOM.")
                        break

@@ -118,6 +118,7 @@ class SocialDataExtractor:
                        tweet_dt = datetime.fromisoformat(datetime_str.replace('Z', '+00:00')).astimezone(MADRID_TZ)
                        
                        if tweet_dt < since_date:
+                            self.log_audit("Timeline", True, f"Alcanzado horizonte temporal: {tweet_dt.date()}")
                            stop_scrolling = True
                            break

@@ -148,8 +149,9 @@ class SocialDataExtractor:
                        if stop_scrolling: break

                    if stop_scrolling: break
-                    await page.evaluate("window.scrollBy(0, 5000)")
-                    await asyncio.sleep(8)
+                    # Scroll farther per step and pause less between scrolls to shorten the run
+                    await page.evaluate("window.scrollBy(0, 8000)")
+                    await asyncio.sleep(5)
                    scroll_count += 1
                
                if not stop_scrolling and scroll_count >= max_scrolls:
--- a/src/main.py
+++ b/src/main.py
@@ -16,12 +16,18 @@ async def master_orchestrator():
    
    print("[*] INICIANDO CURADURÍA AGÉNTICA (CRONOLOGÍA Y TRANSPARENCIA)")
    
-    # 1. Horizonte Temporal Fijo (Octubre 2024) - Requisito de Curaduría Histórica
-    time_horizon = datetime(2024, 10, 1, 0, 0, tzinfo=MADRID_TZ)
-    print(f"[*] FORZANDO CURADURÍA HISTÓRICA desde: {time_horizon.date()}")
+    # 1. Horizonte Temporal Dinámico
+    # By default we only look back 30 days to avoid 6h workflow timeouts.
+    # For deeper historical curation, set the CURATION_DAYS_BACK environment variable.
+    days_back = int(os.getenv("CURATION_DAYS_BACK", "30"))
+    time_horizon = datetime.now(MADRID_TZ) - timedelta(days=days_back)
+    
+    if days_back > 60:
+        print(f"[*] ALERTA: Ejecutando Curaduría Histórica Profunda ({days_back} días).")
+    print(f"[*] Horizonte temporal: {time_horizon.date()}")

    # 2. Ingesta Multi-fuente
-    strategy = os.getenv("EXTRACTION_STRATEGY", "scroll")
+    strategy = os.getenv("EXTRACTION_STRATEGY", "search") # Cambiamos default a 'search' por ser más rápido
    twitter_client = SocialDataExtractor()
    raw_social = await twitter_client.fetch_links_since(time_horizon, strategy=strategy)
    x_audit_trail = twitter_client.audit_trail