From f46541572cbbd5a13902dc5e22bebadc8ad59952 Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Tue, 12 May 2026 10:25:27 +0200 Subject: [PATCH] perf: optimize extraction window and scrolling to prevent workflow timeouts --- src/ingestion_twikit.py | 12 +++++++----- src/main.py | 14 ++++++++++---- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/ingestion_twikit.py b/src/ingestion_twikit.py index 7969f835..9dc1b87e 100644 --- a/src/ingestion_twikit.py +++ b/src/ingestion_twikit.py @@ -90,14 +90,14 @@ class SocialDataExtractor: stop_scrolling = False scroll_count = 0 - max_scrolls = 300 + max_scrolls = 100 # Reducido de 300 collected_tweets = {} # URL -> tweet_data para evitar duplicados en scroll - target_link_count = 1000 + target_link_count = 300 # Reducido de 1000 while not stop_scrolling and scroll_count < max_scrolls: articles = await page.query_selector_all('article[data-testid="tweet"]') - if not articles and scroll_count > 5: + if not articles and scroll_count > 3: self.log_audit("Extraction", False, "No se detectan más tweets en el DOM.") break @@ -118,6 +118,7 @@ class SocialDataExtractor: tweet_dt = datetime.fromisoformat(datetime_str.replace('Z', '+00:00')).astimezone(MADRID_TZ) if tweet_dt < since_date: + self.log_audit("Timeline", True, f"Alcanzado horizonte temporal: {tweet_dt.date()}") stop_scrolling = True break @@ -148,8 +149,9 @@ class SocialDataExtractor: if stop_scrolling: break if stop_scrolling: break - await page.evaluate("window.scrollBy(0, 5000)") - await asyncio.sleep(8) + # Scroll más agresivo y menos esperas + await page.evaluate("window.scrollBy(0, 8000)") + await asyncio.sleep(5) scroll_count += 1 if not stop_scrolling and scroll_count >= max_scrolls: diff --git a/src/main.py b/src/main.py index 1314ca43..e8398d22 100644 --- a/src/main.py +++ b/src/main.py @@ -16,12 +16,18 @@ async def master_orchestrator(): print("[*] INICIANDO CURADURÍA AGÉNTICA (CRONOLOGÍA Y TRANSPARENCIA)") - # 1. Horizonte Temporal Fijo (Octubre 2024) - Requisito de Curaduría Histórica - time_horizon = datetime(2024, 10, 1, 0, 0, tzinfo=MADRID_TZ) - print(f"[*] FORZANDO CURADURÍA HISTÓRICA desde: {time_horizon.date()}") + # 1. Horizonte Temporal Dinámico + # Por defecto, solo buscamos los últimos 30 días para evitar Timeouts de 6h. + # Si se requiere curaduría histórica, se puede pasar vía variable de entorno. + days_back = int(os.getenv("CURATION_DAYS_BACK", "30")) + time_horizon = datetime.now(MADRID_TZ) - timedelta(days=days_back) + + if days_back > 60: + print(f"[*] ALERTA: Ejecutando Curaduría Histórica Profunda ({days_back} días).") + print(f"[*] Horizonte temporal: {time_horizon.date()}") # 2. Ingesta Multi-fuente - strategy = os.getenv("EXTRACTION_STRATEGY", "scroll") + strategy = os.getenv("EXTRACTION_STRATEGY", "search") # Cambiamos default a 'search' por ser más rápido twitter_client = SocialDataExtractor() raw_social = await twitter_client.fetch_links_since(time_horizon, strategy=strategy) x_audit_trail = twitter_client.audit_trail