From d1bd3934c18142b4b08ca57ef90bce4fe6d4449c Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Thu, 14 May 2026 19:40:25 +0200 Subject: [PATCH] debug: add granular logs to Playwright extraction to diagnose hangs --- src/ingestion_twikit.py | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/src/ingestion_twikit.py b/src/ingestion_twikit.py index 55f2199a..57ce4150 100644 --- a/src/ingestion_twikit.py +++ b/src/ingestion_twikit.py @@ -62,10 +62,10 @@ class SocialDataExtractor: elif hasattr(playwright_stealth, 'stealth'): playwright_stealth.stealth(page) except: pass - env_cookies = os.getenv("TWITTER_COOKIES") if env_cookies: try: cookies = json.loads(env_cookies) + self.log_audit("Cookies", True, f"Cargando {len(cookies)} cookies desde secretos.") formatted = [] for c in cookies: if isinstance(c, dict) and 'name' in c and 'value' in c: @@ -73,7 +73,8 @@ class SocialDataExtractor: for k in ['sameSite', 'storeId', 'id']: c.pop(k, None) formatted.append(c) await context.add_cookies(formatted) - except: pass + except Exception as e: + self.log_audit("Cookies", False, f"Error aplicando cookies: {e}") if strategy == "search": import urllib.parse @@ -83,27 +84,36 @@ class SocialDataExtractor: encoded_query = urllib.parse.quote(search_query) target_url = f"https://x.com/search?q={encoded_query}&f=live" - self.log_audit("Advanced Search", None, f"Query: {search_query}") + self.log_audit("Advanced Search", None, f"URL: {target_url}") else: target_url = f"https://x.com/{self.target_account}" - self.log_audit("Profile Scroll", None, "Navegando al muro directo.") + self.log_audit("Profile Scroll", None, f"URL: {target_url}") - await page.goto(target_url, wait_until="domcontentloaded", timeout=90000) - await asyncio.sleep(15) + self.log_audit("Browser", None, "Navegando a la página...") + await page.goto(target_url, wait_until="load", timeout=60000) + + title = await page.title() + self.log_audit("Browser", True, f"Página cargada: '{title}'") + + await asyncio.sleep(10) stop_scrolling = False scroll_count = 0 - max_scrolls = 100 # Reducido de 300 - collected_tweets = {} # URL -> tweet_data para evitar duplicados en scroll - target_link_count = 300 # Reducido de 1000 + max_scrolls = 60 + collected_tweets = {} + target_link_count = 200 while not stop_scrolling and scroll_count < max_scrolls: + self.log_audit("Scraping", None, f"Escaneando DOM (Scroll {scroll_count+1}/{max_scrolls})...") articles = await page.query_selector_all('article[data-testid="tweet"]') - if not articles and scroll_count > 3: - self.log_audit("Extraction", False, "No se detectan más tweets en el DOM.") - break - + if not articles: + if scroll_count > 5: + self.log_audit("Extraction", False, "No se detectan tweets. ¿Posible bloqueo o fin de lista?") + break + self.log_audit("Scraping", None, "Esperando a que aparezcan los tweets...") + await asyncio.sleep(5) + for article in articles: # 1. Ignorar Pinned Posts (Solo en Profile Scroll) if strategy == "scroll":