feat: switch to Playwright stealth browser for robust X.com extraction

2026-07-27 17:13:07 +00:00 · 2026-05-10 22:04:48 +02:00
parent cfc3064f39
commit 4573300180
2 changed files with 67 additions and 67 deletions
--- a/.github/workflows/agentic_cron.yml
+++ b/.github/workflows/agentic_cron.yml
@@ -24,7 +24,8 @@ jobs:
      - name: Instalación de dependencias (LIGERO Y ROBUSTO)
        run: |
          python -m pip install --upgrade pip
-          pip install --no-cache-dir pydantic PyGithub aiohttp beautifulsoup4 httpx fake-useragent pytz python-dotenv twikit>=2.1.2
+          pip install --no-cache-dir pydantic PyGithub aiohttp beautifulsoup4 httpx fake-useragent pytz python-dotenv twikit>=2.1.2 playwright playwright-stealth
+          playwright install chromium --with-deps

      - name: Ejecución de la Canalización Agéntica Integral
        env:
--- a/src/ingestion_twikit.py
+++ b/src/ingestion_twikit.py
@@ -18,12 +18,7 @@ class SocialDataExtractor:
        self.audit_trail = []
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1',
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
-        ]
-        self.nitter_instances = [
-            "nitter.net", "nitter.cz", "nitter.it", "nitter.privacydev.net",
-            "nitter.d420.me", "nitter.perpmode.com", "nitter.esmailelbob.xyz"
+            'Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1'
        ]

    def log_audit(self, method: str, success: Optional[bool], msg: str):
@@ -35,76 +30,80 @@ class SocialDataExtractor:
    def _extract_urls_from_text(self, text: str) -> list[str]:
        return list(set(re.findall(r'https?://[^\s<>\"]+|www\.[^\s<>\"]+', text)))

-    async def _authenticate(self) -> bool:
-        env_cookies = os.getenv("TWITTER_COOKIES")
-        if env_cookies:
-            try:
-                raw_data = json.loads(env_cookies)
-                cookies_dict = {c['name']: c['value'] for c in raw_data} if isinstance(raw_data, list) else raw_data
-                with open(self.cookies_file, 'w') as f: json.dump(cookies_dict, f)
-                self.client.load_cookies(self.cookies_file)
-                self.log_audit("Auth Cookies", True, "Sesión inyectada correctamente.")
-                return True
-            except Exception as e:
-                self.log_audit("Auth Cookies", False, str(e)[:50])
-        return False
+    async def _fetch_via_playwright(self, since_date: datetime) -> list[dict]:
+        """Estrategia Definitiva: Navegador Real con Playwright."""
+        try:
+            from playwright.async_api import async_playwright
+            from playwright_stealth import stealth_async
+        except ImportError:
+            self.log_audit("Playwright", False, "Librerías no instaladas.")
+            return []
+        
+        self.log_audit("Playwright Browser", None, "Lanzando navegador real (Stealth Mode)...")
+        results = []
+        try:
+            async with async_playwright() as p:
+                browser = await p.chromium.launch(headless=True)
+                context = await browser.new_context(user_agent=self.user_agents[0])
+                page = await context.new_page()
+                await stealth_async(page)
+                
+                env_cookies = os.getenv("TWITTER_COOKIES")
+                if env_cookies:
+                    try:
+                        cookies = json.loads(env_cookies)
+                        formatted_cookies = []
+                        for c in cookies:
+                            if isinstance(c, dict) and 'name' in c and 'value' in c:
+                                # Playwright necesita dominio sin el punto inicial a veces, o con el dominio correcto
+                                c['domain'] = c.get('domain', '.x.com')
+                                # Eliminar campos incompatibles si existen
+                                for k in ['sameSite', 'storeId']: c.pop(k, None)
+                                formatted_cookies.append(c)
+                        await context.add_cookies(formatted_cookies)
+                        self.log_audit("Playwright", True, "Cookies inyectadas.")
+                    except: pass

-    async def _fetch_via_rss_bridge(self) -> list[dict]:
-        bridges = ["rssbridge.org", "rss.idoc.pub", "bridge.the-pankratz.de"]
+                await page.goto(f"https://x.com/{self.target_account}", wait_until="networkidle", timeout=60000)
+                await asyncio.sleep(10)
+                
+                # Scroll para cargar contenido
+                for i in range(3):
+                    content = await page.content()
+                    urls = self._extract_urls_from_text(content)
+                    for u in urls:
+                        if all(x not in u for x in ["x.com", "twitter.com", "t.co", "abs.twimg"]):
+                            results.append({"url": u, "context": "Playwright Scrape", "timestamp": datetime.now(MADRID_TZ).isoformat()})
+                    await page.evaluate("window.scrollBy(0, 1000)")
+                    await asyncio.sleep(3)
+                
+                await browser.close()
+                return results
+        except Exception as e:
+            self.log_audit("Playwright", False, f"Error: {str(e)[:50]}")
+        return []
+
+    async def fetch_links_since(self, since_date: datetime) -> list[dict]:
+        # 1. Intentar Playwright (Navegador Real)
+        play_links = await self._fetch_via_playwright(since_date)
+        if play_links: 
+            self.log_audit("Estrategia Playwright", True, f"Encontrados {len(play_links)} recursos.")
+            return play_links
+
+        # 2. RSS-Bridge Fallback
+        self.log_audit("RSS Fallback", None, "Intentando vía RSS-Bridge...")
+        bridges = ["rssbridge.org", "rss.idoc.pub"]
        for b in bridges:
            url = f"https://{b}/?action=display&bridge=TwitterBridge&context=By+username&user={self.target_account}&format=Mrss"
            try:
-                async with aiohttp.ClientSession(headers={"User-Agent": random.choice(self.user_agents)}) as session:
+                async with aiohttp.ClientSession() as session:
                    async with session.get(url, timeout=20) as resp:
                        if resp.status == 200:
                            urls = self._extract_urls_from_text(await resp.text())
                            valid = [u for u in urls if all(x not in u for x in ["x.com", "twitter.com", "t.co", b])]
                            if valid:
                                self.log_audit(f"RSS-Bridge ({b})", True, f"Encontrados {len(valid)} enlaces.")
-                                return [{"url": u, "context": "RSS-Bridge", "timestamp": datetime.now(MADRID_TZ).isoformat()} for u in valid]
+                                return [{"url": u, "context": "RSS", "timestamp": datetime.now(MADRID_TZ).isoformat()} for u in valid]
            except: continue
-        return []
-
-    async def fetch_links_since(self, since_date: datetime) -> list[dict]:
-        all_results = []
-        target_user_id = "1387348141150670850"
-        
-        self.log_audit("Twikit API", None, "Intentando bypass con ID directo...")
-        if await self._authenticate():
-            try:
-                tweets = await self.client.get_user_tweets(target_user_id, 'Tweets')
-                if tweets:
-                    for t in tweets:
-                        tweet_date = t.created_at_datetime.astimezone(MADRID_TZ)
-                        if tweet_date < since_date: break
-                        txt = t.full_text if hasattr(t, 'full_text') else t.text
-                        for u in self._extract_urls_from_text(txt):
-                            if "x.com" not in u and "twitter.com" not in u:
-                                all_results.append({"url": u, "context": txt, "timestamp": tweet_date.isoformat()})
-                    if all_results:
-                        self.log_audit("Twikit API", True, f"Extraídos {len(all_results)} enlaces.")
-                        return all_results
-            except Exception as e:
-                self.log_audit("Twikit API", False, "Bloqueo KEY_BYTE persistente.")
-
-        links = await self._fetch_via_rss_bridge()
-        if links: return links
-
-        self.log_audit("Wayback Deep", None, "Buscando histórico profundo...")
-        from_ts = since_date.strftime("%Y%m%d")
-        try:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(f"https://web.archive.org/cdx/search/cdx?url=twitter.com/{self.target_account}&output=json&from={from_ts}&limit=5", timeout=20) as resp:
-                    if resp.status == 200:
-                        snaps = await resp.json()
-                        if len(snaps) > 1:
-                            latest = snaps[-1][1]
-                            async with session.get(f"https://web.archive.org/web/{latest}/https://twitter.com/{self.target_account}") as s_resp:
-                                urls = self._extract_urls_from_text(await s_resp.text())
-                                valid = [u for u in urls if all(x not in u for x in ["x.com", "twitter.com", "t.co", "archive.org"])]
-                                if valid:
-                                    self.log_audit("Wayback Deep", True, f"Recuperados {len(valid)} históricos.")
-                                    return [{"url": u, "context": "Wayback", "timestamp": datetime.now(MADRID_TZ).isoformat()} for u in valid]
-        except: pass

        return []