From 4573300180d930e021b685f7e6f621fb7bc04501 Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Sun, 10 May 2026 22:04:48 +0200 Subject: [PATCH] feat: switch to Playwright stealth browser for robust X.com extraction --- .github/workflows/agentic_cron.yml | 3 +- src/ingestion_twikit.py | 131 ++++++++++++++--------------- 2 files changed, 67 insertions(+), 67 deletions(-) diff --git a/.github/workflows/agentic_cron.yml b/.github/workflows/agentic_cron.yml index 70d8379b..b10a6283 100644 --- a/.github/workflows/agentic_cron.yml +++ b/.github/workflows/agentic_cron.yml @@ -24,7 +24,8 @@ jobs: - name: Instalación de dependencias (LIGERO Y ROBUSTO) run: | python -m pip install --upgrade pip - pip install --no-cache-dir pydantic PyGithub aiohttp beautifulsoup4 httpx fake-useragent pytz python-dotenv twikit>=2.1.2 + pip install --no-cache-dir pydantic PyGithub aiohttp beautifulsoup4 httpx fake-useragent pytz python-dotenv twikit>=2.1.2 playwright playwright-stealth + playwright install chromium --with-deps - name: Ejecución de la Canalización Agéntica Integral env: diff --git a/src/ingestion_twikit.py b/src/ingestion_twikit.py index 976641fc..1948ede7 100644 --- a/src/ingestion_twikit.py +++ b/src/ingestion_twikit.py @@ -18,12 +18,7 @@ class SocialDataExtractor: self.audit_trail = [] self.user_agents = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36', - 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36' - ] - self.nitter_instances = [ - "nitter.net", "nitter.cz", "nitter.it", "nitter.privacydev.net", - "nitter.d420.me", "nitter.perpmode.com", "nitter.esmailelbob.xyz" + 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1' ] def log_audit(self, method: str, success: Optional[bool], msg: str): @@ -35,76 +30,80 @@ class SocialDataExtractor: def _extract_urls_from_text(self, text: str) -> list[str]: return list(set(re.findall(r'https?://[^\s<>\"]+|www\.[^\s<>\"]+', text))) - async def _authenticate(self) -> bool: - env_cookies = os.getenv("TWITTER_COOKIES") - if env_cookies: - try: - raw_data = json.loads(env_cookies) - cookies_dict = {c['name']: c['value'] for c in raw_data} if isinstance(raw_data, list) else raw_data - with open(self.cookies_file, 'w') as f: json.dump(cookies_dict, f) - self.client.load_cookies(self.cookies_file) - self.log_audit("Auth Cookies", True, "Sesión inyectada correctamente.") - return True - except Exception as e: - self.log_audit("Auth Cookies", False, str(e)[:50]) - return False + async def _fetch_via_playwright(self, since_date: datetime) -> list[dict]: + """Estrategia Definitiva: Navegador Real con Playwright.""" + try: + from playwright.async_api import async_playwright + from playwright_stealth import stealth_async + except ImportError: + self.log_audit("Playwright", False, "Librerías no instaladas.") + return [] + + self.log_audit("Playwright Browser", None, "Lanzando navegador real (Stealth Mode)...") + results = [] + try: + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + context = await browser.new_context(user_agent=self.user_agents[0]) + page = await context.new_page() + await stealth_async(page) + + env_cookies = os.getenv("TWITTER_COOKIES") + if env_cookies: + try: + cookies = json.loads(env_cookies) + formatted_cookies = [] + for c in cookies: + if isinstance(c, dict) and 'name' in c and 'value' in c: + # Playwright necesita dominio sin el punto inicial a veces, o con el dominio correcto + c['domain'] = c.get('domain', '.x.com') + # Eliminar campos incompatibles si existen + for k in ['sameSite', 'storeId']: c.pop(k, None) + formatted_cookies.append(c) + await context.add_cookies(formatted_cookies) + self.log_audit("Playwright", True, "Cookies inyectadas.") + except: pass - async def _fetch_via_rss_bridge(self) -> list[dict]: - bridges = ["rssbridge.org", "rss.idoc.pub", "bridge.the-pankratz.de"] + await page.goto(f"https://x.com/{self.target_account}", wait_until="networkidle", timeout=60000) + await asyncio.sleep(10) + + # Scroll para cargar contenido + for i in range(3): + content = await page.content() + urls = self._extract_urls_from_text(content) + for u in urls: + if all(x not in u for x in ["x.com", "twitter.com", "t.co", "abs.twimg"]): + results.append({"url": u, "context": "Playwright Scrape", "timestamp": datetime.now(MADRID_TZ).isoformat()}) + await page.evaluate("window.scrollBy(0, 1000)") + await asyncio.sleep(3) + + await browser.close() + return results + except Exception as e: + self.log_audit("Playwright", False, f"Error: {str(e)[:50]}") + return [] + + async def fetch_links_since(self, since_date: datetime) -> list[dict]: + # 1. Intentar Playwright (Navegador Real) + play_links = await self._fetch_via_playwright(since_date) + if play_links: + self.log_audit("Estrategia Playwright", True, f"Encontrados {len(play_links)} recursos.") + return play_links + + # 2. RSS-Bridge Fallback + self.log_audit("RSS Fallback", None, "Intentando vía RSS-Bridge...") + bridges = ["rssbridge.org", "rss.idoc.pub"] for b in bridges: url = f"https://{b}/?action=display&bridge=TwitterBridge&context=By+username&user={self.target_account}&format=Mrss" try: - async with aiohttp.ClientSession(headers={"User-Agent": random.choice(self.user_agents)}) as session: + async with aiohttp.ClientSession() as session: async with session.get(url, timeout=20) as resp: if resp.status == 200: urls = self._extract_urls_from_text(await resp.text()) valid = [u for u in urls if all(x not in u for x in ["x.com", "twitter.com", "t.co", b])] if valid: self.log_audit(f"RSS-Bridge ({b})", True, f"Encontrados {len(valid)} enlaces.") - return [{"url": u, "context": "RSS-Bridge", "timestamp": datetime.now(MADRID_TZ).isoformat()} for u in valid] + return [{"url": u, "context": "RSS", "timestamp": datetime.now(MADRID_TZ).isoformat()} for u in valid] except: continue - return [] - - async def fetch_links_since(self, since_date: datetime) -> list[dict]: - all_results = [] - target_user_id = "1387348141150670850" - - self.log_audit("Twikit API", None, "Intentando bypass con ID directo...") - if await self._authenticate(): - try: - tweets = await self.client.get_user_tweets(target_user_id, 'Tweets') - if tweets: - for t in tweets: - tweet_date = t.created_at_datetime.astimezone(MADRID_TZ) - if tweet_date < since_date: break - txt = t.full_text if hasattr(t, 'full_text') else t.text - for u in self._extract_urls_from_text(txt): - if "x.com" not in u and "twitter.com" not in u: - all_results.append({"url": u, "context": txt, "timestamp": tweet_date.isoformat()}) - if all_results: - self.log_audit("Twikit API", True, f"Extraídos {len(all_results)} enlaces.") - return all_results - except Exception as e: - self.log_audit("Twikit API", False, "Bloqueo KEY_BYTE persistente.") - - links = await self._fetch_via_rss_bridge() - if links: return links - - self.log_audit("Wayback Deep", None, "Buscando histórico profundo...") - from_ts = since_date.strftime("%Y%m%d") - try: - async with aiohttp.ClientSession() as session: - async with session.get(f"https://web.archive.org/cdx/search/cdx?url=twitter.com/{self.target_account}&output=json&from={from_ts}&limit=5", timeout=20) as resp: - if resp.status == 200: - snaps = await resp.json() - if len(snaps) > 1: - latest = snaps[-1][1] - async with session.get(f"https://web.archive.org/web/{latest}/https://twitter.com/{self.target_account}") as s_resp: - urls = self._extract_urls_from_text(await s_resp.text()) - valid = [u for u in urls if all(x not in u for x in ["x.com", "twitter.com", "t.co", "archive.org"])] - if valid: - self.log_audit("Wayback Deep", True, f"Recuperados {len(valid)} históricos.") - return [{"url": u, "context": "Wayback", "timestamp": datetime.now(MADRID_TZ).isoformat()} for u in valid] - except: pass return []