mirror of
https://github.com/nubenetes/awesome-kubernetes.git
synced 2026-05-23 09:33:33 +00:00
feat: switch to Playwright stealth browser for robust X.com extraction
This commit is contained in:
3
.github/workflows/agentic_cron.yml
vendored
3
.github/workflows/agentic_cron.yml
vendored
@@ -24,7 +24,8 @@ jobs:
|
||||
- name: Instalación de dependencias (LIGERO Y ROBUSTO)
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install --no-cache-dir pydantic PyGithub aiohttp beautifulsoup4 httpx fake-useragent pytz python-dotenv twikit>=2.1.2
|
||||
# The version specifier must be quoted: unquoted, the shell parses ">=2.1.2"
# as "redirect stdout to a file named =2.1.2", so the constraint is dropped.
pip install --no-cache-dir pydantic PyGithub aiohttp beautifulsoup4 httpx fake-useragent pytz python-dotenv "twikit>=2.1.2" playwright playwright-stealth
|
||||
playwright install chromium --with-deps
|
||||
|
||||
- name: Ejecución de la Canalización Agéntica Integral
|
||||
env:
|
||||
|
||||
@@ -18,12 +18,7 @@ class SocialDataExtractor:
|
||||
self.audit_trail = []
|
||||
self.user_agents = [
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
|
||||
'Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
|
||||
]
|
||||
self.nitter_instances = [
|
||||
"nitter.net", "nitter.cz", "nitter.it", "nitter.privacydev.net",
|
||||
"nitter.d420.me", "nitter.perpmode.com", "nitter.esmailelbob.xyz"
|
||||
'Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1'
|
||||
]
|
||||
|
||||
def log_audit(self, method: str, success: Optional[bool], msg: str):
|
||||
@@ -35,76 +30,80 @@ class SocialDataExtractor:
|
||||
def _extract_urls_from_text(self, text: str) -> list[str]:
|
||||
return list(set(re.findall(r'https?://[^\s<>\"]+|www\.[^\s<>\"]+', text)))
|
||||
|
||||
async def _authenticate(self) -> bool:
|
||||
env_cookies = os.getenv("TWITTER_COOKIES")
|
||||
if env_cookies:
|
||||
try:
|
||||
raw_data = json.loads(env_cookies)
|
||||
cookies_dict = {c['name']: c['value'] for c in raw_data} if isinstance(raw_data, list) else raw_data
|
||||
with open(self.cookies_file, 'w') as f: json.dump(cookies_dict, f)
|
||||
self.client.load_cookies(self.cookies_file)
|
||||
self.log_audit("Auth Cookies", True, "Sesión inyectada correctamente.")
|
||||
return True
|
||||
except Exception as e:
|
||||
self.log_audit("Auth Cookies", False, str(e)[:50])
|
||||
return False
|
||||
async def _fetch_via_playwright(self, since_date: datetime) -> list[dict]:
|
||||
"""Estrategia Definitiva: Navegador Real con Playwright."""
|
||||
try:
|
||||
from playwright.async_api import async_playwright
|
||||
from playwright_stealth import stealth_async
|
||||
except ImportError:
|
||||
self.log_audit("Playwright", False, "Librerías no instaladas.")
|
||||
return []
|
||||
|
||||
self.log_audit("Playwright Browser", None, "Lanzando navegador real (Stealth Mode)...")
|
||||
results = []
|
||||
try:
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
context = await browser.new_context(user_agent=self.user_agents[0])
|
||||
page = await context.new_page()
|
||||
await stealth_async(page)
|
||||
|
||||
env_cookies = os.getenv("TWITTER_COOKIES")
|
||||
if env_cookies:
|
||||
try:
|
||||
cookies = json.loads(env_cookies)
|
||||
formatted_cookies = []
|
||||
for c in cookies:
|
||||
if isinstance(c, dict) and 'name' in c and 'value' in c:
|
||||
# Playwright necesita dominio sin el punto inicial a veces, o con el dominio correcto
|
||||
c['domain'] = c.get('domain', '.x.com')
|
||||
# Eliminar campos incompatibles si existen
|
||||
for k in ['sameSite', 'storeId']: c.pop(k, None)
|
||||
formatted_cookies.append(c)
|
||||
await context.add_cookies(formatted_cookies)
|
||||
self.log_audit("Playwright", True, "Cookies inyectadas.")
|
||||
except: pass
|
||||
|
||||
async def _fetch_via_rss_bridge(self) -> list[dict]:
|
||||
bridges = ["rssbridge.org", "rss.idoc.pub", "bridge.the-pankratz.de"]
|
||||
await page.goto(f"https://x.com/{self.target_account}", wait_until="networkidle", timeout=60000)
|
||||
await asyncio.sleep(10)
|
||||
|
||||
# Scroll para cargar contenido
|
||||
for i in range(3):
|
||||
content = await page.content()
|
||||
urls = self._extract_urls_from_text(content)
|
||||
for u in urls:
|
||||
if all(x not in u for x in ["x.com", "twitter.com", "t.co", "abs.twimg"]):
|
||||
results.append({"url": u, "context": "Playwright Scrape", "timestamp": datetime.now(MADRID_TZ).isoformat()})
|
||||
await page.evaluate("window.scrollBy(0, 1000)")
|
||||
await asyncio.sleep(3)
|
||||
|
||||
await browser.close()
|
||||
return results
|
||||
except Exception as e:
|
||||
self.log_audit("Playwright", False, f"Error: {str(e)[:50]}")
|
||||
return []
|
||||
|
||||
async def fetch_links_since(self, since_date: datetime) -> list[dict]:
|
||||
# 1. Intentar Playwright (Navegador Real)
|
||||
play_links = await self._fetch_via_playwright(since_date)
|
||||
if play_links:
|
||||
self.log_audit("Estrategia Playwright", True, f"Encontrados {len(play_links)} recursos.")
|
||||
return play_links
|
||||
|
||||
# 2. RSS-Bridge Fallback
|
||||
self.log_audit("RSS Fallback", None, "Intentando vía RSS-Bridge...")
|
||||
bridges = ["rssbridge.org", "rss.idoc.pub"]
|
||||
for b in bridges:
|
||||
url = f"https://{b}/?action=display&bridge=TwitterBridge&context=By+username&user={self.target_account}&format=Mrss"
|
||||
try:
|
||||
async with aiohttp.ClientSession(headers={"User-Agent": random.choice(self.user_agents)}) as session:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(url, timeout=20) as resp:
|
||||
if resp.status == 200:
|
||||
urls = self._extract_urls_from_text(await resp.text())
|
||||
valid = [u for u in urls if all(x not in u for x in ["x.com", "twitter.com", "t.co", b])]
|
||||
if valid:
|
||||
self.log_audit(f"RSS-Bridge ({b})", True, f"Encontrados {len(valid)} enlaces.")
|
||||
return [{"url": u, "context": "RSS-Bridge", "timestamp": datetime.now(MADRID_TZ).isoformat()} for u in valid]
|
||||
return [{"url": u, "context": "RSS", "timestamp": datetime.now(MADRID_TZ).isoformat()} for u in valid]
|
||||
except: continue
|
||||
return []
|
||||
|
||||
async def fetch_links_since(self, since_date: datetime) -> list[dict]:
    """Collect external links posted by the target account since ``since_date``.

    Sources are tried in order and the first one that yields links wins:

    1. Twikit API, only when ``_authenticate`` succeeds. Tweets of a
       hard-coded user id are scanned and the scan stops at the first tweet
       older than ``since_date``.
    2. ``_fetch_via_rss_bridge``.
    3. Wayback Machine: the last of up to five CDX rows for the account's
       twitter.com page, starting from ``since_date``'s day.

    Args:
        since_date: Lower bound for tweets. It is compared with
            timezone-aware tweet datetimes, so it presumably has to be
            timezone-aware as well -- TODO confirm against the caller.

    Returns:
        A list of dicts with ``url``, ``context`` and ``timestamp`` keys, or
        an empty list when no source produced links. Failures inside the
        Twikit and Wayback steps are recorded through ``log_audit`` instead
        of being raised.
    """
    all_results = []
    target_user_id = "1387348141150670850"

    self.log_audit("Twikit API", None, "Intentando bypass con ID directo...")
    if await self._authenticate():
        try:
            tweets = await self.client.get_user_tweets(target_user_id, 'Tweets')
            if tweets:
                for t in tweets:
                    tweet_date = t.created_at_datetime.astimezone(MADRID_TZ)
                    if tweet_date < since_date:
                        break
                    txt = t.full_text if hasattr(t, 'full_text') else t.text
                    for u in self._extract_urls_from_text(txt):
                        if "x.com" not in u and "twitter.com" not in u:
                            all_results.append({"url": u, "context": txt, "timestamp": tweet_date.isoformat()})
                if all_results:
                    self.log_audit("Twikit API", True, f"Extraídos {len(all_results)} enlaces.")
                    return all_results
        except Exception as e:
            # Report the real error; a fixed "KEY_BYTE" message would
            # mislabel every other failure (network, parsing, auth expiry).
            self.log_audit("Twikit API", False, f"Error: {str(e)[:50]}")

    links = await self._fetch_via_rss_bridge()
    if links:
        return links

    self.log_audit("Wayback Deep", None, "Buscando histórico profundo...")
    from_ts = since_date.strftime("%Y%m%d")
    cdx_url = (
        f"https://web.archive.org/cdx/search/cdx?url=twitter.com/{self.target_account}"
        f"&output=json&from={from_ts}&limit=5"
    )
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(cdx_url, timeout=20) as resp:
                if resp.status == 200:
                    snaps = await resp.json()
                    # NOTE(review): assumes row 0 of the CDX JSON is a header
                    # and column 1 is the snapshot timestamp -- confirm
                    # against the CDX API documentation.
                    if len(snaps) > 1:
                        latest = snaps[-1][1]
                        snapshot_url = f"https://web.archive.org/web/{latest}/https://twitter.com/{self.target_account}"
                        # Same timeout as the CDX call so a stalled snapshot
                        # download cannot hang the pipeline.
                        async with session.get(snapshot_url, timeout=20) as s_resp:
                            # Only parse real snapshots, not error pages.
                            if s_resp.status == 200:
                                urls = self._extract_urls_from_text(await s_resp.text())
                                valid = [u for u in urls if all(x not in u for x in ["x.com", "twitter.com", "t.co", "archive.org"])]
                                if valid:
                                    self.log_audit("Wayback Deep", True, f"Recuperados {len(valid)} históricos.")
                                    return [{"url": u, "context": "Wayback", "timestamp": datetime.now(MADRID_TZ).isoformat()} for u in valid]
    except Exception as e:
        # Still best-effort, but leave a trace instead of swallowing silently.
        self.log_audit("Wayback Deep", False, f"Error: {str(e)[:50]}")

    return []
|
||||
|
||||
Reference in New Issue
Block a user