feat: implement advanced search strategy for robust historical extraction

This commit is contained in:
Nubenetes Bot
2026-05-11 00:33:21 +02:00
parent 7459d9a07c
commit 21a892ca40

View File

@@ -75,7 +75,13 @@ class SocialDataExtractor:
await context.add_cookies(formatted)
except: pass
await page.goto(f"https://x.com/{self.target_account}", wait_until="domcontentloaded", timeout=90000)
import urllib.parse
search_query = f"from:{self.target_account} since:{since_date.date().isoformat()}"
encoded_query = urllib.parse.quote(search_query)
search_url = f"https://x.com/search?q={encoded_query}&f=live"
self.log_audit("Advanced Search", None, f"Query: {search_query}")
await page.goto(search_url, wait_until="domcontentloaded", timeout=90000)
await asyncio.sleep(15)
stop_scrolling = False
@@ -87,6 +93,10 @@ class SocialDataExtractor:
while not stop_scrolling and scroll_count < max_scrolls:
articles = await page.query_selector_all('article[data-testid="tweet"]')
if not articles and scroll_count > 5:
self.log_audit("Extraction", False, "No se detectan más tweets en la búsqueda.")
break
for article in articles:
# 1. Ignorar Pinned Posts (Post Fijo)
social_context = await article.query_selector('[data-testid="socialContext"]')
@@ -138,7 +148,7 @@ class SocialDataExtractor:
scroll_count += 1
if not stop_scrolling and scroll_count >= max_scrolls:
self.log_audit("Scrolling", False, f"Alcanzado límite de scrolls ({max_scrolls}) sin llegar a la fecha objetivo.")
self.log_audit("Scrolling", False, f"Alcanzado límite de scrolls ({max_scrolls}) en búsqueda avanzada.")
await browser.close()