From c58046502d83ba6c37735efe36bfe1ea77bfd5de Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Mon, 11 May 2026 08:39:34 +0200 Subject: [PATCH] feat: add interactive extraction strategy selection to GitHub Workflow --- .github/workflows/agentic_cron.yml | 13 +++++++++ src/ingestion_twikit.py | 45 +++++++++++++++++------------- src/main.py | 3 +- 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/.github/workflows/agentic_cron.yml b/.github/workflows/agentic_cron.yml index b10a6283..b2a607f0 100644 --- a/.github/workflows/agentic_cron.yml +++ b/.github/workflows/agentic_cron.yml @@ -4,6 +4,18 @@ on: schedule: - cron: '0 5 * * 0' workflow_dispatch: + inputs: + extraction_strategy: + description: 'Estrategia de Extracción' + required: true + default: 'scroll' + type: choice + options: + - scroll + - search + # Explicación para el usuario: + # scroll: MÁS EXHAUSTIVO. Simula navegación humana. Captura TODO, pero puede ser limitado por X en fechas muy antiguas. + # search: MÁS FIABLE PARA 2024. Usa búsqueda avanzada. Llega siempre a la fecha, pero el algoritmo de X puede filtrar posts. permissions: contents: write @@ -35,6 +47,7 @@ jobs: TWITTER_COOKIES: ${{ secrets.TWITTER_COOKIES }} GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + EXTRACTION_STRATEGY: ${{ github.event.inputs.extraction_strategy || 'scroll' }} PYTHONPATH: . run: | python src/main.py diff --git a/src/ingestion_twikit.py b/src/ingestion_twikit.py index 22f1b8a2..7969f835 100644 --- a/src/ingestion_twikit.py +++ b/src/ingestion_twikit.py @@ -40,7 +40,7 @@ class SocialDataExtractor: valid_urls.append(u) return list(set(valid_urls)) - async def _fetch_via_playwright(self, since_date: datetime) -> list[dict]: + async def _fetch_via_playwright(self, since_date: datetime, strategy: str = "scroll") -> list[dict]: try: from playwright.async_api import async_playwright import playwright_stealth @@ -48,7 +48,7 @@ class SocialDataExtractor: self.log_audit("Playwright", False, "Librerías no disponibles.") return [] - self.log_audit("Playwright Browser", None, f"Cronología: Desde {since_date.date()} hasta hoy.") + self.log_audit(f"Playwright ({strategy})", None, f"Cronología: Desde {since_date.date()} hasta hoy.") results = [] try: @@ -75,13 +75,17 @@ class SocialDataExtractor: await context.add_cookies(formatted) except: pass - import urllib.parse - search_query = f"from:{self.target_account} since:{since_date.date().isoformat()}" - encoded_query = urllib.parse.quote(search_query) - search_url = f"https://x.com/search?q={encoded_query}&f=live" - - self.log_audit("Advanced Search", None, f"Query: {search_query}") - await page.goto(search_url, wait_until="domcontentloaded", timeout=90000) + if strategy == "search": + import urllib.parse + search_query = f"from:{self.target_account} since:{since_date.date().isoformat()}" + encoded_query = urllib.parse.quote(search_query) + target_url = f"https://x.com/search?q={encoded_query}&f=live" + self.log_audit("Advanced Search", None, f"Query: {search_query}") + else: + target_url = f"https://x.com/{self.target_account}" + self.log_audit("Profile Scroll", None, "Navegando al muro directo.") + + await page.goto(target_url, wait_until="domcontentloaded", timeout=90000) await asyncio.sleep(15) stop_scrolling = False @@ -94,16 +98,17 @@ class SocialDataExtractor: articles = await page.query_selector_all('article[data-testid="tweet"]') if not articles and scroll_count > 5: - self.log_audit("Extraction", False, "No se detectan más tweets en la búsqueda.") + self.log_audit("Extraction", False, "No se detectan más tweets en el DOM.") break for article in articles: - # 1. Ignorar Pinned Posts (Post Fijo) - social_context = await article.query_selector('[data-testid="socialContext"]') - if social_context: - sc_text = await social_context.inner_text() - if "Fijado" in sc_text or "Pinned" in sc_text: - continue + # 1. Ignorar Pinned Posts (Solo en Profile Scroll) + if strategy == "scroll": + social_context = await article.query_selector('[data-testid="socialContext"]') + if social_context: + sc_text = await social_context.inner_text() + if "Fijado" in sc_text or "Pinned" in sc_text: + continue # 2. Extraer Fecha time_el = await article.query_selector('time') @@ -135,7 +140,7 @@ class SocialDataExtractor: collected_tweets[u] = { "url": u, "context": tweet_text[:200], "timestamp": tweet_dt.isoformat(), - "source_type": "X.com (@nubenetes)" + "source_type": f"X.com ({strategy})" } if len(collected_tweets) >= target_link_count: stop_scrolling = True @@ -148,7 +153,7 @@ class SocialDataExtractor: scroll_count += 1 if not stop_scrolling and scroll_count >= max_scrolls: - self.log_audit("Scrolling", False, f"Alcanzado límite de scrolls ({max_scrolls}) en búsqueda avanzada.") + self.log_audit("Scrolling", False, f"Alcanzado límite de scrolls ({max_scrolls}) usando {strategy}.") await browser.close() @@ -161,8 +166,8 @@ class SocialDataExtractor: self.log_audit("Playwright", False, str(e)[:60]) return [] - async def fetch_links_since(self, since_date: datetime) -> list[dict]: - play_links = await self._fetch_via_playwright(since_date) + async def fetch_links_since(self, since_date: datetime, strategy: str = "scroll") -> list[dict]: + play_links = await self._fetch_via_playwright(since_date, strategy=strategy) if play_links: self.log_audit("Estrategia Playwright", True, f"Recuperados {len(play_links)} bookmarks ordenados cronológicamente.") return play_links diff --git a/src/main.py b/src/main.py index 92bdbde6..4e016c80 100644 --- a/src/main.py +++ b/src/main.py @@ -21,8 +21,9 @@ async def master_orchestrator(): print(f"[*] FORZANDO CURADURÍA HISTÓRICA desde: {time_horizon.date()}") # 2. Ingesta Multi-fuente + strategy = os.getenv("EXTRACTION_STRATEGY", "scroll") twitter_client = SocialDataExtractor() - raw_social = await twitter_client.fetch_links_since(time_horizon) + raw_social = await twitter_client.fetch_links_since(time_horizon, strategy=strategy) x_audit_trail = twitter_client.audit_trail print("[*] Buscando novedades en GitHub Trending...")