feat: add interactive extraction strategy selection to GitHub Workflow

2026-07-28 09:32:20 +00:00 · 2026-05-11 08:39:34 +02:00
parent 5deb0f7f28
commit c58046502d
3 changed files with 40 additions and 21 deletions
--- a/.github/workflows/agentic_cron.yml
+++ b/.github/workflows/agentic_cron.yml
@@ -4,6 +4,18 @@ on:
  schedule:
    - cron: '0 5 * * 0'
  workflow_dispatch:
+    inputs:
+      extraction_strategy:
+        description: 'Estrategia de Extracción'
+        required: true
+        default: 'scroll'
+        type: choice
+        options:
+          - scroll
+          - search
+        # Explicación para el usuario:
+        # scroll: MÁS EXHAUSTIVO. Simula navegación humana. Captura TODO, pero puede ser limitado por X en fechas muy antiguas.
+        # search: MÁS FIABLE PARA 2024. Usa búsqueda avanzada. Llega siempre a la fecha, pero el algoritmo de X puede filtrar posts.

 permissions:
  contents: write
@@ -35,6 +47,7 @@ jobs:
          TWITTER_COOKIES: ${{ secrets.TWITTER_COOKIES }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          EXTRACTION_STRATEGY: ${{ github.event.inputs.extraction_strategy || 'scroll' }}
          PYTHONPATH: .
        run: |
          python src/main.py
--- a/src/ingestion_twikit.py
+++ b/src/ingestion_twikit.py
@@ -40,7 +40,7 @@ class SocialDataExtractor:
                valid_urls.append(u)
        return list(set(valid_urls))

-    async def _fetch_via_playwright(self, since_date: datetime) -> list[dict]:
+    async def _fetch_via_playwright(self, since_date: datetime, strategy: str = "scroll") -> list[dict]:
        try:
            from playwright.async_api import async_playwright
            import playwright_stealth
@@ -48,7 +48,7 @@ class SocialDataExtractor:
            self.log_audit("Playwright", False, "Librerías no disponibles.")
            return []
        
-        self.log_audit("Playwright Browser", None, f"Cronología: Desde {since_date.date()} hasta hoy.")
+        self.log_audit(f"Playwright ({strategy})", None, f"Cronología: Desde {since_date.date()} hasta hoy.")
        results = []
        
        try:
@@ -75,13 +75,17 @@ class SocialDataExtractor:
                        await context.add_cookies(formatted)
                    except: pass

-                import urllib.parse
-                search_query = f"from:{self.target_account} since:{since_date.date().isoformat()}"
-                encoded_query = urllib.parse.quote(search_query)
-                search_url = f"https://x.com/search?q={encoded_query}&f=live"
-                
-                self.log_audit("Advanced Search", None, f"Query: {search_query}")
-                await page.goto(search_url, wait_until="domcontentloaded", timeout=90000)
+                if strategy == "search":
+                    import urllib.parse
+                    search_query = f"from:{self.target_account} since:{since_date.date().isoformat()}"
+                    encoded_query = urllib.parse.quote(search_query)
+                    target_url = f"https://x.com/search?q={encoded_query}&f=live"
+                    self.log_audit("Advanced Search", None, f"Query: {search_query}")
+                else:
+                    target_url = f"https://x.com/{self.target_account}"
+                    self.log_audit("Profile Scroll", None, "Navegando al muro directo.")
+
+                await page.goto(target_url, wait_until="domcontentloaded", timeout=90000)
                await asyncio.sleep(15)
                
                stop_scrolling = False
@@ -94,16 +98,17 @@ class SocialDataExtractor:
                    articles = await page.query_selector_all('article[data-testid="tweet"]')
                    
                    if not articles and scroll_count > 5:
-                        self.log_audit("Extraction", False, "No se detectan más tweets en la búsqueda.")
+                        self.log_audit("Extraction", False, "No se detectan más tweets en el DOM.")
                        break

                    for article in articles:
-                        # 1. Ignorar Pinned Posts (Post Fijo)
-                        social_context = await article.query_selector('[data-testid="socialContext"]')
-                        if social_context:
-                            sc_text = await social_context.inner_text()
-                            if "Fijado" in sc_text or "Pinned" in sc_text:
-                                continue
+                        # 1. Ignorar Pinned Posts (Solo en Profile Scroll)
+                        if strategy == "scroll":
+                            social_context = await article.query_selector('[data-testid="socialContext"]')
+                            if social_context:
+                                sc_text = await social_context.inner_text()
+                                if "Fijado" in sc_text or "Pinned" in sc_text:
+                                    continue

                        # 2. Extraer Fecha
                        time_el = await article.query_selector('time')
@@ -135,7 +140,7 @@ class SocialDataExtractor:
                                collected_tweets[u] = {
                                    "url": u, "context": tweet_text[:200], 
                                    "timestamp": tweet_dt.isoformat(),
-                                    "source_type": "X.com (@nubenetes)"
+                                    "source_type": f"X.com ({strategy})"
                                }
                                if len(collected_tweets) >= target_link_count:
                                    stop_scrolling = True
@@ -148,7 +153,7 @@ class SocialDataExtractor:
                    scroll_count += 1
                
                if not stop_scrolling and scroll_count >= max_scrolls:
-                    self.log_audit("Scrolling", False, f"Alcanzado límite de scrolls ({max_scrolls}) en búsqueda avanzada.")
+                    self.log_audit("Scrolling", False, f"Alcanzado límite de scrolls ({max_scrolls}) usando {strategy}.")
                
                await browser.close()
                
@@ -161,8 +166,8 @@ class SocialDataExtractor:
            self.log_audit("Playwright", False, str(e)[:60])
        return []

-    async def fetch_links_since(self, since_date: datetime) -> list[dict]:
-        play_links = await self._fetch_via_playwright(since_date)
+    async def fetch_links_since(self, since_date: datetime, strategy: str = "scroll") -> list[dict]:
+        play_links = await self._fetch_via_playwright(since_date, strategy=strategy)
        if play_links: 
            self.log_audit("Estrategia Playwright", True, f"Recuperados {len(play_links)} bookmarks ordenados cronológicamente.")
            return play_links
--- a/src/main.py
+++ b/src/main.py
@@ -21,8 +21,9 @@ async def master_orchestrator():
    print(f"[*] FORZANDO CURADURÍA HISTÓRICA desde: {time_horizon.date()}")

    # 2. Ingesta Multi-fuente
+    strategy = os.getenv("EXTRACTION_STRATEGY", "scroll")
    twitter_client = SocialDataExtractor()
-    raw_social = await twitter_client.fetch_links_since(time_horizon)
+    raw_social = await twitter_client.fetch_links_since(time_horizon, strategy=strategy)
    x_audit_trail = twitter_client.audit_trail
    
    print("[*] Buscando novedades en GitHub Trending...")