Mirror of https://github.com/nubenetes/awesome-kubernetes.git (synced 2026-05-22 00:53:37 +00:00)
perf: optimize extraction window and scrolling to prevent workflow timeouts
This commit is contained in:
@@ -90,14 +90,14 @@ class SocialDataExtractor:
|
||||
|
||||
stop_scrolling = False
|
||||
scroll_count = 0
|
||||
max_scrolls = 300
|
||||
max_scrolls = 100 # Reducido de 300
|
||||
collected_tweets = {} # URL -> tweet_data para evitar duplicados en scroll
|
||||
target_link_count = 1000
|
||||
target_link_count = 300 # Reducido de 1000
|
||||
|
||||
while not stop_scrolling and scroll_count < max_scrolls:
|
||||
articles = await page.query_selector_all('article[data-testid="tweet"]')
|
||||
|
||||
if not articles and scroll_count > 5:
|
||||
if not articles and scroll_count > 3:
|
||||
self.log_audit("Extraction", False, "No se detectan más tweets en el DOM.")
|
||||
break
|
||||
|
||||
@@ -118,6 +118,7 @@ class SocialDataExtractor:
|
||||
tweet_dt = datetime.fromisoformat(datetime_str.replace('Z', '+00:00')).astimezone(MADRID_TZ)
|
||||
|
||||
if tweet_dt < since_date:
|
||||
self.log_audit("Timeline", True, f"Alcanzado horizonte temporal: {tweet_dt.date()}")
|
||||
stop_scrolling = True
|
||||
break
|
||||
|
||||
@@ -148,8 +149,9 @@ class SocialDataExtractor:
|
||||
if stop_scrolling: break
|
||||
|
||||
if stop_scrolling: break
|
||||
await page.evaluate("window.scrollBy(0, 5000)")
|
||||
await asyncio.sleep(8)
|
||||
# Scroll más agresivo y menos esperas
|
||||
await page.evaluate("window.scrollBy(0, 8000)")
|
||||
await asyncio.sleep(5)
|
||||
scroll_count += 1
|
||||
|
||||
if not stop_scrolling and scroll_count >= max_scrolls:
|
||||
|
||||
src/main.py (14 lines changed)
@@ -16,12 +16,18 @@ async def master_orchestrator():
|
||||
|
||||
print("[*] INICIANDO CURADURÍA AGÉNTICA (CRONOLOGÍA Y TRANSPARENCIA)")
|
||||
|
||||
# 1. Horizonte Temporal Fijo (Octubre 2024) - Requisito de Curaduría Histórica
|
||||
time_horizon = datetime(2024, 10, 1, 0, 0, tzinfo=MADRID_TZ)
|
||||
print(f"[*] FORZANDO CURADURÍA HISTÓRICA desde: {time_horizon.date()}")
|
||||
# 1. Horizonte Temporal Dinámico
|
||||
# Por defecto, solo buscamos los últimos 30 días para evitar Timeouts de 6h.
|
||||
# Si se requiere curaduría histórica, se puede pasar vía variable de entorno.
|
||||
days_back = int(os.getenv("CURATION_DAYS_BACK", "30"))
|
||||
time_horizon = datetime.now(MADRID_TZ) - timedelta(days=days_back)
|
||||
|
||||
if days_back > 60:
|
||||
print(f"[*] ALERTA: Ejecutando Curaduría Histórica Profunda ({days_back} días).")
|
||||
print(f"[*] Horizonte temporal: {time_horizon.date()}")
|
||||
|
||||
# 2. Ingesta Multi-fuente
|
||||
strategy = os.getenv("EXTRACTION_STRATEGY", "scroll")
|
||||
strategy = os.getenv("EXTRACTION_STRATEGY", "search") # Cambiamos default a 'search' por ser más rápido
|
||||
twitter_client = SocialDataExtractor()
|
||||
raw_social = await twitter_client.fetch_links_since(time_horizon, strategy=strategy)
|
||||
x_audit_trail = twitter_client.audit_trail
|
||||
|
||||
Reference in New Issue
Block a user