fix: ensure PR creation with technical commit and deepen extraction scroll

This commit is contained in:
Nubenetes Bot
2026-05-10 22:22:11 +02:00
parent e50cd3afd4
commit 591f006a53
2 changed files with 17 additions and 5 deletions

View File

@@ -16,6 +16,14 @@ class RepositoryController:
branch_name = f"bot/knowledge-update-{timestamp_slug}"
self._create_feature_branch(branch_name)
# Si no hay cambios en Markdowns, añadimos un cambio técnico para permitir abrir la PR con el reporte
if not updates:
updates["src/memory/last_audit_run.json"] = json.dumps({
"timestamp": metrics.get("end_date"),
"total_extracted": metrics.get("total_extracted"),
"status": "Audit Only (No new links injected)"
}, indent=2)
for file_path, content in updates.items():
try:
commit_signature = f"chore: update {file_path} [{timestamp_slug}]"
@@ -38,15 +46,19 @@ class RepositoryController:
full_report = metrics.get('full_report', [])
# 1. Tabla Matricial de Auditoría
# Limitamos la tabla si es muy larga para evitar errores de API de GitHub
matrix_table = "### 📋 Matriz de Auditoría de Enlaces (Full Extraction)\n"
matrix_table += "| Estado | Motivo | Categoría | URL |\n| :--- | :--- | :--- | :--- |\n"
counts = {"INCLUDED": 0, "DUPLICATE": 0, "FILTERED": 0}
for item in full_report:
for item in full_report[:200]: # Mostrar solo los primeros 200 para no romper el límite de caracteres del PR
status_emoji = {"INCLUDED": "", "DUPLICATE": "👯", "FILTERED": "🛡️"}.get(item['status'], "")
matrix_table += f"| {status_emoji} {item['status']} | {item['reason']} | `{item['category']}` | {item['url']} |\n"
counts[item['status']] = counts.get(item['status'], 0) + 1
if len(full_report) > 200:
matrix_table += f"\n> 💡 *... y {len(full_report) - 200} enlaces más procesados.*"
# 2. Diagrama Mermaid
mermaid_pie = "### 📊 Métricas de Decisión\n```mermaid\npie title Distribución de Decisión Agéntica\n"
mermaid_pie += f" \"Aceptados (Inyectados)\" : {counts['INCLUDED']}\n"

View File

@@ -71,14 +71,14 @@ class SocialDataExtractor:
await page.goto(f"https://x.com/{self.target_account}", wait_until="domcontentloaded", timeout=60000)
await asyncio.sleep(8)
for _ in range(4): # Scroll moderado
for _ in range(10): # Scroll profundo para histórico
html = await page.content()
urls = self._extract_urls_from_text(html)
for u in urls:
if all(x not in u for x in ["x.com", "twitter.com", "t.co", "abs.twimg", "archive.org"]):
if all(x not in u for x in ["x.com", "twitter.com", "t.co", "abs.twimg", "archive.org", "pbs.twimg"]):
results.append({"url": u, "context": "Playwright Browser", "timestamp": datetime.now(MADRID_TZ).isoformat()})
await page.evaluate("window.scrollBy(0, 1200)")
await asyncio.sleep(4)
await page.evaluate("window.scrollBy(0, 2000)")
await asyncio.sleep(5)
await browser.close()
return results