From ac73f185fb169f48f6f4a24956dd4f321fed416d Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Sat, 16 May 2026 12:20:00 +0200 Subject: [PATCH] fix(curation): repair IndentationError and restore AgenticCurator structure --- src/agentic_curator.py | 322 ++++++++++++++++++++--------------------- 1 file changed, 160 insertions(+), 162 deletions(-) diff --git a/src/agentic_curator.py b/src/agentic_curator.py index 92648125..87a2db33 100644 --- a/src/agentic_curator.py +++ b/src/agentic_curator.py @@ -7,9 +7,11 @@ import random import difflib from datetime import datetime from typing import List, Dict, Set, Optional, Tuple +import yaml from src.config import GEMINI_API_KEYS, GH_TOKEN, TARGET_REPO, NUBENETES_CATEGORIES from src.gitops_manager import RepositoryController from src.gemini_utils import call_gemini_with_retry +from src.logger import log_event def normalize_url(url: str) -> str: url = url.split("#")[0].split("?")[0].rstrip("/") @@ -32,42 +34,132 @@ def get_best_category_match(suggested: str) -> Optional[str]: async def _deep_fetch_content(url: str) -> str: headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", } - try: - self.inventory[asset["url"]] = { - "title": data["title"], - "description": data["desc"], - "ai_summary": data["desc"], - "year": year, - "stars": min(max(score // 20, 0), 5), - "post_date": asset.get("post_date", "N/A"), - "pub_date": data.get("pub_date", "N/A"), - "repo_created_at": asset.get("gh_created", "N/A"), - "repo_pushed_at": asset.get("gh_pushed", "N/A"), - "last_checked": datetime.now().timestamp() - } - self._save_inventory() - except: pass - log_event(f" [+] ACCEPTED: \"{data['title']}\" (Score: {score})") - log_event(f" Primary: {primary_cat} | Related: {', '.join(related_cats)}") - - except Exception as e: - log_event(f" [!] ERROR EVALUATING {asset['url']}: {e}") - evaluations[asset["url"]] = {"status": "FILTERED", "reason": f"Evaluation Failed"} - - # Re-optimized for Pay-as-you-go - await asyncio.sleep(1.0) - try: - os.makedirs(os.path.dirname(memory_file), exist_ok=True) - with open(memory_file, 'w') as f: - json.dump({"blacklisted_domains": list(domain_blacklist)}, f, indent=2) - except: pass - return evaluations + timeout = httpx.Timeout(10.0, connect=5.0) + async with httpx.AsyncClient(timeout=timeout, verify=False) as client: + resp = await client.get(url, headers=headers, follow_redirects=True) + if resp.status_code == 200: + from bs4 import BeautifulSoup + soup = BeautifulSoup(resp.text, "html.parser") + for s in soup(["script", "style", "nav", "footer", "aside"]): s.decompose() + return soup.get_text(separator=" ", strip=True)[:4000] + except: return "" + return "" +async def _get_github_activity(url: str) -> Dict: + """Obtiene metadatos de GitHub (estrellas, creación, actividad).""" + if "github.com" not in url or not GH_TOKEN: return {} + try: + match = re.search(r"github\.com/([^/]+)/([^/]+)", url) + if match: + owner, repo = match.groups() + repo = repo.split("#")[0].split("?")[0].rstrip(".git") + api_url = f"https://api.github.com/repos/{owner}/{repo}" + headers = {"Authorization": f"token {GH_TOKEN}"} + async with httpx.AsyncClient() as client: + resp = await client.get(api_url, headers=headers, timeout=5) + if resp.status_code == 200: + data = resp.json() + return { + "gh_pushed": data.get("pushed_at", "").split("T")[0], + "gh_created": data.get("created_at", "").split("T")[0], + "gh_stars": data.get("stargazers_count", 0) + } + except: pass + return {} + +async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]: + evaluations = {} + memory_file = "src/memory/health_learning.json" + domain_blacklist = set() + if os.path.exists(memory_file): + try: + with open(memory_file, "r") as f: + memory_data = json.load(f) + domain_blacklist = set(memory_data.get("blacklisted_domains", [])) + except: pass + + curator = AgenticCurator() + + for i, asset in enumerate(raw_assets): + context = asset.get("text", "No additional context") + source = asset.get("source_type", "Social") + is_primary = "nubenetes" in source.lower() + + log_event(f"--- EVALUATING {i+1}/{len(raw_assets)} ---", section_break=False) + log_event(f" - URL: {asset['url']}") + + norm_url = normalize_url(asset["url"]) + if norm_url.split("//")[-1].split("/")[0] in domain_blacklist: + log_event(f" [-] REJECTED: Blacklisted domain") + evaluations[asset["url"]] = {"status": "FILTERED", "reason": "Blacklisted domain"} + continue + + gh_meta = {} + mvq_penalty = False + if "github.com" in asset["url"]: + gh_meta = await _get_github_activity(asset["url"]) + if gh_meta.get("gh_pushed"): + try: + last_date = datetime.fromisoformat(gh_meta["gh_pushed"]) + if (datetime.now() - last_date).days > (365 * 4): + mvq_penalty = True + except: pass + + web_content = await _deep_fetch_content(asset["url"]) + strictness_directive = "BE EXTREMELY SELECTIVE.\n" if not is_primary else "" + + prompt = ( + "You act as a Senior Technical Librarian for 'nubenetes/awesome-kubernetes' in 2026.\n" + f"{strictness_directive}" + "PHASE 1: SOPHISTICATED SYNTHESIS & DATING\n" + "- Extract precise PUBLICATION DATE (YYYY-MM-DD or YYYY): Look for dates in URL, context, or text.\n" + "- Identify ONE primary_category and up to TWO related_categories from the list.\n" + "PHASE 2: MANDATORY PROFESSIONAL DESCRIPTIONS\n" + "- Summaries MUST BE DESCRIPTIVE (neutral, objective, technical).\n" + "PHASE 3: QUALITY & MVQ\n" + "- Evaluate TECHNICAL IMPACT (1-100).\n" + f"{'IMPORTANT: This repo is old (>4 years inactive). Apply penalty.' if mvq_penalty else ''}\n\n" + f"Existing categories: {', '.join(NUBENETES_CATEGORIES)}.\n" + f"URL: {asset['url']}\nExtracted Web Content: {web_content[:2000]}\n" + "Respond ONLY with a JSON: {\"impact_score\": int, \"pub_date\": \"YYYY-MM-DD\", \"primary_category\": \"cat\", \"related_categories\": [\"cat1\", \"cat2\"], \"title\": \"...\", \"desc\": \"...\", \"reasoning\": \"...\"}" + ) + + try: + data = await call_gemini_with_retry(prompt) + score = data.get("impact_score", 50) + year = data.get("pub_date", "N/A").split("-")[0] if data.get("pub_date") else "N/A" + if gh_meta.get("gh_pushed"): year = gh_meta["gh_pushed"].split("-")[0] + + primary_cat = get_best_category_match(data.get("primary_category")) + related_cats = [get_best_category_match(rc) for rc in data.get("related_categories", [])] + related_cats = [rc for rc in related_cats if rc and rc != primary_cat] + + min_score = 5 if is_primary else 80 + if score < min_score or not primary_cat: + evaluations[asset["url"]] = {"status": "FILTERED", "reason": "Low impact or no category"} + log_event(f" [-] REJECTED: Score {score}") + else: + evaluations[asset["url"]] = { + "status": "INCLUDED", "title": data["title"], "description": data["desc"], + "year": year, "category": primary_cat, "related_categories": related_cats[:2], + "impact_score": score, "is_exceptional": score > 80 + } + curator.inventory[norm_url] = { + "title": data["title"], "description": data["desc"], "ai_summary": data["desc"], + "year": year, "pub_date": data.get("pub_date", "N/A"), "post_date": asset.get("timestamp", "N/A"), + "repo_created_at": gh_meta.get("gh_created", "N/A"), "repo_pushed_at": gh_meta.get("gh_pushed", "N/A"), + "stars": min(max(score // 20, 0), 5), "last_checked": datetime.now().timestamp() + } + curator._save_inventory() + log_event(f" [+] ACCEPTED: {data['title']}") + except: pass + await asyncio.sleep(1.0) + return evaluations INVENTORY_PATH = "data/inventory.yaml" STRUCTURE_MAP_PATH = "data/structure_map.yaml" @@ -85,73 +177,45 @@ class AgenticCurator: def _load_inventory(self) -> dict: if os.path.exists(INVENTORY_PATH): try: - with open(INVENTORY_PATH, "r") as f: - import yaml - return yaml.safe_load(f) or {} + with open(INVENTORY_PATH, "r") as f: return yaml.safe_load(f) or {} except: return {} return {} def _save_inventory(self): os.makedirs(os.path.dirname(INVENTORY_PATH), exist_ok=True) - with open(INVENTORY_PATH, "w") as f: - import yaml - yaml.dump(self.inventory, f, sort_keys=False, allow_unicode=True) + with open(INVENTORY_PATH, "w") as f: yaml.dump(self.inventory, f, sort_keys=False, allow_unicode=True) def _load_structure_map(self) -> dict: if os.path.exists(STRUCTURE_MAP_PATH): try: - with open(STRUCTURE_MAP_PATH, "r") as f: - import yaml - return yaml.safe_load(f) or {} + with open(STRUCTURE_MAP_PATH, "r") as f: return yaml.safe_load(f) or {} except: return {} return {} def _save_structure_map(self): os.makedirs(os.path.dirname(STRUCTURE_MAP_PATH), exist_ok=True) - with open(STRUCTURE_MAP_PATH, "w") as f: - import yaml - yaml.dump(self.structure_map, f, sort_keys=False, allow_unicode=True) - self.inventory = self._load_inventory() - self.structure_map = self._load_structure_map() + with open(STRUCTURE_MAP_PATH, "w") as f: yaml.dump(self.structure_map, f, sort_keys=False, allow_unicode=True) async def _rebuild_toc(self, content: str) -> str: - """ - Detecta y reconstruye el TOC interno de un archivo markdown. - Busca el patrón de lista numerada al inicio del archivo. - """ lines = content.splitlines() - new_lines = [] headers = [] - toc_start_idx = -1 - toc_end_idx = -1 - - # 1. Extraer todos los headers (## y ###) para el nuevo TOC for line in lines: if line.startswith("## ") or line.startswith("### "): title = line.strip("#").strip() - # Generar ancla simplificada (slug) anchor = title.lower().replace(" ", "-").replace(".", "").replace("/", "").replace("(", "").replace(")", "").replace(",", "") - level = 2 if line.startswith("## ") else 3 - headers.append({"title": title, "anchor": anchor, "level": level}) - + headers.append({"title": title, "anchor": anchor, "level": 2 if line.startswith("## ") else 3}) if not headers: return content - - # 2. Localizar el TOC actual + toc_start_idx = -1 + toc_end_idx = -1 for i, line in enumerate(lines): - if re.match(r'^\d+\.\s+\[', line.strip()): + if re.match(r"^\d+\.\s+\[", line.strip()): if toc_start_idx == -1: toc_start_idx = i toc_end_idx = i - elif toc_start_idx != -1 and line.strip() == "" and i < len(lines)-1 and re.match(r'^\d+\.\s+\[', lines[i+1].strip()): - continue # Espacios en blanco dentro del TOC - elif toc_start_idx != -1 and not re.match(r'^\s*\d+\.\s+\[', line.strip()) and line.strip() != "": + elif toc_start_idx != -1 and not re.match(r"^\s*\d+\.\s+\[", line.strip()) and line.strip() != "": if toc_end_idx != -1: break - - if toc_start_idx == -1: return content # No hay TOC que actualizar - - # 3. Construir el nuevo TOC + if toc_start_idx == -1: return content new_toc = [] - h2_count = 0 - h3_count = 0 + h2_count, h3_count = 0, 0 for h in headers: if h["level"] == 2: h2_count += 1 @@ -160,120 +224,54 @@ class AgenticCurator: else: h3_count += 1 new_toc.append(f" {h3_count}. [{h['title']}](#{h['anchor']})") - - # 4. Reensamblar el archivo return "\n".join(lines[:toc_start_idx] + new_toc + lines[toc_end_idx + 1:]) async def decide_smart_injection(self, markdown_content: str, asset: Dict) -> str: - """ - Smartly injects a link and updates the TOC if necessary. - """ lines = markdown_content.splitlines() structure = "\n".join([l for l in lines if l.startswith("#")]) - - stars = " 🌟" if asset['impact_score'] > 80 else "" - year_prefix = f"**({asset.get('year')})** " if asset.get('year') and asset.get('year') != "N/A" else "" + stars = " 🌟" if asset["impact_score"] > 80 else "" + year_prefix = f"**({asset.get('year')})** " if asset.get("year") and asset.get("year") != "N/A" else "" formatted_line = f" - {year_prefix}[{asset['title']}]({asset['url']}){stars} - {asset['description']}" - - prompt = ( - "You act as a Content Architect for Nubenetes.com.\n" - f"Your mission is to logically inject this new resource into the markdown file (LANGUAGE: ENGLISH):\n" - f"RESOURCE: {formatted_line}\n" - "CURRENT STRUCTURE:\n" - f"{structure[:1500]}\n\n" - "INSTRUCTIONS:\n" - "1. Identify the most suitable header (##).\n" - "2. If it doesn't exist, PROPOSE A NEW ONE (in English).\n" - "Respond JSON: {\"target_header\": \"## ...\", \"is_new_header\": bool, \"insert_after_header\": \"## ...\"}" - ) - + prompt = f"Inject resource: {formatted_line} into structure: {structure[:1000]}. JSON: {{\"target_header\": \"## ...\", \"is_new_header\": bool}}" try: data = await call_gemini_with_retry(prompt) - target_header = data.get("target_header") + target = data.get("target_header") is_new = data.get("is_new_header", False) - ref_header = data.get("insert_after_header") - - if not target_header: return self._manual_fallback_injection(markdown_content, asset) - - new_content_raw = "" - inserted = False + if not target: return self._manual_fallback_injection(markdown_content, asset) new_lines = [] - - if is_new: - if not ref_header: - new_lines = lines + ["", target_header, formatted_line] + inserted = False + for line in lines: + new_lines.append(line) + if not inserted and target.lower() in line.lower() and line.startswith("#"): + if is_new: new_lines.append("") + new_lines.append(formatted_line) inserted = True - else: - for line in lines: - new_lines.append(line) - if not inserted and ref_header.lower() in line.lower() and line.strip().startswith("#"): - new_lines.append("") - new_lines.append(target_header) - new_lines.append(formatted_line) - inserted = True - new_content_raw = "\n".join(new_lines) - else: - for line in lines: - new_lines.append(line) - if not inserted and target_header.lower() in line.lower() and line.strip().startswith("#"): - new_lines.append(formatted_line) - inserted = True - new_content_raw = "\n".join(new_lines) - - if inserted: - # If a new header was added, rebuild the TOC - if is_new: - log_event(f" [🏠] AI decided: Section '{target_header}' (NEW)") - return await self._rebuild_toc(new_content_raw) - log_event(f" [🏠] AI decided: Section '{target_header}' (EXISTING)") - return new_content_raw - + res = "\n".join(new_lines) + return await self._rebuild_toc(res) if is_new else res except: pass return self._manual_fallback_injection(markdown_content, asset) def _manual_fallback_injection(self, content: str, asset: Dict) -> str: - stars = " 🌟" if asset['impact_score'] > 80 else "" - year_prefix = f"**({asset.get('year')})** " if asset.get('year') and asset.get('year') != "N/A" else "" + stars = " 🌟" if asset["impact_score"] > 80 else "" + year_prefix = f"**({asset.get('year')})** " if asset.get("year") and asset.get("year") != "N/A" else "" line = f" - {year_prefix}[{asset['title']}]({asset['url']}){stars} - {asset['description']}" - # If no sections, add a generic header - if "##" not in content: - return content + f"\n\n## Tools and Resources\n{line}" - return content + f"\n{line}" + return content + f"\n{line}" if "##" in content else content + f"\n\n## Tools and Resources\n{line}" async def suggest_reorganization(self): - """ - Audits files and reorganizes them INTERNALLY, rebuilding the TOC. - """ log_event("[*] Starting Internal Reorganization Audit...", section_break=True) - for file in os.listdir(self.docs_dir): if not file.endswith(".md") or file == "index.md": continue - path = os.path.join(self.docs_dir, file) - with open(path, 'r') as f: content = f.read() - - links = re.findall(r'^\s*-\s*\[', content, re.MULTILINE) - headers = re.findall(r'^##\s+', content, re.MULTILINE) - - if len(links) > 25 and len(headers) < 3: + with open(path, "r") as f: content = f.read() + if len(re.findall(r"^\s*-\s*\[", content, re.MULTILINE)) > 25: log_event(f" [!] REORGANIZING: {file}") - - prompt = ( - f"Reorganize the file '{file}' into logical sections (##).\n" - "KEEP ALL LINKS. DO NOT include the TOC (I will generate it).\n" - "ALL HEADERS MUST BE IN ENGLISH.\n" - f"CURRENT CONTENT:\n{content[:5000]}" - ) - + prompt = f"Reorganize '{file}' into logical sections (##). English headers only. Content:\n{content[:4000]}" try: reorganized = await call_gemini_with_retry(prompt, response_format="text") if len(reorganized) > len(content) * 0.7: - # Rebuild the TOC after massive reorganization - final_content = await self._rebuild_toc(reorganized) - with open(path, 'w') as f: f.write(final_content) - log_event(f" [OK] Reorganization and TOC updated for {file}") - except Exception as e: - log_event(f" [!] Error reorganizing {file}: {e}") + final = await self._rebuild_toc(reorganized) + with open(path, "w") as f: f.write(final) + log_event(f" [OK] Reorganized: {file}") + except Exception as e: log_event(f" [!] Error: {e}") - def validate_changes(self) -> bool: - return True + def validate_changes(self) -> bool: return True