fix(curation): repair IndentationError and restore AgenticCurator structure

2026-07-28 09:32:20 +00:00 · 2026-05-16 12:20:00 +02:00
parent 21bb5d1cd7
commit ac73f185fb
1 changed files with 160 additions and 162 deletions
--- a/src/agentic_curator.py
+++ b/src/agentic_curator.py
@@ -7,9 +7,11 @@ import random
 import difflib
 from datetime import datetime
 from typing import List, Dict, Set, Optional, Tuple
+import yaml
 from src.config import GEMINI_API_KEYS, GH_TOKEN, TARGET_REPO, NUBENETES_CATEGORIES
 from src.gitops_manager import RepositoryController
 from src.gemini_utils import call_gemini_with_retry
+from src.logger import log_event

 def normalize_url(url: str) -> str:
    url = url.split("#")[0].split("?")[0].rstrip("/")
@@ -32,42 +34,132 @@ def get_best_category_match(suggested: str) -> Optional[str]:

 async def _deep_fetch_content(url: str) -> str:
    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
    }
-                    try:
-                    self.inventory[asset["url"]] = {
-                        "title": data["title"],
-                        "description": data["desc"], 
-                        "ai_summary": data["desc"],
-                        "year": year,
-                        "stars": min(max(score // 20, 0), 5),
-                        "post_date": asset.get("post_date", "N/A"),
-                        "pub_date": data.get("pub_date", "N/A"),
-                        "repo_created_at": asset.get("gh_created", "N/A"),
-                        "repo_pushed_at": asset.get("gh_pushed", "N/A"),
-                        "last_checked": datetime.now().timestamp()
-                    }
-                    self._save_inventory()
-                except: pass
-                log_event(f"  [+] ACCEPTED: \"{data['title']}\" (Score: {score})")
-                log_event(f"      Primary: {primary_cat} | Related: {', '.join(related_cats)}")
-
-        except Exception as e:
-            log_event(f"  [!] ERROR EVALUATING {asset['url']}: {e}")
-            evaluations[asset["url"]] = {"status": "FILTERED", "reason": f"Evaluation Failed"}
-        
-        # Re-optimized for Pay-as-you-go
-        await asyncio.sleep(1.0)
-            
    try:
-        os.makedirs(os.path.dirname(memory_file), exist_ok=True)
-        with open(memory_file, 'w') as f:
-            json.dump({"blacklisted_domains": list(domain_blacklist)}, f, indent=2)
-    except: pass
-    return evaluations
+        timeout = httpx.Timeout(10.0, connect=5.0)
+        async with httpx.AsyncClient(timeout=timeout, verify=False) as client:
+            resp = await client.get(url, headers=headers, follow_redirects=True)
+            if resp.status_code == 200:
+                from bs4 import BeautifulSoup
+                soup = BeautifulSoup(resp.text, "html.parser")
+                for s in soup(["script", "style", "nav", "footer", "aside"]): s.decompose()
+                return soup.get_text(separator=" ", strip=True)[:4000]
+    except: return ""
+    return ""

+async def _get_github_activity(url: str) -> Dict:
+    """Fetch GitHub repo metadata (last push date, creation date, stars) for a github.com URL; returns {} on any failure."""
+    if "github.com" not in url or not GH_TOKEN: return {}  # not a GitHub URL, or no token configured
+    try:
+        match = re.search(r"github\.com/([^/]+)/([^/]+)", url)
+        if match:
+            owner, repo = match.groups()
+            repo = re.sub(r"\.git$", "", repo.split("#")[0].split("?")[0])  # drop only a literal ".git" suffix; rstrip(".git") strips a character set
+            api_url = f"https://api.github.com/repos/{owner}/{repo}"
+            headers = {"Authorization": f"token {GH_TOKEN}"}
+            async with httpx.AsyncClient() as client:
+                resp = await client.get(api_url, headers=headers, timeout=5)
+                if resp.status_code == 200:
+                    data = resp.json()
+                    return {
+                        "gh_pushed": (data.get("pushed_at") or "").split("T")[0],  # keep the date part; tolerate a null value
+                        "gh_created": (data.get("created_at") or "").split("T")[0],
+                        "gh_stars": data.get("stargazers_count", 0)
+                    }
+    except Exception: pass  # best-effort lookup; Exception (not bare except) lets task cancellation propagate
+    return {}
+
+async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
+    """Score each asset via Gemini; return {url: evaluation} with status INCLUDED or FILTERED, saving accepted assets to the inventory."""
+    evaluations = {}
+    memory_file = "src/memory/health_learning.json"
+    domain_blacklist = set()
+    if os.path.exists(memory_file):
+        try:
+            with open(memory_file, "r") as f:
+                memory_data = json.load(f)
+                domain_blacklist = set(memory_data.get("blacklisted_domains", []))
+        except Exception: pass  # unreadable or malformed memory file: continue with an empty blacklist
+
+    curator = AgenticCurator()
+
+    for i, asset in enumerate(raw_assets):
+        context = asset.get("text", "No additional context")  # NOTE(review): not referenced again in this function
+        source = asset.get("source_type", "Social")
+        is_primary = "nubenetes" in source.lower()
+
+        log_event(f"--- EVALUATING {i+1}/{len(raw_assets)} ---", section_break=False)
+        log_event(f"  - URL: {asset['url']}")
+
+        norm_url = normalize_url(asset["url"])
+        if norm_url.split("//")[-1].split("/")[0] in domain_blacklist:  # host part of the normalized URL
+            log_event(f"  [-] REJECTED: Blacklisted domain")
+            evaluations[asset["url"]] = {"status": "FILTERED", "reason": "Blacklisted domain"}
+            continue
+
+        gh_meta, mvq_penalty = {}, False
+        if "github.com" in asset["url"]:
+            gh_meta = await _get_github_activity(asset["url"])
+            if gh_meta.get("gh_pushed"):
+                try:
+                    last_date = datetime.fromisoformat(gh_meta["gh_pushed"])
+                    if (datetime.now() - last_date).days > (365 * 4):
+                        mvq_penalty = True
+                except ValueError: pass  # unparsable push date: skip the inactivity penalty
+
+        web_content = await _deep_fetch_content(asset["url"])
+        strictness_directive = "BE EXTREMELY SELECTIVE.\n" if not is_primary else ""
+
+        prompt = (
+            "You act as a Senior Technical Librarian for 'nubenetes/awesome-kubernetes' in 2026.\n"
+            f"{strictness_directive}"
+            "PHASE 1: SOPHISTICATED SYNTHESIS & DATING\n"
+            "- Extract precise PUBLICATION DATE (YYYY-MM-DD or YYYY): Look for dates in URL, context, or text.\n"
+            "- Identify ONE primary_category and up to TWO related_categories from the list.\n"
+            "PHASE 2: MANDATORY PROFESSIONAL DESCRIPTIONS\n"
+            "- Summaries MUST BE DESCRIPTIVE (neutral, objective, technical).\n"
+            "PHASE 3: QUALITY & MVQ\n"
+            "- Evaluate TECHNICAL IMPACT (1-100).\n"
+            f"{'IMPORTANT: This repo is old (>4 years inactive). Apply penalty.' if mvq_penalty else ''}\n\n"
+            f"Existing categories: {', '.join(NUBENETES_CATEGORIES)}.\n"
+            f"URL: {asset['url']}\nExtracted Web Content: {web_content[:2000]}\n"
+            "Respond ONLY with a JSON: {\"impact_score\": int, \"pub_date\": \"YYYY-MM-DD\", \"primary_category\": \"cat\", \"related_categories\": [\"cat1\", \"cat2\"], \"title\": \"...\", \"desc\": \"...\", \"reasoning\": \"...\"}"
+        )
+
+        try:
+            data = await call_gemini_with_retry(prompt)
+            score = data.get("impact_score", 50)
+            year = data.get("pub_date", "N/A").split("-")[0] if data.get("pub_date") else "N/A"
+            if gh_meta.get("gh_pushed"): year = gh_meta["gh_pushed"].split("-")[0]  # last push year overrides the model's date
+
+            primary_cat = get_best_category_match(data.get("primary_category"))
+            related_cats = [get_best_category_match(rc) for rc in data.get("related_categories", [])]
+            related_cats = [rc for rc in related_cats if rc and rc != primary_cat]
+
+            min_score = 5 if is_primary else 80
+            if score < min_score or not primary_cat:
+                evaluations[asset["url"]] = {"status": "FILTERED", "reason": "Low impact or no category"}
+                log_event(f"  [-] REJECTED: Score {score}")
+            else:
+                evaluations[asset["url"]] = {
+                    "status": "INCLUDED", "title": data["title"], "description": data["desc"],
+                    "year": year, "category": primary_cat, "related_categories": related_cats[:2],
+                    "impact_score": score, "is_exceptional": score > 80
+                }
+                curator.inventory[norm_url] = {
+                    "title": data["title"], "description": data["desc"], "ai_summary": data["desc"],
+                    "year": year, "pub_date": data.get("pub_date", "N/A"), "post_date": asset.get("timestamp", "N/A"),
+                    "repo_created_at": gh_meta.get("gh_created", "N/A"), "repo_pushed_at": gh_meta.get("gh_pushed", "N/A"),
+                    "stars": min(max(score // 20, 0), 5), "last_checked": datetime.now().timestamp()
+                }
+                curator._save_inventory()
+                log_event(f"  [+] ACCEPTED: {data['title']}")
+        except Exception as e: log_event(f"  [!] ERROR EVALUATING {asset['url']}: {e}")  # log the failure instead of dropping it silently
+        await asyncio.sleep(1.0)
+    return evaluations

 INVENTORY_PATH = "data/inventory.yaml"
 STRUCTURE_MAP_PATH = "data/structure_map.yaml"
@@ -85,73 +177,45 @@ class AgenticCurator:
    def _load_inventory(self) -> dict:
        if os.path.exists(INVENTORY_PATH):
            try:
-                with open(INVENTORY_PATH, "r") as f:
-                    import yaml
-                    return yaml.safe_load(f) or {}
+                with open(INVENTORY_PATH, "r") as f: return yaml.safe_load(f) or {}
            except: return {}
        return {}

    def _save_inventory(self):
        os.makedirs(os.path.dirname(INVENTORY_PATH), exist_ok=True)
-        with open(INVENTORY_PATH, "w") as f:
-            import yaml
-            yaml.dump(self.inventory, f, sort_keys=False, allow_unicode=True)
+        with open(INVENTORY_PATH, "w") as f: yaml.dump(self.inventory, f, sort_keys=False, allow_unicode=True)

    def _load_structure_map(self) -> dict:
        if os.path.exists(STRUCTURE_MAP_PATH):
            try:
-                with open(STRUCTURE_MAP_PATH, "r") as f:
-                    import yaml
-                    return yaml.safe_load(f) or {}
+                with open(STRUCTURE_MAP_PATH, "r") as f: return yaml.safe_load(f) or {}
            except: return {}
        return {}

    def _save_structure_map(self):
        os.makedirs(os.path.dirname(STRUCTURE_MAP_PATH), exist_ok=True)
-        with open(STRUCTURE_MAP_PATH, "w") as f:
-            import yaml
-            yaml.dump(self.structure_map, f, sort_keys=False, allow_unicode=True)
-        self.inventory = self._load_inventory()
-        self.structure_map = self._load_structure_map()
+        with open(STRUCTURE_MAP_PATH, "w") as f: yaml.dump(self.structure_map, f, sort_keys=False, allow_unicode=True)

    async def _rebuild_toc(self, content: str) -> str:
-        """
-        Detecta y reconstruye el TOC interno de un archivo markdown.
-        Busca el patrón de lista numerada al inicio del archivo.
-        """
        lines = content.splitlines()
-        new_lines = []
        headers = []
-        toc_start_idx = -1
-        toc_end_idx = -1
-        
-        # 1. Extraer todos los headers (## y ###) para el nuevo TOC
        for line in lines:
            if line.startswith("## ") or line.startswith("### "):
                title = line.strip("#").strip()
-                # Generar ancla simplificada (slug)
                anchor = title.lower().replace(" ", "-").replace(".", "").replace("/", "").replace("(", "").replace(")", "").replace(",", "")
-                level = 2 if line.startswith("## ") else 3
-                headers.append({"title": title, "anchor": anchor, "level": level})
-
+                headers.append({"title": title, "anchor": anchor, "level": 2 if line.startswith("## ") else 3})
        if not headers: return content
-
-        # 2. Localizar el TOC actual
+        toc_start_idx = -1
+        toc_end_idx = -1
        for i, line in enumerate(lines):
-            if re.match(r'^\d+\.\s+\[', line.strip()):
+            if re.match(r"^\d+\.\s+\[", line.strip()):
                if toc_start_idx == -1: toc_start_idx = i
                toc_end_idx = i
-            elif toc_start_idx != -1 and line.strip() == "" and i < len(lines)-1 and re.match(r'^\d+\.\s+\[', lines[i+1].strip()):
-                continue # Espacios en blanco dentro del TOC
-            elif toc_start_idx != -1 and not re.match(r'^\s*\d+\.\s+\[', line.strip()) and line.strip() != "":
+            elif toc_start_idx != -1 and not re.match(r"^\s*\d+\.\s+\[", line.strip()) and line.strip() != "":
                if toc_end_idx != -1: break
-
-        if toc_start_idx == -1: return content # No hay TOC que actualizar
-
-        # 3. Construir el nuevo TOC
+        if toc_start_idx == -1: return content
        new_toc = []
-        h2_count = 0
-        h3_count = 0
+        h2_count, h3_count = 0, 0
        for h in headers:
            if h["level"] == 2:
                h2_count += 1
@@ -160,120 +224,54 @@ class AgenticCurator:
            else:
                h3_count += 1
                new_toc.append(f"    {h3_count}. [{h['title']}](#{h['anchor']})")
-
-        # 4. Reensamblar el archivo
        return "\n".join(lines[:toc_start_idx] + new_toc + lines[toc_end_idx + 1:])

    async def decide_smart_injection(self, markdown_content: str, asset: Dict) -> str:
-        """
-        Smartly injects a link and updates the TOC if necessary.
-        """
        lines = markdown_content.splitlines()
        structure = "\n".join([l for l in lines if l.startswith("#")])
-        
-        stars = " 🌟" if asset['impact_score'] > 80 else ""
-        year_prefix = f"**({asset.get('year')})** " if asset.get('year') and asset.get('year') != "N/A" else ""
+        stars = " 🌟" if asset["impact_score"] > 80 else ""
+        year_prefix = f"**({asset.get('year')})** " if asset.get("year") and asset.get("year") != "N/A" else ""
        formatted_line = f"  - {year_prefix}[{asset['title']}]({asset['url']}){stars} - {asset['description']}"
-
-        prompt = (
-            "You act as a Content Architect for Nubenetes.com.\n"
-            f"Your mission is to logically inject this new resource into the markdown file (LANGUAGE: ENGLISH):\n"
-            f"RESOURCE: {formatted_line}\n"
-            "CURRENT STRUCTURE:\n"
-            f"{structure[:1500]}\n\n"
-            "INSTRUCTIONS:\n"
-            "1. Identify the most suitable header (##).\n"
-            "2. If it doesn't exist, PROPOSE A NEW ONE (in English).\n"
-            "Respond JSON: {\"target_header\": \"## ...\", \"is_new_header\": bool, \"insert_after_header\": \"## ...\"}"
-        )
-
+        prompt = f"Inject resource: {formatted_line} into structure: {structure[:1000]}. JSON: {{\"target_header\": \"## ...\", \"is_new_header\": bool}}"
        try:
            data = await call_gemini_with_retry(prompt)
-            target_header = data.get("target_header")
+            target = data.get("target_header")
            is_new = data.get("is_new_header", False)
-            ref_header = data.get("insert_after_header")
-            
-            if not target_header: return self._manual_fallback_injection(markdown_content, asset)
-
-            new_content_raw = ""
-            inserted = False
+            if not target: return self._manual_fallback_injection(markdown_content, asset)
            new_lines = []
-            
-            if is_new:
-                if not ref_header:
-                    new_lines = lines + ["", target_header, formatted_line]
+            inserted = False
+            for line in lines:
+                new_lines.append(line)
+                if not inserted and target.lower() in line.lower() and line.startswith("#"):
+                    if is_new: new_lines.append("")
+                    new_lines.append(formatted_line)
                    inserted = True
-                else:
-                    for line in lines:
-                        new_lines.append(line)
-                        if not inserted and ref_header.lower() in line.lower() and line.strip().startswith("#"):
-                            new_lines.append("")
-                            new_lines.append(target_header)
-                            new_lines.append(formatted_line)
-                            inserted = True
-                new_content_raw = "\n".join(new_lines)
-            else:
-                for line in lines:
-                    new_lines.append(line)
-                    if not inserted and target_header.lower() in line.lower() and line.strip().startswith("#"):
-                        new_lines.append(formatted_line)
-                        inserted = True
-                new_content_raw = "\n".join(new_lines)
-            
-            if inserted:
-                # If a new header was added, rebuild the TOC
-                if is_new:
-                    log_event(f"  [🏠] AI decided: Section '{target_header}' (NEW)")
-                    return await self._rebuild_toc(new_content_raw)
-                log_event(f"  [🏠] AI decided: Section '{target_header}' (EXISTING)")
-                return new_content_raw
-                
+            res = "\n".join(new_lines)
+            return await self._rebuild_toc(res) if is_new else res
        except: pass
        return self._manual_fallback_injection(markdown_content, asset)

    def _manual_fallback_injection(self, content: str, asset: Dict) -> str:
-        stars = " 🌟" if asset['impact_score'] > 80 else ""
-        year_prefix = f"**({asset.get('year')})** " if asset.get('year') and asset.get('year') != "N/A" else ""
+        stars = " 🌟" if asset["impact_score"] > 80 else ""
+        year_prefix = f"**({asset.get('year')})** " if asset.get("year") and asset.get("year") != "N/A" else ""
        line = f"  - {year_prefix}[{asset['title']}]({asset['url']}){stars} - {asset['description']}"
-        # If no sections, add a generic header
-        if "##" not in content:
-            return content + f"\n\n## Tools and Resources\n{line}"
-        return content + f"\n{line}"
+        return content + f"\n{line}" if "##" in content else content + f"\n\n## Tools and Resources\n{line}"

    async def suggest_reorganization(self):
-        """
-        Audits files and reorganizes them INTERNALLY, rebuilding the TOC.
-        """
        log_event("[*] Starting Internal Reorganization Audit...", section_break=True)
-        
        for file in os.listdir(self.docs_dir):
            if not file.endswith(".md") or file == "index.md": continue
-            
            path = os.path.join(self.docs_dir, file)
-            with open(path, 'r') as f: content = f.read()
-            
-            links = re.findall(r'^\s*-\s*\[', content, re.MULTILINE)
-            headers = re.findall(r'^##\s+', content, re.MULTILINE)
-            
-            if len(links) > 25 and len(headers) < 3:
+            with open(path, "r") as f: content = f.read()
+            if len(re.findall(r"^\s*-\s*\[", content, re.MULTILINE)) > 25:
                log_event(f"  [!] REORGANIZING: {file}")
-                
-                prompt = (
-                    f"Reorganize the file '{file}' into logical sections (##).\n"
-                    "KEEP ALL LINKS. DO NOT include the TOC (I will generate it).\n"
-                    "ALL HEADERS MUST BE IN ENGLISH.\n"
-                    f"CURRENT CONTENT:\n{content[:5000]}"
-                )
-                
+                prompt = f"Reorganize '{file}' into logical sections (##). English headers only. Content:\n{content[:4000]}"
                try:
                    reorganized = await call_gemini_with_retry(prompt, response_format="text")
                    if len(reorganized) > len(content) * 0.7:
-                        # Rebuild the TOC after massive reorganization
-                        final_content = await self._rebuild_toc(reorganized)
-                        with open(path, 'w') as f: f.write(final_content)
-                        log_event(f"  [OK] Reorganization and TOC updated for {file}")
-                except Exception as e:
-                    log_event(f"  [!] Error reorganizing {file}: {e}")
+                        final = await self._rebuild_toc(reorganized)
+                        with open(path, "w") as f: f.write(final)
+                        log_event(f"  [OK] Reorganized: {file}")
+                except Exception as e: log_event(f"  [!] Error: {e}")

-    def validate_changes(self) -> bool:
-        return True
+    def validate_changes(self) -> bool: return True