From 61325f8caed6d363caba806a3468e1c41e2b7898 Mon Sep 17 00:00:00 2001
From: Nubenetes Bot <bot@nubenetes.com>
Date: Sun, 17 May 2026 14:20:08 +0200
Subject: [PATCH] feat(ops): restore MVQ/Blacklist and implement physical
 file cleanup in V1

---
 src/agentic_curator.py            | 49 +++++++++++++++++++++++--------
 src/intelligent_health_checker.py | 44 ++++++++++++++++++++-------
 2 files changed, 70 insertions(+), 23 deletions(-)

diff --git a/src/agentic_curator.py b/src/agentic_curator.py
index 2bff7f54..ba3da0ad 100644
--- a/src/agentic_curator.py
+++ b/src/agentic_curator.py
@@ -56,11 +56,27 @@ async def _get_github_activity(url: str) -> Dict:
 async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
     evaluations = {}
     curator = AgenticCurator()
+    
+    # Mandate 2: Load Blacklist
+    memory_file = "src/memory/health_learning.json"
+    domain_blacklist = set()
+    if os.path.exists(memory_file):
+        try:
+            memory_data = json.load(open(memory_file, "r"))
+            domain_blacklist = set(memory_data.get("blacklisted_domains", []))
+        except: pass
+
     for i, asset in enumerate(raw_assets):
         url = asset["url"]
         log_event(f"--- EVALUATING {i+1}/{len(raw_assets)}: {url} ---")
         norm_url = normalize_url(url)
         
+        # Mandate 2: Skip Blacklisted
+        if any(domain in url.lower() for domain in domain_blacklist):
+            log_event(f"  [-] SKIPPING: Blacklisted domain detected.")
+            evaluations[url] = {"status": "FILTERED", "reason": "Blacklisted"}
+            continue
+
         # --- DATABASE-FIRST: Reuse insights ---
         if norm_url in curator.inventory:
             cached = curator.inventory[norm_url]
@@ -75,19 +91,27 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
         web_content, rich_meta = await _deep_fetch_content(url)
         content_hash = hashlib.sha256(web_content.encode()).hexdigest() if web_content else "N/A"
         
-        # --- DYNAMIC CONTEXT: Mandates from GEMINI.md (Mandate 11 Bridge) ---
-        from src.mandate_ingestor import get_system_mandates
-        dynamic_mandates = get_system_mandates()
-        
+        # Mandate 3: MVQ Check (GitHub Activity)
+        mvq_penalty = False
+        gh_meta = {}
+        if "github.com" in url:
+            gh_meta = await _get_github_activity(url)
+            pushed = gh_meta.get("gh_pushed", "")
+            if pushed:
+                last_date = datetime.fromisoformat(pushed.replace("Z", "+00:00"))
+                if (datetime.now(last_date.tzinfo) - last_date).days > (365 * 4):
+                    mvq_penalty = True
+                    log_event(f"  [!] MVQ ALERT: Stale repository (>4 years). Penalty applied.")
+
+        # 2. AI Logic (O'Reilly + Linguistic Diversity)
+        is_primary = "nubenetes" in asset.get("source_type", "Social").lower()
+        strictness = "BE EXTREMELY SELECTIVE.\n" if not is_primary else ""
         prompt = (
-            "You act as a Senior Technical Librarian in 2026.\n"
-            f"{dynamic_mandates}\n"
-            f"{strictness_directive}"
+            "You act as a Senior Technical Librarian in 2026.\n" + strictness +
+            f"{'IMPORTANT: This repo is old (>4 years inactive). Assign impact_score < 30.' if mvq_penalty else ''}\n"
             "PHASE 1: LINGUISTIC DIVERSITY (Mandate 10)\n"
-            "- DESC (V1 Archive): Provide a professional summary in the RESOURCE'S NATIVE LANGUAGE.\n"
-            "- EN_SUMMARY (V2 Portal): Provide a professional English synthesis.\n"
-            "PHASE 2: ARCHITECTURAL CLASSIFICATION (O'REILLY STYLE)\n"
-            "- Identify TECHNICAL_HIERARCHY: List (max 10 strings) Area > Topic > Subtopics.\n"
+            "- DESC (V1 Archive): Professional summary in native language.\n"
+            "- EN_SUMMARY (V2 Portal): English synthesis.\n"
             "Respond ONLY with JSON: {\"impact_score\": int, \"pub_date\": \"YYYY-MM-DD\", \"primary_category\": \"cat\", \"title\": \"...\", \"desc\": \"...\", \"en_summary\": \"...\", \"language\": \"...\", \"resource_type\": \"...\", \"complexity\": \"...\", \"technical_hierarchy\": [\"Area\", ...], \"is_microservice\": bool}\n"
             f"CONTENT: {web_content[:2000]}"
         )
@@ -106,7 +130,8 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
                     "stars": min(max(score // 20, 0), 5), "content_hash": content_hash,
                     "source_provenance": asset.get("source_type", "Social"), "social_preview_url": rich_meta.get("og_image", ""),
                     "mentions_count": curator.inventory.get(norm_url, {}).get("mentions_count", 0) + 1,
-                    "category": primary_cat, "status": "online", "last_checked": datetime.now().timestamp()
+                    "category": primary_cat, "status": "online", "last_checked": datetime.now().timestamp(),
+                    **gh_meta
                 }
                 curator.inventory[norm_url] = eval_data
                 evaluations[url] = {"status": "INCLUDED", **eval_data}
diff --git a/src/intelligent_health_checker.py b/src/intelligent_health_checker.py
index 911fadde..1c308053 100644
--- a/src/intelligent_health_checker.py
+++ b/src/intelligent_health_checker.py
@@ -153,31 +153,53 @@ class IntelligentLinkCleaner:
         log_event("APPLYING CLEANING CHANGES & PR GENERATION...", section_break=True)
         file_updates = {}
         
+        # 1. Prepare file updates for dead/canonical links
         for url, (fallback, reason) in self.dead_links.items():
-            for occ in self.link_registry.get(normalize_url(url), []):
-                file_path = occ["file"]
-                if file_path not in file_updates: file_updates[file_path] = open(file_path, "r").readlines()
+            nu = normalize_url(url)
+            # Use v1_locations from the inventory if available; otherwise fall back to the link registry
+            paths = self.inventory.get(nu, {}).get("v1_locations", [])
+            if not paths:
+                paths = [occ["file"] for occ in self.link_registry.get(nu, [])]
+            
+            for path in set(paths):
+                if not os.path.exists(path): continue
+                if path not in file_updates: 
+                    file_updates[path] = open(path, "r").readlines()
                 
-                if fallback and fallback.startswith("CANONICAL:"):
-                    new_url = fallback.replace("CANONICAL:", "")
-                    file_updates[file_path][occ["line_index"]] = file_updates[file_path][occ["line_index"]].replace(url, new_url)
-                else:
-                    file_updates[file_path][occ["line_index"]] = None # Mark for deletion
+                # Scan line by line: rewrite the URL on canonical redirects, otherwise mark the line for removal
+                for i, line in enumerate(file_updates[path]):
+                    if url in line:
+                        if fallback and fallback.startswith("CANONICAL:"):
+                            new_url = fallback.replace("CANONICAL:", "")
+                            log_event(f"  [FIX] Redirect: {url} -> {new_url} in {path}")
+                            file_updates[path][i] = line.replace(url, new_url)
+                        else:
+                            log_event(f"  [DEL] Dead Link: {url} in {path}")
+                            file_updates[path][i] = None # Mark line for removal
 
-        # Final Payload
+        # 2. Final Payload Construction
         final_payload = {p: "".join([l for l in lines if l is not None]) for p, lines in file_updates.items()}
+        
+        # 3. Database Maintenance (GC & Persistence)
         await self.prune_orphaned_metadata()
         self._save_inventory()
         final_payload[INVENTORY_PATH] = yaml.dump(self.inventory, sort_keys=False, allow_unicode=True)
 
-        # Safety & Audit
+        # 4. Safety Audit & Non-Blocking PR
         from src.safety_guard import SafetyGuard
         guard = SafetyGuard()
         safety_report = guard.generate_audit_report()
         
         if final_payload:
-            metrics = {"total_extracted": len(self.link_registry), "full_report": self.action_log}
+            metrics = {
+                "total_extracted": len(self.link_registry),
+                "full_report": self.action_log,
+                "deleted_dead": len([v for v in self.dead_links.values() if v[0] is None]),
+                "fixed_redirects": len([v for v in self.dead_links.values() if v[0] and "CANONICAL" in v[0]])
+            }
             self.git_controller.apply_multi_file_changes(final_payload, metrics, safety_report=safety_report)
+        else:
+            log_event("  [INFO] No files required cleaning in this cycle.")
 
     async def prune_orphaned_metadata(self):
         valid_map = {}