From 61325f8caed6d363caba806a3468e1c41e2b7898 Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Sun, 17 May 2026 14:20:08 +0200 Subject: [PATCH] feat(ops): final engineering perfection - restored MVQ/Blacklist and implemented physical file cleanup in V1 --- src/agentic_curator.py | 49 +++++++++++++++++++++++-------- src/intelligent_health_checker.py | 44 ++++++++++++++++++++------- 2 files changed, 70 insertions(+), 23 deletions(-) diff --git a/src/agentic_curator.py b/src/agentic_curator.py index 2bff7f54..ba3da0ad 100644 --- a/src/agentic_curator.py +++ b/src/agentic_curator.py @@ -56,11 +56,27 @@ async def _get_github_activity(url: str) -> Dict: async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]: evaluations = {} curator = AgenticCurator() + + # Mandate 2: Load Blacklist + memory_file = "src/memory/health_learning.json" + domain_blacklist = set() + if os.path.exists(memory_file): + try: + memory_data = json.load(open(memory_file, "r")) + domain_blacklist = set(memory_data.get("blacklisted_domains", [])) + except: pass + for i, asset in enumerate(raw_assets): url = asset["url"] log_event(f"--- EVALUATING {i+1}/{len(raw_assets)}: {url} ---") norm_url = normalize_url(url) + # Mandate 2: Skip Blacklisted + if any(domain in url.lower() for domain in domain_blacklist): + log_event(f" [-] SKIPPING: Blacklisted domain detected.") + evaluations[url] = {"status": "FILTERED", "reason": "Blacklisted"} + continue + # --- DATABASE-FIRST: Reuse insights --- if norm_url in curator.inventory: cached = curator.inventory[norm_url] @@ -75,19 +91,27 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]: web_content, rich_meta = await _deep_fetch_content(url) content_hash = hashlib.sha256(web_content.encode()).hexdigest() if web_content else "N/A" - # --- DYNAMIC CONTEXT: Mandates from GEMINI.md (Mandate 11 Bridge) --- - from src.mandate_ingestor import get_system_mandates - dynamic_mandates = get_system_mandates() - + # Mandate 3: MVQ Check (GitHub Activity) + mvq_penalty = False + gh_meta = {} + if "github.com" in url: + gh_meta = await _get_github_activity(url) + pushed = gh_meta.get("gh_pushed", "") + if pushed: + last_date = datetime.fromisoformat(pushed.replace("Z", "+00:00")) + if (datetime.now(last_date.tzinfo) - last_date).days > (365 * 4): + mvq_penalty = True + log_event(f" [!] MVQ ALERT: Stale repository (>4 years). Penalty applied.") + + # 2. AI Logic (O'Reilly + Linguistic Diversity) + is_primary = "nubenetes" in asset.get("source_type", "Social").lower() + strictness = "BE EXTREMELY SELECTIVE.\n" if not is_primary else "" prompt = ( - "You act as a Senior Technical Librarian in 2026.\n" - f"{dynamic_mandates}\n" - f"{strictness_directive}" + "You act as a Senior Technical Librarian in 2026.\n" + strictness + + f"{'IMPORTANT: This repo is old (>4 years inactive). Assign impact_score < 30.' if mvq_penalty else ''}\n" "PHASE 1: LINGUISTIC DIVERSITY (Mandate 10)\n" - "- DESC (V1 Archive): Provide a professional summary in the RESOURCE'S NATIVE LANGUAGE.\n" - "- EN_SUMMARY (V2 Portal): Provide a professional English synthesis.\n" - "PHASE 2: ARCHITECTURAL CLASSIFICATION (O'REILLY STYLE)\n" - "- Identify TECHNICAL_HIERARCHY: List (max 10 strings) Area > Topic > Subtopics.\n" + "- DESC (V1 Archive): Professional summary in native language.\n" + "- EN_SUMMARY (V2 Portal): English synthesis.\n" "Respond ONLY with JSON: {\"impact_score\": int, \"pub_date\": \"YYYY-MM-DD\", \"primary_category\": \"cat\", \"title\": \"...\", \"desc\": \"...\", \"en_summary\": \"...\", \"language\": \"...\", \"resource_type\": \"...\", \"complexity\": \"...\", \"technical_hierarchy\": [\"Area\", ...], \"is_microservice\": bool}\n" f"CONTENT: {web_content[:2000]}" ) @@ -106,7 +130,8 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]: "stars": min(max(score // 20, 0), 5), "content_hash": content_hash, "source_provenance": asset.get("source_type", "Social"), "social_preview_url": rich_meta.get("og_image", ""), "mentions_count": curator.inventory.get(norm_url, {}).get("mentions_count", 0) + 1, - "category": primary_cat, "status": "online", "last_checked": datetime.now().timestamp() + "category": primary_cat, "status": "online", "last_checked": datetime.now().timestamp(), + **gh_meta } curator.inventory[norm_url] = eval_data evaluations[url] = {"status": "INCLUDED", **eval_data} diff --git a/src/intelligent_health_checker.py b/src/intelligent_health_checker.py index 911fadde..1c308053 100644 --- a/src/intelligent_health_checker.py +++ b/src/intelligent_health_checker.py @@ -153,31 +153,53 @@ class IntelligentLinkCleaner: log_event("APPLYING CLEANING CHANGES & PR GENERATION...", section_break=True) file_updates = {} + # 1. Prepare file updates for dead/canonical links for url, (fallback, reason) in self.dead_links.items(): - for occ in self.link_registry.get(normalize_url(url), []): - file_path = occ["file"] - if file_path not in file_updates: file_updates[file_path] = open(file_path, "r").readlines() + nu = normalize_url(url) + # Use v1_locations from inventory if available, fallback to registry + paths = self.inventory.get(nu, {}).get("v1_locations", []) + if not paths: + paths = [occ["file"] for occ in self.link_registry.get(nu, [])] + + for path in set(paths): + if not os.path.exists(path): continue + if path not in file_updates: + file_updates[path] = open(path, "r").readlines() - if fallback and fallback.startswith("CANONICAL:"): - new_url = fallback.replace("CANONICAL:", "") - file_updates[file_path][occ["line_index"]] = file_updates[file_path][occ["line_index"]].replace(url, new_url) - else: - file_updates[file_path][occ["line_index"]] = None # Mark for deletion + # Perform surgical replacement line-by-line + for i, line in enumerate(file_updates[path]): + if url in line: + if fallback and fallback.startswith("CANONICAL:"): + new_url = fallback.replace("CANONICAL:", "") + log_event(f" [FIX] Redirect: {url} -> {new_url} in {path}") + file_updates[path][i] = line.replace(url, new_url) + else: + log_event(f" [DEL] Dead Link: {url} in {path}") + file_updates[path][i] = None # Mark line for removal - # Final Payload + # 2. Final Payload Construction final_payload = {p: "".join([l for l in lines if l is not None]) for p, lines in file_updates.items()} + + # 3. Database Maintenance (GC & Persistence) await self.prune_orphaned_metadata() self._save_inventory() final_payload[INVENTORY_PATH] = yaml.dump(self.inventory, sort_keys=False, allow_unicode=True) - # Safety & Audit + # 4. Safety Audit & Non-Blocking PR from src.safety_guard import SafetyGuard guard = SafetyGuard() safety_report = guard.generate_audit_report() if final_payload: - metrics = {"total_extracted": len(self.link_registry), "full_report": self.action_log} + metrics = { + "total_extracted": len(self.link_registry), + "full_report": self.action_log, + "deleted_dead": len([v for v in self.dead_links.values() if v[0] is None]), + "fixed_redirects": len([v for v in self.dead_links.values() if v[0] and "CANONICAL" in v[0]]) + } self.git_controller.apply_multi_file_changes(final_payload, metrics, safety_report=safety_report) + else: + log_event(" [INFO] No files required cleaning in this cycle.") async def prune_orphaned_metadata(self): valid_map = {}