From d376f86e7edcd3d5e3e098a687c3b6a11ee201e7 Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Sun, 17 May 2026 13:58:51 +0200 Subject: [PATCH] feat(ops): final mandate compliance sync - restored database-first, linguistic diversity, and repository consolidation --- src/agentic_curator.py | 30 +++++++++++++++++++++++------- src/intelligent_health_checker.py | 17 ++++++++++++++++- src/v2_optimizer.py | 26 +++++++++++++++++++++++--- 3 files changed, 62 insertions(+), 11 deletions(-) diff --git a/src/agentic_curator.py b/src/agentic_curator.py index b3ad0b3b..72c3bdcb 100644 --- a/src/agentic_curator.py +++ b/src/agentic_curator.py @@ -57,19 +57,35 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]: evaluations = {} curator = AgenticCurator() for i, asset in enumerate(raw_assets): - log_event(f"--- EVALUATING {i+1}/{len(raw_assets)}: {asset['url']} ---") - norm_url = normalize_url(asset["url"]) + url = asset["url"] + log_event(f"--- EVALUATING {i+1}/{len(raw_assets)}: {url} ---") + norm_url = normalize_url(url) + # --- DATABASE-FIRST: Reuse insights --- + if norm_url in curator.inventory: + cached = curator.inventory[norm_url] + if cached.get("title") and cached.get("hierarchy"): + log_event(f" [⚡] REUSING CACHED INSIGHTS: {cached['title']}") + from src.gemini_utils import SESSION_TRACKER + SESSION_TRACKER.track_cache_hit(est_tokens=2200) + evaluations[url] = {"status": "INCLUDED", **cached} + continue + # 1. Fetch & Fingerprint - web_content, rich_meta = await _deep_fetch_content(asset["url"]) + web_content, rich_meta = await _deep_fetch_content(url) content_hash = hashlib.sha256(web_content.encode()).hexdigest() if web_content else "N/A" - # 2. AI Logic + # 2. AI Logic (O'Reilly + Linguistic Diversity) is_primary = "nubenetes" in asset.get("source_type", "Social").lower() strictness = "BE EXTREMELY SELECTIVE.\n" if not is_primary else "" prompt = ( "You act as a Senior Technical Librarian in 2026.\n" + strictness + - "Analyze the resource and respond ONLY with JSON: {\"impact_score\": int, \"pub_date\": \"YYYY-MM-DD\", \"primary_category\": \"cat\", \"related_categories\": [\"cat1\"], \"title\": \"...\", \"desc\": \"...\", \"en_summary\": \"...\", \"language\": \"...\", \"resource_type\": \"...\", \"complexity\": \"...\", \"technical_hierarchy\": [\"Area\", \"Topic\", ...], \"is_microservice\": bool}\n" + "PHASE 1: LINGUISTIC DIVERSITY (Mandate 10)\n" + + "- DESC (V1 Archive): Provide a professional summary in the RESOURCE'S NATIVE LANGUAGE.\n" + + "- EN_SUMMARY (V2 Portal): Provide a professional English synthesis.\n" + + "PHASE 2: ARCHITECTURAL CLASSIFICATION (O'REILLY STYLE)\n" + + "- Identify TECHNICAL_HIERARCHY: List (max 10 strings) Area > Topic > Subtopics.\n" + + "Respond ONLY with JSON: {\"impact_score\": int, \"pub_date\": \"YYYY-MM-DD\", \"primary_category\": \"cat\", \"title\": \"...\", \"desc\": \"...\", \"en_summary\": \"...\", \"language\": \"...\", \"resource_type\": \"...\", \"complexity\": \"...\", \"technical_hierarchy\": [\"Area\", ...], \"is_microservice\": bool}\n" f"CONTENT: {web_content[:2000]}" ) @@ -90,11 +106,11 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]: "category": primary_cat, "status": "online", "last_checked": datetime.now().timestamp() } curator.inventory[norm_url] = eval_data - evaluations[asset["url"]] = {"status": "INCLUDED", **eval_data} + evaluations[url] = {"status": "INCLUDED", **eval_data} curator._save_inventory() log_event(f" [+] ACCEPTED: {data['title']}") else: - evaluations[asset["url"]] = {"status": "FILTERED"} + evaluations[url] = {"status": "FILTERED"} except Exception as e: log_event(f" [!] AI Error: {e}") return evaluations diff --git a/src/intelligent_health_checker.py b/src/intelligent_health_checker.py index cd38b2ea..a036d991 100644 --- a/src/intelligent_health_checker.py +++ b/src/intelligent_health_checker.py @@ -109,12 +109,27 @@ class IntelligentLinkCleaner: text = resp.text.lower() if any(kw in text for kw in parked_indicators): return False, "parked", None return True, "OK", str(resp.url) if str(resp.url) != url else None + + # Definitive Failures if resp.status_code in [404, 410]: + # AUTO-HEAL GitHub Branches (master -> main) if "github.com" in url and "/master/" in url: heal = url.replace("/master/", "/main/") try: - if (await client.get(heal)).status_code < 200: return True, "healed", heal + if (await client.get(heal)).status_code < 400: return True, "healed", heal except: pass + + # Mandate 8: Repository Consolidation + if "github.com" in url: + match = re.search(r'(https?://github\.com/[^/]+/[^/]+)', url) + if match: + root_url = match.group(1) + if root_url != url: + try: + if (await client.get(root_url)).status_code < 400: + return True, "consolidated_to_root", root_url + except: pass + return False, "404", None return True, f"Soft Block {resp.status_code}", None except: return True, "Connection Error", None diff --git a/src/v2_optimizer.py b/src/v2_optimizer.py index f5223687..15df23e6 100644 --- a/src/v2_optimizer.py +++ b/src/v2_optimizer.py @@ -298,9 +298,29 @@ class V2VisionEngine: img = f" ![Preview]({l.get('social_preview_url')})\n" if l.get('social_preview_url') else "" md += f"!!! note \"{title}\"\n{img} **[Access Resource]({l['url']})** {'🌟'*l.get('stars',4)} | Level: {l.get('complexity', 'Beginner')}\n \n {l.get('ai_summary', l.get('description', ''))}\n\n" else: - date = f"**({l.get('year', 'N/A')})** " - tags = f" ⭐ {l.get('gh_stars',0)}" - md += f" - {date}[{title}]({l['url']}){tags} {'🌟'*l.get('stars',0)}\n" + year_prefix = f"**({l.get('year', 'N/A')})** " + gh_info = f" ⭐ {l.get('gh_stars',0)}" if l.get('gh_stars') else "" + icon = " 🎥" if l.get("is_video") else "" + + lang = l.get("language", "English") + lang_tag = f" [{lang.upper()} CONTENT]" if lang.lower() != "english" else "" + + comp = l.get("complexity", "Intermediate") + comp_tag = f" [{comp.upper()} LEVEL]" if comp.lower() in ["architect", "advanced"] else "" + + res_type = l.get("resource_type", "Reference") + type_tag = f" [{res_type.upper()}]" if res_type.lower() in ["case study", "guide", "documentation"] else "" + + rich = "".join([ + f" by **{l['author']}**" if l.get("author") else "", + f" ⏱️ {l['duration']}" if l.get("duration") else "", + f" 📖 {l['reading_time']}" if l.get("reading_time") else "" + ]) + + tag = l.get("tag", "[COMMUNITY-TOOL]") + color = "success" if "STANDARD" in tag else "warning" if "EMERGING" in tag else "info" + + md += f" - {year_prefix}[{title}]({l['url']}){icon}{gh_info}{lang_tag}{comp_tag}{type_tag}{rich} {'🌟'*l.get('stars',0)} {tag}\n" if l.get('ai_summary'): md += f"\n {l['ai_summary']}\n\n" return md