From b2aa8fe54d4ca1555f9e5d7b0d1dce4f0de096cd Mon Sep 17 00:00:00 2001
From: Nubenetes Bot <bot@nubenetes.com>
Date: Sun, 17 May 2026 12:57:28 +0200
Subject: [PATCH] feat(ai): implement Automated Semantic Interlinking,
 Executive Comparison Tables, and Deep Semantic Deduplication

---
 README.md              |   2 +
 data/link_rules.yaml   |  15 +++++
 src/agentic_curator.py |  43 ++++++++++++++
 src/main.py            |   7 +++
 src/v2_optimizer.py    | 123 ++++++++++++++++++++++++++---------------
 5 files changed, 144 insertions(+), 46 deletions(-)

diff --git a/README.md b/README.md
index 2c864271..0572caee 100644
--- a/README.md
+++ b/README.md
@@ -438,6 +438,8 @@ graph TD
 
 ### 7.6. Strategic Benefits
 - **Technical Immutability (V1)**: AI agents are strictly forbidden from overwriting human-curated titles, manual 🌟 stars, or additional descriptive comments in the V1 archive, ensuring the bot respects and preserves manual engineering effort.
+- **Automated Semantic Interlinking (Mandate 5)**: AI agents identify technical relationships between categories and automatically inject cross-references (*"See also..."*) into the V1 archive, transforming it into an interconnected technical web.
+- **Executive Comparison Tables (V2 Premium)**: High-density categories in the V2 portal feature AI-generated technical comparison tables (Solution, Maturity, Primary Focus, Language, Stars), providing instant decision support for architects.
 - **Structural Intelligence Persistence**: High-precision technical classification is stored as a persistent, **recursive hierarchy** (up to 10 levels deep). This allows all workflows to reuse deep structural insights, reducing AI costs by >90% and ensuring perfect consistency between V1 reorganization and V2 portal generation.
 - **Self-Healing Infrastructure**: The engine automatically detects and rescues broken links (e.g., GitHub `master` -> `main` branch migration) and identifies parked/expired domains that bypass standard health checks.
 - **Zero-to-Hero Learning Paths**: V2 resources are systematically grouped by complexity level (Fundamentals, Intermediate, Advanced, Architect), transforming the portal into a structured educational journey for Cloud Native engineering.
diff --git a/data/link_rules.yaml b/data/link_rules.yaml
index bfddce78..248cce6a 100644
--- a/data/link_rules.yaml
+++ b/data/link_rules.yaml
@@ -21,6 +21,21 @@ v1_preservation_rules:
     allow_on_404: true        # Only truncate to root if deep link is dead.
     update_metadata: true     # If link changes, update only in the BBDD.
     auto_heal_github_branches: true # Automatically try master -> main if 404.
+    v1_dedup_strategy: "Conservative" # Preserve human curation; only dedup exact URL matches.
+    v2_dedup_strategy: "Semantic"     # Consolidate multiple URLs of the same project (e.g., site vs repo) into a single authoritative entry.
+
+# -----------------------------------------------------------------------------
+# AI ENHANCEMENTS: SEMANTIC INTERLINKING & ANALYTICS
+# -----------------------------------------------------------------------------
+semantic_interlinking:
+  enabled: true
+  max_references: 2           # "See also" references in related categories.
+  format: "  - *See also: [%(title)s](%(url)s) in [%(category)s]*"
+
+executive_analytics:
+  comparison_tables: true     # Generate comparison tables for high-density V2 sections.
+  min_tools_for_table: 8      # Minimum 'Standard' tools to trigger a table.
+  table_fields: ["Maturity", "Language", "Stars", "Primary Use Case"]
 
 # -----------------------------------------------------------------------------
 # V2 PORTAL: THE ELITE SHOWCASE (AI-Centric / Optimized)
diff --git a/src/agentic_curator.py b/src/agentic_curator.py
index 72341e1e..952f0f35 100644
--- a/src/agentic_curator.py
+++ b/src/agentic_curator.py
@@ -333,6 +333,49 @@ class AgenticCurator:
                         log_event(f"  [OK] Reorganized: {file}")
                 except Exception as e: log_event(f"  [!] Error: {e}")
 
+    async def apply_semantic_interlinking(self, evaluations: Dict[str, Dict]):
+        """
+        Implements Automated Semantic Interlinking (Mandate 5).
+
+        For every evaluation whose status is INCLUDED, adds a 'See also' bullet to the Markdown
+        file of each related category, skipping files that are missing or already contain the URL.
+        A missing/null category, title or related list is tolerated, so one malformed record
+        cannot raise and abort the whole pass.
+        """
+        log_event("[*] Phase 5: Executing Semantic Interlinking (Mandate 5)...", section_break=True)
+
+        for url, eval_data in evaluations.items():
+            if eval_data.get("status") != "INCLUDED": continue
+
+            # Values may be absent or null, so normalise them before calling string methods.
+            primary_cat = eval_data.get("category") or ""
+            title = eval_data.get("title") or url
+            origin = f" in [{primary_cat.replace('-', ' ').title()}]" if primary_cat else ""
+            see_also = f"\n  - *See also: [{title}]({url}){origin}*"
+
+            for rel_cat in eval_data.get("related_categories") or []:
+                if not rel_cat or rel_cat == primary_cat: continue
+
+                rel_path = os.path.join(self.docs_dir, f"{rel_cat}.md")
+                if not os.path.exists(rel_path): continue
+
+                # Explicit UTF-8 keeps reads/writes independent of the platform's locale encoding.
+                with open(rel_path, "r", encoding="utf-8") as f: content = f.read()
+                if url in content: continue  # Already linked or cross-referenced in this file.
+
+                log_event(f"  [+] Interlinking: {title} -> {rel_cat}.md")
+
+                # Insert right before the second H2 (the end of the first H2 section); append at
+                # the end when there is a single H2, or open a new section when there is none.
+                h2_starts = [m.start() for m in re.finditer(r'^## ', content, re.MULTILINE)]
+                if len(h2_starts) >= 2:
+                    content = content[:h2_starts[1]] + see_also + "\n" + content[h2_starts[1]:]
+                elif h2_starts:
+                    content += see_also
+                else:
+                    content += f"\n\n## Related Resources\n{see_also}"
+                with open(rel_path, "w", encoding="utf-8") as f: f.write(content)
+
     def validate_changes(self) -> bool: return True
 
 async def _enrich_rich_metadata(url: str, soup) -> Dict:
diff --git a/src/main.py b/src/main.py
index 78dc0769..aed89275 100644
--- a/src/main.py
+++ b/src/main.py
@@ -352,6 +352,13 @@ async def master_orchestrator():
         if batch_index < len(all_raw_assets_batches) - 1:
             await asyncio.sleep(5)
 
+    # 5. Semantic Interlinking (Mandate 5)
+    if unique_new_assets:
+        try:
+            await curator_agent.apply_semantic_interlinking(evaluations)
+        except Exception as e:
+            log_event(f"  [!] Interlinking Error: {e}")
+
     # 6. Finalization, Report and PR
     pr_url = None
     if modified_files_content or full_report_metrics:
diff --git a/src/v2_optimizer.py b/src/v2_optimizer.py
index 49eb43e0..a26e7378 100644
--- a/src/v2_optimizer.py
+++ b/src/v2_optimizer.py
@@ -5,19 +5,17 @@ import asyncio
 import yaml
 import httpx
 from datetime import datetime
-from typing import List, Dict, Set, Any
+from typing import List, Dict, Set, Any, Tuple
 from src.config import GEMINI_API_KEYS, GH_TOKEN, TARGET_REPO, MADRID_TZ, INVENTORY_PATH, STRUCTURE_MAP_PATH
 from src.gemini_utils import call_gemini_with_retry, normalize_url
 from src.logger import log_event
 
 V1_DIR = "docs"
 V2_DIR = "v2-docs"
-INVENTORY_PATH = "data/inventory.yaml"
-STRUCTURE_MAP_PATH = "data/structure_map.yaml"
 
 class V2VisionEngine:
     def __init__(self):
-        # Load Special Assets & Rules
+        # Load Config & Policy
         self.special_assets_rules = self._load_special_assets()
         self.link_rules = self._load_link_rules()
         self.max_depth = self.link_rules.get("hierarchy_rules", {}).get("max_depth", 10)
@@ -38,17 +36,17 @@ class V2VisionEngine:
         }
         
         self.library_criteria = (
-            "You are a Senior Technical Content Architect in 2026. Your mission is to organize a high-density technical reference portal "
-            "with the structure and logical flow of an advanced O'Reilly technical book.\n"
+            "You are a Senior Technical Architect in 2026. Your mission is to organize a high-density technical reference portal "
+            "structured like a professional technical book (O'Reilly style).\n"
             "PHASE 1: TECHNICAL PRESERVATION & CURATION\n"
             "- KEEP >90% of technical resources (except for 'introduction.md' where only high-impact links are kept).\n"
             "PHASE 2: SOPHISTICATED HIERARCHICAL CLASSIFICATION\n"
-            "- Identify TECHNICAL_HIERARCHY: A list of strings (max depth configured) representing Area > Topic > Subtopics.\n"
+            "- Identify TECHNICAL_HIERARCHY: A list of strings (max 10) representing Area > Topic > Subtopics.\n"
             "- For 'introduction.md', set is_microservice: true if context matches.\n"
             "PHASE 3: KNOWLEDGE ASSIMILATION FLOW\n"
-            "- Order hierarchy to facilitate a structured learning journey: from foundations to advanced internals.\n"
+            "- Order hierarchy to facilitate a structured learning journey.\n"
             "PHASE 4: MANDATORY DESCRIPTIONS\n"
-            "- If 'Current Desc' is empty, generate a professional 1-2 sentence summary. Style: O'Reilly technical, neutral.\n"
+            "- If 'Current Desc' is empty, generate a professional summary. Style: O'Reilly technical.\n"
         )
         self.inventory = self._load_inventory()
         self.structure_map = self._load_structure_map()
@@ -103,13 +101,14 @@ class V2VisionEngine:
         health_inventory = await self._verify_link_health(all_v1_links)
         log_event(f"[*] Health Check Complete. {len(health_inventory)} online.")
 
-        log_event("[*] Phase 2: Evaluation & Deep Indexing...")
+        log_event("[*] Phase 2: Evaluation & Deep Indexing (Semantic Dedup)...")
         library_inventory = await self._evaluate_and_score_resources(health_inventory)
-        
+        log_event(f"[*] Inventory Refined: {len(library_inventory)} items kept after semantic consolidation.")
+
         log_event("[*] Phase 3: Recursive Hierarchy Construction...")
         v2_data = await self._rebuild_structure(library_inventory)
         
-        log_event("[*] Phase 4: Generating Premium Portal Hubs...")
+        log_event("[*] Phase 4: Generating Premium Portal Hubs (Comparison Tables)...")
         os.makedirs(V2_DIR, exist_ok=True)
         await self._write_premium_files(v2_data, mosaic_html, videos_html)
         await self._sync_enterprise_navigation(v2_data)
@@ -165,47 +164,65 @@ class V2VisionEngine:
         return None
 
     async def _evaluate_and_score_resources(self, links: List[Dict]):
-        refined, to_evaluate = [], []
+        to_evaluate = []
+        project_registry = {} # {project_id: best_item}
         force_eval = os.getenv("FORCE_EVAL", "false").lower() == "true"
-        special_files = [sa["file"] for sa in self.special_assets_rules.get("special_assets", [])]
 
         for l in links:
             item = l.copy()
             norm_url = normalize_url(l["url"])
+            
+            # Identify Project Signature
+            project_id = norm_url
+            if "github.com" in norm_url:
+                match = re.search(r'github\.com/([^/]+/[^/]+)', norm_url)
+                if match: project_id = match.group(1).lower()
+
             if not force_eval and norm_url in self.inventory and "stars" in self.inventory[norm_url]:
                 cached = self.inventory[norm_url]
                 item.update(cached)
-                if cached.get("hierarchy"): refined.append(item); continue
+                if cached.get("hierarchy"):
+                    if project_id not in project_registry or item.get("stars", 0) > project_registry[project_id].get("stars", 0):
+                        project_registry[project_id] = item
+                    continue
             to_evaluate.append(item)
 
-        if not to_evaluate: return refined
+        if to_evaluate:
+            for i in range(0, len(to_evaluate), 50):
+                batch = to_evaluate[i:i+50]
+                prompt = (f"{self.library_criteria}\nRespond ONLY JSON: {{\"results\": [{{ \"idx\": int, \"year\": \"YYYY\", \"stars\": 0-5, \"hierarchy\": [\"Area\", \"Topic\", ...], \"summary\": \"...\", \"language\": \"...\", \"type\": \"...\", \"complexity\": \"...\", \"is_microservice\": bool }}, ...]}}\n\nLINKS:\n" + 
+                          "\n".join([f"{idx}. {l['title']} ({l['url']})" for idx, l in enumerate(batch)]))
+                try:
+                    data = await call_gemini_with_retry(prompt, prefer_flash=True)
+                    for res in data.get("results", []):
+                        idx = int(res["idx"])
+                        if idx < len(batch):
+                            item = batch[idx].copy()
+                            norm_url = normalize_url(item["url"])
+                            p_id = norm_url
+                            if "github.com" in norm_url:
+                                m = re.search(r'github\.com/([^/]+/[^/]+)', norm_url)
+                                if m: p_id = m.group(1).lower()
 
-        for i in range(0, len(to_evaluate), 50):
-            batch = to_evaluate[i:i+50]
-            prompt = (f"{self.library_criteria}\nRespond ONLY JSON: {{\"results\": [{{ \"idx\": int, \"year\": \"YYYY\", \"stars\": 0-5, \"hierarchy\": [\"Area\", \"Topic\", ...], \"summary\": \"...\", \"language\": \"...\", \"type\": \"...\", \"complexity\": \"...\", \"is_microservice\": bool }}, ...]}}\n\nLINKS:\n" + 
-                      "\n".join([f"{idx}. {l['title']} ({l['url']})" for idx, l in enumerate(batch)]))
-            try:
-                data = await call_gemini_with_retry(prompt, prefer_flash=True)
-                for res in data.get("results", []):
-                    idx = int(res["idx"])
-                    if idx < len(batch):
-                        item = batch[idx].copy()
-                        norm_url = normalize_url(item["url"])
-                        eval_data = {
-                            "year": str(res.get("year", "N/A")), "stars": min(max(int(res.get("stars", 0)), 0), 5),
-                            "ai_summary": res.get("summary", ""), "language": res.get("language", "English"),
-                            "resource_type": res.get("type", "Reference"), "complexity": res.get("complexity", "Intermediate"),
-                            "hierarchy": res.get("hierarchy", ["General"]), "is_microservice": bool(res.get("is_microservice", False)),
-                            "status": "online", "tag": self._calculate_tag(item)
-                        }
-                        item.update(eval_data)
-                        self.inventory[norm_url] = eval_data
-                        self.inventory[norm_url]["title"] = item["title"]
-                        refined.append(item)
-            except: 
-                for l in batch: refined.append(l)
-            await asyncio.sleep(0.3)
-        return refined
+                            eval_data = {
+                                "year": str(res.get("year", "N/A")), "stars": min(max(int(res.get("stars", 0)), 0), 5),
+                                "ai_summary": res.get("summary", ""), "language": res.get("language", "English"),
+                                "resource_type": res.get("type", "Reference"), "complexity": res.get("complexity", "Intermediate"),
+                                "hierarchy": res.get("hierarchy", ["General"]), "is_microservice": bool(res.get("is_microservice", False)),
+                                "status": "online", "tag": self._calculate_tag(item)
+                            }
+                            item.update(eval_data)
+                            self.inventory[norm_url] = eval_data
+                            self.inventory[norm_url]["title"] = item["title"]
+                            if p_id not in project_registry or item["stars"] > project_registry[p_id].get("stars", 0):
+                                project_registry[p_id] = item
+                except: 
+                    for l in batch:
+                        # Fallback registry injection
+                        u = normalize_url(l["url"])
+                        if u not in project_registry: project_registry[u] = l
+                await asyncio.sleep(0.3)
+        return list(project_registry.values())
 
     def _calculate_tag(self, item: Dict) -> str:
         stars = item.get("gh_stars", 0)
@@ -246,6 +263,18 @@ class V2VisionEngine:
             v2_structure[dim]["summary"] = self.inventory.get(cache_key, {}).get("ai_summary", f"Strategic reference for {dim}.")
         return v2_structure
 
+    async def _generate_comparison_table(self, links: List[Dict]) -> str:
+        """Return a collapsible Markdown table of up to 12 links rated 4+ stars, or an empty string when disabled or below `executive_analytics.min_tools_for_table` (default 6)."""
+        cfg = self.link_rules.get("executive_analytics") or {}  # Honour the configured toggle/threshold instead of a hard-coded 6.
+        standard_tools = [t for t in links if (t.get("stars") or 0) >= 4]
+        if not cfg.get("comparison_tables", True) or len(standard_tools) < int(cfg.get("min_tools_for_table", 6)): return ""
+        def cell(value): return str(value).replace("|", "\\|")  # A raw pipe would split the Markdown table cell.
+        table = "\n??? abstract \"Architect's Technical Comparison Table\"\n    | Solution | Maturity | Primary Focus | Language | Stars |\n    | :--- | :--- | :--- | :--- | :--- |\n"
+        for t in standard_tools[:12]:
+            focus = t.get("topic") or (t.get("hierarchy") or ["General"])[-1]  # An empty hierarchy falls back to "General" instead of raising IndexError.
+            table += f"    | [{cell(t['title'].replace('==',''))}]({t['url']}) | {cell((t.get('tag') or '').replace('[','').replace(']',''))} | {cell(focus)} | {cell(t.get('language') or 'English')} | {'🌟' * int(t['stars'])} |\n"
+        return table + "\n"
+
     async def _write_premium_files(self, data: Dict[str, Dict], mosaic_html: str, videos_html: str):
         mosaic_html = mosaic_html.replace('src="images/', 'src="images/').replace('](images/', '](images/')
         trending_pool = sorted([dict(meta, url=url) for url, meta in self.inventory.items() if meta.get("stars", 0) >= 3], key=lambda x: (x.get("pub_date", "0000"), -x.get("stars", 0)), reverse=True)
@@ -266,12 +295,14 @@ class V2VisionEngine:
                 toc += f"{' ' * (depth * 4)}- [{name}](#{slug})\n" + gen_toc(subnode, depth + 1, slug)
             return toc
 
-        def render_node(node, depth, base_slug, is_intro=False):
+        async def render_node(node, depth, base_slug, is_intro=False):
             md = ""
             for name, subnode in sorted(node.items()):
                 if name == "__links__": continue
                 slug = f"{base_slug}-{name.lower().replace(' ', '-')}"
-                md += f"{'#' * min(6, depth + 2)} {name}\n\n" + render_node(subnode, depth + 1, slug, is_intro)
+                md += f"{'#' * min(6, depth + 2)} {name}\n\n"
+                if depth == 1 and "__links__" in subnode: md += await self._generate_comparison_table(subnode["__links__"])
+                md += await render_node(subnode, depth + 1, slug, is_intro)
             if "__links__" in node:
                 for l in node["__links__"]:
                     is_gold = is_intro and l.get("stars", 0) >= 4
@@ -297,14 +328,14 @@ class V2VisionEngine:
                 md += f"## {cat}\n\n"
                 if cat == "Introduction":
                     md += "!!! quote \"Vision 2026\"\n    The focus shifts to agentic autonomy and hardened security.\n\n### Ecosystem Map\n```mermaid\ngraph TD\n    A[Foundations] --> B[AI & Intelligence]\n    A --> C[Hardened Infra]\n    B --> D[Agentic Curation]\n    C --> E[Enterprise Stability]\n    D --> F[Nubenetes Portal]\n    E --> F\n```\n\n### Gateway Hub\n- 🚀 [Explore AI Dimensions](./ai-and-artificial-intelligence.md)\n- 📦 [Microservices Guide](./microservices.md)\n\n"
-                md += render_node(topics, 0, cat_slug, is_intro=(cat=="Introduction"))
+                md += await render_node(topics, 0, cat_slug, is_intro=(cat=="Introduction"))
             with open(os.path.join(V2_DIR, f"{slug}.md"), "w") as f: f.write(md)
 
     async def _sync_enterprise_navigation(self, data: Dict[str, Dict]):
         try:
             with open("v2-mkdocs.yml", "r") as f: content = f.read()
             nav = ["nav:", "  - \"The 2026 Vision\": index.md"]
-            for dim in data.keys():
+            for dim in sorted(data.keys()):
                 if data[dim]["categories"]:
                     slug = dim.lower().replace(" ", "-").replace("&", "and").replace("(", "").replace(")", "")
                     nav.append(f"  - \"{dim}\": {slug}.md")