feat(ai): implement Automated Semantic Interlinking, Executive Comparison Tables, and Deep Semantic Deduplication

2026-05-22 00:53:37 +00:00 · 2026-05-17 12:57:28 +02:00
parent bf6a817d07
commit b2aa8fe54d
5 changed files with 144 additions and 46 deletions
--- a/README.md
+++ b/README.md
@@ -438,6 +438,8 @@ graph TD

 ### 7.6. Strategic Benefits
 - **Technical Immutability (V1)**: AI agents are strictly forbidden from overwriting human-curated titles, manual 🌟 stars, or additional descriptive comments in the V1 archive, ensuring the bot respects and preserves manual engineering effort.
+- **Automated Semantic Interlinking (Mandate 5)**: AI agents identify technical relationships between categories and automatically inject cross-references (*"See also..."*) into the V1 archive, transforming it into an interconnected technical web.
+- **Executive Comparison Tables (V2 Premium)**: High-density categories in the V2 portal feature AI-generated technical comparison tables (Solution, Maturity, Primary Focus, Language, Stars), providing instant decision support for architects.
 - **Structural Intelligence Persistence**: High-precision technical classification is stored as a persistent, **recursive hierarchy** (up to 10 levels deep). This allows all workflows to reuse deep structural insights, reducing AI costs by >90% and ensuring perfect consistency between V1 reorganization and V2 portal generation.
 - **Self-Healing Infrastructure**: The engine automatically detects and rescues broken links (e.g., GitHub `master` -> `main` branch migration) and identifies parked/expired domains that bypass standard health checks.
 - **Zero-to-Hero Learning Paths**: V2 resources are systematically grouped by complexity level (Fundamentals, Intermediate, Advanced, Architect), transforming the portal into a structured educational journey for Cloud Native engineering.
--- a/data/link_rules.yaml
+++ b/data/link_rules.yaml
@@ -21,6 +21,21 @@ v1_preservation_rules:
    allow_on_404: true        # Only truncate to root if deep link is dead.
    update_metadata: true     # If link changes, update only in the BBDD.
    auto_heal_github_branches: true # Automatically try master -> main if 404.
+    v1_dedup_strategy: "Conservative" # Preserve human curation; only dedup exact URL matches.
+    v2_dedup_strategy: "Semantic"     # Consolidate multiple project URLs (e.g., site vs repo) into authoritative root.
+
+# -----------------------------------------------------------------------------
+# AI INGENUITY: SEMANTIC INTERLINKING & ANALYTICS
+# -----------------------------------------------------------------------------
+semantic_interlinking:
+  enabled: true
+  max_references: 2           # "See also" references in related categories.
+  format: "  - *See also: [%(title)s](%(url)s) in [%(category)s]*"
+
+executive_analytics:
+  comparison_tables: true     # Generate comparison tables for high-density V2 sections.
+  min_tools_for_table: 8      # Minimum 'Standard' tools to trigger a table.
+  table_fields: ["Maturity", "Language", "Stars", "Primary Use Case"]

 # -----------------------------------------------------------------------------
 # V2 PORTAL: THE ELITE SHOWCASE (AI-Centric / Optimized)
--- a/src/agentic_curator.py
+++ b/src/agentic_curator.py
@@ -333,6 +333,72 @@ class AgenticCurator:
                        log_event(f"  [OK] Reorganized: {file}")
                except Exception as e: log_event(f"  [!] Error: {e}")

+    async def apply_semantic_interlinking(self, evaluations: Dict[str, Dict]):
+        """
+        Implements Automated Semantic Interlinking (Mandate 5).
+
+        For every evaluation whose status is "INCLUDED", adds a 'See also' bullet to
+        each related category page (``<docs_dir>/<category>.md``) that exists and does
+        not already contain the URL. The bullet is inserted just before the page's
+        second H2 heading; with a single H2 it is appended to the end of the file,
+        and with none a new "## Related Resources" section is appended.
+
+        Args:
+            evaluations: Mapping of URL -> evaluation dict. Keys read here are
+                "status", "category", "related_categories" and "title".
+
+        Entries without a category are skipped, a missing title falls back to the
+        URL, and a page that cannot be read or written is logged and skipped so one
+        bad file does not abort the remaining interlinks.
+        """
+        log_event("[*] Phase 5: Executing Semantic Interlinking (Mandate 5)...", section_break=True)
+
+        for url, eval_data in evaluations.items():
+            if eval_data.get("status") != "INCLUDED": continue
+
+            # Evaluation dicts may omit these keys or carry explicit nulls.
+            primary_cat = eval_data.get("category")
+            if not isinstance(primary_cat, str) or not primary_cat: continue
+            title = eval_data.get("title") or url
+            related_cats = eval_data.get("related_categories") or []
+
+            for rel_cat in related_cats:
+                if not rel_cat or rel_cat == primary_cat: continue
+                # Only bare file names are accepted, so a value such as "../x" cannot
+                # make the write below land outside docs_dir.
+                if not isinstance(rel_cat, str) or os.path.basename(rel_cat) != rel_cat: continue
+
+                rel_path = os.path.join(self.docs_dir, f"{rel_cat}.md")
+                if not os.path.exists(rel_path): continue
+
+                try:
+                    # Explicit UTF-8: the platform default encoding is locale-dependent.
+                    with open(rel_path, "r", encoding="utf-8") as f: content = f.read()
+
+                    # Check if already interlinked
+                    if url in content: continue
+
+                    log_event(f"  [+] Interlinking: {title} -> {rel_cat}.md")
+
+                    see_also = f"\n  - *See also: [{title}]({url}) in [{primary_cat.replace('-', ' ').title()}]*"
+
+                    # Inject at the end of the first H2 section, i.e. right before the
+                    # second H2; otherwise append at the end of the file.
+                    match = re.search(r'^## ', content, re.MULTILINE)
+                    if match:
+                        next_h2 = re.search(r'^## ', content[match.end():], re.MULTILINE)
+                        if next_h2:
+                            pos = match.end() + next_h2.start()
+                            content = content[:pos] + see_also + "\n" + content[pos:]
+                        else:
+                            content += see_also
+                    else:
+                        content += f"\n\n## Related Resources\n{see_also}"
+
+                    with open(rel_path, "w", encoding="utf-8") as f: f.write(content)
+                except (OSError, UnicodeError) as e:
+                    log_event(f"  [!] Interlinking Error ({rel_cat}.md): {e}")
+
    def validate_changes(self) -> bool: return True

 async def _enrich_rich_metadata(url: str, soup) -> Dict:
--- a/src/main.py
+++ b/src/main.py
@@ -352,6 +352,13 @@ async def master_orchestrator():
        if batch_index < len(all_raw_assets_batches) - 1:
            await asyncio.sleep(5)

+    # 5. Semantic Interlinking (Mandate 5)
+    if unique_new_assets:
+        try:
+            await curator_agent.apply_semantic_interlinking(evaluations)
+        except Exception as e:
+            log_event(f"  [!] Interlinking Error: {e}")
+
    # 6. Finalization, Report and PR
    pr_url = None
    if modified_files_content or full_report_metrics:
--- a/src/v2_optimizer.py
+++ b/src/v2_optimizer.py
@@ -5,19 +5,17 @@ import asyncio
 import yaml
 import httpx
 from datetime import datetime
-from typing import List, Dict, Set, Any
+from typing import List, Dict, Set, Any, Tuple
 from src.config import GEMINI_API_KEYS, GH_TOKEN, TARGET_REPO, MADRID_TZ, INVENTORY_PATH, STRUCTURE_MAP_PATH
 from src.gemini_utils import call_gemini_with_retry, normalize_url
 from src.logger import log_event

 V1_DIR = "docs"
 V2_DIR = "v2-docs"
-INVENTORY_PATH = "data/inventory.yaml"
-STRUCTURE_MAP_PATH = "data/structure_map.yaml"

 class V2VisionEngine:
    def __init__(self):
-        # Load Special Assets & Rules
+        # Load Config & Policy
        self.special_assets_rules = self._load_special_assets()
        self.link_rules = self._load_link_rules()
        self.max_depth = self.link_rules.get("hierarchy_rules", {}).get("max_depth", 10)
@@ -38,17 +36,17 @@ class V2VisionEngine:
        }
        
        self.library_criteria = (
-            "You are a Senior Technical Content Architect in 2026. Your mission is to organize a high-density technical reference portal "
-            "with the structure and logical flow of an advanced O'Reilly technical book.\n"
+            "You are a Senior Technical Architect in 2026. Your mission is to organize a high-density technical reference portal "
+            "structured like a professional technical book (O'Reilly style).\n"
            "PHASE 1: TECHNICAL PRESERVATION & CURATION\n"
            "- KEEP >90% of technical resources (except for 'introduction.md' where only high-impact links are kept).\n"
            "PHASE 2: SOPHISTICATED HIERARCHICAL CLASSIFICATION\n"
-            "- Identify TECHNICAL_HIERARCHY: A list of strings (max depth configured) representing Area > Topic > Subtopics.\n"
+            "- Identify TECHNICAL_HIERARCHY: A list of strings (max 10) representing Area > Topic > Subtopics.\n"
            "- For 'introduction.md', set is_microservice: true if context matches.\n"
            "PHASE 3: KNOWLEDGE ASSIMILATION FLOW\n"
-            "- Order hierarchy to facilitate a structured learning journey: from foundations to advanced internals.\n"
+            "- Order hierarchy to facilitate a structured learning journey.\n"
            "PHASE 4: MANDATORY DESCRIPTIONS\n"
-            "- If 'Current Desc' is empty, generate a professional 1-2 sentence summary. Style: O'Reilly technical, neutral.\n"
+            "- If 'Current Desc' is empty, generate a professional summary. Style: O'Reilly technical.\n"
        )
        self.inventory = self._load_inventory()
        self.structure_map = self._load_structure_map()
@@ -103,13 +101,14 @@ class V2VisionEngine:
        health_inventory = await self._verify_link_health(all_v1_links)
        log_event(f"[*] Health Check Complete. {len(health_inventory)} online.")

-        log_event("[*] Phase 2: Evaluation & Deep Indexing...")
+        log_event("[*] Phase 2: Evaluation & Deep Indexing (Semantic Dedup)...")
        library_inventory = await self._evaluate_and_score_resources(health_inventory)
-        
+        log_event(f"[*] Inventory Refined: {len(library_inventory)} items kept after semantic consolidation.")
+
        log_event("[*] Phase 3: Recursive Hierarchy Construction...")
        v2_data = await self._rebuild_structure(library_inventory)
        
-        log_event("[*] Phase 4: Generating Premium Portal Hubs...")
+        log_event("[*] Phase 4: Generating Premium Portal Hubs (Comparison Tables)...")
        os.makedirs(V2_DIR, exist_ok=True)
        await self._write_premium_files(v2_data, mosaic_html, videos_html)
        await self._sync_enterprise_navigation(v2_data)
@@ -165,47 +164,65 @@ class V2VisionEngine:
        return None

    async def _evaluate_and_score_resources(self, links: List[Dict]):
-        refined, to_evaluate = [], []
+        to_evaluate = []  # links without a usable cache entry; these are sent to the model in batches of 50
+        project_registry = {} # {project_id: highest-starred item seen so far for that project}
        force_eval = os.getenv("FORCE_EVAL", "false").lower() == "true"
-        special_files = [sa["file"] for sa in self.special_assets_rules.get("special_assets", [])]

        for l in links:
            item = l.copy()
            norm_url = normalize_url(l["url"])
+            
+            # Project signature: lowercased "owner/repo" for GitHub URLs, otherwise the normalized URL itself
+            project_id = norm_url
+            if "github.com" in norm_url:
+                match = re.search(r'github\.com/([^/]+/[^/]+)', norm_url)
+                if match: project_id = match.group(1).lower()
+
            if not force_eval and norm_url in self.inventory and "stars" in self.inventory[norm_url]:
                cached = self.inventory[norm_url]
                item.update(cached)
-                if cached.get("hierarchy"): refined.append(item); continue
+                if cached.get("hierarchy"):  # cached entry already classified: register it and skip re-evaluation
+                    if project_id not in project_registry or item.get("stars", 0) > project_registry[project_id].get("stars", 0):
+                        project_registry[project_id] = item
+                    continue
            to_evaluate.append(item)

-        if not to_evaluate: return refined
+        if to_evaluate:
+            for i in range(0, len(to_evaluate), 50):
+                batch = to_evaluate[i:i+50]
+                prompt = (f"{self.library_criteria}\nRespond ONLY JSON: {{\"results\": [{{ \"idx\": int, \"year\": \"YYYY\", \"stars\": 0-5, \"hierarchy\": [\"Area\", \"Topic\", ...], \"summary\": \"...\", \"language\": \"...\", \"type\": \"...\", \"complexity\": \"...\", \"is_microservice\": bool }}, ...]}}\n\nLINKS:\n" + 
+                          "\n".join([f"{idx}. {l['title']} ({l['url']})" for idx, l in enumerate(batch)]))
+                try:
+                    data = await call_gemini_with_retry(prompt, prefer_flash=True)
+                    for res in data.get("results", []):
+                        idx = int(res["idx"])
+                        if idx < len(batch):
+                            item = batch[idx].copy()
+                            norm_url = normalize_url(item["url"])
+                            p_id = norm_url
+                            if "github.com" in norm_url:
+                                m = re.search(r'github\.com/([^/]+/[^/]+)', norm_url)
+                                if m: p_id = m.group(1).lower()

-        for i in range(0, len(to_evaluate), 50):
-            batch = to_evaluate[i:i+50]
-            prompt = (f"{self.library_criteria}\nRespond ONLY JSON: {{\"results\": [{{ \"idx\": int, \"year\": \"YYYY\", \"stars\": 0-5, \"hierarchy\": [\"Area\", \"Topic\", ...], \"summary\": \"...\", \"language\": \"...\", \"type\": \"...\", \"complexity\": \"...\", \"is_microservice\": bool }}, ...]}}\n\nLINKS:\n" + 
-                      "\n".join([f"{idx}. {l['title']} ({l['url']})" for idx, l in enumerate(batch)]))
-            try:
-                data = await call_gemini_with_retry(prompt, prefer_flash=True)
-                for res in data.get("results", []):
-                    idx = int(res["idx"])
-                    if idx < len(batch):
-                        item = batch[idx].copy()
-                        norm_url = normalize_url(item["url"])
-                        eval_data = {
-                            "year": str(res.get("year", "N/A")), "stars": min(max(int(res.get("stars", 0)), 0), 5),
-                            "ai_summary": res.get("summary", ""), "language": res.get("language", "English"),
-                            "resource_type": res.get("type", "Reference"), "complexity": res.get("complexity", "Intermediate"),
-                            "hierarchy": res.get("hierarchy", ["General"]), "is_microservice": bool(res.get("is_microservice", False)),
-                            "status": "online", "tag": self._calculate_tag(item)
-                        }
-                        item.update(eval_data)
-                        self.inventory[norm_url] = eval_data
-                        self.inventory[norm_url]["title"] = item["title"]
-                        refined.append(item)
-            except: 
-                for l in batch: refined.append(l)
-            await asyncio.sleep(0.3)
-        return refined
+                            eval_data = {
+                                "year": str(res.get("year", "N/A")), "stars": min(max(int(res.get("stars", 0)), 0), 5),
+                                "ai_summary": res.get("summary", ""), "language": res.get("language", "English"),
+                                "resource_type": res.get("type", "Reference"), "complexity": res.get("complexity", "Intermediate"),
+                                "hierarchy": res.get("hierarchy", ["General"]), "is_microservice": bool(res.get("is_microservice", False)),
+                                "status": "online", "tag": self._calculate_tag(item)
+                            }
+                            item.update(eval_data)
+                            self.inventory[norm_url] = eval_data
+                            self.inventory[norm_url]["title"] = item["title"]
+                            if p_id not in project_registry or item["stars"] > project_registry[p_id].get("stars", 0):
+                                project_registry[p_id] = item
+                except:  # NOTE(review): bare except also traps asyncio.CancelledError/KeyboardInterrupt and hides the cause; consider `except Exception as e` plus log_event
+                    for l in batch:
+                        # Fallback: keep the batch's items un-evaluated, keyed by normalized URL. NOTE(review): this is not the owner/repo signature used above, so a GitHub item registered before the failure can be added a second time.
+                        u = normalize_url(l["url"])
+                        if u not in project_registry: project_registry[u] = l
+                await asyncio.sleep(0.3)
+        return list(project_registry.values())

    def _calculate_tag(self, item: Dict) -> str:
        stars = item.get("gh_stars", 0)
@@ -246,6 +263,49 @@ class V2VisionEngine:
            v2_structure[dim]["summary"] = self.inventory.get(cache_key, {}).get("ai_summary", f"Strategic reference for {dim}.")
        return v2_structure

+    async def _generate_comparison_table(self, links: List[Dict]) -> str:
+        """Build a collapsible Markdown comparison table for a section's top-rated links.
+
+        Links rated 4 stars or more qualify and at most 12 rows are rendered. When
+        the loaded link rules contain an ``executive_analytics`` section, its
+        ``comparison_tables`` switch and ``min_tools_for_table`` threshold are
+        honoured; without them tables stay enabled with a threshold of 6.
+
+        Args:
+            links: Link dicts with "title" and "url" and, optionally, "stars",
+                "tag", "topic", "hierarchy" and "language".
+
+        Returns:
+            The admonition block, or "" when tables are disabled or too few links
+            qualify.
+        """
+        rules = getattr(self, "link_rules", None) or {}
+        analytics = rules.get("executive_analytics") or {}
+        if not analytics.get("comparison_tables", True): return ""
+        min_tools = int(analytics.get("min_tools_for_table") or 6)
+
+        def cell(value) -> str:
+            # An unescaped pipe or a line break would split the Markdown row.
+            return str(value).replace("|", "\\|").replace("\n", " ").strip()
+
+        # "stars" can be absent or None on items that were never scored.
+        standard_tools = [l for l in links if (l.get("stars") or 0) >= 4]
+        if len(standard_tools) < min_tools: return ""
+        table = "\n??? abstract \"Architect's Technical Comparison Table\"\n"
+        table += "    | Solution | Maturity | Primary Focus | Language | Stars |\n"
+        table += "    | :--- | :--- | :--- | :--- | :--- |\n"
+        for l in standard_tools[:12]:
+            stars = "🌟" * int(l.get("stars") or 0)
+            # Resolve the fallback lazily: as a dict.get default it was evaluated even
+            # when "topic" existed, so an empty hierarchy raised IndexError.
+            focus = l.get("topic") or (l.get("hierarchy") or ["General"])[-1]
+            title = cell(str(l.get("title", "")).replace("==", ""))
+            maturity = cell(str(l.get("tag") or "").replace("[", "").replace("]", ""))
+            language = cell(l.get("language") or "English")
+            url = str(l["url"]).replace("|", "%7C")
+            table += f"    | [{title}]({url}) | {maturity} | {cell(focus)} | {language} | {stars} |\n"
+        return table + "\n"
+
    async def _write_premium_files(self, data: Dict[str, Dict], mosaic_html: str, videos_html: str):
        mosaic_html = mosaic_html.replace('src="images/', 'src="images/').replace('](images/', '](images/')
        trending_pool = sorted([dict(meta, url=url) for url, meta in self.inventory.items() if meta.get("stars", 0) >= 3], key=lambda x: (x.get("pub_date", "0000"), -x.get("stars", 0)), reverse=True)
@@ -266,12 +295,14 @@ class V2VisionEngine:
                toc += f"{' ' * (depth * 4)}- [{name}](#{slug})\n" + gen_toc(subnode, depth + 1, slug)
            return toc

-        def render_node(node, depth, base_slug, is_intro=False):
+        async def render_node(node, depth, base_slug, is_intro=False):
            md = ""
            for name, subnode in sorted(node.items()):
                if name == "__links__": continue
                slug = f"{base_slug}-{name.lower().replace(' ', '-')}"
-                md += f"{'#' * min(6, depth + 2)} {name}\n\n" + render_node(subnode, depth + 1, slug, is_intro)
+                md += f"{'#' * min(6, depth + 2)} {name}\n\n"
+                if depth == 1 and "__links__" in subnode: md += await self._generate_comparison_table(subnode["__links__"])
+                md += await render_node(subnode, depth + 1, slug, is_intro)
            if "__links__" in node:
                for l in node["__links__"]:
                    is_gold = is_intro and l.get("stars", 0) >= 4
@@ -297,14 +328,14 @@ class V2VisionEngine:
                md += f"## {cat}\n\n"
                if cat == "Introduction":
                    md += "!!! quote \"Vision 2026\"\n    The focus shifts to agentic autonomy and hardened security.\n\n### Ecosystem Map\n```mermaid\ngraph TD\n    A[Foundations] --> B[AI & Intelligence]\n    A --> C[Hardened Infra]\n    B --> D[Agentic Curation]\n    C --> E[Enterprise Stability]\n    D --> F[Nubenetes Portal]\n    E --> F\n```\n\n### Gateway Hub\n- 🚀 [Explore AI Dimensions](./ai-and-artificial-intelligence.md)\n- 📦 [Microservices Guide](./microservices.md)\n\n"
-                md += render_node(topics, 0, cat_slug, is_intro=(cat=="Introduction"))
+                md += await render_node(topics, 0, cat_slug, is_intro=(cat=="Introduction"))
            with open(os.path.join(V2_DIR, f"{slug}.md"), "w") as f: f.write(md)

    async def _sync_enterprise_navigation(self, data: Dict[str, Dict]):
        try:
            with open("v2-mkdocs.yml", "r") as f: content = f.read()
            nav = ["nav:", "  - \"The 2026 Vision\": index.md"]
-            for dim in data.keys():
+            for dim in sorted(data.keys()):
                if data[dim]["categories"]:
                    slug = dim.lower().replace(" ", "-").replace("&", "and").replace("(", "").replace(")", "")
                    nav.append(f"  - \"{dim}\": {slug}.md")