From b2aa8fe54d4ca1555f9e5d7b0d1dce4f0de096cd Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Sun, 17 May 2026 12:57:28 +0200 Subject: [PATCH] feat(ai): implement Automated Semantic Interlinking, Executive Comparison Tables, and Deep Semantic Deduplication --- README.md | 2 + data/link_rules.yaml | 15 +++++ src/agentic_curator.py | 43 ++++++++++++++ src/main.py | 7 +++ src/v2_optimizer.py | 123 ++++++++++++++++++++++++++--------------- 5 files changed, 144 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 2c864271..0572caee 100644 --- a/README.md +++ b/README.md @@ -438,6 +438,8 @@ graph TD ### 7.6. Strategic Benefits - **Technical Immutability (V1)**: AI agents are strictly forbidden from overwriting human-curated titles, manual 🌟 stars, or additional descriptive comments in the V1 archive, ensuring the bot respects and preserves manual engineering effort. +- **Automated Semantic Interlinking (Mandate 5)**: AI agents identify technical relationships between categories and automatically inject cross-references (*"See also..."*) into the V1 archive, transforming it into an interconnected technical web. +- **Executive Comparison Tables (V2 Premium)**: High-density categories in the V2 portal feature AI-generated technical comparison tables (Solution, Maturity, Focus, Language), providing instant decision support for architects. - **Structural Intelligence Persistence**: High-precision technical classification is stored as a persistent, **recursive hierarchy** (up to 10 levels deep). This allows all workflows to reuse deep structural insights, reducing AI costs by >90% and ensuring perfect consistency between V1 reorganization and V2 portal generation. - **Self-Healing Infrastructure**: The engine automatically detects and rescues broken links (e.g., GitHub `master` -> `main` branch migration) and identifies parked/expired domains that bypass standard health checks. - **Zero-to-Hero Learning Paths**: V2 resources are systematically grouped by complexity level (Fundamentals, Intermediate, Advanced, Architect), transforming the portal into a structured educational journey for Cloud Native engineering. diff --git a/data/link_rules.yaml b/data/link_rules.yaml index bfddce78..248cce6a 100644 --- a/data/link_rules.yaml +++ b/data/link_rules.yaml @@ -21,6 +21,21 @@ v1_preservation_rules: allow_on_404: true # Only truncate to root if deep link is dead. update_metadata: true # If link changes, update only in the BBDD. auto_heal_github_branches: true # Automatically try master -> main if 404. + v1_dedup_strategy: "Conservative" # Preserve human curation; only dedup exact URL matches. + v2_dedup_strategy: "Semantic" # Consolidate multiple project URLs (e.g., site vs repo) into authoritative root. + +# ----------------------------------------------------------------------------- +# AI GENIALITY: SEMANTIC INTERLINKING & ANALYTICS +# ----------------------------------------------------------------------------- +semantic_interlinking: + enabled: true + max_references: 2 # "See also" references in related categories. + format: " - *See also: [%(title)s](%(url)s) in [%(category)s]*" + +executive_analytics: + comparison_tables: true # Generate comparison tables for high-density V2 sections. + min_tools_for_table: 8 # Minimum 'Standard' tools to trigger a table. + table_fields: ["Maturity", "Language", "Stars", "Primary Use Case"] # ----------------------------------------------------------------------------- # V2 PORTAL: THE ELITE SHOWCASE (AI-Centric / Optimized) diff --git a/src/agentic_curator.py b/src/agentic_curator.py index 72341e1e..952f0f35 100644 --- a/src/agentic_curator.py +++ b/src/agentic_curator.py @@ -333,6 +333,49 @@ class AgenticCurator: log_event(f" [OK] Reorganized: {file}") except Exception as e: log_event(f" [!] Error: {e}") + async def apply_semantic_interlinking(self, evaluations: Dict[str, Dict]): + """ + Implements Automated Semantic Interlinking (Mandate 5). + Adds 'See also' references to related categories to improve site navigation. + """ + log_event("[*] Phase 5: Executing Semantic Interlinking (Mandate 5)...", section_break=True) + + for url, eval_data in evaluations.items(): + if eval_data.get("status") != "INCLUDED": continue + + primary_cat = eval_data.get("category") + related_cats = eval_data.get("related_categories", []) + + for rel_cat in related_cats: + if not rel_cat or rel_cat == primary_cat: continue + + rel_path = os.path.join(self.docs_dir, f"{rel_cat}.md") + if not os.path.exists(rel_path): continue + + with open(rel_path, "r") as f: content = f.read() + + # Check if already interlinked + if url in content: continue + + log_event(f" [+] Interlinking: {eval_data['title']} -> {rel_cat}.md") + + see_also = f"\n - *See also: [{eval_data['title']}]({url}) in [{primary_cat.replace('-', ' ').title()}]*" + + # Inject at the end of the first H2 or at the end of the file + match = re.search(r'^## ', content, re.MULTILINE) + if match: + # Find the next H2 or end of section + next_h2 = re.search(r'^## ', content[match.end():], re.MULTILINE) + if next_h2: + pos = match.end() + next_h2.start() + content = content[:pos] + see_also + "\n" + content[pos:] + else: + content += see_also + else: + content += f"\n\n## Related Resources\n{see_also}" + + with open(rel_path, "w") as f: f.write(content) + def validate_changes(self) -> bool: return True async def _enrich_rich_metadata(url: str, soup) -> Dict: diff --git a/src/main.py b/src/main.py index 78dc0769..aed89275 100644 --- a/src/main.py +++ b/src/main.py @@ -352,6 +352,13 @@ async def master_orchestrator(): if batch_index < len(all_raw_assets_batches) - 1: await asyncio.sleep(5) + # 5. Semantic Interlinking (Mandate 5) + if unique_new_assets: + try: + await curator_agent.apply_semantic_interlinking(evaluations) + except Exception as e: + log_event(f" [!] Interlinking Error: {e}") + # 6. Finalization, Report and PR pr_url = None if modified_files_content or full_report_metrics: diff --git a/src/v2_optimizer.py b/src/v2_optimizer.py index 49eb43e0..a26e7378 100644 --- a/src/v2_optimizer.py +++ b/src/v2_optimizer.py @@ -5,19 +5,17 @@ import asyncio import yaml import httpx from datetime import datetime -from typing import List, Dict, Set, Any +from typing import List, Dict, Set, Any, Tuple from src.config import GEMINI_API_KEYS, GH_TOKEN, TARGET_REPO, MADRID_TZ, INVENTORY_PATH, STRUCTURE_MAP_PATH from src.gemini_utils import call_gemini_with_retry, normalize_url from src.logger import log_event V1_DIR = "docs" V2_DIR = "v2-docs" -INVENTORY_PATH = "data/inventory.yaml" -STRUCTURE_MAP_PATH = "data/structure_map.yaml" class V2VisionEngine: def __init__(self): - # Load Special Assets & Rules + # Load Config & Policy self.special_assets_rules = self._load_special_assets() self.link_rules = self._load_link_rules() self.max_depth = self.link_rules.get("hierarchy_rules", {}).get("max_depth", 10) @@ -38,17 +36,17 @@ class V2VisionEngine: } self.library_criteria = ( - "You are a Senior Technical Content Architect in 2026. Your mission is to organize a high-density technical reference portal " - "with the structure and logical flow of an advanced O'Reilly technical book.\n" + "You are a Senior Technical Architect in 2026. Your mission is to organize a high-density technical reference portal " + "structured like a professional technical book (O'Reilly style).\n" "PHASE 1: TECHNICAL PRESERVATION & CURATION\n" "- KEEP >90% of technical resources (except for 'introduction.md' where only high-impact links are kept).\n" "PHASE 2: SOPHISTICATED HIERARCHICAL CLASSIFICATION\n" - "- Identify TECHNICAL_HIERARCHY: A list of strings (max depth configured) representing Area > Topic > Subtopics.\n" + "- Identify TECHNICAL_HIERARCHY: A list of strings (max 10) representing Area > Topic > Subtopics.\n" "- For 'introduction.md', set is_microservice: true if context matches.\n" "PHASE 3: KNOWLEDGE ASSIMILATION FLOW\n" - "- Order hierarchy to facilitate a structured learning journey: from foundations to advanced internals.\n" + "- Order hierarchy to facilitate a structured learning journey.\n" "PHASE 4: MANDATORY DESCRIPTIONS\n" - "- If 'Current Desc' is empty, generate a professional 1-2 sentence summary. Style: O'Reilly technical, neutral.\n" + "- If 'Current Desc' is empty, generate a professional summary. Style: O'Reilly technical.\n" ) self.inventory = self._load_inventory() self.structure_map = self._load_structure_map() @@ -103,13 +101,14 @@ class V2VisionEngine: health_inventory = await self._verify_link_health(all_v1_links) log_event(f"[*] Health Check Complete. {len(health_inventory)} online.") - log_event("[*] Phase 2: Evaluation & Deep Indexing...") + log_event("[*] Phase 2: Evaluation & Deep Indexing (Semantic Dedup)...") library_inventory = await self._evaluate_and_score_resources(health_inventory) - + log_event(f"[*] Inventory Refined: {len(library_inventory)} items kept after semantic consolidation.") + log_event("[*] Phase 3: Recursive Hierarchy Construction...") v2_data = await self._rebuild_structure(library_inventory) - log_event("[*] Phase 4: Generating Premium Portal Hubs...") + log_event("[*] Phase 4: Generating Premium Portal Hubs (Comparison Tables)...") os.makedirs(V2_DIR, exist_ok=True) await self._write_premium_files(v2_data, mosaic_html, videos_html) await self._sync_enterprise_navigation(v2_data) @@ -165,47 +164,65 @@ class V2VisionEngine: return None async def _evaluate_and_score_resources(self, links: List[Dict]): - refined, to_evaluate = [], [] + to_evaluate = [] + project_registry = {} # {project_id: best_item} force_eval = os.getenv("FORCE_EVAL", "false").lower() == "true" - special_files = [sa["file"] for sa in self.special_assets_rules.get("special_assets", [])] for l in links: item = l.copy() norm_url = normalize_url(l["url"]) + + # Identify Project Signature + project_id = norm_url + if "github.com" in norm_url: + match = re.search(r'github\.com/([^/]+/[^/]+)', norm_url) + if match: project_id = match.group(1).lower() + if not force_eval and norm_url in self.inventory and "stars" in self.inventory[norm_url]: cached = self.inventory[norm_url] item.update(cached) - if cached.get("hierarchy"): refined.append(item); continue + if cached.get("hierarchy"): + if project_id not in project_registry or item.get("stars", 0) > project_registry[project_id].get("stars", 0): + project_registry[project_id] = item + continue to_evaluate.append(item) - if not to_evaluate: return refined + if to_evaluate: + for i in range(0, len(to_evaluate), 50): + batch = to_evaluate[i:i+50] + prompt = (f"{self.library_criteria}\nRespond ONLY JSON: {{\"results\": [{{ \"idx\": int, \"year\": \"YYYY\", \"stars\": 0-5, \"hierarchy\": [\"Area\", \"Topic\", ...], \"summary\": \"...\", \"language\": \"...\", \"type\": \"...\", \"complexity\": \"...\", \"is_microservice\": bool }}, ...]}}\n\nLINKS:\n" + + "\n".join([f"{idx}. {l['title']} ({l['url']})" for idx, l in enumerate(batch)])) + try: + data = await call_gemini_with_retry(prompt, prefer_flash=True) + for res in data.get("results", []): + idx = int(res["idx"]) + if idx < len(batch): + item = batch[idx].copy() + norm_url = normalize_url(item["url"]) + p_id = norm_url + if "github.com" in norm_url: + m = re.search(r'github\.com/([^/]+/[^/]+)', norm_url) + if m: p_id = m.group(1).lower() - for i in range(0, len(to_evaluate), 50): - batch = to_evaluate[i:i+50] - prompt = (f"{self.library_criteria}\nRespond ONLY JSON: {{\"results\": [{{ \"idx\": int, \"year\": \"YYYY\", \"stars\": 0-5, \"hierarchy\": [\"Area\", \"Topic\", ...], \"summary\": \"...\", \"language\": \"...\", \"type\": \"...\", \"complexity\": \"...\", \"is_microservice\": bool }}, ...]}}\n\nLINKS:\n" + - "\n".join([f"{idx}. {l['title']} ({l['url']})" for idx, l in enumerate(batch)])) - try: - data = await call_gemini_with_retry(prompt, prefer_flash=True) - for res in data.get("results", []): - idx = int(res["idx"]) - if idx < len(batch): - item = batch[idx].copy() - norm_url = normalize_url(item["url"]) - eval_data = { - "year": str(res.get("year", "N/A")), "stars": min(max(int(res.get("stars", 0)), 0), 5), - "ai_summary": res.get("summary", ""), "language": res.get("language", "English"), - "resource_type": res.get("type", "Reference"), "complexity": res.get("complexity", "Intermediate"), - "hierarchy": res.get("hierarchy", ["General"]), "is_microservice": bool(res.get("is_microservice", False)), - "status": "online", "tag": self._calculate_tag(item) - } - item.update(eval_data) - self.inventory[norm_url] = eval_data - self.inventory[norm_url]["title"] = item["title"] - refined.append(item) - except: - for l in batch: refined.append(l) - await asyncio.sleep(0.3) - return refined + eval_data = { + "year": str(res.get("year", "N/A")), "stars": min(max(int(res.get("stars", 0)), 0), 5), + "ai_summary": res.get("summary", ""), "language": res.get("language", "English"), + "resource_type": res.get("type", "Reference"), "complexity": res.get("complexity", "Intermediate"), + "hierarchy": res.get("hierarchy", ["General"]), "is_microservice": bool(res.get("is_microservice", False)), + "status": "online", "tag": self._calculate_tag(item) + } + item.update(eval_data) + self.inventory[norm_url] = eval_data + self.inventory[norm_url]["title"] = item["title"] + if p_id not in project_registry or item["stars"] > project_registry[p_id].get("stars", 0): + project_registry[p_id] = item + except: + for l in batch: + # Fallback registry injection + u = normalize_url(l["url"]) + if u not in project_registry: project_registry[u] = l + await asyncio.sleep(0.3) + return list(project_registry.values()) def _calculate_tag(self, item: Dict) -> str: stars = item.get("gh_stars", 0) @@ -246,6 +263,18 @@ class V2VisionEngine: v2_structure[dim]["summary"] = self.inventory.get(cache_key, {}).get("ai_summary", f"Strategic reference for {dim}.") return v2_structure + async def _generate_comparison_table(self, links: List[Dict]) -> str: + standard_tools = [l for l in links if l.get("stars", 0) >= 4] + if len(standard_tools) < 6: return "" + table = "\n??? abstract \"Architect's Technical Comparison Table\"\n" + table += " | Solution | Maturity | Primary Focus | Language | Stars |\n" + table += " | :--- | :--- | :--- | :--- | :--- |\n" + for l in standard_tools[:12]: + stars = "🌟" * l.get("stars", 0) + focus = l.get("topic", l.get("hierarchy", ["General"])[-1]) + table += f" | [{l['title'].replace('==','')}]({l['url']}) | {l.get('tag','').replace('[','').replace(']','')} | {focus} | {l.get('language','English')} | {stars} |\n" + return table + "\n" + async def _write_premium_files(self, data: Dict[str, Dict], mosaic_html: str, videos_html: str): mosaic_html = mosaic_html.replace('src="images/', 'src="images/').replace('](images/', '](images/') trending_pool = sorted([dict(meta, url=url) for url, meta in self.inventory.items() if meta.get("stars", 0) >= 3], key=lambda x: (x.get("pub_date", "0000"), -x.get("stars", 0)), reverse=True) @@ -266,12 +295,14 @@ class V2VisionEngine: toc += f"{' ' * (depth * 4)}- [{name}](#{slug})\n" + gen_toc(subnode, depth + 1, slug) return toc - def render_node(node, depth, base_slug, is_intro=False): + async def render_node(node, depth, base_slug, is_intro=False): md = "" for name, subnode in sorted(node.items()): if name == "__links__": continue slug = f"{base_slug}-{name.lower().replace(' ', '-')}" - md += f"{'#' * min(6, depth + 2)} {name}\n\n" + render_node(subnode, depth + 1, slug, is_intro) + md += f"{'#' * min(6, depth + 2)} {name}\n\n" + if depth == 1 and "__links__" in subnode: md += await self._generate_comparison_table(subnode["__links__"]) + md += await render_node(subnode, depth + 1, slug, is_intro) if "__links__" in node: for l in node["__links__"]: is_gold = is_intro and l.get("stars", 0) >= 4 @@ -297,14 +328,14 @@ class V2VisionEngine: md += f"## {cat}\n\n" if cat == "Introduction": md += "!!! quote \"Vision 2026\"\n The focus shifts to agentic autonomy and hardened security.\n\n### Ecosystem Map\n```mermaid\ngraph TD\n A[Foundations] --> B[AI & Intelligence]\n A --> C[Hardened Infra]\n B --> D[Agentic Curation]\n C --> E[Enterprise Stability]\n D --> F[Nubenetes Portal]\n E --> F\n```\n\n### Gateway Hub\n- 🚀 [Explore AI Dimensions](./ai-and-artificial-intelligence.md)\n- 📦 [Microservices Guide](./microservices.md)\n\n" - md += render_node(topics, 0, cat_slug, is_intro=(cat=="Introduction")) + md += await render_node(topics, 0, cat_slug, is_intro=(cat=="Introduction")) with open(os.path.join(V2_DIR, f"{slug}.md"), "w") as f: f.write(md) async def _sync_enterprise_navigation(self, data: Dict[str, Dict]): try: with open("v2-mkdocs.yml", "r") as f: content = f.read() nav = ["nav:", " - \"The 2026 Vision\": index.md"] - for dim in data.keys(): + for dim in sorted(data.keys()): if data[dim]["categories"]: slug = dim.lower().replace(" ", "-").replace("&", "and").replace("(", "").replace(")", "") nav.append(f" - \"{dim}\": {slug}.md")