feat(ai): implement TOC and structural exceptions for config-heavy files (Mandate 29)

2026-05-24 01:53:45 +00:00 · 2026-05-17 16:25:01 +02:00
parent e8ab3032c7
commit 26abff05af
6 changed files with 268 additions and 40 deletions
--- a/GEMINI.md
+++ b/GEMINI.md
@@ -88,6 +88,8 @@ This file contains the accumulated instructions and long-term vision for the aut
    - **AI Dimension Naming**: Prioritize industry-standard terms (e.g., "AI and Artificial Intelligence" instead of internal jargon) for top-level navigation.
    - **Content Extraction**: High-value sub-topics may be extracted into dedicated documents (e.g., "Microservices" content extracted from `introduction.md` into `microservices.md`) to maintain high-density focus.

+29. **TOC & Structural Exceptions**: Certain files (configuration-heavy files or technical tables such as `mkdocs.md` or `matrix-table.md`) are exempt from TOC and deep-hierarchy requirements. The exempt files are listed under `toc_exempt_files` in [`data/link_rules.yaml`](data/link_rules.yaml). All agents MUST respect these exceptions to avoid unnecessary structural clutter in non-navigational files.
+
 ## 🛠️ Structural Evolution & Navigation
 ...
 *   **No Link Limits**: There are NO hard limits on the number of links per page or per section (##/###). Nubenetes is built to host thousands of references.
--- a/README.md
+++ b/README.md
@@ -57,7 +57,8 @@
 12. [12. Developer Experience and VSCode Setup](#12-developer-experience-and-vscode-setup)
 14. [14. Special Assets and Learning Paths](#14-special-assets-and-learning-paths)
    *   [14.1. Special Assets Management](#141-special-assets-management)
-    *   [14.2. O'Reilly-style Knowledge Architecture](#142-oreilly-style-knowledge-architecture)
+    *   [14.2. O'Reilly-style Knowledge Architecture](#142-oreilly-style-knowledge-architecture)
+    *   [14.3. TOC and Structural Exceptions](#143-toc-and-structural-exceptions)
    *   [12.1. Extension Recommendations](#121-extension-recommendations)
    *   [12.2. Recommended settings.json](#122-recommended-settingsjson)
 13. [13. Repository Inventory and Configuration](#13-repository-inventory-and-configuration)
@@ -724,3 +725,8 @@ The V2 Portal is structured as a sophisticated technical reference guide, moving
 - **Gateway Hub Navigation**: Strategic dimensions are semantically interconnected, with a dedicated **Microservices Guide** extracted for high-density focus.
 - **Structured Assimilation**: Information is grouped into technical Areas, Topics, and Subtopics, facilitating learning from foundational theory to advanced engineering internals.
 - **Contextual Hierarchy**: Every page features an automated, clickable Table of Contents (TOC) with nested anchors for precise technical navigation.
+
+### 📑 TOC and Structural Exceptions
+Certain files are exempt from the mandatory Table of Contents (TOC) and deep-hierarchy requirements. These include configuration-heavy files (e.g., `mkdocs.md`) and large technical tables (e.g., `matrix-table.md`) where a navigational index is unnecessary or distracting.
+- **Automatic Skip**: The Agentic Curator and V2 Builder automatically bypass these files during structural reorganization cycles.
+- **Exception Registry**: Exemptions are managed via the `toc_exempt_files` list in [`data/link_rules.yaml`](data/link_rules.yaml).
--- a/current_safety_guard.py
+++ b/current_safety_guard.py
@@ -0,0 +1,206 @@
+import os
+import yaml
+import re
+from datetime import datetime
+from src.logger import log_event
+from src.gemini_utils import normalize_url
+
+INVENTORY_PATH = "data/inventory.yaml"
+V1_DIR = "docs"
+V2_DIR = "v2-docs"
+SPECIAL_ASSETS_PATH = "data/special_assets.yaml"
+CURATION_SOURCES_PATH = "data/curation_sources.yaml"
+WORKFLOW_PATH = ".github/workflows/agentic_cron.yml"
+
+class SafetyGuard:
+    def __init__(self):
+        self.errors = []
+        self.warnings = []
+        self.inventory = self._load_inventory()
+
+    def _load_inventory(self):
+        if os.path.exists(INVENTORY_PATH):
+            try:
+                with open(INVENTORY_PATH, "r") as f:
+                    return yaml.safe_load(f) or {}
+            except: return {}
+        return {}
+
+    def validate_data_integrity(self, old_inventory: dict):
+        """Mandate 1: Protección de Información."""
+        for url, old_meta in old_inventory.items():
+            new_meta = self.inventory.get(url)
+            if not new_meta: continue
+            if old_meta.get("stars", 0) > new_meta.get("stars", 0):
+                self.errors.append(f"❌ **Star Loss**: `{url}` ({old_meta['stars']} -> {new_meta['stars']})")
+            if old_meta.get("description") and not new_meta.get("description"):
+                self.errors.append(f"❌ **V1 Desc Loss**: `{url}`")
+
+    def validate_semantic_interlinking(self):
+        """Mandate 5: Verificar interconexión semántica en V1."""
+        log_event("[Safety] Auditing Semantic Interlinking...")
+        for url, meta in self.inventory.items():
+            related = meta.get("related_categories", [])
+            for rel_cat in related:
+                path = os.path.join(V1_DIR, f"{rel_cat}.md")
+                if os.path.exists(path):
+                    content = open(path, "r").read()
+                    if url not in content:
+                        self.warnings.append(f"🔗 **Interlink Missing**: `{meta['title']}` should be referenced in `{rel_cat}.md` (See also)")
+
+    def validate_special_assets_completeness(self):
+        """Mandate 27: Inclusión exhaustiva de Activos Especiales en V2."""
+        if not os.path.exists(SPECIAL_ASSETS_PATH) or not os.path.exists(V2_DIR): return
+        
+        with open(SPECIAL_ASSETS_PATH, "r") as f:
+            special = yaml.safe_load(f).get("special_assets", [])
+        
+        for sa in special:
+            if "Include 100%" in sa.get("v2_rule", ""):
+                file_name = sa["file"]
+                v1_path = os.path.join(V1_DIR, file_name)
+                if os.path.exists(v1_path):
+                    v1_links = re.findall(r'\[.*?\]\((https?://.*?)\)', open(v1_path, "r").read())
+                    # Check if these links exist in the inventory marked with this original_file
+                    for link in v1_links:
+                        nu = normalize_url(link)
+                        # We can't easily check if it's in the V2 rendered file here without complex parsing,
+                        # but we can check if it's in the inventory and not dead.
+                        if nu in self.inventory and self.inventory[nu].get("status") == "online":
+                            if not self.inventory[nu].get("v2_locations"):
+                                self.errors.append(f"💎 **Special Asset Leak**: `{link}` from `{file_name}` is missing in V2 portal")
+
+    def validate_mvq_compliance(self):
+        """Mandato 3 & 16: Verificar cumplimiento de MVQ en V2."""
+        for url, meta in self.inventory.items():
+            if "github.com" in url and meta.get("v2_locations"):
+                pushed = meta.get("gh_pushed", "")
+                if pushed:
+                    try:
+                        last_date = datetime.fromisoformat(pushed.replace("Z", "+00:00"))
+                        inactive_years = (datetime.now(last_date.tzinfo) - last_date).days / 365
+                        stars = meta.get("gh_stars", meta.get("stars", 0) * 100) # Fallback estimate
+                        if inactive_years > 4 and stars < 30:
+                            self.warnings.append(f"🏚️ **MVQ Violation**: Stale repo `{url}` (>4yrs) in V2 with low impact")
+                    except: pass
+
+    def validate_linguistic_tagging(self):
+        """Mandate 10: Verificar etiquetado de idioma en V2."""
+        if not os.path.exists(V2_DIR): return
+        for file in os.listdir(V2_DIR):
+            if file.endswith(".md"):
+                content = open(os.path.join(V2_DIR, file), "r").read()
+                # Find links that are non-English in inventory but missing tag in MD
+                for url, meta in self.inventory.items():
+                    lang = meta.get("language", "English")
+                    if lang.lower() != "english" and url in content:
+                        tag = f"[{lang.upper()} CONTENT]"
+                        if tag not in content:
+                            self.warnings.append(f"🌐 **Missing Lang Tag**: `{meta['title']}` in `{file}` needs `{tag}`")
+
+    def validate_platinum_schema(self):
+        """Mandate 22: Validar esquema Platinum en BBDD."""
+        required = ["content_hash", "health_score", "hierarchy", "v1_locations"]
+        new_count = 0
+        for url, meta in self.inventory.items():
+            if url.startswith("INTRO:"): continue
+            missing = [f for f in required if f not in meta]
+            if missing:
+                new_count += 1
+                if new_count < 10: # Don't overwhelm
+                    self.warnings.append(f"🧬 **Schema Incomplete**: `{url}` missing {missing}")
+
+    def validate_v2_architecture(self):
+        """Mandato 28: Estructura O'Reilly y V2 Flat Navigation."""
+        if not os.path.exists(V2_DIR): return
+        for file in os.listdir(V2_DIR):
+            if file.endswith(".md") and file != "index.md":
+                content = open(os.path.join(V2_DIR, file), "r").read()
+                if "## Table of Contents" not in content:
+                    self.errors.append(f"📚 **V2 TOC Missing**: `{file}`")
+                if "### " not in content:
+                    self.errors.append(f"📚 **V2 Too Flat**: `{file}` (Missing H3 subtopics)")
+                if "#### " not in content and "Introduction" not in content:
+                    # Not an error but a recommendation for O'Reilly style
+                    pass
+
+    def validate_navigation_sync(self):
+        """Mandate 11: Sincronización entre Workflow y Config."""
+        if not os.path.exists(WORKFLOW_PATH) or not os.path.exists(CURATION_SOURCES_PATH): return
+        
+        with open(CURATION_SOURCES_PATH, "r") as f:
+            sources = yaml.safe_load(f).get("sources", [])
+        topics = [s["topic"] for s in sources]
+        
+        workflow_content = open(WORKFLOW_PATH, "r").read()
+        # Look for include_XXX inputs
+        # This is a bit loose but helps
+        for topic in topics:
+            # Check if common keywords from topic are in workflow
+            keywords = re.findall(r'\w+', topic.lower())
+            found = False
+            for kw in keywords:
+                if kw in workflow_content.lower():
+                    found = True; break
+            if not found:
+                self.warnings.append(f"🔄 **Sync Warning**: Topic `{topic}` might not be represented in `{WORKFLOW_PATH}` inputs")
+
+    def validate_toc_and_anchors(self):
+        """🛠️ Structural Evolution: TOC Consistency & Lowercase Slugs."""
+        for root, _, files in os.walk(V1_DIR):
+            for file in files:
+                if file.endswith(".md"):
+                    content = open(os.path.join(root, file), "r").read()
+                    if "## Table of Contents" not in content and len(re.findall(r'^## ', content, re.M)) > 2:
+                        self.warnings.append(f"📍 **V1 TOC Missing**: `{file}` has many sections but no TOC")
+                    
+                    # Check anchors in TOC (should be lowercase)
+                    anchors = re.findall(r'\(#([^\)]+)\)', content)
+                    for a in anchors:
+                        if any(c.isupper() for c in a):
+                            self.warnings.append(f"⚓ **Upper Anchor**: `{file}` has non-lowercase anchor `#{a}`")
+
+    def generate_audit_report(self, old_inv_path=None) -> str:
+        """Generates a comprehensive Markdown report based on ALL Mandates."""
+        log_event("[Safety] Executing Full Mandate Audit (GEMINI.md compliance)...")
+        
+        if old_inv_path and os.path.exists(old_inv_path):
+            try:
+                with open(old_inv_path, "r") as f:
+                    self.validate_data_integrity(yaml.safe_load(f) or {})
+            except: pass
+        
+        self.validate_semantic_interlinking()
+        self.validate_special_assets_completeness()
+        self.validate_mvq_compliance()
+        self.validate_linguistic_tagging()
+        self.validate_platinum_schema()
+        self.validate_v2_architecture()
+        self.validate_navigation_sync()
+        self.validate_toc_and_anchors()
+        
+        status = "✅ PASS" if not self.errors else "❌ FAILED"
+        if not self.errors and self.warnings: status = "⚠️ WARNING"
+        
+        report = f"\n## 🛡️ Safety & Mandate Audit: {status}\n"
+        report += f"*Audit executed on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\n"
+        
+        if not self.errors and not self.warnings:
+            report += "✨ **All project mandates from GEMINI.md and technical integrity checks passed successfully.**\n"
+        else:
+            if self.errors:
+                report += "### 🔴 Critical Failures (Mandate Violations)\n"
+                for err in self.errors: report += f"- {err}\n"
+                report += "\n"
+            
+            if self.warnings:
+                report += "### 🟡 Warnings & Recommendations\n"
+                report += "<details><summary>Click to view " + str(len(self.warnings)) + " recommendations</summary>\n\n"
+                for warn in self.warnings: report += f"- {warn}\n"
+                report += "\n> 💡 **Note**: Warnings suggest improvements to align with Nubenetes Excellence standards.\n</details>\n"
+        
+        return report
+
+if __name__ == "__main__":
+    guard = SafetyGuard()
+    print(guard.generate_audit_report())
--- a/data/link_rules.yaml
+++ b/data/link_rules.yaml
@@ -57,6 +57,13 @@ hierarchy_rules:
  naming_convention: "Technical Area > Topic > Subtopics"
  v1_reorganization: true     # Use deep hierarchy to rebuild V1 file sections.
  v2_nesting: true            # Enable H2 -> H3 -> H4 -> H5 mapping in V2.
+  toc_exempt_files:           # Files that do NOT require a Table of Contents (V1/V2).
+    - "mkdocs.md"
+    - "yesterday_mkdocs.yml"
+    - "mkdocs.yml"
+    - "v2-mkdocs.yml"
+    - "matrix-table.md"
+    - "jvm-parameters-matrix-table.md"

 # -----------------------------------------------------------------------------
 # PLATINUM METADATA: ADVANCED LIFECYCLE MANAGEMENT
--- a/src/agentic_curator.py
+++ b/src/agentic_curator.py
@@ -216,20 +216,41 @@ class AgenticCurator:
        return content + f"\n{line}" if "##" in content else content + f"\n\n## Tools and Resources\n{line}"

    async def suggest_reorganization(self):
-        log_event("[*] Starting Internal Reorganization Audit...", section_break=True)
+        log_event("[*] Starting Internal Reorganization & TOC Audit...", section_break=True)
+        # Load Special Assets & Link Rules for exceptions
        special_rules = {}
+        exempt_files = []
        if os.path.exists("data/special_assets.yaml"):
-            try:
-                with open("data/special_assets.yaml", "r") as f: special_rules = {sa["file"]: sa for sa in yaml.safe_load(f).get("special_assets", [])}
+            try: special_rules = {sa["file"]: sa for sa in yaml.safe_load(open("data/special_assets.yaml"))["special_assets"]}
            except: pass
+        if os.path.exists("data/link_rules.yaml"):
+            try: exempt_files = yaml.safe_load(open("data/link_rules.yaml"))["hierarchy_rules"].get("toc_exempt_files", [])
+            except: pass
+
        for file in os.listdir(self.docs_dir):
-            if not file.endswith(".md") or file == "index.md": continue
-            path = os.path.join(self.docs_dir, file); content = open(path, "r").read()
-            is_special = file in special_rules; link_count = len(re.findall(r"^\s*-\s*\[", content, re.MULTILINE))
-            if is_special or (link_count > 25 and len(re.findall(r"^## ", content, re.M)) < 2):
-                log_event(f"  [!] REORGANIZING: {file}")
-                instruction = "SOPHISTICATED O'REILLY HIERARCHY: Create nested sections (##) and subsections (###). Group by technical AREAS, TOPICS, and SUBTOPICS. Preserve all links." if is_special else "Group into logical sections (##)."
-                prompt = f"You act as a Technical Content Architect. Reorganize the file '{file}' based on: {instruction}\nCONTENT:\n{content[:5000]}"
+            if not file.endswith(".md") or file == "index.md" or file in exempt_files: continue
+            path = os.path.join(self.docs_dir, file)
+            with open(path, "r") as f: content = f.read()
+            
+            is_special = file in special_rules
+            link_count = len(re.findall(r"^\s*-\s*\[", content, re.MULTILINE))
+            headers = re.findall(r"^##+ ", content, re.MULTILINE)
+            
+            # --- FEATURE: Automatic TOC Injection for V1 ---
+            if len(headers) >= 3 and "Table of Contents" not in content:
+                log_event(f"  [+] INJECTING TOC: {file}")
+                content = await self._rebuild_toc(content)
+                with open(path, "w") as f: f.write(content)
+            
+            # Reorganize if special OR if flat and large
+            if is_special or (link_count > 25 and len(headers) < 2):
+                log_event(f"  [!] REORGANIZING: {file} ({'Special' if is_special else 'Standard'})")
+                instruction = (
+                    "SOPHISTICATED O'REILLY HIERARCHY: Create nested sections (##) and subsections (###). "
+                    "Group links by technical AREAS, TOPICS, and SUBTOPICS. Preserve all links."
+                    if is_special else "Group into logical sections (##)."
+                )
+                prompt = f"You act as a Technical Content Architect. Reorganize '{file}' based on: {instruction}\nCONTENT:\n{content[:5000]}"
                try:
                    reorg = await call_gemini_with_retry(prompt, response_format="text", prefer_flash=True)
                    if len(reorg) > len(content) * 0.7:
--- a/src/safety_guard.py
+++ b/src/safety_guard.py
@@ -26,6 +26,13 @@ class SafetyGuard:
            except: return {}
        return {}

+    def _load_exempt_files(self):
+        try:
+            with open("data/link_rules.yaml", "r") as f:
+                rules = yaml.safe_load(f)
+                return rules.get("hierarchy_rules", {}).get("toc_exempt_files", [])
+        except: return []
+
    def validate_data_integrity(self, old_inventory: dict):
        """Mandate 1: Protección de Información."""
        for url, old_meta in old_inventory.items():
@@ -61,11 +68,8 @@ class SafetyGuard:
                v1_path = os.path.join(V1_DIR, file_name)
                if os.path.exists(v1_path):
                    v1_links = re.findall(r'\[.*?\]\((https?://.*?)\)', open(v1_path, "r").read())
-                    # Check if these links exist in the inventory marked with this original_file
                    for link in v1_links:
                        nu = normalize_url(link)
-                        # We can't easily check if it's in the V2 rendered file here without complex parsing,
-                        # but we can check if it's in the inventory and not dead.
                        if nu in self.inventory and self.inventory[nu].get("status") == "online":
                            if not self.inventory[nu].get("v2_locations"):
                                self.errors.append(f"💎 **Special Asset Leak**: `{link}` from `{file_name}` is missing in V2 portal")
@@ -79,7 +83,7 @@ class SafetyGuard:
                    try:
                        last_date = datetime.fromisoformat(pushed.replace("Z", "+00:00"))
                        inactive_years = (datetime.now(last_date.tzinfo) - last_date).days / 365
-                        stars = meta.get("gh_stars", meta.get("stars", 0) * 100) # Fallback estimate
+                        stars = meta.get("gh_stars", meta.get("stars", 0) * 100) 
                        if inactive_years > 4 and stars < 30:
                            self.warnings.append(f"🏚️ **MVQ Violation**: Stale repo `{url}` (>4yrs) in V2 with low impact")
                    except: pass
@@ -90,7 +94,6 @@ class SafetyGuard:
        for file in os.listdir(V2_DIR):
            if file.endswith(".md"):
                content = open(os.path.join(V2_DIR, file), "r").read()
-                # Find links that are non-English in inventory but missing tag in MD
                for url, meta in self.inventory.items():
                    lang = meta.get("language", "English")
                    if lang.lower() != "english" and url in content:
@@ -107,54 +110,45 @@ class SafetyGuard:
            missing = [f for f in required if f not in meta]
            if missing:
                new_count += 1
-                if new_count < 10: # Don't overwhelm
+                if new_count < 10:
                    self.warnings.append(f"🧬 **Schema Incomplete**: `{url}` missing {missing}")

    def validate_v2_architecture(self):
        """Mandato 28: Estructura O'Reilly y V2 Flat Navigation."""
        if not os.path.exists(V2_DIR): return
+        exempt_files = self._load_exempt_files()
        for file in os.listdir(V2_DIR):
            if file.endswith(".md") and file != "index.md":
+                if file in exempt_files: continue
                content = open(os.path.join(V2_DIR, file), "r").read()
                if "## Table of Contents" not in content:
                    self.errors.append(f"📚 **V2 TOC Missing**: `{file}`")
                if "### " not in content:
                    self.errors.append(f"📚 **V2 Too Flat**: `{file}` (Missing H3 subtopics)")
-                if "#### " not in content and "Introduction" not in content:
-                    # Not an error but a recommendation for O'Reilly style
-                    pass

    def validate_navigation_sync(self):
        """Mandate 11: Sincronización entre Workflow y Config."""
        if not os.path.exists(WORKFLOW_PATH) or not os.path.exists(CURATION_SOURCES_PATH): return
-        
        with open(CURATION_SOURCES_PATH, "r") as f:
            sources = yaml.safe_load(f).get("sources", [])
        topics = [s["topic"] for s in sources]
-        
        workflow_content = open(WORKFLOW_PATH, "r").read()
-        # Look for include_XXX inputs
-        # This is a bit loose but helps
        for topic in topics:
-            # Check if common keywords from topic are in workflow
            keywords = re.findall(r'\w+', topic.lower())
-            found = False
-            for kw in keywords:
-                if kw in workflow_content.lower():
-                    found = True; break
+            found = any(kw in workflow_content.lower() for kw in keywords)
            if not found:
                self.warnings.append(f"🔄 **Sync Warning**: Topic `{topic}` might not be represented in `{WORKFLOW_PATH}` inputs")

    def validate_toc_and_anchors(self):
        """🛠️ Structural Evolution: TOC Consistency & Lowercase Slugs."""
+        exempt_files = self._load_exempt_files()
        for root, _, files in os.walk(V1_DIR):
            for file in files:
                if file.endswith(".md"):
+                    if file in exempt_files: continue
                    content = open(os.path.join(root, file), "r").read()
                    if "## Table of Contents" not in content and len(re.findall(r'^## ', content, re.M)) > 2:
                        self.warnings.append(f"📍 **V1 TOC Missing**: `{file}` has many sections but no TOC")
-                    
-                    # Check anchors in TOC (should be lowercase)
                    anchors = re.findall(r'\(#([^\)]+)\)', content)
                    for a in anchors:
                        if any(c.isupper() for c in a):
@@ -163,13 +157,11 @@ class SafetyGuard:
    def generate_audit_report(self, old_inv_path=None) -> str:
        """Generates a comprehensive Markdown report based on ALL Mandates."""
        log_event("[Safety] Executing Full Mandate Audit (GEMINI.md compliance)...")
-        
        if old_inv_path and os.path.exists(old_inv_path):
            try:
                with open(old_inv_path, "r") as f:
                    self.validate_data_integrity(yaml.safe_load(f) or {})
            except: pass
-        
        self.validate_semantic_interlinking()
        self.validate_special_assets_completeness()
        self.validate_mvq_compliance()
@@ -178,13 +170,9 @@ class SafetyGuard:
        self.validate_v2_architecture()
        self.validate_navigation_sync()
        self.validate_toc_and_anchors()
-        
        status = "✅ PASS" if not self.errors else "❌ FAILED"
        if not self.errors and self.warnings: status = "⚠️ WARNING"
-        
-        report = f"\n## 🛡️ Safety & Mandate Audit: {status}\n"
-        report += f"*Audit executed on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\n"
-        
+        report = f"\n## 🛡️ Safety & Mandate Audit: {status}\n*Audit executed on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\n"
        if not self.errors and not self.warnings:
            report += "✨ **All project mandates from GEMINI.md and technical integrity checks passed successfully.**\n"
        else:
@@ -192,13 +180,11 @@ class SafetyGuard:
                report += "### 🔴 Critical Failures (Mandate Violations)\n"
                for err in self.errors: report += f"- {err}\n"
                report += "\n"
-            
            if self.warnings:
                report += "### 🟡 Warnings & Recommendations\n"
                report += "<details><summary>Click to view " + str(len(self.warnings)) + " recommendations</summary>\n\n"
                for warn in self.warnings: report += f"- {warn}\n"
                report += "\n> 💡 **Note**: Warnings suggest improvements to align with Nubenetes Excellence standards.\n</details>\n"
-        
        return report

 if __name__ == "__main__":