From 26abff05af835f07d9caf5fc6c5ae410293ec4a9 Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Sun, 17 May 2026 16:25:01 +0200 Subject: [PATCH] feat(ai): implement TOC and structural exceptions for config-heavy files (Mandate 29) --- GEMINI.md | 2 + README.md | 8 +- current_safety_guard.py | 206 ++++++++++++++++++++++++++++++++++++++++ data/link_rules.yaml | 7 ++ src/agentic_curator.py | 41 ++++++-- src/safety_guard.py | 44 +++------ 6 files changed, 268 insertions(+), 40 deletions(-) create mode 100644 current_safety_guard.py diff --git a/GEMINI.md b/GEMINI.md index e41bd74b..58660891 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -88,6 +88,8 @@ This file contains the accumulated instructions and long-term vision for the aut - **AI Dimension Naming**: Prioritize industry-standard terms (e.g., "AI and Artificial Intelligence" instead of internal jargon) for top-level navigation. - **Content Extraction**: High-value sub-topics may be extracted into dedicated documents (e.g., "Microservices" content extracted from `introduction.md` into `microservices.md`) to maintain high-density focus. +29. **TOC & Structural Exceptions**: Certain files (configuration-heavy or technical tables like `mkdocs.md` or `matrix-table.md`) are exempt from TOC and deep-hierarchy requirements. These exceptions MUST be respected by all agents to avoid unnecessary structural clutter in non-navigational files as defined in [`data/link_rules.yaml`](data/link_rules.yaml). + ## 🛠️ Structural Evolution & Navigation ... * **No Link Limits**: There are NO hard limits on the number of links per page or per section (##/###). Nubenetes is built to host thousands of references. diff --git a/README.md b/README.md index 674dae36..1ca43cb5 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,8 @@ 12. [12. Developer Experience and VSCode Setup](#12-developer-experience-and-vscode-setup) 14. [14. Special Assets and Learning Paths](#14-special-assets-and-learning-paths) * [14.1. Special Assets Management](#141-special-assets-management) - * [14.2. O'Reilly-style Knowledge Architecture](#142-oreilly-style-knowledge-architecture) + * [14.2. O.Reilly-style Knowledge Architecture](#142-oreilly-style-knowledge-architecture) + * [14.3. TOC and Structural Exceptions](#143-toc-and-structural-exceptions) * [12.1. Extension Recommendations](#121-extension-recommendations) * [12.2. Recommended settings.json](#122-recommended-settingsjson) 13. [13. Repository Inventory and Configuration](#13-repository-inventory-and-configuration) @@ -724,3 +725,8 @@ The V2 Portal is structured as a sophisticated technical reference guide, moving - **Gateway Hub Navigation**: Strategic dimensions are semantically interconnected, with a dedicated **Microservices Guide** extracted for high-density focus. - **Structured Assimilation**: Information is grouped into technical Areas, Topics, and Subtopics, facilitating learning from foundational theory to advanced engineering internals. - **Contextual Hierarchy**: Every page features an automated, clickable Table of Contents (TOC) with nested anchors for precise technical navigation. + +### 📑 TOC and Structural Exceptions +Certain files are exempt from the mandatory Table of Contents (TOC) and deep-hierarchy requirements. These include configuration-heavy files (e.g., `mkdocs.md`) and large technical tables (e.g., `matrix-table.md`) where a navigational index is unnecessary or distracting. +- **Automatic Skip**: The Agentic Curator and V2 Builder automatically bypass these files during structural reorganization cycles. +- **Exception Registry**: Exemptions are managed via the `toc_exempt_files` list in [`data/link_rules.yaml`](data/link_rules.yaml). diff --git a/current_safety_guard.py b/current_safety_guard.py new file mode 100644 index 00000000..be177854 --- /dev/null +++ b/current_safety_guard.py @@ -0,0 +1,206 @@ +import os +import yaml +import re +from datetime import datetime +from src.logger import log_event +from src.gemini_utils import normalize_url + +INVENTORY_PATH = "data/inventory.yaml" +V1_DIR = "docs" +V2_DIR = "v2-docs" +SPECIAL_ASSETS_PATH = "data/special_assets.yaml" +CURATION_SOURCES_PATH = "data/curation_sources.yaml" +WORKFLOW_PATH = ".github/workflows/agentic_cron.yml" + +class SafetyGuard: + def __init__(self): + self.errors = [] + self.warnings = [] + self.inventory = self._load_inventory() + + def _load_inventory(self): + if os.path.exists(INVENTORY_PATH): + try: + with open(INVENTORY_PATH, "r") as f: + return yaml.safe_load(f) or {} + except: return {} + return {} + + def validate_data_integrity(self, old_inventory: dict): + """Mandate 1: Protección de Información.""" + for url, old_meta in old_inventory.items(): + new_meta = self.inventory.get(url) + if not new_meta: continue + if old_meta.get("stars", 0) > new_meta.get("stars", 0): + self.errors.append(f"❌ **Star Loss**: `{url}` ({old_meta['stars']} -> {new_meta['stars']})") + if old_meta.get("description") and not new_meta.get("description"): + self.errors.append(f"❌ **V1 Desc Loss**: `{url}`") + + def validate_semantic_interlinking(self): + """Mandate 5: Verificar interconexión semántica en V1.""" + log_event("[Safety] Auditing Semantic Interlinking...") + for url, meta in self.inventory.items(): + related = meta.get("related_categories", []) + for rel_cat in related: + path = os.path.join(V1_DIR, f"{rel_cat}.md") + if os.path.exists(path): + content = open(path, "r").read() + if url not in content: + self.warnings.append(f"🔗 **Interlink Missing**: `{meta['title']}` should be referenced in `{rel_cat}.md` (See also)") + + def validate_special_assets_completeness(self): + """Mandate 27: Inclusión exhaustiva de Activos Especiales en V2.""" + if not os.path.exists(SPECIAL_ASSETS_PATH) or not os.path.exists(V2_DIR): return + + with open(SPECIAL_ASSETS_PATH, "r") as f: + special = yaml.safe_load(f).get("special_assets", []) + + for sa in special: + if "Include 100%" in sa.get("v2_rule", ""): + file_name = sa["file"] + v1_path = os.path.join(V1_DIR, file_name) + if os.path.exists(v1_path): + v1_links = re.findall(r'\[.*?\]\((https?://.*?)\)', open(v1_path, "r").read()) + # Check if these links exist in the inventory marked with this original_file + for link in v1_links: + nu = normalize_url(link) + # We can't easily check if it's in the V2 rendered file here without complex parsing, + # but we can check if it's in the inventory and not dead. + if nu in self.inventory and self.inventory[nu].get("status") == "online": + if not self.inventory[nu].get("v2_locations"): + self.errors.append(f"💎 **Special Asset Leak**: `{link}` from `{file_name}` is missing in V2 portal") + + def validate_mvq_compliance(self): + """Mandato 3 & 16: Verificar cumplimiento de MVQ en V2.""" + for url, meta in self.inventory.items(): + if "github.com" in url and meta.get("v2_locations"): + pushed = meta.get("gh_pushed", "") + if pushed: + try: + last_date = datetime.fromisoformat(pushed.replace("Z", "+00:00")) + inactive_years = (datetime.now(last_date.tzinfo) - last_date).days / 365 + stars = meta.get("gh_stars", meta.get("stars", 0) * 100) # Fallback estimate + if inactive_years > 4 and stars < 30: + self.warnings.append(f"🏚️ **MVQ Violation**: Stale repo `{url}` (>4yrs) in V2 with low impact") + except: pass + + def validate_linguistic_tagging(self): + """Mandate 10: Verificar etiquetado de idioma en V2.""" + if not os.path.exists(V2_DIR): return + for file in os.listdir(V2_DIR): + if file.endswith(".md"): + content = open(os.path.join(V2_DIR, file), "r").read() + # Find links that are non-English in inventory but missing tag in MD + for url, meta in self.inventory.items(): + lang = meta.get("language", "English") + if lang.lower() != "english" and url in content: + tag = f"[{lang.upper()} CONTENT]" + if tag not in content: + self.warnings.append(f"🌐 **Missing Lang Tag**: `{meta['title']}` in `{file}` needs `{tag}`") + + def validate_platinum_schema(self): + """Mandate 22: Validar esquema Platinum en BBDD.""" + required = ["content_hash", "health_score", "hierarchy", "v1_locations"] + new_count = 0 + for url, meta in self.inventory.items(): + if url.startswith("INTRO:"): continue + missing = [f for f in required if f not in meta] + if missing: + new_count += 1 + if new_count < 10: # Don't overwhelm + self.warnings.append(f"🧬 **Schema Incomplete**: `{url}` missing {missing}") + + def validate_v2_architecture(self): + """Mandato 28: Estructura O'Reilly y V2 Flat Navigation.""" + if not os.path.exists(V2_DIR): return + for file in os.listdir(V2_DIR): + if file.endswith(".md") and file != "index.md": + content = open(os.path.join(V2_DIR, file), "r").read() + if "## Table of Contents" not in content: + self.errors.append(f"📚 **V2 TOC Missing**: `{file}`") + if "### " not in content: + self.errors.append(f"📚 **V2 Too Flat**: `{file}` (Missing H3 subtopics)") + if "#### " not in content and "Introduction" not in content: + # Not an error but a recommendation for O'Reilly style + pass + + def validate_navigation_sync(self): + """Mandate 11: Sincronización entre Workflow y Config.""" + if not os.path.exists(WORKFLOW_PATH) or not os.path.exists(CURATION_SOURCES_PATH): return + + with open(CURATION_SOURCES_PATH, "r") as f: + sources = yaml.safe_load(f).get("sources", []) + topics = [s["topic"] for s in sources] + + workflow_content = open(WORKFLOW_PATH, "r").read() + # Look for include_XXX inputs + # This is a bit loose but helps + for topic in topics: + # Check if common keywords from topic are in workflow + keywords = re.findall(r'\w+', topic.lower()) + found = False + for kw in keywords: + if kw in workflow_content.lower(): + found = True; break + if not found: + self.warnings.append(f"🔄 **Sync Warning**: Topic `{topic}` might not be represented in `{WORKFLOW_PATH}` inputs") + + def validate_toc_and_anchors(self): + """🛠️ Structural Evolution: TOC Consistency & Lowercase Slugs.""" + for root, _, files in os.walk(V1_DIR): + for file in files: + if file.endswith(".md"): + content = open(os.path.join(root, file), "r").read() + if "## Table of Contents" not in content and len(re.findall(r'^## ', content, re.M)) > 2: + self.warnings.append(f"📍 **V1 TOC Missing**: `{file}` has many sections but no TOC") + + # Check anchors in TOC (should be lowercase) + anchors = re.findall(r'\(#([^\)]+)\)', content) + for a in anchors: + if any(c.isupper() for c in a): + self.warnings.append(f"⚓ **Upper Anchor**: `{file}` has non-lowercase anchor `#{a}`") + + def generate_audit_report(self, old_inv_path=None) -> str: + """Generates a comprehensive Markdown report based on ALL Mandates.""" + log_event("[Safety] Executing Full Mandate Audit (GEMINI.md compliance)...") + + if old_inv_path and os.path.exists(old_inv_path): + try: + with open(old_inv_path, "r") as f: + self.validate_data_integrity(yaml.safe_load(f) or {}) + except: pass + + self.validate_semantic_interlinking() + self.validate_special_assets_completeness() + self.validate_mvq_compliance() + self.validate_linguistic_tagging() + self.validate_platinum_schema() + self.validate_v2_architecture() + self.validate_navigation_sync() + self.validate_toc_and_anchors() + + status = "✅ PASS" if not self.errors else "❌ FAILED" + if not self.errors and self.warnings: status = "⚠️ WARNING" + + report = f"\n## 🛡️ Safety & Mandate Audit: {status}\n" + report += f"*Audit executed on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\n" + + if not self.errors and not self.warnings: + report += "✨ **All project mandates from GEMINI.md and technical integrity checks passed successfully.**\n" + else: + if self.errors: + report += "### 🔴 Critical Failures (Mandate Violations)\n" + for err in self.errors: report += f"- {err}\n" + report += "\n" + + if self.warnings: + report += "### 🟡 Warnings & Recommendations\n" + report += "
Click to view " + str(len(self.warnings)) + " recommendations\n\n" + for warn in self.warnings: report += f"- {warn}\n" + report += "\n> 💡 **Note**: Warnings suggest improvements to align with Nubenetes Excellence standards.\n
\n" + + return report + +if __name__ == "__main__": + guard = SafetyGuard() + print(guard.generate_audit_report()) diff --git a/data/link_rules.yaml b/data/link_rules.yaml index 08a8aed6..4b45bf64 100644 --- a/data/link_rules.yaml +++ b/data/link_rules.yaml @@ -57,6 +57,13 @@ hierarchy_rules: naming_convention: "Technical Area > Topic > Subtopics" v1_reorganization: true # Use deep hierarchy to rebuild V1 file sections. v2_nesting: true # Enable H2 -> H3 -> H4 -> H5 mapping in V2. + toc_exempt_files: # Files that do NOT require a Table of Contents (V1/V2). + - "mkdocs.md" + - "yesterday_mkdocs.yml" + - "mkdocs.yml" + - "v2-mkdocs.yml" + - "matrix-table.md" + - "jvm-parameters-matrix-table.md" # ----------------------------------------------------------------------------- # PLATINUM METADATA: ADVANCED LIFECYCLE MANAGEMENT diff --git a/src/agentic_curator.py b/src/agentic_curator.py index ba3da0ad..0e2a45a1 100644 --- a/src/agentic_curator.py +++ b/src/agentic_curator.py @@ -216,20 +216,41 @@ class AgenticCurator: return content + f"\n{line}" if "##" in content else content + f"\n\n## Tools and Resources\n{line}" async def suggest_reorganization(self): - log_event("[*] Starting Internal Reorganization Audit...", section_break=True) + log_event("[*] Starting Internal Reorganization & TOC Audit...", section_break=True) + # Load Special Assets & Link Rules for exceptions special_rules = {} + exempt_files = [] if os.path.exists("data/special_assets.yaml"): - try: - with open("data/special_assets.yaml", "r") as f: special_rules = {sa["file"]: sa for sa in yaml.safe_load(f).get("special_assets", [])} + try: special_rules = {sa["file"]: sa for sa in yaml.safe_load(open("data/special_assets.yaml"))["special_assets"]} except: pass + if os.path.exists("data/link_rules.yaml"): + try: exempt_files = yaml.safe_load(open("data/link_rules.yaml"))["hierarchy_rules"].get("toc_exempt_files", []) + except: pass + for file in os.listdir(self.docs_dir): - if not file.endswith(".md") or file == "index.md": continue - path = os.path.join(self.docs_dir, file); content = open(path, "r").read() - is_special = file in special_rules; link_count = len(re.findall(r"^\s*-\s*\[", content, re.MULTILINE)) - if is_special or (link_count > 25 and len(re.findall(r"^## ", content, re.M)) < 2): - log_event(f" [!] REORGANIZING: {file}") - instruction = "SOPHISTICATED O'REILLY HIERARCHY: Create nested sections (##) and subsections (###). Group by technical AREAS, TOPICS, and SUBTOPICS. Preserve all links." if is_special else "Group into logical sections (##)." - prompt = f"You act as a Technical Content Architect. Reorganize the file '{file}' based on: {instruction}\nCONTENT:\n{content[:5000]}" + if not file.endswith(".md") or file == "index.md" or file in exempt_files: continue + path = os.path.join(self.docs_dir, file) + with open(path, "r") as f: content = f.read() + + is_special = file in special_rules + link_count = len(re.findall(r"^\s*-\s*\[", content, re.MULTILINE)) + headers = re.findall(r"^##+ ", content, re.MULTILINE) + + # --- FEATURE: Automatic TOC Injection for V1 --- + if len(headers) >= 3 and "Table of Contents" not in content: + log_event(f" [+] INJECTING TOC: {file}") + content = await self._rebuild_toc(content) + with open(path, "w") as f: f.write(content) + + # Reorganize if special OR if flat and large + if is_special or (link_count > 25 and len(headers) < 2): + log_event(f" [!] REORGANIZING: {file} ({'Special' if is_special else 'Standard'})") + instruction = ( + "SOPHISTICATED O'REILLY HIERARCHY: Create nested sections (##) and subsections (###). " + "Group links by technical AREAS, TOPICS, and SUBTOPICS. Preserve all links." + if is_special else "Group into logical sections (##)." + ) + prompt = f"You act as a Technical Content Architect. Reorganize '{file}' based on: {instruction}\nCONTENT:\n{content[:5000]}" try: reorg = await call_gemini_with_retry(prompt, response_format="text", prefer_flash=True) if len(reorg) > len(content) * 0.7: diff --git a/src/safety_guard.py b/src/safety_guard.py index be177854..ee316121 100644 --- a/src/safety_guard.py +++ b/src/safety_guard.py @@ -26,6 +26,13 @@ class SafetyGuard: except: return {} return {} + def _load_exempt_files(self): + try: + with open("data/link_rules.yaml", "r") as f: + rules = yaml.safe_load(f) + return rules.get("hierarchy_rules", {}).get("toc_exempt_files", []) + except: return [] + def validate_data_integrity(self, old_inventory: dict): """Mandate 1: Protección de Información.""" for url, old_meta in old_inventory.items(): @@ -61,11 +68,8 @@ class SafetyGuard: v1_path = os.path.join(V1_DIR, file_name) if os.path.exists(v1_path): v1_links = re.findall(r'\[.*?\]\((https?://.*?)\)', open(v1_path, "r").read()) - # Check if these links exist in the inventory marked with this original_file for link in v1_links: nu = normalize_url(link) - # We can't easily check if it's in the V2 rendered file here without complex parsing, - # but we can check if it's in the inventory and not dead. if nu in self.inventory and self.inventory[nu].get("status") == "online": if not self.inventory[nu].get("v2_locations"): self.errors.append(f"💎 **Special Asset Leak**: `{link}` from `{file_name}` is missing in V2 portal") @@ -79,7 +83,7 @@ class SafetyGuard: try: last_date = datetime.fromisoformat(pushed.replace("Z", "+00:00")) inactive_years = (datetime.now(last_date.tzinfo) - last_date).days / 365 - stars = meta.get("gh_stars", meta.get("stars", 0) * 100) # Fallback estimate + stars = meta.get("gh_stars", meta.get("stars", 0) * 100) if inactive_years > 4 and stars < 30: self.warnings.append(f"🏚️ **MVQ Violation**: Stale repo `{url}` (>4yrs) in V2 with low impact") except: pass @@ -90,7 +94,6 @@ class SafetyGuard: for file in os.listdir(V2_DIR): if file.endswith(".md"): content = open(os.path.join(V2_DIR, file), "r").read() - # Find links that are non-English in inventory but missing tag in MD for url, meta in self.inventory.items(): lang = meta.get("language", "English") if lang.lower() != "english" and url in content: @@ -107,54 +110,45 @@ class SafetyGuard: missing = [f for f in required if f not in meta] if missing: new_count += 1 - if new_count < 10: # Don't overwhelm + if new_count < 10: self.warnings.append(f"🧬 **Schema Incomplete**: `{url}` missing {missing}") def validate_v2_architecture(self): """Mandato 28: Estructura O'Reilly y V2 Flat Navigation.""" if not os.path.exists(V2_DIR): return + exempt_files = self._load_exempt_files() for file in os.listdir(V2_DIR): if file.endswith(".md") and file != "index.md": + if file in exempt_files: continue content = open(os.path.join(V2_DIR, file), "r").read() if "## Table of Contents" not in content: self.errors.append(f"📚 **V2 TOC Missing**: `{file}`") if "### " not in content: self.errors.append(f"📚 **V2 Too Flat**: `{file}` (Missing H3 subtopics)") - if "#### " not in content and "Introduction" not in content: - # Not an error but a recommendation for O'Reilly style - pass def validate_navigation_sync(self): """Mandate 11: Sincronización entre Workflow y Config.""" if not os.path.exists(WORKFLOW_PATH) or not os.path.exists(CURATION_SOURCES_PATH): return - with open(CURATION_SOURCES_PATH, "r") as f: sources = yaml.safe_load(f).get("sources", []) topics = [s["topic"] for s in sources] - workflow_content = open(WORKFLOW_PATH, "r").read() - # Look for include_XXX inputs - # This is a bit loose but helps for topic in topics: - # Check if common keywords from topic are in workflow keywords = re.findall(r'\w+', topic.lower()) - found = False - for kw in keywords: - if kw in workflow_content.lower(): - found = True; break + found = any(kw in workflow_content.lower() for kw in keywords) if not found: self.warnings.append(f"🔄 **Sync Warning**: Topic `{topic}` might not be represented in `{WORKFLOW_PATH}` inputs") def validate_toc_and_anchors(self): """🛠️ Structural Evolution: TOC Consistency & Lowercase Slugs.""" + exempt_files = self._load_exempt_files() for root, _, files in os.walk(V1_DIR): for file in files: if file.endswith(".md"): + if file in exempt_files: continue content = open(os.path.join(root, file), "r").read() if "## Table of Contents" not in content and len(re.findall(r'^## ', content, re.M)) > 2: self.warnings.append(f"📍 **V1 TOC Missing**: `{file}` has many sections but no TOC") - - # Check anchors in TOC (should be lowercase) anchors = re.findall(r'\(#([^\)]+)\)', content) for a in anchors: if any(c.isupper() for c in a): @@ -163,13 +157,11 @@ class SafetyGuard: def generate_audit_report(self, old_inv_path=None) -> str: """Generates a comprehensive Markdown report based on ALL Mandates.""" log_event("[Safety] Executing Full Mandate Audit (GEMINI.md compliance)...") - if old_inv_path and os.path.exists(old_inv_path): try: with open(old_inv_path, "r") as f: self.validate_data_integrity(yaml.safe_load(f) or {}) except: pass - self.validate_semantic_interlinking() self.validate_special_assets_completeness() self.validate_mvq_compliance() @@ -178,13 +170,9 @@ class SafetyGuard: self.validate_v2_architecture() self.validate_navigation_sync() self.validate_toc_and_anchors() - status = "✅ PASS" if not self.errors else "❌ FAILED" if not self.errors and self.warnings: status = "⚠️ WARNING" - - report = f"\n## 🛡️ Safety & Mandate Audit: {status}\n" - report += f"*Audit executed on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\n" - + report = f"\n## 🛡️ Safety & Mandate Audit: {status}\n*Audit executed on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\n" if not self.errors and not self.warnings: report += "✨ **All project mandates from GEMINI.md and technical integrity checks passed successfully.**\n" else: @@ -192,13 +180,11 @@ class SafetyGuard: report += "### 🔴 Critical Failures (Mandate Violations)\n" for err in self.errors: report += f"- {err}\n" report += "\n" - if self.warnings: report += "### 🟡 Warnings & Recommendations\n" report += "
Click to view " + str(len(self.warnings)) + " recommendations\n\n" for warn in self.warnings: report += f"- {warn}\n" report += "\n> 💡 **Note**: Warnings suggest improvements to align with Nubenetes Excellence standards.\n
\n" - return report if __name__ == "__main__":