feat(ai): enforce robust TOC standards (no emojis, no ampersands) across all layers and document as Mandate 30

2026-05-22 17:13:42 +00:00 · 2026-05-17 16:42:08 +02:00
parent d2dc1defb7
commit 537a09cc7d
4 changed files with 43 additions and 21 deletions
--- a/GEMINI.md
+++ b/GEMINI.md
@@ -90,6 +90,11 @@ This file contains the accumulated instructions and long-term vision for the aut

 29. **TOC & Structural Exceptions**: Certain files (configuration-heavy or technical tables like `mkdocs.md` or `matrix-table.md`) are exempt from TOC and deep-hierarchy requirements. These exceptions MUST be respected by all agents to avoid unnecessary structural clutter in non-navigational files as defined in [`data/link_rules.yaml`](data/link_rules.yaml).

+30. **Universal Title and TOC Standards**: To ensure robust cross-platform rendering and prevent broken internal links:
+    - **No Emojis or Special Characters**: Section titles (H2-H6) and Table of Contents (TOC) entries MUST NOT contain emojis or special symbols.
+    - **No Ampersands**: The ampersand character (`&`) MUST be replaced with "and" in all titles and TOCs.
+    - **Lowercase Anchors**: All Markdown anchors MUST use strictly lowercase slugs without special characters.
+
 ## 🛠️ Structural Evolution & Navigation
 ...
 *   **No Link Limits**: There are NO hard limits on the number of links per page or per section (##/###). Nubenetes is built to host thousands of references.
@@ -163,7 +168,7 @@ Whenever a significant curation cycle (automatic or manual) is completed:
 *   **Manual Fallback:** If a manual update is performed (emergency fixes, structural changes), the human/AI agent is responsible for manually running the metric extraction scripts and updating the `README.md` accordingly.
 *   **Algorithm-README Sync**: Whenever the AI curation logic, model tiering, or the extraction algorithm is modified (e.g., `src/gemini_utils.py` or `src/v2_optimizer.py`), the `README.md` MUST be updated to reflect these technical changes in the "Agentic Stack" and "Architectural Shift" sections.
 *   **Hierarchical README Maintenance**: Whenever `README.md` is modified, the Table of Contents (TOC) MUST be updated to reflect all changes in sections (H2) and subsections (H3). All titles in the document MUST include hierarchical numbering (e.g., "1. Section", "1.1. Subsection") perfectly synchronized with the TOC.
-*   **Robust Title Standards**: Emojis and ampersands (&) MUST NOT be used in any section (H2) or subsection (H3) titles within `README.md` or the Table of Contents. Ampersands should be replaced with "and". This ensures maximum compatibility with Markdown anchor generation and prevents broken navigation links.
+*   **Universal Title Standards**: Emojis and ampersands (&) MUST NOT be used in any section title or Table of Contents entry. Ampersands MUST be replaced with "and". All anchors MUST be lowercase slugs (Mandate 30).
 *   **Asset Inventory and Configuration**: The `README.md` MUST maintain a "Repository Inventory and Configuration" section (Section 13) that provides an exhaustive list of all key configuration files, centralized metadata databases, autonomous workflows, and core source code files. Each item MUST be linked using a relative Markdown path (e.g., `[file.yaml](data/file.yaml)`) to facilitate direct navigation.
 *   **Source Transparency**: Specific curation sources (e.g., X.com accounts) MUST be documented in the "Agentic AI Engine" section of the `README.md`. Any addition or removal of primary sources in `data/curation_sources.yaml` requires a corresponding update to the documentation.

--- a/src/agentic_curator.py
+++ b/src/agentic_curator.py
@@ -9,7 +9,7 @@ from datetime import datetime
 from typing import List, Dict, Optional, Tuple
 from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ
 from src.gitops_manager import RepositoryController
-from src.gemini_utils import call_gemini_with_retry, normalize_url
+from src.gemini_utils import call_gemini_with_retry, normalize_url, clean_toc_text
 from src.logger import log_event

 def get_best_category_match(suggested: str) -> Optional[str]:
@@ -166,7 +166,8 @@ class AgenticCurator:
        headers = []
        for line in lines:
            if line.startswith("## ") or line.startswith("### "):
-                title = line.strip("#").strip()
+                raw_title = line.strip("#").strip()
+                title = clean_toc_text(raw_title)
                anchor = title.lower().replace(" ", "-").replace(".", "").replace("/", "").replace("(", "").replace(")", "").replace(",", "")
                headers.append({"title": title, "anchor": anchor, "level": 2 if line.startswith("## ") else 3})
        if not headers: return content
--- a/src/gemini_utils.py
+++ b/src/gemini_utils.py
@@ -164,6 +164,20 @@ async def resolve_url(url: str) -> str:
            except: break
    return final_url

+def clean_toc_text(text: str) -> str:
+    """
+    Sanitizes a section title or TOC entry so it renders and links reliably.
+    Replaces ampersands with "and", strips emojis, and removes other special chars.
+    """
+    if not text: return ""
+    # 1. Replace ampersands
+    text = text.replace("&", "and")
+    # 2. Strip emojis (removes every supplementary-plane code point, U+10000-U+10FFFF; BMP symbols fall to step 3)
+    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)
+    # 3. Strip remaining problematic chars (keeps word characters incl. underscores, whitespace, hyphens and dots)
+    text = re.sub(r'[^\w\s\-.]', '', text)
+    return text.strip()
+
 def normalize_url(url: str) -> str:
    """
    Normalización de URLs de alta precisión para Nubenetes.
--- a/src/v2_optimizer.py
+++ b/src/v2_optimizer.py
@@ -7,7 +7,7 @@ import httpx
 from datetime import datetime
 from typing import List, Dict, Set, Any, Tuple
 from src.config import GEMINI_API_KEYS, GH_TOKEN, TARGET_REPO, MADRID_TZ, INVENTORY_PATH
-from src.gemini_utils import call_gemini_with_retry, normalize_url
+from src.gemini_utils import call_gemini_with_retry, normalize_url, clean_toc_text
 from src.logger import log_event

 V1_DIR = "docs"
@@ -312,24 +312,26 @@ class V2VisionEngine:
            slug = dim.lower().replace(" ", "-").replace("&", "and").replace("(", "").replace(")", "")
            index_md += f"- **[{dim}](./{slug}.md)**: {content['summary']}\n"
        with open(os.path.join(V2_DIR, "index.md"), "w") as f: f.write(index_md)
+# Helper functions for recursive rendering
+def gen_toc(node, depth, base_slug):
+    toc = ""
+    for name, subnode in sorted(node.items()):
+        if name == "__links__": continue
+        clean_name = clean_toc_text(name)
+        slug = f"{base_slug}-{clean_name.lower().replace(' ', '-')}"
+        toc += f"{' ' * (depth * 4)}- [{clean_name}](#{slug})\n" + gen_toc(subnode, depth + 1, slug)
+    return toc

-        def gen_toc(node, depth, base_slug):
-            toc = ""
-            for name, subnode in sorted(node.items()):
-                if name == "__links__": continue
-                slug = f"{base_slug}-{name.lower().replace(' ', '-')}"
-                toc += f"{' ' * (depth * 4)}- [{name}](#{slug})\n" + gen_toc(subnode, depth + 1, slug)
-            return toc
-
-        async def render_node(node, depth, base_slug, is_intro=False):
-            md = ""
-            for name, subnode in sorted(node.items()):
-                if name == "__links__": continue
-                slug = f"{base_slug}-{name.lower().replace(' ', '-')}"
-                md += f"{'#' * min(6, depth + 2)} {name}\n\n"
-                if depth == 1 and "__links__" in subnode: md += await self._generate_comparison_table(subnode["__links__"])
-                md += await render_node(subnode, depth + 1, slug, is_intro)
-            if "__links__" in node:
+async def render_node(node, depth, base_slug, is_intro=False):
+    md = ""
+    for name, subnode in sorted(node.items()):
+        if name == "__links__": continue
+        clean_name = clean_toc_text(name)
+        slug = f"{base_slug}-{clean_name.lower().replace(' ', '-')}"
+        md += f"{'#' * min(6, depth + 2)} {clean_name}\n\n"
+        if depth == 1 and "__links__" in subnode: md += await self._generate_comparison_table(subnode["__links__"])
+        md += await render_node(subnode, depth + 1, slug, is_intro)
+    if "__links__" in node:
                for l in node["__links__"]:
                    is_gold = is_intro and l.get("stars", 0) >= 4
                    title = l['title'].replace("==", "")