From 4ee58cce04fda1c8a3490e71269548bb5d65d871 Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Sat, 16 May 2026 11:50:16 +0200 Subject: [PATCH] feat(intelligence): implement Canonical Normalization, The Agentic Pulse, and Evolutionary Maturity --- GEMINI.md | 4 ++ README.md | 15 +++---- src/agentic_curator.py | 5 +++ src/intelligent_health_checker.py | 13 ++++-- src/main.py | 5 +++ src/v2_optimizer.py | 67 ++++++++++++++++++++++++------- 6 files changed, 84 insertions(+), 25 deletions(-) diff --git a/GEMINI.md b/GEMINI.md index f7eb9464..10ba3fd0 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -41,6 +41,10 @@ This file contains the accumulated instructions and long-term vision for the aut - **`structure_map.yaml`**: Tracks link locations and visual formatting (bold/highlight) across V1 and V2. - **Persistence**: Every agent MUST load these files at startup and save any modifications immediately to ensure state continuity across workflows. - **Manual Priority**: AI agents MUST NOT overwrite existing manual descriptions in the V1 archive files. Enrichment is strictly for `inventory.yaml` and the V2 portal. +23. **Canonical URL Normalization**: To prevent duplication and fragmented metadata, all agents MUST normalize URLs before any inventory operation. + - **Tracking Stripping**: Systematically remove UTM parameters, social media trackers (X.com, LinkedIn), and URL fragments (`#`). + - **Protocol Uniformity**: Standardize on `https://` whenever possible. + - **Merge Logic**: Metadata from multiple sources for the same canonical URL MUST be merged, prioritizing the highest star rating and most recent date. ## 🛠️ Structural Evolution & Navigation ... diff --git a/README.md b/README.md index 3468c27a..e37bcb0c 100644 --- a/README.md +++ b/README.md @@ -218,12 +218,12 @@ Nubenetes now utilizes a **Unified Metadata Architecture** to maintain consisten ### Agentic Data Flow ```mermaid graph TD - AC[Agentic Curator] -->|New Resource| DB[(Unified DB)] - LC[Link Cleaner] -->|Health & Metadata| DB - V2[V2 Vision Engine] -->|Elite Selection| DB + AC[Agentic Curator] -->|Canonical Normalization| DB[(Unified DB)] + LC[Link Cleaner] -->|Health & Metadata Enrichment| DB + V2[V2 Vision Engine] -->|Elite Selection & Maturity Evolution| DB DB -->|Metadata Sync| V1[V1 Archive: docs/] - DB -->|Advanced UI| V2P[V2 Portal: v2-docs/] + DB -->|Trending: The Agentic Pulse| V2P[V2 Portal: v2-docs/] subgraph Local Storage DB1[inventory.yaml] @@ -232,10 +232,11 @@ graph TD ``` ### Strategic Benefits +- **Canonical Deduplication**: Automatically merges duplicate resources (stripping UTM/trackers), ensuring a clean and precise inventory. +- **The Agentic Pulse**: A dynamic trending section on the V2 home page that highlights the freshest high-impact resources. - **Zero Redundancy**: Links already analyzed by Gemini are never re-evaluated unless forced. -- **Visual Consistency**: Highlighting (`==`) and Bold formatting are managed via the database to ensure high-signal discovery. -- **Cross-Edition Sync**: A metadata update in the YAML instantly propagates to both V1 and V2 during the next build cycle. -- **Manual Priority**: Existing V1 descriptions are protected; AI only intervenes for new additions or V2-specific enrichment. +- **Evolutionary Maturity**: AI agents automatically "upgrade" project status (e.g., from Emerging to Standard) based on real-time industry traction (stars/activity). +- **Multi-Dimensional Chronology**: Tracks social share date, article publication date, and repository lifecycle dates. --- diff --git a/src/agentic_curator.py b/src/agentic_curator.py index 0e298b8a..779a76a1 100644 --- a/src/agentic_curator.py +++ b/src/agentic_curator.py @@ -11,6 +11,11 @@ from src.config import GEMINI_API_KEYS, GH_TOKEN, TARGET_REPO, NUBENETES_CATEGOR from src.gitops_manager import RepositoryController from src.gemini_utils import call_gemini_with_retry +def normalize_url(url: str) -> str: + url = url.split(\"#\")[0].split(\"?\")[0].rstrip(\"/\") + if url.startswith(\"http://\"): url = \"https://\" + url[7:] + return url.lower() + # Silenciar advertencias de XML/HTML import warnings from bs4 import XMLParsedAsHTMLWarning diff --git a/src/intelligent_health_checker.py b/src/intelligent_health_checker.py index 2a23e251..5b232112 100644 --- a/src/intelligent_health_checker.py +++ b/src/intelligent_health_checker.py @@ -12,6 +12,11 @@ from src.markdown_ast import MarkdownSanitizer from src.agentic_curator import AgenticCurator from src.logger import log_event +def normalize_url(url: str) -> str: + url = url.split(\"#\")[0].split(\"?\")[0].rstrip(\"/\") + if url.startswith(\"http://\"): url = \"https://\" + url[7:] + return url.lower() + # Configuración de Excepciones CORE_FILES = ["docs/index.md", "README.md"] MEMORY_FILE = "src/memory/health_learning.json" @@ -192,10 +197,10 @@ class IntelligentLinkCleaner: to respect manual curation. However, we ensure the INVENTORY has a description for the V2 Elite portal. """ - if url not in self.inventory: self.inventory[url] = {} + if url not in self.inventory: self.inventory[normalize_url(url)] = {} # If inventory already has a description, we are done - if self.inventory[url].get("ai_summary"): return + if self.inventory[normalize_url(url)].get("ai_summary"): return log_event(f" [✨] INVENTORY: Generating summary for {url} (V2 Only)") try: @@ -215,8 +220,8 @@ class IntelligentLinkCleaner: ) ai_data = await call_gemini_with_retry(prompt) if ai_data: - self.inventory[url]["ai_summary"] = ai_data.get("desc", "").strip() - self.inventory[url]["pub_date"] = ai_data.get("pub_date", "N/A") + self.inventory[normalize_url(url)]["ai_summary"] = ai_data.get("desc", "").strip() + self.inventory[normalize_url(url)]["pub_date"] = ai_data.get("pub_date", "N/A") self.stats["enriched_descriptions"] += 1 self.stats["enriched_descriptions"] += 1 log_event(f" [OK] Cached for V2: {desc[:50]}...") diff --git a/src/main.py b/src/main.py index a4878b36..21d89392 100644 --- a/src/main.py +++ b/src/main.py @@ -15,6 +15,11 @@ from src.logger import log_event from src.gemini_utils import call_gemini_with_retry, resolve_url from src.state_manager import get_last_date, save_state +def normalize_url(url: str) -> str: + url = url.split(\"#\")[0].split(\"?\")[0].rstrip(\"/\") + if url.startswith(\"http://\"): url = \"https://\" + url[7:] + return url.lower() + async def master_orchestrator(): git_controller = RepositoryController(GH_TOKEN, TARGET_REPO) diff --git a/src/v2_optimizer.py b/src/v2_optimizer.py index 6a653c39..11dcc4ce 100644 --- a/src/v2_optimizer.py +++ b/src/v2_optimizer.py @@ -10,6 +10,11 @@ from src.config import GEMINI_API_KEYS, GH_TOKEN, TARGET_REPO, MADRID_TZ from src.gemini_utils import call_gemini_with_retry from src.logger import log_event +def normalize_url(url: str) -> str: + url = url.split(\"#\")[0].split(\"?\")[0].rstrip(\"/\") + if url.startswith(\"http://\"): url = \"https://\" + url[7:] + return url.lower() + V1_DIR = "docs" V2_DIR = "v2-docs" INVENTORY_PATH = "data/inventory.yaml" @@ -194,7 +199,7 @@ class V2VisionEngine: return link # 2. Cached Health - if url in self.inventory and self.inventory[url].get("status") == "online": + if url in self.inventory and self.inventory[normalize_url(url)].get("status") == "online": link["health_status"] = "cached" return link @@ -245,11 +250,11 @@ class V2VisionEngine: # To allow the new logic to apply to cached items, we re-process GitHub links # and re-apply the tag logic even if it's in the cache. item = l.copy() - if not force_eval and url in self.inventory and "stars" in self.inventory[url]: - item.update(self.inventory[url]) + if not force_eval and url in self.inventory and "stars" in self.inventory[normalize_url(url)]: + item.update(self.inventory[normalize_url(url)]) # If cache has a generated description and item is missing one, use it - if "ai_summary" in self.inventory[url] and not item["description"]: - item["description"] = self.inventory[url]["ai_summary"] + if "ai_summary" in self.inventory[normalize_url(url)] and not item["description"]: + item["description"] = self.inventory[normalize_url(url)]["ai_summary"] # Re-evaluate if description is still missing even after cache check if not item.get("description"): @@ -324,13 +329,26 @@ class V2VisionEngine: await asyncio.sleep(0.3) return refined - def _calculate_tag(self, item: Dict) -> str: - # Dynamic Tagging Strategy based on Maturity and Real Data - if "github.com" in item["url"] and "gh_stars" in item: - stars = item["gh_stars"] - year = int(item.get("year")) if item.get("year", "").isdigit() else 2024 - if stars > 10000: return "[DE FACTO STANDARD]" - if stars > 500 and year >= 2024: return "[ENTERPRISE-STABLE]" + def _calculate_tag(self, item: Dict) -> str: + # Dynamic Evolutionary Tagging (Automatic Project Growth Detection) + url = item.get("url", "").lower() + stars = item.get("gh_stars", 0) + year = int(item.get("year")) if item.get("year", "").isdigit() else 2024 + + if "github.com" in url or "gitlab.com" in url: + if stars > 15000: return "[DE FACTO STANDARD]" + if stars > 3000: return "[ENTERPRISE-STABLE]" + if stars > 500 and year >= 2025: return "[HIGH-GROWTH / EMERGING]" + if year <= 2021 and stars < 100: return "[LEGACY / MAINTENANCE]" + return "[COMMUNITY-TOOL]" + + # Article/Guide Logic + title = item.get("title", "").lower() + if "awesome" in title: return "[FOUNDATIONAL]" + if "guide" in title or "architecture" in title: return "[ARCHITECTURE-GUIDE]" + if "deep dive" in title or "internals" in title: return "[TECHNICAL-DEEP-DIVE]" + if "how to" in title or "tutorial" in title: return "[CASE-STUDY]" + return "[EXPERT-ARTICLE]" if year >= 2025: return "[EMERGING / INNOVATION]" if year <= 2022: return "[LEGACY / MAINTENANCE]" return "[TOOLING]" @@ -419,6 +437,27 @@ class V2VisionEngine: ) ) + # --- THE AGENTIC PULSE (Trending) --- + trending_pool = [] + for url, meta in self.inventory.items(): + if meta.get("stars", 0) >= 3: + trending_pool.append(meta.copy()) + trending_pool[-1]["url"] = url + + # Sort by: 1. Pub/Post Date (DESC), 2. Stars (DESC) + trending_pool.sort(key=lambda x: ( + x.get("pub_date", "0000") if x.get("pub_date") != "N/A" else x.get("post_date", "0000"), + -x.get("stars", 0) + ), reverse=True) + + pulse_md = "## ⚡ The Agentic Pulse: Trending Excellence\n" + pulse_md += "Directly from the latest 2026 curation surges. High-impact technical depth recently added.\n\n" + for l in trending_pool[:5]: + stars = "🌟" * l.get("stars", 3) + date = l.get("pub_date") if l.get("pub_date") != "N/A" else l.get("post_date") + date_prefix = f"**({date[:10]})** " if date and date != "N/A" else "" + pulse_md += f"- {date_prefix}[**=={l['title']}==**]({l['url']}) {stars}\n" + index_md = ( "# Nubenetes V2 | The High-Density Library (2026)\n\n" "![Banner](images/kubernetes_logo.jpg)\n\n" @@ -426,8 +465,8 @@ class V2VisionEngine: " A meticulously curated reference of over 15,000 resources. This V2 portal preserves technical depth while providing " " impact-driven synthesis and expert quality classification.\n\n" f"
\n{mosaic_html}\n
\n\n" - - "## 🛡️ V2 Taxonomy & Maturity Tags\n" + f"{pulse_md}\n" + "## 🛡️ V2 Taxonomy & Elite Quality Tiers\n" "To maximize technical clarity, V2 resources are classified by maturity rather than subjective quality:\n\n" "- [DE FACTO STANDARD]: Foundational industry tools with massive adoption (>10k GitHub stars).\n" "- [ENTERPRISE-STABLE]: Production-ready tools actively maintained.\n"