From e69f60de44fc3878a8047798a63382b9524dd980 Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Mon, 18 May 2026 15:25:35 +0200 Subject: [PATCH] feat(ops): implement Mandate 34 for URL normalization and automate V2 sync --- .github/workflows/agentic_v2_builder.yml | 2 +- src/gemini_utils.py | 26 +++++++++++++++-------- src/intelligent_health_checker.py | 27 ++++++++++++++++++++---- src/main.py | 10 +++++---- src/v2_optimizer.py | 10 ++++++++- 5 files changed, 56 insertions(+), 19 deletions(-) diff --git a/.github/workflows/agentic_v2_builder.yml b/.github/workflows/agentic_v2_builder.yml index 160ecd42..9abc5cfe 100644 --- a/.github/workflows/agentic_v2_builder.yml +++ b/.github/workflows/agentic_v2_builder.yml @@ -29,7 +29,7 @@ permissions: jobs: build-v2-edition: runs-on: ubuntu-latest - if: github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') + if: github.event_name == 'workflow_dispatch' || github.event_name == 'push' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') steps: - name: Repository Synchronization uses: actions/checkout@v4 diff --git a/src/gemini_utils.py b/src/gemini_utils.py index a09b85b8..c84b59b8 100644 --- a/src/gemini_utils.py +++ b/src/gemini_utils.py @@ -163,15 +163,8 @@ async def resolve_url(url: str) -> str: final_url, current_hop = new_url, current_hop + 1 except: break - # Mandate 34: Prevent multiple trailing slashes - if final_url and '://' in final_url: - parts = final_url.split('://') - if '/' in parts[-1]: - final_url = f"{parts[0]}://{re.sub(r'/+$', '/', parts[-1])}" - else: - final_url = final_url.rstrip('/') - - return final_url + # Mandate 34: Prevent multiple trailing slashes using centralized utility + return sanitize_trailing_slashes(final_url) def clean_toc_text(text: str) -> str: """ @@ -187,6 +180,17 @@ def clean_toc_text(text: str) -> str: text = re.sub(r'[^\w\s\-.]', '', text) return text.strip() 
+def sanitize_trailing_slashes(url: str) -> str:
+    """
+    Mandate 34: collapse runs of '/' in the host/path of an absolute URL (at most one trailing slash remains).
+    Query ('?...') and fragment ('#...') are kept verbatim; input without '://' or empty input is returned as-is.
+    """
+    if not url or '://' not in url: return url
+    scheme, rest = url.split('://', 1)
+    head = re.split(r'[?#]', rest, maxsplit=1)[0]  # host + path only
+    tail = rest[len(head):]  # '?query' / '#fragment' may embed another URL, so it is never rewritten
+    return scheme + '://' + re.sub(r'/{2,}', '/', head) + tail
+
 def normalize_url(url: str) -> str: """ Normalización de URLs de alta precisión para Nubenetes. @@ -194,6 +198,9 @@ def normalize_url(url: str) -> str: """ if not url: return "" + # 0. Mandate 34: Cleanup redundant slashes first + url = sanitize_trailing_slashes(url) + # 1. Separar fragmento (pero preservar si es técnico como #L123) fragment = "" if "#" in url: @@ -202,6 +209,7 @@ def normalize_url(url: str) -> str: # 2. Limpiar parámetros de tracking social (UTM, etc.) url = re.sub(r'(\?|&)(utm_[^&]+|s=[^&]+|t=[^&]+|ref=[^&]+|fbclid=[^&]+)', '', url) + # Mandate 34: Remove all trailing slashes and question marks for internal canonical comparison url = url.rstrip("/").rstrip("?") # 3. 
Normalizar protocolo y dominio (Case Insensitive) diff --git a/src/intelligent_health_checker.py b/src/intelligent_health_checker.py index 8eb78d26..ac2261e0 100644 --- a/src/intelligent_health_checker.py +++ b/src/intelligent_health_checker.py @@ -218,12 +218,20 @@ class IntelligentLinkCleaner: if resp.status_code < 400: text = resp.text.lower() final_url = str(resp.url) - # Mandate 34: Prevent multiple trailing slashes - final_url = re.sub(r'/+$', '/', final_url) if '/' in final_url.split('://')[-1] else final_url.rstrip('/') + # Mandate 34: Prevent multiple trailing slashes using centralized utility + from src.gemini_utils import sanitize_trailing_slashes + final_url = sanitize_trailing_slashes(final_url) if any(kw in text for kw in parked): return False, "parked", None + + # Mandate 34: Explicit detection of redundant slashes or single slash policy if final_url != url: u_p = url.split("://")[-1].rstrip("/"); f_p = final_url.split("://")[-1].rstrip("/") + # If it's just a slash/redundancy fix, we mark it as 'healed' or 'normalized' + if u_p == f_p: + return True, "normalized_slashes", final_url + + # Generic redirect loss protection if u_p.count("/") >= 3 and (f_p.count("/") <= 2 or any(kw in f_p for kw in ["/about", "/products", "/home"])): return False, "generic_redirect_loss", None return True, "OK", final_url if final_url != url else None @@ -252,8 +260,19 @@ class IntelligentLinkCleaner: for i, line in enumerate(file_updates[path]): if url in line: if fallback and fallback.startswith("CANONICAL:"): - file_updates[path][i] = line.replace(url, fallback.replace("CANONICAL:", "")) - else: file_updates[path][i] = None + fallback_url = fallback.replace("CANONICAL:", "") + # Mandate 34: Robust replacement to avoid path/ path// recursion + # We replace exactly the URL within Markdown link syntax or bounded by whitespace + line_updated = line.replace(f"({url})", f"({fallback_url})") + if line_updated == line: # Fallback if not in parens + line_updated = 
re.sub(rf'({re.escape(url)})(?=[)\s]|$)', fallback_url, line) + + # Final safety check: if line still has // after our intended clean URL + line_updated = line_updated.replace(f"{fallback_url}/", fallback_url) + file_updates[path][i] = line_updated + else: + # Delete dead link line + file_updates[path][i] = None final_payload = {p: "".join([l for l in lines if l is not None]) for p, lines in file_updates.items()} await self.prune_orphaned_metadata(); self._save_inventory() diff --git a/src/main.py b/src/main.py index 4ad39fef..2cd2043f 100644 --- a/src/main.py +++ b/src/main.py @@ -12,7 +12,7 @@ from src.agentic_curator import evaluate_extracted_assets, AgenticCurator from src.autonomous_discovery import discover_trending_assets from src.gitops_manager import RepositoryController from src.logger import log_event -from src.gemini_utils import call_gemini_with_retry, resolve_url, normalize_url +from src.gemini_utils import call_gemini_with_retry, resolve_url, normalize_url, sanitize_trailing_slashes from src.state_manager import get_last_date, save_state async def master_orchestrator(): @@ -315,18 +315,20 @@ async def master_orchestrator(): }) if evaluation["status"] == "INCLUDED": + # Mandate 34: Sanitize new URLs before injection + sanitized_url = sanitize_trailing_slashes(url) unique_new_assets.append({ - "url": url, "title": evaluation["title"], + "url": sanitized_url, "title": evaluation["title"], "description": evaluation["description"], "year": evaluation.get("year", "N/A"), "category": evaluation.get("category", "kubernetes-tools"), "impact_score": evaluation["impact_score"], "reasoning": evaluation.get("reasoning") }) - existing_urls.add(url.split('#')[0].rstrip('/').lower()) + existing_urls.add(normalize_url(sanitized_url)) for rel_cat in evaluation.get("related_categories", []): interlink_asset = { - "url": url, "title": evaluation["title"], + "url": sanitized_url, "title": evaluation["title"], "description": f"*(Related to {evaluation.get('category')} 
topic)*", "category": rel_cat, "impact_score": 50 } diff --git a/src/v2_optimizer.py b/src/v2_optimizer.py index f97c4096..31ea1b8b 100644 --- a/src/v2_optimizer.py +++ b/src/v2_optimizer.py @@ -179,7 +179,15 @@ class V2VisionEngine: try: resp = await client.get(url, timeout=10.0) if resp.status_code < 400: - self.inventory.setdefault(norm_url, {})["status"] = "online" + final_url = str(resp.url) + from src.gemini_utils import sanitize_trailing_slashes + final_url = sanitize_trailing_slashes(final_url) + + # Update URL if it was redirected/normalized + if final_url != url: + link["url"] = final_url + + self.inventory.setdefault(normalize_url(final_url), {})["status"] = "online" return link except: pass return None