feat(ops): implement Mandate 34 for URL normalization and automate V2 sync

This commit is contained in:
Nubenetes Bot
2026-05-18 15:25:35 +02:00
parent 8119b67dab
commit e69f60de44
5 changed files with 56 additions and 19 deletions

View File

@@ -29,7 +29,7 @@ permissions:
jobs:
build-v2-edition:
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
if: github.event_name == 'workflow_dispatch' || github.event_name == 'push' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
steps:
- name: Repository Synchronization
uses: actions/checkout@v4

View File

@@ -163,15 +163,8 @@ async def resolve_url(url: str) -> str:
final_url, current_hop = new_url, current_hop + 1
except: break
# Mandate 34: Prevent multiple trailing slashes
if final_url and '://' in final_url:
parts = final_url.split('://')
if '/' in parts[-1]:
final_url = f"{parts[0]}://{re.sub(r'/+$', '/', parts[-1])}"
else:
final_url = final_url.rstrip('/')
return final_url
# Mandate 34: Prevent multiple trailing slashes using centralized utility
return sanitize_trailing_slashes(final_url)
def clean_toc_text(text: str) -> str:
"""
@@ -187,6 +180,17 @@ def clean_toc_text(text: str) -> str:
text = re.sub(r'[^\w\s\-.]', '', text)
return text.strip()
def sanitize_trailing_slashes(url: str) -> str:
    """
    Mandate 34: Enforces a 0 or 1 trailing slash policy.

    Collapses every run of two or more slashes in the authority/path part
    of the URL (everything after ``scheme://`` and before the first ``?``
    or ``#``) into a single slash. The query string and fragment are
    returned untouched so that URLs embedded in them
    (e.g. ``?next=https://example.com``) are not corrupted.

    Args:
        url: The URL to clean. Falsy values and strings without ``://``
            are returned unchanged.

    Returns:
        The URL with redundant slashes removed from its authority/path.
    """
    if not url or '://' not in url:
        return url

    scheme, remainder = url.split('://', 1)

    # Locate the start of the query/fragment; only the part before it is
    # a hierarchical path where duplicate slashes are redundant.
    boundary = re.search(r'[?#]', remainder)
    cut = boundary.start() if boundary else len(remainder)
    location, suffix = remainder[:cut], remainder[cut:]

    # Collapse all multiple slashes in domain and path to one
    location = re.sub(r'/{2,}', '/', location)
    return f"{scheme}://{location}{suffix}"
def normalize_url(url: str) -> str:
"""
Normalización de URLs de alta precisión para Nubenetes.
@@ -194,6 +198,9 @@ def normalize_url(url: str) -> str:
"""
if not url: return ""
# 0. Mandate 34: Cleanup redundant slashes first
url = sanitize_trailing_slashes(url)
# 1. Separar fragmento (pero preservar si es técnico como #L123)
fragment = ""
if "#" in url:
@@ -202,6 +209,7 @@ def normalize_url(url: str) -> str:
# 2. Limpiar parámetros de tracking social (UTM, etc.)
url = re.sub(r'(\?|&)(utm_[^&]+|s=[^&]+|t=[^&]+|ref=[^&]+|fbclid=[^&]+)', '', url)
# Mandate 34: Remove all trailing slashes and question marks for internal canonical comparison
url = url.rstrip("/").rstrip("?")
# 3. Normalizar protocolo y dominio (Case Insensitive)

View File

@@ -218,12 +218,20 @@ class IntelligentLinkCleaner:
if resp.status_code < 400:
text = resp.text.lower()
final_url = str(resp.url)
# Mandate 34: Prevent multiple trailing slashes
final_url = re.sub(r'/+$', '/', final_url) if '/' in final_url.split('://')[-1] else final_url.rstrip('/')
# Mandate 34: Prevent multiple trailing slashes using centralized utility
from src.gemini_utils import sanitize_trailing_slashes
final_url = sanitize_trailing_slashes(final_url)
if any(kw in text for kw in parked): return False, "parked", None
# Mandate 34: Explicit detection of redundant slashes or single slash policy
if final_url != url:
u_p = url.split("://")[-1].rstrip("/"); f_p = final_url.split("://")[-1].rstrip("/")
# If it's just a slash/redundancy fix, we mark it as 'healed' or 'normalized'
if u_p == f_p:
return True, "normalized_slashes", final_url
# Generic redirect loss protection
if u_p.count("/") >= 3 and (f_p.count("/") <= 2 or any(kw in f_p for kw in ["/about", "/products", "/home"])):
return False, "generic_redirect_loss", None
return True, "OK", final_url if final_url != url else None
@@ -252,8 +260,19 @@ class IntelligentLinkCleaner:
for i, line in enumerate(file_updates[path]):
if url in line:
if fallback and fallback.startswith("CANONICAL:"):
file_updates[path][i] = line.replace(url, fallback.replace("CANONICAL:", ""))
else: file_updates[path][i] = None
fallback_url = fallback.replace("CANONICAL:", "")
# Mandate 34: Robust replacement to avoid path/ path// recursion
# We replace exactly the URL within Markdown link syntax or bounded by whitespace
line_updated = line.replace(f"({url})", f"({fallback_url})")
if line_updated == line: # Fallback if not in parens
line_updated = re.sub(rf'({re.escape(url)})(?=[)\s]|$)', fallback_url, line)
# Final safety check: if line still has // after our intended clean URL
line_updated = line_updated.replace(f"{fallback_url}/", fallback_url)
file_updates[path][i] = line_updated
else:
# Delete dead link line
file_updates[path][i] = None
final_payload = {p: "".join([l for l in lines if l is not None]) for p, lines in file_updates.items()}
await self.prune_orphaned_metadata(); self._save_inventory()

View File

@@ -12,7 +12,7 @@ from src.agentic_curator import evaluate_extracted_assets, AgenticCurator
from src.autonomous_discovery import discover_trending_assets
from src.gitops_manager import RepositoryController
from src.logger import log_event
from src.gemini_utils import call_gemini_with_retry, resolve_url, normalize_url
from src.gemini_utils import call_gemini_with_retry, resolve_url, normalize_url, sanitize_trailing_slashes
from src.state_manager import get_last_date, save_state
async def master_orchestrator():
@@ -315,18 +315,20 @@ async def master_orchestrator():
})
if evaluation["status"] == "INCLUDED":
# Mandate 34: Sanitize new URLs before injection
sanitized_url = sanitize_trailing_slashes(url)
unique_new_assets.append({
"url": url, "title": evaluation["title"],
"url": sanitized_url, "title": evaluation["title"],
"description": evaluation["description"],
"year": evaluation.get("year", "N/A"),
"category": evaluation.get("category", "kubernetes-tools"),
"impact_score": evaluation["impact_score"],
"reasoning": evaluation.get("reasoning")
})
existing_urls.add(url.split('#')[0].rstrip('/').lower())
existing_urls.add(normalize_url(sanitized_url))
for rel_cat in evaluation.get("related_categories", []):
interlink_asset = {
"url": url, "title": evaluation["title"],
"url": sanitized_url, "title": evaluation["title"],
"description": f"*(Related to {evaluation.get('category')} topic)*",
"category": rel_cat, "impact_score": 50
}

View File

@@ -179,7 +179,15 @@ class V2VisionEngine:
try:
resp = await client.get(url, timeout=10.0)
if resp.status_code < 400:
self.inventory.setdefault(norm_url, {})["status"] = "online"
final_url = str(resp.url)
from src.gemini_utils import sanitize_trailing_slashes
final_url = sanitize_trailing_slashes(final_url)
# Update URL if it was redirected/normalized
if final_url != url:
link["url"] = final_url
self.inventory.setdefault(normalize_url(final_url), {})["status"] = "online"
return link
except: pass
return None