mirror of
https://github.com/nubenetes/awesome-kubernetes.git
synced 2026-05-22 00:53:37 +00:00
feat(ops): implement Mandate 34 for URL normalization and automate V2 sync
This commit is contained in:
2
.github/workflows/agentic_v2_builder.yml
vendored
2
.github/workflows/agentic_v2_builder.yml
vendored
@@ -29,7 +29,7 @@ permissions:
|
||||
jobs:
|
||||
build-v2-edition:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
|
||||
if: github.event_name == 'workflow_dispatch' || github.event_name == 'push' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
|
||||
steps:
|
||||
- name: Repository Synchronization
|
||||
uses: actions/checkout@v4
|
||||
|
||||
@@ -163,15 +163,8 @@ async def resolve_url(url: str) -> str:
|
||||
final_url, current_hop = new_url, current_hop + 1
|
||||
except: break
|
||||
|
||||
# Mandate 34: Prevent multiple trailing slashes
|
||||
if final_url and '://' in final_url:
|
||||
parts = final_url.split('://')
|
||||
if '/' in parts[-1]:
|
||||
final_url = f"{parts[0]}://{re.sub(r'/+$', '/', parts[-1])}"
|
||||
else:
|
||||
final_url = final_url.rstrip('/')
|
||||
|
||||
return final_url
|
||||
# Mandate 34: Prevent multiple trailing slashes using centralized utility
|
||||
return sanitize_trailing_slashes(final_url)
|
||||
|
||||
def clean_toc_text(text: str) -> str:
|
||||
"""
|
||||
@@ -187,6 +180,17 @@ def clean_toc_text(text: str) -> str:
|
||||
text = re.sub(r'[^\w\s\-.]', '', text)
|
||||
return text.strip()
|
||||
|
||||
def sanitize_trailing_slashes(url: str) -> str:
    """
    Mandate 34: Enforces a 0 or 1 trailing slash policy.

    Collapses every run of two or more slashes in the domain/path part of
    an absolute URL into a single slash. The scheme separator (``://``) is
    preserved, and the query string and fragment are returned untouched so
    that values carried there (for example ``?next=https://example.com``)
    are not rewritten.

    Args:
        url: The URL to clean. Falsy values are returned unchanged.

    Returns:
        The URL with redundant slashes removed from its domain/path part,
        or the input unchanged when it is falsy or has no ``://`` before
        the query string / fragment.
    """
    if not url:
        return url

    # Split off the query string / fragment first: slashes there are data,
    # not path separators, and must not be collapsed.
    boundary = re.search(r'[?#]', url)
    cut = boundary.start() if boundary else len(url)
    head, tail = url[:cut], url[cut:]

    # Only absolute URLs are handled; a '://' that appears solely inside the
    # query string or fragment does not make the URL absolute.
    if '://' not in head:
        return url

    scheme, rest = head.split('://', 1)
    # Collapse all multiple slashes in domain and path to one.
    # NOTE(review): a URL embedded in the *path* (e.g. archive-style links)
    # still has its '//' collapsed, as in the original implementation.
    rest = re.sub(r'/{2,}', '/', rest)
    return f"{scheme}://{rest}{tail}"
|
||||
|
||||
def normalize_url(url: str) -> str:
|
||||
"""
|
||||
Normalización de URLs de alta precisión para Nubenetes.
|
||||
@@ -194,6 +198,9 @@ def normalize_url(url: str) -> str:
|
||||
"""
|
||||
if not url: return ""
|
||||
|
||||
# 0. Mandate 34: Cleanup redundant slashes first
|
||||
url = sanitize_trailing_slashes(url)
|
||||
|
||||
# 1. Separar fragmento (pero preservar si es técnico como #L123)
|
||||
fragment = ""
|
||||
if "#" in url:
|
||||
@@ -202,6 +209,7 @@ def normalize_url(url: str) -> str:
|
||||
|
||||
# 2. Limpiar parámetros de tracking social (UTM, etc.)
|
||||
url = re.sub(r'(\?|&)(utm_[^&]+|s=[^&]+|t=[^&]+|ref=[^&]+|fbclid=[^&]+)', '', url)
|
||||
# Mandate 34: Remove all trailing slashes and question marks for internal canonical comparison
|
||||
url = url.rstrip("/").rstrip("?")
|
||||
|
||||
# 3. Normalizar protocolo y dominio (Case Insensitive)
|
||||
|
||||
@@ -218,12 +218,20 @@ class IntelligentLinkCleaner:
|
||||
if resp.status_code < 400:
|
||||
text = resp.text.lower()
|
||||
final_url = str(resp.url)
|
||||
# Mandate 34: Prevent multiple trailing slashes
|
||||
final_url = re.sub(r'/+$', '/', final_url) if '/' in final_url.split('://')[-1] else final_url.rstrip('/')
|
||||
# Mandate 34: Prevent multiple trailing slashes using centralized utility
|
||||
from src.gemini_utils import sanitize_trailing_slashes
|
||||
final_url = sanitize_trailing_slashes(final_url)
|
||||
|
||||
if any(kw in text for kw in parked): return False, "parked", None
|
||||
|
||||
# Mandate 34: Explicit detection of redundant slashes or single slash policy
|
||||
if final_url != url:
|
||||
u_p = url.split("://")[-1].rstrip("/"); f_p = final_url.split("://")[-1].rstrip("/")
|
||||
# If it's just a slash/redundancy fix, we mark it as 'healed' or 'normalized'
|
||||
if u_p == f_p:
|
||||
return True, "normalized_slashes", final_url
|
||||
|
||||
# Generic redirect loss protection
|
||||
if u_p.count("/") >= 3 and (f_p.count("/") <= 2 or any(kw in f_p for kw in ["/about", "/products", "/home"])):
|
||||
return False, "generic_redirect_loss", None
|
||||
return True, "OK", final_url if final_url != url else None
|
||||
@@ -252,8 +260,19 @@ class IntelligentLinkCleaner:
|
||||
for i, line in enumerate(file_updates[path]):
|
||||
if url in line:
|
||||
if fallback and fallback.startswith("CANONICAL:"):
|
||||
file_updates[path][i] = line.replace(url, fallback.replace("CANONICAL:", ""))
|
||||
else: file_updates[path][i] = None
|
||||
fallback_url = fallback.replace("CANONICAL:", "")
|
||||
# Mandate 34: Robust replacement to avoid path/ path// recursion
|
||||
# We replace exactly the URL within Markdown link syntax or bounded by whitespace
|
||||
line_updated = line.replace(f"({url})", f"({fallback_url})")
|
||||
if line_updated == line: # Fallback if not in parens
|
||||
line_updated = re.sub(rf'({re.escape(url)})(?=[)\s]|$)', fallback_url, line)
|
||||
|
||||
# Final safety check: if line still has // after our intended clean URL
|
||||
line_updated = line_updated.replace(f"{fallback_url}/", fallback_url)
|
||||
file_updates[path][i] = line_updated
|
||||
else:
|
||||
# Delete dead link line
|
||||
file_updates[path][i] = None
|
||||
|
||||
final_payload = {p: "".join([l for l in lines if l is not None]) for p, lines in file_updates.items()}
|
||||
await self.prune_orphaned_metadata(); self._save_inventory()
|
||||
|
||||
10
src/main.py
10
src/main.py
@@ -12,7 +12,7 @@ from src.agentic_curator import evaluate_extracted_assets, AgenticCurator
|
||||
from src.autonomous_discovery import discover_trending_assets
|
||||
from src.gitops_manager import RepositoryController
|
||||
from src.logger import log_event
|
||||
from src.gemini_utils import call_gemini_with_retry, resolve_url, normalize_url
|
||||
from src.gemini_utils import call_gemini_with_retry, resolve_url, normalize_url, sanitize_trailing_slashes
|
||||
from src.state_manager import get_last_date, save_state
|
||||
|
||||
async def master_orchestrator():
|
||||
@@ -315,18 +315,20 @@ async def master_orchestrator():
|
||||
})
|
||||
|
||||
if evaluation["status"] == "INCLUDED":
|
||||
# Mandate 34: Sanitize new URLs before injection
|
||||
sanitized_url = sanitize_trailing_slashes(url)
|
||||
unique_new_assets.append({
|
||||
"url": url, "title": evaluation["title"],
|
||||
"url": sanitized_url, "title": evaluation["title"],
|
||||
"description": evaluation["description"],
|
||||
"year": evaluation.get("year", "N/A"),
|
||||
"category": evaluation.get("category", "kubernetes-tools"),
|
||||
"impact_score": evaluation["impact_score"],
|
||||
"reasoning": evaluation.get("reasoning")
|
||||
})
|
||||
existing_urls.add(url.split('#')[0].rstrip('/').lower())
|
||||
existing_urls.add(normalize_url(sanitized_url))
|
||||
for rel_cat in evaluation.get("related_categories", []):
|
||||
interlink_asset = {
|
||||
"url": url, "title": evaluation["title"],
|
||||
"url": sanitized_url, "title": evaluation["title"],
|
||||
"description": f"*(Related to {evaluation.get('category')} topic)*",
|
||||
"category": rel_cat, "impact_score": 50
|
||||
}
|
||||
|
||||
@@ -179,7 +179,15 @@ class V2VisionEngine:
|
||||
try:
|
||||
resp = await client.get(url, timeout=10.0)
|
||||
if resp.status_code < 400:
|
||||
self.inventory.setdefault(norm_url, {})["status"] = "online"
|
||||
final_url = str(resp.url)
|
||||
from src.gemini_utils import sanitize_trailing_slashes
|
||||
final_url = sanitize_trailing_slashes(final_url)
|
||||
|
||||
# Update URL if it was redirected/normalized
|
||||
if final_url != url:
|
||||
link["url"] = final_url
|
||||
|
||||
self.inventory.setdefault(normalize_url(final_url), {})["status"] = "online"
|
||||
return link
|
||||
except: pass
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user