# Files
# awesome-kubernetes/src/v2_optimizer.py
#
# 401 lines
# 22 KiB
# Python

import asyncio
import json
import os
import re
from datetime import datetime
from typing import Any, Dict, List, Optional, Set, Tuple

import httpx
import yaml

from src.config import GEMINI_API_KEYS, GH_TOKEN, TARGET_REPO, MADRID_TZ
from src.gemini_utils import call_gemini_with_retry
from src.logger import log_event
# Directory walked for the V1 Markdown pages that links are harvested from.
V1_DIR = "docs"
# Output directory that receives the generated V2 Markdown pages.
V2_DIR = "v2-docs"
# JSON file used to persist per-URL health and evaluation results between runs.
V2_CACHE_PATH = "data/v2_cache.json"
class V2VisionEngine:
    """Generate the V2 "high-density library" pages from the V1 Markdown archive.

    The pipeline (see ``analyze_and_cluster``) harvests list-item links from
    ``V1_DIR``, checks their health over HTTP, asks ``call_gemini_with_retry``
    for year / quality metadata, groups the results by dimension, writes the
    Markdown pages to ``V2_DIR`` and refreshes the ``nav:`` section of
    ``v2-mkdocs.yml``. Per-URL results are persisted in ``V2_CACHE_PATH``.
    """

    # Root that relative V1 page and image links are rewritten against.
    _SITE_ROOT = "https://nubenetes.com/"
    # Resources whose year is at least this value are tagged [CUTTING-EDGE].
    _CUTTING_EDGE_MIN_YEAR = 2025
    # Markdown list item "- [title](url) trailing text", plus any following
    # lines indented by two or more whitespace characters.
    _LINK_RE = re.compile(
        r'^\s*-\s*\[([^\]]+)\]\(([^\)]+)\)(.*?(?:\n\s{2,}.*)*)', re.MULTILINE
    )
    # Top-level "nav:" key and everything up to the next line that starts a new
    # top-level key (a line not starting with whitespace, '-' or '#'), or EOF.
    _NAV_BLOCK_RE = re.compile(r'^nav:.*?(?=^[^\s#-]|\Z)', re.MULTILINE | re.DOTALL)

    def __init__(self):
        """Set up the taxonomy and AI prompt preamble, then load the on-disk cache."""
        # 100% Comprehensive 2026 Taxonomy
        # NOTE: "aws-security" and "cloudflare" are listed under two dimensions;
        # _rebuild_structure maps a file to the LAST dimension that lists it.
        self.dimensions = {
            "Intelligent Control Plane": ["ai", "ai-agents-mcp", "chatgpt", "mlops"],
            "Architectural Foundations": ["introduction", "faq", "kubernetes", "linux", "git", "cloud-arch-diagrams", "matrix-table", "other-awesome-lists", "about"],
            "Platform & Site Reliability": ["sre", "devops", "developerportals", "scaffolding", "finops", "chaos-engineering", "performance-testing-with-jenkins-and-jmeter", "project-management-methodology", "project-management-tools", "qa", "test-automation-frameworks", "testops"],
            "Hardened Infrastructure": ["iac", "terraform", "pulumi", "crossplane", "ansible", "securityascode", "kubernetes-security", "aws-security", "oauth", "devsecops", "kustomize", "liquibase", "chef"],
            "Cloud Providers (Hyperscalers)": ["aws", "azure", "GoogleCloudPlatform", "ibm_cloud", "oraclecloud", "digitalocean", "cloudflare", "scaleway", "managed-kubernetes-in-public-cloud", "public-cloud-solutions", "private-cloud-solutions", "edge-computing", "aws-architecture", "aws-security", "aws-networking", "aws-databases", "aws-storage", "aws-monitoring", "aws-iac", "aws-tools-scripts", "aws-messaging", "aws-data", "aws-devops", "aws-serverless", "aws-containers", "aws-backup", "aws-training", "aws-newfeatures", "aws-miscellaneous", "aws-pricing", "aws-spain"],
            "Networking & Service Mesh": ["networking", "kubernetes-networking", "servicemesh", "istio", "caching", "web-servers", "cloudflare"],
            "The Container Stack": ["docker", "container-managers", "serverless", "kubernetes-autoscaling", "kubernetes-operators-controllers", "kubernetes-storage", "kubernetes-monitoring", "kubernetes-troubleshooting", "kubernetes-backup-migrations", "kubernetes-on-premise", "kubernetes-bigdata", "kubernetes-client-libraries", "kubernetes-releases", "kubernetes-based-devel", "kubernetes-alternatives", "kubectl-commands", "rancher", "openshift", "ocp3", "ocp4", "noops"],
            "Data & Advanced Analytics": ["databases", "nosql", "newsql", "message-queue", "crunchydata", "yaml", "bigdata"],
            "Engineering Pipeline": ["cicd", "gitops", "argo", "flux", "tekton", "jenkins", "jenkins-alternatives", "openshift-pipelines", "sonarqube", "registries", "keptn", "stackstorm", "cicd-kubernetes-plugins"],
            "Developer Ecosystem": ["visual-studio", "javascript", "golang", "python", "java_frameworks", "java_app_servers", "java-and-java-performance-optimization", "dotnet", "angular", "react", "web3", "api", "swagger-code-generator-for-rest-apis", "postman", "lowcode-nocode", "devel-sites", "dom", "linux-dev-env", "ChromeDevTools", "xamarin", "jvm-parameters-matrix-table", "maven-gradle", "embedded-servlet-containers"],
            "Career & Industry": ["recruitment", "hr", "freelancing", "remote-tech-jobs", "workfromhome", "interview-questions", "elearning", "digital-money", "appointment-scheduling", "newsfeeds"]
        }
        self.library_criteria = (
            "You are a Technical Librarian in 2026. Your mission is to build a high-density, professional reference library.\n"
            "PHASE 1: TECHNICAL PRESERVATION (HIGH INCLUSIVITY)\n"
            "- KEEP >90% of technical resources. Only discard 404s, obvious spam, or non-technical content.\n"
            "- 'Awesome' repositories, official documentation, and deep technical guides are mandatory.\n"
            "- YouTube videos are HIGH-VALUE resources; keep them as technical references.\n\n"
            "PHASE 2: TEMPORAL & QUALITY SYNTHESIS\n"
            "- Identify/estimate PUBLICATION YEAR.\n"
            "- Assign QUALITY level (1-3 stars):\n"
            " * 3 stars (🌟🌟🌟): Masterpieces, foundational standards, definitive 'Awesome' lists.\n"
            " * 2 stars (🌟🌟): Production-grade tools, deep tutorials, highly recommended videos.\n"
            " * 1 star (🌟): Solid technical references.\n"
            "- Identify if a resource is a 'YouTube Video/Playlist' for special rendering.\n"
        )
        self.cache = self._load_cache()

    # ------------------------------------------------------------------ cache

    def _load_cache(self) -> Dict:
        """Return the cache stored at ``V2_CACHE_PATH``, or ``{}`` if unusable.

        A missing file, unreadable file, invalid JSON, or a JSON document whose
        top level is not an object all yield an empty cache.
        """
        if not os.path.exists(V2_CACHE_PATH):
            return {}
        try:
            with open(V2_CACHE_PATH, "r", encoding="utf-8") as f:
                cached = json.load(f)
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            log_event(f"[!] Ignoring unreadable V2 cache {V2_CACHE_PATH}: {exc}")
            return {}
        return cached if isinstance(cached, dict) else {}

    def _save_cache(self):
        """Write ``self.cache`` to ``V2_CACHE_PATH`` as indented JSON."""
        cache_dir = os.path.dirname(V2_CACHE_PATH)
        if cache_dir:  # os.makedirs("") raises, so skip it for bare filenames
            os.makedirs(cache_dir, exist_ok=True)
        with open(V2_CACHE_PATH, "w", encoding="utf-8") as f:
            json.dump(self.cache, f, indent=2)

    # --------------------------------------------------------------- pipeline

    async def analyze_and_cluster(self):
        """Run the whole V2 generation pipeline end to end.

        The cache is saved even when a phase raises, so health-check and AI
        results already collected are not lost; the exception still propagates.
        """
        log_event("STARTING V2 HIGH-DENSITY CHRONOLOGICAL LIBRARY GENERATION", section_break=True)
        try:
            all_v1_links, mosaic_html, videos_html = await self._gather_all_v1_content()
            log_event(f"[*] Discovery: Found {len(all_v1_links)} resources in V1 archive.")
            log_event("[*] Phase 1: Health Check & Metadata Enrichment...")
            # Rapid Async Health Check
            health_inventory = await self._verify_link_health(all_v1_links)
            log_event(f"[*] Health Check Complete. {len(health_inventory)}/{len(all_v1_links)} links are online.")
            log_event("[*] Phase 2: Library Evaluation, Year Extraction & Quality Scoring...")
            library_inventory = await self._evaluate_and_score_resources(health_inventory)
            log_event(f"[*] Inventory Refined: {len(library_inventory)} resources kept.")
            log_event("[*] Phase 3: Dimensional Clustering & Chronological Sorting...")
            v2_data = await self._rebuild_structure(library_inventory)
            log_event("[*] Phase 4: Generating Premium Portal Pages...")
            os.makedirs(V2_DIR, exist_ok=True)
            await self._write_premium_files(v2_data, mosaic_html, videos_html)
            await self._sync_enterprise_navigation(v2_data)
        finally:
            self._save_cache()
        log_event("V2 LIBRARY GENERATION COMPLETED.", section_break=True)

    # -------------------------------------------------------------- discovery

    @classmethod
    def _absolutize_url(cls, url: str) -> str:
        """Rewrite a relative V1 link to an absolute ``_SITE_ROOT`` URL.

        Absolute (``http``/``https``), ``mailto:`` and pure-anchor links are
        returned unchanged. ``page.md`` and ``page.md#anchor`` become
        ``<root>page/`` and ``<root>page/#anchor``; ``images/...`` is prefixed
        with the root. Any other relative link is returned unchanged.
        """
        if url.startswith(("http://", "https://", "mailto:", "#")):
            return url
        page, sep, anchor = url.partition("#")
        if page.endswith(".md"):
            # Strip only the trailing extension, not every ".md" in the path.
            return f"{cls._SITE_ROOT}{page[:-3]}/{sep}{anchor}"
        if url.startswith("images/"):
            return f"{cls._SITE_ROOT}{url}"
        return url

    async def _gather_all_v1_content(self) -> Tuple[List[Dict], str, str]:
        """Collect every list-item link under ``V1_DIR`` plus two index snippets.

        Returns:
            ``(links, mosaic_html, videos_html)``. Each link is a dict with
            ``title``, ``url``, ``description`` and ``original_file`` (the file's
            base name). The two snippets are cut out of ``index.md`` and are
            empty strings when the file or the pattern is absent.
        """
        all_links = []
        mosaic_html = ""
        videos_html = ""
        index_path = os.path.join(V1_DIR, "index.md")
        if os.path.exists(index_path):
            with open(index_path, "r", encoding="utf-8") as f:
                idx_content = f.read()
            mosaic_match = re.search(r'<center>\s*(\[!\[.*?)\s*</center>', idx_content, re.DOTALL)
            if mosaic_match:
                mosaic_html = mosaic_match.group(1)
            videos_match = re.search(r'\?\?\? note "Top Videos & Clips.*?\n(.*?\n)\s*</center>', idx_content, re.DOTALL)
            if videos_match:
                videos_html = videos_match.group(1)
        for root, dirs, files in os.walk(V1_DIR):
            dirs.sort()  # walk in a stable order so generated pages are reproducible
            for file in sorted(files):
                # index.md files (at any depth) are skipped.
                if not file.endswith(".md") or file == "index.md":
                    continue
                path = os.path.join(root, file)
                with open(path, "r", encoding="utf-8") as f:
                    content = f.read()
                for m in self._LINK_RE.finditer(content):
                    title, url, full_desc = m.groups()
                    all_links.append({
                        "title": title,
                        # Relative links are made absolute for cross-edition stability.
                        "url": self._absolutize_url(url),
                        "description": full_desc.strip(),
                        "original_file": file
                    })
        return all_links, mosaic_html, videos_html

    # ----------------------------------------------------------- health check

    async def _verify_link_health(self, links: List[Dict]) -> List[Dict]:
        """Check ``links`` in concurrent batches and return those not dropped.

        A link is dropped only when ``_check_single_link_resilient`` returns
        ``None`` (a definitive 404).
        """
        online_links = []
        BATCH_SIZE = 50  # Smaller batches for stability
        # User-Agent rotation to mimic real browsers
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0"
        ]
        # NOTE(review): verify=False disables TLS certificate validation. That
        # lets sites with broken certificates count as reachable, but responses
        # are unauthenticated; confirm this trade-off is intended.
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, verify=False) as client:
            for i in range(0, len(links), BATCH_SIZE):
                batch = links[i:i + BATCH_SIZE]
                # Rotate per link: indexing by the batch start alone (a multiple
                # of 50) would only ever select entries 0 and 2.
                tasks = [
                    self._check_single_link_resilient(
                        client, link, user_agents[(i + offset) % len(user_agents)]
                    )
                    for offset, link in enumerate(batch)
                ]
                results = await asyncio.gather(*tasks)
                online_links.extend(r for r in results if r is not None)
                if i % 500 == 0:
                    log_event(f" [Resilient Health] Verified {i}/{len(links)} links...")
                # Brief pause to avoid triggering Rate Limits
                await asyncio.sleep(0.1)
        return online_links

    async def _check_single_link_resilient(self, client, link: Dict, ua: str, attempts: int = 3) -> Optional[Dict]:
        """Classify one link and annotate it in place with ``health_status``.

        Args:
            client: object exposing an awaitable ``get(url, headers=, timeout=)``
                whose result has a ``status_code`` (an ``httpx.AsyncClient`` here).
            link: link dict; ``url`` and ``title`` are read, status keys are added.
            ua: User-Agent header value for this link.
            attempts: maximum number of GET attempts.

        Returns:
            ``None`` for a definitive 404, otherwise ``link`` with
            ``health_status`` set to ``trusted``, ``cached``, ``online`` or
            ``uncertain`` (``warning="offline"`` is added when the final attempt
            raised).
        """
        url = link["url"]
        # 1. Immediate Pass for Trusted / Logic-Enriched Domains
        if "github.com" in url or "awesome" in link["title"].lower():
            link["health_status"] = "trusted"
            return link
        # 2. Cached Health
        cached = self.cache.get(url)
        if isinstance(cached, dict) and cached.get("status") == "online":
            link["health_status"] = "cached"
            return link
        # 3. Multi-Attempt Verification with Identity Rotation
        headers = {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://www.google.com/"
        }
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            try:
                # Use GET instead of HEAD as many sites block HEAD or return 405
                resp = await client.get(url, headers=headers, timeout=10.0)
            except Exception:
                # `Exception` (not a bare except) so task cancellation propagates.
                if is_last:
                    # Final attempt failed - Soft Flagging instead of removal
                    link["health_status"] = "uncertain"
                    link["warning"] = "offline"
                    return link
            else:
                if resp.status_code < 400:
                    self.cache.setdefault(url, {})["status"] = "online"
                    link["health_status"] = "online"
                    return link
                # If 404, it's a definitive fail
                if resp.status_code == 404:
                    log_event(f" [Health] Definitive 404: {url}")
                    return None
            if not is_last:
                # Backoff before retry (no point sleeping after the last attempt)
                await asyncio.sleep(0.5 * (attempt + 1))
        # Persistent non-404 errors (403, 5xx, ...): keep the link, but make
        # sure it always carries a status.
        link.setdefault("health_status", "uncertain")
        return link

    # -------------------------------------------------------------- evaluation

    @classmethod
    def _assign_tag(cls, item: Dict) -> str:
        """Return the display tag derived from an item's ``year`` and ``title``."""
        year = str(item.get("year", ""))
        if year.isdecimal() and int(year) >= cls._CUTTING_EDGE_MIN_YEAR:
            return "[CUTTING-EDGE]"
        if "awesome" in item["title"].lower():
            return "[FOUNDATIONAL]"
        return "[PRODUCTION-READY]"

    def _fallback_items(self, batch: List[Dict]) -> List[Dict]:
        """Build default-scored copies of ``batch`` for when the AI call fails."""
        items = []
        for link in batch:
            item = link.copy()
            item["year"], item["stars"] = "2024", 1
            item["is_video"] = "youtube" in link["url"] or "youtu.be" in link["url"]
            item["tag"] = self._assign_tag(item)
            items.append(item)
        return items

    async def _evaluate_and_score_resources(self, links: List[Dict]) -> List[Dict]:
        """Attach ``year``, ``stars``, ``is_video`` and ``tag`` to each link.

        Links whose cache entry already has a ``year`` reuse it; the rest are
        sent to ``call_gemini_with_retry`` in batches. Links the model leaves
        out of its ``results`` are omitted from the return value. If a batch
        call fails or returns a malformed payload, the whole batch is kept with
        default scores. GitHub links additionally get ``gh_stars``/``gh_updated``.
        """
        refined = []
        to_evaluate = []
        # Pull from cache first
        for link in links:
            url = link["url"]
            cached = self.cache.get(url)
            if isinstance(cached, dict) and "year" in cached:
                item = link.copy()
                item.update(cached)
                # Refresh GitHub metadata if it's a GH link
                if "github.com" in url:
                    item.update(await self._fetch_github_metadata(url))
                # Tags are not stored in the cache, so derive them again here;
                # otherwise every cached item falls back to [PRODUCTION-READY].
                item["tag"] = self._assign_tag(item)
                refined.append(item)
            else:
                to_evaluate.append(link)
        if not to_evaluate:
            return refined
        BATCH_SIZE = 50
        for i in range(0, len(to_evaluate), BATCH_SIZE):
            batch = to_evaluate[i:i + BATCH_SIZE]
            batch_num = i // BATCH_SIZE + 1
            log_event(f" [>] Processing Batch {batch_num} with AI...")
            prompt = (
                f"{self.library_criteria}\n"
                "Respond ONLY with a JSON object: {\"results\": [{\"idx\": int, \"year\": \"YYYY\", \"stars\": int, \"is_video\": bool}, ...]}\n\n"
                "LINKS:\n" + "\n".join([f"{idx}. {l['title']} ({l['url']})" for idx, l in enumerate(batch)])
            )
            try:
                # presumably returns the parsed JSON object - verify against gemini_utils
                data = await call_gemini_with_retry(prompt)
                results = data.get("results", []) if isinstance(data, dict) else None
                if not isinstance(results, list):
                    raise ValueError("AI response has no 'results' list")
            except Exception as exc:
                log_event(f" [!] Batch {batch_num} AI evaluation failed ({exc}); using default scores.")
                refined.extend(self._fallback_items(batch))
            else:
                seen = set()
                for res in results:
                    try:
                        idx = int(res["idx"])
                        # Reject negative indices (they would wrap around) and repeats.
                        if not 0 <= idx < len(batch) or idx in seen:
                            continue
                        eval_data = {
                            "year": str(res.get("year", "2024")),
                            "stars": min(max(int(res.get("stars", 1)), 1), 3),
                            "is_video": res.get("is_video", False)
                        }
                    except (AttributeError, KeyError, TypeError, ValueError):
                        continue  # malformed entry: skip it, keep the rest of the batch
                    seen.add(idx)
                    item = batch[idx].copy()
                    item.update(eval_data)
                    if "github.com" in item["url"]:
                        gh_meta = await self._fetch_github_metadata(item["url"])
                        item.update(gh_meta)
                        eval_data.update(gh_meta)
                    # Save to cache: merge so the health check's "status" survives.
                    self.cache.setdefault(item["url"], {}).update(eval_data)
                    item["tag"] = self._assign_tag(item)
                    refined.append(item)
            await asyncio.sleep(0.3)
        return refined

    async def _fetch_github_metadata(self, url: str) -> Dict:
        """Return ``gh_stars``/``gh_updated`` for a GitHub repo URL, or ``{}``.

        Best effort: a URL without an ``owner/repo`` path, a non-200 API
        response, or any request/parsing error yields an empty dict.
        """
        match = re.search(r'github\.com/([^/]+)/([^/]+)', url)
        if not match:
            return {}
        owner, repo = match.groups()
        repo = repo.split("#")[0].split("?")[0]  # Clean up
        if repo.endswith(".git"):
            repo = repo[:-4]
        if not repo:
            return {}
        headers = {"Authorization": f"token {GH_TOKEN}"} if GH_TOKEN else {}
        api_url = f"https://api.github.com/repos/{owner}/{repo}"
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                resp = await client.get(api_url, headers=headers)
                if resp.status_code == 200:
                    data = resp.json()
                    return {
                        "gh_stars": data.get("stargazers_count", 0),
                        # "updated_at" may be null in the payload
                        "gh_updated": (data.get("updated_at") or "").split("T")[0]
                    }
        except Exception as exc:
            log_event(f" [GitHub] Metadata lookup failed for {owner}/{repo}: {exc}")
        return {}

    # -------------------------------------------------------------- clustering

    async def _rebuild_structure(self, inventory: List[Dict]) -> Dict[str, Dict]:
        """Group ``inventory`` into ``{dimension: {"summary", "categories"}}``.

        Each item goes to the dimension that lists its ``original_file``
        (default "Architectural Foundations"), under a category named after the
        file. Categories are sorted by year ascending, then stars descending.
        Non-empty dimensions get a one-sentence summary from
        ``call_gemini_with_retry``, with a fixed fallback sentence on failure.
        """
        v2_structure = {dim: {"summary": "", "categories": {}} for dim in self.dimensions}
        file_to_dim = {}
        for dim, files in self.dimensions.items():
            for name in files:
                file_to_dim[name + ".md"] = dim
        for item in inventory:
            dim = file_to_dim.get(item["original_file"], "Architectural Foundations")
            cat_name = item["original_file"].replace(".md", "").capitalize()
            v2_structure[dim]["categories"].setdefault(cat_name, []).append(item)
        for dim, content in v2_structure.items():
            if not content["categories"]:
                continue
            for cat_links in content["categories"].values():
                cat_links.sort(key=lambda x: (str(x.get("year", "0")), -x.get("stars", 0)))
            prompt = f"Write a professional 2026 executive summary for '{dim}'. Focus on high-density value. 1 sentence only."
            try:
                content["summary"] = await call_gemini_with_retry(prompt, response_format="text")
            except Exception as exc:
                log_event(f" [!] Summary generation failed for '{dim}' ({exc}); using fallback.")
                content["summary"] = f"Comprehensive chronological reference library for {dim}."
        return v2_structure

    # --------------------------------------------------------------- rendering

    @staticmethod
    def _slugify(dim: str) -> str:
        """Turn a dimension title into the file slug used for its page."""
        return dim.lower().replace(" ", "-").replace("&", "and").replace("(", "").replace(")", "")

    @staticmethod
    def _render_index(data: Dict[str, Dict], mosaic_html: str, videos_html: str) -> str:
        """Return the Markdown for the V2 landing page.

        Lists up to 100 three-star resources (sorted by year, then title), the
        video snippet, and one entry per non-empty dimension.
        """
        master_selection = [
            link
            for dim in data.values()
            for cat_links in dim["categories"].values()
            for link in cat_links
            if link.get("stars", 1) == 3
        ]
        master_selection.sort(key=lambda x: (str(x.get("year", "0")), x["title"]))
        parts = [
            "# Nubenetes V2 | The High-Density Library (2026)\n\n"
            "![Banner](https://raw.githubusercontent.com/nubenetes/awesome-kubernetes/master/docs/images/logo.png)\n\n"
            "!!! quote \"The Library of 2026\"\n"
            # Admonition bodies must be indented four spaces to render inside the box.
            "    A meticulously curated reference of over 15,000 resources. This V2 portal preserves technical depth while providing "
            "chronological clarity and expert quality synthesis.\n\n"
            f"<center>\n{mosaic_html}\n</center>\n\n"
            "## 🌟 Master Selection (Top-Tier Gems)\n"
            "A global selection of the most impactful resources across all dimensions.\n\n"
        ]
        for link in master_selection[:100]:
            gh_info = f" `[⭐ {link['gh_stars']}]`" if "gh_stars" in link else ""
            parts.append(f"- **({link.get('year', 'N/A')})** [{link['title']}]({link['url']}){gh_info} 🌟🌟🌟\n")
        parts.append("\n??? note \"Elite Video Selection - Click to expand!\"\n")
        parts.append(f"    <center>\n{videos_html}\n    </center>\n\n")
        parts.append("## Strategic Dimensions\n")
        for dim, content in data.items():
            if not content["categories"]:
                continue
            slug = V2VisionEngine._slugify(dim)
            parts.append(f"- **[{dim}](./{slug}.md)**: {content['summary']}\n")
        return "".join(parts)

    @staticmethod
    def _render_dimension_page(dim: str, content: Dict) -> str:
        """Return the Markdown page for one dimension: summary box, then one
        ``##`` section per category with one list item per resource."""
        parts = [
            f"# {dim}\n\n",
            f"!!! info \"Architectural Context\"\n    {content['summary']}\n\n",
        ]
        for cat, links in content["categories"].items():
            parts.append(f"## {cat}\n")
            for link in links:
                year, stars = link.get("year", "N/A"), "🌟" * link.get("stars", 1)
                tag = link.get("tag", "[PRODUCTION-READY]")
                color = "success" if "FOUNDATIONAL" in tag else "info" if "PRODUCTION" in tag else "warning"
                title_display = f"**{link['title']}**" if link.get("stars", 1) >= 2 else link['title']
                gh_info = f" <span class='md-tag md-tag--info'>⭐ {link['gh_stars']}</span>" if "gh_stars" in link else ""
                icon = " 🎥" if link.get("is_video") else ""
                parts.append(f" - **({year})** [{title_display}]({link['url']}){icon}{gh_info} {stars} <span class='md-tag md-tag--{color}'>{tag}</span>\n")
                desc = link.get("description")
                if desc:
                    if "\n" in desc:
                        parts.append("\n" + "\n".join([" " + line for line in desc.split("\n")]) + "\n\n")
                    else:
                        parts.append(f" {desc}\n")
            # NOTE(review): one blank line after each category; the original
            # nesting of this statement was lost with the file's indentation.
            parts.append("\n")
        return "".join(parts)

    async def _write_premium_files(self, data: Dict[str, Dict], mosaic_html: str, videos_html: str):
        """Write ``index.md`` and one ``<slug>.md`` per non-empty dimension into ``V2_DIR``."""
        # Ensure mosaic images point to V1 root
        mosaic_html = mosaic_html.replace('src="images/', 'src="https://nubenetes.com/images/').replace('](images/', '](https://nubenetes.com/images/')
        # utf-8 is explicit because the pages contain emoji that the platform
        # default encoding may not be able to represent.
        with open(os.path.join(V2_DIR, "index.md"), "w", encoding="utf-8") as f:
            f.write(self._render_index(data, mosaic_html, videos_html))
        for dim, content in data.items():
            if not content["categories"]:
                continue
            page_path = os.path.join(V2_DIR, f"{self._slugify(dim)}.md")
            with open(page_path, "w", encoding="utf-8") as f:
                f.write(self._render_dimension_page(dim, content))

    # -------------------------------------------------------------- navigation

    @classmethod
    def _replace_nav_block(cls, content: str, new_nav: str) -> str:
        """Return ``content`` with its top-level ``nav:`` section set to ``new_nav``.

        Only the ``nav:`` block is replaced; top-level keys after it are kept.
        A comment line at column 0 directly after the block counts as part of
        it. If no top-level ``nav:`` key exists, ``new_nav`` is appended.
        """
        block = new_nav + "\n"
        if cls._NAV_BLOCK_RE.search(content):
            # A callable replacement keeps backslashes in titles literal.
            return cls._NAV_BLOCK_RE.sub(lambda _m: block, content, count=1)
        separator = "" if not content or content.endswith("\n") else "\n"
        return content + separator + block

    async def _sync_enterprise_navigation(self, data: Dict[str, Dict]):
        """Rewrite the ``nav:`` section of ``v2-mkdocs.yml`` to match ``data``.

        Best effort: any failure is logged and the method returns normally.
        """
        config_path = "v2-mkdocs.yml"
        try:
            nav_items = [
                "nav:",
                " - \"🔙 Back to V1 (Exhaustive)\": https://nubenetes.com/",
                " - \"The 2026 Vision\": index.md"
            ]
            for dim, content in data.items():
                if not content["categories"]:
                    continue
                nav_items.append(f" - \"{dim}\": {self._slugify(dim)}.md")
            with open(config_path, "r", encoding="utf-8") as f:
                current = f.read()
            updated_content = self._replace_nav_block(current, "\n".join(nav_items))
            with open(config_path, "w", encoding="utf-8") as f:
                f.write(updated_content)
        except Exception as exc:
            log_event(f"[!] Could not update navigation in {config_path}: {exc}")
if __name__ == "__main__":
    # Script entry point: build the engine (which loads the on-disk cache) and
    # run the full async pipeline to completion.
    engine = V2VisionEngine()
    asyncio.run(engine.analyze_and_cluster())