mirror of
https://github.com/nubenetes/awesome-kubernetes.git
synced 2026-05-23 17:43:16 +00:00
300 lines
16 KiB
Python
300 lines
16 KiB
Python
import asyncio
|
|
import json
|
|
import os
|
|
import re
|
|
import httpx
|
|
import yaml
|
|
import hashlib
|
|
from datetime import datetime
|
|
from typing import List, Dict, Optional, Tuple
|
|
from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ
|
|
from src.gitops_manager import RepositoryController
|
|
from src.gemini_utils import call_gemini_with_retry, normalize_url, clean_toc_text
|
|
from src.logger import log_event
|
|
|
|
def get_best_category_match(suggested: str) -> Optional[str]:
    """Map a free-form category suggestion onto an entry of NUBENETES_CATEGORIES.

    The suggestion is lower-cased and stripped, then compared against each
    category as stored (assumes the configured categories are lower-case
    strings - TODO confirm against src.config).

    Args:
        suggested: Category name proposed by the caller (may be None/empty).

    Returns:
        The exactly matching category if there is one, otherwise the first
        category that contains the suggestion or is contained in it, otherwise
        None.
    """
    if not isinstance(suggested, str):
        return None
    needle = suggested.lower().strip()
    if not needle:
        # An empty needle is a substring of every string and would otherwise
        # "match" the first configured category.
        return None

    first_fuzzy = None
    for cat in NUBENETES_CATEGORIES:
        if not cat:
            # Same reasoning in the other direction: "" is contained in any needle.
            continue
        if needle == cat:
            return cat
        if first_fuzzy is None and (needle in cat or cat in needle):
            first_fuzzy = cat
    return first_fuzzy
|
|
|
|
async def _deep_fetch_content(url: str) -> Tuple[str, Dict]:
    """Download *url* and return its visible text together with page metadata.

    Best-effort: any failure (network error, non-200 status, parse error)
    yields ``("", {})`` instead of raising.

    Args:
        url: Address of the page to fetch.

    Returns:
        A tuple ``(text, rich_meta)`` where ``text`` is at most the first 4000
        characters of the page text with script/style/nav/footer/aside removed,
        and ``rich_meta`` is whatever ``_enrich_rich_metadata`` extracted.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }
    try:
        timeout = httpx.Timeout(12.0, connect=5.0)
        # NOTE(review): verify=False disables TLS certificate verification.
        # Kept as-is because it looks deliberate (fetching arbitrary sites),
        # but the fetched content must be treated as untrusted.
        async with httpx.AsyncClient(timeout=timeout, verify=False) as client:
            resp = await client.get(url, headers=headers, follow_redirects=True)
        if resp.status_code != 200:
            return "", {}

        from bs4 import BeautifulSoup

        soup = BeautifulSoup(resp.text, "html.parser")
        # Metadata is read before the tree is pruned below.
        rich_meta = await _enrich_rich_metadata(url, soup)
        for tag in soup(["script", "style", "nav", "footer", "aside"]):
            tag.decompose()
        return soup.get_text(separator=" ", strip=True)[:4000], rich_meta
    except Exception as exc:
        # `except Exception` (not a bare except) so task cancellation and
        # KeyboardInterrupt still propagate.
        log_event(f" [!] Fetch failed for {url}: {exc}")
        return "", {}
|
|
|
|
async def _get_github_activity(url: str) -> Dict:
    """Fetch star count and activity dates for the GitHub repository in *url*.

    Args:
        url: Any URL containing ``github.com/<owner>/<repo>``.

    Returns:
        ``{"gh_stars": int, "gh_pushed": str, "gh_created": str}`` when the
        GitHub API answers with status 200; an empty dict when the URL does not
        name a repository, the API answers otherwise, or the request fails.
    """
    match = re.search(r'github\.com/([^/]+)/([^/]+)', url)
    if not match:
        return {}
    owner, repo = match.groups()
    repo = repo.split("#")[0].split("?")[0]
    # str.rstrip(".git") strips any trailing '.', 'g', 'i', 't' characters
    # ("logging" -> "loggin"); only the literal suffix must be removed.
    if repo.endswith(".git"):
        repo = repo[:-len(".git")]
    if not repo:
        return {}

    headers = {"Authorization": f"token {GH_TOKEN}"} if GH_TOKEN else {}
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.get(f"https://api.github.com/repos/{owner}/{repo}", headers=headers)
        if resp.status_code == 200:
            data = resp.json()
            return {
                "gh_stars": data.get("stargazers_count", 0),
                "gh_pushed": data.get("pushed_at", ""),
                "gh_created": data.get("created_at", ""),
            }
    except Exception as exc:
        # Best-effort lookup: report and fall through to the empty result.
        log_event(f" [!] GitHub activity lookup failed for {owner}/{repo}: {exc}")
    return {}
|
|
|
|
def _load_domain_blacklist(memory_file: str) -> set:
    """Return the ``blacklisted_domains`` stored in *memory_file* (empty set on any problem)."""
    if not os.path.exists(memory_file):
        return set()
    try:
        with open(memory_file, "r") as fh:
            memory_data = json.load(fh)
        # Keep only non-empty strings: "" is a substring of every URL and a
        # non-string entry would break the substring test in the caller.
        return {d for d in memory_data.get("blacklisted_domains", []) if isinstance(d, str) and d}
    except (OSError, ValueError, AttributeError, TypeError) as exc:
        log_event(f" [!] Could not load domain blacklist: {exc}")
        return set()


def _is_stale_push(pushed: str, max_age_days: int = 365 * 4) -> bool:
    """Tell whether the ISO-8601 timestamp *pushed* is older than *max_age_days*."""
    if not pushed:
        return False
    try:
        last_date = datetime.fromisoformat(pushed.replace("Z", "+00:00"))
    except (ValueError, AttributeError):
        # Unparseable timestamp: do not penalise, and do not abort the batch.
        return False
    return (datetime.now(last_date.tzinfo) - last_date).days > max_age_days


async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
    """Evaluate candidate resources and record the accepted ones in the inventory.

    For every asset the function, in order: skips blacklisted domains, reuses a
    cached inventory entry when it has a title and hierarchy, otherwise fetches
    the page, checks GitHub activity, asks the model for a JSON verdict and -
    if the score passes the threshold and the category is known - stores the
    result in the curator inventory (saved after each acceptance).

    Args:
        raw_assets: Dicts with at least a ``"url"`` key; ``"source_type"`` is
            optional and defaults to ``"Social"``.

    Returns:
        Mapping of original URL to an evaluation dict whose ``"status"`` is
        ``"INCLUDED"`` or ``"FILTERED"``. URLs whose model call failed are
        logged and left out of the mapping.
    """
    evaluations = {}
    curator = AgenticCurator()

    # Mandate 2: Load Blacklist
    domain_blacklist = _load_domain_blacklist("src/memory/health_learning.json")

    total = len(raw_assets)
    for i, asset in enumerate(raw_assets):
        url = asset["url"]
        log_event(f"--- EVALUATING {i+1}/{total}: {url} ---")
        norm_url = normalize_url(url)

        # Mandate 2: Skip Blacklisted
        url_lower = url.lower()
        if any(domain in url_lower for domain in domain_blacklist):
            log_event(" [-] SKIPPING: Blacklisted domain detected.")
            evaluations[url] = {"status": "FILTERED", "reason": "Blacklisted"}
            continue

        # --- DATABASE-FIRST: Reuse insights ---
        cached = curator.inventory.get(norm_url)
        if isinstance(cached, dict) and cached.get("title") and cached.get("hierarchy"):
            log_event(f" [⚡] REUSING CACHED INSIGHTS: {cached['title']}")
            from src.gemini_utils import SESSION_TRACKER
            SESSION_TRACKER.track_cache_hit(est_tokens=2200)
            # "status" goes last: inventory entries carry their own
            # "status": "online", which must not replace the verdict.
            evaluations[url] = {**cached, "status": "INCLUDED"}
            continue

        # 1. Fetch & Fingerprint
        web_content, rich_meta = await _deep_fetch_content(url)
        content_hash = hashlib.sha256(web_content.encode()).hexdigest() if web_content else "N/A"

        # Mandate 3: MVQ Check (GitHub Activity)
        mvq_penalty = False
        gh_meta = {}
        if "github.com" in url:
            gh_meta = await _get_github_activity(url)
            if _is_stale_push(gh_meta.get("gh_pushed", "")):
                mvq_penalty = True
                log_event(" [!] MVQ ALERT: Stale repository (>4 years). Penalty applied.")

        # 2. AI Logic (O'Reilly + Linguistic Diversity)
        is_primary = "nubenetes" in asset.get("source_type", "Social").lower()
        strictness = "BE EXTREMELY SELECTIVE.\n" if not is_primary else ""
        stale_note = (
            "IMPORTANT: This repo is old (>4 years inactive). Assign impact_score < 30."
            if mvq_penalty else ""
        )
        prompt = (
            "You act as a Senior Technical Librarian in 2026.\n" + strictness +
            f"{stale_note}\n"
            "PHASE 1: LINGUISTIC DIVERSITY (Mandate 10)\n"
            "- DESC (V1 Archive): Professional summary in native language.\n"
            "- EN_SUMMARY (V2 Portal): English synthesis.\n"
            "Respond ONLY with JSON: {\"impact_score\": int, \"pub_date\": \"YYYY-MM-DD\", \"primary_category\": \"cat\", \"title\": \"...\", \"desc\": \"...\", \"en_summary\": \"...\", \"language\": \"...\", \"resource_type\": \"...\", \"complexity\": \"...\", \"technical_hierarchy\": [\"Area\", ...], \"is_microservice\": bool}\n"
            f"CONTENT: {web_content[:2000]}"
        )

        try:
            data = await call_gemini_with_retry(prompt)
            score = data.get("impact_score", 50)
            primary_cat = get_best_category_match(data.get("primary_category"))

            # Primary (nubenetes) sources only need a minimal score; everything
            # else must clear 80.
            if score >= (5 if is_primary else 80) and primary_cat:
                previous = cached if isinstance(cached, dict) else {}
                eval_data = {
                    "title": data["title"],
                    "description": data["desc"],
                    "ai_summary": data.get("en_summary", data["desc"]),
                    "language": data.get("language", "English"),
                    "resource_type": data.get("resource_type", "Reference"),
                    "complexity": data.get("complexity", "Intermediate"),
                    "hierarchy": data.get("technical_hierarchy", ["General"]),
                    "is_microservice": data.get("is_microservice", False),
                    # The model may answer null for pub_date; slicing None would raise.
                    "year": str(data.get("pub_date") or "N/A")[:4],
                    "stars": min(max(score // 20, 0), 5),
                    "content_hash": content_hash,
                    "source_provenance": asset.get("source_type", "Social"),
                    "social_preview_url": rich_meta.get("og_image", ""),
                    "mentions_count": previous.get("mentions_count", 0) + 1,
                    "category": primary_cat,
                    "status": "online",
                    "last_checked": datetime.now().timestamp(),
                    **gh_meta,
                }
                curator.inventory[norm_url] = eval_data
                # The inventory keeps "status": "online"; the returned
                # evaluation must report the verdict, so it is set last.
                evaluations[url] = {**eval_data, "status": "INCLUDED"}
                curator._save_inventory()
                log_event(f" [+] ACCEPTED: {data['title']}")
            else:
                evaluations[url] = {"status": "FILTERED"}
        except Exception as e:
            log_event(f" [!] AI Error: {e}")
    return evaluations
|
|
|
|
# Location of the YAML inventory file that AgenticCurator loads from and saves to.
INVENTORY_PATH = "data/inventory.yaml"
|
|
|
|
class AgenticCurator:
    """Maintains the Markdown docs and the YAML inventory of evaluated resources.

    Instance attributes (set in ``__init__``):
        git_controller: ``RepositoryController`` built from ``GH_TOKEN`` and ``TARGET_REPO``.
        docs_dir: Directory scanned for ``.md`` files (``"docs"``).
        inventory: Mapping loaded from ``INVENTORY_PATH``.
    """

    # Largest document (in characters) that is sent to the model for reorganization.
    _REORG_PROMPT_LIMIT = 5000

    # Heading -> anchor: spaces become dashes, a few punctuation marks are dropped.
    _ANCHOR_TABLE = str.maketrans({" ": "-", ".": None, "/": None, "(": None, ")": None, ",": None})

    def __init__(self):
        self.git_controller = RepositoryController(GH_TOKEN, TARGET_REPO)
        self.docs_dir = "docs"
        self.inventory = self._load_inventory()

    def _load_inventory(self) -> dict:
        """Return the inventory stored at ``INVENTORY_PATH`` ({} if absent, unreadable or not a mapping)."""
        if not os.path.exists(INVENTORY_PATH):
            return {}
        try:
            with open(INVENTORY_PATH, "r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f)
        except (OSError, ValueError, yaml.YAMLError) as exc:
            log_event(f" [!] Could not load inventory: {exc}")
            return {}
        return loaded if isinstance(loaded, dict) else {}

    def _save_inventory(self):
        """Write ``self.inventory`` to ``INVENTORY_PATH`` as YAML.

        The data is written to a temporary sibling file and then moved into
        place, so an interrupted save cannot leave a truncated inventory.
        """
        directory = os.path.dirname(INVENTORY_PATH)
        if directory:
            os.makedirs(directory, exist_ok=True)
        tmp_path = f"{INVENTORY_PATH}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            yaml.dump(self.inventory, f, sort_keys=False, allow_unicode=True)
        os.replace(tmp_path, INVENTORY_PATH)

    @staticmethod
    def _read_yaml(path: str):
        """Return the parsed YAML document at *path*, or None if missing or unreadable."""
        if not os.path.exists(path):
            return None
        try:
            with open(path, "r", encoding="utf-8") as f:
                return yaml.safe_load(f)
        except (OSError, ValueError, yaml.YAMLError) as exc:
            log_event(f" [!] Could not read {path}: {exc}")
            return None

    @staticmethod
    def _format_resource_line(asset: Dict) -> str:
        """Build the Markdown bullet for *asset* (needs ``title``, ``url``, ``description``)."""
        stars = " 🌟" if (asset.get("stars") or 0) >= 4 else ""
        return f" - **({asset.get('year', 'N/A')})** [{asset['title']}]({asset['url']}){stars} - {asset['description']}"

    async def _rebuild_toc(self, content: str) -> str:
        """Regenerate the numbered table of contents inside *content*.

        The first run of lines shaped like ``1. [text`` (blank lines allowed
        in between) is treated as the existing TOC and replaced by entries for
        every ``##`` / ``###`` heading.

        Returns:
            The updated text, or *content* unchanged when it has no such
            headings or no existing numbered list to replace.
        """
        lines = content.splitlines()
        headers = []
        for line in lines:
            is_h2 = line.startswith("## ")
            if is_h2 or line.startswith("### "):
                title = clean_toc_text(line.strip("#").strip())
                headers.append({
                    "title": title,
                    "anchor": title.lower().translate(self._ANCHOR_TABLE),
                    "level": 2 if is_h2 else 3,
                })
        if not headers:
            return content

        toc_item = re.compile(r"^\d+\.\s+\[")
        toc_start_idx = -1
        toc_end_idx = -1
        for i, line in enumerate(lines):
            stripped = line.strip()
            if toc_item.match(stripped):
                if toc_start_idx == -1:
                    toc_start_idx = i
                toc_end_idx = i
            elif toc_start_idx != -1 and stripped:
                # First non-blank, non-TOC line after the list ends the scan.
                break
        if toc_start_idx == -1:
            return content

        new_toc = []
        h2_count, h3_count = 0, 0
        for h in headers:
            if h["level"] == 2:
                h2_count += 1
                h3_count = 0
                new_toc.append(f"{h2_count}. [{h['title']}](#{h['anchor']})")
            else:
                h3_count += 1
                # NOTE(review): a one-space indent may not render as a nested
                # list in every Markdown flavour - confirm the intended indent.
                new_toc.append(f" {h3_count}. [{h['title']}](#{h['anchor']})")
        return "\n".join(lines[:toc_start_idx] + new_toc + lines[toc_end_idx + 1:])

    async def decide_smart_injection(self, markdown_content: str, asset: Dict) -> str:
        """Insert *asset* under the heading the model selects.

        Falls back to ``_manual_fallback_injection`` when the model call fails,
        returns no target, or names an existing heading that cannot be found.
        When the model asks for a new heading that does not exist yet, the
        heading is appended at the end together with the resource.

        Returns:
            The Markdown text including the new resource line.
        """
        lines = markdown_content.splitlines()
        structure = "\n".join(l for l in lines if l.startswith("#"))
        line = self._format_resource_line(asset)
        prompt = f"Inject resource: {line} into structure: {structure[:1000]}. JSON: {{\"target_header\": \"## ...\", \"is_new_header\": bool}}"
        try:
            data = await call_gemini_with_retry(prompt)
            target = data.get("target_header")
            if not target or not isinstance(target, str):
                return self._manual_fallback_injection(markdown_content, asset)
            is_new_header = bool(data.get("is_new_header"))

            needle = target.lower()
            new_lines = []
            inserted = False
            for l in lines:
                new_lines.append(l)
                if not inserted and l.startswith("#") and needle in l.lower():
                    if is_new_header:
                        new_lines.append("")
                    new_lines.append(line)
                    inserted = True

            if not inserted:
                # Without this branch the resource would be silently dropped.
                if not is_new_header:
                    return self._manual_fallback_injection(markdown_content, asset)
                header = target if target.startswith("#") else f"## {target}"
                new_lines.extend(["", header, line])

            res = "\n".join(new_lines)
            return await self._rebuild_toc(res) if is_new_header else res
        except Exception as exc:
            log_event(f" [!] Smart injection failed, using fallback: {exc}")
        return self._manual_fallback_injection(markdown_content, asset)

    def _manual_fallback_injection(self, content: str, asset: Dict) -> str:
        """Append *asset* at the end of *content*, adding a section if the text has no ``##``."""
        line = self._format_resource_line(asset)
        if "##" in content:
            return content + f"\n{line}"
        return content + f"\n\n## Tools and Resources\n{line}"

    async def suggest_reorganization(self):
        """Audit every Markdown file in ``docs_dir``: refresh its TOC and, where needed, regroup it.

        Files named ``index.md`` or listed under ``toc_exempt_files`` are
        skipped. A file is sent to the model for regrouping when it is listed
        in ``data/special_assets.yaml`` or when it has more than 25 links and
        fewer than 2 headings.
        """
        log_event("[*] Starting Internal Reorganization & TOC Audit...", section_break=True)

        # Load Special Assets & Link Rules for exceptions
        special_rules = {}
        special_doc = self._read_yaml("data/special_assets.yaml")
        if isinstance(special_doc, dict):
            special_rules = {
                sa["file"]: sa
                for sa in (special_doc.get("special_assets") or [])
                if isinstance(sa, dict) and "file" in sa
            }
        exempt_files = []
        rules_doc = self._read_yaml("data/link_rules.yaml")
        if isinstance(rules_doc, dict) and isinstance(rules_doc.get("hierarchy_rules"), dict):
            exempt_files = rules_doc["hierarchy_rules"].get("toc_exempt_files") or []

        for file in os.listdir(self.docs_dir):
            if not file.endswith(".md") or file == "index.md" or file in exempt_files:
                continue
            path = os.path.join(self.docs_dir, file)
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()

            is_special = file in special_rules
            link_count = len(re.findall(r"^\s*-\s*\[", content, re.MULTILINE))
            headers = re.findall(r"^##+ ", content, re.MULTILINE)

            # --- FEATURE: Automatic TOC Injection for V1 ---
            # Check for existing TOC (explicit header or numbered list)
            has_toc = "Table of Contents" in content or len(re.findall(r'^\d+\.\s+\[.*?\]\(#.*?\)', content, re.MULTILINE)) >= 3

            if len(headers) >= 3 and not has_toc:
                log_event(f" [+] INJECTING TOC: {file}")
                # NOTE(review): _rebuild_toc only replaces an existing numbered
                # list; with no list present it returns the text unchanged, so
                # nothing is injected in that case - confirm intended behavior.
                updated = await self._rebuild_toc(content)
                if updated != content:
                    content = updated
                    with open(path, "w", encoding="utf-8") as f:
                        f.write(content)

            # Reorganize if special OR if flat and large
            if is_special or (link_count > 25 and len(headers) < 2):
                if len(content) > self._REORG_PROMPT_LIMIT:
                    # Only a truncated prefix would reach the model, and writing
                    # its answer back would delete the rest of the file.
                    log_event(f" [!] SKIPPING REORG: {file} exceeds {self._REORG_PROMPT_LIMIT} characters")
                    continue
                log_event(f" [!] REORGANIZING: {file} ({'Special' if is_special else 'Standard'})")
                instruction = (
                    "SOPHISTICATED O'REILLY HIERARCHY: Create nested sections (##) and subsections (###). "
                    "Group links by technical AREAS, TOPICS, and SUBTOPICS. Preserve all links."
                    if is_special else "Group into logical sections (##)."
                )
                prompt = f"You act as a Technical Content Architect. Reorganize '{file}' based on: {instruction}\nCONTENT:\n{content}"
                try:
                    reorg = await call_gemini_with_retry(prompt, response_format="text", prefer_flash=True)
                    # Reject answers that shrank the document noticeably.
                    if isinstance(reorg, str) and len(reorg) > len(content) * 0.7:
                        final = await self._rebuild_toc(reorg)
                        with open(path, "w", encoding="utf-8") as f:
                            f.write(final)
                        log_event(f" [OK] Reorganized: {file}")
                except Exception as e:
                    log_event(f" [!] Error: {e}")

    async def apply_semantic_interlinking(self, evaluations: Dict[str, Dict]):
        """Add "See also" cross-references for included resources to related category pages.

        Args:
            evaluations: Mapping of URL to evaluation dict. Only entries with
                ``status == "INCLUDED"`` are used; each of their
                ``related_categories`` names a ``<category>.md`` file in
                ``docs_dir``. Pages that already mention the URL are left
                untouched.
        """
        log_event("[*] Phase 5: Executing Semantic Interlinking (Mandate 5)...", section_break=True)
        for url, eval_data in evaluations.items():
            if eval_data.get("status") != "INCLUDED":
                continue
            for rel_cat in eval_data.get("related_categories") or []:
                rel_path = os.path.join(self.docs_dir, f"{rel_cat}.md")
                if not os.path.exists(rel_path):
                    continue
                with open(rel_path, "r", encoding="utf-8") as f:
                    content = f.read()
                if url in content:
                    continue

                log_event(f" [+] Interlinking: {eval_data['title']} -> {rel_cat}.md")
                category_label = eval_data['category'].replace('-', ' ').title()
                see_also = f"\n - *See also: [{eval_data['title']}]({url}) in [{category_label}]*"
                match = re.search(r'^## ', content, re.MULTILINE)
                if match:
                    # Insert at the end of the first section, i.e. right before
                    # the second "## " heading (or at the end of the file).
                    next_h2 = re.search(r'^## ', content[match.end():], re.MULTILINE)
                    pos = match.end() + next_h2.start() if next_h2 else len(content)
                    content = content[:pos] + see_also + "\n" + content[pos:]
                else:
                    content += f"\n\n## Related Resources\n{see_also}"
                with open(rel_path, "w", encoding="utf-8") as f:
                    f.write(content)
|
|
|
|
async def _enrich_rich_metadata(url: str, soup) -> Dict:
    """Collect preview image, author, duration and reading time from page meta tags.

    Args:
        url: Page address; decides which site-specific tags are inspected.
        soup: Parsed document exposing ``find(name, attrs, **kwargs)`` whose
            results expose ``get("content")`` (a BeautifulSoup tree in practice).

    Returns:
        A dict containing any of ``og_image``, ``author``, ``duration`` and
        ``reading_time``. A key is only present when its tag carries a
        non-empty ``content`` attribute.
    """
    def content_of(tag):
        # Empty / missing attributes are reported as None so that callers'
        # ``.get(key, default)`` fallbacks keep working.
        if tag is None:
            return None
        return tag.get("content") or None

    meta = {}
    url_l = url.lower()

    og_image = content_of(
        soup.find("meta", property="og:image") or soup.find("meta", {"name": "twitter:image"})
    )
    if og_image:
        meta["og_image"] = og_image

    if "youtube.com" in url_l or "youtu.be" in url_l:
        author = content_of(soup.find("link", itemprop="name"))
        if author:
            meta["author"] = author
        duration = content_of(soup.find("meta", itemprop="duration"))
        # ISO-8601 duration such as "PT1H5M3S"; seconds are ignored on purpose
        # (the rendered value only shows hours and minutes).
        m = re.search(r'PT(\d+H)?(\d+M)?(\d+S)?', duration or "")
        if m:
            hours, minutes, _seconds = m.groups()
            hours_part = hours.replace('H', 'h ') if hours else ''
            minutes_part = minutes.replace('M', 'm') if minutes else '0m'
            meta["duration"] = f"{hours_part}{minutes_part}".strip()
    elif any(marker in url_l for marker in ["medium.com", "dev.to", "blog"]):
        reading_time = content_of(soup.find("meta", property="twitter:data1"))
        if reading_time and "min" in reading_time.lower():
            meta["reading_time"] = reading_time
        author = content_of(
            soup.find("meta", {"name": "author"}) or soup.find("meta", property="article:author")
        )
        if author:
            meta["author"] = author
    return meta
|