mirror of
https://github.com/nubenetes/awesome-kubernetes.git
synced 2026-05-22 00:53:37 +00:00
201 lines
9.9 KiB
Python
201 lines
9.9 KiB
Python
import asyncio
|
|
import json
|
|
import os
|
|
import re
|
|
import httpx
|
|
import yaml
|
|
import hashlib
|
|
from datetime import datetime
|
|
from typing import List, Dict, Optional, Tuple
|
|
from src.config import GH_TOKEN, TARGET_REPO, GEMINI_API_KEY, NUBENETES_CATEGORIES, MADRID_TZ, INVENTORY_PATH
|
|
from src.gitops_manager import RepositoryController
|
|
from src.gemini_utils import call_gemini_with_retry, normalize_url, clean_toc_text
|
|
from src.logger import log_event
|
|
|
|
# Configuration
|
|
# Documentation directory name. Not referenced in the visible part of this
# module (AgenticCurator sets the same "docs" literal on its own attribute)
# -- TODO confirm whether another module still imports it.
V1_DIR = "docs"
|
|
|
|
def get_best_category_match(suggested: str) -> Optional[str]:
    """Map a free-form category suggestion onto a known category.

    The suggestion is lower-cased and stripped, then compared against each
    entry of ``NUBENETES_CATEGORIES`` in order. The first entry that contains
    the suggestion, or is contained in it, wins.

    Args:
        suggested: Category text to resolve, e.g. the ``primary_category``
            field of a model response.

    Returns:
        The first matching entry of ``NUBENETES_CATEGORIES``, or ``None`` when
        the suggestion is missing, not a string, blank, or matches nothing.
    """
    # Model output is not guaranteed to be a string; anything else cannot match.
    if not isinstance(suggested, str):
        return None

    needle = suggested.lower().strip()
    # A blank needle is a substring of every category, so without this guard
    # whitespace-only input would "match" the first category in the list.
    if not needle:
        return None

    for cat in NUBENETES_CATEGORIES:
        # NOTE(review): categories are compared as stored; this presumably
        # relies on them being lower-case already -- confirm in src.config.
        if needle in cat or cat in needle:
            return cat
    return None
|
|
|
|
async def _get_github_activity(url: str) -> Dict:
|
|
match = re.search(r'github\.com/([^/]+/[^/]+)', url)
|
|
if not match: return {}
|
|
repo = match.group(1)
|
|
api_url = f"https://api.github.com/repos/{repo}"
|
|
headers = {"Authorization": f"token {GH_TOKEN}"} if GH_TOKEN else {}
|
|
try:
|
|
async with httpx.AsyncClient() as client:
|
|
resp = await client.get(api_url, headers=headers, timeout=10.0)
|
|
if resp.status_code == 200:
|
|
data = resp.json()
|
|
return {
|
|
"gh_stars": data.get("stargazers_count"),
|
|
"gh_pushed": data.get("pushed_at"),
|
|
"gh_license": data.get("license", {}).get("spdx_id", "N/A")
|
|
}
|
|
except: pass
|
|
return {}
|
|
|
|
async def _deep_fetch_content(url: str) -> Tuple[str, Dict]:
    """Download *url* and pull out its Open Graph image, best-effort.

    Redirects are followed and the request times out after 15 seconds.

    Args:
        url: Page to download.

    Returns:
        ``(page_text, {"og_image": <url or "">})`` on an HTTP 200 response,
        otherwise ``("", {})`` (non-200 status or any request failure).
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        async with httpx.AsyncClient(headers=headers, follow_redirects=True, timeout=15.0) as client:
            resp = await client.get(url)
            if resp.status_code != 200:
                return "", {}
            page_text = resp.text
    except Exception as exc:
        # Best-effort fetch: a dead link must not stop the run. ``Exception``
        # (unlike a bare ``except``) lets task cancellation and
        # KeyboardInterrupt propagate.
        log_event(f" [!] Content fetch failed for {url}: {exc}")
        return "", {}

    # Basic metadata extraction. Only the exact attribute order
    # property-then-content with double quotes is recognised.
    img_match = re.search(r'meta property="og:image" content="(.*?)"', page_text)
    og_image = img_match.group(1) if img_match else ""
    return page_text, {"og_image": og_image}
|
|
|
|
def _load_domain_blacklist(memory_file: str) -> set:
    """Return the non-empty ``blacklisted_domains`` strings stored in *memory_file*."""
    if not os.path.exists(memory_file):
        return set()
    try:
        # ``with`` closes the handle; the file is optional, so any read or
        # shape problem simply means "no blacklist".
        with open(memory_file, "r", encoding="utf-8") as fh:
            memory_data = json.load(fh)
        domains = memory_data.get("blacklisted_domains", [])
        # An empty string is a substring of every URL and would filter
        # everything, so blank and non-string entries are dropped.
        return {d for d in domains if isinstance(d, str) and d}
    except (OSError, ValueError, AttributeError, TypeError):
        return set()


def _is_stale_repo(pushed_at: Optional[str], max_age_days: int = 365 * 4) -> bool:
    """Tell whether an ISO-8601 push timestamp is older than *max_age_days*."""
    if not pushed_at:
        return False
    try:
        last_push = datetime.fromisoformat(pushed_at.replace("Z", "+00:00"))
    except (ValueError, AttributeError):
        # An unparsable timestamp must not abort the whole evaluation run.
        return False
    return (datetime.now(last_push.tzinfo) - last_push).days > max_age_days


def _build_batch_prompt(dynamic_mandates: str, batch_data: List[Dict]) -> str:
    """Assemble the librarian prompt for one batch of fetched resources."""
    return (
        "You act as a Senior Technical Librarian in 2026.\n" + dynamic_mandates +
        "Analyze these resources and provide high-density metadata.\n"
        "PHASE 1: SOCIAL PROOF & REPUTATION (Mandate 32)\n"
        "- Perform a real-time web search for each resource.\n"
        "- If the community (Reddit, Hacker News) reports the tool as 'unstable', 'abandoned', or 'vaporware', set reputation_penalty: true.\n"
        "PHASE 2: LINGUISTIC DIVERSITY & CLASSIFICATION\n"
        "- Identify TECHNICAL_HIERARCHY: List (max 10 strings) Area > Topic > Subtopics.\n"
        "Respond ONLY JSON list: [{\"url\": \"...\", \"impact_score\": int, \"reputation_penalty\": bool, \"reputation_summary\": \"...\", \"pub_date\": \"YYYY-MM-DD\", \"primary_category\": \"...\", \"title\": \"...\", \"desc\": \"...\", \"en_summary\": \"...\", \"language\": \"...\", \"type\": \"...\", \"level\": \"...\", \"technical_hierarchy\": [...], \"is_microservice\": bool}, ...]\n\n"
        "RESOURCES:\n" + "\n".join([f"- {d['asset']['url']}: (MVQ Penalty: {d['mvq_penalty']}) {d['content']}" for d in batch_data])
    )


def _build_inventory_record(entry: Dict, data: Dict) -> Optional[Dict]:
    """Turn one model verdict into an inventory record, or ``None`` if it is filtered out.

    Raises ``KeyError``/``TypeError``/``AttributeError`` when the verdict is
    malformed (e.g. missing ``title``/``desc`` or a non-numeric score); the
    caller isolates those per item.
    """
    asset = entry["asset"]
    score = data.get("impact_score", 50)
    penalized = bool(data.get("reputation_penalty"))
    if penalized:
        log_event(f" [!] REPUTATION ALERT: {data.get('title', asset['url'])} flagged.")
        score = max(score - 30, 10)

    primary_cat = get_best_category_match(data.get("primary_category"))
    source_type = asset.get("source_type", "Social")
    # Resources coming from a "nubenetes" source only need a token score;
    # everything else must reach 80.
    threshold = 5 if "nubenetes" in source_type.lower() else 80
    if not (score >= threshold and primary_cat):
        return None

    # Key order is kept stable because the inventory is dumped with sort_keys=False.
    return {
        "title": data["title"],
        "description": data["desc"],
        "ai_summary": data.get("en_summary", data["desc"]),
        "language": data.get("language", "English"),
        "resource_type": data.get("type", "Reference"),
        "complexity": data.get("level", "Intermediate"),
        "hierarchy": data.get("technical_hierarchy", ["General"]),
        "is_microservice": data.get("is_microservice", False),
        # ``pub_date`` may be present but null; fall back to "N/A" instead of slicing None.
        "year": str(data.get("pub_date") or "N/A")[:4],
        "stars": min(max(score // 20, 0), 5),
        "content_hash": entry["hash"],
        "reputation_status": "Suspicious" if penalized else "Vetted",
        "reputation_summary": data.get("reputation_summary", ""),
        "source_provenance": source_type,
        "social_preview_url": entry["rich_meta"].get("og_image", ""),
        "category": primary_cat,
        "status": "online",
        "last_checked": datetime.now().timestamp(),
        **entry["gh_meta"],
    }


async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
    """Evaluate candidate resources and record the accepted ones in the inventory.

    Each asset is first checked against the domain blacklist and the curator's
    inventory cache. The remainder is fetched, enriched with GitHub metadata
    and sent to the model in batches of ten. Accepted results are stored in
    the inventory, which is saved after every batch.

    Args:
        raw_assets: Dicts carrying at least a ``"url"`` key and optionally
            ``"source_type"``.

    Returns:
        A dict keyed by the asset's original URL. Values carry a ``"status"``
        plus, for cached or accepted assets, the inventory record. Assets the
        model did not answer for, or whose answer was malformed, have no entry.
    """
    evaluations: Dict[str, Dict] = {}
    curator = AgenticCurator()
    to_evaluate = []

    # Mandate 2: Load Blacklist
    domain_blacklist = _load_domain_blacklist("src/memory/health_learning.json")

    # 1. Pre-filter
    # NOTE(review): in the ``{"status": X, **record}`` literals below, a
    # ``status`` key inside the record (e.g. "online") overrides X because
    # later keys win. Behaviour kept as-is since callers are not visible here;
    # confirm which value they expect.
    for asset in raw_assets:
        url = asset["url"]
        norm_url = normalize_url(url)
        lowered_url = url.lower()
        if any(domain in lowered_url for domain in domain_blacklist):
            evaluations[url] = {"status": "FILTERED", "reason": "Blacklisted"}
            continue
        if norm_url in curator.inventory:
            cached = curator.inventory[norm_url]
            if cached.get("status") == "review_required":
                evaluations[url] = {"status": "REVIEW_PENDING", **cached}
                continue
            if cached.get("title") and cached.get("hierarchy"):
                from src.gemini_utils import SESSION_TRACKER
                SESSION_TRACKER.track_cache_hit(est_tokens=2200)
                evaluations[url] = {"status": "INCLUDED", **cached}
                continue
        to_evaluate.append(asset)

    if not to_evaluate:
        return evaluations

    # 2. SMART BATCHING WITH REPUTATION FILTER (Mandate 32)
    BATCH_SIZE = 10
    from src.mandate_ingestor import get_system_mandates
    dynamic_mandates = get_system_mandates()

    for start in range(0, len(to_evaluate), BATCH_SIZE):
        batch = to_evaluate[start:start + BATCH_SIZE]
        batch_data = []
        for asset in batch:
            web_content, rich_meta = await _deep_fetch_content(asset["url"])
            c_hash = hashlib.sha256(web_content.encode()).hexdigest() if web_content else "N/A"
            gh_meta = await _get_github_activity(asset["url"]) if "github.com" in asset["url"] else {}
            batch_data.append({
                "asset": asset,
                "content": web_content[:1500],
                "hash": c_hash,
                # Repositories without a push in four years are flagged to the model.
                "mvq_penalty": _is_stale_repo(gh_meta.get("gh_pushed")),
                "gh_meta": gh_meta,
                "rich_meta": rich_meta,
            })

        prompt = _build_batch_prompt(dynamic_mandates, batch_data)

        try:
            # ENABLE GROUNDING FOR REPUTATION FILTER
            results = await call_gemini_with_retry(prompt, use_grounding=True)
            if isinstance(results, list):
                res_map = {
                    normalize_url(r.get("url", "")): r
                    for r in results
                    if isinstance(r, dict)
                }
                for entry in batch_data:
                    url = entry["asset"]["url"]
                    norm_url = normalize_url(url)
                    data = res_map.get(norm_url)
                    if not data:
                        continue
                    try:
                        record = _build_inventory_record(entry, data)
                    except (KeyError, TypeError, AttributeError, ValueError) as exc:
                        # One malformed verdict must not discard the rest of the batch.
                        log_event(f" [!] Skipping malformed AI result for {url}: {exc}")
                        continue
                    if record is None:
                        evaluations[url] = {"status": "FILTERED"}
                        continue
                    curator.inventory[norm_url] = record
                    evaluations[url] = {"status": "INCLUDED", **record}
                # Persist once per batch so a later failure keeps earlier work.
                curator._save_inventory()
        except Exception as e:
            log_event(f" [!] Batch AI Error: {e}")

    return evaluations
|
|
|
|
class AgenticCurator:
    """Holds the curated-resource inventory and the model-driven curation helpers.

    On construction it creates a ``RepositoryController`` for ``TARGET_REPO``
    and loads the YAML inventory found at ``INVENTORY_PATH``.
    """

    def __init__(self):
        self.git_controller = RepositoryController(GH_TOKEN, TARGET_REPO)
        self.docs_dir = "docs"
        self.inventory = self._load_inventory()

    def _load_inventory(self) -> dict:
        """Load the inventory mapping from ``INVENTORY_PATH``.

        Returns:
            The parsed mapping, or an empty dict when the file is missing,
            unreadable, empty, or does not contain a mapping.
        """
        if not os.path.exists(INVENTORY_PATH):
            return {}
        try:
            with open(INVENTORY_PATH, "r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f)
        except (OSError, ValueError, yaml.YAMLError) as exc:
            # Falling back to {} is kept, but reported: the next
            # _save_inventory() would otherwise silently replace the file.
            log_event(f" [!] Could not load inventory from {INVENTORY_PATH}: {exc}")
            return {}
        return loaded if isinstance(loaded, dict) else {}

    def _save_inventory(self):
        """Write ``self.inventory`` to ``INVENTORY_PATH`` as YAML, preserving key order."""
        parent = os.path.dirname(INVENTORY_PATH)
        # dirname is "" for a bare filename, and os.makedirs("") raises.
        if parent:
            os.makedirs(parent, exist_ok=True)
        # allow_unicode=True emits raw non-ASCII characters, so the encoding is
        # pinned instead of depending on the platform default.
        with open(INVENTORY_PATH, "w", encoding="utf-8") as f:
            yaml.dump(self.inventory, f, sort_keys=False, allow_unicode=True)

    async def discover_new_curation_sources(self) -> List[str]:
        """D) Autonomous Discovery: Periodically find new high-trust sources.

        Returns:
            The list produced by the model, or an empty list when the call
            fails or the answer is not a list.
        """
        log_event("[*] Executing Autonomous Source Discovery (Grounding Mode)...")
        prompt = "Identify 5 high-quality Cloud Native or K8s engineering blogs or 'Awesome' repos active in 2026. Return ONLY JSON list of URLs."
        try:
            found = await call_gemini_with_retry(prompt, use_grounding=True)
        except Exception as exc:
            # ``Exception`` rather than a bare except, so task cancellation propagates.
            log_event(f" [!] Source discovery failed: {exc}")
            return []
        return found if isinstance(found, list) else []

    async def decide_smart_injection(self, content: str, asset: Dict) -> str:
        """Ask the model to insert *asset*'s link into the Markdown *content*.

        Args:
            content: Current Markdown document.
            asset: Dict with ``title``, ``url`` and ``description`` keys.

        Returns:
            The model's updated Markdown, or *content* unchanged when the call
            fails or returns nothing usable.
        """
        # The document itself is part of the prompt: the model is asked to
        # return the updated full Markdown, which it can only do if it sees it.
        prompt = (
            f"Decide where to inject this link in the Markdown content.\nLINK: {asset['title']} ({asset['url']})\nDESC: {asset['description']}\n\n"
            f"MARKDOWN CONTENT:\n{content}\n\n"
            "Respond ONLY with the updated full Markdown content."
        )
        try:
            updated = await call_gemini_with_retry(prompt, response_format="text")
        except Exception as exc:
            log_event(f" [!] Smart injection failed for {asset['url']}: {exc}")
            return content
        # Never replace the document with an empty or non-text answer.
        if not isinstance(updated, str) or not updated.strip():
            return content
        return updated

    async def apply_semantic_interlinking(self, evaluations: Dict):
        """Placeholder for Mandate 5; currently only logs that it was invoked."""
        log_event("[*] Applying Semantic Interlinking (Mandate 5)...")
        # Logic implementation for Mandate 5

    async def suggest_reorganization(self):
        """Placeholder for the directory-structure audit; currently only logs."""
        log_event("[*] Auditing Directory Structure for Reorganization...")
|