mirror of
https://github.com/nubenetes/awesome-kubernetes.git
synced 2026-05-22 09:03:23 +00:00
feat(ops): restore MVQ check and domain blacklist; implement physical file cleanup in V1
This commit is contained in:
@@ -56,11 +56,27 @@ async def _get_github_activity(url: str) -> Dict:
|
||||
async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
|
||||
evaluations = {}
|
||||
curator = AgenticCurator()
|
||||
|
||||
# Mandate 2: Load Blacklist
|
||||
memory_file = "src/memory/health_learning.json"
|
||||
domain_blacklist = set()
|
||||
if os.path.exists(memory_file):
|
||||
try:
|
||||
memory_data = json.load(open(memory_file, "r"))
|
||||
domain_blacklist = set(memory_data.get("blacklisted_domains", []))
|
||||
except: pass
|
||||
|
||||
for i, asset in enumerate(raw_assets):
|
||||
url = asset["url"]
|
||||
log_event(f"--- EVALUATING {i+1}/{len(raw_assets)}: {url} ---")
|
||||
norm_url = normalize_url(url)
|
||||
|
||||
# Mandate 2: Skip Blacklisted
|
||||
if any(domain in url.lower() for domain in domain_blacklist):
|
||||
log_event(f" [-] SKIPPING: Blacklisted domain detected.")
|
||||
evaluations[url] = {"status": "FILTERED", "reason": "Blacklisted"}
|
||||
continue
|
||||
|
||||
# --- DATABASE-FIRST: Reuse insights ---
|
||||
if norm_url in curator.inventory:
|
||||
cached = curator.inventory[norm_url]
|
||||
@@ -75,19 +91,27 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
|
||||
web_content, rich_meta = await _deep_fetch_content(url)
|
||||
content_hash = hashlib.sha256(web_content.encode()).hexdigest() if web_content else "N/A"
|
||||
|
||||
# --- DYNAMIC CONTEXT: Mandates from GEMINI.md (Mandate 11 Bridge) ---
|
||||
from src.mandate_ingestor import get_system_mandates
|
||||
dynamic_mandates = get_system_mandates()
|
||||
|
||||
# Mandate 3: MVQ Check (GitHub Activity)
|
||||
mvq_penalty = False
|
||||
gh_meta = {}
|
||||
if "github.com" in url:
|
||||
gh_meta = await _get_github_activity(url)
|
||||
pushed = gh_meta.get("gh_pushed", "")
|
||||
if pushed:
|
||||
last_date = datetime.fromisoformat(pushed.replace("Z", "+00:00"))
|
||||
if (datetime.now(last_date.tzinfo) - last_date).days > (365 * 4):
|
||||
mvq_penalty = True
|
||||
log_event(f" [!] MVQ ALERT: Stale repository (>4 years). Penalty applied.")
|
||||
|
||||
# 2. AI Logic (O'Reilly + Linguistic Diversity)
|
||||
is_primary = "nubenetes" in asset.get("source_type", "Social").lower()
|
||||
strictness = "BE EXTREMELY SELECTIVE.\n" if not is_primary else ""
|
||||
prompt = (
|
||||
"You act as a Senior Technical Librarian in 2026.\n"
|
||||
f"{dynamic_mandates}\n"
|
||||
f"{strictness_directive}"
|
||||
"You act as a Senior Technical Librarian in 2026.\n" + strictness +
|
||||
f"{'IMPORTANT: This repo is old (>4 years inactive). Assign impact_score < 30.' if mvq_penalty else ''}\n"
|
||||
"PHASE 1: LINGUISTIC DIVERSITY (Mandate 10)\n"
|
||||
"- DESC (V1 Archive): Provide a professional summary in the RESOURCE'S NATIVE LANGUAGE.\n"
|
||||
"- EN_SUMMARY (V2 Portal): Provide a professional English synthesis.\n"
|
||||
"PHASE 2: ARCHITECTURAL CLASSIFICATION (O'REILLY STYLE)\n"
|
||||
"- Identify TECHNICAL_HIERARCHY: List (max 10 strings) Area > Topic > Subtopics.\n"
|
||||
"- DESC (V1 Archive): Professional summary in native language.\n"
|
||||
"- EN_SUMMARY (V2 Portal): English synthesis.\n"
|
||||
"Respond ONLY with JSON: {\"impact_score\": int, \"pub_date\": \"YYYY-MM-DD\", \"primary_category\": \"cat\", \"title\": \"...\", \"desc\": \"...\", \"en_summary\": \"...\", \"language\": \"...\", \"resource_type\": \"...\", \"complexity\": \"...\", \"technical_hierarchy\": [\"Area\", ...], \"is_microservice\": bool}\n"
|
||||
f"CONTENT: {web_content[:2000]}"
|
||||
)
|
||||
@@ -106,7 +130,8 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
|
||||
"stars": min(max(score // 20, 0), 5), "content_hash": content_hash,
|
||||
"source_provenance": asset.get("source_type", "Social"), "social_preview_url": rich_meta.get("og_image", ""),
|
||||
"mentions_count": curator.inventory.get(norm_url, {}).get("mentions_count", 0) + 1,
|
||||
"category": primary_cat, "status": "online", "last_checked": datetime.now().timestamp()
|
||||
"category": primary_cat, "status": "online", "last_checked": datetime.now().timestamp(),
|
||||
**gh_meta
|
||||
}
|
||||
curator.inventory[norm_url] = eval_data
|
||||
evaluations[url] = {"status": "INCLUDED", **eval_data}
|
||||
|
||||
@@ -153,31 +153,53 @@ class IntelligentLinkCleaner:
|
||||
log_event("APPLYING CLEANING CHANGES & PR GENERATION...", section_break=True)
|
||||
file_updates = {}
|
||||
|
||||
# 1. Prepare file updates for dead/canonical links
|
||||
for url, (fallback, reason) in self.dead_links.items():
|
||||
for occ in self.link_registry.get(normalize_url(url), []):
|
||||
file_path = occ["file"]
|
||||
if file_path not in file_updates: file_updates[file_path] = open(file_path, "r").readlines()
|
||||
nu = normalize_url(url)
|
||||
# Use v1_locations from inventory if available, fallback to registry
|
||||
paths = self.inventory.get(nu, {}).get("v1_locations", [])
|
||||
if not paths:
|
||||
paths = [occ["file"] for occ in self.link_registry.get(nu, [])]
|
||||
|
||||
for path in set(paths):
|
||||
if not os.path.exists(path): continue
|
||||
if path not in file_updates:
|
||||
file_updates[path] = open(path, "r").readlines()
|
||||
|
||||
if fallback and fallback.startswith("CANONICAL:"):
|
||||
new_url = fallback.replace("CANONICAL:", "")
|
||||
file_updates[file_path][occ["line_index"]] = file_updates[file_path][occ["line_index"]].replace(url, new_url)
|
||||
else:
|
||||
file_updates[file_path][occ["line_index"]] = None # Mark for deletion
|
||||
# Perform surgical replacement line-by-line
|
||||
for i, line in enumerate(file_updates[path]):
|
||||
if url in line:
|
||||
if fallback and fallback.startswith("CANONICAL:"):
|
||||
new_url = fallback.replace("CANONICAL:", "")
|
||||
log_event(f" [FIX] Redirect: {url} -> {new_url} in {path}")
|
||||
file_updates[path][i] = line.replace(url, new_url)
|
||||
else:
|
||||
log_event(f" [DEL] Dead Link: {url} in {path}")
|
||||
file_updates[path][i] = None # Mark line for removal
|
||||
|
||||
# Final Payload
|
||||
# 2. Final Payload Construction
|
||||
final_payload = {p: "".join([l for l in lines if l is not None]) for p, lines in file_updates.items()}
|
||||
|
||||
# 3. Database Maintenance (GC & Persistence)
|
||||
await self.prune_orphaned_metadata()
|
||||
self._save_inventory()
|
||||
final_payload[INVENTORY_PATH] = yaml.dump(self.inventory, sort_keys=False, allow_unicode=True)
|
||||
|
||||
# Safety & Audit
|
||||
# 4. Safety Audit & Non-Blocking PR
|
||||
from src.safety_guard import SafetyGuard
|
||||
guard = SafetyGuard()
|
||||
safety_report = guard.generate_audit_report()
|
||||
|
||||
if final_payload:
|
||||
metrics = {"total_extracted": len(self.link_registry), "full_report": self.action_log}
|
||||
metrics = {
|
||||
"total_extracted": len(self.link_registry),
|
||||
"full_report": self.action_log,
|
||||
"deleted_dead": len([v for v in self.dead_links.values() if v[0] is None]),
|
||||
"fixed_redirects": len([v for v in self.dead_links.values() if v[0] and "CANONICAL" in v[0]])
|
||||
}
|
||||
self.git_controller.apply_multi_file_changes(final_payload, metrics, safety_report=safety_report)
|
||||
else:
|
||||
log_event(" [INFO] No files required cleaning in this cycle.")
|
||||
|
||||
async def prune_orphaned_metadata(self):
|
||||
valid_map = {}
|
||||
|
||||
Reference in New Issue
Block a user