feat(ops): final engineering perfection - restored MVQ/Blacklist and implemented physical file cleanup in V1

This commit is contained in:
Nubenetes Bot
2026-05-17 14:20:08 +02:00
parent b19fe89b00
commit 61325f8cae
2 changed files with 70 additions and 23 deletions

View File

@@ -56,11 +56,27 @@ async def _get_github_activity(url: str) -> Dict:
async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
evaluations = {}
curator = AgenticCurator()
# Mandate 2: Load Blacklist
memory_file = "src/memory/health_learning.json"
domain_blacklist = set()
if os.path.exists(memory_file):
try:
memory_data = json.load(open(memory_file, "r"))
domain_blacklist = set(memory_data.get("blacklisted_domains", []))
except: pass
for i, asset in enumerate(raw_assets):
url = asset["url"]
log_event(f"--- EVALUATING {i+1}/{len(raw_assets)}: {url} ---")
norm_url = normalize_url(url)
# Mandate 2: Skip Blacklisted
if any(domain in url.lower() for domain in domain_blacklist):
log_event(f" [-] SKIPPING: Blacklisted domain detected.")
evaluations[url] = {"status": "FILTERED", "reason": "Blacklisted"}
continue
# --- DATABASE-FIRST: Reuse insights ---
if norm_url in curator.inventory:
cached = curator.inventory[norm_url]
@@ -75,19 +91,27 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
web_content, rich_meta = await _deep_fetch_content(url)
content_hash = hashlib.sha256(web_content.encode()).hexdigest() if web_content else "N/A"
# --- DYNAMIC CONTEXT: Mandates from GEMINI.md (Mandate 11 Bridge) ---
from src.mandate_ingestor import get_system_mandates
dynamic_mandates = get_system_mandates()
# Mandate 3: MVQ Check (GitHub Activity)
mvq_penalty = False
gh_meta = {}
if "github.com" in url:
gh_meta = await _get_github_activity(url)
pushed = gh_meta.get("gh_pushed", "")
if pushed:
last_date = datetime.fromisoformat(pushed.replace("Z", "+00:00"))
if (datetime.now(last_date.tzinfo) - last_date).days > (365 * 4):
mvq_penalty = True
log_event(f" [!] MVQ ALERT: Stale repository (>4 years). Penalty applied.")
# 2. AI Logic (O'Reilly + Linguistic Diversity)
is_primary = "nubenetes" in asset.get("source_type", "Social").lower()
strictness = "BE EXTREMELY SELECTIVE.\n" if not is_primary else ""
prompt = (
"You act as a Senior Technical Librarian in 2026.\n"
f"{dynamic_mandates}\n"
f"{strictness_directive}"
"You act as a Senior Technical Librarian in 2026.\n" + strictness +
f"{'IMPORTANT: This repo is old (>4 years inactive). Assign impact_score < 30.' if mvq_penalty else ''}\n"
"PHASE 1: LINGUISTIC DIVERSITY (Mandate 10)\n"
"- DESC (V1 Archive): Provide a professional summary in the RESOURCE'S NATIVE LANGUAGE.\n"
"- EN_SUMMARY (V2 Portal): Provide a professional English synthesis.\n"
"PHASE 2: ARCHITECTURAL CLASSIFICATION (O'REILLY STYLE)\n"
"- Identify TECHNICAL_HIERARCHY: List (max 10 strings) Area > Topic > Subtopics.\n"
"- DESC (V1 Archive): Professional summary in native language.\n"
"- EN_SUMMARY (V2 Portal): English synthesis.\n"
"Respond ONLY with JSON: {\"impact_score\": int, \"pub_date\": \"YYYY-MM-DD\", \"primary_category\": \"cat\", \"title\": \"...\", \"desc\": \"...\", \"en_summary\": \"...\", \"language\": \"...\", \"resource_type\": \"...\", \"complexity\": \"...\", \"technical_hierarchy\": [\"Area\", ...], \"is_microservice\": bool}\n"
f"CONTENT: {web_content[:2000]}"
)
@@ -106,7 +130,8 @@ async def evaluate_extracted_assets(raw_assets: List[Dict]) -> Dict[str, Dict]:
"stars": min(max(score // 20, 0), 5), "content_hash": content_hash,
"source_provenance": asset.get("source_type", "Social"), "social_preview_url": rich_meta.get("og_image", ""),
"mentions_count": curator.inventory.get(norm_url, {}).get("mentions_count", 0) + 1,
"category": primary_cat, "status": "online", "last_checked": datetime.now().timestamp()
"category": primary_cat, "status": "online", "last_checked": datetime.now().timestamp(),
**gh_meta
}
curator.inventory[norm_url] = eval_data
evaluations[url] = {"status": "INCLUDED", **eval_data}

View File

@@ -153,31 +153,53 @@ class IntelligentLinkCleaner:
log_event("APPLYING CLEANING CHANGES & PR GENERATION...", section_break=True)
file_updates = {}
# 1. Prepare file updates for dead/canonical links
for url, (fallback, reason) in self.dead_links.items():
for occ in self.link_registry.get(normalize_url(url), []):
file_path = occ["file"]
if file_path not in file_updates: file_updates[file_path] = open(file_path, "r").readlines()
nu = normalize_url(url)
# Use v1_locations from inventory if available, fallback to registry
paths = self.inventory.get(nu, {}).get("v1_locations", [])
if not paths:
paths = [occ["file"] for occ in self.link_registry.get(nu, [])]
for path in set(paths):
if not os.path.exists(path): continue
if path not in file_updates:
file_updates[path] = open(path, "r").readlines()
if fallback and fallback.startswith("CANONICAL:"):
new_url = fallback.replace("CANONICAL:", "")
file_updates[file_path][occ["line_index"]] = file_updates[file_path][occ["line_index"]].replace(url, new_url)
else:
file_updates[file_path][occ["line_index"]] = None # Mark for deletion
# Perform surgical replacement line-by-line
for i, line in enumerate(file_updates[path]):
if url in line:
if fallback and fallback.startswith("CANONICAL:"):
new_url = fallback.replace("CANONICAL:", "")
log_event(f" [FIX] Redirect: {url} -> {new_url} in {path}")
file_updates[path][i] = line.replace(url, new_url)
else:
log_event(f" [DEL] Dead Link: {url} in {path}")
file_updates[path][i] = None # Mark line for removal
# Final Payload
# 2. Final Payload Construction
final_payload = {p: "".join([l for l in lines if l is not None]) for p, lines in file_updates.items()}
# 3. Database Maintenance (GC & Persistence)
await self.prune_orphaned_metadata()
self._save_inventory()
final_payload[INVENTORY_PATH] = yaml.dump(self.inventory, sort_keys=False, allow_unicode=True)
# Safety & Audit
# 4. Safety Audit & Non-Blocking PR
from src.safety_guard import SafetyGuard
guard = SafetyGuard()
safety_report = guard.generate_audit_report()
if final_payload:
metrics = {"total_extracted": len(self.link_registry), "full_report": self.action_log}
metrics = {
"total_extracted": len(self.link_registry),
"full_report": self.action_log,
"deleted_dead": len([v for v in self.dead_links.values() if v[0] is None]),
"fixed_redirects": len([v for v in self.dead_links.values() if v[0] and "CANONICAL" in v[0]])
}
self.git_controller.apply_multi_file_changes(final_payload, metrics, safety_report=safety_report)
else:
log_event(" [INFO] No files required cleaning in this cycle.")
async def prune_orphaned_metadata(self):
valid_map = {}