feat(ops): implement high-density multi-part reporting engine for PRs and extraction logs

This commit is contained in:
Nubenetes Bot
2026-05-17 23:14:11 +02:00
parent d8b8b2da08
commit 5ffab5b5d9
2 changed files with 91 additions and 67 deletions

View File

@@ -2,6 +2,7 @@ import json
import base64
from github import Github
from datetime import datetime
from typing import List, Dict
from src.gemini_utils import SESSION_TRACKER
@@ -23,42 +24,14 @@ class RepositoryController:
except:
return ""
def apply_historical_chunk(self, updates: dict, next_since: str) -> None:
    """Write one chunk of historical updates onto the accumulator branch.

    Every entry in ``updates`` is written to the fixed branch
    ``bot/historical-accumulator``: an existing file is updated in place,
    and a file whose lookup/update fails with a 404 is created instead.
    A failure on one file is reported on stdout and does not stop the
    remaining files from being processed.

    Args:
        updates: Mapping of repository file path -> new file content.
        next_since: Marker embedded in the update commit message.
    """
    branch_name = "bot/historical-accumulator"

    # Make sure the accumulator branch exists before writing to it.
    # NOTE(review): _create_feature_branch presumably branches off the
    # default/develop branch -- confirm against its definition.
    try:
        self.repository.get_branch(branch_name)
    except Exception:
        # `except Exception` (not a bare `except:`) so KeyboardInterrupt and
        # SystemExit are no longer mistaken for "branch does not exist".
        self._create_feature_branch(branch_name)

    for file_path, content in updates.items():
        try:
            try:
                file_meta = self.repository.get_contents(file_path, ref=branch_name)
                self.repository.update_file(
                    path=file_path,
                    message=f"chore(historical): chunk sync since {next_since}",
                    content=content,
                    sha=file_meta.sha,
                    branch=branch_name,
                )
            except Exception as e:
                if "404" not in str(e):
                    # Previously any non-404 failure vanished silently here;
                    # re-raise so the outer handler reports it.
                    raise
                # The file is not on the branch yet: create it.
                self.repository.create_file(
                    path=file_path,
                    message=f"chore(historical): init {file_path}",
                    content=content,
                    branch=branch_name,
                )
        except Exception as e:
            # Best-effort per file: report and continue with the next one.
            print(f"Error in historical chunk for {file_path}: {e}")
def apply_multi_file_changes(self, updates: dict, metrics: dict, safety_report: str = "") -> str:
timestamp_slug = datetime.now().strftime("%Y%m%d-%H%M")
branch_name = f"bot/knowledge-update-{timestamp_slug}"
# In the last historical chunk, use the accumulator as base if it exists
accumulator_branch = "bot/historical-accumulator"
try:
acc = self.repository.get_branch(accumulator_branch)
self.repository.create_git_ref(ref=f"refs/heads/{branch_name}", sha=acc.commit.sha)
self._create_feature_branch(branch_name)
except:
branch_name = f"bot/knowledge-update-{timestamp_slug}-{id(updates)}"
self._create_feature_branch(branch_name)
if not updates:
@@ -88,33 +61,48 @@ class RepositoryController:
# --- REPORT CONSTRUCTION ---
full_report = metrics.get('full_report', [])
sorted_report = sorted(full_report, key=lambda x: 0 if x['status'] == 'INCLUDED' else 1)
sorted_report = sorted(full_report, key=lambda x: 0 if x['status'] == 'INCLUDED' else (1 if x['status'] == 'DUPLICATE' else 2))
counts = {"INCLUDED": 0, "DUPLICATE": 0, "FILTERED": 0}
source_counts = {}
all_rows = []
header_table = "| # | Status | Score | Lang | Type | Date | Source | Reason | URL |\n| :--- | :--- | :---: | :---: | :--- | :---: | :--- | :--- | :--- |\n"
for idx, item in enumerate(sorted_report, 1):
status_emoji = {"INCLUDED": "", "DUPLICATE": "👯", "FILTERED": "🛡️"}.get(item['status'], "")
date_str = item.get('post_date', 'N/A')[:10] if item.get('post_date') else 'N/A'
all_rows.append(f"| {idx} | {status_emoji} {item['status']} | {date_str} | {item.get('source', 'N/A')} | {item['reason']} | `{item['category']}` | {item['url']} |\n")
date_str = str(item.get('post_date', 'N/A'))[:10] if item.get('post_date') else 'N/A'
score = item.get('impact_score', 'N/A')
lang = item.get('language', 'N/A')[:2].upper() if item.get('language') else 'EN'
res_type = item.get('type', 'Ref')
row = f"| {idx} | {status_emoji} {item['status']} | {score} | {lang} | {res_type} | {date_str} | {item.get('source', 'N/A')} | {item['reason']} | {item['url']} |\n"
all_rows.append(row)
counts[item['status']] = counts.get(item['status'], 0) + 1
if item['status'] == "INCLUDED":
src = item.get('source', 'Unknown')
source_counts[src] = source_counts.get(src, 0) + 1
src = item.get('source', 'Unknown')
source_counts[src] = source_counts.get(src, 0) + 1
# 1. Mermaid Diagrams & Stats
# AI Intel and Mermaid
ai_intel = SESSION_TRACKER.get_intelligence_report()
source_md = "#### 📊 Source Distribution\n| Source | Count |\n| :--- | :---: |\n"
for src, count in sorted(source_counts.items(), key=lambda x: x[1], reverse=True):
source_md += f"| {src} | {count} |\n"
mermaid = f"### 📊 Decision Metrics\n```mermaid\npie title Agentic Decision Distribution\n \"Accepted\" : {counts['INCLUDED']}\n \"Duplicates\" : {counts['DUPLICATE']}\n \"Filtered\" : {counts['FILTERED']}\n```\n"
pr_body = (
f"## 💎 Knowledge Update: {datetime.now().strftime('%d %b %Y')}\n\n"
f"Processed **{metrics.get('total_extracted', 0)}** links.\n\n"
f"{safety_report}\n\n"
f"{ai_intel}\n\n"
f"{mermaid}\n"
f"---\n"
f"**Audit Matrix follows in comments due to scale.**\n"
)
# Build PR Body (With Safety Guard Splitting)
pr_body = f"## 💎 Knowledge Update: {datetime.now().strftime('%d %b %Y')}\n\nProcessed **{metrics.get('total_extracted', 0)}** links.\n\n"
# If safety_report is huge, move it to its own comment
safety_in_body = True
if len(safety_report) > 30000:
pr_body += "⚠️ **Detailed Safety Audit moved to comments due to scale.**\n\n"
safety_in_body = False
else:
pr_body += f"{safety_report}\n\n"
pr_body += f"{ai_intel}\n\n{mermaid}\n{source_md}\n---\n**Audit Matrix and Logs follow in successive comments.**\n"
pr = self.repository.create_pull(
title=f"💎 Knowledge Update & Optimization: {datetime.now().strftime('%d %b %Y')}",
@@ -123,8 +111,32 @@ class RepositoryController:
base=self.default_branch_name
)
# 2. Split Audit Matrix into Comments
header_table = "| # | Status | Date | Source | Reason | Category | URL |\n| :--- | :--- | :--- | :--- | :--- | :--- | :--- |\n"
# 1. Safety Report (if huge)
if not safety_in_body:
log_header = "## 🛡️ Safety & Mandate Audit (Detailed)\n"
current_chunk = log_header
for line in safety_report.splitlines():
if len(current_chunk) + len(line) > 60000:
pr.create_issue_comment(current_chunk)
current_chunk = log_header + line + "\n"
else:
current_chunk += line + "\n"
pr.create_issue_comment(current_chunk)
# 2. X.com Extraction Audit Trail
x_audit = metrics.get('x_audit', [])
if x_audit:
log_header = "### 📜 Extraction Audit Trail\n*Detailed logs of social and RSS discovery attempts.*\n\n"
current_log = log_header
for entry in x_audit:
if len(current_log) + len(entry) > 60000:
pr.create_issue_comment(current_log)
current_log = log_header + entry + "\n"
else:
current_log += entry + "\n"
pr.create_issue_comment(current_log)
# 3. Split Audit Matrix into Comments
current_comment = header_table
part = 1
for row in all_rows:

View File

@@ -29,7 +29,7 @@ class IntelligentLinkCleaner:
self.dead_links: Dict[str, Tuple[str, str]] = {}
self.learning_data = self._load_memory()
self.inventory = self._load_inventory()
self.action_log: List[Dict] = []
self.full_report_metrics = [] # Track what happened to every link
self.detailed_stats = {"total_scanned": 0, "skipped_recent": 0, "by_file": {}, "operation_types": {"removals": 0, "consolidated": 0, "healed": 0, "enriched": 0}}
self.stats = {"total_links": 0, "dead_links_removed": 0, "orphans_fixed": 0, "enriched_descriptions": 0}
@@ -63,18 +63,15 @@ class IntelligentLinkCleaner:
content = open(path, "r").read()
lines = content.splitlines()
for idx, line in enumerate(lines):
# Enhanced Regex to capture surrounding formatting
matches = re.finditer(r'(\*\*|==)?\s*\[(.*?)\]\((https?://.*?)\)\s*(\*\*|==)?\s*(.*)', line)
for m in matches:
fmt_pre, title, url, fmt_post, desc = m.groups()
nu = normalize_url(url)
# Identify Importance Markers (Mandate 31 Expansion)
is_important = False
if fmt_pre or fmt_post: is_important = True # Bold or Highlighted
if "🌟" in title or "🌟" in desc: is_important = True # Stars
if len(desc.strip()) > 100: is_important = True # Deep description
if path in CORE_FILES: is_important = True # Foundational files
if fmt_pre or fmt_post: is_important = True
if "🌟" in title or "🌟" in (desc or ""): is_important = True
if desc and len(desc.strip()) > 100: is_important = True
if path in CORE_FILES: is_important = True
self.link_registry.setdefault(nu, []).append({
"file": path, "line_index": idx, "url": url,
@@ -84,7 +81,6 @@ class IntelligentLinkCleaner:
unique_urls = list(self.link_registry.keys())
random.shuffle(unique_urls)
# 1.5. Identify prioritized links for validation
to_check = []
for u in unique_urls:
nu = normalize_url(u); entry = self.inventory.get(nu, {})
@@ -155,27 +151,36 @@ class IntelligentLinkCleaner:
score = entry.get("health_score", 100)
score = (score * 0.8) + (100 if alive else 0) * 0.2
entry["health_score"] = round(score, 1); entry["last_checked"] = datetime.now().timestamp()
# Identify high-value status
is_important = any(occ.get("is_important") for occ in self.link_registry.get(nu, []))
if entry.get("stars", 0) >= 3: is_important = True
status = "INCLUDED" if alive else "FILTERED"
final_reason = reason
if not alive or reason == "generic_redirect_loss":
if is_important:
entry["status"] = "review_required"
entry["review_metadata"] = {
"original_url": url,
"proposed_url": final if final else "NONE",
"reason": f"High-Value Preservation: {reason}",
"timestamp": datetime.now().isoformat()
"original_url": url, "proposed_url": final if final else "NONE",
"reason": f"High-Value Preservation: {reason}", "timestamp": datetime.now().isoformat()
}
log_event(f" [⚠️] REVIEW STORED: {url} in inventory. Metadata preserved.")
log_event(f" [⚠️] REVIEW STORED: {url}")
status = "INCLUDED" # Kept in V1
final_reason = f"Preserved for Review ({reason})"
elif score < 20:
entry["status"] = "dead"; self.dead_links[url] = (None, reason)
status = "FILTERED"; final_reason = f"Dead: {reason}"
elif final and alive:
# If it's rescued or a valid redirect, we update
self.dead_links[url] = (f"CANONICAL:{final}", "Redirect/Resurrection")
final_reason = "Updated (Redirect/Rescued)"
self.full_report_metrics.append({
"url": url, "status": status, "reason": final_reason,
"category": entry.get("category", "N/A"), "post_date": entry.get("pub_date"),
"source": "Health Checker", "impact_score": entry.get("stars", 0) * 20,
"language": entry.get("language", "EN"), "type": entry.get("resource_type", "Ref")
})
self.inventory[nu] = entry
await self.apply_changes()
@@ -228,7 +233,14 @@ class IntelligentLinkCleaner:
from src.safety_guard import SafetyGuard
report = SafetyGuard().generate_audit_report()
if final_payload: self.git_controller.apply_multi_file_changes(final_payload, {"total_extracted": len(self.link_registry)}, safety_report=report)
metrics = {
"total_extracted": len(self.link_registry),
"full_report": self.full_report_metrics,
"end_date": datetime.now().isoformat()
}
if final_payload: self.git_controller.apply_multi_file_changes(final_payload, metrics, safety_report=report)
async def prune_orphaned_metadata(self):
valid_map = {}