mirror of
https://github.com/nubenetes/awesome-kubernetes.git
synced 2026-05-23 17:43:16 +00:00
feat(ops): implement high-density multi-part reporting engine for PRs and extraction logs
This commit is contained in:
@@ -2,6 +2,7 @@ import json
|
||||
import base64
|
||||
from github import Github
|
||||
from datetime import datetime
|
||||
from typing import List, Dict
|
||||
|
||||
from src.gemini_utils import SESSION_TRACKER
|
||||
|
||||
@@ -23,42 +24,14 @@ class RepositoryController:
|
||||
except:
|
||||
return ""
|
||||
|
||||
def apply_historical_chunk(self, updates: dict, next_since: str) -> None:
|
||||
branch_name = "bot/historical-accumulator"
|
||||
|
||||
# Check if branch exists, if not, create from develop
|
||||
try:
|
||||
self.repository.get_branch(branch_name)
|
||||
except:
|
||||
self._create_feature_branch(branch_name)
|
||||
|
||||
for file_path, content in updates.items():
|
||||
try:
|
||||
try:
|
||||
file_meta = self.repository.get_contents(file_path, ref=branch_name)
|
||||
self.repository.update_file(
|
||||
path=file_path, message=f"chore(historical): chunk sync since {next_since}",
|
||||
content=content, sha=file_meta.sha, branch=branch_name
|
||||
)
|
||||
except Exception as e:
|
||||
if "404" in str(e):
|
||||
self.repository.create_file(
|
||||
path=file_path, message=f"chore(historical): init {file_path}",
|
||||
content=content, branch=branch_name
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Error in historical chunk for {file_path}: {e}")
|
||||
|
||||
def apply_multi_file_changes(self, updates: dict, metrics: dict, safety_report: str = "") -> str:
|
||||
timestamp_slug = datetime.now().strftime("%Y%m%d-%H%M")
|
||||
branch_name = f"bot/knowledge-update-{timestamp_slug}"
|
||||
|
||||
# In the last historical chunk, use the accumulator as base if it exists
|
||||
accumulator_branch = "bot/historical-accumulator"
|
||||
try:
|
||||
acc = self.repository.get_branch(accumulator_branch)
|
||||
self.repository.create_git_ref(ref=f"refs/heads/{branch_name}", sha=acc.commit.sha)
|
||||
self._create_feature_branch(branch_name)
|
||||
except:
|
||||
branch_name = f"bot/knowledge-update-{timestamp_slug}-{id(updates)}"
|
||||
self._create_feature_branch(branch_name)
|
||||
|
||||
if not updates:
|
||||
@@ -88,33 +61,48 @@ class RepositoryController:
|
||||
|
||||
# --- REPORT CONSTRUCTION ---
|
||||
full_report = metrics.get('full_report', [])
|
||||
sorted_report = sorted(full_report, key=lambda x: 0 if x['status'] == 'INCLUDED' else 1)
|
||||
sorted_report = sorted(full_report, key=lambda x: 0 if x['status'] == 'INCLUDED' else (1 if x['status'] == 'DUPLICATE' else 2))
|
||||
|
||||
counts = {"INCLUDED": 0, "DUPLICATE": 0, "FILTERED": 0}
|
||||
source_counts = {}
|
||||
all_rows = []
|
||||
|
||||
header_table = "| # | Status | Score | Lang | Type | Date | Source | Reason | URL |\n| :--- | :--- | :---: | :---: | :--- | :---: | :--- | :--- | :--- |\n"
|
||||
|
||||
for idx, item in enumerate(sorted_report, 1):
|
||||
status_emoji = {"INCLUDED": "✅", "DUPLICATE": "👯", "FILTERED": "🛡️"}.get(item['status'], "❓")
|
||||
date_str = item.get('post_date', 'N/A')[:10] if item.get('post_date') else 'N/A'
|
||||
all_rows.append(f"| {idx} | {status_emoji} {item['status']} | {date_str} | {item.get('source', 'N/A')} | {item['reason']} | `{item['category']}` | {item['url']} |\n")
|
||||
date_str = str(item.get('post_date', 'N/A'))[:10] if item.get('post_date') else 'N/A'
|
||||
score = item.get('impact_score', 'N/A')
|
||||
lang = item.get('language', 'N/A')[:2].upper() if item.get('language') else 'EN'
|
||||
res_type = item.get('type', 'Ref')
|
||||
|
||||
row = f"| {idx} | {status_emoji} {item['status']} | {score} | {lang} | {res_type} | {date_str} | {item.get('source', 'N/A')} | {item['reason']} | {item['url']} |\n"
|
||||
all_rows.append(row)
|
||||
|
||||
counts[item['status']] = counts.get(item['status'], 0) + 1
|
||||
if item['status'] == "INCLUDED":
|
||||
src = item.get('source', 'Unknown')
|
||||
source_counts[src] = source_counts.get(src, 0) + 1
|
||||
src = item.get('source', 'Unknown')
|
||||
source_counts[src] = source_counts.get(src, 0) + 1
|
||||
|
||||
# 1. Mermaid Diagrams & Stats
|
||||
# AI Intel and Mermaid
|
||||
ai_intel = SESSION_TRACKER.get_intelligence_report()
|
||||
source_md = "#### 📊 Source Distribution\n| Source | Count |\n| :--- | :---: |\n"
|
||||
for src, count in sorted(source_counts.items(), key=lambda x: x[1], reverse=True):
|
||||
source_md += f"| {src} | {count} |\n"
|
||||
|
||||
mermaid = f"### 📊 Decision Metrics\n```mermaid\npie title Agentic Decision Distribution\n \"Accepted\" : {counts['INCLUDED']}\n \"Duplicates\" : {counts['DUPLICATE']}\n \"Filtered\" : {counts['FILTERED']}\n```\n"
|
||||
|
||||
pr_body = (
|
||||
f"## 💎 Knowledge Update: {datetime.now().strftime('%d %b %Y')}\n\n"
|
||||
f"Processed **{metrics.get('total_extracted', 0)}** links.\n\n"
|
||||
f"{safety_report}\n\n"
|
||||
f"{ai_intel}\n\n"
|
||||
f"{mermaid}\n"
|
||||
f"---\n"
|
||||
f"**Audit Matrix follows in comments due to scale.**\n"
|
||||
)
|
||||
# Build PR Body (With Safety Guard Splitting)
|
||||
pr_body = f"## 💎 Knowledge Update: {datetime.now().strftime('%d %b %Y')}\n\nProcessed **{metrics.get('total_extracted', 0)}** links.\n\n"
|
||||
|
||||
# If safety_report is huge, move it to its own comment
|
||||
safety_in_body = True
|
||||
if len(safety_report) > 30000:
|
||||
pr_body += "⚠️ **Detailed Safety Audit moved to comments due to scale.**\n\n"
|
||||
safety_in_body = False
|
||||
else:
|
||||
pr_body += f"{safety_report}\n\n"
|
||||
|
||||
pr_body += f"{ai_intel}\n\n{mermaid}\n{source_md}\n---\n**Audit Matrix and Logs follow in successive comments.**\n"
|
||||
|
||||
pr = self.repository.create_pull(
|
||||
title=f"💎 Knowledge Update & Optimization: {datetime.now().strftime('%d %b %Y')}",
|
||||
@@ -123,8 +111,32 @@ class RepositoryController:
|
||||
base=self.default_branch_name
|
||||
)
|
||||
|
||||
# 2. Split Audit Matrix into Comments
|
||||
header_table = "| # | Status | Date | Source | Reason | Category | URL |\n| :--- | :--- | :--- | :--- | :--- | :--- | :--- |\n"
|
||||
# 1. Safety Report (if huge)
|
||||
if not safety_in_body:
|
||||
log_header = "## 🛡️ Safety & Mandate Audit (Detailed)\n"
|
||||
current_chunk = log_header
|
||||
for line in safety_report.splitlines():
|
||||
if len(current_chunk) + len(line) > 60000:
|
||||
pr.create_issue_comment(current_chunk)
|
||||
current_chunk = log_header + line + "\n"
|
||||
else:
|
||||
current_chunk += line + "\n"
|
||||
pr.create_issue_comment(current_chunk)
|
||||
|
||||
# 2. X.com Extraction Audit Trail
|
||||
x_audit = metrics.get('x_audit', [])
|
||||
if x_audit:
|
||||
log_header = "### 📜 Extraction Audit Trail\n*Detailed logs of social and RSS discovery attempts.*\n\n"
|
||||
current_log = log_header
|
||||
for entry in x_audit:
|
||||
if len(current_log) + len(entry) > 60000:
|
||||
pr.create_issue_comment(current_log)
|
||||
current_log = log_header + entry + "\n"
|
||||
else:
|
||||
current_log += entry + "\n"
|
||||
pr.create_issue_comment(current_log)
|
||||
|
||||
# 3. Split Audit Matrix into Comments
|
||||
current_comment = header_table
|
||||
part = 1
|
||||
for row in all_rows:
|
||||
|
||||
@@ -29,7 +29,7 @@ class IntelligentLinkCleaner:
|
||||
self.dead_links: Dict[str, Tuple[str, str]] = {}
|
||||
self.learning_data = self._load_memory()
|
||||
self.inventory = self._load_inventory()
|
||||
self.action_log: List[Dict] = []
|
||||
self.full_report_metrics = [] # Track what happened to every link
|
||||
self.detailed_stats = {"total_scanned": 0, "skipped_recent": 0, "by_file": {}, "operation_types": {"removals": 0, "consolidated": 0, "healed": 0, "enriched": 0}}
|
||||
self.stats = {"total_links": 0, "dead_links_removed": 0, "orphans_fixed": 0, "enriched_descriptions": 0}
|
||||
|
||||
@@ -63,18 +63,15 @@ class IntelligentLinkCleaner:
|
||||
content = open(path, "r").read()
|
||||
lines = content.splitlines()
|
||||
for idx, line in enumerate(lines):
|
||||
# Enhanced Regex to capture surrounding formatting
|
||||
matches = re.finditer(r'(\*\*|==)?\s*\[(.*?)\]\((https?://.*?)\)\s*(\*\*|==)?\s*(.*)', line)
|
||||
for m in matches:
|
||||
fmt_pre, title, url, fmt_post, desc = m.groups()
|
||||
nu = normalize_url(url)
|
||||
|
||||
# Identify Importance Markers (Mandate 31 Expansion)
|
||||
is_important = False
|
||||
if fmt_pre or fmt_post: is_important = True # Bold or Highlighted
|
||||
if "🌟" in title or "🌟" in desc: is_important = True # Stars
|
||||
if len(desc.strip()) > 100: is_important = True # Deep description
|
||||
if path in CORE_FILES: is_important = True # Foundational files
|
||||
if fmt_pre or fmt_post: is_important = True
|
||||
if "🌟" in title or "🌟" in (desc or ""): is_important = True
|
||||
if desc and len(desc.strip()) > 100: is_important = True
|
||||
if path in CORE_FILES: is_important = True
|
||||
|
||||
self.link_registry.setdefault(nu, []).append({
|
||||
"file": path, "line_index": idx, "url": url,
|
||||
@@ -84,7 +81,6 @@ class IntelligentLinkCleaner:
|
||||
unique_urls = list(self.link_registry.keys())
|
||||
random.shuffle(unique_urls)
|
||||
|
||||
# 1.5. Identify prioritized links for validation
|
||||
to_check = []
|
||||
for u in unique_urls:
|
||||
nu = normalize_url(u); entry = self.inventory.get(nu, {})
|
||||
@@ -155,27 +151,36 @@ class IntelligentLinkCleaner:
|
||||
score = entry.get("health_score", 100)
|
||||
score = (score * 0.8) + (100 if alive else 0) * 0.2
|
||||
entry["health_score"] = round(score, 1); entry["last_checked"] = datetime.now().timestamp()
|
||||
|
||||
# Identify high-value status
|
||||
|
||||
is_important = any(occ.get("is_important") for occ in self.link_registry.get(nu, []))
|
||||
if entry.get("stars", 0) >= 3: is_important = True
|
||||
|
||||
status = "INCLUDED" if alive else "FILTERED"
|
||||
final_reason = reason
|
||||
|
||||
if not alive or reason == "generic_redirect_loss":
|
||||
if is_important:
|
||||
entry["status"] = "review_required"
|
||||
entry["review_metadata"] = {
|
||||
"original_url": url,
|
||||
"proposed_url": final if final else "NONE",
|
||||
"reason": f"High-Value Preservation: {reason}",
|
||||
"timestamp": datetime.now().isoformat()
|
||||
"original_url": url, "proposed_url": final if final else "NONE",
|
||||
"reason": f"High-Value Preservation: {reason}", "timestamp": datetime.now().isoformat()
|
||||
}
|
||||
log_event(f" [⚠️] REVIEW STORED: {url} in inventory. Metadata preserved.")
|
||||
log_event(f" [⚠️] REVIEW STORED: {url}")
|
||||
status = "INCLUDED" # Kept in V1
|
||||
final_reason = f"Preserved for Review ({reason})"
|
||||
elif score < 20:
|
||||
entry["status"] = "dead"; self.dead_links[url] = (None, reason)
|
||||
status = "FILTERED"; final_reason = f"Dead: {reason}"
|
||||
elif final and alive:
|
||||
# If it's rescued or a valid redirect, we update
|
||||
self.dead_links[url] = (f"CANONICAL:{final}", "Redirect/Resurrection")
|
||||
|
||||
final_reason = "Updated (Redirect/Rescued)"
|
||||
|
||||
self.full_report_metrics.append({
|
||||
"url": url, "status": status, "reason": final_reason,
|
||||
"category": entry.get("category", "N/A"), "post_date": entry.get("pub_date"),
|
||||
"source": "Health Checker", "impact_score": entry.get("stars", 0) * 20,
|
||||
"language": entry.get("language", "EN"), "type": entry.get("resource_type", "Ref")
|
||||
})
|
||||
self.inventory[nu] = entry
|
||||
|
||||
await self.apply_changes()
|
||||
@@ -228,7 +233,14 @@ class IntelligentLinkCleaner:
|
||||
|
||||
from src.safety_guard import SafetyGuard
|
||||
report = SafetyGuard().generate_audit_report()
|
||||
if final_payload: self.git_controller.apply_multi_file_changes(final_payload, {"total_extracted": len(self.link_registry)}, safety_report=report)
|
||||
|
||||
metrics = {
|
||||
"total_extracted": len(self.link_registry),
|
||||
"full_report": self.full_report_metrics,
|
||||
"end_date": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
if final_payload: self.git_controller.apply_multi_file_changes(final_payload, metrics, safety_report=report)
|
||||
|
||||
async def prune_orphaned_metadata(self):
|
||||
valid_map = {}
|
||||
|
||||
Reference in New Issue
Block a user