feat(ops): implement high-density multi-part reporting engine for PRs and extraction logs

2026-07-28 09:32:20 +00:00 · 2026-05-17 23:14:11 +02:00
parent d8b8b2da08
commit 5ffab5b5d9
2 changed files with 91 additions and 67 deletions
--- a/src/gitops_manager.py
+++ b/src/gitops_manager.py
@@ -2,6 +2,7 @@ import json
 import base64
 from github import Github
 from datetime import datetime
+from typing import List, Dict

 from src.gemini_utils import SESSION_TRACKER

@@ -23,42 +24,14 @@ class RepositoryController:
        except:
            return ""

-    def apply_historical_chunk(self, updates: dict, next_since: str) -> None:
-        branch_name = "bot/historical-accumulator"
-        
-        # Check if branch exists, if not, create from develop
-        try:
-            self.repository.get_branch(branch_name)
-        except:
-            self._create_feature_branch(branch_name)
-
-        for file_path, content in updates.items():
-            try:
-                try:
-                    file_meta = self.repository.get_contents(file_path, ref=branch_name)
-                    self.repository.update_file(
-                        path=file_path, message=f"chore(historical): chunk sync since {next_since}",
-                        content=content, sha=file_meta.sha, branch=branch_name
-                    )
-                except Exception as e:
-                    if "404" in str(e):
-                        self.repository.create_file(
-                            path=file_path, message=f"chore(historical): init {file_path}",
-                            content=content, branch=branch_name
-                        )
-            except Exception as e:
-                print(f"Error in historical chunk for {file_path}: {e}")
-
    def apply_multi_file_changes(self, updates: dict, metrics: dict, safety_report: str = "") -> str:
        timestamp_slug = datetime.now().strftime("%Y%m%d-%H%M")
        branch_name = f"bot/knowledge-update-{timestamp_slug}"
        
-        # In the last historical chunk, use the accumulator as base if it exists
-        accumulator_branch = "bot/historical-accumulator"
        try:
-            acc = self.repository.get_branch(accumulator_branch)
-            self.repository.create_git_ref(ref=f"refs/heads/{branch_name}", sha=acc.commit.sha)
+            self._create_feature_branch(branch_name)
        except:
+            branch_name = f"bot/knowledge-update-{timestamp_slug}-{id(updates)}"
            self._create_feature_branch(branch_name)

        if not updates:
@@ -88,33 +61,48 @@ class RepositoryController:

        # --- REPORT CONSTRUCTION ---
        full_report = metrics.get('full_report', [])
-        sorted_report = sorted(full_report, key=lambda x: 0 if x['status'] == 'INCLUDED' else 1)
+        sorted_report = sorted(full_report, key=lambda x: 0 if x['status'] == 'INCLUDED' else (1 if x['status'] == 'DUPLICATE' else 2))
        
        counts = {"INCLUDED": 0, "DUPLICATE": 0, "FILTERED": 0}
        source_counts = {}
        all_rows = []
+        
+        header_table = "| # | Status | Score | Lang | Type | Date | Source | Reason | URL |\n| :--- | :--- | :---: | :---: | :--- | :---: | :--- | :--- | :--- |\n"
+        
        for idx, item in enumerate(sorted_report, 1):
            status_emoji = {"INCLUDED": "✅", "DUPLICATE": "👯", "FILTERED": "🛡️"}.get(item['status'], "❓")
-            date_str = item.get('post_date', 'N/A')[:10] if item.get('post_date') else 'N/A'
-            all_rows.append(f"| {idx} | {status_emoji} {item['status']} | {date_str} | {item.get('source', 'N/A')} | {item['reason']} | `{item['category']}` | {item['url']} |\n")
+            date_str = str(item.get('post_date', 'N/A'))[:10] if item.get('post_date') else 'N/A'
+            score = item.get('impact_score', 'N/A')
+            lang = item.get('language', 'N/A')[:2].upper() if item.get('language') else 'EN'
+            res_type = item.get('type', 'Ref')
+            
+            row = f"| {idx} | {status_emoji} {item['status']} | {score} | {lang} | {res_type} | {date_str} | {item.get('source', 'N/A')} | {item['reason']} | {item['url']} |\n"
+            all_rows.append(row)
+            
            counts[item['status']] = counts.get(item['status'], 0) + 1
-            if item['status'] == "INCLUDED":
-                src = item.get('source', 'Unknown')
-                source_counts[src] = source_counts.get(src, 0) + 1
+            src = item.get('source', 'Unknown')
+            source_counts[src] = source_counts.get(src, 0) + 1

-        # 1. Mermaid Diagrams & Stats
+        # AI Intel and Mermaid
        ai_intel = SESSION_TRACKER.get_intelligence_report()
+        source_md = "#### 📊 Source Distribution\n| Source | Count |\n| :--- | :---: |\n"
+        for src, count in sorted(source_counts.items(), key=lambda x: x[1], reverse=True):
+            source_md += f"| {src} | {count} |\n"
+
        mermaid = f"### 📊 Decision Metrics\n```mermaid\npie title Agentic Decision Distribution\n    \"Accepted\" : {counts['INCLUDED']}\n    \"Duplicates\" : {counts['DUPLICATE']}\n    \"Filtered\" : {counts['FILTERED']}\n```\n"
        
-        pr_body = (
-            f"## 💎 Knowledge Update: {datetime.now().strftime('%d %b %Y')}\n\n"
-            f"Processed **{metrics.get('total_extracted', 0)}** links.\n\n"
-            f"{safety_report}\n\n"
-            f"{ai_intel}\n\n"
-            f"{mermaid}\n"
-            f"---\n"
-            f"**Audit Matrix follows in comments due to scale.**\n"
-        )
+        # Build PR Body (With Safety Guard Splitting)
+        pr_body = f"## 💎 Knowledge Update: {datetime.now().strftime('%d %b %Y')}\n\nProcessed **{metrics.get('total_extracted', 0)}** links.\n\n"
+        
+        # If safety_report is huge, move it to its own comment
+        safety_in_body = True
+        if len(safety_report) > 30000:
+            pr_body += "⚠️ **Detailed Safety Audit moved to comments due to scale.**\n\n"
+            safety_in_body = False
+        else:
+            pr_body += f"{safety_report}\n\n"
+            
+        pr_body += f"{ai_intel}\n\n{mermaid}\n{source_md}\n---\n**Audit Matrix and Logs follow in successive comments.**\n"

        pr = self.repository.create_pull(
            title=f"💎 Knowledge Update & Optimization: {datetime.now().strftime('%d %b %Y')}",
@@ -123,8 +111,32 @@ class RepositoryController:
            base=self.default_branch_name
        )

-        # 2. Split Audit Matrix into Comments
-        header_table = "| # | Status | Date | Source | Reason | Category | URL |\n| :--- | :--- | :--- | :--- | :--- | :--- | :--- |\n"
+        # 1. Safety Report (if huge)
+        if not safety_in_body:
+            log_header = "## 🛡️ Safety & Mandate Audit (Detailed)\n"
+            current_chunk = log_header
+            for line in safety_report.splitlines():
+                if len(current_chunk) + len(line) > 60000:
+                    pr.create_issue_comment(current_chunk)
+                    current_chunk = log_header + line + "\n"
+                else:
+                    current_chunk += line + "\n"
+            pr.create_issue_comment(current_chunk)
+
+        # 2. Extraction Audit Trail (X.com/social and RSS discovery logs)
+        x_audit = metrics.get('x_audit', [])
+        if x_audit:
+            log_header = "### 📜 Extraction Audit Trail\n*Detailed logs of social and RSS discovery attempts.*\n\n"
+            current_log = log_header
+            for entry in x_audit:
+                if len(current_log) + len(entry) > 60000:
+                    pr.create_issue_comment(current_log)
+                    current_log = log_header + entry + "\n"
+                else:
+                    current_log += entry + "\n"
+            pr.create_issue_comment(current_log)
+
+        # 3. Split Audit Matrix into Comments
        current_comment = header_table
        part = 1
        for row in all_rows:
--- a/src/intelligent_health_checker.py
+++ b/src/intelligent_health_checker.py
@@ -29,7 +29,7 @@ class IntelligentLinkCleaner:
        self.dead_links: Dict[str, Tuple[str, str]] = {} 
        self.learning_data = self._load_memory()
        self.inventory = self._load_inventory()
-        self.action_log: List[Dict] = [] 
+        self.full_report_metrics = [] # Track what happened to every link
        self.detailed_stats = {"total_scanned": 0, "skipped_recent": 0, "by_file": {}, "operation_types": {"removals": 0, "consolidated": 0, "healed": 0, "enriched": 0}}
        self.stats = {"total_links": 0, "dead_links_removed": 0, "orphans_fixed": 0, "enriched_descriptions": 0}

@@ -63,18 +63,15 @@ class IntelligentLinkCleaner:
                    content = open(path, "r").read()
                    lines = content.splitlines()
                    for idx, line in enumerate(lines):
-                        # Enhanced Regex to capture surrounding formatting
                        matches = re.finditer(r'(\*\*|==)?\s*\[(.*?)\]\((https?://.*?)\)\s*(\*\*|==)?\s*(.*)', line)
                        for m in matches:
                            fmt_pre, title, url, fmt_post, desc = m.groups()
                            nu = normalize_url(url)
-                            
-                            # Identify Importance Markers (Mandate 31 Expansion)
                            is_important = False
-                            if fmt_pre or fmt_post: is_important = True # Bold or Highlighted
-                            if "🌟" in title or "🌟" in desc: is_important = True # Stars
-                            if len(desc.strip()) > 100: is_important = True # Deep description
-                            if path in CORE_FILES: is_important = True # Foundational files
+                            if fmt_pre or fmt_post: is_important = True
+                            if "🌟" in title or "🌟" in (desc or ""): is_important = True
+                            if desc and len(desc.strip()) > 100: is_important = True
+                            if path in CORE_FILES: is_important = True
                            
                            self.link_registry.setdefault(nu, []).append({
                                "file": path, "line_index": idx, "url": url, 
@@ -84,7 +81,6 @@ class IntelligentLinkCleaner:
        unique_urls = list(self.link_registry.keys())
        random.shuffle(unique_urls)
        
-        # 1.5. Identify prioritized links for validation
        to_check = []
        for u in unique_urls:
            nu = normalize_url(u); entry = self.inventory.get(nu, {})
@@ -155,27 +151,36 @@ class IntelligentLinkCleaner:
            score = entry.get("health_score", 100)
            score = (score * 0.8) + (100 if alive else 0) * 0.2
            entry["health_score"] = round(score, 1); entry["last_checked"] = datetime.now().timestamp()
-
-            # Identify high-value status
+            
            is_important = any(occ.get("is_important") for occ in self.link_registry.get(nu, []))
            if entry.get("stars", 0) >= 3: is_important = True

+            status = "INCLUDED" if alive else "FILTERED"
+            final_reason = reason
+            
            if not alive or reason == "generic_redirect_loss":
                if is_important:
                    entry["status"] = "review_required"
                    entry["review_metadata"] = {
-                        "original_url": url,
-                        "proposed_url": final if final else "NONE",
-                        "reason": f"High-Value Preservation: {reason}",
-                        "timestamp": datetime.now().isoformat()
+                        "original_url": url, "proposed_url": final if final else "NONE",
+                        "reason": f"High-Value Preservation: {reason}", "timestamp": datetime.now().isoformat()
                    }
-                    log_event(f"  [⚠️] REVIEW STORED: {url} in inventory. Metadata preserved.")
+                    log_event(f"  [⚠️] REVIEW STORED: {url}")
+                    status = "INCLUDED" # Kept in V1
+                    final_reason = f"Preserved for Review ({reason})"
                elif score < 20: 
                    entry["status"] = "dead"; self.dead_links[url] = (None, reason)
+                    status = "FILTERED"; final_reason = f"Dead: {reason}"
            elif final and alive:
-                # If it's rescued or a valid redirect, we update
                self.dead_links[url] = (f"CANONICAL:{final}", "Redirect/Resurrection")
-
+                final_reason = "Updated (Redirect/Rescued)"
+            
+            self.full_report_metrics.append({
+                "url": url, "status": status, "reason": final_reason,
+                "category": entry.get("category", "N/A"), "post_date": entry.get("pub_date"),
+                "source": "Health Checker", "impact_score": entry.get("stars", 0) * 20,
+                "language": entry.get("language", "EN"), "type": entry.get("resource_type", "Ref")
+            })
            self.inventory[nu] = entry

        await self.apply_changes()
@@ -228,7 +233,14 @@ class IntelligentLinkCleaner:

        from src.safety_guard import SafetyGuard
        report = SafetyGuard().generate_audit_report()
-        if final_payload: self.git_controller.apply_multi_file_changes(final_payload, {"total_extracted": len(self.link_registry)}, safety_report=report)
+        
+        metrics = {
+            "total_extracted": len(self.link_registry),
+            "full_report": self.full_report_metrics,
+            "end_date": datetime.now().isoformat()
+        }
+        
+        if final_payload: self.git_controller.apply_multi_file_changes(final_payload, metrics, safety_report=report)

    async def prune_orphaned_metadata(self):
        valid_map = {}