fix(ops): restore SHA256 drift detection and historical chunk sync logic

2026-07-29 01:51:21 +00:00 · 2026-05-17 23:16:47 +02:00
parent 9a7d8e17cc
commit c0bbe9b60b
2 changed files with 55 additions and 29 deletions
--- a/src/gitops_manager.py
+++ b/src/gitops_manager.py
@@ -24,6 +24,30 @@ class RepositoryController:
        except:
            return ""

+    def apply_historical_chunk(self, updates: dict, next_since: str) -> None:
+        branch_name = "bot/historical-accumulator"
+        try:
+            self.repository.get_branch(branch_name)
+        except:
+            self._create_feature_branch(branch_name)
+
+        for file_path, content in updates.items():
+            try:
+                try:
+                    file_meta = self.repository.get_contents(file_path, ref=branch_name)
+                    self.repository.update_file(
+                        path=file_path, message=f"chore(historical): chunk sync since {next_since}",
+                        content=content, sha=file_meta.sha, branch=branch_name
+                    )
+                except Exception as e:
+                    if "404" in str(e):
+                        self.repository.create_file(
+                            path=file_path, message=f"chore(historical): init {file_path}",
+                            content=content, branch=branch_name
+                        )
+            except Exception as e:
+                print(f"Error in historical chunk for {file_path}: {e}")
+
    def apply_multi_file_changes(self, updates: dict, metrics: dict, safety_report: str = "") -> str:
        timestamp_slug = datetime.now().strftime("%Y%m%d-%H%M")
        branch_name = f"bot/knowledge-update-{timestamp_slug}"
--- a/src/intelligent_health_checker.py
+++ b/src/intelligent_health_checker.py
@@ -29,7 +29,7 @@ class IntelligentLinkCleaner:
        self.dead_links: Dict[str, Tuple[str, str]] = {} 
        self.learning_data = self._load_memory()
        self.inventory = self._load_inventory()
-        self.full_report_metrics = [] # Track what happened to every link
+        self.full_report_metrics = [] 
        self.detailed_stats = {"total_scanned": 0, "skipped_recent": 0, "by_file": {}, "operation_types": {"removals": 0, "consolidated": 0, "healed": 0, "enriched": 0}}
        self.stats = {"total_links": 0, "dead_links_removed": 0, "orphans_fixed": 0, "enriched_descriptions": 0}

@@ -55,7 +55,7 @@ class IntelligentLinkCleaner:

    async def execute_clean_cycle(self):
        log_event("STARTING INTELLIGENT CLEANING CYCLE", section_break=True)
-        # 1. Map all links in V1 and detect Importance Markers
+        # 1. Map all links and Importance Markers
        for root, _, files in os.walk("docs"):
            for f in files:
                if f.endswith(".md"):
@@ -105,12 +105,11 @@ class IntelligentLinkCleaner:
            tasks = [self._check_url_logic(url) for url in batch]
            results = await asyncio.gather(*tasks)
            for url, res in zip(batch, results): check_results[url] = res
-            if i % 100 == 0: log_event(f"  [>] Network Check Progress: {i}/{len(to_check)} checked...")
+            if i % 100 == 0: log_event(f"  [>] Progress: {i}/{len(to_check)} checked...")

-        # 2.5. UNIVERSAL AI RESCUE (Mandate 31)
+        # 2.5. UNIVERSAL AI RESCUE
        to_rescue = [u for u, res in check_results.items() if not res[0] or res[1] == "generic_redirect_loss"]
        if to_rescue:
-            log_event(f"[*] Starting AI Rescue for {len(to_rescue)} links...")
            AI_BATCH_SIZE = 10
            for i in range(0, len(to_rescue), AI_BATCH_SIZE):
                batch = to_rescue[i:i+AI_BATCH_SIZE]
@@ -120,12 +119,10 @@ class IntelligentLinkCleaner:
                    batch_info.append({"url": u, "title": entry.get("title", u), "context": entry.get("description", "")})

                prompt = (
-                    "You act as a Technical Librarian. These resources are missing or redirecting to generic pages.\n"
-                    "Search for the specific Technical Article or Tool URL based on the title and description provided.\n"
-                    "Consider site migrations, acquisitions (Ansible->RedHat, Nginx->F5), and cross-domain moves to personal blogs.\n"
-                    "Return ONLY a JSON list: [{\"old_url\": \"...\", \"new_url\": \"...\"}, ...]\n"
-                    "If not found, set new_url to \"NONE\".\n\n"
-                    "RESOURCES:\n" + "\n".join([f"- Title: {d['title']} | Desc: {d['context'][:150]} | URL: {d['url']}" for d in batch_info])
+                    "You act as a Technical Librarian. Rescue missing resources based on title and description.\n"
+                    "Consider acquisitions (Ansible->RedHat, Nginx->F5) and cross-domain moves.\n"
+                    "Return ONLY JSON list: [{\"old_url\": \"...\", \"new_url\": \"...\"}, ...]\n\n"
+                    "RESOURCES:\n" + "\n".join([f"- {d['title']} | {d['url']}" for d in batch_info])
                )
                
                try:
@@ -138,42 +135,53 @@ class IntelligentLinkCleaner:
                                if new_loc and new_loc.startswith("http") and "NONE" not in new_loc.upper():
                                    try:
                                        async with httpx.AsyncClient(timeout=10, follow_redirects=True, verify=False) as client:
-                                            resp = await client.get(new_loc)
-                                            if resp.status_code < 400:
+                                            if (await client.get(new_loc)).status_code < 400:
                                                log_event(f"  [✨] RESCUED: {u} -> {new_loc}")
                                                check_results[u] = (True, "resurrected", new_loc)
                                    except: pass
                except: pass

-        # 2.8. Finalize Status with Foundational Preservation & Metadata
+        # 2.8. Finalize Status
        for url, (alive, reason, final) in check_results.items():
            nu = normalize_url(url); entry = self.inventory.get(nu, {})
            score = entry.get("health_score", 100)
            score = (score * 0.8) + (100 if alive else 0) * 0.2
            entry["health_score"] = round(score, 1); entry["last_checked"] = datetime.now().timestamp()
            
+            # --- MANDATE 22: SEMANTIC DRIFT (SHA256) ---
+            if alive:
+                try:
+                    from src.agentic_curator import _deep_fetch_content
+                    text, _ = await _deep_fetch_content(url if not final else final)
+                    if text:
+                        new_hash = hashlib.sha256(text.encode()).hexdigest()
+                        old_hash = entry.get("content_hash", "N/A")
+                        if old_hash != "N/A" and new_hash != old_hash:
+                            log_event(f"  [!] DRIFT DETECTED: {url}")
+                            entry["needs_ai_refresh"] = True
+                        entry["content_hash"] = new_hash
+                except: pass
+
            is_important = any(occ.get("is_important") for occ in self.link_registry.get(nu, []))
            if entry.get("stars", 0) >= 3: is_important = True

-            status = "INCLUDED" if alive else "FILTERED"
-            final_reason = reason
+            status, final_reason = ("INCLUDED", reason) if alive else ("FILTERED", reason)
            
            if not alive or reason == "generic_redirect_loss":
                if is_important:
                    entry["status"] = "review_required"
                    entry["review_metadata"] = {
                        "original_url": url, "proposed_url": final if final else "NONE",
-                        "reason": f"High-Value Preservation: {reason}", "timestamp": datetime.now().isoformat()
+                        "reason": f"High-Value Protection: {reason}", "timestamp": datetime.now().isoformat()
                    }
                    log_event(f"  [⚠️] REVIEW STORED: {url}")
-                    status = "INCLUDED" # Kept in V1
-                    final_reason = f"Preserved for Review ({reason})"
+                    status, final_reason = "INCLUDED", f"Preserved for Review ({reason})"
                elif score < 20: 
                    entry["status"] = "dead"; self.dead_links[url] = (None, reason)
-                    status = "FILTERED"; final_reason = f"Dead: {reason}"
+                    status, final_reason = "FILTERED", f"Dead: {reason}"
            elif final and alive:
                self.dead_links[url] = (f"CANONICAL:{final}", "Redirect/Resurrection")
-                final_reason = "Updated (Redirect/Rescued)"
+                final_reason = "Updated (Canonical)"
            
            self.full_report_metrics.append({
                "url": url, "status": status, "reason": final_reason,
@@ -213,7 +221,7 @@ class IntelligentLinkCleaner:
        except: return True, "Error", None

    async def apply_changes(self):
-        log_event("APPLYING CLEANING CHANGES & PR GENERATION...", section_break=True)
+        log_event("APPLYING CLEANING CHANGES...", section_break=True)
        file_updates = {}
        for url, (fallback, reason) in self.dead_links.items():
            nu = normalize_url(url); paths = self.inventory.get(nu, {}).get("v1_locations", [])
@@ -233,13 +241,7 @@ class IntelligentLinkCleaner:

        from src.safety_guard import SafetyGuard
        report = SafetyGuard().generate_audit_report()
-        
-        metrics = {
-            "total_extracted": len(self.link_registry),
-            "full_report": self.full_report_metrics,
-            "end_date": datetime.now().isoformat()
-        }
-        
+        metrics = {"total_extracted": len(self.link_registry), "full_report": self.full_report_metrics, "end_date": datetime.now().isoformat()}
        if final_payload: self.git_controller.apply_multi_file_changes(final_payload, metrics, safety_report=report)

    async def prune_orphaned_metadata(self):