From da3474ee47c2a4ecc90dbd05e8aafdf0d46d3555 Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Fri, 15 May 2026 20:35:16 +0200 Subject: [PATCH] fix(cleaning): preserve V1 exhaustiveness by disabling MVQ-based repo deletion --- GEMINI.md | 5 +++-- README.md | 6 +++--- src/intelligent_health_checker.py | 7 ++++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/GEMINI.md b/GEMINI.md index 60d97021..1bd36159 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -19,9 +19,10 @@ This file contains the accumulated instructions and long-term vision for the aut 13. **Detailed Logging for V2**: When running the V2 Optimizer, agents MUST use unbuffered logging and detailed output messages. If the optimizer returns '0 links kept', the agent MUST investigate the logs to determine if it was due to AI selection or a parsing/API error. 14. **Persistent V2 Caching**: The V2 Optimizer MUST use a persistent cache file (`data/v2_cache.json`) to store AI evaluations (year, quality, category). This is mandatory to minimize API costs and ensure execution speed across 15k+ links. 15. **GitHub Metadata Enrichment**: For all `github.com` resources, the bot MUST attempt to fetch real-time metadata (stars, last commit) using the GitHub API. This data must be included in the V2 rendering to provide current context. -16. **Resilient Link Health & MVQ Cleaning**: +16. **Resilient Link Health & Global Cleaning**: - **Health Checks**: Every V2 generation and global cleaning cycle MUST perform asynchronous health checks using identity rotation (User-Agents) and multiple attempts (3x). - - **MVQ Cleaning**: The `IntelligentLinkChecker` MUST apply the V2 **Minimum Viable Quality (MVQ)** logic. GitHub repositories inactive for >4 years with low impact (stars < 30) MUST be purged to maintain archive freshness. + - **V1 Exhaustiveness**: The `IntelligentLinkCleaner` operating on V1 MUST preserve all technically valid links regardless of their age. 
Deletion is strictly reserved for definitively invalid links (404s, dead redirects, etc.). + - **V2 Elite Selection (MVQ)**: The `V2VisionEngine` MUST continue to apply the **Minimum Viable Quality (MVQ)** logic. GitHub repositories inactive for >4 years with low impact (stars < 30) are deprioritized or excluded ONLY from the V2 Elite edition to ensure freshness. - **Foundational Protection**: GitHub and 'Foundational' resources are exempt from automatic removal based on health, but may be flagged for review. - **Consolidation**: If a deep link fails but the repository root is alive, the bot MUST consolidate the reference to the root. 17. **Unified Curation Chronology**: All curation workflows (V1 and V2) MUST utilize the same chronological and descriptive engine. diff --git a/README.md b/README.md index d824d31a..e18e4277 100644 --- a/README.md +++ b/README.md @@ -212,7 +212,7 @@ To maintain the high-density quality of V2 without redundant AI costs, the `V2Vi | **Depth** | Historical & Wide | Cutting-edge & Deep | | **Chronology** | **Unified Engine** (YYYY) | **Unified Engine** (YYYY) | | **Filtering** | Basic (Health only) | AI-Scored (🌟🌟🌟) | -| **MVQ Check** | **Global Cleaning (MVQ)** | **Elite Discovery (MVQ)** | +| **MVQ Check** | No (Exhaustive Preservation) | Yes (Stale repos deprioritized or excluded) | --- @@ -227,10 +227,10 @@ The heart of the new Nubenetes is a suite of AI Agents that operate on our `deve 2. **V2VisionEngine (`src/v2_optimizer.py`)**: - **Elite Selection:** Scans the massive V1 archive to select the "Elite" top-tier resources. - **2026 Taxonomy:** Reorganizes the content into high-density dimensions (e.g., "Intelligent Control Plane") using **relevance-first sorting**. - - **Deprioritization:** Automatically identifies stale repositories (>4 years without activity) and reduces their visibility. + - **MVQ Hardening:** Automatically identifies stale, low-impact repositories (>4 years without activity and fewer than 30 stars) and deprioritizes or excludes them from the Elite portal only. 3. 
**IntelligentHealthChecker (`src/intelligent_health_checker.py`)**: - **Resilience:** Performs asynchronous health checks with 3x retry and identity rotation. - - **MVQ Cleaning:** Incorporates **Minimum Viable Quality** logic to automatically purge abandoned or low-value repositories. + - **V1 Integrity:** Focuses strictly on link validity (removing 404s) to ensure the exhaustive V1 archive remains accessible and error-free. - **Transparency:** Provides detailed, real-time unbuffered logging of all cleaning operations. --- diff --git a/src/intelligent_health_checker.py b/src/intelligent_health_checker.py index 3a1fbe8c..b63899c4 100644 --- a/src/intelligent_health_checker.py +++ b/src/intelligent_health_checker.py @@ -88,11 +88,12 @@ class IntelligentLinkCleaner: async def _check_url_with_retries(self, url: str, max_retries=5) -> Tuple[str, bool, Optional[str], str]: now = datetime.now().timestamp() - # 1. MVQ Decision Logic for GitHub + # NOTE: V1 Exhaustiveness Mandate + # We fetch GitHub metadata for logging/metrics, but we DO NOT delete based on activity. + # Only definitively dead links are removed in V1. if "github.com" in url: gh_meta = await self._fetch_github_metadata(url) - if gh_meta.get("is_abandoned") and gh_meta.get("stars", 0) < 30: - return url, False, None, f"Abandoned Repo (Inactive {gh_meta['years_inactive']:.1f}y, {gh_meta['stars']}⭐)" + # Metadata is stored in cache/logs but not used for deletion here. cache_entry = self.learning_data.get("link_cache", {}).get(url) if cache_entry and cache_entry.get("status") == "ALIVE":