feat: implement resilient multi-attempt health checks and soft-flagging for V2

This commit is contained in:
Nubenetes Bot
2026-05-15 12:21:39 +02:00
parent 781f83957a
commit 4c827afc30
2 changed files with 65 additions and 23 deletions

View File

@@ -19,7 +19,7 @@ This file contains the accumulated instructions and long-term vision for the aut
13. **Detailed Logging for V2**: When running the V2 Optimizer, agents MUST use unbuffered logging and detailed output messages. If the optimizer returns '0 links kept', the agent MUST investigate the logs to determine if it was due to AI selection or a parsing/API error.
14. **Persistent V2 Caching**: The V2 Optimizer MUST use a persistent cache file (`data/v2_cache.json`) to store AI evaluations (year, quality, category). This is mandatory to minimize API costs and ensure execution speed across 15k+ links.
15. **GitHub Metadata Enrichment**: For all `github.com` resources, the bot MUST attempt to fetch real-time metadata (stars, last commit) using the GitHub API. This data must be included in the V2 rendering to provide current context.
16. **Asynchronous Link Health**: Every V2 generation cycle MUST perform asynchronous health checks (HTTP HEAD) on all links. Broken links (404) should be excluded or flagged as offline to maintain the library's technical integrity.
16. **Resilient Link Health**: Every V2 generation cycle MUST perform asynchronous health checks. The bot MUST use identity rotation (User-Agents) and multiple attempts (3x) with backoff to minimize false negatives. Only definitive **404 Not Found** errors lead to removal; other failures (timeouts, 403s) result in the link being preserved but flagged as `[OFFLINE?]` to ensure maximum technical preservation. GitHub and 'Foundational' resources are exempt from removal based on health checks.
## 🛠️ Structural Evolution & Navigation
...

View File

@@ -115,40 +115,82 @@ class V2VisionEngine:
async def _verify_link_health(self, links: List[Dict]) -> List[Dict]:
online_links = []
BATCH_SIZE = 100
async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
BATCH_SIZE = 50 # Smaller batches for stability
# User-Agent rotation to mimic real browsers
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0"
]
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, verify=False) as client:
for i in range(0, len(links), BATCH_SIZE):
batch = links[i:i+BATCH_SIZE]
tasks = [self._check_single_link(client, l) for l in batch]
tasks = []
for l in batch:
ua = user_agents[i % len(user_agents)]
tasks.append(self._check_single_link_resilient(client, l, ua))
results = await asyncio.gather(*tasks)
online_links.extend([r for r in results if r is not None])
if i % 500 == 0:
log_event(f" [Health] Verified {i}/{len(links)} links...")
log_event(f" [Resilient Health] Verified {i}/{len(links)} links...")
# Brief pause to avoid triggering Rate Limits
await asyncio.sleep(0.1)
return online_links
async def _check_single_link(self, client, link: Dict) -> Dict:
async def _check_single_link_resilient(self, client, link: Dict, ua: str, attempts: int = 3) -> Dict:
url = link["url"]
# Skip health check if cached as healthy recently (simple heuristic)
if url in self.cache and self.cache[url].get("status") == "online":
# 1. Immediate Pass for Trusted / Logic-Enriched Domains
if "github.com" in url or "awesome" in link["title"].lower():
link["health_status"] = "trusted"
return link
try:
resp = await client.head(url)
if resp.status_code < 400:
self.cache.setdefault(url, {})["status"] = "online"
return link
# Fallback to GET for some servers that block HEAD
resp = await client.get(url)
if resp.status_code < 400:
self.cache.setdefault(url, {})["status"] = "online"
return link
except: pass
# If it was foundational, keep even if down (maybe temporary)
if "awesome" in link["title"].lower():
# 2. Cached Health
if url in self.cache and self.cache[url].get("status") == "online":
link["health_status"] = "cached"
return link
# 3. Multi-Attempt Verification with Identity Rotation
headers = {
"User-Agent": ua,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Referer": "https://www.google.com/"
}
for attempt in range(attempts):
try:
# Use GET instead of HEAD as many sites block HEAD or return 405
resp = await client.get(url, headers=headers, timeout=10.0)
if resp.status_code < 400:
self.cache.setdefault(url, {})["status"] = "online"
link["health_status"] = "online"
return link
# If 404, it's a definitive fail
if resp.status_code == 404:
log_event(f" [Health] Definitive 404: {url}")
return None
except Exception as e:
if attempt == attempts - 1:
# Final attempt failed - Soft Flagging instead of removal
# If it's not a 404, we keep it but with a warning
link["health_status"] = "uncertain"
link["warning"] = "offline"
return link
return None
# Backoff before retry
await asyncio.sleep(0.5 * (attempt + 1))
return link
async def _evaluate_and_score_resources(self, links: List[Dict]) -> List[Dict]:
refined = []