feat: enhance V2 optimizer with dynamic maturity tags and real github metadata dates

This commit is contained in:
Nubenetes Bot
2026-05-15 17:36:50 +02:00
parent 9bc876266e
commit 5ab4e53901

View File

@@ -34,16 +34,12 @@ class V2VisionEngine:
self.library_criteria = (
"You are a Technical Librarian in 2026. Your mission is to build a high-density, professional reference library.\n"
"PHASE 1: TECHNICAL PRESERVATION (HIGH INCLUSIVITY)\n"
"- KEEP >90% of technical resources. Only discard 404s, obvious spam, or non-technical content.\n"
"- 'Awesome' repositories, official documentation, and deep technical guides are mandatory.\n"
"- YouTube videos are HIGH-VALUE resources; keep them as technical references.\n\n"
"PHASE 2: TEMPORAL & QUALITY SYNTHESIS\n"
"- Identify/estimate PUBLICATION YEAR.\n"
"- Assign QUALITY level (1-3 stars):\n"
" * 3 stars (🌟🌟🌟): Masterpieces, foundational standards, definitive 'Awesome' lists.\n"
" * 2 stars (🌟🌟): Production-grade tools, deep tutorials, highly recommended videos.\n"
" * 1 star (🌟): Solid technical references.\n"
"- Identify if a resource is a 'YouTube Video/Playlist' for special rendering.\n"
"- KEEP >90% of technical resources.\n"
"PHASE 2: SOPHISTICATED SYNTHESIS & DATING\n"
"- Extract precise PUBLICATION YEAR: Look for dates in the URL (e.g., /2023/05/ post dates), Twitter/X post dates, or text context. Return 'N/A' if truly unknown, do NOT guess '2024'.\n"
"- Assign QUALITY level (1-3 stars).\n"
"- Assign a MATURITY TAG based on content type/status: '[ENTERPRISE-STABLE]', '[EMERGING / INNOVATION]', '[ARCHITECTURE-GUIDE]', '[TOOLING]', '[CASE-STUDY]', or '[CHEATSHEET]'.\n"
" * Note: We will override tags for GitHub repos using real API data (Stars/Commits), so focus on classifying blogs and articles correctly.\n"
)
self.cache = self._load_cache()
@@ -93,7 +89,7 @@ class V2VisionEngine:
idx_content = f.read()
# Find the BIG mosaic (the one with many images)
# Support both old <center> and new <div style="text-align: center;" markdown="1">
mosaics = re.findall(r'<(?:div style="text-align: center;" markdown="1"|center)>\s*(.*?)\s*</(?:div|center)>', idx_content, re.DOTALL)
mosaics = re.findall(r'<(?:div style="text-align: center;" markdown="1"|center markdown="1"|center)>\s*(.*?)\s*</(?:div|center)>', idx_content, re.DOTALL)
if mosaics:
# Filter for the one containing many image links
for m in mosaics:
@@ -213,19 +209,28 @@ class V2VisionEngine:
refined = []
to_evaluate = []
# Pull from cache first
# We want to re-evaluate the tags and years, so we will bypass cache for tagging logic,
# but use cache for AI stars if available to save cost.
for l in links:
url = l["url"]
if url in self.cache and "year" in self.cache[url]:
item = l.copy()
# To allow the new logic to apply to cached items, we re-process GitHub links
# and re-apply the tag logic even if it's in the cache.
item = l.copy()
if url in self.cache and "stars" in self.cache[url]:
item.update(self.cache[url])
# Refresh GitHub metadata if it's a GH link
if "github.com" in url:
gh_meta = await self._fetch_github_metadata(url)
item.update(gh_meta)
refined.append(item)
else:
to_evaluate.append(l)
to_evaluate.append(item)
continue # process later via API
# Re-apply GitHub metadata and mature tagging for cached items
if "github.com" in url:
gh_meta = await self._fetch_github_metadata(url)
item.update(gh_meta)
if "gh_updated" in gh_meta and gh_meta["gh_updated"]:
item["year"] = gh_meta["gh_updated"].split("-")[0]
item["tag"] = self._calculate_tag(item)
refined.append(item)
if not to_evaluate: return refined
@@ -237,7 +242,7 @@ class V2VisionEngine:
prompt = (
f"{self.library_criteria}\n"
"Respond ONLY with a JSON object: {\"results\": [{\"idx\": int, \"year\": \"YYYY\", \"stars\": int, \"is_video\": bool}, ...]}\n\n"
"Respond ONLY with a JSON object: {\"results\": [{\"idx\": int, \"year\": \"YYYY\", \"stars\": int, \"is_video\": bool, \"tag\": \"[TAG]\"}, ...]}\n\n"
"LINKS:\n" + "\n".join([f"{idx}. {l['title']} ({l['url']})" for idx, l in enumerate(batch)])
)
@@ -251,35 +256,61 @@ class V2VisionEngine:
if idx < len(batch):
item = batch[idx].copy()
eval_data = {
"year": str(res.get("year", "2024")),
"year": str(res.get("year", "N/A")),
"stars": min(max(int(res.get("stars", 1)), 1), 3),
"is_video": res.get("is_video", False)
"is_video": res.get("is_video", False),
"tag": res.get("tag", "[ENTERPRISE-STABLE]")
}
item.update(eval_data)
# GitHub overrides
if "github.com" in item["url"]:
gh_meta = await self._fetch_github_metadata(item["url"])
item.update(gh_meta)
eval_data.update(gh_meta)
if "gh_updated" in gh_meta and gh_meta["gh_updated"]:
item["year"] = gh_meta["gh_updated"].split("-")[0]
eval_data["year"] = item["year"]
item["tag"] = self._calculate_tag(item)
eval_data["tag"] = item["tag"]
# Save to cache
self.cache[item["url"]] = eval_data
if item["year"].isdigit() and int(item["year"]) >= 2025: item["tag"] = "[CUTTING-EDGE]"
elif "awesome" in item["title"].lower(): item["tag"] = "[FOUNDATIONAL]"
else: item["tag"] = "[PRODUCTION-READY]"
refined.append(item)
except: continue
except:
for l in batch:
item = l.copy()
item["year"], item["stars"], item["is_video"] = "2024", 1, "youtube" in l["url"]
item["tag"] = "[FOUNDATIONAL]" if "awesome" in l["title"].lower() else "[PRODUCTION-READY]"
item["year"], item["stars"], item["is_video"] = "N/A", 1, "youtube" in l["url"]
item["tag"] = self._calculate_tag(item)
refined.append(item)
await asyncio.sleep(0.3)
return refined
def _calculate_tag(self, item: Dict) -> str:
# Dynamic Tagging Strategy based on Maturity and Real Data
if "github.com" in item["url"] and "gh_stars" in item:
stars = item["gh_stars"]
year = int(item.get("year")) if item.get("year", "").isdigit() else 2024
if stars > 10000: return "[DE FACTO STANDARD]"
if stars > 500 and year >= 2024: return "[ENTERPRISE-STABLE]"
if year >= 2025: return "[EMERGING / INNOVATION]"
if year <= 2022: return "[LEGACY / MAINTENANCE]"
return "[TOOLING]"
# Fallback to AI's tag or defaults for articles
tag = item.get("tag", "").upper()
valid_tags = ["[DE FACTO STANDARD]", "[ENTERPRISE-STABLE]", "[EMERGING / INNOVATION]", "[LEGACY / MAINTENANCE]", "[ARCHITECTURE-GUIDE]", "[TOOLING]", "[CASE-STUDY]", "[CHEATSHEET]"]
if tag in valid_tags:
return tag
# Basic inference for articles
title = item.get("title", "").lower()
if "awesome" in title: return "[FOUNDATIONAL]"
if "guide" in title or "architecture" in title: return "[ARCHITECTURE-GUIDE]"
if "how to" in title or "tutorial" in title: return "[CASE-STUDY]"
return "[ENTERPRISE-STABLE]"
async def _fetch_github_metadata(self, url: str) -> Dict:
match = re.search(r'github\.com/([^/]+)/([^/]+)', url)
if not match: return {}
@@ -335,7 +366,7 @@ class V2VisionEngine:
for dim in data.values():
for cat_links in dim["categories"].values():
master_selection.extend([l for l in cat_links if l.get("stars", 1) == 3])
master_selection.sort(key=lambda x: (x.get("year", "0"), x["title"]))
master_selection.sort(key=lambda x: (x.get("year", "0"), x["title"]), reverse=True)
index_md = (
"# Nubenetes V2 | The High-Density Library (2026)\n\n"
@@ -343,7 +374,16 @@ class V2VisionEngine:
"!!! quote \"The Library of 2026\"\n"
" A meticulously curated reference of over 15,000 resources. This V2 portal preserves technical depth while providing "
" chronological clarity and expert quality synthesis.\n\n"
f"<center>\n{mosaic_html}\n</center>\n\n"
f"<center markdown=\"1\">\n{mosaic_html}\n</center>\n\n"
"## 🛡️ V2 Taxonomy & Maturity Tags\n"
"To maximize technical clarity, V2 resources are classified by maturity rather than subjective quality:\n\n"
"- <span class='md-tag md-tag--success'>[DE FACTO STANDARD]</span>: Foundational industry tools with massive adoption (>10k GitHub stars).\n"
"- <span class='md-tag md-tag--info'>[ENTERPRISE-STABLE]</span>: Production-ready tools actively maintained.\n"
"- <span class='md-tag md-tag--warning'>[EMERGING / INNOVATION]</span>: High-growth technologies released or heavily updated recently (≥2025).\n"
"- <span class='md-tag md-tag--critical'>[LEGACY / MAINTENANCE]</span>: Proven solutions with no major updates since 2022. Use with caution.\n"
"- <span class='md-tag md-tag--primary'>[ARCHITECTURE-GUIDE]</span> / <span class='md-tag md-tag--primary'>[CASE-STUDY]</span>: High-value reading material and use cases.\n\n"
"## 🌟 Master Selection (Top-Tier Gems)\n"
"A global selection of the most impactful resources across all dimensions.\n\n"
)
@@ -352,7 +392,7 @@ class V2VisionEngine:
index_md += f"- **({l['year']})** [{l['title']}]({l['url']}){gh_info} 🌟🌟🌟\n"
index_md += "\n??? note \"Elite Video Selection - Click to expand!\"\n"
index_md += f" <center>\n{videos_html}\n </center>\n\n"
index_md += f" <center markdown=\"1\">\n{videos_html}\n </center>\n\n"
index_md += "## Strategic Dimensions\n"
for dim, content in data.items():
@@ -371,8 +411,15 @@ class V2VisionEngine:
md += f"## {cat}\n"
for l in links:
year, stars = l.get("year", "N/A"), "🌟" * l.get("stars", 1)
tag = l.get("tag", "[PRODUCTION-READY]")
color = "success" if "FOUNDATIONAL" in tag else "info" if "PRODUCTION" in tag else "warning"
tag = l.get("tag", "[ENTERPRISE-STABLE]")
# Determine color mapping for new tags
if "STANDARD" in tag or "FOUNDATIONAL" in tag: color = "success"
elif "EMERGING" in tag: color = "warning"
elif "LEGACY" in tag: color = "critical"
elif "STABLE" in tag: color = "info"
else: color = "primary"
title_display = f"**{l['title']}**" if l.get("stars", 1) >= 2 else l['title']
gh_info = f" <span class='md-tag md-tag--info'>⭐ {l['gh_stars']}</span>" if "gh_stars" in l else ""