From 5ab4e539013b4e4e6ccc58370e880504999da1d6 Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Fri, 15 May 2026 17:36:50 +0200 Subject: [PATCH] feat: enhance V2 optimizer with dynamic maturity tags and real github metadata dates --- src/v2_optimizer.py | 119 ++++++++++++++++++++++++++++++-------------- 1 file changed, 83 insertions(+), 36 deletions(-) diff --git a/src/v2_optimizer.py b/src/v2_optimizer.py index 4688ec86..0aa148f1 100644 --- a/src/v2_optimizer.py +++ b/src/v2_optimizer.py @@ -34,16 +34,12 @@ class V2VisionEngine: self.library_criteria = ( "You are a Technical Librarian in 2026. Your mission is to build a high-density, professional reference library.\n" "PHASE 1: TECHNICAL PRESERVATION (HIGH INCLUSIVITY)\n" - "- KEEP >90% of technical resources. Only discard 404s, obvious spam, or non-technical content.\n" - "- 'Awesome' repositories, official documentation, and deep technical guides are mandatory.\n" - "- YouTube videos are HIGH-VALUE resources; keep them as technical references.\n\n" - "PHASE 2: TEMPORAL & QUALITY SYNTHESIS\n" - "- Identify/estimate PUBLICATION YEAR.\n" - "- Assign QUALITY level (1-3 stars):\n" - " * 3 stars (🌟🌟🌟): Masterpieces, foundational standards, definitive 'Awesome' lists.\n" - " * 2 stars (🌟🌟): Production-grade tools, deep tutorials, highly recommended videos.\n" - " * 1 star (🌟): Solid technical references.\n" - "- Identify if a resource is a 'YouTube Video/Playlist' for special rendering.\n" + "- KEEP >90% of technical resources.\n" + "PHASE 2: SOPHISTICATED SYNTHESIS & DATING\n" + "- Extract precise PUBLICATION YEAR: Look for dates in the URL (e.g., /2023/05/ post dates), Twitter/X post dates, or text context. Return 'N/A' if truly unknown, do NOT guess '2024'.\n" + "- Assign QUALITY level (1-3 stars).\n" + "- Assign a MATURITY TAG based on content type/status: '[ENTERPRISE-STABLE]', '[EMERGING / INNOVATION]', '[ARCHITECTURE-GUIDE]', '[TOOLING]', '[CASE-STUDY]', or '[CHEATSHEET]'.\n" + " * Note: We will override tags for GitHub repos using real API data (Stars/Commits), so focus on classifying blogs and articles correctly.\n" ) self.cache = self._load_cache() @@ -93,7 +89,7 @@ class V2VisionEngine: idx_content = f.read() # Find the BIG mosaic (the one with many images) # Support both old
and new
- mosaics = re.findall(r'<(?:div style="text-align: center;" markdown="1"|center)>\s*(.*?)\s*', idx_content, re.DOTALL) + mosaics = re.findall(r'<(?:div style="text-align: center;" markdown="1"|center markdown="1"|center)>\s*(.*?)\s*', idx_content, re.DOTALL) if mosaics: # Filter for the one containing many image links for m in mosaics: @@ -213,19 +209,28 @@ class V2VisionEngine: refined = [] to_evaluate = [] - # Pull from cache first + # We want to re-evaluate the tags and years, so we will bypass cache for tagging logic, + # but use cache for AI stars if available to save cost. for l in links: url = l["url"] - if url in self.cache and "year" in self.cache[url]: - item = l.copy() + # To allow the new logic to apply to cached items, we re-process GitHub links + # and re-apply the tag logic even if it's in the cache. + item = l.copy() + if url in self.cache and "stars" in self.cache[url]: item.update(self.cache[url]) - # Refresh GitHub metadata if it's a GH link - if "github.com" in url: - gh_meta = await self._fetch_github_metadata(url) - item.update(gh_meta) - refined.append(item) else: - to_evaluate.append(l) + to_evaluate.append(item) + continue # process later via API + + # Re-apply GitHub metadata and mature tagging for cached items + if "github.com" in url: + gh_meta = await self._fetch_github_metadata(url) + item.update(gh_meta) + if "gh_updated" in gh_meta and gh_meta["gh_updated"]: + item["year"] = gh_meta["gh_updated"].split("-")[0] + + item["tag"] = self._calculate_tag(item) + refined.append(item) if not to_evaluate: return refined @@ -237,7 +242,7 @@ class V2VisionEngine: prompt = ( f"{self.library_criteria}\n" - "Respond ONLY with a JSON object: {\"results\": [{\"idx\": int, \"year\": \"YYYY\", \"stars\": int, \"is_video\": bool}, ...]}\n\n" + "Respond ONLY with a JSON object: {\"results\": [{\"idx\": int, \"year\": \"YYYY\", \"stars\": int, \"is_video\": bool, \"tag\": \"[TAG]\"}, ...]}\n\n" "LINKS:\n" + "\n".join([f"{idx}. {l['title']} ({l['url']})" for idx, l in enumerate(batch)]) ) @@ -251,35 +256,61 @@ class V2VisionEngine: if idx < len(batch): item = batch[idx].copy() eval_data = { - "year": str(res.get("year", "2024")), + "year": str(res.get("year", "N/A")), "stars": min(max(int(res.get("stars", 1)), 1), 3), - "is_video": res.get("is_video", False) + "is_video": res.get("is_video", False), + "tag": res.get("tag", "[ENTERPRISE-STABLE]") } item.update(eval_data) + # GitHub overrides if "github.com" in item["url"]: gh_meta = await self._fetch_github_metadata(item["url"]) item.update(gh_meta) - eval_data.update(gh_meta) + if "gh_updated" in gh_meta and gh_meta["gh_updated"]: + item["year"] = gh_meta["gh_updated"].split("-")[0] + eval_data["year"] = item["year"] + + item["tag"] = self._calculate_tag(item) + eval_data["tag"] = item["tag"] # Save to cache self.cache[item["url"]] = eval_data - - if item["year"].isdigit() and int(item["year"]) >= 2025: item["tag"] = "[CUTTING-EDGE]" - elif "awesome" in item["title"].lower(): item["tag"] = "[FOUNDATIONAL]" - else: item["tag"] = "[PRODUCTION-READY]" - refined.append(item) except: continue except: for l in batch: item = l.copy() - item["year"], item["stars"], item["is_video"] = "2024", 1, "youtube" in l["url"] - item["tag"] = "[FOUNDATIONAL]" if "awesome" in l["title"].lower() else "[PRODUCTION-READY]" + item["year"], item["stars"], item["is_video"] = "N/A", 1, "youtube" in l["url"] + item["tag"] = self._calculate_tag(item) refined.append(item) await asyncio.sleep(0.3) return refined + def _calculate_tag(self, item: Dict) -> str: + # Dynamic Tagging Strategy based on Maturity and Real Data + if "github.com" in item["url"] and "gh_stars" in item: + stars = item["gh_stars"] + year = int(item.get("year")) if item.get("year", "").isdigit() else 2024 + if stars > 10000: return "[DE FACTO STANDARD]" + if stars > 500 and year >= 2024: return "[ENTERPRISE-STABLE]" + if year >= 2025: return "[EMERGING / INNOVATION]" + if year <= 2022: return "[LEGACY / MAINTENANCE]" + return "[TOOLING]" + + # Fallback to AI's tag or defaults for articles + tag = item.get("tag", "").upper() + valid_tags = ["[DE FACTO STANDARD]", "[ENTERPRISE-STABLE]", "[EMERGING / INNOVATION]", "[LEGACY / MAINTENANCE]", "[ARCHITECTURE-GUIDE]", "[TOOLING]", "[CASE-STUDY]", "[CHEATSHEET]"] + if tag in valid_tags: + return tag + + # Basic inference for articles + title = item.get("title", "").lower() + if "awesome" in title: return "[FOUNDATIONAL]" + if "guide" in title or "architecture" in title: return "[ARCHITECTURE-GUIDE]" + if "how to" in title or "tutorial" in title: return "[CASE-STUDY]" + return "[ENTERPRISE-STABLE]" + async def _fetch_github_metadata(self, url: str) -> Dict: match = re.search(r'github\.com/([^/]+)/([^/]+)', url) if not match: return {} @@ -335,7 +366,7 @@ class V2VisionEngine: for dim in data.values(): for cat_links in dim["categories"].values(): master_selection.extend([l for l in cat_links if l.get("stars", 1) == 3]) - master_selection.sort(key=lambda x: (x.get("year", "0"), x["title"])) + master_selection.sort(key=lambda x: (x.get("year", "0"), x["title"]), reverse=True) index_md = ( "# Nubenetes V2 | The High-Density Library (2026)\n\n" @@ -343,7 +374,16 @@ class V2VisionEngine: "!!! quote \"The Library of 2026\"\n" " A meticulously curated reference of over 15,000 resources. This V2 portal preserves technical depth while providing " " chronological clarity and expert quality synthesis.\n\n" - f"
\n{mosaic_html}\n
\n\n" + f"
\n{mosaic_html}\n
\n\n" + + "## 🛡️ V2 Taxonomy & Maturity Tags\n" + "To maximize technical clarity, V2 resources are classified by maturity rather than subjective quality:\n\n" + "- [DE FACTO STANDARD]: Foundational industry tools with massive adoption (>10k GitHub stars).\n" + "- [ENTERPRISE-STABLE]: Production-ready tools actively maintained.\n" + "- [EMERGING / INNOVATION]: High-growth technologies released or heavily updated recently (≥2025).\n" + "- [LEGACY / MAINTENANCE]: Proven solutions with no major updates since 2022. Use with caution.\n" + "- [ARCHITECTURE-GUIDE] / [CASE-STUDY]: High-value reading material and use cases.\n\n" + "## 🌟 Master Selection (Top-Tier Gems)\n" "A global selection of the most impactful resources across all dimensions.\n\n" ) @@ -352,7 +392,7 @@ class V2VisionEngine: index_md += f"- **({l['year']})** [{l['title']}]({l['url']}){gh_info} 🌟🌟🌟\n" index_md += "\n??? note \"Elite Video Selection - Click to expand!\"\n" - index_md += f"
\n{videos_html}\n
\n\n" + index_md += f"
\n{videos_html}\n
\n\n" index_md += "## Strategic Dimensions\n" for dim, content in data.items(): @@ -371,8 +411,15 @@ class V2VisionEngine: md += f"## {cat}\n" for l in links: year, stars = l.get("year", "N/A"), "🌟" * l.get("stars", 1) - tag = l.get("tag", "[PRODUCTION-READY]") - color = "success" if "FOUNDATIONAL" in tag else "info" if "PRODUCTION" in tag else "warning" + tag = l.get("tag", "[ENTERPRISE-STABLE]") + + # Determine color mapping for new tags + if "STANDARD" in tag or "FOUNDATIONAL" in tag: color = "success" + elif "EMERGING" in tag: color = "warning" + elif "LEGACY" in tag: color = "critical" + elif "STABLE" in tag: color = "info" + else: color = "primary" + title_display = f"**{l['title']}**" if l.get("stars", 1) >= 2 else l['title'] gh_info = f" ⭐ {l['gh_stars']}" if "gh_stars" in l else ""