fix: resolve V2 optimizer issues (models 404, parsing errors, empty exceptions)

2026-07-28 09:32:20 +00:00 · 2026-05-15 01:08:45 +02:00
parent e48d56a1d5
commit 603ef5d5a5
4 changed files with 71 additions and 39 deletions
--- a/GEMINI.md
+++ b/GEMINI.md
@@ -16,9 +16,10 @@ This file contains the accumulated instructions and long-term vision for the aut
 10. **Official Language (English Only)**: All injected content (titles, descriptions, headers), execution logs, and automated communications (PRs) MUST be exclusively in ENGLISH. Nubenetes is a global resource and linguistic consistency is critical.
 11. **Workflow-Config Synchronization**: The GitHub Actions curation workflow form (`agentic_cron.yml`) MUST remain perfectly synchronized with the curation sources configuration file (`data/curation_sources.yaml`). Any addition, removal, or renaming of topics/categories in the configuration file requires a corresponding update to the workflow's input fields (checkboxes) to ensure users can toggle those sources manually. This maintains consistency between data-driven sources and the UI trigger.
 12. **V2 Elite Maintenance**: The Nubenetes V2 (Agentic Elite) edition is a derived view of the V1 archive. It is managed via the `src/v2_optimizer.py` script and stored in the `v2-docs/` directory. AI agents MUST NOT modify `v2-docs/` directly via standard curation workflows; they must only use the `agentic_v2_builder.yml` workflow to perform the periodic "Elite Selection" process. Standard curation and cleaning workflows must always target the `docs/` directory as the primary source of truth.
+13. **Detailed Logging for V2**: When running the V2 Optimizer, agents MUST use unbuffered logging and detailed output messages. If the optimizer reports zero links kept for a page (log line `Total kept: 0`), the agent MUST investigate the logs to determine whether this was a genuine AI selection result or the consequence of a parsing/API error.

 ## 🛠️ Structural Evolution & Navigation
-
+...
 *   **No Link Limits**: There are NO hard limits on the number of links per page or per section (##/###). Nubenetes is built to host thousands of references.
 *   **TOC Consistency**: Every `.md` page (including the main index `docs/index.md`) MUST maintain an internal Table of Contents (TOC) at the beginning. This TOC must include all sections (##) and subsections (###) nested correctly using a numbered list format with working anchors.
 *   **Relative References & Anchors**:
--- a/src/config.py
+++ b/src/config.py
@@ -27,10 +27,10 @@ GH_TOKEN = os.getenv("GH_TOKEN")
 # Gemini Configuration (May 2026)
 GEMINI_API_VERSION = "v1beta"
 GEMINI_MODELS = [
-    "gemini-2.5-flash-lite",
-    "gemini-2.0-flash",
    "gemini-1.5-flash",
-    "gemini-2.5-flash"
+    "gemini-1.5-pro",
+    "gemini-2.0-flash-exp",
+    "gemini-1.5-flash-latest"
 ]

 TARGET_REPO = "nubenetes/awesome-kubernetes"
--- a/src/gemini_utils.py
+++ b/src/gemini_utils.py
@@ -148,7 +148,10 @@ async def call_gemini_with_retry(prompt: str, response_format: str = "json", max
                            break
                            
                    except Exception as e:
-                        diagnostics.add_attempt(model, 0, f"Exception: {str(e)}")
+                        import traceback
+                        error_msg = f"{type(e).__name__}: {str(e)}"
+                        diagnostics.add_attempt(model, 0, error_msg)
+                        log_event(f"  [!] Model {model} failed with exception: {error_msg}")
                        break
                
                if key_blocked:
--- a/src/v2_optimizer.py
+++ b/src/v2_optimizer.py
@@ -40,47 +40,75 @@ class V2Optimizer:
            # If no links, just copy the structure/headers
            headers = [l for l in content.splitlines() if l.startswith("#")]
            with open(v2_path, "w") as f:
-                f.write("\n".join(headers) + "\n\n*Content coming soon as part of the 2026 Agentic Elite curation.*")
+                v2_header = f"# {filename.replace('.md', '').capitalize()} (Elite Selection)\n\n"
+                v2_header += "!!! info \"Note\"\n    This category is currently under review by our Agentic AI.\n\n"
+                f.write(v2_header + "\n".join(headers))
            return

        formatted_links = []
-        for title, url, desc in links:
-            formatted_links.append(f"- [{title}]({url}) {desc.strip()}")
-
-        log_event(f"[*] V2 Optimizer: Analyzing {len(formatted_links)} links in {filename}")
-
-        prompt = (
-            f"{self.elite_criteria}\n"
-            f"FILE: {filename}\n"
-            f"LINKS TO EVALUATE:\n" + "\n".join(formatted_links[:100]) + "\n\n"
-            "Respond ONLY with a JSON list of indices to KEEP. "
-            "Example: [0, 5, 22]. Remember to ALWAYS keep 'Awesome' repos."
-        )
-
-        try:
-            indices = await call_gemini_with_retry(prompt)
-            if not isinstance(indices, list): indices = []
+        pre_selected_indices = []
+        for i, (title, url, desc) in enumerate(links):
+            link_text = f"[{title}]({url}) {desc.strip()}"
+            formatted_links.append(f"{i}. {link_text}")
            
-            selected_links = [formatted_links[i] for i in indices if i < len(formatted_links)]
-            
-            # Reconstruct V2 file
-            v2_content = f"# {filename.replace('.md', '').capitalize()} (Elite Selection)\n\n"
-            v2_content += "!!! abstract \"2026 Agentic Vision\"\n"
-            v2_content += "    This page contains a curated selection of top-tier resources, strictly filtered by our Agentic AI for high impact and modern relevance.\n\n"
-            
-            if selected_links:
-                v2_content += "## Selected Resources\n"
-                v2_content += "\n".join(selected_links)
-            else:
-                v2_content += "\n*No resources met the elite criteria for this specific category yet.*"
+            # MANDATE: Always keep Awesome lists
+            if "awesome" in title.lower() or "awesome" in url.lower():
+                pre_selected_indices.append(i)

-            with open(v2_path, "w") as f:
-                f.write(v2_content)
+        log_event(f"[*] V2 Optimizer: Analyzing {len(formatted_links)} links in {filename} (Pre-selected Awesome: {len(pre_selected_indices)})")
+
+        # Split into manageable chunks if too many links
+        MAX_LINKS_PER_PROMPT = 150
+        all_selected_indices = set(pre_selected_indices)
+        
+        for chunk_start in range(0, len(formatted_links), MAX_LINKS_PER_PROMPT):
+            chunk = formatted_links[chunk_start:chunk_start + MAX_LINKS_PER_PROMPT]
+            
+            prompt = (
+                f"{self.elite_criteria}\n"
+                f"FILE: {filename}\n"
+                f"LINKS TO EVALUATE (Indices {chunk_start} to {chunk_start + len(chunk) - 1}):\n" + "\n".join(chunk) + "\n\n"
+                "Respond ONLY with a JSON object: {\"keep_indices\": [int, int, ...]}\n"
+                "Example: {\"keep_indices\": [0, 5, 22]}"
+            )
+
+            try:
+                log_event(f"  [>] Requesting AI selection for chunk {chunk_start}...")
+                response_data = await call_gemini_with_retry(prompt)
                
-            log_event(f"  [OK] V2 file generated: {v2_path} ({len(selected_links)} links kept)")
+                # Robust parsing of keep_indices
+                indices = []
+                if isinstance(response_data, dict):
+                    indices = response_data.get("keep_indices", [])
+                elif isinstance(response_data, list):
+                    indices = response_data
+                
+                for idx in indices:
+                    try:
+                        all_selected_indices.add(int(idx))
+                    except: continue
+                    
+            except Exception as e:
+                log_event(f"  [!] AI error on chunk {chunk_start}: {e}")

-        except Exception as e:
-            log_event(f"  [!] Error optimizing {filename} for V2: {e}")
+        # Final reconstruction
+        selected_links = [links[i] for i in sorted(list(all_selected_indices)) if i < len(links)]
+        
+        v2_content = f"# {filename.replace('.md', '').capitalize()} (Elite Selection)\n\n"
+        v2_content += "!!! abstract \"2026 Agentic Vision\"\n"
+        v2_content += "    This page contains a curated selection of top-tier resources, strictly filtered by our Agentic AI for high impact and modern relevance.\n\n"
+        
+        if selected_links:
+            v2_content += "## Selected Resources\n"
+            for title, url, desc in selected_links:
+                v2_content += f"  - [{title}]({url}){desc}\n"
+        else:
+            v2_content += "\n*No resources met the elite criteria for this specific category yet.*"
+
+        with open(v2_path, "w") as f:
+            f.write(v2_content)
+            
+        log_event(f"  [OK] V2 file generated: {v2_path} (Total kept: {len(selected_links)})")

    async def run_full_optimization(self):
        log_event("STARTING V2 AGENTIC OPTIMIZATION (THE ARCHITECT'S CUT)", section_break=True)