From 69eeca361bb70f178c4e664276e48a045e39cbca Mon Sep 17 00:00:00 2001
From: Nubenetes Bot <bot@nubenetes.com>
Date: Fri, 22 May 2026 20:07:50 +0200
Subject: [PATCH] feat: integrate official YouTube Data API v3 for robust
 metadata extraction

---
 .github/workflows/agentic_cron.yml      |  2 +
 .github/workflows/agentic_v2_ai.yml     |  1 +
 .github/workflows/agentic_v2_videos.yml |  1 +
 src/config.py                           |  1 +
 src/gemini_utils.py                     | 64 ++++++++++++++-----------
 5 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/agentic_cron.yml b/.github/workflows/agentic_cron.yml
index 652d6169..30da51b9 100644
--- a/.github/workflows/agentic_cron.yml
+++ b/.github/workflows/agentic_cron.yml
@@ -123,8 +123,10 @@ jobs:
           TWITTER_PASSWORD: ${{ secrets.TWITTER_PASSWORD }}
           TWITTER_COOKIES: ${{ secrets.TWITTER_COOKIES }}
           GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }}
           GEMINI_API_KEY_2: ${{ secrets.GEMINI_API_KEY_2 }}
+          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
           ACTIVATE_BACKUP_KEY: ${{ github.event.inputs.activate_backup_key || 'false' }}
           EXTRACTION_STRATEGY: ${{ github.event.inputs.extraction_strategy || 'search' }}
           HISTORICAL_MODE: ${{ github.event.inputs.historical_mode || 'false' }}
diff --git a/.github/workflows/agentic_v2_ai.yml b/.github/workflows/agentic_v2_ai.yml
index cdae2819..e0eaa4f6 100644
--- a/.github/workflows/agentic_v2_ai.yml
+++ b/.github/workflows/agentic_v2_ai.yml
@@ -52,6 +52,7 @@ jobs:
       - name: Run V2 AI Curator
         env:
           GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }}
+          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           FORCE_EVAL: ${{ github.event.inputs.force_reevaluate || 'false' }}
           PYTHONPATH: ${{ github.workspace }}
diff --git a/.github/workflows/agentic_v2_videos.yml b/.github/workflows/agentic_v2_videos.yml
index 7f908f49..fc3f366e 100644
--- a/.github/workflows/agentic_v2_videos.yml
+++ b/.github/workflows/agentic_v2_videos.yml
@@ -62,6 +62,7 @@ jobs:
         env:
           PYTHONPATH: ${{ github.workspace }}
           GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }}
+          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
           FORCE_ENRICH: ${{ github.event.inputs.force_enrich || 'false' }}
         run: |
           python src/enrich_videos.py
diff --git a/src/config.py b/src/config.py
index d32b4f0e..ca91b684 100644
--- a/src/config.py
+++ b/src/config.py
@@ -32,6 +32,7 @@ if GEMINI_API_KEY and not os.getenv("GOOGLE_API_KEY"):
     os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY
 
 GH_TOKEN = os.getenv("GH_TOKEN")
+YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
 
 # Gemini Configuration (Dynamic Discovery Enabled)
 GEMINI_API_VERSION = "v1beta"
diff --git a/src/gemini_utils.py b/src/gemini_utils.py
index a48b192e..b8bee158 100644
--- a/src/gemini_utils.py
+++ b/src/gemini_utils.py
@@ -424,23 +424,43 @@ async def call_gemini_with_retry(prompt: str, response_format: str = "json", max
 
 async def fetch_youtube_metadata(url: str) -> Optional[Dict]:
     """
-    Fetches high-fidelity basic metadata (title, description) and optionally transcripts
-    from a YouTube page using robust third-party libraries (yt-dlp).
-    Used for pre-enriching AI prompts with real content data.
+    Fetches high-fidelity basic metadata (title, description) from a YouTube page.
+    Prioritizes Official YouTube Data API v3 if YOUTUBE_API_KEY is available.
+    Falls back to yt-dlp (plus a transcript snippet) when no API key is set or the API lookup does not succeed.
     """
+    from src.config import YOUTUBE_API_KEY
+    
+    # Extract Video ID
+    vid = None
+    if "/embed/" in url: vid = url.split("/embed/")[-1].split("?")[0]
+    elif "youtu.be/" in url: vid = url.split("youtu.be/")[-1].split("?")[0]
+    elif "v=" in url: vid = url.split("v=")[-1].split("&")[0]
+    
+    if not vid: return None
+
+    # STRATEGY 1: Official YouTube Data API v3 (preferred; any failure falls through to yt-dlp)
+    if YOUTUBE_API_KEY:
+        try:
+            api_url = f"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={vid}&key={YOUTUBE_API_KEY}"
+            async with httpx.AsyncClient() as client:
+                resp = await client.get(api_url, timeout=10.0)
+                if resp.status_code == 200:
+                    data = resp.json()
+                    if data.get("items"):
+                        snippet = data["items"][0]["snippet"]
+                        log_event(f"    [YT-API] Success for {vid}: {snippet.get('title')}")
+                        return {
+                            "raw_title": snippet.get("title", "").strip(),
+                            "raw_description": snippet.get("description", "").strip()[:3000]
+                        }
+        except Exception as e:
+            log_event(f"    [YT-API] Failed for {vid}: {e}")
+
+    # STRATEGY 2: Robust Extraction (yt-dlp)
     try:
         import yt_dlp
         from youtube_transcript_api import YouTubeTranscriptApi
         
-        # Convert embed/short URLs to standard watch URLs
-        clean_url = url.split("?")[0].split("&")[0]
-        if "/embed/" in clean_url:
-            vid = clean_url.split("/embed/")[-1]
-        elif "youtu.be/" in clean_url:
-            vid = clean_url.split("youtu.be/")[-1]
-        else:
-            vid = clean_url.split("v=")[-1].split("&")[0]
-
         ydl_opts = {
             'quiet': True,
             'skip_download': True,
@@ -460,29 +480,19 @@ async def fetch_youtube_metadata(url: str) -> Optional[Dict]:
             title = info.get('title', 'YouTube Video')
             description = info.get('description', '')
 
-        # Attempt to get transcript for even higher fidelity
+        # Attempt to get transcript
         transcript_text = ""
         try:
             transcript = YouTubeTranscriptApi.get_transcript(vid, languages=['en', 'es'])
-            transcript_text = " ".join([t['text'] for t in transcript[:100]]) # Limit to first 100 segments
-        except:
-            pass
+            transcript_text = " ".join([t['text'] for t in transcript[:100]])
+        except: pass
 
         full_description = f"{description}\n\n[Transcript Snippet]: {transcript_text}" if transcript_text else description
 
         return {
             "raw_title": title.strip(),
-            "raw_description": full_description.strip()[:3000] # Limit size for AI context
+            "raw_description": full_description.strip()[:3000]
         }
     except Exception as e:
-        log_event(f"    [!] Robust YouTube metadata fetch failed for {url}: {e}. Falling back to standard fetch...")
-        # Fallback to the old simple httpx fetch if yt-dlp is not available or fails
-        try:
-            async with httpx.AsyncClient(follow_redirects=True, timeout=10.0) as client:
-                resp = await client.get(url)
-                if resp.status_code == 200:
-                    title_match = re.search(r'<title>(.*?)</title>', resp.text)
-                    return {"raw_title": title_match.group(1) if title_match else "YouTube Video", "raw_description": ""}
-        except:
-            pass
+        log_event(f"    [!] Robust fetch failed for {url}: {e}")
         return None