feat: integrate official YouTube Data API v3 for robust metadata extraction

2026-07-13 02:10:15 +00:00 · 2026-05-22 20:07:50 +02:00
parent 7beba344d6
commit 69eeca361b
5 changed files with 43 additions and 28 deletions
--- a/.github/workflows/agentic_cron.yml
+++ b/.github/workflows/agentic_cron.yml
@@ -123,8 +123,10 @@ jobs:
          TWITTER_PASSWORD: ${{ secrets.TWITTER_PASSWORD }}
          TWITTER_COOKIES: ${{ secrets.TWITTER_COOKIES }}
          GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }}
-          GEMINI_API_KEY_2: ${{ secrets.GEMINI_API_KEY_2 }}
+          GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }}
+          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
          ACTIVATE_BACKUP_KEY: ${{ github.event.inputs.activate_backup_key || 'false' }}
          EXTRACTION_STRATEGY: ${{ github.event.inputs.extraction_strategy || 'search' }}
          HISTORICAL_MODE: ${{ github.event.inputs.historical_mode || 'false' }}
--- a/.github/workflows/agentic_v2_ai.yml
+++ b/.github/workflows/agentic_v2_ai.yml
@@ -52,6 +52,7 @@ jobs:
      - name: Run V2 AI Curator
        env:
          GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }}
+          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          FORCE_EVAL: ${{ github.event.inputs.force_reevaluate || 'false' }}
          PYTHONPATH: ${{ github.workspace }}
--- a/.github/workflows/agentic_v2_videos.yml
+++ b/.github/workflows/agentic_v2_videos.yml
@@ -62,6 +62,7 @@ jobs:
        env:
          PYTHONPATH: ${{ github.workspace }}
          GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }}
+          YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
          FORCE_ENRICH: ${{ github.event.inputs.force_enrich || 'false' }}
        run: |
          python src/enrich_videos.py
--- a/src/config.py
+++ b/src/config.py
@@ -32,6 +32,7 @@ if GEMINI_API_KEY and not os.getenv("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY

 GH_TOKEN = os.getenv("GH_TOKEN")
+YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")

 # Gemini Configuration (Dynamic Discovery Enabled)
 GEMINI_API_VERSION = "v1beta"
--- a/src/gemini_utils.py
+++ b/src/gemini_utils.py
@@ -424,23 +424,43 @@ async def call_gemini_with_retry(prompt: str, response_format: str = "json", max

 async def fetch_youtube_metadata(url: str) -> Optional[Dict]:
    """
-    Fetches high-fidelity basic metadata (title, description) and optionally transcripts
-    from a YouTube page using robust third-party libraries (yt-dlp).
-    Used for pre-enriching AI prompts with real content data.
+    Fetches high-fidelity basic metadata (title, description) from a YouTube page.
+    Prioritizes Official YouTube Data API v3 if YOUTUBE_API_KEY is available.
+    Fallbacks to yt-dlp and eventually standard fetch.
    """
+    from src.config import YOUTUBE_API_KEY
+    
+    # Extract Video ID
+    vid = None
+    if "/embed/" in url: vid = url.split("/embed/")[-1].split("?")[0]
+    elif "youtu.be/" in url: vid = url.split("youtu.be/")[-1].split("?")[0]
+    elif "v=" in url: vid = url.split("v=")[-1].split("&")[0]
+    
+    if not vid: return None
+
+    # STRATEGY 1: Official YouTube Data API v3 (Guaranteed success)
+    if YOUTUBE_API_KEY:
+        try:
+            api_url = f"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={vid}&key={YOUTUBE_API_KEY}"
+            async with httpx.AsyncClient() as client:
+                resp = await client.get(api_url, timeout=10.0)
+                if resp.status_code == 200:
+                    data = resp.json()
+                    if data.get("items"):
+                        snippet = data["items"][0]["snippet"]
+                        log_event(f"    [YT-API] Success for {vid}: {snippet.get('title')}")
+                        return {
+                            "raw_title": snippet.get("title", "").strip(),
+                            "raw_description": snippet.get("description", "").strip()[:3000]
+                        }
+        except Exception as e:
+            log_event(f"    [YT-API] Failed for {vid}: {e}")
+
+    # STRATEGY 2: Robust Extraction (yt-dlp)
    try:
        import yt_dlp
        from youtube_transcript_api import YouTubeTranscriptApi
        
-        # Convert embed/short URLs to standard watch URLs
-        clean_url = url.split("?")[0].split("&")[0]
-        if "/embed/" in clean_url:
-            vid = clean_url.split("/embed/")[-1]
-        elif "youtu.be/" in clean_url:
-            vid = clean_url.split("youtu.be/")[-1]
-        else:
-            vid = clean_url.split("v=")[-1].split("&")[0]
-
        ydl_opts = {
            'quiet': True,
            'skip_download': True,
@@ -460,29 +480,19 @@ async def fetch_youtube_metadata(url: str) -> Optional[Dict]:
            title = info.get('title', 'YouTube Video')
            description = info.get('description', '')

-        # Attempt to get transcript for even higher fidelity
+        # Attempt to get transcript
        transcript_text = ""
        try:
            transcript = YouTubeTranscriptApi.get_transcript(vid, languages=['en', 'es'])
-            transcript_text = " ".join([t['text'] for t in transcript[:100]]) # Limit to first 100 segments
-        except:
-            pass
+            transcript_text = " ".join([t['text'] for t in transcript[:100]])
+        except: pass

        full_description = f"{description}\n\n[Transcript Snippet]: {transcript_text}" if transcript_text else description

        return {
            "raw_title": title.strip(),
-            "raw_description": full_description.strip()[:3000] # Limit size for AI context
+            "raw_description": full_description.strip()[:3000]
        }
    except Exception as e:
-        log_event(f"    [!] Robust YouTube metadata fetch failed for {url}: {e}. Falling back to standard fetch...")
-        # Fallback to the old simple httpx fetch if yt-dlp is not available or fails
-        try:
-            async with httpx.AsyncClient(follow_redirects=True, timeout=10.0) as client:
-                resp = await client.get(url)
-                if resp.status_code == 200:
-                    title_match = re.search(r'<title>(.*?)</title>', resp.text)
-                    return {"raw_title": title_match.group(1) if title_match else "YouTube Video", "raw_description": ""}
-        except:
-            pass
+        log_event(f"    [!] Robust fetch failed for {url}: {e}")
        return None