From 69eeca361bb70f178c4e664276e48a045e39cbca Mon Sep 17 00:00:00 2001 From: Nubenetes Bot Date: Fri, 22 May 2026 20:07:50 +0200 Subject: [PATCH] feat: integrate official YouTube Data API v3 for robust metadata extraction --- .github/workflows/agentic_cron.yml | 4 +- .github/workflows/agentic_v2_ai.yml | 1 + .github/workflows/agentic_v2_videos.yml | 1 + src/config.py | 1 + src/gemini_utils.py | 64 ++++++++++++++----------- 5 files changed, 43 insertions(+), 28 deletions(-) diff --git a/.github/workflows/agentic_cron.yml b/.github/workflows/agentic_cron.yml index 652d6169..30da51b9 100644 --- a/.github/workflows/agentic_cron.yml +++ b/.github/workflows/agentic_cron.yml @@ -123,8 +123,10 @@ jobs: TWITTER_PASSWORD: ${{ secrets.TWITTER_PASSWORD }} TWITTER_COOKIES: ${{ secrets.TWITTER_COOKIES }} GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }} - GEMINI_API_KEY_2: ${{ secrets.GEMINI_API_KEY_2 }} + GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }} + YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ACTIVATE_BACKUP_KEY: ${{ github.event.inputs.activate_backup_key || 'false' }} EXTRACTION_STRATEGY: ${{ github.event.inputs.extraction_strategy || 'search' }} HISTORICAL_MODE: ${{ github.event.inputs.historical_mode || 'false' }} diff --git a/.github/workflows/agentic_v2_ai.yml b/.github/workflows/agentic_v2_ai.yml index cdae2819..e0eaa4f6 100644 --- a/.github/workflows/agentic_v2_ai.yml +++ b/.github/workflows/agentic_v2_ai.yml @@ -52,6 +52,7 @@ jobs: - name: Run V2 AI Curator env: GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }} + YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} FORCE_EVAL: ${{ github.event.inputs.force_reevaluate || 'false' }} PYTHONPATH: ${{ github.workspace }} diff --git a/.github/workflows/agentic_v2_videos.yml b/.github/workflows/agentic_v2_videos.yml index 7f908f49..fc3f366e 100644 --- a/.github/workflows/agentic_v2_videos.yml +++ b/.github/workflows/agentic_v2_videos.yml @@ -62,6 +62,7 @@ jobs: env: PYTHONPATH: ${{ github.workspace }} GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }} + YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }} FORCE_ENRICH: ${{ github.event.inputs.force_enrich || 'false' }} run: | python src/enrich_videos.py diff --git a/src/config.py b/src/config.py index d32b4f0e..ca91b684 100644 --- a/src/config.py +++ b/src/config.py @@ -32,6 +32,7 @@ if GEMINI_API_KEY and not os.getenv("GOOGLE_API_KEY"): os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY GH_TOKEN = os.getenv("GH_TOKEN") +YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY") # Gemini Configuration (Dynamic Discovery Enabled) GEMINI_API_VERSION = "v1beta" diff --git a/src/gemini_utils.py b/src/gemini_utils.py index a48b192e..b8bee158 100644 --- a/src/gemini_utils.py +++ b/src/gemini_utils.py @@ -424,23 +424,43 @@ async def call_gemini_with_retry(prompt: str, response_format: str = "json", max async def fetch_youtube_metadata(url: str) -> Optional[Dict]: """ - Fetches high-fidelity basic metadata (title, description) and optionally transcripts - from a YouTube page using robust third-party libraries (yt-dlp). - Used for pre-enriching AI prompts with real content data. + Fetches high-fidelity basic metadata (title, description) from a YouTube page. + Prioritizes Official YouTube Data API v3 if YOUTUBE_API_KEY is available. + Fallbacks to yt-dlp and eventually standard fetch. """ + from src.config import YOUTUBE_API_KEY + + # Extract Video ID + vid = None + if "/embed/" in url: vid = url.split("/embed/")[-1].split("?")[0] + elif "youtu.be/" in url: vid = url.split("youtu.be/")[-1].split("?")[0] + elif "v=" in url: vid = url.split("v=")[-1].split("&")[0] + + if not vid: return None + + # STRATEGY 1: Official YouTube Data API v3 (Guaranteed success) + if YOUTUBE_API_KEY: + try: + api_url = f"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={vid}&key={YOUTUBE_API_KEY}" + async with httpx.AsyncClient() as client: + resp = await client.get(api_url, timeout=10.0) + if resp.status_code == 200: + data = resp.json() + if data.get("items"): + snippet = data["items"][0]["snippet"] + log_event(f" [YT-API] Success for {vid}: {snippet.get('title')}") + return { + "raw_title": snippet.get("title", "").strip(), + "raw_description": snippet.get("description", "").strip()[:3000] + } + except Exception as e: + log_event(f" [YT-API] Failed for {vid}: {e}") + + # STRATEGY 2: Robust Extraction (yt-dlp) try: import yt_dlp from youtube_transcript_api import YouTubeTranscriptApi - # Convert embed/short URLs to standard watch URLs - clean_url = url.split("?")[0].split("&")[0] - if "/embed/" in clean_url: - vid = clean_url.split("/embed/")[-1] - elif "youtu.be/" in clean_url: - vid = clean_url.split("youtu.be/")[-1] - else: - vid = clean_url.split("v=")[-1].split("&")[0] - ydl_opts = { 'quiet': True, 'skip_download': True, @@ -460,29 +480,19 @@ async def fetch_youtube_metadata(url: str) -> Optional[Dict]: title = info.get('title', 'YouTube Video') description = info.get('description', '') - # Attempt to get transcript for even higher fidelity + # Attempt to get transcript transcript_text = "" try: transcript = YouTubeTranscriptApi.get_transcript(vid, languages=['en', 'es']) - transcript_text = " ".join([t['text'] for t in transcript[:100]]) # Limit to first 100 segments - except: - pass + transcript_text = " ".join([t['text'] for t in transcript[:100]]) + except: pass full_description = f"{description}\n\n[Transcript Snippet]: {transcript_text}" if transcript_text else description return { "raw_title": title.strip(), - "raw_description": full_description.strip()[:3000] # Limit size for AI context + "raw_description": full_description.strip()[:3000] } except Exception as e: - log_event(f" [!] Robust YouTube metadata fetch failed for {url}: {e}. Falling back to standard fetch...") - # Fallback to the old simple httpx fetch if yt-dlp is not available or fails - try: - async with httpx.AsyncClient(follow_redirects=True, timeout=10.0) as client: - resp = await client.get(url) - if resp.status_code == 200: - title_match = re.search(r'(.*?)', resp.text) - return {"raw_title": title_match.group(1) if title_match else "YouTube Video", "raw_description": ""} - except: - pass + log_event(f" [!] Robust fetch failed for {url}: {e}") return None