feat: integrate official YouTube Data API v3 for robust metadata extraction

This commit is contained in:
Nubenetes Bot
2026-05-22 20:07:50 +02:00
parent 7beba344d6
commit 69eeca361b
5 changed files with 43 additions and 28 deletions

View File

@@ -123,8 +123,10 @@ jobs:
TWITTER_PASSWORD: ${{ secrets.TWITTER_PASSWORD }}
TWITTER_COOKIES: ${{ secrets.TWITTER_COOKIES }}
GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }}
GEMINI_API_KEY_2: ${{ secrets.GEMINI_API_KEY_2 }}
GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }}
YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
ACTIVATE_BACKUP_KEY: ${{ github.event.inputs.activate_backup_key || 'false' }}
EXTRACTION_STRATEGY: ${{ github.event.inputs.extraction_strategy || 'search' }}
HISTORICAL_MODE: ${{ github.event.inputs.historical_mode || 'false' }}

View File

@@ -52,6 +52,7 @@ jobs:
- name: Run V2 AI Curator
env:
GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }}
YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
FORCE_EVAL: ${{ github.event.inputs.force_reevaluate || 'false' }}
PYTHONPATH: ${{ github.workspace }}

View File

@@ -62,6 +62,7 @@ jobs:
env:
PYTHONPATH: ${{ github.workspace }}
GEMINI_API_KEY_1: ${{ secrets.GEMINI_API_KEY_1 }}
YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
FORCE_ENRICH: ${{ github.event.inputs.force_enrich || 'false' }}
run: |
python src/enrich_videos.py

View File

@@ -32,6 +32,7 @@ if GEMINI_API_KEY and not os.getenv("GOOGLE_API_KEY"):
os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY
GH_TOKEN = os.getenv("GH_TOKEN")
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
# Gemini Configuration (Dynamic Discovery Enabled)
GEMINI_API_VERSION = "v1beta"

View File

@@ -424,23 +424,43 @@ async def call_gemini_with_retry(prompt: str, response_format: str = "json", max
async def fetch_youtube_metadata(url: str) -> Optional[Dict]:
"""
Fetches high-fidelity basic metadata (title, description) and optionally transcripts
from a YouTube page using robust third-party libraries (yt-dlp).
Used for pre-enriching AI prompts with real content data.
Fetches high-fidelity basic metadata (title, description) from a YouTube page.
Prioritizes Official YouTube Data API v3 if YOUTUBE_API_KEY is available.
Falls back to yt-dlp and, eventually, a standard fetch.
"""
from src.config import YOUTUBE_API_KEY
# Extract Video ID
vid = None
if "/embed/" in url: vid = url.split("/embed/")[-1].split("?")[0]
elif "youtu.be/" in url: vid = url.split("youtu.be/")[-1].split("?")[0]
elif "v=" in url: vid = url.split("v=")[-1].split("&")[0]
if not vid: return None
# STRATEGY 1: Official YouTube Data API v3 (preferred; only attempted when YOUTUBE_API_KEY is set, errors are logged and the next strategy is tried)
if YOUTUBE_API_KEY:
try:
api_url = f"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={vid}&key={YOUTUBE_API_KEY}"
async with httpx.AsyncClient() as client:
resp = await client.get(api_url, timeout=10.0)
if resp.status_code == 200:
data = resp.json()
if data.get("items"):
snippet = data["items"][0]["snippet"]
log_event(f" [YT-API] Success for {vid}: {snippet.get('title')}")
return {
"raw_title": snippet.get("title", "").strip(),
"raw_description": snippet.get("description", "").strip()[:3000]
}
except Exception as e:
log_event(f" [YT-API] Failed for {vid}: {e}")
# STRATEGY 2: Robust Extraction (yt-dlp)
try:
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi
# Convert embed/short URLs to standard watch URLs
clean_url = url.split("?")[0].split("&")[0]
if "/embed/" in clean_url:
vid = clean_url.split("/embed/")[-1]
elif "youtu.be/" in clean_url:
vid = clean_url.split("youtu.be/")[-1]
else:
vid = clean_url.split("v=")[-1].split("&")[0]
ydl_opts = {
'quiet': True,
'skip_download': True,
@@ -460,29 +480,19 @@ async def fetch_youtube_metadata(url: str) -> Optional[Dict]:
title = info.get('title', 'YouTube Video')
description = info.get('description', '')
# Attempt to get transcript for even higher fidelity
# Attempt to get transcript
transcript_text = ""
try:
transcript = YouTubeTranscriptApi.get_transcript(vid, languages=['en', 'es'])
transcript_text = " ".join([t['text'] for t in transcript[:100]]) # Limit to first 100 segments
except:
pass
transcript_text = " ".join([t['text'] for t in transcript[:100]])
except: pass
full_description = f"{description}\n\n[Transcript Snippet]: {transcript_text}" if transcript_text else description
return {
"raw_title": title.strip(),
"raw_description": full_description.strip()[:3000] # Limit size for AI context
"raw_description": full_description.strip()[:3000]
}
except Exception as e:
log_event(f" [!] Robust YouTube metadata fetch failed for {url}: {e}. Falling back to standard fetch...")
# Fallback to the old simple httpx fetch if yt-dlp is not available or fails
try:
async with httpx.AsyncClient(follow_redirects=True, timeout=10.0) as client:
resp = await client.get(url)
if resp.status_code == 200:
title_match = re.search(r'<title>(.*?)</title>', resp.text)
return {"raw_title": title_match.group(1) if title_match else "YouTube Video", "raw_description": ""}
except:
pass
log_event(f" [!] Robust fetch failed for {url}: {e}")
return None