feat: make unified historical curation the default and chunking opt-in

2026-07-28 09:32:20 +00:00 · 2026-05-14 21:45:59 +02:00
parent 9025528870
commit f8f326c698
2 changed files with 30 additions and 17 deletions
--- a/.github/workflows/agentic_cron.yml
+++ b/.github/workflows/agentic_cron.yml
@@ -22,6 +22,11 @@ on:
        required: false
        default: 'true'
        type: boolean
+      historical_chunked:
+        description: 'Trocear ejecución (múltiples PRs)'
+        required: false
+        default: 'false'
+        type: boolean
      historical_until_date:
        description: 'Fecha límite superior (tramo)'
        required: false
@@ -64,6 +69,7 @@ jobs:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          EXTRACTION_STRATEGY: ${{ github.event.inputs.extraction_strategy || 'search' }}
          HISTORICAL_MODE: ${{ github.event.inputs.historical_mode || 'false' }}
+          HISTORICAL_CHUNKED: ${{ github.event.inputs.historical_chunked || 'false' }}
          HISTORICAL_UNTIL_DATE: ${{ github.event.inputs.historical_until_date || '' }}
          CURATION_START_DATE: ${{ github.event.inputs.start_date || '' }}
          HISTORICAL_CHUNK_DAYS: '180'
@@ -71,9 +77,9 @@ jobs:
        run: |
          python -u src/main.py 2>&1 | tee output.log
          
-          # Lógica de Re-disparo para Modo Histórico
-          if grep -q "NEXT_CHUNK_START:" output.log; then
+          # Lógica de Re-disparo para Modo Histórico (SOLO SI SE SOLICITA TROCEADO)
+          if [ "${{ github.event.inputs.historical_chunked }}" == "true" ] && grep -q "NEXT_CHUNK_START:" output.log; then
            NEXT_DATE=$(grep "NEXT_CHUNK_START:" output.log | awk '{print $2}')
            echo "Disparando siguiente tramo histórico hasta: $NEXT_DATE"
-            gh workflow run agentic_cron.yml -f historical_mode=true -f historical_until_date=$NEXT_DATE
+            gh workflow run agentic_cron.yml -f historical_mode=true -f historical_chunked=true -f historical_until_date=$NEXT_DATE
          fi
--- a/src/main.py
+++ b/src/main.py
@@ -22,24 +22,31 @@ async def master_orchestrator():
    
    # 1. Dynamic / Historical Time Horizon
    is_historical = os.getenv("HISTORICAL_MODE", "false").lower() == "true"
+    is_chunked = os.getenv("HISTORICAL_CHUNKED", "false").lower() == "true"
+    
+    until_date = datetime.now(MADRID_TZ)
    
    if is_historical:
-        # Historical Mode by Chunks (e.g., 180-day chunks)
+        # Unified mode is now DEFAULT for historical
        final_stop_date = datetime(2024, 10, 1, 0, 0, tzinfo=MADRID_TZ)
-        chunk_days = int(os.getenv("HISTORICAL_CHUNK_DAYS", "180"))
        
-        # Current chunk ends where the previous one started (or 'now' if first)
-        until_str = os.getenv("HISTORICAL_UNTIL_DATE")
-        if until_str:
-            until_date = datetime.fromisoformat(until_str).replace(tzinfo=MADRID_TZ)
+        if is_chunked:
+            # Chunked Mode: Use chunks (e.g., 180 days)
+            chunk_days = int(os.getenv("HISTORICAL_CHUNK_DAYS", "180"))
+            until_str = os.getenv("HISTORICAL_UNTIL_DATE")
+            if until_str:
+                until_date = datetime.fromisoformat(until_str).replace(tzinfo=MADRID_TZ)
+            else:
+                until_date = datetime.now(MADRID_TZ)
+                
+            since_date = until_date - timedelta(days=chunk_days)
+            if since_date < final_stop_date:
+                since_date = final_stop_date
+            log_event(f"[*] HISTORICAL MODE (CHUNKED): Chunk {since_date.date()} -> {until_date.date()}")
        else:
-            until_date = datetime.now(MADRID_TZ)
-            
-        since_date = until_date - timedelta(days=chunk_days)
-        if since_date < final_stop_date:
+            # Unified Historical Mode: process all in one go (Single PR)
            since_date = final_stop_date
-            
-        log_event(f"[*] HISTORICAL MODE: Chunk {since_date.date()} -> {until_date.date()}")
+            log_event(f"[*] HISTORICAL MODE (UNIFIED): Processing all since {since_date.date()} in a single run")
    else:
        # Normal Mode: Use CURATION_START_DATE if exists, else state.json
        env_start = os.getenv("CURATION_START_DATE")
@@ -284,8 +291,8 @@ async def master_orchestrator():
    if max_tweet_date > since_date:
        save_state(max_tweet_date + timedelta(seconds=1))

-    # Re-trigger logic for Historical Mode in GitHub Actions
-    if is_historical and since_date > final_stop_date:
+    # Re-trigger logic for Historical Mode in GitHub Actions (ONLY IF CHUNKED)
+    if is_historical and is_chunked and since_date > final_stop_date:
        # Print for YAML to capture
        print(f"\nNEXT_CHUNK_START: {since_date.isoformat()}")
        log_event(f"[*] CHUNK FINISHED. Suggesting next chunk from: {since_date.date()}", section_break=True)