Merge pull request #46 from Huanshere/dev_v8_longvideo
Dev v8 longvideo
Huanshere authored Sep 16, 2024
2 parents eda2295 + d4faf5b commit 392e61b
Showing 23 changed files with 326 additions and 446 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -158,6 +158,8 @@ _model_cache/
# large files
/ffmpeg.exe
/ffmpeg
/ffprobe.exe
/ffprobe
.DS_Store
_config.py
config.py
48 changes: 0 additions & 48 deletions Dockerfile

This file was deleted.

5 changes: 3 additions & 2 deletions config.example.py
@@ -22,14 +22,14 @@
# Subtitle settings
# 每行字幕的最大长度字母数量
# Maximum number of characters per line of subtitle
MAX_SUB_LENGTH = 80
MAX_SUB_LENGTH = 75
# 输出字幕字号更大一些
# Increase the font size of the output subtitles
TARGET_SUB_MULTIPLIER = 1.2

# 视频分辨率
# Video resolution
RESOLUTIOM = '854x480'
RESOLUTIOM = '640x360'

# 显示语言
# Display language
@@ -101,6 +101,7 @@
"de": "de_core_news_md",
"it": "it_core_news_md",


# Not supported
# "zh": "zh_core_web_md",

62 changes: 35 additions & 27 deletions core/all_whisper_methods/whisperX.py
@@ -2,68 +2,76 @@
import sys
import whisperx
import torch
import pandas as pd
import json
from typing import Dict

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from config import MODEL_DIR
from core.all_whisper_methods.whisperXapi import process_transcription, convert_video_to_audio
from core.all_whisper_methods.whisperXapi import (
process_transcription, convert_video_to_audio, split_audio,
save_results, save_language
)

def transcribe_audio(audio_file: str) -> Dict:
def transcribe_audio(audio_file: str, start: float, end: float) -> Dict:
from config import WHISPER_LANGUAGE
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 16 # TODO Reduce this value if GPU memory is insufficient
compute_type = "float16" # TODO Change to "int8" if GPU memory is insufficient (may reduce accuracy)
print(f"🚀 Starting WhisperX... Please wait patiently...")
print(f"🚀 Starting WhisperX for segment {start:.2f}s to {end:.2f}s... Please wait patiently...")
try:
whisperx_model_dir = os.path.join(MODEL_DIR, "whisperx")
model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=whisperx_model_dir)

# Load audio segment
audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size, language=(None if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE))
audio_segment = audio[int(start * 16000):int(end * 16000)] # Assuming 16kHz sample rate

result = model.transcribe(audio_segment, batch_size=batch_size, language=(None if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE))
# Free GPU resources
del model
torch.cuda.empty_cache()

# Save language information
save_language(result['language'])

# Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
result = whisperx.align(result["segments"], model_a, metadata, audio_segment, device, return_char_alignments=False)

# Free GPU resources again
del model_a
torch.cuda.empty_cache()

# Adjust timestamps
for segment in result['segments']:
segment['start'] += start
segment['end'] += start
for word in segment['words']:
word['start'] += start
word['end'] += start

return result
except Exception as e:
raise Exception(f"WhisperX processing error: {e}")

def save_results(df: pd.DataFrame):
os.makedirs('output/log', exist_ok=True)
excel_path = os.path.join('output/log', "cleaned_chunks.xlsx")
df['text'] = df['text'].apply(lambda x: f'"{x}"')
df.to_excel(excel_path, index=False)
print(f"📊 Excel file saved to {excel_path}")

def save_language(language: str):
os.makedirs('output/log', exist_ok=True)
with open('output/log/transcript_language.json', 'w', encoding='utf-8') as f:
json.dump({"language": language}, f, ensure_ascii=False, indent=4)

def transcribe(video_file: str):
if not os.path.exists("output/log/cleaned_chunks.xlsx"):
audio_file = convert_video_to_audio(video_file)

if os.path.getsize(audio_file) > 25 * 1024 * 1024:
print("⚠️ File size exceeds 25MB. Please use a smaller file.")
return
segments = split_audio(audio_file)

all_results = []
for start, end in segments:
result = transcribe_audio(audio_file, start, end)
all_results.append(result)

# Combine results
combined_result = {
'segments': [],
'language': all_results[0]['language']
}
for result in all_results:
combined_result['segments'].extend(result['segments'])

result = transcribe_audio(audio_file)
save_language(combined_result['language'])

df = process_transcription(result)
df = process_transcription(combined_result)
save_results(df)
else:
print("📊 Transcription results already exist, skipping transcription step.")
110 changes: 101 additions & 9 deletions core/all_whisper_methods/whisperXapi.py
@@ -3,7 +3,7 @@
import replicate
import pandas as pd
import json
from typing import Dict
from typing import Dict, List, Tuple
import subprocess
import base64
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
@@ -28,6 +28,81 @@ def convert_video_to_audio(input_file: str) -> str:

return audio_file

def split_audio(audio_file: str, target_duration: int = 20*60, window: int = 60) -> List[Tuple[float, float]]:
print("🔪 Splitting audio into segments...")
duration = float(subprocess.check_output(['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', audio_file]).decode('utf-8').strip())

segments = []
start = 0
while start < duration:
end = min(start + target_duration + window, duration)
if end - start < target_duration:
segments.append((start, end))
break

# Analyze audio in the 2-minute window
window_start = start + target_duration - window
window_end = min(window_start + 2 * window, duration)

ffmpeg_cmd = [
'ffmpeg',
'-i', audio_file,
'-ss', str(window_start),
'-to', str(window_end),
'-af', 'silencedetect=n=-30dB:d=0.5',
'-f', 'null',
'-'
]

output = subprocess.run(ffmpeg_cmd, capture_output=True, text=True).stderr

# Parse silence detection output
silence_end_times = [float(line.split('silence_end: ')[1].split(' ')[0]) for line in output.split('\n') if 'silence_end' in line]

if silence_end_times:
# Find the first silence after the target duration
split_point = next((t for t in silence_end_times if t > target_duration), None)
if split_point:
segments.append((start, start + split_point))
start += split_point
continue

# If no suitable split point found, split at the target duration
segments.append((start, start + target_duration))
start += target_duration

print(f"🔪 Split audio into {len(segments)} segments")
return segments
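The silencedetect filter logs to stderr, which is why the code above reads subprocess.run(...).stderr rather than stdout. A representative log line and the parse that feeds silence_end_times (sample values are illustrative, not captured from a real run):

line = "[silencedetect @ 0x5563] silence_end: 1203.42 | silence_duration: 0.61"
if 'silence_end' in line:
    t = float(line.split('silence_end: ')[1].split(' ')[0])  # -> 1203.42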

def transcribe_segment(audio_file: str, start: float, end: float) -> Dict:
print(f"🎙️ Transcribing segment from {start:.2f}s to {end:.2f}s")

segment_file = f'output/audio/segment_{start:.2f}_{end:.2f}.wav'
ffmpeg_cmd = [
'ffmpeg',
'-i', audio_file,
'-ss', str(start),
'-to', str(end),
'-c', 'copy',
segment_file
]
subprocess.run(ffmpeg_cmd, check=True, stderr=subprocess.PIPE)

# Encode to base64
with open(segment_file, 'rb') as file:
audio_base64 = base64.b64encode(file.read()).decode('utf-8')

# Check segment size
segment_size = len(audio_base64) / (1024 * 1024) # Size in MB
print(f"📊 Segment size: {segment_size:.2f} MB")

result = transcribe_audio(audio_base64)

# delete the segment file
os.remove(segment_file)

return result
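Note that the printed size measures len(audio_base64), i.e. the encoded payload, which is roughly 4/3 of the file on disk since base64 expands 3 bytes into 4 characters. A back-of-envelope check, assuming the hosted endpoint caps the encoded payload as the removed 25 MB guard suggests:

raw_mb = os.path.getsize(segment_file) / (1024 * 1024)
encoded_mb = raw_mb * 4 / 3  # approximate size of what the API actually receives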

def encode_file_to_base64(file_path: str) -> str:
print("🔄 Encoding audio file to base64...")
with open(file_path, 'rb') as file:
@@ -83,7 +158,7 @@ def process_transcription(result: Dict) -> pd.DataFrame:
}
all_words.append(word_dict)
else:
# If its the first word, look next for a timestamp then assign it to the current word
# If it's the first word, look next for a timestamp then assign it to the current word
next_word = next((w for w in segment['words'] if 'start' in w and 'end' in w), None)
if next_word:
word_dict = {
@@ -122,16 +197,33 @@ def transcribe(video_file: str):
if not os.path.exists("output/log/cleaned_chunks.xlsx"):
audio_file = convert_video_to_audio(video_file)

if os.path.getsize(audio_file) > 25 * 1024 * 1024:
print("⚠️ File size exceeds 25MB. Please use a smaller file.")
return
segments = split_audio(audio_file)

audio_base64 = encode_file_to_base64(audio_file)
result = transcribe_audio(audio_base64)
all_results = []
for start, end in segments:
result = transcribe_segment(audio_file, start, end)
result['time_offset'] = start # Add time offset to the result
all_results.append(result)

# Combine results
combined_result = {
'segments': [],
'detected_language': all_results[0]['detected_language']
}
for result in all_results:
for segment in result['segments']:
segment['start'] += result['time_offset']
segment['end'] += result['time_offset']
for word in segment['words']:
if 'start' in word:
word['start'] += result['time_offset']
if 'end' in word:
word['end'] += result['time_offset']
combined_result['segments'].extend(result['segments'])

save_language(result['detected_language'])
save_language(combined_result['detected_language'])

df = process_transcription(result)
df = process_transcription(combined_result)
save_results(df)
else:
print("📊 Transcription results already exist, skipping transcription step.")
11 changes: 6 additions & 5 deletions core/spacy_utils/load_nlp_model.py
@@ -1,30 +1,31 @@
import os,sys
import spacy
from spacy.cli import download
from rich import print
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from core.step2_whisper import get_whisper_language
from config import SPACY_MODEL_MAP

def get_spacy_model(language: str):
model = SPACY_MODEL_MAP.get(language.lower(), "en_core_web_sm")
if language not in SPACY_MODEL_MAP:
print(f"Spacy model does not support '{language}', using en_core_web_sm model as fallback...")
print(f"[yellow]Spacy model does not support '{language}', using en_core_web_sm model as fallback...[/yellow]")
return model

def init_nlp():
try:
from config import WHISPER_LANGUAGE
language = "en" if WHISPER_LANGUAGE == "en" else get_whisper_language()
model = get_spacy_model(language)
print(f"⏳ Loading NLP Spacy model: <{model}> ...")
print(f"[blue]⏳ Loading NLP Spacy model: <{model}> ...[/blue]")
try:
nlp = spacy.load(model)
except:
print(f"Downloading {model} model...")
print("If download failed, please check your network and try again.")
print(f"[yellow]Downloading {model} model...[/yellow]")
print("[yellow]If download failed, please check your network and try again.[/yellow]")
download(model)
nlp = spacy.load(model)
except:
raise ValueError(f"❌ Failed to load NLP Spacy model: {model}")
print(f"✅ NLP Spacy model loaded successfully!")
print(f"[green]✅ NLP Spacy model loaded successfully![/green]")
return nlp
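The [yellow]…[/yellow], [blue]…[/blue], and [green]…[/green] tags introduced here are rich console markup: because the module now does "from rich import print", the tags are rendered as terminal colors instead of being printed literally. For example:

from rich import print
print("[green]✅ Model ready[/green]")  # shown in green; the tags themselves are not printed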
7 changes: 4 additions & 3 deletions core/spacy_utils/split_by_comma.py
@@ -4,6 +4,7 @@
import os,sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from load_nlp_model import init_nlp
from rich import print

def is_valid_phrase(phrase):
# 🔍 Check for subject and verb
@@ -37,13 +38,13 @@ def split_by_comma(text, nlp):

if suitable_for_splitting :
sentences.append(doc[start:token.i].text.strip())
print(f"✂️ Split at comma: {doc[start:token.i][-4:]},| {doc[token.i + 1:][:4]}")
print(f"[yellow]✂️ Split at comma: {doc[start:token.i][-4:]},| {doc[token.i + 1:][:4]}[/yellow]")
start = token.i + 1

for i, token in enumerate(doc):
if token.text == ":": # Split at colon
sentences.append(doc[start:token.i].text.strip())
print(f"✂️ Split at colon: {doc[start:token.i][-4:]}:| {doc[token.i + 1:][:4]}")
print(f"[yellow]✂️ Split at colon: {doc[start:token.i][-4:]}:| {doc[token.i + 1:][:4]}[/yellow]")


sentences.append(doc[start:].text.strip())
@@ -63,7 +64,7 @@ def split_by_comma_main(nlp):
for sentence in all_split_sentences:
output_file.write(sentence + "\n")

print("💾 Sentences split by commas saved to → `sentences_by_comma.txt`")
print("[green]💾 Sentences split by commas saved to → `sentences_by_comma.txt`[/green]")

if __name__ == "__main__":
nlp = init_nlp()
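The splitter operates on token indices, so each piece is a spaCy Span sliced from the parsed Doc (doc[start:token.i]). A minimal illustration of that slicing (en_core_web_sm assumed, matching the fallback model in load_nlp_model.py):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model, as in the fallback
doc = nlp("We packed early, and we left at dawn.")
comma_i = next(i for i, tok in enumerate(doc) if tok.text == ",")
left, right = doc[:comma_i], doc[comma_i + 1:]
print(left.text, "|", right.text)  # We packed early | and we left at dawn.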
(Diff for the remaining changed files not loaded.)
