Merge pull request #46 from Huanshere/dev_v8_longvideo
Dev v8 longvideo
Huanshere authored Sep 16, 2024
2 parents eda2295 + d4faf5b commit 392e61b
Showing 23 changed files with 326 additions and 446 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -158,6 +158,8 @@ _model_cache/
# large files
/ffmpeg.exe
/ffmpeg
/ffprobe.exe
/ffprobe
.DS_Store
_config.py
config.py
48 changes: 0 additions & 48 deletions Dockerfile

This file was deleted.

5 changes: 3 additions & 2 deletions config.example.py
@@ -22,14 +22,14 @@
# Subtitle settings
# 每行字幕的最大长度字母数量
# Maximum number of characters per line of subtitle
MAX_SUB_LENGTH = 80
MAX_SUB_LENGTH = 75
# 输出字幕字号更大一些
# Increase the font size of the output subtitles
TARGET_SUB_MULTIPLIER = 1.2

# 视频分辨率
# Video resolution
RESOLUTIOM = '854x480'
RESOLUTIOM = '640x360'

# 显示语言
# Display language
@@ -101,6 +101,7 @@
"de": "de_core_news_md",
"it": "it_core_news_md",


# Not supported
# "zh": "zh_core_web_md",

62 changes: 35 additions & 27 deletions core/all_whisper_methods/whisperX.py
@@ -2,68 +2,76 @@
import sys
import whisperx
import torch
import pandas as pd
import json
from typing import Dict

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from config import MODEL_DIR
from core.all_whisper_methods.whisperXapi import process_transcription, convert_video_to_audio
from core.all_whisper_methods.whisperXapi import (
process_transcription, convert_video_to_audio, split_audio,
save_results, save_language
)

def transcribe_audio(audio_file: str) -> Dict:
def transcribe_audio(audio_file: str, start: float, end: float) -> Dict:
from config import WHISPER_LANGUAGE
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 16 # TODO Reduce this value if GPU memory is insufficient
compute_type = "float16" # TODO Change to "int8" if GPU memory is insufficient (may reduce accuracy)
print(f"🚀 Starting WhisperX... Please wait patiently...")
print(f"🚀 Starting WhisperX for segment {start:.2f}s to {end:.2f}s... Please wait patiently...")
try:
whisperx_model_dir = os.path.join(MODEL_DIR, "whisperx")
model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=whisperx_model_dir)

# Load audio segment
audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size, language=(None if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE))
audio_segment = audio[int(start * 16000):int(end * 16000)] # Assuming 16kHz sample rate

result = model.transcribe(audio_segment, batch_size=batch_size, language=(None if WHISPER_LANGUAGE == 'auto' else WHISPER_LANGUAGE))
# Free GPU resources
del model
torch.cuda.empty_cache()

# Save language information
save_language(result['language'])

# Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
result = whisperx.align(result["segments"], model_a, metadata, audio_segment, device, return_char_alignments=False)

# Free GPU resources again
del model_a
torch.cuda.empty_cache()

# Adjust timestamps
for segment in result['segments']:
segment['start'] += start
segment['end'] += start
for word in segment['words']:
word['start'] += start
word['end'] += start

return result
except Exception as e:
raise Exception(f"WhisperX processing error: {e}")

def save_results(df: pd.DataFrame):
os.makedirs('output/log', exist_ok=True)
excel_path = os.path.join('output/log', "cleaned_chunks.xlsx")
df['text'] = df['text'].apply(lambda x: f'"{x}"')
df.to_excel(excel_path, index=False)
print(f"📊 Excel file saved to {excel_path}")

def save_language(language: str):
os.makedirs('output/log', exist_ok=True)
with open('output/log/transcript_language.json', 'w', encoding='utf-8') as f:
json.dump({"language": language}, f, ensure_ascii=False, indent=4)

def transcribe(video_file: str):
if not os.path.exists("output/log/cleaned_chunks.xlsx"):
audio_file = convert_video_to_audio(video_file)

if os.path.getsize(audio_file) > 25 * 1024 * 1024:
print("⚠️ File size exceeds 25MB. Please use a smaller file.")
return
segments = split_audio(audio_file)

all_results = []
for start, end in segments:
result = transcribe_audio(audio_file, start, end)
all_results.append(result)

# Combine results
combined_result = {
'segments': [],
'language': all_results[0]['language']
}
for result in all_results:
combined_result['segments'].extend(result['segments'])

result = transcribe_audio(audio_file)
save_language(combined_result['language'])

df = process_transcription(result)
df = process_transcription(combined_result)
save_results(df)
else:
print("📊 Transcription results already exist, skipping transcription step.")
110 changes: 101 additions & 9 deletions core/all_whisper_methods/whisperXapi.py
@@ -3,7 +3,7 @@
import replicate
import pandas as pd
import json
from typing import Dict
from typing import Dict, List, Tuple
import subprocess
import base64
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
@@ -28,6 +28,81 @@ def convert_video_to_audio(input_file: str) -> str:

return audio_file

def split_audio(audio_file: str, target_duration: int = 20*60, window: int = 60) -> List[Tuple[float, float]]:
print("🔪 Splitting audio into segments...")
duration = float(subprocess.check_output(['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', audio_file]).decode('utf-8').strip())

segments = []
start = 0
while start < duration:
end = min(start + target_duration + window, duration)
if end - start < target_duration:
segments.append((start, end))
break

# Analyze audio in the 2-minute window
window_start = start + target_duration - window
window_end = min(window_start + 2 * window, duration)

ffmpeg_cmd = [
'ffmpeg',
'-i', audio_file,
'-ss', str(window_start),
'-to', str(window_end),
'-af', 'silencedetect=n=-30dB:d=0.5',
'-f', 'null',
'-'
]

output = subprocess.run(ffmpeg_cmd, capture_output=True, text=True).stderr

# Parse silence detection output
silence_end_times = [float(line.split('silence_end: ')[1].split(' ')[0]) for line in output.split('\n') if 'silence_end' in line]

if silence_end_times:
# Find the first silence after the target duration
split_point = next((t for t in silence_end_times if t > target_duration), None)
if split_point:
segments.append((start, start + split_point))
start += split_point
continue

# If no suitable split point found, split at the target duration
segments.append((start, start + target_duration))
start += target_duration

print(f"🔪 Split audio into {len(segments)} segments")
return segments
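The silencedetect filter logs to stderr, which is why the code above reads subprocess.run(...).stderr rather than stdout. A representative log line and the parse that feeds silence_end_times (sample values are illustrative, not captured from a real run):

line = "[silencedetect @ 0x5563] silence_end: 1203.42 | silence_duration: 0.61"
if 'silence_end' in line:
    t = float(line.split('silence_end: ')[1].split(' ')[0])  # -> 1203.42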

def transcribe_segment(audio_file: str, start: float, end: float) -> Dict:
print(f"🎙️ Transcribing segment from {start:.2f}s to {end:.2f}s")

segment_file = f'output/audio/segment_{start:.2f}_{end:.2f}.wav'
ffmpeg_cmd = [
'ffmpeg',
'-i', audio_file,
'-ss', str(start),
'-to', str(end),
'-c', 'copy',
segment_file
]
subprocess.run(ffmpeg_cmd, check=True, stderr=subprocess.PIPE)

# Encode to base64
with open(segment_file, 'rb') as file:
audio_base64 = base64.b64encode(file.read()).decode('utf-8')

# Check segment size
segment_size = len(audio_base64) / (1024 * 1024) # Size in MB
print(f"📊 Segment size: {segment_size:.2f} MB")

result = transcribe_audio(audio_base64)

# delete the segment file
os.remove(segment_file)

return result
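Note that the printed size measures len(audio_base64), i.e. the encoded payload, which is roughly 4/3 of the file on disk since base64 expands 3 bytes into 4 characters. A back-of-envelope check, assuming the hosted endpoint caps the encoded payload as the removed 25 MB guard suggests:

raw_mb = os.path.getsize(segment_file) / (1024 * 1024)
encoded_mb = raw_mb * 4 / 3  # approximate size of what the API actually receives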

def encode_file_to_base64(file_path: str) -> str:
print("🔄 Encoding audio file to base64...")
with open(file_path, 'rb') as file:
@@ -83,7 +158,7 @@ def process_transcription(result: Dict) -> pd.DataFrame:
}
all_words.append(word_dict)
else:
# If its the first word, look next for a timestamp then assign it to the current word
# If it's the first word, look next for a timestamp then assign it to the current word
next_word = next((w for w in segment['words'] if 'start' in w and 'end' in w), None)
if next_word:
word_dict = {
@@ -122,16 +197,33 @@ def transcribe(video_file: str):
if not os.path.exists("output/log/cleaned_chunks.xlsx"):
audio_file = convert_video_to_audio(video_file)

if os.path.getsize(audio_file) > 25 * 1024 * 1024:
print("⚠️ File size exceeds 25MB. Please use a smaller file.")
return
segments = split_audio(audio_file)

audio_base64 = encode_file_to_base64(audio_file)
result = transcribe_audio(audio_base64)
all_results = []
for start, end in segments:
result = transcribe_segment(audio_file, start, end)
result['time_offset'] = start # Add time offset to the result
all_results.append(result)

# Combine results
combined_result = {
'segments': [],
'detected_language': all_results[0]['detected_language']
}
for result in all_results:
for segment in result['segments']:
segment['start'] += result['time_offset']
segment['end'] += result['time_offset']
for word in segment['words']:
if 'start' in word:
word['start'] += result['time_offset']
if 'end' in word:
word['end'] += result['time_offset']
combined_result['segments'].extend(result['segments'])

save_language(result['detected_language'])
save_language(combined_result['detected_language'])

df = process_transcription(result)
df = process_transcription(combined_result)
save_results(df)
else:
print("📊 Transcription results already exist, skipping transcription step.")
11 changes: 6 additions & 5 deletions core/spacy_utils/load_nlp_model.py
@@ -1,30 +1,31 @@
import os,sys
import spacy
from spacy.cli import download
from rich import print
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from core.step2_whisper import get_whisper_language
from config import SPACY_MODEL_MAP

def get_spacy_model(language: str):
model = SPACY_MODEL_MAP.get(language.lower(), "en_core_web_sm")
if language not in SPACY_MODEL_MAP:
print(f"Spacy model does not support '{language}', using en_core_web_sm model as fallback...")
print(f"[yellow]Spacy model does not support '{language}', using en_core_web_sm model as fallback...[/yellow]")
return model

def init_nlp():
try:
from config import WHISPER_LANGUAGE
language = "en" if WHISPER_LANGUAGE == "en" else get_whisper_language()
model = get_spacy_model(language)
print(f"⏳ Loading NLP Spacy model: <{model}> ...")
print(f"[blue]⏳ Loading NLP Spacy model: <{model}> ...[/blue]")
try:
nlp = spacy.load(model)
except:
print(f"Downloading {model} model...")
print("If download failed, please check your network and try again.")
print(f"[yellow]Downloading {model} model...[/yellow]")
print("[yellow]If download failed, please check your network and try again.[/yellow]")
download(model)
nlp = spacy.load(model)
except:
raise ValueError(f"❌ Failed to load NLP Spacy model: {model}")
print(f"✅ NLP Spacy model loaded successfully!")
print(f"[green]✅ NLP Spacy model loaded successfully![/green]")
return nlp
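The [yellow]…[/yellow], [blue]…[/blue], and [green]…[/green] tags introduced here are rich console markup: because the module now does "from rich import print", the tags are rendered as terminal colors instead of being printed literally. For example:

from rich import print
print("[green]✅ Model ready[/green]")  # shown in green; the tags themselves are not printed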
7 changes: 4 additions & 3 deletions core/spacy_utils/split_by_comma.py
@@ -4,6 +4,7 @@
import os,sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from load_nlp_model import init_nlp
from rich import print

def is_valid_phrase(phrase):
# 🔍 Check for subject and verb
@@ -37,13 +38,13 @@ def split_by_comma(text, nlp):

if suitable_for_splitting :
sentences.append(doc[start:token.i].text.strip())
print(f"✂️ Split at comma: {doc[start:token.i][-4:]},| {doc[token.i + 1:][:4]}")
print(f"[yellow]✂️ Split at comma: {doc[start:token.i][-4:]},| {doc[token.i + 1:][:4]}[/yellow]")
start = token.i + 1

for i, token in enumerate(doc):
if token.text == ":": # Split at colon
sentences.append(doc[start:token.i].text.strip())
print(f"✂️ Split at colon: {doc[start:token.i][-4:]}:| {doc[token.i + 1:][:4]}")
print(f"[yellow]✂️ Split at colon: {doc[start:token.i][-4:]}:| {doc[token.i + 1:][:4]}[/yellow]")


sentences.append(doc[start:].text.strip())
@@ -63,7 +64,7 @@ def split_by_comma_main(nlp):
for sentence in all_split_sentences:
output_file.write(sentence + "\n")

print("💾 Sentences split by commas saved to → `sentences_by_comma.txt`")
print("[green]💾 Sentences split by commas saved to → `sentences_by_comma.txt`[/green]")

if __name__ == "__main__":
nlp = init_nlp()
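The splitter operates on token indices, so each piece is a spaCy Span sliced from the parsed Doc (doc[start:token.i]). A minimal illustration of that slicing (en_core_web_sm assumed, matching the fallback model in load_nlp_model.py):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model, as in the fallback
doc = nlp("We packed early, and we left at dawn.")
comma_i = next(i for i, tok in enumerate(doc) if tok.text == ",")
left, right = doc[:comma_i], doc[comma_i + 1:]
print(left.text, "|", right.text)  # We packed early | and we left at dawn.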
(Diff for the remaining changed files not loaded.)
