# Mount Google Drive at /content/drive so the audio file and output folder are reachable.
from google.colab import drive
drive.mount('/content/drive')
# The `!` lines below are notebook shell escapes; they are not valid in a plain Python script.
# NOTE(review): moviepy is installed here but not referenced in the visible code - confirm it is needed.
!pip install moviepy
!pip install git+https://github.com/openai/whisper.git
!pip install opencc-python-reimplemented
import whisper
from opencc import OpenCC # used to convert Simplified Chinese to Traditional Chinese
import os
# Step 1: Transcribe audio to text with Whisper
def transcribe_audio(audio_file, *, model_name="medium", language="zh"):
    """Transcribe an audio file to text using Whisper.

    Args:
        audio_file: Path of the audio file handed to Whisper's ``transcribe``.
        model_name: Name of the Whisper model to load. Defaults to
            ``"medium"``, the value that was previously hard-coded.
        language: Language code passed to ``transcribe``. Defaults to
            ``"zh"`` (Chinese), the value that was previously hard-coded.

    Returns:
        The ``'text'`` entry of the result returned by ``model.transcribe``.
    """
    # NOTE: the model is loaded on every call; when transcribing many files,
    # loading it once outside this function would avoid repeated work.
    model = whisper.load_model(model_name)
    result = model.transcribe(audio_file, language=language)
    return result["text"]
# Step 2: Convert Simplified Chinese to Traditional Chinese
def simplify_to_traditional(text):
    """Convert ``text`` with OpenCC's ``'s2t'`` (Simplified-to-Traditional) configuration.

    Args:
        text: The string to convert.

    Returns:
        The result of ``OpenCC('s2t').convert(text)``.
    """
    return OpenCC('s2t').convert(text)
# Step 3: Full pipeline: m4a -> Traditional Chinese text
def m4a_to_text(m4a_file, output_folder):
    """Transcribe an .m4a file, convert the text to Traditional Chinese, and save it.

    The converted text is written as UTF-8 to ``transcription.txt`` inside
    ``output_folder``; an existing file with that name is overwritten.

    Args:
        m4a_file: Path of the audio file passed to ``transcribe_audio``.
        output_folder: Folder that receives ``transcription.txt``. It is
            created (including parents) when it does not exist yet.

    Returns:
        The converted text that was written to the file.
    """
    # Create the output folder before the (slow) transcription so that a bad
    # or unwritable path fails early instead of after the work is done.
    # An empty folder string means "current directory" for os.path.join, and
    # os.makedirs('') would raise, so it is skipped in that case.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Speech to text
    transcript = transcribe_audio(m4a_file)
    # Simplified to Traditional Chinese
    traditional_text = simplify_to_traditional(transcript)

    # Save the text to a file
    text_output_file = os.path.join(output_folder, 'transcription.txt')
    with open(text_output_file, 'w', encoding='utf-8') as f:
        f.write(traditional_text)
    return traditional_text
# Path of your .m4a file and the folder that receives the output
data_path = '/content/drive/My Drive/12345678/12345678_file.m4a'
output_folder = '/content/drive/My Drive/12345678/'
# Run the pipeline and print the resulting text
transcribed_text = m4a_to_text(data_path, output_folder)
print(transcribed_text)