当前位置: article > 正文

Python 调用麦克风和扬声器，并调用百度实时语音转文字

作者：喵喵爱编程 | 2024-08-17 22:01:54

踩

python麦克风


# [1]导入必要的模块和配置百度的 SDK
import time
import queue
import sounddevice as sd
import numpy as np
from aip import AipSpeech
import sys
 
# Baidu Cloud credentials
APP_ID = ''  # replace with your actual APP ID
API_KEY = ''  # replace with your actual API KEY
SECRET_KEY = ''  # replace with your actual SECRET KEY

# Single client instance, passed to the recognizers in the entry point below
client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)

# Queues holding the recorded audio blocks: one filled by the microphone
# callback, one filled by the speaker callback
audio_queue = queue.Queue()
speaker_queue = queue.Queue()
 
# Microphone stream callback: pushes every captured block onto audio_queue
def audio_callback(indata, frames, time, status):
    """Enqueue a copy of the microphone block and report any stream status.

    Args:
        indata: Captured audio block; a copy of it is queued.
        frames: Unused.
        time: Unused (shadows the ``time`` module only inside this function).
        status: Printed to stderr when truthy.
    """
    block = indata.copy()
    if status:
        print(status, file=sys.stderr)
    audio_queue.put(block)
 
# Speaker stream callback: pushes every captured block onto speaker_queue
def speaker_callback(indata, frames, time, status):
    """Enqueue a copy of the speaker block and report any stream status.

    Args:
        indata: Captured audio block; a copy of it is queued.
        frames: Unused.
        time: Unused (shadows the ``time`` module only inside this function).
        status: Printed to stderr when truthy.
    """
    block = indata.copy()
    if status:
        print(status, file=sys.stderr)
    speaker_queue.put(block)
# [2] Real-time speech recognizer class
class RealTimeSpeechRecognizer:
    """Send audio to a speech client's ``asr`` method and print the outcome.

    Attributes:
        client: Object whose ``asr`` method is called for each chunk.
        name: Label prefixed to every line this recognizer prints.
    """

    def __init__(self, client, name):
        self.client = client
        self.name = name

    def send_audio(self, audio_data):
        """Recognize one chunk of audio and print the text or the error.

        Args:
            audio_data: Audio payload, forwarded to ``client.asr`` with the
                format ``'pcm'`` and the rate ``16000``.
        """
        # NOTE(review): dev_pid 1537 is presumably Baidu's Mandarin model;
        # confirm against the Baidu ASR documentation.
        result = self.client.asr(audio_data, 'pcm', 16000, {
            'dev_pid': 1537,
        })
        if result.get('err_no') == 0:
            print(f"{self.name} 识别结果: {result.get('result', [])}")
            return

        # Not every failure response carries 'err_msg' (SDK-level failures
        # such as timeouts are reported under 'error_msg' instead), so read
        # the message defensively rather than raising KeyError mid-stream.
        message = result.get('err_msg') or result.get('error_msg') or result
        print(f"{self.name} 错误: {message}")
 
# Call Baidu's speech-to-text interface through the given recognizer
def recognize_speech(audio_data, recognizer):
    """Merge buffered audio chunks and send their raw bytes for recognition.

    Args:
        audio_data: Sequence of NumPy arrays, joined with ``np.concatenate``.
        recognizer: Object exposing ``send_audio(bytes)``.
    """
    merged = np.concatenate(audio_data)
    payload = merged.tobytes()
    recognizer.send_audio(payload)
# [3] Start the audio streams and process the audio data
def start_audio_stream(mic_recognizer, speaker_recognizer, speaker_device_index):
    """Capture microphone and speaker audio and send it off in batches.

    Opens two ``sd.InputStream`` objects (1 channel, 16000 Hz, int16): one
    without an explicit device for the microphone and one on
    ``speaker_device_index``. Every 0.1 s both queues are drained into
    per-channel buffers; a buffer holding at least 10 blocks is passed to
    ``recognize_speech`` and then emptied. Runs until ``KeyboardInterrupt``.

    Args:
        mic_recognizer: Recognizer used for microphone audio.
        speaker_recognizer: Recognizer used for speaker audio.
        speaker_device_index: Device passed to the second input stream.
    """
    stream_options = dict(channels=1, samplerate=16000, dtype='int16')
    with sd.InputStream(callback=audio_callback, **stream_options), \
            sd.InputStream(callback=speaker_callback, device=speaker_device_index, **stream_options):
        print("Recording audio... Press Ctrl+C to stop.")
        # (source queue, pending blocks, recognizer) for each channel
        channels = (
            (audio_queue, [], mic_recognizer),
            (speaker_queue, [], speaker_recognizer),
        )
        try:
            while True:
                # Drain both queues first so each buffer is filled before
                # any (blocking) recognition call starts.
                for source, pending, _ in channels:
                    while not source.empty():
                        pending.append(source.get())

                for _, pending, recognizer in channels:
                    if len(pending) >= 10:
                        recognize_speech(pending, recognizer)
                        pending.clear()  # Clear buffer after sending

                time.sleep(0.1)
        except KeyboardInterrupt:
            print("Stopping audio recording.")
# [4] Program entry point
if __name__ == "__main__":
    # Use the pulse device (index 8) to capture the speaker output.
    # NOTE(review): the index is presumably specific to the author's machine;
    # verify it against the local device list before running.
    speaker_device_index = 8

    mic_recognizer = RealTimeSpeechRecognizer(client, "麦克风接收：")
    speaker_recognizer = RealTimeSpeechRecognizer(client, "扬声器接收：")

    start_audio_stream(
        mic_recognizer=mic_recognizer,
        speaker_recognizer=speaker_recognizer,
        speaker_device_index=speaker_device_index,
    )

一、实时的短语音识别场景

在某些应用场景中，可能需要同时捕获麦克风和扬声器的音频数据，例如以下几种情况：

1. 实时翻译和转录会议

在一个会议或对话中，你可能需要捕获：

麦克风音频：捕获发言者的声音，进行实时转录。
扬声器音频：捕获对话的另一方通过扬声器播放的声音，进行实时转录。

这样可以同时转录双方的发言，提供完整的对话记录。

2. 语言学习和教学

在语言学习的应用中，教师可能会播放音频材料，而学生则通过麦克风回答。捕获这两种音频数据可以帮助：

麦克风音频：捕获学生的回答和发言。
扬声器音频：捕获教师播放的音频材料。

通过同时转录这两种音频，可以对学生的发言和教师播放的材料进行分析和评估。

3. 语音控制系统

在语音控制系统中，系统可能需要捕获：

麦克风音频：捕获用户的语音指令。
扬声器音频：捕获系统播放的反馈音，确认系统是否正确播放了反馈信息。

这有助于确保系统对用户的指令做出了正确的反馈。

4. 电话会议录音

在电话会议中，可能需要捕获：

麦克风音频：捕获本地发言者的声音。
扬声器音频：捕获远程参与者通过扬声器播放的声音。

这样可以完整地记录整个会议过程。

基于上述场景，我的代码实现了同时捕获麦克风和扬声器的音频数据，并分别进行转录。

二、实现步骤

1. 导入必要的模块和配置百度的 SDK


import time
import queue
import sounddevice as sd
import numpy as np
from aip import AipSpeech
import sys
 
# Baidu Cloud credentials
APP_ID = '你的 App ID'  # replace with your actual APP ID
API_KEY = '你的 Api Key'  # replace with your actual API KEY
SECRET_KEY = '你的 Secret Key'  # replace with your actual SECRET KEY

# Single client instance, passed to the recognizers in the entry point below
client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)

# Queues holding the recorded audio blocks: one filled by the microphone
# callback, one filled by the speaker callback
audio_queue = queue.Queue()
speaker_queue = queue.Queue()
 
# Microphone stream callback: pushes every captured block onto audio_queue
def audio_callback(indata, frames, time, status):
    """Enqueue a copy of the microphone block and report any stream status.

    Args:
        indata: Captured audio block; a copy of it is queued.
        frames: Unused.
        time: Unused (shadows the ``time`` module only inside this function).
        status: Printed to stderr when truthy.
    """
    block = indata.copy()
    if status:
        print(status, file=sys.stderr)
    audio_queue.put(block)
 
# Speaker stream callback: pushes every captured block onto speaker_queue
def speaker_callback(indata, frames, time, status):
    """Enqueue a copy of the speaker block and report any stream status.

    Args:
        indata: Captured audio block; a copy of it is queued.
        frames: Unused.
        time: Unused (shadows the ``time`` module only inside this function).
        status: Printed to stderr when truthy.
    """
    block = indata.copy()
    if status:
        print(status, file=sys.stderr)
    speaker_queue.put(block)

2. 实现实时语音识别类


class RealTimeSpeechRecognizer:
    """Send audio to a speech client's ``asr`` method and print the outcome.

    Attributes:
        client: Object whose ``asr`` method is called for each chunk.
        name: Label prefixed to every line this recognizer prints.
    """

    def __init__(self, client, name):
        self.client = client
        self.name = name

    def send_audio(self, audio_data):
        """Recognize one chunk of audio and print the text or the error.

        Args:
            audio_data: Audio payload, forwarded to ``client.asr`` with the
                format ``'pcm'`` and the rate ``16000``.
        """
        # NOTE(review): dev_pid 1537 is presumably Baidu's Mandarin model;
        # confirm against the Baidu ASR documentation.
        result = self.client.asr(audio_data, 'pcm', 16000, {
            'dev_pid': 1537,
        })
        if result.get('err_no') == 0:
            print(f"{self.name} 识别结果: {result.get('result', [])}")
            return

        # Not every failure response carries 'err_msg' (SDK-level failures
        # such as timeouts are reported under 'error_msg' instead), so read
        # the message defensively rather than raising KeyError mid-stream.
        message = result.get('err_msg') or result.get('error_msg') or result
        print(f"{self.name} 错误: {message}")
 
# Call Baidu's speech-to-text interface through the given recognizer
def recognize_speech(audio_data, recognizer):
    """Merge buffered audio chunks and send their raw bytes for recognition.

    Args:
        audio_data: Sequence of NumPy arrays, joined with ``np.concatenate``.
        recognizer: Object exposing ``send_audio(bytes)``.
    """
    merged = np.concatenate(audio_data)
    payload = merged.tobytes()
    recognizer.send_audio(payload)

3. 开始音频流并处理音频数据


def start_audio_stream(mic_recognizer, speaker_recognizer, speaker_device_index):
    """Capture microphone and speaker audio and send it off in batches.

    Opens two ``sd.InputStream`` objects (1 channel, 16000 Hz, int16): one
    without an explicit device for the microphone and one on
    ``speaker_device_index``. Every 0.1 s both queues are drained into
    per-channel buffers; a buffer holding at least 10 blocks is passed to
    ``recognize_speech`` and then emptied. Runs until ``KeyboardInterrupt``.

    Args:
        mic_recognizer: Recognizer used for microphone audio.
        speaker_recognizer: Recognizer used for speaker audio.
        speaker_device_index: Device passed to the second input stream.
    """
    stream_options = dict(channels=1, samplerate=16000, dtype='int16')
    with sd.InputStream(callback=audio_callback, **stream_options), \
            sd.InputStream(callback=speaker_callback, device=speaker_device_index, **stream_options):
        print("Recording audio... Press Ctrl+C to stop.")
        # (source queue, pending blocks, recognizer) for each channel
        channels = (
            (audio_queue, [], mic_recognizer),
            (speaker_queue, [], speaker_recognizer),
        )
        try:
            while True:
                # Drain both queues first so each buffer is filled before
                # any (blocking) recognition call starts.
                for source, pending, _ in channels:
                    while not source.empty():
                        pending.append(source.get())

                for _, pending, recognizer in channels:
                    if len(pending) >= 10:
                        recognize_speech(pending, recognizer)
                        pending.clear()  # Clear buffer after sending

                time.sleep(0.1)
        except KeyboardInterrupt:
            print("Stopping audio recording.")

4. 主程序入口


if __name__ == "__main__":
    # Use the pulse device (index 8) to capture the speaker output.
    # NOTE(review): the index is presumably specific to the author's machine;
    # verify it against the local device list before running.
    speaker_device_index = 8

    mic_recognizer = RealTimeSpeechRecognizer(client, "麦克风接收：")
    speaker_recognizer = RealTimeSpeechRecognizer(client, "扬声器接收：")

    start_audio_stream(
        mic_recognizer=mic_recognizer,
        speaker_recognizer=speaker_recognizer,
        speaker_device_index=speaker_device_index,
    )

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/喵喵爱编程/article/detail/994614