
Azure Cognitive Services Speech example [Python]: azure.cognitiveservices.speech

References:

Text-to-speech API reference (REST) - Speech service - Azure Cognitive Services | Microsoft Learn

cognitive-services-speech-sdk/long_form_text_synthesis.py at 34ba838dd06cc9bb07b1441984265e5859944550 · Azure-Samples/cognitive-services-speech-sdk · GitHub

cognitive-services-speech-sdk/speech_synthesis_sample.py at 34ba838dd06cc9bb07b1441984265e5859944550 · Azure-Samples/cognitive-services-speech-sdk · GitHub
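Before reading the long-form sample, it helps to see the basic one-shot flow of azure.cognitiveservices.speech that it builds on. A minimal sketch (the key/region placeholders mirror the sample's __main__ block and must be replaced with your own values):

import azure.cognitiveservices.speech as speechsdk

# One-shot synthesis of a short string to an MP3 file.
speech_config = speechsdk.SpeechConfig(subscription="YourSubscriptionKey", region="YourServiceRegion")
speech_config.set_speech_synthesis_output_format(
    speechsdk.SpeechSynthesisOutputFormat.Audio24Khz48KBitRateMonoMp3)
speech_config.speech_synthesis_voice_name = "en-US-JennyNeural"

# audio_config=None returns the audio in result.audio_data instead of playing it.
synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
result = synthesizer.speak_text_async("Hello, world!").get()
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    with open("hello.mp3", "wb") as f:
        f.write(result.audio_data)

The long-form sample below applies this same flow sentence by sentence, in parallel, with a pool of reused synthesizers.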

long_form_text_synthesis.py: synthesizes a long text (or a multi-voice SSML document) sentence by sentence on a thread pool, concatenates the MP3 output into a single file, and saves word/sentence boundary timings as JSON:

#!/usr/bin/env python
# coding: utf-8

# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

import copy
import json
import logging
import time
import xml.etree.ElementTree as ET
from multiprocessing.pool import ThreadPool
from pathlib import Path
from typing import List, Tuple

import azure.cognitiveservices.speech as speechsdk
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

from synthesizer_pool import SynthesizerPool

# Only needed for first run
nltk.download('punkt')

logger = logging.getLogger(__name__)


class LongTextSynthesizer:
    def __init__(self, subscription: str, region: str, language: str = 'english',
                 voice: str = 'en-US-JennyNeural', parallel_threads: int = 8) -> None:
        self.is_ssml = None
        self.subscription = subscription
        self.region = region
        self.language = language
        self.voice = voice
        self.parallel_threads = parallel_threads
        self.synthesizer_pool = SynthesizerPool(self._create_synthesizer, self.parallel_threads)

    def _create_synthesizer(self) -> speechsdk.SpeechSynthesizer:
        config = speechsdk.SpeechConfig(subscription=self.subscription, region=self.region)
        config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio24Khz48KBitRateMonoMp3)
        config.set_property(
            speechsdk.PropertyId.SpeechServiceResponse_RequestSentenceBoundary, 'true')
        config.speech_synthesis_voice_name = self.voice
        # audio_config=None keeps the audio in memory (result.audio_data) instead of playing it.
        return speechsdk.SpeechSynthesizer(config, audio_config=None)

    def synthesize_text_once(self, text: str) -> Tuple[speechsdk.SpeechSynthesisResult,
                                                       List[speechsdk.SpeechSynthesisWordBoundaryEventArgs]]:
        logger.debug("Synthesis started %s", text)
        text_boundaries = []
        finished = []

        def word_boundary_cb(evt: speechsdk.SpeechSynthesisWordBoundaryEventArgs) -> None:
            text_boundaries.append(evt)

        with self.synthesizer_pool.borrow_synthesizer() as synthesizer:
            synthesizer.synthesis_word_boundary.connect(word_boundary_cb)
            synthesizer.synthesis_completed.connect(lambda _: finished.append(True))
            synthesizer.synthesis_canceled.connect(lambda _: finished.append(True))
            for _ in range(3):  # retry count
                text_boundaries = []
                finished = []
                result = synthesizer.speak_ssml_async(text).get() if self.is_ssml else \
                    synthesizer.speak_text_async(text).get()
                if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                    logger.debug("Synthesis completed %s", text)
                    # Wait for the completed event so all boundary callbacks have fired.
                    while not finished:
                        time.sleep(0.1)
                    return result, text_boundaries
                elif result.reason == speechsdk.ResultReason.Canceled:
                    cancellation_details = result.cancellation_details
                    logger.warning("Synthesis canceled, error details %s", cancellation_details.error_details)
                    if cancellation_details.error_code in \
                            [speechsdk.CancellationErrorCode.ConnectionFailure,
                             speechsdk.CancellationErrorCode.ServiceUnavailable,
                             speechsdk.CancellationErrorCode.ServiceTimeout]:
                        logger.info("Synthesis canceled with connection failure, retrying.")
                        continue
                    break
        logger.error("Synthesizer failed to synthesize text")
        return None, None

    def synthesize_text(self, text: str = None, ssml_path: Path = None, output_path: Path = Path.cwd()) -> None:
        output_path.mkdir(parents=True, exist_ok=True)
        all_word_boundaries, all_sentence_boundaries = [], []
        if text is not None:
            sentences = self.split_text(text)
            self.is_ssml = False
        elif ssml_path is not None:
            sentences = self.read_and_split_ssml(ssml_path)
            self.is_ssml = True
        else:
            raise ValueError('Either text or ssml_path must be provided')
        offset = 0
        with ThreadPool(processes=self.parallel_threads) as pool:
            audio_path = output_path / 'audio.mp3'
            with audio_path.open("wb") as f:
                for result, text_boundaries in tqdm(
                        pool.imap(self.synthesize_text_once, sentences), total=len(sentences)):
                    if result is not None:
                        f.write(result.audio_data)
                        for text_boundary in text_boundaries:
                            text_boundary_dict = {
                                # audio_offset is in 100-nanosecond ticks; / 10000 converts to milliseconds.
                                'audio_offset': offset + text_boundary.audio_offset / 10000,
                                'duration': text_boundary.duration.total_seconds() * 1000,
                                'text': text_boundary.text
                            }
                            if text_boundary.boundary_type == speechsdk.SpeechSynthesisBoundaryType.Sentence:
                                all_sentence_boundaries.append(text_boundary_dict)
                            else:
                                all_word_boundaries.append(text_boundary_dict)
                        # Calculate the offset for the next sentence: audio_data is bytes and
                        # the MP3 bit rate is 48 kbps, so 48 / 8 = 6 bytes per millisecond.
                        offset += len(result.audio_data) / (48 / 8)
        with (output_path / "word_boundaries.json").open("w", encoding="utf-8") as f:
            json.dump(all_word_boundaries, f, indent=4, ensure_ascii=False)
        with (output_path / "sentence_boundaries.json").open("w", encoding="utf-8") as f:
            json.dump(all_sentence_boundaries, f, indent=4, ensure_ascii=False)

    def split_text(self, text: str) -> List[str]:
        sentences = sent_tokenize(text, language=self.language)
        logger.info(f'Splitting into {len(sentences)} sentences')
        logger.debug(sentences)
        return sentences

    @staticmethod
    def read_and_split_ssml(ssml_path: Path) -> List[str]:
        # Preserve the original namespace prefixes when re-serializing.
        namespaces = dict([node for _, node in ET.iterparse(ssml_path, events=['start-ns'])])
        for ns in namespaces:
            ET.register_namespace(ns, namespaces[ns])
        root = ET.parse(ssml_path).getroot()
        sentences = []
        # Build an empty <speak> template, then emit one copy per <voice> child.
        speak_element = copy.deepcopy(root)
        for child in list(speak_element):
            _, _, tag = child.tag.rpartition('}')
            if tag != 'voice':
                raise ValueError(f'Only voice element is supported, got {tag}')
            speak_element.remove(child)
        for child in root:
            single_voice = copy.deepcopy(speak_element)
            single_voice.append(child)
            sentences.append(ET.tostring(single_voice, encoding='unicode'))
        return sentences


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    s = LongTextSynthesizer(subscription="YourSubscriptionKey", region="YourServiceRegion")
    with Path('./Gatsby-chapter1.txt').open('r', encoding='utf-8') as r:
        s.synthesize_text(r.read(), output_path=Path('./gatsby'))
    s.synthesize_text(ssml_path=Path('multi-role.xml'), output_path=Path('./multi-role'))
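The sample imports SynthesizerPool from a companion module, synthesizer_pool.py, which is not reproduced in this post. Below is a minimal sketch of what that module must provide, inferred only from how it is used above (a constructor taking a factory callable and a pool size, plus a borrow_synthesizer context manager); the actual file in the Azure-Samples repo may differ in detail:

# synthesizer_pool.py: minimal sketch, not the repo's implementation.
import queue
from contextlib import contextmanager
from typing import Callable

import azure.cognitiveservices.speech as speechsdk


class SynthesizerPool:
    """Share a fixed set of SpeechSynthesizer instances across worker threads,
    so each request reuses an existing service connection instead of opening one."""

    def __init__(self, synthesizer_creator: Callable[[], speechsdk.SpeechSynthesizer],
                 size: int) -> None:
        self._pool = queue.Queue()
        for _ in range(size):
            self._pool.put(synthesizer_creator())

    @contextmanager
    def borrow_synthesizer(self):
        # Block until a synthesizer is free, hand it to the caller,
        # and return it to the pool when the with-block exits.
        synthesizer = self._pool.get()
        try:
            yield synthesizer
        finally:
            # Detach the handlers the borrower connected, so callbacks do not
            # accumulate across borrows (EventSignal.disconnect_all in the SDK).
            synthesizer.synthesis_word_boundary.disconnect_all()
            synthesizer.synthesis_completed.disconnect_all()
            synthesizer.synthesis_canceled.disconnect_all()
            self._pool.put(synthesizer)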

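For the SSML path, read_and_split_ssml expects a <speak> root whose direct children are all <voice> elements, and splits the document into one single-voice SSML string per child. A hypothetical multi-role.xml in that shape (the voice names are real Azure voices, but the file content is an illustration, not the repo's test asset):

<!-- multi-role.xml (hypothetical): one <voice> element per speaker turn -->
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
    <voice name="en-US-JennyNeural">How was the trip, did everything go well?</voice>
    <voice name="en-US-GuyNeural">It went great, thanks for asking.</voice>
    <voice name="en-US-JennyNeural">Glad to hear it.</voice>
</speak>

Each <voice> turn is synthesized independently on the thread pool, and the resulting MP3 chunks are concatenated in order into audio.mp3.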