utils.py · openbmb/MiniCPM-o-2

update

0893f10 about 1 year ago

7.24 kB

	# coding=utf-8
	# Copyright 2025 The OpenBMB Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import logging
	import re

	import librosa
	import numpy as np

	logger = logging.getLogger(__name__)


	def is_silent(data):
	    """Report whether an audio buffer is effectively silent.

	    Args:
	        data: Array-like of audio samples.

	    Returns:
	        bool: ``True`` when the largest absolute sample value is strictly
	        below ``3e-3``, ``False`` otherwise.
	    """
	    peak_amplitude = np.abs(data).max()
	    return bool(peak_amplitude < 3e-3)


	def sentence_end(txt):
	    """Return a sentence-terminating punctuation mark contained in ``txt``.

	    Candidates are tried in a fixed priority order
	    (".", "。", "!", "?", "！", "？"); the first candidate that is present
	    wins, which is not necessarily the leftmost mark in the text.

	    A "." that directly follows a digit (as in "1." or "3.14") is not
	    treated as a sentence end. Every "." in the text is examined, so a
	    numeric period early in the text (e.g. "Version 2.0 is out.") no longer
	    hides a genuine sentence-ending period later on.

	    Args:
	        txt (str): Text to inspect.

	    Returns:
	        str: The matching punctuation character, or "" when none is found.
	    """
	    # A period counts when it opens the text or follows a non-digit character.
	    period_ends_sentence = any(
	        char == "." and (pos == 0 or not txt[pos - 1].isdigit())
	        for pos, char in enumerate(txt)
	    )

	    for mark in (".", "。", "!", "?", "！", "？"):
	        if mark == ".":
	            if period_ends_sentence:
	                return mark
	            # Only numeric periods (or none at all): fall through to the other marks.
	            continue
	        if mark in txt:
	            return mark
	    return ""


	class NumberToTextConverter:
	    r"""
	    A helper class to ensure text-to-speech (TTS) systems read numeric digits
	    in the desired language (Chinese or English) digit-by-digit. It replaces
	    every run of decimal digits in the text with its language-specific
	    textual representation, thereby reducing the likelihood of TTS mistakes
	    on numbers.
	    Note: MiniCPM-o 2.6 only uses this in streaming mode.

	    Attributes:
	        num_to_chinese (dict):
	            Mapping from digit (str) to its Chinese textual form (str).
	        num_to_english (dict):
	            Mapping from digit (str) to its English textual form (str).

	    Example:
	        >>> converter = NumberToTextConverter()
	        >>> converter.replace_numbers_with_text("我有2个苹果", language="chinese")
	        '我有二个苹果'
	        >>> converter.replace_numbers_with_text("I have 23 books", language="english")
	        'I have two three books'
	    """

	    def __init__(self):
	        self.num_to_chinese = {
	            "0": "零",
	            "1": "一",
	            "2": "二",
	            "3": "三",
	            "4": "四",
	            "5": "五",
	            "6": "六",
	            "7": "七",
	            "8": "八",
	            "9": "九",
	        }
	        self.num_to_english = {
	            "0": "zero",
	            "1": "one",
	            "2": "two",
	            "3": "three",
	            "4": "four",
	            "5": "five",
	            "6": "six",
	            "7": "seven",
	            "8": "eight",
	            "9": "nine",
	        }

	    @staticmethod
	    def _spell_digits(num_str, table):
	        """Look up each character of ``num_str`` in ``table`` and return the words found.

	        Characters with no entry are skipped. A decimal digit that is not a
	        key of ``table`` (for example a full-width "１") is first normalised
	        to its ASCII form so it is spelled out instead of being dropped.
	        """
	        words = []
	        for char in num_str:
	            if char not in table and char.isdecimal():
	                # int() accepts any Unicode decimal digit; str() yields the ASCII key.
	                char = str(int(char))
	            if char in table:
	                words.append(table[char])
	        return words

	    def number_to_chinese_digit_by_digit(self, num_str):
	        """Spell ``num_str`` digit by digit in Chinese, with no separator.

	        Args:
	            num_str (str): String of digits; characters without a mapping are skipped.

	        Returns:
	            str: Concatenated Chinese digit characters.
	        """
	        return "".join(self._spell_digits(num_str, self.num_to_chinese))

	    def number_to_english_digit_by_digit(self, num_str):
	        """Spell ``num_str`` digit by digit in English, separated by single spaces.

	        Args:
	            num_str (str): String of digits; characters without a mapping are skipped.

	        Returns:
	            str: Space-joined English digit words.
	        """
	        return " ".join(self._spell_digits(num_str, self.num_to_english))

	    def detect_language(self, text):
	        """Guess the language of ``text`` by counting characters.

	        Characters in the range U+4E00..U+9FFF are counted as Chinese and ASCII
	        letters as English. Ties (including text with neither) resolve to
	        "chinese".

	        Returns:
	            str: "chinese" or "english".
	        """
	        chinese_count = len(re.findall(r"[\u4e00-\u9fff]", text))
	        english_count = len(re.findall(r"[a-zA-Z]", text))
	        return "chinese" if chinese_count >= english_count else "english"

	    def replace_numbers_with_text(self, text, language=None):
	        """Replace every run of digits in ``text`` with its digit-by-digit spelling.

	        Args:
	            text (str): Input text.
	            language (str, optional): "chinese" selects the Chinese spelling; any
	                other value selects English. When ``None``, the language is
	                chosen by :meth:`detect_language`.

	        Returns:
	            str: ``text`` with each digit run replaced.
	        """
	        if language is None:
	            language = self.detect_language(text)

	        if language == "chinese":
	            spell = self.number_to_chinese_digit_by_digit
	        else:
	            spell = self.number_to_english_digit_by_digit

	        # One pass over the text; each match is replaced exactly where it was found.
	        return re.sub(r"\d+", lambda match: spell(match.group()), text)


	class VoiceChecker:
	    r"""
	    A simple utility class to detect silence or low variation in consecutive audio chunks by comparing
	    the mel-spectrogram distances. It keeps track of consecutive zero-distance and low-distance chunks
	    to decide if the audio is considered "bad" (e.g., overly silent or not changing enough).

	    The counters persist across calls to :meth:`is_bad`; call :meth:`reset` to start over.

	    Attributes:
	        previous_mel (`np.ndarray` or `None`):
	            Holds the previously observed mel-spectrogram in decibel scale. Used to compute
	            the next distance; reset via :meth:`reset`.
	        consecutive_zeros (`int`):
	            The number of consecutive chunks that were detected as silent (distance = 0).
	        consecutive_low_distance (`int`):
	            The number of consecutive chunks whose distance was below the threshold.

	    Example:
	        >>> checker = VoiceChecker()
	        >>> # Suppose we have audio_wav (list or np.ndarray) and mel_spec (np.ndarray)
	        >>> # We split them into chunks and call checker.is_bad(...)
	        >>> is_audio_bad = checker.is_bad(audio_wav, mel_spec, chunk_size=2560, thresh=100.0)
	        >>> if is_audio_bad:
	        ...     print("Audio deemed bad!")
	        >>> # Reset states if needed
	        >>> checker.reset()
	    """

	    def __init__(self):
	        self.previous_mel = None
	        self.consecutive_zeros = 0
	        self.consecutive_low_distance = 0

	    def compute_distance(self, audio_chunk, mel_spec):
	        """Measure how much one chunk's mel-spectrogram differs from the previous chunk's.

	        Args:
	            audio_chunk: Audio samples of the chunk, used only for the silence check.
	            mel_spec (np.ndarray): Mel-spectrogram slice for the chunk; it is passed
	                through ``librosa.power_to_db`` and averaged along axis 1.

	        Returns:
	            float: ``0.0`` for a silent chunk, ``-1.0`` for the first non-silent chunk
	            (nothing to compare against yet), otherwise the Euclidean norm of the
	            difference between the two axis-1 means.
	        """
	        if is_silent(audio_chunk):
	            # Silent chunk: report zero distance and leave previous_mel untouched.
	            return 0.0

	        mel_db = librosa.power_to_db(mel_spec)
	        if self.previous_mel is None:
	            self.previous_mel = mel_db
	            # NOTE(review): -1.0 is below any positive thresh, so is_bad counts this
	            # first chunk as a low-distance chunk — confirm that is intended.
	            return -1.0

	        distance = np.linalg.norm(np.mean(mel_db, axis=1) - np.mean(self.previous_mel, axis=1))
	        self.previous_mel = mel_db
	        return distance

	    def is_bad(self, audio_wav, mel_spec, chunk_size=2560, thresh=100.0):
	        """Scan the audio chunk by chunk and report whether it should be marked as failed.

	        The audio is cut into ``len(audio_wav) // chunk_size`` chunks and the last axis of
	        ``mel_spec`` is divided evenly between them; trailing samples or frames that do not
	        fill a chunk are ignored.

	        Args:
	            audio_wav: Sequence of audio samples.
	            mel_spec (np.ndarray): Mel-spectrogram indexed as ``[:, frames]``
	                (presumably ``(n_mels, frames)`` — verify against the caller).
	            chunk_size (int): Number of audio samples per chunk.
	            thresh (float): Distances below this value count as "low".

	        Returns:
	            bool: ``True`` once 12 consecutive silent chunks or 5 consecutive low-distance
	            chunks have been seen (counting across calls), otherwise ``False``. Audio
	            shorter than one chunk yields ``False`` and leaves the state unchanged.
	        """
	        num_chunks = len(audio_wav) // chunk_size
	        if num_chunks == 0:
	            # Not even one full chunk: nothing to evaluate, and this avoids dividing by zero below.
	            return False

	        mel_chunk_size = mel_spec.shape[-1] // num_chunks
	        for i in range(num_chunks):
	            audio_chunk = audio_wav[i * chunk_size : (i + 1) * chunk_size]
	            mel_spec_chunk = mel_spec[:, i * mel_chunk_size : (i + 1) * mel_chunk_size]

	            distance = self.compute_distance(audio_chunk, mel_spec_chunk)
	            logger.warning(
	                "mel dist: %.1f, zero: %d, low: %d",
	                distance,
	                self.consecutive_zeros,
	                self.consecutive_low_distance,
	            )
	            if distance == 0:
	                self.consecutive_low_distance = 0  # reset
	                self.consecutive_zeros += 1
	                if self.consecutive_zeros >= 12:
	                    logger.warning("VoiceChecker detected 1.2 s silent. Marking as failed.")
	                    return True
	            elif distance < thresh:
	                self.consecutive_zeros = 0
	                self.consecutive_low_distance += 1
	                if self.consecutive_low_distance >= 5:
	                    logger.warning("VoiceChecker detected 5 consecutive low distance chunks. Marking as failed.")
	                    return True
	            else:
	                self.consecutive_low_distance = 0
	                self.consecutive_zeros = 0

	        return False

	    def reset(self):
	        """Forget the previous mel-spectrogram and zero both counters."""
	        self.previous_mel = None
	        self.consecutive_zeros = 0
	        self.consecutive_low_distance = 0